initial-retrieval.ipynb

{"cells":[{"cell_type":"markdown","metadata":{},"source":["<h2> Imports </h2>"]},{"cell_type":"code","execution_count":1,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T18:52:18.410005Z","iopub.status.busy":"2024-01-03T18:52:18.409290Z","iopub.status.idle":"2024-01-03T18:56:01.555478Z","shell.execute_reply":"2024-01-03T18:56:01.553732Z","shell.execute_reply.started":"2024-01-03T18:52:18.409894Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["Defaulting to user installation because normal site-packages is not writeable\n","Requirement already satisfied: rank_bm25 in /home/hanna/.local/lib/python3.10/site-packages (0.2.2)\n","Requirement already satisfied: numpy in /home/hanna/.local/lib/python3.10/site-packages (from rank_bm25) (1.24.1)\n","\n","\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.1.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.3.2\u001b[0m\n","\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n","Note: you may need to restart the kernel to use updated packages.\n"]}],"source":["%pip install rank_bm25"]},{"cell_type":"code","execution_count":2,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T18:56:01.558915Z","iopub.status.busy":"2024-01-03T18:56:01.558368Z","iopub.status.idle":"2024-01-03T18:56:32.924438Z","shell.execute_reply":"2024-01-03T18:56:32.919977Z","shell.execute_reply.started":"2024-01-03T18:56:01.558854Z"},"trusted":true},"outputs":[],"source":["import nltk\n","from nltk.corpus import stopwords\n","from nltk.tokenize import word_tokenize\n","from nltk.tokenize.toktok import ToktokTokenizer\n","import re\n","stopword_list = nltk.corpus.stopwords.words('english')\n","import pandas as pd\n","from tqdm import tqdm\n","tqdm.pandas()\n","from rank_bm25 import BM25Okapi\n","import pandas as pd \n","import string\n","from nltk.stem import WordNetLemmatizer\n","import pickle \n"]},{"cell_type":"markdown","metadata":{},"source":["<h2> Load datasets and model</h2>"]},{"cell_type":"code","execution_count":3,"metadata":{"_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","execution":{"iopub.execute_input":"2024-01-03T18:56:32.931742Z","iopub.status.busy":"2024-01-03T18:56:32.929029Z","iopub.status.idle":"2024-01-03T18:56:33.045085Z","shell.execute_reply":"2024-01-03T18:56:33.043862Z","shell.execute_reply.started":"2024-01-03T18:56:32.931643Z"},"trusted":true},"outputs":[],"source":["queries = pd.read_csv(\"./data/cisi-csv/queries.csv\")\n","docs = pd.read_csv(\"./data/cisi-csv/docs.csv\")\n","rels = pd.read_csv(\"./data/cisi-csv/rels.csv\")\n","\n","full_doc = docs['text'].to_list()\n","full_query = queries['text'].to_list()"]},{"cell_type":"markdown","metadata":{},"source":["<h2>Initial retrieval with bm25</h2>"]},{"cell_type":"code","execution_count":4,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T18:57:01.149061Z","iopub.status.busy":"2024-01-03T18:57:01.148640Z","iopub.status.idle":"2024-01-03T18:57:01.166124Z","shell.execute_reply":"2024-01-03T18:57:01.164536Z","shell.execute_reply.started":"2024-01-03T18:57:01.149026Z"},"trusted":true},"outputs":[],"source":["# just the same code as above to clean the df texts for bm25\n","def data_clean_df(text):\n","    # Regex pattern to keep only alphanumeric characters and spaces\n","    pattern = r'[^a-zA-Z0-9\\s]'\n","    text = re.sub(pattern, '', text)\n","    tokens = [token.strip() for token in text.split()]\n","    return ' '.join(tokens)\n","\n","#some queries have a .T in the begining we want to remove this\n","def clean_query(text):\n","    pattern = r'^\\.T\\s'\n","    text = re.sub(pattern, '', text)\n","    tokens = [token.strip() for token in text.split()]\n","    return ' '.join(tokens)\n","\n","#special pre-processing for bm25, because for embeddings we don't want to pre-process that much\n","def data_clean_for_bm25(text):\n","   # Lowercasing the text\n","    text = text.lower()\n","    # Removing digits\n","    text = re.sub(r'\\d+', '', text)\n","    # Removing punctuation\n","    translator = str.maketrans('', '', string.punctuation)\n","    text = text.translate(translator)\n","    # Whitespace normalization\n","    text = \" \".join(text.split())\n","    # Stopword removal\n","    stop_words = set(stopwords.words(\"english\"))\n","    word_tokens = word_tokenize(text)\n","    filtered_words = [word for word in word_tokens if word not in stop_words]\n","    # Lemmatization\n","    lemmatizer = WordNetLemmatizer()\n","    lemmas = [lemmatizer.lemmatize(word) for word in filtered_words]\n","\n","    return lemmas"]},{"cell_type":"code","execution_count":5,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T18:57:04.266231Z","iopub.status.busy":"2024-01-03T18:57:04.265732Z","iopub.status.idle":"2024-01-03T18:57:10.812646Z","shell.execute_reply":"2024-01-03T18:57:10.811193Z","shell.execute_reply.started":"2024-01-03T18:57:04.266183Z"},"trusted":true},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>id</th>\n","      <th>text</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>1</td>\n","      <td>[problem, concern, making, descriptive, title,...</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>2</td>\n","      <td>[actually, pertinent, data, opposed, reference...</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>3</td>\n","      <td>[information, science, give, definition, possi...</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>4</td>\n","      <td>[image, recognition, method, automatically, tr...</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>5</td>\n","      <td>[special, training, ordinary, researcher, busi...</td>\n","    </tr>\n","    <tr>\n","      <th>...</th>\n","      <td>...</td>\n","      <td>...</td>\n","    </tr>\n","    <tr>\n","      <th>107</th>\n","      <td>108</td>\n","      <td>[program, machinemediated, searching, toliver,...</td>\n","    </tr>\n","    <tr>\n","      <th>108</th>\n","      <td>109</td>\n","      <td>[author, cocitation, literature, measure, inte...</td>\n","    </tr>\n","    <tr>\n","      <th>109</th>\n","      <td>110</td>\n","      <td>[progress, documentation, word, processing, in...</td>\n","    </tr>\n","    <tr>\n","      <th>110</th>\n","      <td>111</td>\n","      <td>[document, clustering, using, inverted, file, ...</td>\n","    </tr>\n","    <tr>\n","      <th>111</th>\n","      <td>112</td>\n","      <td>[fast, procedure, calculation, similarity, coe...</td>\n","    </tr>\n","  </tbody>\n","</table>\n","<p>112 rows × 2 columns</p>\n","</div>"],"text/plain":["      id                                               text\n","0      1  [problem, concern, making, descriptive, title,...\n","1      2  [actually, pertinent, data, opposed, reference...\n","2      3  [information, science, give, definition, possi...\n","3      4  [image, recognition, method, automatically, tr...\n","4      5  [special, training, ordinary, researcher, busi...\n","..   ...                                                ...\n","107  108  [program, machinemediated, searching, toliver,...\n","108  109  [author, cocitation, literature, measure, inte...\n","109  110  [progress, documentation, word, processing, in...\n","110  111  [document, clustering, using, inverted, file, ...\n","111  112  [fast, procedure, calculation, similarity, coe...\n","\n","[112 rows x 2 columns]"]},"execution_count":5,"metadata":{},"output_type":"execute_result"}],"source":["queries_cleaned = queries.copy()\n","queries_cleaned['text'] = queries_cleaned['text'].apply(data_clean_df)\n","queries_cleaned['text'] = queries_cleaned['text'].apply(clean_query)\n","\n","docs_cleaned = docs.copy()\n","docs_cleaned['text'] = docs_cleaned['text'].apply(data_clean_df)\n","docs_cleaned\n","\n","queries_cleaned_bm25 = queries.copy()\n","queries_cleaned_bm25['text'] = queries_cleaned_bm25['text'].apply(data_clean_for_bm25)\n","\n","docs_cleaned_bm25 = docs.copy()\n","docs_cleaned_bm25['text'] = docs_cleaned_bm25['text'].apply(data_clean_for_bm25)\n","queries_cleaned_bm25"]},{"cell_type":"code","execution_count":6,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T18:57:10.815397Z","iopub.status.busy":"2024-01-03T18:57:10.814883Z","iopub.status.idle":"2024-01-03T18:57:10.918743Z","shell.execute_reply":"2024-01-03T18:57:10.917514Z","shell.execute_reply.started":"2024-01-03T18:57:10.815350Z"},"trusted":true},"outputs":[],"source":["corpus = docs_cleaned_bm25['text'].to_list()\n","bm25 = BM25Okapi(corpus)"]},{"cell_type":"code","execution_count":7,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T18:57:25.428988Z","iopub.status.busy":"2024-01-03T18:57:25.428280Z","iopub.status.idle":"2024-01-03T18:57:25.442445Z","shell.execute_reply":"2024-01-03T18:57:25.440524Z","shell.execute_reply.started":"2024-01-03T18:57:25.428941Z"},"trusted":true},"outputs":[],"source":["def initial_retrieval_bm25(query_text, bm25, k):\n","    query = query_text\n","    document_ids = docs_cleaned['id'].to_list()\n","    tokenized_query = query.split(\" \")\n","    doc_scores = bm25.get_scores(tokenized_query)\n","    doc_scores_dict = dict(zip(document_ids, doc_scores))\n","    most_similar_init_k_documents = {doc_id: [score] for doc_id, score in sorted(doc_scores_dict.items(), key=lambda item: item[1], reverse=True)}\n","    most_similar_k_documents = {}\n","    counter = 0\n","    for id, score in most_similar_init_k_documents.items():\n","        try:\n","            if counter == k:\n","                break\n","            most_similar_k_documents[id]=score\n","            counter += 1\n","        except:\n","            break\n","\n","    return most_similar_k_documents   "]},{"cell_type":"markdown","metadata":{},"source":["test initial retrieval"]},{"cell_type":"code","execution_count":8,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T20:29:57.136014Z","iopub.status.busy":"2024-01-03T20:29:57.135199Z","iopub.status.idle":"2024-01-03T20:29:57.164581Z","shell.execute_reply":"2024-01-03T20:29:57.163070Z","shell.execute_reply.started":"2024-01-03T20:29:57.135955Z"},"trusted":true},"outputs":[{"data":{"text/plain":["{1399: [11.776525442346998],\n"," 166: [10.781292227767148],\n"," 1071: [10.763945625290491],\n"," 1096: [9.519484116329025],\n"," 523: [7.7636491859711825],\n"," 145: [7.635257862877457],\n"," 374: [7.63466463549917],\n"," 810: [7.1141474648611664],\n"," 778: [6.9228491481657555],\n"," 1054: [6.886386163257785]}"]},"execution_count":8,"metadata":{},"output_type":"execute_result"}],"source":["initial_retrieval_bm25(queries_cleaned['text'][1], bm25, 10)"]},{"cell_type":"markdown","metadata":{},"source":["retrieve documents for all queries"]},{"cell_type":"code","execution_count":9,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T19:43:48.581787Z","iopub.status.busy":"2024-01-03T19:43:48.580745Z","iopub.status.idle":"2024-01-03T19:43:53.300653Z","shell.execute_reply":"2024-01-03T19:43:53.299119Z","shell.execute_reply.started":"2024-01-03T19:43:48.581700Z"},"trusted":true},"outputs":[],"source":["initial_retrieval = dict()\n","initial_retrieval_with_bm25_scores = dict()\n","for index, row in queries_cleaned.iterrows():\n","    query_id = row[0]\n","    query_text = row[1]\n","    retrieved_documents = initial_retrieval_bm25(query_text, bm25, 100)\n","    initial_retrieval[query_id] = list(retrieved_documents.keys())\n","    initial_retrieval_with_bm25_scores[query_id] = retrieved_documents\n","\n","with open('./results/initial_retrieval_with_bm25_scores.pkl', 'wb') as f:\n","    pickle.dump(initial_retrieval_with_bm25_scores, f)"]}],"metadata":{"kaggle":{"accelerator":"none","dataSources":[{"datasetId":6763,"sourceId":9801,"sourceType":"datasetVersion"},{"datasetId":576263,"sourceId":1043323,"sourceType":"datasetVersion"},{"datasetId":4135603,"sourceId":7160356,"sourceType":"datasetVersion"},{"datasetId":4137237,"sourceId":7162602,"sourceType":"datasetVersion"}],"dockerImageVersionId":30120,"isGpuEnabled":false,"isInternetEnabled":true,"language":"python","sourceType":"notebook"},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.12"}},"nbformat":4,"nbformat_minor":4}