Newer
Older
{"cells":[{"cell_type":"markdown","metadata":{},"source":["<h2> Imports </h2>"]},{"cell_type":"code","execution_count":1,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T18:52:18.410005Z","iopub.status.busy":"2024-01-03T18:52:18.409290Z","iopub.status.idle":"2024-01-03T18:56:01.555478Z","shell.execute_reply":"2024-01-03T18:56:01.553732Z","shell.execute_reply.started":"2024-01-03T18:52:18.409894Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["Defaulting to user installation because normal site-packages is not writeable\n","Requirement already satisfied: rank_bm25 in /home/hanna/.local/lib/python3.10/site-packages (0.2.2)\n","Requirement already satisfied: numpy in /home/hanna/.local/lib/python3.10/site-packages (from rank_bm25) (1.24.1)\n","\n","\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.1.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.3.2\u001b[0m\n","\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n","Note: you may need to restart the kernel to use updated packages.\n"]}],"source":["%pip install rank_bm25"]},{"cell_type":"code","execution_count":2,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T18:56:01.558915Z","iopub.status.busy":"2024-01-03T18:56:01.558368Z","iopub.status.idle":"2024-01-03T18:56:32.924438Z","shell.execute_reply":"2024-01-03T18:56:32.919977Z","shell.execute_reply.started":"2024-01-03T18:56:01.558854Z"},"trusted":true},"outputs":[],"source":["import nltk\n","from nltk.corpus import stopwords\n","from nltk.tokenize import word_tokenize\n","from nltk.tokenize.toktok import ToktokTokenizer\n","import re\n","stopword_list = nltk.corpus.stopwords.words('english')\n","import pandas as pd\n","from tqdm import tqdm\n","tqdm.pandas()\n","from rank_bm25 import BM25Okapi\n","import pandas as pd \n","import string\n","from nltk.stem import WordNetLemmatizer\n","import pickle \n"]},{"cell_type":"markdown","metadata":{},"source":["<h2> Load datasets and model</h2>"]},{"cell_type":"code","execution_count":3,"metadata":{"_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","execution":{"iopub.execute_input":"2024-01-03T18:56:32.931742Z","iopub.status.busy":"2024-01-03T18:56:32.929029Z","iopub.status.idle":"2024-01-03T18:56:33.045085Z","shell.execute_reply":"2024-01-03T18:56:33.043862Z","shell.execute_reply.started":"2024-01-03T18:56:32.931643Z"},"trusted":true},"outputs":[],"source":["queries = pd.read_csv(\"./data/cisi-csv/queries.csv\")\n","docs = pd.read_csv(\"./data/cisi-csv/docs.csv\")\n","rels = pd.read_csv(\"./data/cisi-csv/rels.csv\")\n","\n","full_doc = docs['text'].to_list()\n","full_query = queries['text'].to_list()"]},{"cell_type":"markdown","metadata":{},"source":["<h2>Initial retrieval with bm25</h2>"]},{"cell_type":"code","execution_count":4,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T18:57:01.149061Z","iopub.status.busy":"2024-01-03T18:57:01.148640Z","iopub.status.idle":"2024-01-03T18:57:01.166124Z","shell.execute_reply":"2024-01-03T18:57:01.164536Z","shell.execute_reply.started":"2024-01-03T18:57:01.149026Z"},"trusted":true},"outputs":[],"source":["# just the same code as above to clean the df texts for bm25\n","def data_clean_df(text):\n"," # Regex pattern to keep only alphanumeric characters and spaces\n"," pattern = r'[^a-zA-Z0-9\\s]'\n"," text = re.sub(pattern, '', text)\n"," tokens = [token.strip() for token in text.split()]\n"," return ' '.join(tokens)\n","\n","#some queries have a .T in the begining we want to remove this\n","def clean_query(text):\n"," pattern = r'^\\.T\\s'\n"," text = re.sub(pattern, '', text)\n"," tokens = [token.strip() for token in text.split()]\n"," return ' '.join(tokens)\n","\n","#special pre-processing for bm25, because for embeddings we don't want to pre-process that much\n","def data_clean_for_bm25(text):\n"," # Lowercasing the text\n"," text = text.lower()\n"," # Removing digits\n"," text = re.sub(r'\\d+', '', text)\n"," # Removing punctuation\n"," translator = str.maketrans('', '', string.punctuation)\n"," text = text.translate(translator)\n"," # Whitespace normalization\n"," text = \" \".join(text.split())\n"," # Stopword removal\n"," stop_words = set(stopwords.words(\"english\"))\n"," word_tokens = word_tokenize(text)\n"," filtered_words = [word for word in word_tokens if word not in stop_words]\n"," # Lemmatization\n"," lemmatizer = WordNetLemmatizer()\n"," lemmas = [lemmatizer.lemmatize(word) for word in filtered_words]\n","\n"," return lemmas"]},{"cell_type":"code","execution_count":5,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T18:57:04.266231Z","iopub.status.busy":"2024-01-03T18:57:04.265732Z","iopub.status.idle":"2024-01-03T18:57:10.812646Z","shell.execute_reply":"2024-01-03T18:57:10.811193Z","shell.execute_reply.started":"2024-01-03T18:57:04.266183Z"},"trusted":true},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>id</th>\n"," <th>text</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>1</td>\n"," <td>[problem, concern, making, descriptive, title,...</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>2</td>\n"," <td>[actually, pertinent, data, opposed, reference...</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>3</td>\n"," <td>[information, science, give, definition, possi...</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>4</td>\n"," <td>[image, recognition, method, automatically, tr...</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>5</td>\n"," <td>[special, training, ordinary, researcher, busi...</td>\n"," </tr>\n"," <tr>\n"," <th>...</th>\n"," <td>...</td>\n"," <td>...</td>\n"," </tr>\n"," <tr>\n"," <th>107</th>\n"," <td>108</td>\n"," <td>[program, machinemediated, searching, toliver,...</td>\n"," </tr>\n"," <tr>\n"," <th>108</th>\n"," <td>109</td>\n"," <td>[author, cocitation, literature, measure, inte...</td>\n"," </tr>\n"," <tr>\n"," <th>109</th>\n"," <td>110</td>\n"," <td>[progress, documentation, word, processing, in...</td>\n"," </tr>\n"," <tr>\n"," <th>110</th>\n"," <td>111</td>\n"," <td>[document, clustering, using, inverted, file, ...</td>\n"," </tr>\n"," <tr>\n"," <th>111</th>\n"," <td>112</td>\n"," <td>[fast, procedure, calculation, similarity, coe...</td>\n"," </tr>\n"," </tbody>\n","</table>\n","<p>112 rows × 2 columns</p>\n","</div>"],"text/plain":[" id text\n","0 1 [problem, concern, making, descriptive, title,...\n","1 2 [actually, pertinent, data, opposed, reference...\n","2 3 [information, science, give, definition, possi...\n","3 4 [image, recognition, method, automatically, tr...\n","4 5 [special, training, ordinary, researcher, busi...\n",".. ... ...\n","107 108 [program, machinemediated, searching, toliver,...\n","108 109 [author, cocitation, literature, measure, inte...\n","109 110 [progress, documentation, word, processing, in...\n","110 111 [document, clustering, using, inverted, file, ...\n","111 112 [fast, procedure, calculation, similarity, coe...\n","\n","[112 rows x 2 columns]"]},"execution_count":5,"metadata":{},"output_type":"execute_result"}],"source":["queries_cleaned = queries.copy()\n","queries_cleaned['text'] = queries_cleaned['text'].apply(data_clean_df)\n","queries_cleaned['text'] = queries_cleaned['text'].apply(clean_query)\n","\n","docs_cleaned = docs.copy()\n","docs_cleaned['text'] = docs_cleaned['text'].apply(data_clean_df)\n","docs_cleaned\n","\n","queries_cleaned_bm25 = queries.copy()\n","queries_cleaned_bm25['text'] = queries_cleaned_bm25['text'].apply(data_clean_for_bm25)\n","\n","docs_cleaned_bm25 = docs.copy()\n","docs_cleaned_bm25['text'] = docs_cleaned_bm25['text'].apply(data_clean_for_bm25)\n","queries_cleaned_bm25"]},{"cell_type":"code","execution_count":6,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T18:57:10.815397Z","iopub.status.busy":"2024-01-03T18:57:10.814883Z","iopub.status.idle":"2024-01-03T18:57:10.918743Z","shell.execute_reply":"2024-01-03T18:57:10.917514Z","shell.execute_reply.started":"2024-01-03T18:57:10.815350Z"},"trusted":true},"outputs":[],"source":["corpus = docs_cleaned_bm25['text'].to_list()\n","bm25 = BM25Okapi(corpus)"]},{"cell_type":"code","execution_count":7,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T18:57:25.428988Z","iopub.status.busy":"2024-01-03T18:57:25.428280Z","iopub.status.idle":"2024-01-03T18:57:25.442445Z","shell.execute_reply":"2024-01-03T18:57:25.440524Z","shell.execute_reply.started":"2024-01-03T18:57:25.428941Z"},"trusted":true},"outputs":[],"source":["def initial_retrieval_bm25(query_text, bm25, k):\n"," query = query_text\n"," document_ids = docs_cleaned['id'].to_list()\n"," tokenized_query = query.split(\" \")\n"," doc_scores = bm25.get_scores(tokenized_query)\n"," doc_scores_dict = dict(zip(document_ids, doc_scores))\n"," most_similar_init_k_documents = {doc_id: [score] for doc_id, score in sorted(doc_scores_dict.items(), key=lambda item: item[1], reverse=True)}\n"," most_similar_k_documents = {}\n"," counter = 0\n"," for id, score in most_similar_init_k_documents.items():\n"," try:\n"," if counter == k:\n"," break\n"," most_similar_k_documents[id]=score\n"," counter += 1\n"," except:\n"," break\n","\n"," return most_similar_k_documents "]},{"cell_type":"markdown","metadata":{},"source":["test initial retrieval"]},{"cell_type":"code","execution_count":8,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T20:29:57.136014Z","iopub.status.busy":"2024-01-03T20:29:57.135199Z","iopub.status.idle":"2024-01-03T20:29:57.164581Z","shell.execute_reply":"2024-01-03T20:29:57.163070Z","shell.execute_reply.started":"2024-01-03T20:29:57.135955Z"},"trusted":true},"outputs":[{"data":{"text/plain":["{1399: [11.776525442346998],\n"," 166: [10.781292227767148],\n"," 1071: [10.763945625290491],\n"," 1096: [9.519484116329025],\n"," 523: [7.7636491859711825],\n"," 145: [7.635257862877457],\n"," 374: [7.63466463549917],\n"," 810: [7.1141474648611664],\n"," 778: [6.9228491481657555],\n"," 1054: [6.886386163257785]}"]},"execution_count":8,"metadata":{},"output_type":"execute_result"}],"source":["initial_retrieval_bm25(queries_cleaned['text'][1], bm25, 10)"]},{"cell_type":"markdown","metadata":{},"source":["retrieve documents for all queries"]},{"cell_type":"code","execution_count":9,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T19:43:48.581787Z","iopub.status.busy":"2024-01-03T19:43:48.580745Z","iopub.status.idle":"2024-01-03T19:43:53.300653Z","shell.execute_reply":"2024-01-03T19:43:53.299119Z","shell.execute_reply.started":"2024-01-03T19:43:48.581700Z"},"trusted":true},"outputs":[],"source":["initial_retrieval = dict()\n","initial_retrieval_with_bm25_scores = dict()\n","for index, row in queries_cleaned.iterrows():\n"," query_id = row[0]\n"," query_text = row[1]\n"," retrieved_documents = initial_retrieval_bm25(query_text, bm25, 100)\n"," initial_retrieval[query_id] = list(retrieved_documents.keys())\n"," initial_retrieval_with_bm25_scores[query_id] = retrieved_documents\n","\n","with open('./results/initial_retrieval_with_bm25_scores.pkl', 'wb') as f:\n"," pickle.dump(initial_retrieval_with_bm25_scores, f)"]}],"metadata":{"kaggle":{"accelerator":"none","dataSources":[{"datasetId":6763,"sourceId":9801,"sourceType":"datasetVersion"},{"datasetId":576263,"sourceId":1043323,"sourceType":"datasetVersion"},{"datasetId":4135603,"sourceId":7160356,"sourceType":"datasetVersion"},{"datasetId":4137237,"sourceId":7162602,"sourceType":"datasetVersion"}],"dockerImageVersionId":30120,"isGpuEnabled":false,"isInternetEnabled":true,"language":"python","sourceType":"notebook"},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.12"}},"nbformat":4,"nbformat_minor":4}