Skip to content
Snippets Groups Projects
initial-retrieval-kaggle.ipynb 365 KiB
Newer Older
{"cells":[{"cell_type":"markdown","metadata":{},"source":["<h2> Imports </h2>"]},{"cell_type":"code","execution_count":1,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T18:52:18.410005Z","iopub.status.busy":"2024-01-03T18:52:18.409290Z","iopub.status.idle":"2024-01-03T18:56:01.555478Z","shell.execute_reply":"2024-01-03T18:56:01.553732Z","shell.execute_reply.started":"2024-01-03T18:52:18.409894Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["Collecting tensorflow==2.5\n","  Downloading tensorflow-2.5.0-cp37-cp37m-manylinux2010_x86_64.whl (454.3 MB)\n","\u001b[K     |████████████████████████████████| 454.3 MB 11 kB/s s eta 0:00:01   |█▌                              | 20.7 MB 7.4 MB/s eta 0:00:59     |█████████████████████████▋      | 363.2 MB 62.2 MB/s eta 0:00:02     |██████████████████████████████▏ | 427.8 MB 46.2 MB/s eta 0:00:01\n","\u001b[?25hRequirement already satisfied: opt-einsum~=3.3.0 in /opt/conda/lib/python3.7/site-packages (from tensorflow==2.5) (3.3.0)\n","Requirement already satisfied: astunparse~=1.6.3 in /opt/conda/lib/python3.7/site-packages (from tensorflow==2.5) (1.6.3)\n","Requirement already satisfied: absl-py~=0.10 in /opt/conda/lib/python3.7/site-packages (from tensorflow==2.5) (0.12.0)\n","Requirement already satisfied: typing-extensions~=3.7.4 in /opt/conda/lib/python3.7/site-packages (from tensorflow==2.5) (3.7.4.3)\n","Requirement already satisfied: keras-preprocessing~=1.1.2 in /opt/conda/lib/python3.7/site-packages (from tensorflow==2.5) (1.1.2)\n","Collecting h5py~=3.1.0\n","  Downloading h5py-3.1.0-cp37-cp37m-manylinux1_x86_64.whl (4.0 MB)\n","\u001b[K     |████████████████████████████████| 4.0 MB 57.1 MB/s eta 0:00:01\n","\u001b[?25hCollecting gast==0.4.0\n","  Downloading gast-0.4.0-py3-none-any.whl (9.8 kB)\n","Requirement already satisfied: protobuf>=3.9.2 in /opt/conda/lib/python3.7/site-packages (from tensorflow==2.5) (3.17.3)\n","Collecting keras-nightly~=2.5.0.dev\n","  Downloading keras_nightly-2.5.0.dev2021032900-py2.py3-none-any.whl (1.2 MB)\n","\u001b[K     |████████████████████████████████| 1.2 MB 42.3 MB/s eta 0:00:01\n","\u001b[?25hRequirement already satisfied: wheel~=0.35 in /opt/conda/lib/python3.7/site-packages (from tensorflow==2.5) (0.36.2)\n","Requirement already satisfied: google-pasta~=0.2 in /opt/conda/lib/python3.7/site-packages (from tensorflow==2.5) (0.2.0)\n","Requirement already satisfied: six~=1.15.0 in /opt/conda/lib/python3.7/site-packages (from tensorflow==2.5) (1.15.0)\n","Requirement already satisfied: termcolor~=1.1.0 in /opt/conda/lib/python3.7/site-packages (from tensorflow==2.5) (1.1.0)\n","Requirement already satisfied: flatbuffers~=1.12.0 in /opt/conda/lib/python3.7/site-packages (from tensorflow==2.5) (1.12)\n","Collecting grpcio~=1.34.0\n","  Downloading grpcio-1.34.1-cp37-cp37m-manylinux2014_x86_64.whl (4.0 MB)\n","\u001b[K     |████████████████████████████████| 4.0 MB 56.1 MB/s eta 0:00:01\n","\u001b[?25hCollecting tensorflow-estimator<2.6.0,>=2.5.0rc0\n","  Downloading tensorflow_estimator-2.5.0-py2.py3-none-any.whl (462 kB)\n","\u001b[K     |████████████████████████████████| 462 kB 61.4 MB/s eta 0:00:01\n","\u001b[?25hRequirement already satisfied: numpy~=1.19.2 in /opt/conda/lib/python3.7/site-packages (from tensorflow==2.5) (1.19.5)\n","Collecting tensorboard~=2.5\n","  Downloading tensorboard-2.11.2-py3-none-any.whl (6.0 MB)\n","\u001b[K     |████████████████████████████████| 6.0 MB 38.8 MB/s eta 0:00:01\n","\u001b[?25hRequirement already satisfied: wrapt~=1.12.1 in /opt/conda/lib/python3.7/site-packages (from tensorflow==2.5) (1.12.1)\n","Collecting cached-property\n","  Downloading cached_property-1.5.2-py2.py3-none-any.whl (7.6 kB)\n","Requirement already satisfied: markdown>=2.6.8 in /opt/conda/lib/python3.7/site-packages (from tensorboard~=2.5->tensorflow==2.5) (3.3.4)\n","Requirement already satisfied: google-auth<3,>=1.6.3 in /opt/conda/lib/python3.7/site-packages (from tensorboard~=2.5->tensorflow==2.5) (1.30.2)\n","Requirement already satisfied: requests<3,>=2.21.0 in /opt/conda/lib/python3.7/site-packages (from tensorboard~=2.5->tensorflow==2.5) (2.25.1)\n","Requirement already satisfied: google-auth-oauthlib<0.5,>=0.4.1 in /opt/conda/lib/python3.7/site-packages (from tensorboard~=2.5->tensorflow==2.5) (0.4.4)\n","Requirement already satisfied: setuptools>=41.0.0 in /opt/conda/lib/python3.7/site-packages (from tensorboard~=2.5->tensorflow==2.5) (49.6.0.post20210108)\n","Requirement already satisfied: werkzeug>=1.0.1 in /opt/conda/lib/python3.7/site-packages (from tensorboard~=2.5->tensorflow==2.5) (2.0.1)\n","Requirement already satisfied: tensorboard-data-server<0.7.0,>=0.6.0 in /opt/conda/lib/python3.7/site-packages (from tensorboard~=2.5->tensorflow==2.5) (0.6.1)\n","Requirement already satisfied: tensorboard-plugin-wit>=1.6.0 in /opt/conda/lib/python3.7/site-packages (from tensorboard~=2.5->tensorflow==2.5) (1.8.0)\n","Requirement already satisfied: cachetools<5.0,>=2.0.0 in /opt/conda/lib/python3.7/site-packages (from google-auth<3,>=1.6.3->tensorboard~=2.5->tensorflow==2.5) (4.2.2)\n","Requirement already satisfied: rsa<5,>=3.1.4 in /opt/conda/lib/python3.7/site-packages (from google-auth<3,>=1.6.3->tensorboard~=2.5->tensorflow==2.5) (4.7.2)\n","Requirement already satisfied: pyasn1-modules>=0.2.1 in /opt/conda/lib/python3.7/site-packages (from google-auth<3,>=1.6.3->tensorboard~=2.5->tensorflow==2.5) (0.2.7)\n","Requirement already satisfied: requests-oauthlib>=0.7.0 in /opt/conda/lib/python3.7/site-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard~=2.5->tensorflow==2.5) (1.3.0)\n","Requirement already satisfied: importlib-metadata in /opt/conda/lib/python3.7/site-packages (from markdown>=2.6.8->tensorboard~=2.5->tensorflow==2.5) (3.4.0)\n","Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /opt/conda/lib/python3.7/site-packages (from pyasn1-modules>=0.2.1->google-auth<3,>=1.6.3->tensorboard~=2.5->tensorflow==2.5) (0.4.8)\n","Requirement already satisfied: idna<3,>=2.5 in /opt/conda/lib/python3.7/site-packages (from requests<3,>=2.21.0->tensorboard~=2.5->tensorflow==2.5) (2.10)\n","Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.7/site-packages (from requests<3,>=2.21.0->tensorboard~=2.5->tensorflow==2.5) (2021.5.30)\n","Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.7/site-packages (from requests<3,>=2.21.0->tensorboard~=2.5->tensorflow==2.5) (1.26.5)\n","Requirement already satisfied: chardet<5,>=3.0.2 in /opt/conda/lib/python3.7/site-packages (from requests<3,>=2.21.0->tensorboard~=2.5->tensorflow==2.5) (4.0.0)\n","Requirement already satisfied: oauthlib>=3.0.0 in /opt/conda/lib/python3.7/site-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard~=2.5->tensorflow==2.5) (3.1.1)\n","Requirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.7/site-packages (from importlib-metadata->markdown>=2.6.8->tensorboard~=2.5->tensorflow==2.5) (3.4.1)\n","Installing collected packages: grpcio, cached-property, tensorflow-estimator, tensorboard, keras-nightly, h5py, gast, tensorflow\n","  Attempting uninstall: grpcio\n","    Found existing installation: grpcio 1.32.0\n","    Uninstalling grpcio-1.32.0:\n","      Successfully uninstalled grpcio-1.32.0\n","  Attempting uninstall: tensorflow-estimator\n","    Found existing installation: tensorflow-estimator 2.4.0\n","    Uninstalling tensorflow-estimator-2.4.0:\n","      Successfully uninstalled tensorflow-estimator-2.4.0\n","  Attempting uninstall: tensorboard\n","    Found existing installation: tensorboard 2.4.1\n","    Uninstalling tensorboard-2.4.1:\n","      Successfully uninstalled tensorboard-2.4.1\n","  Attempting uninstall: h5py\n","    Found existing installation: h5py 2.10.0\n","    Uninstalling h5py-2.10.0:\n","      Successfully uninstalled h5py-2.10.0\n","  Attempting uninstall: gast\n","    Found existing installation: gast 0.3.3\n","    Uninstalling gast-0.3.3:\n","      Successfully uninstalled gast-0.3.3\n","  Attempting uninstall: tensorflow\n","    Found existing installation: tensorflow 2.4.1\n","    Uninstalling tensorflow-2.4.1:\n","      Successfully uninstalled tensorflow-2.4.1\n","Successfully installed cached-property-1.5.2 gast-0.4.0 grpcio-1.34.1 h5py-3.1.0 keras-nightly-2.5.0.dev2021032900 tensorboard-2.11.2 tensorflow-2.5.0 tensorflow-estimator-2.5.0\n","\u001b[33mWARNING: Running pip as root will break packages and permissions. You should install packages reliably by using venv: https://pip.pypa.io/warnings/venv\u001b[0m\n","Collecting rank_bm25\n","  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)\n","Requirement already satisfied: numpy in /opt/conda/lib/python3.7/site-packages (from rank_bm25) (1.19.5)\n","Installing collected packages: rank-bm25\n","Successfully installed rank-bm25-0.2.2\n","\u001b[33mWARNING: Running pip as root will break packages and permissions. You should install packages reliably by using venv: https://pip.pypa.io/warnings/venv\u001b[0m\n","\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n","yellowbrick 1.3.post1 requires numpy<1.20,>=1.16.0, but you have numpy 1.21.6 which is incompatible.\n","tensorflow 2.5.0 requires numpy~=1.19.2, but you have numpy 1.21.6 which is incompatible.\n","s3fs 2021.6.1 requires fsspec==2021.06.1, but you have fsspec 2023.1.0 which is incompatible.\n","pdpbox 0.2.1 requires matplotlib==3.1.1, but you have matplotlib 3.4.2 which is incompatible.\n","matrixprofile 1.1.10 requires protobuf==3.11.2, but you have protobuf 3.17.3 which is incompatible.\n","kornia 0.5.5 requires numpy<=1.19, but you have numpy 1.21.6 which is incompatible.\n","imbalanced-learn 0.8.0 requires scikit-learn>=0.24, but you have scikit-learn 0.23.2 which is incompatible.\n","gcsfs 2021.6.0 requires fsspec==2021.06.0, but you have fsspec 2023.1.0 which is incompatible.\n","allennlp 2.5.0 requires transformers<4.7,>=4.1, but you have transformers 4.30.2 which is incompatible.\u001b[0m\n","\u001b[33mWARNING: Running pip as root will break packages and permissions. You should install packages reliably by using venv: https://pip.pypa.io/warnings/venv\u001b[0m\n","Note: you may need to restart the kernel to use updated packages.\n"]}],"source":["!pip install tensorflow==2.5\n","!pip install rank_bm25\n","%pip install -Uq sentence-transformers faiss-cpu accelerate hdbscan bertopic evaluate kaleido datasets>=2.11"]},{"cell_type":"code","execution_count":39,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T18:56:01.558915Z","iopub.status.busy":"2024-01-03T18:56:01.558368Z","iopub.status.idle":"2024-01-03T18:56:32.924438Z","shell.execute_reply":"2024-01-03T18:56:32.919977Z","shell.execute_reply.started":"2024-01-03T18:56:01.558854Z"},"trusted":true},"outputs":[],"source":["import gensim\n","import numpy as np\n","import nltk\n","from nltk.corpus import stopwords\n","from nltk.tokenize import word_tokenize\n","from scipy import spatial\n","from nltk.tokenize.toktok import ToktokTokenizer\n","import re\n","tokenizer = ToktokTokenizer()\n","stopword_list = nltk.corpus.stopwords.words('english')\n","import pandas as pd\n","from tqdm import tqdm\n","tqdm.pandas()\n","from rank_bm25 import BM25Okapi\n","from bertopic import BERTopic\n","from bertopic.vectorizers import ClassTfidfTransformer\n","import numpy as np\n","import pandas as pd \n","import os\n","from nltk.stem.porter import PorterStemmer\n","import string\n","from nltk.stem import WordNetLemmatizer\n","import matplotlib.pyplot as plt\n","import pickle \n"]},{"cell_type":"markdown","metadata":{},"source":["<h2> Load datasets and model</h2>"]},{"cell_type":"code","execution_count":30,"metadata":{"_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","execution":{"iopub.execute_input":"2024-01-03T18:56:32.931742Z","iopub.status.busy":"2024-01-03T18:56:32.929029Z","iopub.status.idle":"2024-01-03T18:56:33.045085Z","shell.execute_reply":"2024-01-03T18:56:33.043862Z","shell.execute_reply.started":"2024-01-03T18:56:32.931643Z"},"trusted":true},"outputs":[],"source":["queries = pd.read_csv(\"./data/cisi-csv/queries.csv\")\n","docs = pd.read_csv(\"./data/cisi-csv/docs.csv\")\n","rels = pd.read_csv(\"./data/cisi-csv/rels.csv\")\n","\n","full_doc = docs['text'].to_list()\n","full_query = queries['text'].to_list()"]},{"cell_type":"code","execution_count":31,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T18:56:33.047676Z","iopub.status.busy":"2024-01-03T18:56:33.046992Z","iopub.status.idle":"2024-01-03T18:56:34.042871Z","shell.execute_reply":"2024-01-03T18:56:34.041391Z","shell.execute_reply.started":"2024-01-03T18:56:33.047624Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["There are 36 queries without a groundtruth.\n","Remaining queries: 76.\n"]},{"data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>id</th>\n","      <th>text</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>1</td>\n","      <td>What problems and concerns are there in making...</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>2</td>\n","      <td>How can actually pertinent data, as opposed to...</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>3</td>\n","      <td>What is information science? Give definitions ...</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>4</td>\n","      <td>Image recognition and any other methods of aut...</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>5</td>\n","      <td>What special training will ordinary researcher...</td>\n","    </tr>\n","    <tr>\n","      <th>...</th>\n","      <td>...</td>\n","      <td>...</td>\n","    </tr>\n","    <tr>\n","      <th>100</th>\n","      <td>101</td>\n","      <td>.T Parallel Computations in Information Retrie...</td>\n","    </tr>\n","    <tr>\n","      <th>101</th>\n","      <td>102</td>\n","      <td>.T The Measurement of Term Importance in Autom...</td>\n","    </tr>\n","    <tr>\n","      <th>103</th>\n","      <td>104</td>\n","      <td>.T The Selection of Good Search Terms .A van R...</td>\n","    </tr>\n","    <tr>\n","      <th>108</th>\n","      <td>109</td>\n","      <td>.T Author Cocitation: A Literature Measure of ...</td>\n","    </tr>\n","    <tr>\n","      <th>110</th>\n","      <td>111</td>\n","      <td>.T Document Clustering Using an Inverted File ...</td>\n","    </tr>\n","  </tbody>\n","</table>\n","<p>76 rows × 2 columns</p>\n","</div>"],"text/plain":["      id                                               text\n","0      1  What problems and concerns are there in making...\n","1      2  How can actually pertinent data, as opposed to...\n","2      3  What is information science? Give definitions ...\n","3      4  Image recognition and any other methods of aut...\n","4      5  What special training will ordinary researcher...\n","..   ...                                                ...\n","100  101  .T Parallel Computations in Information Retrie...\n","101  102  .T The Measurement of Term Importance in Autom...\n","103  104  .T The Selection of Good Search Terms .A van R...\n","108  109  .T Author Cocitation: A Literature Measure of ...\n","110  111  .T Document Clustering Using an Inverted File ...\n","\n","[76 rows x 2 columns]"]},"execution_count":31,"metadata":{},"output_type":"execute_result"}],"source":["# TODO: this is not really necessary I think? because\n","#remove queries where we don't have a groundtruth for:\n","queries_wo_gt = [36,38,40,47,48,51,53,59,60,63,64,68,70,72,73,74,75,77,78,80,83,85,86,87,88,89,91,93,94,103,105,106,107,108,110,112]\n","print(f'There are {len(queries_wo_gt)} queries without a groundtruth.')\n","print(f'Remaining queries: {len(queries)-len(queries_wo_gt)}.')\n","\n","queries = queries[~queries['id'].isin(queries_wo_gt)]\n","queries"]},{"cell_type":"code","execution_count":32,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T19:04:27.585430Z","iopub.status.busy":"2024-01-03T19:04:27.584998Z","iopub.status.idle":"2024-01-03T19:06:04.031449Z","shell.execute_reply":"2024-01-03T19:06:04.028796Z","shell.execute_reply.started":"2024-01-03T19:04:27.585396Z"},"trusted":true},"outputs":[{"ename":"FileNotFoundError","evalue":"[Errno 2] No such file or directory: '../input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin'","output_type":"error","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)","\u001b[1;32m/home/hanna/Documents/air-23/initial-retrieval.ipynb Cell 7\u001b[0m line \u001b[0;36m1\n\u001b[0;32m----> <a href='vscode-notebook-cell:/home/hanna/Documents/air-23/initial-retrieval.ipynb#Y125sZmlsZQ%3D%3D?line=0'>1</a>\u001b[0m model \u001b[39m=\u001b[39m gensim\u001b[39m.\u001b[39;49mmodels\u001b[39m.\u001b[39;49mKeyedVectors\u001b[39m.\u001b[39;49mload_word2vec_format(\u001b[39m'\u001b[39;49m\u001b[39m../input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin\u001b[39;49m\u001b[39m'\u001b[39;49m, binary\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m)\n","File \u001b[0;32m~/.local/lib/python3.10/site-packages/gensim/models/keyedvectors.py:1719\u001b[0m, in \u001b[0;36mKeyedVectors.load_word2vec_format\u001b[0;34m(cls, fname, fvocab, binary, encoding, unicode_errors, limit, datatype, no_header)\u001b[0m\n\u001b[1;32m   1672\u001b[0m \u001b[39m@classmethod\u001b[39m\n\u001b[1;32m   1673\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mload_word2vec_format\u001b[39m(\n\u001b[1;32m   1674\u001b[0m         \u001b[39mcls\u001b[39m, fname, fvocab\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, binary\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m, encoding\u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mutf8\u001b[39m\u001b[39m'\u001b[39m, unicode_errors\u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mstrict\u001b[39m\u001b[39m'\u001b[39m,\n\u001b[1;32m   1675\u001b[0m         limit\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, datatype\u001b[39m=\u001b[39mREAL, no_header\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m,\n\u001b[1;32m   1676\u001b[0m     ):\n\u001b[1;32m   1677\u001b[0m \u001b[39m    \u001b[39m\u001b[39m\"\"\"Load KeyedVectors from a file produced by the original C word2vec-tool format.\u001b[39;00m\n\u001b[1;32m   1678\u001b[0m \n\u001b[1;32m   1679\u001b[0m \u001b[39m    Warnings\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m   1717\u001b[0m \n\u001b[1;32m   1718\u001b[0m \u001b[39m    \"\"\"\u001b[39;00m\n\u001b[0;32m-> 1719\u001b[0m     \u001b[39mreturn\u001b[39;00m _load_word2vec_format(\n\u001b[1;32m   1720\u001b[0m         \u001b[39mcls\u001b[39;49m, fname, fvocab\u001b[39m=\u001b[39;49mfvocab, binary\u001b[39m=\u001b[39;49mbinary, encoding\u001b[39m=\u001b[39;49mencoding, unicode_errors\u001b[39m=\u001b[39;49municode_errors,\n\u001b[1;32m   1721\u001b[0m         limit\u001b[39m=\u001b[39;49mlimit, datatype\u001b[39m=\u001b[39;49mdatatype, no_header\u001b[39m=\u001b[39;49mno_header,\n\u001b[1;32m   1722\u001b[0m     )\n","File \u001b[0;32m~/.local/lib/python3.10/site-packages/gensim/models/keyedvectors.py:2048\u001b[0m, in \u001b[0;36m_load_word2vec_format\u001b[0;34m(cls, fname, fvocab, binary, encoding, unicode_errors, limit, datatype, no_header, binary_chunk_size)\u001b[0m\n\u001b[1;32m   2045\u001b[0m             counts[word] \u001b[39m=\u001b[39m \u001b[39mint\u001b[39m(count)\n\u001b[1;32m   2047\u001b[0m logger\u001b[39m.\u001b[39minfo(\u001b[39m\"\u001b[39m\u001b[39mloading projection weights from \u001b[39m\u001b[39m%s\u001b[39;00m\u001b[39m\"\u001b[39m, fname)\n\u001b[0;32m-> 2048\u001b[0m \u001b[39mwith\u001b[39;00m utils\u001b[39m.\u001b[39;49mopen(fname, \u001b[39m'\u001b[39;49m\u001b[39mrb\u001b[39;49m\u001b[39m'\u001b[39;49m) \u001b[39mas\u001b[39;00m fin:\n\u001b[1;32m   2049\u001b[0m     \u001b[39mif\u001b[39;00m no_header:\n\u001b[1;32m   2050\u001b[0m         \u001b[39m# deduce both vocab_size & vector_size from 1st pass over file\u001b[39;00m\n\u001b[1;32m   2051\u001b[0m         \u001b[39mif\u001b[39;00m binary:\n","File \u001b[0;32m~/.local/lib/python3.10/site-packages/smart_open/smart_open_lib.py:177\u001b[0m, in \u001b[0;36mopen\u001b[0;34m(uri, mode, buffering, encoding, errors, newline, closefd, opener, compression, transport_params)\u001b[0m\n\u001b[1;32m    174\u001b[0m \u001b[39mif\u001b[39;00m transport_params \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m    175\u001b[0m     transport_params \u001b[39m=\u001b[39m {}\n\u001b[0;32m--> 177\u001b[0m fobj \u001b[39m=\u001b[39m _shortcut_open(\n\u001b[1;32m    178\u001b[0m     uri,\n\u001b[1;32m    179\u001b[0m     mode,\n\u001b[1;32m    180\u001b[0m     compression\u001b[39m=\u001b[39;49mcompression,\n\u001b[1;32m    181\u001b[0m     buffering\u001b[39m=\u001b[39;49mbuffering,\n\u001b[1;32m    182\u001b[0m     encoding\u001b[39m=\u001b[39;49mencoding,\n\u001b[1;32m    183\u001b[0m     errors\u001b[39m=\u001b[39;49merrors,\n\u001b[1;32m    184\u001b[0m     newline\u001b[39m=\u001b[39;49mnewline,\n\u001b[1;32m    185\u001b[0m )\n\u001b[1;32m    186\u001b[0m \u001b[39mif\u001b[39;00m fobj \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m    187\u001b[0m     \u001b[39mreturn\u001b[39;00m fobj\n","File \u001b[0;32m~/.local/lib/python3.10/site-packages/smart_open/smart_open_lib.py:363\u001b[0m, in \u001b[0;36m_shortcut_open\u001b[0;34m(uri, mode, compression, buffering, encoding, errors, newline)\u001b[0m\n\u001b[1;32m    360\u001b[0m \u001b[39mif\u001b[39;00m errors \u001b[39mand\u001b[39;00m \u001b[39m'\u001b[39m\u001b[39mb\u001b[39m\u001b[39m'\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m mode:\n\u001b[1;32m    361\u001b[0m     open_kwargs[\u001b[39m'\u001b[39m\u001b[39merrors\u001b[39m\u001b[39m'\u001b[39m] \u001b[39m=\u001b[39m errors\n\u001b[0;32m--> 363\u001b[0m \u001b[39mreturn\u001b[39;00m _builtin_open(local_path, mode, buffering\u001b[39m=\u001b[39;49mbuffering, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mopen_kwargs)\n","\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '../input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin'"]}],"source":["model = gensim.models.KeyedVectors.load_word2vec_format('./models/GoogleNews-vectors-negative300.bin', binary=True)"]},{"cell_type":"markdown","metadata":{},"source":["<h2>Initial retrieval with bm25</h2>"]},{"cell_type":"code","execution_count":33,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T18:57:01.149061Z","iopub.status.busy":"2024-01-03T18:57:01.148640Z","iopub.status.idle":"2024-01-03T18:57:01.166124Z","shell.execute_reply":"2024-01-03T18:57:01.164536Z","shell.execute_reply.started":"2024-01-03T18:57:01.149026Z"},"trusted":true},"outputs":[],"source":["def data_clean(text):\n","    pattern = r'[^a-zA-Z0-9\\s]'\n","    text = re.sub(pattern,'',' '.join(text))\n","    tokens = [token.strip() for token in text.split()]\n","    filtered = [token for token in tokens if token.lower() not in stopword_list]\n","    filtered = ' '.join(filtered)\n","    return filtered\n","\n","# just the same code as above to clean the df texts for bm25\n","def data_clean_df(text):\n","    # Regex pattern to keep only alphanumeric characters and spaces\n","    pattern = r'[^a-zA-Z0-9\\s]'\n","    text = re.sub(pattern, '', text)\n","    tokens = [token.strip() for token in text.split()]\n","    return ' '.join(tokens)\n","\n","\n","#function is needed to get the texts of the relevant documents from initial retrieval\n","def get_texts_from_df(doc_ids, df):\n","    return df[df['id'].isin(doc_ids)]['text'].tolist()\n","\n","def embeddings(word):\n","    if word in model.key_to_index:\n","        return model.get_vector(word)\n","    else:\n","        return np.zeros(300)\n","    \n","def get_sim(average_vec_query, average_vec_docs):\n","    sim = [(1 - spatial.distance.cosine(average_vec_query, average_vec_docs))]\n","    return sim\n","\n","#some queries have a .T in the begining we want to remove this\n","def clean_query(text):\n","    pattern = r'^\\.T\\s'\n","    tokens = [token.strip() for token in text.split()]\n","    return ' '.join(tokens)\n","\n","#special pre-processing for bm25, because for embeddings we don't want to pre-process that much\n","def data_clean_for_bm25(text):\n","   # Lowercasing the text\n","    text = text.lower()\n","    # Removing digits\n","    text = re.sub(r'\\d+', '', text)\n","    # Removing punctuation\n","    translator = str.maketrans('', '', string.punctuation)\n","    text = text.translate(translator)\n","    # Whitespace normalization\n","    text = \" \".join(text.split())\n","    # Stopword removal\n","    stop_words = set(stopwords.words(\"english\"))\n","    word_tokens = word_tokenize(text)\n","    filtered_words = [word for word in word_tokens if word not in stop_words]\n","    # Lemmatization\n","    lemmatizer = WordNetLemmatizer()\n","    lemmas = [lemmatizer.lemmatize(word) for word in filtered_words]\n","\n","    return lemmas"]},{"cell_type":"code","execution_count":34,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T18:57:04.266231Z","iopub.status.busy":"2024-01-03T18:57:04.265732Z","iopub.status.idle":"2024-01-03T18:57:10.812646Z","shell.execute_reply":"2024-01-03T18:57:10.811193Z","shell.execute_reply.started":"2024-01-03T18:57:04.266183Z"},"trusted":true},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>id</th>\n","      <th>text</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>1</td>\n","      <td>[problem, concern, making, descriptive, title,...</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>2</td>\n","      <td>[actually, pertinent, data, opposed, reference...</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>3</td>\n","      <td>[information, science, give, definition, possi...</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>4</td>\n","      <td>[image, recognition, method, automatically, tr...</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>5</td>\n","      <td>[special, training, ordinary, researcher, busi...</td>\n","    </tr>\n","    <tr>\n","      <th>...</th>\n","      <td>...</td>\n","      <td>...</td>\n","    </tr>\n","    <tr>\n","      <th>100</th>\n","      <td>101</td>\n","      <td>[parallel, computation, information, retrieval...</td>\n","    </tr>\n","    <tr>\n","      <th>101</th>\n","      <td>102</td>\n","      <td>[measurement, term, importance, automatic, ind...</td>\n","    </tr>\n","    <tr>\n","      <th>103</th>\n","      <td>104</td>\n","      <td>[selection, good, search, term, van, rijsberge...</td>\n","    </tr>\n","    <tr>\n","      <th>108</th>\n","      <td>109</td>\n","      <td>[author, cocitation, literature, measure, inte...</td>\n","    </tr>\n","    <tr>\n","      <th>110</th>\n","      <td>111</td>\n","      <td>[document, clustering, using, inverted, file, ...</td>\n","    </tr>\n","  </tbody>\n","</table>\n","<p>76 rows × 2 columns</p>\n","</div>"],"text/plain":["      id                                               text\n","0      1  [problem, concern, making, descriptive, title,...\n","1      2  [actually, pertinent, data, opposed, reference...\n","2      3  [information, science, give, definition, possi...\n","3      4  [image, recognition, method, automatically, tr...\n","4      5  [special, training, ordinary, researcher, busi...\n","..   ...                                                ...\n","100  101  [parallel, computation, information, retrieval...\n","101  102  [measurement, term, importance, automatic, ind...\n","103  104  [selection, good, search, term, van, rijsberge...\n","108  109  [author, cocitation, literature, measure, inte...\n","110  111  [document, clustering, using, inverted, file, ...\n","\n","[76 rows x 2 columns]"]},"execution_count":34,"metadata":{},"output_type":"execute_result"}],"source":["queries_cleaned = queries.copy()\n","queries_cleaned['text'] = queries_cleaned['text'].apply(data_clean_df)\n","queries_cleaned['text'] = queries_cleaned['text'].apply(clean_query)\n","\n","docs_cleaned = docs.copy()\n","docs_cleaned['text'] = docs_cleaned['text'].apply(data_clean_df)\n","docs_cleaned\n","\n","queries_cleaned_bm25 = queries.copy()\n","queries_cleaned_bm25['text'] = queries_cleaned_bm25['text'].apply(data_clean_for_bm25)\n","\n","docs_cleaned_bm25 = docs.copy()\n","docs_cleaned_bm25['text'] = docs_cleaned_bm25['text'].apply(data_clean_for_bm25)\n","queries_cleaned_bm25"]},{"cell_type":"code","execution_count":35,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T18:57:10.815397Z","iopub.status.busy":"2024-01-03T18:57:10.814883Z","iopub.status.idle":"2024-01-03T18:57:10.918743Z","shell.execute_reply":"2024-01-03T18:57:10.917514Z","shell.execute_reply.started":"2024-01-03T18:57:10.815350Z"},"trusted":true},"outputs":[],"source":["corpus = docs_cleaned_bm25['text'].to_list()\n","bm25 = BM25Okapi(corpus)"]},{"cell_type":"code","execution_count":36,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T18:57:25.428988Z","iopub.status.busy":"2024-01-03T18:57:25.428280Z","iopub.status.idle":"2024-01-03T18:57:25.442445Z","shell.execute_reply":"2024-01-03T18:57:25.440524Z","shell.execute_reply.started":"2024-01-03T18:57:25.428941Z"},"trusted":true},"outputs":[],"source":["def initial_retrieval_bm25(query_id, query_text, bm25, k):\n","    query = query_text\n","    document_ids = docs_cleaned['id'].to_list()\n","    tokenized_query = query.split(\" \")\n","    doc_scores = bm25.get_scores(tokenized_query)\n","    doc_scores_dict = dict(zip(document_ids, doc_scores))\n","    #print(doc_scores_dict)\n","    most_similar_init_k_documents = {doc_id: [score] for doc_id, score in sorted(doc_scores_dict.items(), key=lambda item: item[1], reverse=True)}\n","    most_similar_k_documents = {}\n","    counter = 0\n","    for id, score in most_similar_init_k_documents.items():\n","        try:\n","            if counter == k:\n","                break\n","            most_similar_k_documents[id]=score\n","            counter += 1\n","        except:\n","            break\n","\n","    return most_similar_k_documents   "]},{"cell_type":"markdown","metadata":{},"source":["test initial retrieval"]},{"cell_type":"code","execution_count":37,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T20:29:57.136014Z","iopub.status.busy":"2024-01-03T20:29:57.135199Z","iopub.status.idle":"2024-01-03T20:29:57.164581Z","shell.execute_reply":"2024-01-03T20:29:57.163070Z","shell.execute_reply.started":"2024-01-03T20:29:57.135955Z"},"trusted":true},"outputs":[{"data":{"text/plain":["{1399: [11.776525442346998],\n"," 166: [10.781292227767148],\n"," 1071: [10.763945625290491],\n"," 1096: [9.519484116329025],\n"," 523: [7.7636491859711825],\n"," 145: [7.635257862877457],\n"," 374: [7.63466463549917],\n"," 810: [7.1141474648611664],\n"," 778: [6.9228491481657555],\n"," 1054: [6.886386163257785]}"]},"execution_count":37,"metadata":{},"output_type":"execute_result"}],"source":["initial_retrieval_bm25(2, queries_cleaned['text'][1], bm25, 10)"]},{"cell_type":"markdown","metadata":{},"source":["retrieve documents for all queries"]},{"cell_type":"code","execution_count":38,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T19:43:48.581787Z","iopub.status.busy":"2024-01-03T19:43:48.580745Z","iopub.status.idle":"2024-01-03T19:43:53.300653Z","shell.execute_reply":"2024-01-03T19:43:53.299119Z","shell.execute_reply.started":"2024-01-03T19:43:48.581700Z"},"trusted":true},"outputs":[],"source":["initial_retrieval = dict()\n","initial_retrieval_with_scores = dict()\n","for index, row in queries_cleaned.iterrows():\n","    query_id = row[0]\n","    query_text = row[1]\n","    retrieved_documents = initial_retrieval_bm25(query_id, query_text, bm25, 100)\n","    initial_retrieval[query_id] = list(retrieved_documents.keys())\n","    initial_retrieval_with_scores[query_id] = retrieved_documents\n","\n","\n","with open('saved_dictionary.pkl', 'wb') as f:\n","    pickle.dump(dictionary, f)\n","        \n","with open('saved_dictionary.pkl', 'rb') as f:\n","    loaded_dict = pickle.load(f)"]},{"cell_type":"markdown","metadata":{},"source":["<h2> Re-rank with word embeddings and cosine similarity </h2>"]},{"cell_type":"code","execution_count":21,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T19:06:04.035877Z","iopub.status.busy":"2024-01-03T19:06:04.035379Z","iopub.status.idle":"2024-01-03T19:06:07.936834Z","shell.execute_reply":"2024-01-03T19:06:07.935286Z","shell.execute_reply.started":"2024-01-03T19:06:04.035826Z"},"trusted":true},"outputs":[],"source":["# Average vector for each document\n","out_dict_docs = {}\n","n = 1\n","for sen in full_doc:\n","    average_vector = (np.mean(np.array([embeddings(x) for x in data_clean(nltk.word_tokenize(sen)).split()]), axis=0))\n","    d1 = {n: (average_vector)}\n","    out_dict_docs.update(d1)\n","    n +=1 "]},{"cell_type":"code","execution_count":22,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T19:06:07.940189Z","iopub.status.busy":"2024-01-03T19:06:07.939757Z","iopub.status.idle":"2024-01-03T19:06:08.212281Z","shell.execute_reply":"2024-01-03T19:06:08.210836Z","shell.execute_reply.started":"2024-01-03T19:06:07.940125Z"},"trusted":true},"outputs":[],"source":["# Average vector for each document\n","out_dict_queries = {}\n","n = 1\n","for sen in full_query:\n","    average_vector = (np.mean(np.array([embeddings(x) for x in data_clean(nltk.word_tokenize(sen)).split()]), axis=0))\n","    d1 = {n: (average_vector)}\n","    out_dict_queries.update(d1)\n","    n +=1 "]},{"cell_type":"code","execution_count":23,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T19:06:08.214490Z","iopub.status.busy":"2024-01-03T19:06:08.214177Z","iopub.status.idle":"2024-01-03T19:06:08.264818Z","shell.execute_reply":"2024-01-03T19:06:08.263685Z","shell.execute_reply.started":"2024-01-03T19:06:08.214460Z"},"trusted":true},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>id</th>\n","      <th>text</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>1</td>\n","      <td>[0.053237017, 0.07729071, -0.07088695, 0.08220...</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>2</td>\n","      <td>[-0.019184113, -0.014770508, 0.012769063, 0.02...</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>3</td>\n","      <td>[0.0105957035, -0.0440918, 0.12583008, 0.15257...</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>4</td>\n","      <td>[0.03293185763888889, 0.05387708875868055, -0....</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>5</td>\n","      <td>[-0.013476203, 0.03181727, -0.039708756, 0.028...</td>\n","    </tr>\n","    <tr>\n","      <th>...</th>\n","      <td>...</td>\n","      <td>...</td>\n","    </tr>\n","    <tr>\n","      <th>107</th>\n","      <td>108</td>\n","      <td>[-0.013725492689344618, -0.03991911146375868, ...</td>\n","    </tr>\n","    <tr>\n","      <th>108</th>\n","      <td>109</td>\n","      <td>[0.016849247276360262, -0.006489138231209829, ...</td>\n","    </tr>\n","    <tr>\n","      <th>109</th>\n","      <td>110</td>\n","      <td>[0.0070692516508556544, 0.009782482328869047, ...</td>\n","    </tr>\n","    <tr>\n","      <th>110</th>\n","      <td>111</td>\n","      <td>[0.06036170054290254, 0.027709379034527276, 0....</td>\n","    </tr>\n","    <tr>\n","      <th>111</th>\n","      <td>112</td>\n","      <td>[0.022738986545138888, 0.027384086891456886, 0...</td>\n","    </tr>\n","  </tbody>\n","</table>\n","<p>112 rows × 2 columns</p>\n","</div>"],"text/plain":["      id                                               text\n","0      1  [0.053237017, 0.07729071, -0.07088695, 0.08220...\n","1      2  [-0.019184113, -0.014770508, 0.012769063, 0.02...\n","2      3  [0.0105957035, -0.0440918, 0.12583008, 0.15257...\n","3      4  [0.03293185763888889, 0.05387708875868055, -0....\n","4      5  [-0.013476203, 0.03181727, -0.039708756, 0.028...\n","..   ...                                                ...\n","107  108  [-0.013725492689344618, -0.03991911146375868, ...\n","108  109  [0.016849247276360262, -0.006489138231209829, ...\n","109  110  [0.0070692516508556544, 0.009782482328869047, ...\n","110  111  [0.06036170054290254, 0.027709379034527276, 0....\n","111  112  [0.022738986545138888, 0.027384086891456886, 0...\n","\n","[112 rows x 2 columns]"]},"execution_count":23,"metadata":{},"output_type":"execute_result"}],"source":["average_vec_queries = pd.DataFrame(list(out_dict_queries.items()), columns=['id', 'text'])\n","average_vec_queries"]},{"cell_type":"code","execution_count":25,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T19:06:08.323471Z","iopub.status.busy":"2024-01-03T19:06:08.322893Z","iopub.status.idle":"2024-01-03T19:06:08.382600Z","shell.execute_reply":"2024-01-03T19:06:08.381085Z","shell.execute_reply.started":"2024-01-03T19:06:08.323410Z"},"trusted":true},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>id</th>\n","      <th>text</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>1</td>\n","      <td>[0.00830413818359375, 0.00127777099609375, -0....</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>2</td>\n","      <td>[0.05055378758630087, -0.02922665795614553, 0....</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>3</td>\n","      <td>[0.07028712, 0.0061023016, 0.02422772, -0.0077...</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>4</td>\n","      <td>[-0.0112152099609375, 0.007620472019001589, 0....</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>5</td>\n","      <td>[0.028684964743993617, 0.040372458837365593, 0...</td>\n","    </tr>\n","    <tr>\n","      <th>...</th>\n","      <td>...</td>\n","      <td>...</td>\n","    </tr>\n","    <tr>\n","      <th>1455</th>\n","      <td>1456</td>\n","      <td>[0.006353525, 0.023849096, 0.0082289865, 0.072...</td>\n","    </tr>\n","    <tr>\n","      <th>1456</th>\n","      <td>1457</td>\n","      <td>[0.019117838, 0.03813685, 0.039766844, 0.10477...</td>\n","    </tr>\n","    <tr>\n","      <th>1457</th>\n","      <td>1458</td>\n","      <td>[-0.014762384, 0.016318252, -0.0022795142, 0.1...</td>\n","    </tr>\n","    <tr>\n","      <th>1458</th>\n","      <td>1459</td>\n","      <td>[0.087884314, -0.021434652, 0.05227727, 0.1237...</td>\n","    </tr>\n","    <tr>\n","      <th>1459</th>\n","      <td>1460</td>\n","      <td>[-0.045013427734375, 0.01713788067853009, 0.05...</td>\n","    </tr>\n","  </tbody>\n","</table>\n","<p>1460 rows × 2 columns</p>\n","</div>"],"text/plain":["        id                                               text\n","0        1  [0.00830413818359375, 0.00127777099609375, -0....\n","1        2  [0.05055378758630087, -0.02922665795614553, 0....\n","2        3  [0.07028712, 0.0061023016, 0.02422772, -0.0077...\n","3        4  [-0.0112152099609375, 0.007620472019001589, 0....\n","4        5  [0.028684964743993617, 0.040372458837365593, 0...\n","...    ...                                                ...\n","1455  1456  [0.006353525, 0.023849096, 0.0082289865, 0.072...\n","1456  1457  [0.019117838, 0.03813685, 0.039766844, 0.10477...\n","1457  1458  [-0.014762384, 0.016318252, -0.0022795142, 0.1...\n","1458  1459  [0.087884314, -0.021434652, 0.05227727, 0.1237...\n","1459  1460  [-0.045013427734375, 0.01713788067853009, 0.05...\n","\n","[1460 rows x 2 columns]"]},"execution_count":25,"metadata":{},"output_type":"execute_result"}],"source":["average_vec_docs = pd.DataFrame(list(out_dict_docs.items()), columns=['id', 'text'])\n","average_vec_docs"]},{"cell_type":"code","execution_count":29,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T19:12:12.457922Z","iopub.status.busy":"2024-01-03T19:12:12.457418Z","iopub.status.idle":"2024-01-03T19:12:12.558607Z","shell.execute_reply":"2024-01-03T19:12:12.556953Z","shell.execute_reply.started":"2024-01-03T19:12:12.457878Z"},"trusted":true},"outputs":[],"source":["def basline_reranker(initial_retrieval, query_id, average_vec_query, k):\n","    retrieved_document_ids = initial_retrieval\n","    retrieved_document_embeddings = average_vec_docs[average_vec_docs['id'].isin(retrieved_document_ids)]\n","    similarities = dict()\n","    for index, row in retrieved_document_embeddings.iterrows():\n","        document_vec = row.values[1]\n","        document_id = row.values[0]\n","        similarity = get_sim(average_vec_query, document_vec)\n","        similarities[document_id]=similarity\n","    similarities = dict(sorted(similarities.items(), key=lambda item: item[1], reverse=True))\n","    \n","    most_similar_k_documents = {}\n","    counter = 0\n","    for id, score in similarities.items():\n","        try:\n","            if counter == k:\n","                break\n","            most_similar_k_documents[id]=score\n","            counter += 1\n","        except:\n","            break\n","\n","    return most_similar_k_documents   \n"]},{"cell_type":"markdown","metadata":{},"source":["test for one query"]},{"cell_type":"code","execution_count":73,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T20:31:31.280564Z","iopub.status.busy":"2024-01-03T20:31:31.279777Z","iopub.status.idle":"2024-01-03T20:31:31.332206Z","shell.execute_reply":"2024-01-03T20:31:31.329630Z","shell.execute_reply.started":"2024-01-03T20:31:31.280499Z"},"trusted":true},"outputs":[{"data":{"text/plain":["{429: [0.7788516433824191],\n"," 523: [0.7654931162749747],\n"," 68: [0.7621192073077985],\n"," 381: [0.756619393825531],\n"," 145: [0.7549756090477774],\n"," 421: [0.7507395596452094],\n"," 492: [0.745815041875989],\n"," 1078: [0.7430375974739672],\n"," 1054: [0.7420839667320251],\n"," 202: [0.7335939972659942]}"]},"execution_count":73,"metadata":{},"output_type":"execute_result"}],"source":["basline_reranker(initial_retrieval[2], 2, average_vec_queries['text'][1], 10)"]},{"cell_type":"markdown","metadata":{"execution":{"iopub.execute_input":"2023-12-28T18:34:19.926855Z","iopub.status.busy":"2023-12-28T18:34:19.926354Z","iopub.status.idle":"2023-12-28T18:34:19.963082Z","shell.execute_reply":"2023-12-28T18:34:19.961716Z","shell.execute_reply.started":"2023-12-28T18:34:19.926815Z"}},"source":["re-rank documents for all queries"]},{"cell_type":"code","execution_count":42,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T19:47:00.276459Z","iopub.status.busy":"2024-01-03T19:47:00.275796Z","iopub.status.idle":"2024-01-03T19:47:01.953982Z","shell.execute_reply":"2024-01-03T19:47:01.952820Z","shell.execute_reply.started":"2024-01-03T19:47:00.276401Z"},"trusted":true},"outputs":[],"source":["baseline_reranker_retrieval = dict()\n","for index, row in queries_cleaned.iterrows():\n","    query_id = row[0]\n","    query_text = row[1]\n","    query_embeddings = average_vec_queries['text'][query_id-1]\n","    retrieved_documents = initial_retrieval[query_id]\n","    basline_reranker_documents = basline_reranker(retrieved_documents, query_id, query_embeddings, 50)\n","    baseline_reranker_retrieval[query_id] = list(basline_reranker_documents.keys())\n","    #print(list(retrieved_documents.keys()))"]},{"cell_type":"markdown","metadata":{},"source":["## Initializing BERTopic model"]},{"cell_type":"code","execution_count":14,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T18:57:52.932829Z","iopub.status.busy":"2024-01-03T18:57:52.932037Z","iopub.status.idle":"2024-01-03T18:57:52.940868Z","shell.execute_reply":"2024-01-03T18:57:52.939770Z","shell.execute_reply.started":"2024-01-03T18:57:52.932765Z"},"trusted":true},"outputs":[],"source":["from umap import UMAP\n","\n","model_name = 'sentence-transformers/all-MiniLM-L6-v2'\n","\n","# UMAP is stochastic, so re-produce results you need to set the random_state for umap and pass this umap model to BERTopic:\n","# umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)\n","# HOW TO SET UMAP MODEL: topic_model = BERTopic(umap_model=umap_model)\n","\n","topic_model = BERTopic(embedding_model=model_name, ctfidf_model=ClassTfidfTransformer(reduce_frequent_words=True))"]},{"cell_type":"code","execution_count":15,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T18:57:55.031840Z","iopub.status.busy":"2024-01-03T18:57:55.031137Z","iopub.status.idle":"2024-01-03T19:00:05.623076Z","shell.execute_reply":"2024-01-03T19:00:05.621204Z","shell.execute_reply.started":"2024-01-03T18:57:55.031784Z"},"trusted":true},"outputs":[{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"3e55827322034c1d843039710fe4041e","version_major":2,"version_minor":0},"text/plain":["Downloading .gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"5fbf40e5b9b04bc5b73da81c1409943d","version_major":2,"version_minor":0},"text/plain":["Downloading 1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"66be26f4c7f4484e894a4f96a26324ac","version_major":2,"version_minor":0},"text/plain":["Downloading README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"eac12f55c119482b9e8fc286dae5c709","version_major":2,"version_minor":0},"text/plain":["Downloading config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"5b48033f288844fd99b4d0b227b6f96b","version_major":2,"version_minor":0},"text/plain":["Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"f96a99058b554cd8ba61b9a70634b956","version_major":2,"version_minor":0},"text/plain":["Downloading data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"21bf0cf5777742a2abe36c0495b0f091","version_major":2,"version_minor":0},"text/plain":["Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"d8805775a7574c1cb58542c47333e259","version_major":2,"version_minor":0},"text/plain":["Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"120157781f3a42b695d3625eef6a193e","version_major":2,"version_minor":0},"text/plain":["Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"4f5b544f3c8c4435ae83de863ad60f73","version_major":2,"version_minor":0},"text/plain":["Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"e3117d7067574f428b7f8ed00672fe4f","version_major":2,"version_minor":0},"text/plain":["Downloading tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"615371625e25447fa1377fb1dc4e9fb1","version_major":2,"version_minor":0},"text/plain":["Downloading train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"f9d8faee1348468bad7cc8bd5803f910","version_major":2,"version_minor":0},"text/plain":["Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"73638937ef0f4e18bab90ca6e6c0d3bf","version_major":2,"version_minor":0},"text/plain":["Downloading modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>Topic</th>\n","      <th>Count</th>\n","      <th>Name</th>\n","      <th>Representation</th>\n","      <th>Representative_Docs</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>-1</td>\n","      <td>269</td>\n","      <td>-1_document_system_indexing_retrieval</td>\n","      <td>[document, system, indexing, retrieval, on, an...</td>\n","      <td>[PRECIS: a manual of concept analysis and subj...</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>0</td>\n","      <td>384</td>\n","      <td>0_scientific_science_journals_social</td>\n","      <td>[scientific, science, journals, social, scient...</td>\n","      <td>[Recent Growth of the Literature of Biochemist...</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>1</td>\n","      <td>253</td>\n","      <td>1_library_libraries_university_academic</td>\n","      <td>[library, libraries, university, academic, pub...</td>\n","      <td>[Undergraduate Library The development of the ...</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>2</td>\n","      <td>90</td>\n","      <td>2_chemical_compounds_notation_search</td>\n","      <td>[chemical, compounds, notation, search, titles...</td>\n","      <td>[Experiences of IIT Research Institute in Oper...</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>3</td>\n","      <td>54</td>\n","      <td>3_automatic_indexing_classification_document</td>\n","      <td>[automatic, indexing, classification, document...</td>\n","      <td>[What Makes An Automatic Keyword Classificatio...</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>"],"text/plain":["   Topic  Count                                          Name  \\\n","0     -1    269         -1_document_system_indexing_retrieval   \n","1      0    384          0_scientific_science_journals_social   \n","2      1    253       1_library_libraries_university_academic   \n","3      2     90          2_chemical_compounds_notation_search   \n","4      3     54  3_automatic_indexing_classification_document   \n","\n","                                      Representation  \\\n","0  [document, system, indexing, retrieval, on, an...   \n","1  [scientific, science, journals, social, scient...   \n","2  [library, libraries, university, academic, pub...   \n","3  [chemical, compounds, notation, search, titles...   \n","4  [automatic, indexing, classification, document...   \n","\n","                                 Representative_Docs  \n","0  [PRECIS: a manual of concept analysis and subj...  \n","1  [Recent Growth of the Literature of Biochemist...  \n","2  [Undergraduate Library The development of the ...  \n","3  [Experiences of IIT Research Institute in Oper...  \n","4  [What Makes An Automatic Keyword Classificatio...  "]},"execution_count":15,"metadata":{},"output_type":"execute_result"}],"source":["# training bert model\n","docs_for_bert = docs[\"text\"]\n","topic_model.fit(docs_for_bert)"]},{"cell_type":"code","execution_count":77,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T20:37:47.548151Z","iopub.status.busy":"2024-01-03T20:37:47.547570Z","iopub.status.idle":"2024-01-03T20:37:47.601583Z","shell.execute_reply":"2024-01-03T20:37:47.599861Z","shell.execute_reply.started":"2024-01-03T20:37:47.548107Z"},"trusted":true},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>Topic</th>\n","      <th>Count</th>\n","      <th>Name</th>\n","      <th>Representation</th>\n","      <th>Representative_Docs</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>-1</td>\n","      <td>269</td>\n","      <td>-1_document_system_indexing_retrieval</td>\n","      <td>[document, system, indexing, retrieval, on, an...</td>\n","      <td>[PRECIS: a manual of concept analysis and subj...</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>0</td>\n","      <td>384</td>\n","      <td>0_scientific_science_journals_social</td>\n","      <td>[scientific, science, journals, social, scient...</td>\n","      <td>[Recent Growth of the Literature of Biochemist...</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>1</td>\n","      <td>253</td>\n","      <td>1_library_libraries_university_academic</td>\n","      <td>[library, libraries, university, academic, pub...</td>\n","      <td>[Undergraduate Library The development of the ...</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>2</td>\n","      <td>90</td>\n","      <td>2_chemical_compounds_notation_search</td>\n","      <td>[chemical, compounds, notation, search, titles...</td>\n","      <td>[Experiences of IIT Research Institute in Oper...</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>3</td>\n","      <td>54</td>\n","      <td>3_automatic_indexing_classification_document</td>\n","      <td>[automatic, indexing, classification, document...</td>\n","      <td>[What Makes An Automatic Keyword Classificatio...</td>\n","    </tr>\n","    <tr>\n","      <th>5</th>\n","      <td>4</td>\n","      <td>50</td>\n","      <td>4_bases_data_bibliographic_line</td>\n","      <td>[bases, data, bibliographic, line, readable, s...</td>\n","      <td>[Survey of Commercially Available Computer-Rea...</td>\n","    </tr>\n","    <tr>\n","      <th>6</th>\n","      <td>5</td>\n","      <td>46</td>\n","      <td>5_relevance_retrieval_answer_relevant</td>\n","      <td>[relevance, retrieval, answer, relevant, docum...</td>\n","      <td>[On Relevance, Probabilistic Indexing and Info...</td>\n","    </tr>\n","    <tr>\n","      <th>7</th>\n","      <td>6</td>\n","      <td>38</td>\n","      <td>6_catalog_catalogs_cataloging_card</td>\n","      <td>[catalog, catalogs, cataloging, card, catalogu...</td>\n","      <td>[The Potential Usefulness of Catalog Access Po...</td>\n","    </tr>\n","    <tr>\n","      <th>8</th>\n","      <td>7</td>\n","      <td>28</td>\n","      <td>7_classification_decimal_udc_dewey</td>\n","      <td>[classification, decimal, udc, dewey, schemes,...</td>\n","      <td>[Progress in Documentation Thirty years or mor...</td>\n","    </tr>\n","    <tr>\n","      <th>9</th>\n","      <td>8</td>\n","      <td>27</td>\n","      <td>8_language_linguistics_linguistic_semantic</td>\n","      <td>[language, linguistics, linguistic, semantic, ...</td>\n","      <td>[Functional Approach The present book sums up ...</td>\n","    </tr>\n","    <tr>\n","      <th>10</th>\n","      <td>9</td>\n","      <td>25</td>\n","      <td>9_medical_health_hospital_manpower</td>\n","      <td>[medical, health, hospital, manpower, hospital...</td>\n","      <td>[Library Practice in Hospitals According to a ...</td>\n","    </tr>\n","    <tr>\n","      <th>11</th>\n","      <td>10</td>\n","      <td>22</td>\n","      <td>10_automation_library_processing_telefacsimile</td>\n","      <td>[automation, library, processing, telefacsimil...</td>\n","      <td>[HDB of Data Processing for Libraries The four...</td>\n","    </tr>\n","    <tr>\n","      <th>12</th>\n","      <td>11</td>\n","      <td>21</td>\n","      <td>11_retrieval_user_systems_isrs</td>\n","      <td>[retrieval, user, systems, isrs, system, dialo...</td>\n","      <td>[Information Retrieval Systems This book is co...</td>\n","    </tr>\n","    <tr>\n","      <th>13</th>\n","      <td>12</td>\n","      <td>19</td>\n","      <td>12_thesaurus_thesauri_vocabularies_vocabulary</td>\n","      <td>[thesaurus, thesauri, vocabularies, vocabulary...</td>\n","      <td>[Theoretical Foundations of Thesaurus-Construc...</td>\n","    </tr>\n","    <tr>\n","      <th>14</th>\n","      <td>13</td>\n","      <td>18</td>\n","      <td>13_evaluation_cost_costs_systems</td>\n","      <td>[evaluation, cost, costs, systems, scale, serv...</td>\n","      <td>[Design and Evaluation of Information Systems ...</td>\n","    </tr>\n","    <tr>\n","      <th>15</th>\n","      <td>14</td>\n","      <td>18</td>\n","      <td>14_fuzzy_classification_sets_membership</td>\n","      <td>[fuzzy, classification, sets, membership, hedg...</td>\n","      <td>[Prospects for a New General Classification In...</td>\n","    </tr>\n","    <tr>\n","      <th>16</th>\n","      <td>15</td>\n","      <td>17</td>\n","      <td>15_serials_isbd_serial_international</td>\n","      <td>[serials, isbd, serial, international, rules, ...</td>\n","      <td>[No Special Rules for Entry of Serials One of ...</td>\n","    </tr>\n","    <tr>\n","      <th>17</th>\n","      <td>16</td>\n","      <td>16</td>\n","      <td>16_compression_coding_length_grams</td>\n","      <td>[compression, coding, length, grams, error, na...</td>\n","      <td>[An Information-Theoretic Approach to Text Sea...</td>\n","    </tr>\n","    <tr>\n","      <th>18</th>\n","      <td>17</td>\n","      <td>15</td>\n","      <td>17_microfiche_microforms_microform_microfilm</td>\n","      <td>[microfiche, microforms, microform, microfilm,...</td>\n","      <td>[The Microform Revolution Librarians have trie...</td>\n","    </tr>\n","    <tr>\n","      <th>19</th>\n","      <td>18</td>\n","      <td>15</td>\n","      <td>18_network_networks_cable_television</td>\n","      <td>[network, networks, cable, television, communi...</td>\n","      <td>[The National Biomedical Communications Networ...</td>\n","    </tr>\n","    <tr>\n","      <th>20</th>\n","      <td>19</td>\n","      <td>14</td>\n","      <td>19_medlars_medline_twx_medicus</td>\n","      <td>[medlars, medline, twx, medicus, medicine, nlm...</td>\n","      <td>[MEDLARS: A Summary Review and Evaluation of T...</td>\n","    </tr>\n","    <tr>\n","      <th>21</th>\n","      <td>20</td>\n","      <td>11</td>\n","      <td>20_centers_services_systems_micrographic</td>\n","      <td>[centers, services, systems, micrographic, enc...</td>\n","      <td>[The Annual Review of Information Science and ...</td>\n","    </tr>\n","    <tr>\n","      <th>22</th>\n","      <td>21</td>\n","      <td>10</td>\n","      <td>21_marc_records_readable_pilot</td>\n","      <td>[marc, records, readable, pilot, cobol, machin...</td>\n","      <td>[The Marc II Format:                        A ...</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>"],"text/plain":["    Topic  Count                                            Name  \\\n","0      -1    269           -1_document_system_indexing_retrieval   \n","1       0    384            0_scientific_science_journals_social   \n","2       1    253         1_library_libraries_university_academic   \n","3       2     90            2_chemical_compounds_notation_search   \n","4       3     54    3_automatic_indexing_classification_document   \n","5       4     50                 4_bases_data_bibliographic_line   \n","6       5     46           5_relevance_retrieval_answer_relevant   \n","7       6     38              6_catalog_catalogs_cataloging_card   \n","8       7     28              7_classification_decimal_udc_dewey   \n","9       8     27      8_language_linguistics_linguistic_semantic   \n","10      9     25              9_medical_health_hospital_manpower   \n","11     10     22  10_automation_library_processing_telefacsimile   \n","12     11     21                  11_retrieval_user_systems_isrs   \n","13     12     19   12_thesaurus_thesauri_vocabularies_vocabulary   \n","14     13     18                13_evaluation_cost_costs_systems   \n","15     14     18         14_fuzzy_classification_sets_membership   \n","16     15     17            15_serials_isbd_serial_international   \n","17     16     16              16_compression_coding_length_grams   \n","18     17     15    17_microfiche_microforms_microform_microfilm   \n","19     18     15            18_network_networks_cable_television   \n","20     19     14                  19_medlars_medline_twx_medicus   \n","21     20     11        20_centers_services_systems_micrographic   \n","22     21     10                  21_marc_records_readable_pilot   \n","\n","                                       Representation  \\\n","0   [document, system, indexing, retrieval, on, an...   \n","1   [scientific, science, journals, social, scient...   \n","2   [library, libraries, university, academic, pub...   \n","3   [chemical, compounds, notation, search, titles...   \n","4   [automatic, indexing, classification, document...   \n","5   [bases, data, bibliographic, line, readable, s...   \n","6   [relevance, retrieval, answer, relevant, docum...   \n","7   [catalog, catalogs, cataloging, card, catalogu...   \n","8   [classification, decimal, udc, dewey, schemes,...   \n","9   [language, linguistics, linguistic, semantic, ...   \n","10  [medical, health, hospital, manpower, hospital...   \n","11  [automation, library, processing, telefacsimil...   \n","12  [retrieval, user, systems, isrs, system, dialo...   \n","13  [thesaurus, thesauri, vocabularies, vocabulary...   \n","14  [evaluation, cost, costs, systems, scale, serv...   \n","15  [fuzzy, classification, sets, membership, hedg...   \n","16  [serials, isbd, serial, international, rules, ...   \n","17  [compression, coding, length, grams, error, na...   \n","18  [microfiche, microforms, microform, microfilm,...   \n","19  [network, networks, cable, television, communi...   \n","20  [medlars, medline, twx, medicus, medicine, nlm...   \n","21  [centers, services, systems, micrographic, enc...   \n","22  [marc, records, readable, pilot, cobol, machin...   \n","\n","                                  Representative_Docs  \n","0   [PRECIS: a manual of concept analysis and subj...  \n","1   [Recent Growth of the Literature of Biochemist...  \n","2   [Undergraduate Library The development of the ...  \n","3   [Experiences of IIT Research Institute in Oper...  \n","4   [What Makes An Automatic Keyword Classificatio...  \n","5   [Survey of Commercially Available Computer-Rea...  \n","6   [On Relevance, Probabilistic Indexing and Info...  \n","7   [The Potential Usefulness of Catalog Access Po...  \n","8   [Progress in Documentation Thirty years or mor...  \n","9   [Functional Approach The present book sums up ...  \n","10  [Library Practice in Hospitals According to a ...  \n","11  [HDB of Data Processing for Libraries The four...  \n","12  [Information Retrieval Systems This book is co...  \n","13  [Theoretical Foundations of Thesaurus-Construc...  \n","14  [Design and Evaluation of Information Systems ...  \n","15  [Prospects for a New General Classification In...  \n","16  [No Special Rules for Entry of Serials One of ...  \n","17  [An Information-Theoretic Approach to Text Sea...  \n","18  [The Microform Revolution Librarians have trie...  \n","19  [The National Biomedical Communications Networ...  \n","20  [MEDLARS: A Summary Review and Evaluation of T...  \n","21  [The Annual Review of Information Science and ...  \n","22  [The Marc II Format:                        A ...  "]},"execution_count":77,"metadata":{},"output_type":"execute_result"}],"source":["topic_model.get_topic_info()"]},{"cell_type":"code","execution_count":18,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T19:00:05.785848Z","iopub.status.busy":"2024-01-03T19:00:05.785233Z","iopub.status.idle":"2024-01-03T19:00:05.894032Z","shell.execute_reply":"2024-01-03T19:00:05.892648Z","shell.execute_reply.started":"2024-01-03T19:00:05.785783Z"},"trusted":true},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>Document</th>\n","      <th>Topic</th>\n","      <th>Name</th>\n","      <th>Representation</th>\n","      <th>Representative_Docs</th>\n","      <th>Top_n_words</th>\n","      <th>Probability</th>\n","      <th>Representative_document</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>18 Editions of the Dewey Decimal Classificatio...</td>\n","      <td>7</td>\n","      <td>7_classification_decimal_udc_dewey</td>\n","      <td>[classification, decimal, udc, dewey, schemes,...</td>\n","      <td>[Progress in Documentation Thirty years or mor...</td>\n","      <td>classification - decimal - udc - dewey - schem...</td>\n","      <td>0.781595</td>\n","      <td>False</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>Use Made of Technical Libraries This report is...</td>\n","      <td>1</td>\n","      <td>1_library_libraries_university_academic</td>\n","      <td>[library, libraries, university, academic, pub...</td>\n","      <td>[Undergraduate Library The development of the ...</td>\n","      <td>library - libraries - university - academic - ...</td>\n","      <td>0.737408</td>\n","      <td>False</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>Two Kinds of Power An Essay on Bibliographic C...</td>\n","      <td>-1</td>\n","      <td>-1_document_system_indexing_retrieval</td>\n","      <td>[document, system, indexing, retrieval, on, an...</td>\n","      <td>[PRECIS: a manual of concept analysis and subj...</td>\n","      <td>document - system - indexing - retrieval - on ...</td>\n","      <td>0.000000</td>\n","      <td>False</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>Systems Analysis of a University Library; fina...</td>\n","      <td>1</td>\n","      <td>1_library_libraries_university_academic</td>\n","      <td>[library, libraries, university, academic, pub...</td>\n","      <td>[Undergraduate Library The development of the ...</td>\n","      <td>library - libraries - university - academic - ...</td>\n","      <td>0.993856</td>\n","      <td>False</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>A Library Management Game: a report on a resea...</td>\n","      <td>-1</td>\n","      <td>-1_document_system_indexing_retrieval</td>\n","      <td>[document, system, indexing, retrieval, on, an...</td>\n","      <td>[PRECIS: a manual of concept analysis and subj...</td>\n","      <td>document - system - indexing - retrieval - on ...</td>\n","      <td>0.000000</td>\n","      <td>False</td>\n","    </tr>\n","    <tr>\n","      <th>...</th>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","    </tr>\n","    <tr>\n","      <th>1455</th>\n","      <td>World Dynamics Over the last several decades i...</td>\n","      <td>0</td>\n","      <td>0_scientific_science_journals_social</td>\n","      <td>[scientific, science, journals, social, scient...</td>\n","      <td>[Recent Growth of the Literature of Biochemist...</td>\n","      <td>scientific - science - journals - social - sci...</td>\n","      <td>1.000000</td>\n","      <td>False</td>\n","    </tr>\n","    <tr>\n","      <th>1456</th>\n","      <td>World Trends in Library Education One of the m...</td>\n","      <td>1</td>\n","      <td>1_library_libraries_university_academic</td>\n","      <td>[library, libraries, university, academic, pub...</td>\n","      <td>[Undergraduate Library The development of the ...</td>\n","      <td>library - libraries - university - academic - ...</td>\n","      <td>1.000000</td>\n","      <td>False</td>\n","    </tr>\n","    <tr>\n","      <th>1457</th>\n","      <td>Legal Restrictions on Exploitation of the Pate...</td>\n","      <td>-1</td>\n","      <td>-1_document_system_indexing_retrieval</td>\n","      <td>[document, system, indexing, retrieval, on, an...</td>\n","      <td>[PRECIS: a manual of concept analysis and subj...</td>\n","      <td>document - system - indexing - retrieval - on ...</td>\n","      <td>0.000000</td>\n","      <td>False</td>\n","    </tr>\n","    <tr>\n","      <th>1458</th>\n","      <td>Language and Thought This book considers the b...</td>\n","      <td>8</td>\n","      <td>8_language_linguistics_linguistic_semantic</td>\n","      <td>[language, linguistics, linguistic, semantic, ...</td>\n","      <td>[Functional Approach The present book sums up ...</td>\n","      <td>language - linguistics - linguistic - semantic...</td>\n","      <td>1.000000</td>\n","      <td>False</td>\n","    </tr>\n","    <tr>\n","      <th>1459</th>\n","      <td>Modern Integral Information Systems for Chemis...</td>\n","      <td>2</td>\n","      <td>2_chemical_compounds_notation_search</td>\n","      <td>[chemical, compounds, notation, search, titles...</td>\n","      <td>[Experiences of IIT Research Institute in Oper...</td>\n","      <td>chemical - compounds - notation - search - tit...</td>\n","      <td>0.805337</td>\n","      <td>False</td>\n","    </tr>\n","  </tbody>\n","</table>\n","<p>1460 rows × 8 columns</p>\n","</div>"],"text/plain":["                                               Document  Topic  \\\n","0     18 Editions of the Dewey Decimal Classificatio...      7   \n","1     Use Made of Technical Libraries This report is...      1   \n","2     Two Kinds of Power An Essay on Bibliographic C...     -1   \n","3     Systems Analysis of a University Library; fina...      1   \n","4     A Library Management Game: a report on a resea...     -1   \n","...                                                 ...    ...   \n","1455  World Dynamics Over the last several decades i...      0   \n","1456  World Trends in Library Education One of the m...      1   \n","1457  Legal Restrictions on Exploitation of the Pate...     -1   \n","1458  Language and Thought This book considers the b...      8   \n","1459  Modern Integral Information Systems for Chemis...      2   \n","\n","                                            Name  \\\n","0             7_classification_decimal_udc_dewey   \n","1        1_library_libraries_university_academic   \n","2          -1_document_system_indexing_retrieval   \n","3        1_library_libraries_university_academic   \n","4          -1_document_system_indexing_retrieval   \n","...                                          ...   \n","1455        0_scientific_science_journals_social   \n","1456     1_library_libraries_university_academic   \n","1457       -1_document_system_indexing_retrieval   \n","1458  8_language_linguistics_linguistic_semantic   \n","1459        2_chemical_compounds_notation_search   \n","\n","                                         Representation  \\\n","0     [classification, decimal, udc, dewey, schemes,...   \n","1     [library, libraries, university, academic, pub...   \n","2     [document, system, indexing, retrieval, on, an...   \n","3     [library, libraries, university, academic, pub...   \n","4     [document, system, indexing, retrieval, on, an...   \n","...                                                 ...   \n","1455  [scientific, science, journals, social, scient...   \n","1456  [library, libraries, university, academic, pub...   \n","1457  [document, system, indexing, retrieval, on, an...   \n","1458  [language, linguistics, linguistic, semantic, ...   \n","1459  [chemical, compounds, notation, search, titles...   \n","\n","                                    Representative_Docs  \\\n","0     [Progress in Documentation Thirty years or mor...   \n","1     [Undergraduate Library The development of the ...   \n","2     [PRECIS: a manual of concept analysis and subj...   \n","3     [Undergraduate Library The development of the ...   \n","4     [PRECIS: a manual of concept analysis and subj...   \n","...                                                 ...   \n","1455  [Recent Growth of the Literature of Biochemist...   \n","1456  [Undergraduate Library The development of the ...   \n","1457  [PRECIS: a manual of concept analysis and subj...   \n","1458  [Functional Approach The present book sums up ...   \n","1459  [Experiences of IIT Research Institute in Oper...   \n","\n","                                            Top_n_words  Probability  \\\n","0     classification - decimal - udc - dewey - schem...     0.781595   \n","1     library - libraries - university - academic - ...     0.737408   \n","2     document - system - indexing - retrieval - on ...     0.000000   \n","3     library - libraries - university - academic - ...     0.993856   \n","4     document - system - indexing - retrieval - on ...     0.000000   \n","...                                                 ...          ...   \n","1455  scientific - science - journals - social - sci...     1.000000   \n","1456  library - libraries - university - academic - ...     1.000000   \n","1457  document - system - indexing - retrieval - on ...     0.000000   \n","1458  language - linguistics - linguistic - semantic...     1.000000   \n","1459  chemical - compounds - notation - search - tit...     0.805337   \n","\n","      Representative_document  \n","0                       False  \n","1                       False  \n","2                       False  \n","3                       False  \n","4                       False  \n","...                       ...  \n","1455                    False  \n","1456                    False  \n","1457                    False  \n","1458                    False  \n","1459                    False  \n","\n","[1460 rows x 8 columns]"]},"execution_count":18,"metadata":{},"output_type":"execute_result"}],"source":["doc_info = topic_model.get_document_info(docs[\"text\"])\n","doc_info"]},{"cell_type":"markdown","metadata":{},"source":["<h2> Re-ranking with bertopic</h2>"]},{"cell_type":"code","execution_count":78,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T20:38:18.212878Z","iopub.status.busy":"2024-01-03T20:38:18.212296Z","iopub.status.idle":"2024-01-03T20:42:37.435531Z","shell.execute_reply":"2024-01-03T20:42:37.433913Z","shell.execute_reply.started":"2024-01-03T20:38:18.212829Z"},"trusted":true},"outputs":[],"source":["# transform queries to topics\n","query_topics = {}\n","for index, row in queries_cleaned.iterrows():\n","    query_id = row[0]\n","    qq = queries.loc[queries['id'] == query_id]\n","    topic, prob = topic_model.transform(qq[\"text\"].values.tolist())\n","    query_topics[query_id] = topic"]},{"cell_type":"code","execution_count":79,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T20:42:37.438390Z","iopub.status.busy":"2024-01-03T20:42:37.437898Z","iopub.status.idle":"2024-01-03T20:42:37.454106Z","shell.execute_reply":"2024-01-03T20:42:37.452690Z","shell.execute_reply.started":"2024-01-03T20:42:37.438329Z"},"trusted":true},"outputs":[{"data":{"text/plain":["{1: [-1],\n"," 2: [4],\n"," 3: [0],\n"," 4: [-1],\n"," 5: [11],\n"," 6: [0],\n"," 7: [-1],\n"," 8: [-1],\n"," 9: [-1],\n"," 10: [-1],\n"," 11: [0],\n"," 12: [0],\n"," 13: [-1],\n"," 14: [19],\n"," 15: [4],\n"," 16: [11],\n"," 17: [-1],\n"," 18: [2],\n"," 19: [-1],\n"," 20: [-1],\n"," 21: [0],\n"," 22: [19],\n"," 23: [10],\n"," 24: [0],\n"," 25: [-1],\n"," 26: [13],\n"," 27: [3],\n"," 28: [2],\n"," 29: [3],\n"," 30: [0],\n"," 31: [0],\n"," 32: [3],\n"," 33: [11],\n"," 34: [-1],\n"," 35: [20],\n"," 37: [-1],\n"," 39: [11],\n"," 41: [-1],\n"," 42: [11],\n"," 43: [11],\n"," 44: [0],\n"," 45: [10],\n"," 46: [10],\n"," 49: [11],\n"," 50: [-1],\n"," 52: [19],\n"," 54: [10],\n"," 55: [19],\n"," 56: [-1],\n"," 57: [-1],\n"," 58: [18],\n"," 61: [5],\n"," 62: [-1],\n"," 65: [-1],\n"," 66: [18],\n"," 67: [-1],\n"," 69: [12],\n"," 71: [-1],\n"," 76: [-1],\n"," 79: [-1],\n"," 81: [12],\n"," 82: [-1],\n"," 84: [5],\n"," 90: [0],\n"," 92: [4],\n"," 95: [5],\n"," 96: [5],\n"," 97: [5],\n"," 98: [11],\n"," 99: [11],\n"," 100: [-1],\n"," 101: [-1],\n"," 102: [3],\n"," 104: [-1],\n"," 109: [0],\n"," 111: [-1]}"]},"execution_count":79,"metadata":{},"output_type":"execute_result"}],"source":["query_topics"]},{"cell_type":"code","execution_count":58,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T20:12:01.716284Z","iopub.status.busy":"2024-01-03T20:12:01.715582Z","iopub.status.idle":"2024-01-03T20:12:01.726826Z","shell.execute_reply":"2024-01-03T20:12:01.725962Z","shell.execute_reply.started":"2024-01-03T20:12:01.716244Z"},"trusted":true},"outputs":[],"source":["def bertopic_reranker(initial_retrieval, query_id, k, lam=0.2):\n","    #print(topic_model.get_topic_info())\n","    qq = queries.loc[queries['id'] == query_id]\n","    #print(\"query: \", qq[\"text\"].values.tolist())\n","    #topic, prob = topic_model.transform(qq[\"text\"].values.tolist())\n","    topic = query_topics[query_id]\n","    topic = topic[0]\n","    most_similar_init_k_documents = {}\n","    #print(\"topic: \", topic)\n","    i = 0\n","    for id, score in initial_retrieval.items():\n","        doc_topic = doc_info.iloc[id-1][\"Topic\"]\n","        #print(\"d\", id, doc_topic)\n","        if doc_topic == topic and topic != -1:\n","            #print(\"same topic, increase score\", score)\n","            most_similar_init_k_documents[id]=[score[0] * lam]\n","        else:\n","            most_similar_init_k_documents[id]=[score[0]]\n","        i += 1\n","    most_similar_init_k_documents = dict(sorted(most_similar_init_k_documents.items(), key=lambda item: item[1], reverse=True))\n","\n","    most_similar_k_documents = {}\n","    counter = 0\n","    for id, score in most_similar_init_k_documents.items():\n","        try:\n","            if counter == k:\n","                break\n","            most_similar_k_documents[id]=score\n","            counter += 1\n","        except:\n","            break\n","\n","    return most_similar_k_documents   "]},{"cell_type":"markdown","metadata":{},"source":["test for one document"]},{"cell_type":"code","execution_count":80,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T20:52:22.053797Z","iopub.status.busy":"2024-01-03T20:52:22.053248Z","iopub.status.idle":"2024-01-03T20:52:22.084399Z","shell.execute_reply":"2024-01-03T20:52:22.082985Z","shell.execute_reply.started":"2024-01-03T20:52:22.053751Z"},"trusted":true},"outputs":[{"data":{"text/plain":["{145: [15.270515725754914],\n"," 1399: [11.776525442346998],\n"," 597: [11.654505315568294],\n"," 166: [10.781292227767148],\n"," 1071: [10.763945625290491],\n"," 546: [10.405693755713079],\n"," 626: [9.676309340155163],\n"," 1096: [9.519484116329025],\n"," 728: [8.859895015786782],\n"," 1197: [8.78125771012449]}"]},"execution_count":80,"metadata":{},"output_type":"execute_result"}],"source":["bertopic_reranker(initial_retrieval_with_scores[2], 2, 10, lam=2.0)"]},{"cell_type":"markdown","metadata":{},"source":["re-rank documents for all queries"]},{"cell_type":"code","execution_count":112,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T21:23:21.816142Z","iopub.status.busy":"2024-01-03T21:23:21.815690Z","iopub.status.idle":"2024-01-03T21:23:29.221491Z","shell.execute_reply":"2024-01-03T21:23:29.220201Z","shell.execute_reply.started":"2024-01-03T21:23:21.816101Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["Currently retrieving for lam: 1.2\n","Currently retrieving for lam: 1.5\n","Currently retrieving for lam: 2\n","Currently retrieving for lam: 2.5\n","Currently retrieving for lam: 3\n"]}],"source":["lam_values = [1.2, 1.5, 2, 2.5, 3]\n","\n","results_for_different_lams = dict()\n","for lam_value in lam_values:\n","    print(f'Currently retrieving for lam: {lam_value}')\n","    bertopic_reranker_retrieval = dict()\n","    for index, row in queries_cleaned.iterrows():\n","        query_id = row[0]\n","        query_text = row[1]\n","        retrieved_documents = initial_retrieval_with_scores[query_id]\n","        bertopic_reranker_documents = bertopic_reranker(retrieved_documents, query_id, 50, lam=lam_value)\n","        bertopic_reranker_retrieval[query_id] = list(bertopic_reranker_documents.keys())\n","    results_for_different_lams[lam_value] = bertopic_reranker_retrieval"]},{"cell_type":"code","execution_count":113,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T21:23:35.507029Z","iopub.status.busy":"2024-01-03T21:23:35.506573Z","iopub.status.idle":"2024-01-03T21:23:35.833096Z","shell.execute_reply":"2024-01-03T21:23:35.831944Z","shell.execute_reply.started":"2024-01-03T21:23:35.506994Z"},"trusted":true},"outputs":[],"source":["# Initialize an empty dictionary to store the results\n","result_dict = {}\n","\n","# Iterate through the DataFrame and populate the dictionary\n","for index, row in rels.iterrows():\n","    query_id = row['queryID']\n","    doc_id = row['docID']\n","    # If the query ID is not already in the dictionary, add it with an empty list\n","    if query_id not in result_dict:\n","        result_dict[query_id] = []\n","    # Append the document ID to the list associated with the query ID\n","    result_dict[query_id].append(doc_id)\n"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2023-12-31T12:12:26.418730Z","iopub.status.busy":"2023-12-31T12:12:26.418435Z","iopub.status.idle":"2023-12-31T12:12:26.422457Z","shell.execute_reply":"2023-12-31T12:12:26.421575Z","shell.execute_reply.started":"2023-12-31T12:12:26.418702Z"},"trusted":true},"outputs":[],"source":["# ! When we sort here, we already use the information of the similarity ordering, so we cannot calculate measures @ k!\n","\n","# Sort each list in the dictionary\n","#sorted_dict = {key: sorted(value) for key, value in final_retrieval.items()}\n","#sorted_dict_bert = {key: sorted(value) for key, value in final_retrieval_bert.items()}\n","\n","# Print the sorted dictionary\n","#sorted_dict"]},{"cell_type":"markdown","metadata":{},"source":["<h2> Evalution </h2>"]},{"cell_type":"code","execution_count":115,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T21:23:42.614233Z","iopub.status.busy":"2024-01-03T21:23:42.613727Z","iopub.status.idle":"2024-01-03T21:23:42.637263Z","shell.execute_reply":"2024-01-03T21:23:42.635902Z","shell.execute_reply.started":"2024-01-03T21:23:42.614194Z"},"trusted":true},"outputs":[],"source":["def evaluate(predictions, k):\n","    f_1 = 0\n","    precision = 0\n","    recall = 0\n","    number_queries_evaluated = 0\n","    for prediction in predictions.items():\n","        q_id = prediction[0]\n","        predicted_documents_k_relevant = prediction[1]\n","\n","        ground_truth = rels\n","        ground_truth_k_relevant = ground_truth[ground_truth[\"queryID\"] == q_id].iloc[:k]\n","        ground_truth_k_relevant = ground_truth_k_relevant['docID'].to_list()\n","\n","        false_positives = 0\n","        for predicted in predicted_documents_k_relevant:\n","            if predicted not in ground_truth_k_relevant:\n","                false_positives += 1\n","\n","        true_positives = 0 \n","        false_negatives = 0      \n","        for truth in ground_truth_k_relevant:\n","            if truth in predicted_documents_k_relevant:\n","                true_positives += 1\n","            if truth not in predicted_documents_k_relevant:\n","                false_negatives += 1\n","        try:\n","            query_precision = true_positives/(true_positives+false_positives)\n","            query_recall = true_positives/(true_positives+false_negatives)\n","        except:\n","            query_precision = 0\n","            query_recall = 0\n","        if query_precision > 0 or query_recall > 0:\n","            #print(f'precision: {query_precision} recall: {query_recall}')\n","            query_f_1 = (2*query_precision*query_recall)/(query_precision+query_recall)\n","            f_1 += query_f_1\n","            precision += query_precision\n","            recall += query_recall\n","            number_queries_evaluated += 1\n","        else:\n","            f_1 += 0\n","            precision += 0\n","            recall += query_recall\n","            number_queries_evaluated += 1\n","    \n","    f_1 /= number_queries_evaluated\n","    precision /= number_queries_evaluated\n","    recall /= number_queries_evaluated\n","    \n","    ndcg = 0\n","    number_queries_evaluated = 0\n","    for prediction in predictions.items():\n","        q_id = prediction[0]\n","        relevant_items = rels[rels[\"queryID\"] == q_id]\n","        relevant_items = relevant_items[\"docID\"].to_list()\n","        documents = prediction[1][:k] \n","        i = 1\n","        dcg_document = 0\n","        idcg_document = 0\n","        for document_id in documents:\n","            idcg_i = (1/np.log2(i+1))\n","            idcg_document += idcg_i\n","            if document_id in relevant_items:\n","                dcg_i = (1/np.log2(i+1))                    \n","                dcg_document += dcg_i\n","            i +=1\n","        ndgc_document = (dcg_document/idcg_document) if idcg_document != 0 else 0\n","        ndcg += ndgc_document\n","        number_queries_evaluated += 1\n","        \n","    ndcg/=number_queries_evaluated\n","    \n","    return {'f_1':f_1, 'precision':precision, 'recall':recall, 'nDCG':ndcg}"]},{"cell_type":"markdown","metadata":{},"source":["K's to evaluate for"]},{"cell_type":"code","execution_count":116,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T21:23:47.877918Z","iopub.status.busy":"2024-01-03T21:23:47.877437Z","iopub.status.idle":"2024-01-03T21:23:47.883332Z","shell.execute_reply":"2024-01-03T21:23:47.882258Z","shell.execute_reply.started":"2024-01-03T21:23:47.877879Z"},"trusted":true},"outputs":[],"source":["k_values = [3,4,5,6,7,10,20,30,40,50]"]},{"cell_type":"markdown","metadata":{},"source":["Scores for the initial retrieval"]},{"cell_type":"code","execution_count":117,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T21:23:52.563434Z","iopub.status.busy":"2024-01-03T21:23:52.562784Z","iopub.status.idle":"2024-01-03T21:23:53.711365Z","shell.execute_reply":"2024-01-03T21:23:53.709924Z","shell.execute_reply.started":"2024-01-03T21:23:52.563393Z"},"trusted":true},"outputs":[],"source":["initial_retrieval_scores = dict()\n","for k in k_values:\n","    scores = evaluate(initial_retrieval, k)\n","    initial_retrieval_scores[k] = scores"]},{"cell_type":"markdown","metadata":{},"source":["Scores for the Baseline Re-ranker"]},{"cell_type":"code","execution_count":118,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T21:23:54.981836Z","iopub.status.busy":"2024-01-03T21:23:54.981366Z","iopub.status.idle":"2024-01-03T21:23:56.215784Z","shell.execute_reply":"2024-01-03T21:23:56.214384Z","shell.execute_reply.started":"2024-01-03T21:23:54.981798Z"},"trusted":true},"outputs":[],"source":["baseline_reranker_retrieval_scores = dict()\n","for k in k_values:\n","    scores = evaluate(baseline_reranker_retrieval, k)\n","    baseline_reranker_retrieval_scores[k] = scores"]},{"cell_type":"markdown","metadata":{},"source":["Scores for the Bertopic-Reranker"]},{"cell_type":"code","execution_count":127,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T21:27:50.108059Z","iopub.status.busy":"2024-01-03T21:27:50.107464Z","iopub.status.idle":"2024-01-03T21:27:51.305679Z","shell.execute_reply":"2024-01-03T21:27:51.304065Z","shell.execute_reply.started":"2024-01-03T21:27:50.108014Z"},"trusted":true},"outputs":[],"source":["bertopic_reranker_retrieval_scores = dict()\n","for k in k_values:\n","    scores = evaluate(results_for_different_lams[3], k)\n","    bertopic_reranker_retrieval_scores[k] = scores"]},{"cell_type":"code","execution_count":128,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T21:27:53.094975Z","iopub.status.busy":"2024-01-03T21:27:53.094546Z","iopub.status.idle":"2024-01-03T21:27:53.107728Z","shell.execute_reply":"2024-01-03T21:27:53.106596Z","shell.execute_reply.started":"2024-01-03T21:27:53.094939Z"},"trusted":true},"outputs":[],"source":["initial_retrieval_results_f_1 = []\n","baseline_reranker_results_f_1 = []\n","bertopic_reranker_results_f_1 = []\n","\n","initial_retrieval_results_recall = []\n","baseline_reranker_results_recall = []\n","bertopic_reranker_results_recall = []\n","\n","initial_retrieval_results_precision = []\n","baseline_reranker_results_precision = []\n","bertopic_reranker_results_precision = []\n","\n","initial_retrieval_results_nDCG = []\n","baseline_reranker_results_nDCG  = []\n","bertopic_reranker_results_nDCG  = []\n","\n","for k in k_values:\n","    initial_retrieval_results_f_1.append(initial_retrieval_scores[k]['f_1'])\n","    baseline_reranker_results_f_1.append(baseline_reranker_retrieval_scores[k]['f_1'])\n","    bertopic_reranker_results_f_1.append(bertopic_reranker_retrieval_scores[k]['f_1'])\n","    \n","    initial_retrieval_results_recall.append(initial_retrieval_scores[k]['recall'])\n","    baseline_reranker_results_recall.append(baseline_reranker_retrieval_scores[k]['recall'])\n","    bertopic_reranker_results_recall.append(bertopic_reranker_retrieval_scores[k]['recall'])\n","    \n","    initial_retrieval_results_precision.append(initial_retrieval_scores[k]['precision'])\n","    baseline_reranker_results_precision.append(baseline_reranker_retrieval_scores[k]['precision'])\n","    bertopic_reranker_results_precision.append(bertopic_reranker_retrieval_scores[k]['precision'])\n","    \n","    initial_retrieval_results_nDCG.append(initial_retrieval_scores[k]['nDCG'])\n","    baseline_reranker_results_nDCG.append(baseline_reranker_retrieval_scores[k]['nDCG'])\n","    bertopic_reranker_results_nDCG.append(bertopic_reranker_retrieval_scores[k]['nDCG'])"]},{"cell_type":"code","execution_count":129,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T21:27:57.114992Z","iopub.status.busy":"2024-01-03T21:27:57.114467Z","iopub.status.idle":"2024-01-03T21:27:57.125581Z","shell.execute_reply":"2024-01-03T21:27:57.123592Z","shell.execute_reply.started":"2024-01-03T21:27:57.114950Z"},"trusted":true},"outputs":[{"data":{"text/plain":["[0.3560716124730853,\n"," 0.3426624624489207,\n"," 0.33741020727621196,\n"," 0.33791628338014396,\n"," 0.32865531690243627,\n"," 0.30763712181088515,\n"," 0.2571290703967697,\n"," 0.2282620325948478,\n"," 0.2060150933125378,\n"," 0.19024116999894183]"]},"execution_count":129,"metadata":{},"output_type":"execute_result"}],"source":["bertopic_reranker_results_nDCG"]},{"cell_type":"code","execution_count":130,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T21:28:00.248875Z","iopub.status.busy":"2024-01-03T21:28:00.248077Z","iopub.status.idle":"2024-01-03T21:28:01.314148Z","shell.execute_reply":"2024-01-03T21:28:01.313253Z","shell.execute_reply.started":"2024-01-03T21:28:00.248831Z"},"trusted":true},"outputs":[{"data":{"image/png":"","text/plain":["<Figure size 1440x1080 with 4 Axes>"]},"metadata":{"needs_background":"light"},"output_type":"display_data"}],"source":["# Creating a subplot with 2 rows and 2 columns\n","fig, axs = plt.subplots(2, 2, figsize=(20, 15))\n","\n","# Plotting F1 Score\n","axs[0, 0].plot(k_values, initial_retrieval_results_f_1, marker='o', label='Initial Retrieval')\n","axs[0, 0].plot(k_values, baseline_reranker_results_f_1, marker='s', label='Baseline Reranker')\n","axs[0, 0].plot(k_values, bertopic_reranker_results_f_1, marker='^', label='BERTopic Reranker')\n","axs[0, 0].set_title('F1 Score by k values')\n","axs[0, 0].set_xlabel('k value')\n","axs[0, 0].set_ylabel('F1 Score')\n","axs[0, 0].legend()\n","axs[0, 0].grid(True)\n","\n","# Plotting Recall\n","axs[0, 1].plot(k_values, initial_retrieval_results_recall, marker='o', label='Initial Retrieval')\n","axs[0, 1].plot(k_values, baseline_reranker_results_recall, marker='s', label='Baseline Reranker')\n","axs[0, 1].plot(k_values, bertopic_reranker_results_recall, marker='^', label='BERTopic Reranker')\n","axs[0, 1].set_title('Recall by k values')\n","axs[0, 1].set_xlabel('k value')\n","axs[0, 1].set_ylabel('Recall')\n","axs[0, 1].legend()\n","axs[0, 1].grid(True)\n","\n","# Plotting Precision\n","axs[1, 0].plot(k_values, initial_retrieval_results_precision, marker='o', label='Initial Retrieval')\n","axs[1, 0].plot(k_values, baseline_reranker_results_precision, marker='s', label='Baseline Reranker')\n","axs[1, 0].plot(k_values, bertopic_reranker_results_precision, marker='^', label='BERTopic Reranker')\n","axs[1, 0].set_title('Precision by k values')\n","axs[1, 0].set_xlabel('k value')\n","axs[1, 0].set_ylabel('Precision')\n","axs[1, 0].legend()\n","axs[1, 0].grid(True)\n","\n","# Plotting nDCG Score\n","axs[1, 1].plot(k_values, initial_retrieval_results_nDCG, marker='o', label='Initial Retrieval')\n","axs[1, 1].plot(k_values, baseline_reranker_results_nDCG, marker='s', label='Baseline Reranker')\n","axs[1, 1].plot(k_values, bertopic_reranker_results_nDCG, marker='^', label='BERTopic Reranker')\n","axs[1, 1].set_title('nDCG Score by k values')\n","axs[1, 1].set_xlabel('k value')\n","axs[1, 1].set_ylabel('nDCG Score')\n","axs[1, 1].legend()\n","axs[1, 1].grid(True)\n","\n","# Adjusting layout and displaying the plots\n","plt.tight_layout()\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":["<h2> Sources: </h2>\n","\n","* https://www.geeksforgeeks.org/text-preprocessing-in-python-set-1/\n","* https://www.kaggle.com/code/namansood/document-ranking-ir-system-word2vec-embeddings\n","* https://pypi.org/project/rank-bm25/"]},{"cell_type":"code","execution_count":124,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T21:24:23.876178Z","iopub.status.busy":"2024-01-03T21:24:23.875511Z","iopub.status.idle":"2024-01-03T21:24:29.656502Z","shell.execute_reply":"2024-01-03T21:24:29.655288Z","shell.execute_reply.started":"2024-01-03T21:24:23.876120Z"},"trusted":true},"outputs":[],"source":["scores_lam = dict()\n","for lam_value in lam_values:\n","    bertopic_reranker_lam_scores = dict()\n","    for k in k_values:\n","        scores = evaluate(results_for_different_lams[lam_value], k)\n","        bertopic_reranker_lam_scores[k] = scores\n","    scores_lam[lam_value] = bertopic_reranker_lam_scores"]},{"cell_type":"code","execution_count":125,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T21:24:49.722093Z","iopub.status.busy":"2024-01-03T21:24:49.721617Z","iopub.status.idle":"2024-01-03T21:24:49.741793Z","shell.execute_reply":"2024-01-03T21:24:49.740575Z","shell.execute_reply.started":"2024-01-03T21:24:49.722055Z"},"trusted":true},"outputs":[{"data":{"text/plain":["{1.2: {3: {'f_1': 0.02137001771910353,\n","   'precision': 0.011315789473684215,\n","   'recall': 0.19736842105263155,\n","   'nDCG': 0.3567336087776632},\n","  4: {'f_1': 0.029268432519206498,\n","   'precision': 0.015789473684210534,\n","   'recall': 0.20723684210526316,\n","   'nDCG': 0.3387887500965494},\n","  5: {'f_1': 0.03783657003471244,\n","   'precision': 0.020789473684210538,\n","   'recall': 0.21842105263157896,\n","   'nDCG': 0.3254128326590611},\n","  6: {'f_1': 0.04468892820286009,\n","   'precision': 0.025000000000000015,\n","   'recall': 0.21929824561403505,\n","   'nDCG': 0.3158658688374358},\n","  7: {'f_1': 0.05273536007200668,\n","   'precision': 0.030000000000000016,\n","   'recall': 0.22744360902255642,\n","   'nDCG': 0.308625281389308},\n","  10: {'f_1': 0.07406262545006927,\n","   'precision': 0.0442105263157895,\n","   'recall': 0.24057017543859646,\n","   'nDCG': 0.28300583381381733},\n","  20: {'f_1': 0.1132536014591653,\n","   'precision': 0.07631578947368418,\n","   'recall': 0.24656228274649325,\n","   'nDCG': 0.24020967725033257},\n","  30: {'f_1': 0.12628539760607768,\n","   'precision': 0.09342105263157892,\n","   'recall': 0.23542462058342276,\n","   'nDCG': 0.2187536976058229},\n","  40: {'f_1': 0.13881667308622633,\n","   'precision': 0.11052631578947364,\n","   'recall': 0.23716928643641721,\n","   'nDCG': 0.19742563005153},\n","  50: {'f_1': 0.14556567488821576,\n","   'precision': 0.1228947368421052,\n","   'recall': 0.23680116546834148,\n","   'nDCG': 0.18633380335568112}},\n"," 1.5: {3: {'f_1': 0.02137001771910353,\n","   'precision': 0.011315789473684215,\n","   'recall': 0.19736842105263155,\n","   'nDCG': 0.36761259805323043},\n","  4: {'f_1': 0.028781103084508642,\n","   'precision': 0.015526315789473692,\n","   'recall': 0.20394736842105263,\n","   'nDCG': 0.3522630905573355},\n","  5: {'f_1': 0.03735810113519091,\n","   'precision': 0.020526315789473698,\n","   'recall': 0.2157894736842105,\n","   'nDCG': 0.34057203643295536},\n","  6: {'f_1': 0.044219003390830014,\n","   'precision': 0.024736842105263175,\n","   'recall': 0.2171052631578947,\n","   'nDCG': 0.33648246659831077},\n","  7: {'f_1': 0.052273679554924506,\n","   'precision': 0.029736842105263173,\n","   'recall': 0.22556390977443613,\n","   'nDCG': 0.3273528742202902},\n","  10: {'f_1': 0.07406262545006928,\n","   'precision': 0.0442105263157895,\n","   'recall': 0.24057017543859646,\n","   'nDCG': 0.30027573618182124},\n","  20: {'f_1': 0.11357254720520335,\n","   'precision': 0.07657894736842104,\n","   'recall': 0.24683065472539154,\n","   'nDCG': 0.25271751792123504},\n","  30: {'f_1': 0.12714793205359917,\n","   'precision': 0.09421052631578944,\n","   'recall': 0.2360219399307421,\n","   'nDCG': 0.2304155597260177},\n","  40: {'f_1': 0.139247256443902,\n","   'precision': 0.11105263157894732,\n","   'recall': 0.237016374943478,\n","   'nDCG': 0.20663999242512765},\n","  50: {'f_1': 0.14541447319529882,\n","   'precision': 0.1228947368421052,\n","   'recall': 0.23599731372837882,\n","   'nDCG': 0.19022281905314956}},\n"," 2: {3: {'f_1': 0.02137001771910353,\n","   'precision': 0.011315789473684215,\n","   'recall': 0.19736842105263155,\n","   'nDCG': 0.35217579785511344},\n","  4: {'f_1': 0.028781103084508642,\n","   'precision': 0.015526315789473692,\n","   'recall': 0.20394736842105263,\n","   'nDCG': 0.34827045924291733},\n","  5: {'f_1': 0.03735810113519091,\n","   'precision': 0.020526315789473698,\n","   'recall': 0.2157894736842105,\n","   'nDCG': 0.33882964121847015},\n","  6: {'f_1': 0.044219003390830014,\n","   'precision': 0.024736842105263175,\n","   'recall': 0.2171052631578947,\n","   'nDCG': 0.3363461610398492},\n","  7: {'f_1': 0.052273679554924506,\n","   'precision': 0.029736842105263173,\n","   'recall': 0.22556390977443613,\n","   'nDCG': 0.3308458515733193},\n","  10: {'f_1': 0.07450122194129735,\n","   'precision': 0.044473684210526346,\n","   'recall': 0.2418859649122807,\n","   'nDCG': 0.30593150319079393},\n","  20: {'f_1': 0.11400395359001787,\n","   'precision': 0.07684210526315786,\n","   'recall': 0.2480268269741954,\n","   'nDCG': 0.2567146283321391},\n","  30: {'f_1': 0.12757933843841368,\n","   'precision': 0.09447368421052628,\n","   'recall': 0.23721811217954594,\n","   'nDCG': 0.23066769026982406},\n","  40: {'f_1': 0.13967866282871652,\n","   'precision': 0.11131578947368415,\n","   'recall': 0.23821254719228185,\n","   'nDCG': 0.2063959838017688},\n","  50: {'f_1': 0.1458458795801133,\n","   'precision': 0.12315789473684202,\n","   'recall': 0.23719348597718257,\n","   'nDCG': 0.1905707583249521}},\n"," 2.5: {3: {'f_1': 0.02137001771910353,\n","   'precision': 0.011315789473684215,\n","   'recall': 0.19736842105263155,\n","   'nDCG': 0.3583505179343602},\n","  4: {'f_1': 0.028781103084508642,\n","   'precision': 0.015526315789473692,\n","   'recall': 0.20394736842105263,\n","   'nDCG': 0.34455822114959667},\n","  5: {'f_1': 0.03735810113519091,\n","   'precision': 0.020526315789473698,\n","   'recall': 0.2157894736842105,\n","   'nDCG': 0.34078361540846763},\n","  6: {'f_1': 0.044219003390830014,\n","   'precision': 0.024736842105263175,\n","   'recall': 0.2171052631578947,\n","   'nDCG': 0.34092607470195774},\n","  7: {'f_1': 0.052273679554924506,\n","   'precision': 0.029736842105263173,\n","   'recall': 0.22556390977443613,\n","   'nDCG': 0.3313893347571242},\n","  10: {'f_1': 0.07450122194129735,\n","   'precision': 0.044473684210526346,\n","   'recall': 0.2418859649122807,\n","   'nDCG': 0.30814484944827053},\n","  20: {'f_1': 0.11362801374039382,\n","   'precision': 0.07657894736842104,\n","   'recall': 0.24736893223735326,\n","   'nDCG': 0.25702637613212626},\n","  30: {'f_1': 0.1272503910699926,\n","   'precision': 0.09421052631578944,\n","   'recall': 0.23677951568831784,\n","   'nDCG': 0.22942427360812398},\n","  40: {'f_1': 0.1393862651678978,\n","   'precision': 0.11105263157894732,\n","   'recall': 0.2378835998238608,\n","   'nDCG': 0.20674307592426877},\n","  50: {'f_1': 0.14558272168537648,\n","   'precision': 0.1228947368421052,\n","   'recall': 0.2369303280824458,\n","   'nDCG': 0.19068412896415185}},\n"," 3: {3: {'f_1': 0.02137001771910353,\n","   'precision': 0.011315789473684215,\n","   'recall': 0.19736842105263155,\n","   'nDCG': 0.3560716124730853},\n","  4: {'f_1': 0.028781103084508642,\n","   'precision': 0.015526315789473692,\n","   'recall': 0.20394736842105263,\n","   'nDCG': 0.3426624624489207},\n","  5: {'f_1': 0.03735810113519091,\n","   'precision': 0.020526315789473698,\n","   'recall': 0.2157894736842105,\n","   'nDCG': 0.33741020727621196},\n","  6: {'f_1': 0.044219003390830014,\n","   'precision': 0.024736842105263175,\n","   'recall': 0.2171052631578947,\n","   'nDCG': 0.33791628338014396},\n","  7: {'f_1': 0.052273679554924506,\n","   'precision': 0.029736842105263173,\n","   'recall': 0.22556390977443613,\n","   'nDCG': 0.32865531690243627},\n","  10: {'f_1': 0.07450122194129735,\n","   'precision': 0.044473684210526346,\n","   'recall': 0.2418859649122807,\n","   'nDCG': 0.30763712181088515},\n","  20: {'f_1': 0.11362801374039382,\n","   'precision': 0.07657894736842104,\n","   'recall': 0.24736893223735326,\n","   'nDCG': 0.2571290703967697},\n","  30: {'f_1': 0.1272503910699926,\n","   'precision': 0.09421052631578944,\n","   'recall': 0.23677951568831784,\n","   'nDCG': 0.2282620325948478},\n","  40: {'f_1': 0.1393862651678978,\n","   'precision': 0.11105263157894732,\n","   'recall': 0.2378835998238608,\n","   'nDCG': 0.2060150933125378},\n","  50: {'f_1': 0.14558272168537648,\n","   'precision': 0.1228947368421052,\n","   'recall': 0.2369303280824458,\n","   'nDCG': 0.19024116999894183}}}"]},"execution_count":125,"metadata":{},"output_type":"execute_result"}],"source":["scores_lam"]},{"cell_type":"code","execution_count":null,"metadata":{"trusted":true},"outputs":[],"source":[]},{"cell_type":"code","execution_count":95,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T21:00:40.206275Z","iopub.status.busy":"2024-01-03T21:00:40.205513Z","iopub.status.idle":"2024-01-03T21:00:40.229921Z","shell.execute_reply":"2024-01-03T21:00:40.227362Z","shell.execute_reply.started":"2024-01-03T21:00:40.206205Z"},"trusted":true},"outputs":[],"source":["lam_2_f1 = []\n","#lam_4_f1 = []\n","#lam_5_f1 = []\n","lam_1_2_f1 = []\n","lam_1_5_f1 = []\n","lam_2_5_f1 = []\n","lam_3_f1 = []\n","#lam_10_f1 = []\n","\n","lam_2_nDCG = []\n","#lam_4_nDCG = []\n","#lam_5_nDCG = []\n","lam_1_2_nDCG = []\n","lam_1_5_nDCG = []\n","lam_2_5_nDCG = []\n","lam_3_nDCG = []\n","#lam_10_nDCG = []\n","\n","for k in k_values:\n","    lam_2_f1.append(scores_lam[2][k]['f_1'])\n"," #   lam_4_f1.append(scores_lam[4][k]['f_1'])\n"," #   lam_5_f1.append(scores_lam[5][k]['f_1'])\n","    lam_1_2_f1.append(scores_lam[1.2][k]['f_1'])\n","    lam_1_5_f1.append(scores_lam[1.5][k]['f_1'])\n","    lam_2_5_f1.append(scores_lam[2.5][k]['f_1'])\n","    lam_3_f1.append(scores_lam[3][k]['f_1'])\n"," #   lam_10_f1.append(scores_lam[10][k]['f_1'])\n","    \n","    lam_2_nDCG.append(scores_lam[2][k]['nDCG'])\n"," #   lam_4_nDCG.append(scores_lam[4][k]['nDCG'])\n"," #   lam_5_nDCG.append(scores_lam[5][k]['nDCG'])\n","    lam_1_2_nDCG.append(scores_lam[1.2][k]['nDCG'])\n","    lam_1_5_nDCG.append(scores_lam[1.5][k]['nDCG'])\n","    lam_2_5_nDCG.append(scores_lam[2.5][k]['nDCG'])\n","    lam_3_nDCG.append(scores_lam[3][k]['nDCG'])\n"," #   lam_10_nDCG.append(scores_lam[10][k]['nDCG'])"]},{"cell_type":"code","execution_count":102,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T21:02:40.020032Z","iopub.status.busy":"2024-01-03T21:02:40.019397Z","iopub.status.idle":"2024-01-03T21:02:40.336791Z","shell.execute_reply":"2024-01-03T21:02:40.335971Z","shell.execute_reply.started":"2024-01-03T21:02:40.019976Z"},"trusted":true},"outputs":[{"data":{"image/png":"","text/plain":["<Figure size 1440x864 with 1 Axes>"]},"metadata":{"needs_background":"light"},"output_type":"display_data"}],"source":["plt.figure(figsize=(20, 12))\n","#plt.plot(k_values, lam_2_f1, marker='o', label='lam=2')\n","#plt.plot(k_values, lam_4_f1, marker='s', label='lam=4')\n","#plt.plot(k_values, lam_5_f1, marker='^', label='lam=5')\n","plt.plot(k_values, lam_1_2_f1, marker='*', label='lam=1.2')\n","#plt.plot(k_values, lam_1_5_f1, marker='D', label='lam=1.5')\n","#plt.plot(k_values, lam_2_5_f1, marker='h', label='lam=2.5')\n","plt.plot(k_values, lam_3_f1, marker='.', label='lam=3')\n","#plt.plot(k_values, lam_10_f1, marker='x', label='lam=5')\n","\n","\n","plt.title('F1 Score by k values for lam values BERTopic')\n","plt.xlabel('k value')\n","plt.ylabel('F1 Score')\n","plt.legend()\n","plt.grid(True)\n","plt.show()"]},{"cell_type":"code","execution_count":98,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T21:01:22.782430Z","iopub.status.busy":"2024-01-03T21:01:22.782006Z","iopub.status.idle":"2024-01-03T21:01:23.036331Z","shell.execute_reply":"2024-01-03T21:01:23.034616Z","shell.execute_reply.started":"2024-01-03T21:01:22.782393Z"},"trusted":true},"outputs":[{"data":{"image/png":"","text/plain":["<Figure size 720x432 with 1 Axes>"]},"metadata":{"needs_background":"light"},"output_type":"display_data"}],"source":["plt.figure(figsize=(10, 6))\n","plt.plot(k_values, lam_2_nDCG, marker='o', label='lam=2')\n","#plt.plot(k_values, lam_4_nDCG, marker='s', label='lam=4')\n","#plt.plot(k_values, lam_5_nDCG, marker='^', label='lam=5')\n","plt.plot(k_values, lam_1_2_nDCG, marker='*', label='lam=5')\n","plt.plot(k_values, lam_1_5_nDCG, marker='D', label='lam=5')\n","plt.plot(k_values, lam_2_5_nDCG, marker='h', label='lam=5')\n","plt.plot(k_values, lam_3_nDCG, marker='.', label='lam=5')\n","#plt.plot(k_values, lam_10_nDCG, marker='x', label='lam=5')\n","\n","plt.title('nDCG Score by k values for lam values BERTopic')\n","plt.xlabel('k value')\n","plt.ylabel('nDCG Score')\n","plt.legend()\n","plt.grid(True)\n","plt.show()"]}],"metadata":{"kaggle":{"accelerator":"none","dataSources":[{"datasetId":6763,"sourceId":9801,"sourceType":"datasetVersion"},{"datasetId":576263,"sourceId":1043323,"sourceType":"datasetVersion"},{"datasetId":4135603,"sourceId":7160356,"sourceType":"datasetVersion"},{"datasetId":4137237,"sourceId":7162602,"sourceType":"datasetVersion"}],"dockerImageVersionId":30120,"isGpuEnabled":false,"isInternetEnabled":true,"language":"python","sourceType":"notebook"},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.7.10"}},"nbformat":4,"nbformat_minor":4}