Newer
Older
{"cells":[{"cell_type":"markdown","metadata":{},"source":["<h2> Imports </h2>"]},{"cell_type":"code","execution_count":1,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T18:52:18.410005Z","iopub.status.busy":"2024-01-03T18:52:18.409290Z","iopub.status.idle":"2024-01-03T18:56:01.555478Z","shell.execute_reply":"2024-01-03T18:56:01.553732Z","shell.execute_reply.started":"2024-01-03T18:52:18.409894Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["Collecting tensorflow==2.5\n"," Downloading tensorflow-2.5.0-cp37-cp37m-manylinux2010_x86_64.whl (454.3 MB)\n","\u001b[K |████████████████████████████████| 454.3 MB 11 kB/s s eta 0:00:01 |█▌ | 20.7 MB 7.4 MB/s eta 0:00:59 |█████████████████████████▋ | 363.2 MB 62.2 MB/s eta 0:00:02 |██████████████████████████████▏ | 427.8 MB 46.2 MB/s eta 0:00:01\n","\u001b[?25hRequirement already satisfied: opt-einsum~=3.3.0 in /opt/conda/lib/python3.7/site-packages (from tensorflow==2.5) (3.3.0)\n","Requirement already satisfied: astunparse~=1.6.3 in /opt/conda/lib/python3.7/site-packages (from tensorflow==2.5) (1.6.3)\n","Requirement already satisfied: absl-py~=0.10 in /opt/conda/lib/python3.7/site-packages (from tensorflow==2.5) (0.12.0)\n","Requirement already satisfied: typing-extensions~=3.7.4 in /opt/conda/lib/python3.7/site-packages (from tensorflow==2.5) (3.7.4.3)\n","Requirement already satisfied: keras-preprocessing~=1.1.2 in /opt/conda/lib/python3.7/site-packages (from tensorflow==2.5) (1.1.2)\n","Collecting h5py~=3.1.0\n"," Downloading h5py-3.1.0-cp37-cp37m-manylinux1_x86_64.whl (4.0 MB)\n","\u001b[K |████████████████████████████████| 4.0 MB 57.1 MB/s eta 0:00:01\n","\u001b[?25hCollecting gast==0.4.0\n"," Downloading gast-0.4.0-py3-none-any.whl (9.8 kB)\n","Requirement already satisfied: protobuf>=3.9.2 in /opt/conda/lib/python3.7/site-packages (from tensorflow==2.5) (3.17.3)\n","Collecting keras-nightly~=2.5.0.dev\n"," Downloading keras_nightly-2.5.0.dev2021032900-py2.py3-none-any.whl (1.2 
MB)\n","\u001b[K |████████████████████████████████| 1.2 MB 42.3 MB/s eta 0:00:01\n","\u001b[?25hRequirement already satisfied: wheel~=0.35 in /opt/conda/lib/python3.7/site-packages (from tensorflow==2.5) (0.36.2)\n","Requirement already satisfied: google-pasta~=0.2 in /opt/conda/lib/python3.7/site-packages (from tensorflow==2.5) (0.2.0)\n","Requirement already satisfied: six~=1.15.0 in /opt/conda/lib/python3.7/site-packages (from tensorflow==2.5) (1.15.0)\n","Requirement already satisfied: termcolor~=1.1.0 in /opt/conda/lib/python3.7/site-packages (from tensorflow==2.5) (1.1.0)\n","Requirement already satisfied: flatbuffers~=1.12.0 in /opt/conda/lib/python3.7/site-packages (from tensorflow==2.5) (1.12)\n","Collecting grpcio~=1.34.0\n"," Downloading grpcio-1.34.1-cp37-cp37m-manylinux2014_x86_64.whl (4.0 MB)\n","\u001b[K |████████████████████████████████| 4.0 MB 56.1 MB/s eta 0:00:01\n","\u001b[?25hCollecting tensorflow-estimator<2.6.0,>=2.5.0rc0\n"," Downloading tensorflow_estimator-2.5.0-py2.py3-none-any.whl (462 kB)\n","\u001b[K |████████████████████████████████| 462 kB 61.4 MB/s eta 0:00:01\n","\u001b[?25hRequirement already satisfied: numpy~=1.19.2 in /opt/conda/lib/python3.7/site-packages (from tensorflow==2.5) (1.19.5)\n","Collecting tensorboard~=2.5\n"," Downloading tensorboard-2.11.2-py3-none-any.whl (6.0 MB)\n","\u001b[K |████████████████████████████████| 6.0 MB 38.8 MB/s eta 0:00:01\n","\u001b[?25hRequirement already satisfied: wrapt~=1.12.1 in /opt/conda/lib/python3.7/site-packages (from tensorflow==2.5) (1.12.1)\n","Collecting cached-property\n"," Downloading cached_property-1.5.2-py2.py3-none-any.whl (7.6 kB)\n","Requirement already satisfied: markdown>=2.6.8 in /opt/conda/lib/python3.7/site-packages (from tensorboard~=2.5->tensorflow==2.5) (3.3.4)\n","Requirement already satisfied: google-auth<3,>=1.6.3 in /opt/conda/lib/python3.7/site-packages (from tensorboard~=2.5->tensorflow==2.5) (1.30.2)\n","Requirement already satisfied: requests<3,>=2.21.0 in 
/opt/conda/lib/python3.7/site-packages (from tensorboard~=2.5->tensorflow==2.5) (2.25.1)\n","Requirement already satisfied: google-auth-oauthlib<0.5,>=0.4.1 in /opt/conda/lib/python3.7/site-packages (from tensorboard~=2.5->tensorflow==2.5) (0.4.4)\n","Requirement already satisfied: setuptools>=41.0.0 in /opt/conda/lib/python3.7/site-packages (from tensorboard~=2.5->tensorflow==2.5) (49.6.0.post20210108)\n","Requirement already satisfied: werkzeug>=1.0.1 in /opt/conda/lib/python3.7/site-packages (from tensorboard~=2.5->tensorflow==2.5) (2.0.1)\n","Requirement already satisfied: tensorboard-data-server<0.7.0,>=0.6.0 in /opt/conda/lib/python3.7/site-packages (from tensorboard~=2.5->tensorflow==2.5) (0.6.1)\n","Requirement already satisfied: tensorboard-plugin-wit>=1.6.0 in /opt/conda/lib/python3.7/site-packages (from tensorboard~=2.5->tensorflow==2.5) (1.8.0)\n","Requirement already satisfied: cachetools<5.0,>=2.0.0 in /opt/conda/lib/python3.7/site-packages (from google-auth<3,>=1.6.3->tensorboard~=2.5->tensorflow==2.5) (4.2.2)\n","Requirement already satisfied: rsa<5,>=3.1.4 in /opt/conda/lib/python3.7/site-packages (from google-auth<3,>=1.6.3->tensorboard~=2.5->tensorflow==2.5) (4.7.2)\n","Requirement already satisfied: pyasn1-modules>=0.2.1 in /opt/conda/lib/python3.7/site-packages (from google-auth<3,>=1.6.3->tensorboard~=2.5->tensorflow==2.5) (0.2.7)\n","Requirement already satisfied: requests-oauthlib>=0.7.0 in /opt/conda/lib/python3.7/site-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard~=2.5->tensorflow==2.5) (1.3.0)\n","Requirement already satisfied: importlib-metadata in /opt/conda/lib/python3.7/site-packages (from markdown>=2.6.8->tensorboard~=2.5->tensorflow==2.5) (3.4.0)\n","Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /opt/conda/lib/python3.7/site-packages (from pyasn1-modules>=0.2.1->google-auth<3,>=1.6.3->tensorboard~=2.5->tensorflow==2.5) (0.4.8)\n","Requirement already satisfied: idna<3,>=2.5 in 
/opt/conda/lib/python3.7/site-packages (from requests<3,>=2.21.0->tensorboard~=2.5->tensorflow==2.5) (2.10)\n","Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.7/site-packages (from requests<3,>=2.21.0->tensorboard~=2.5->tensorflow==2.5) (2021.5.30)\n","Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.7/site-packages (from requests<3,>=2.21.0->tensorboard~=2.5->tensorflow==2.5) (1.26.5)\n","Requirement already satisfied: chardet<5,>=3.0.2 in /opt/conda/lib/python3.7/site-packages (from requests<3,>=2.21.0->tensorboard~=2.5->tensorflow==2.5) (4.0.0)\n","Requirement already satisfied: oauthlib>=3.0.0 in /opt/conda/lib/python3.7/site-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard~=2.5->tensorflow==2.5) (3.1.1)\n","Requirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.7/site-packages (from importlib-metadata->markdown>=2.6.8->tensorboard~=2.5->tensorflow==2.5) (3.4.1)\n","Installing collected packages: grpcio, cached-property, tensorflow-estimator, tensorboard, keras-nightly, h5py, gast, tensorflow\n"," Attempting uninstall: grpcio\n"," Found existing installation: grpcio 1.32.0\n"," Uninstalling grpcio-1.32.0:\n"," Successfully uninstalled grpcio-1.32.0\n"," Attempting uninstall: tensorflow-estimator\n"," Found existing installation: tensorflow-estimator 2.4.0\n"," Uninstalling tensorflow-estimator-2.4.0:\n"," Successfully uninstalled tensorflow-estimator-2.4.0\n"," Attempting uninstall: tensorboard\n"," Found existing installation: tensorboard 2.4.1\n"," Uninstalling tensorboard-2.4.1:\n"," Successfully uninstalled tensorboard-2.4.1\n"," Attempting uninstall: h5py\n"," Found existing installation: h5py 2.10.0\n"," Uninstalling h5py-2.10.0:\n"," Successfully uninstalled h5py-2.10.0\n"," Attempting uninstall: gast\n"," Found existing installation: gast 0.3.3\n"," Uninstalling gast-0.3.3:\n"," Successfully uninstalled gast-0.3.3\n"," Attempting uninstall: 
tensorflow\n"," Found existing installation: tensorflow 2.4.1\n"," Uninstalling tensorflow-2.4.1:\n"," Successfully uninstalled tensorflow-2.4.1\n","Successfully installed cached-property-1.5.2 gast-0.4.0 grpcio-1.34.1 h5py-3.1.0 keras-nightly-2.5.0.dev2021032900 tensorboard-2.11.2 tensorflow-2.5.0 tensorflow-estimator-2.5.0\n","\u001b[33mWARNING: Running pip as root will break packages and permissions. You should install packages reliably by using venv: https://pip.pypa.io/warnings/venv\u001b[0m\n","Collecting rank_bm25\n"," Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)\n","Requirement already satisfied: numpy in /opt/conda/lib/python3.7/site-packages (from rank_bm25) (1.19.5)\n","Installing collected packages: rank-bm25\n","Successfully installed rank-bm25-0.2.2\n","\u001b[33mWARNING: Running pip as root will break packages and permissions. You should install packages reliably by using venv: https://pip.pypa.io/warnings/venv\u001b[0m\n","\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n","yellowbrick 1.3.post1 requires numpy<1.20,>=1.16.0, but you have numpy 1.21.6 which is incompatible.\n","tensorflow 2.5.0 requires numpy~=1.19.2, but you have numpy 1.21.6 which is incompatible.\n","s3fs 2021.6.1 requires fsspec==2021.06.1, but you have fsspec 2023.1.0 which is incompatible.\n","pdpbox 0.2.1 requires matplotlib==3.1.1, but you have matplotlib 3.4.2 which is incompatible.\n","matrixprofile 1.1.10 requires protobuf==3.11.2, but you have protobuf 3.17.3 which is incompatible.\n","kornia 0.5.5 requires numpy<=1.19, but you have numpy 1.21.6 which is incompatible.\n","imbalanced-learn 0.8.0 requires scikit-learn>=0.24, but you have scikit-learn 0.23.2 which is incompatible.\n","gcsfs 2021.6.0 requires fsspec==2021.06.0, but you have fsspec 2023.1.0 which is incompatible.\n","allennlp 2.5.0 requires transformers<4.7,>=4.1, but you have transformers 4.30.2 which is incompatible.\u001b[0m\n","\u001b[33mWARNING: Running pip as root will break packages and permissions. 
You should install packages reliably by using venv: https://pip.pypa.io/warnings/venv\u001b[0m\n","Note: you may need to restart the kernel to use updated packages.\n"]}],"source":["%pip install tensorflow==2.5\n","# the version specifier is quoted: %pip goes through the system shell, where a bare > would redirect pip's output into a file named '=2.11'\n","%pip install -Uq sentence-transformers faiss-cpu accelerate hdbscan bertopic evaluate kaleido \"datasets>=2.11\""]},{"cell_type":"code","execution_count":2,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T18:56:01.558915Z","iopub.status.busy":"2024-01-03T18:56:01.558368Z","iopub.status.idle":"2024-01-03T18:56:32.924438Z","shell.execute_reply":"2024-01-03T18:56:32.919977Z","shell.execute_reply.started":"2024-01-03T18:56:01.558854Z"},"trusted":true},"outputs":[],"source":["import numpy as np\n","import nltk\n","from nltk.corpus import stopwords\n","from nltk.tokenize import word_tokenize\n","import re\n","stopword_list = nltk.corpus.stopwords.words('english')\n","import pandas as pd\n","from tqdm import tqdm\n","tqdm.pandas()\n","from bertopic import BERTopic\n","from bertopic.vectorizers import ClassTfidfTransformer\n","import numpy as np\n","import pandas as pd \n","import os\n","import string\n","from nltk.stem import WordNetLemmatizer\n","import pickle \n"]},{"cell_type":"markdown","metadata":{},"source":["<h2> Load datasets and model</h2>"]},{"cell_type":"code","execution_count":3,"metadata":{},"outputs":[],"source":["# NOTE(review): pickle.load can run arbitrary code - only open result files that were produced by this project\n","with open('./results/initial_retrieval_with_bm25_scores.pkl', 'rb') as f:\n"," initial_retrieval_with_bm25_scores = pickle.load(f)"]},{"cell_type":"code","execution_count":4,"metadata":{"_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","execution":{"iopub.execute_input":"2024-01-03T18:56:32.931742Z","iopub.status.busy":"2024-01-03T18:56:32.929029Z","iopub.status.idle":"2024-01-03T18:56:33.045085Z","shell.execute_reply":"2024-01-03T18:56:33.043862Z","shell.execute_reply.started":"2024-01-03T18:56:32.931643Z"},"trusted":true},"outputs":[],"source":["queries = 
pd.read_csv(\"./data/cisi-csv/queries.csv\")\n","docs = pd.read_csv(\"./data/cisi-csv/docs.csv\")\n","rels = pd.read_csv(\"./data/cisi-csv/rels.csv\")\n","\n","full_doc = docs['text'].to_list()\n","full_query = queries['text'].to_list()"]},{"cell_type":"code","execution_count":5,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T18:56:33.047676Z","iopub.status.busy":"2024-01-03T18:56:33.046992Z","iopub.status.idle":"2024-01-03T18:56:34.042871Z","shell.execute_reply":"2024-01-03T18:56:34.041391Z","shell.execute_reply.started":"2024-01-03T18:56:33.047624Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["There are 36 queries without a groundtruth.\n","Remaining queries: 76.\n"]},{"data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>id</th>\n"," <th>text</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>1</td>\n"," <td>What problems and concerns are there in making...</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>2</td>\n"," <td>How can actually pertinent data, as opposed to...</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>3</td>\n"," <td>What is information science? 
Give definitions ...</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>4</td>\n"," <td>Image recognition and any other methods of aut...</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>5</td>\n"," <td>What special training will ordinary researcher...</td>\n"," </tr>\n"," <tr>\n"," <th>...</th>\n"," <td>...</td>\n"," <td>...</td>\n"," </tr>\n"," <tr>\n"," <th>100</th>\n"," <td>101</td>\n"," <td>.T Parallel Computations in Information Retrie...</td>\n"," </tr>\n"," <tr>\n"," <th>101</th>\n"," <td>102</td>\n"," <td>.T The Measurement of Term Importance in Autom...</td>\n"," </tr>\n"," <tr>\n"," <th>103</th>\n"," <td>104</td>\n"," <td>.T The Selection of Good Search Terms .A van R...</td>\n"," </tr>\n"," <tr>\n"," <th>108</th>\n"," <td>109</td>\n"," <td>.T Author Cocitation: A Literature Measure of ...</td>\n"," </tr>\n"," <tr>\n"," <th>110</th>\n"," <td>111</td>\n"," <td>.T Document Clustering Using an Inverted File ...</td>\n"," </tr>\n"," </tbody>\n","</table>\n","<p>76 rows × 2 columns</p>\n","</div>"],"text/plain":[" id text\n","0 1 What problems and concerns are there in making...\n","1 2 How can actually pertinent data, as opposed to...\n","2 3 What is information science? Give definitions ...\n","3 4 Image recognition and any other methods of aut...\n","4 5 What special training will ordinary researcher...\n",".. ... ...\n","100 101 .T Parallel Computations in Information Retrie...\n","101 102 .T The Measurement of Term Importance in Autom...\n","103 104 .T The Selection of Good Search Terms .A van R...\n","108 109 .T Author Cocitation: A Literature Measure of ...\n","110 111 .T Document Clustering Using an Inverted File ...\n","\n","[76 rows x 2 columns]"]},"execution_count":5,"metadata":{},"output_type":"execute_result"}],"source":["# TODO: this is not really necessary I think? 
because\n","#remove queries where we don't have a groundtruth for:\n","queries_wo_gt = [36,38,40,47,48,51,53,59,60,63,64,68,70,72,73,74,75,77,78,80,83,85,86,87,88,89,91,93,94,103,105,106,107,108,110,112]\n","# build the mask first so the printed count is taken from the data and re-running the cell cannot change it\n","has_groundtruth = ~queries['id'].isin(queries_wo_gt)\n","print(f'There are {len(queries_wo_gt)} queries without a groundtruth.')\n","print(f'Remaining queries: {int(has_groundtruth.sum())}.')\n","\n","queries = queries[has_groundtruth]\n","queries"]},{"cell_type":"code","execution_count":6,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T18:57:01.149061Z","iopub.status.busy":"2024-01-03T18:57:01.148640Z","iopub.status.idle":"2024-01-03T18:57:01.166124Z","shell.execute_reply":"2024-01-03T18:57:01.164536Z","shell.execute_reply.started":"2024-01-03T18:57:01.149026Z"},"trusted":true},"outputs":[],"source":["def data_clean(text):\n","    \"\"\"Remove non-alphanumeric characters and English stopwords.\n","\n","    `text` is a string or an iterable of strings (joined with spaces first).\n","    Returns the surviving tokens as one space-separated string.\n","    \"\"\"\n","    # a plain string must not be passed to ' '.join(): that would put a space between every character\n","    if not isinstance(text, str):\n","        text = ' '.join(text)\n","    text = re.sub(r'[^a-zA-Z0-9\\s]', '', text)\n","    kept = [token for token in text.split() if token.lower() not in stopword_list]\n","    return ' '.join(kept)\n","\n","# same character filter as data_clean, but for one dataframe text value and without stopword removal\n","def data_clean_df(text):\n","    \"\"\"Keep only alphanumeric characters and collapse every whitespace run to one space.\"\"\"\n","    text = re.sub(r'[^a-zA-Z0-9\\s]', '', text)\n","    return ' '.join(text.split())\n","\n","\n","#function is needed to get the texts of the relevant documents from initial retrieval\n","def get_texts_from_df(doc_ids, df):\n","    \"\"\"Return the 'text' values of all rows of `df` whose 'id' is in `doc_ids`, in `df` row order.\"\"\"\n","    return df[df['id'].isin(doc_ids)]['text'].tolist()\n","\n","#some queries have a .T in the beginning we want to remove this\n","def clean_query(text):\n","    \"\"\"Drop a leading '.T ' marker from `text` and normalise whitespace.\"\"\"\n","    # the pattern used to be defined but never applied, so the marker was never removed\n","    text = re.sub(r'^\\.T\\s', '', text)\n","    return ' '.join(text.split())\n","\n","#special pre-processing for bm25, because for embeddings we don't want to pre-process that much\n","# built once instead of on every call: these are the same for every text\n","_BM25_STOP_WORDS = set(stopwords.words(\"english\"))\n","_BM25_LEMMATIZER = WordNetLemmatizer()\n","_BM25_PUNCTUATION = str.maketrans('', '', string.punctuation)\n","\n","def data_clean_for_bm25(text):\n","    \"\"\"Turn `text` into a list of lemmas for BM25.\n","\n","    Steps: lowercase, remove digits and punctuation, 
normalise whitespace,\n","    tokenise, drop English stopwords, lemmatise each remaining token.\n","    \"\"\"\n","    text = text.lower()\n","    text = re.sub(r'\\d+', '', text)\n","    text = text.translate(_BM25_PUNCTUATION)\n","    text = \" \".join(text.split())\n","    return [_BM25_LEMMATIZER.lemmatize(word) for word in word_tokenize(text) if word not in _BM25_STOP_WORDS]"]},{"cell_type":"code","execution_count":7,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T18:57:04.266231Z","iopub.status.busy":"2024-01-03T18:57:04.265732Z","iopub.status.idle":"2024-01-03T18:57:10.812646Z","shell.execute_reply":"2024-01-03T18:57:10.811193Z","shell.execute_reply.started":"2024-01-03T18:57:04.266183Z"},"trusted":true},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>id</th>\n"," <th>text</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>1</td>\n"," <td>[problem, concern, making, descriptive, title,...</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>2</td>\n"," <td>[actually, pertinent, data, opposed, reference...</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>3</td>\n"," <td>[information, science, give, definition, possi...</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>4</td>\n"," <td>[image, recognition, method, automatically, tr...</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>5</td>\n"," <td>[special, training, ordinary, 
researcher, busi...</td>\n"," </tr>\n"," <tr>\n"," <th>...</th>\n"," <td>...</td>\n"," <td>...</td>\n"," </tr>\n"," <tr>\n"," <th>100</th>\n"," <td>101</td>\n"," <td>[parallel, computation, information, retrieval...</td>\n"," </tr>\n"," <tr>\n"," <th>101</th>\n"," <td>102</td>\n"," <td>[measurement, term, importance, automatic, ind...</td>\n"," </tr>\n"," <tr>\n"," <th>103</th>\n"," <td>104</td>\n"," <td>[selection, good, search, term, van, rijsberge...</td>\n"," </tr>\n"," <tr>\n"," <th>108</th>\n"," <td>109</td>\n"," <td>[author, cocitation, literature, measure, inte...</td>\n"," </tr>\n"," <tr>\n"," <th>110</th>\n"," <td>111</td>\n"," <td>[document, clustering, using, inverted, file, ...</td>\n"," </tr>\n"," </tbody>\n","</table>\n","<p>76 rows × 2 columns</p>\n","</div>"],"text/plain":[" id text\n","0 1 [problem, concern, making, descriptive, title,...\n","1 2 [actually, pertinent, data, opposed, reference...\n","2 3 [information, science, give, definition, possi...\n","3 4 [image, recognition, method, automatically, tr...\n","4 5 [special, training, ordinary, researcher, busi...\n",".. ... 
...\n","100 101 [parallel, computation, information, retrieval...\n","101 102 [measurement, term, importance, automatic, ind...\n","103 104 [selection, good, search, term, van, rijsberge...\n","108 109 [author, cocitation, literature, measure, inte...\n","110 111 [document, clustering, using, inverted, file, ...\n","\n","[76 rows x 2 columns]"]},"execution_count":7,"metadata":{},"output_type":"execute_result"}],"source":["# clean_query runs on the raw text first: data_clean_df deletes the '.' that a leading '.T' marker is recognised by\n","queries_cleaned = queries.copy()\n","queries_cleaned['text'] = queries_cleaned['text'].apply(clean_query).apply(data_clean_df)\n","\n","docs_cleaned = docs.copy()\n","docs_cleaned['text'] = docs_cleaned['text'].apply(data_clean_df)\n","\n","# BM25 gets the heavier, tokenised pre-processing\n","queries_cleaned_bm25 = queries.copy()\n","queries_cleaned_bm25['text'] = queries_cleaned_bm25['text'].apply(data_clean_for_bm25)\n","\n","docs_cleaned_bm25 = docs.copy()\n","docs_cleaned_bm25['text'] = docs_cleaned_bm25['text'].apply(data_clean_for_bm25)\n","queries_cleaned_bm25"]},{"cell_type":"markdown","metadata":{},"source":["## Initializing and fitting BERTopic model"]},{"cell_type":"code","execution_count":8,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T18:57:52.932829Z","iopub.status.busy":"2024-01-03T18:57:52.932037Z","iopub.status.idle":"2024-01-03T18:57:52.940868Z","shell.execute_reply":"2024-01-03T18:57:52.939770Z","shell.execute_reply.started":"2024-01-03T18:57:52.932765Z"},"trusted":true},"outputs":[],"source":["from umap import UMAP\n","\n","model_name = 'sentence-transformers/all-MiniLM-L6-v2'\n","UMAP_SEED = 42\n","\n","# UMAP is stochastic, so the topics change between runs unless its random_state is fixed\n","def make_umap_model(seed=UMAP_SEED):\n","    \"\"\"Return a fresh seeded UMAP reducer, so that each BERTopic model owns its own instance.\"\"\"\n","    return UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=seed)\n","\n","topic_model = BERTopic(embedding_model=model_name, 
umap_model=make_umap_model(), ctfidf_model=ClassTfidfTransformer(reduce_frequent_words=True), calculate_probabilities=True)\n","topic_model_auto = BERTopic(embedding_model=model_name, umap_model=make_umap_model(), ctfidf_model=ClassTfidfTransformer(reduce_frequent_words=True), nr_topics=\"auto\", calculate_probabilities=True)"]},{"cell_type":"code","execution_count":10,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T18:57:55.031840Z","iopub.status.busy":"2024-01-03T18:57:55.031137Z","iopub.status.idle":"2024-01-03T19:00:05.623076Z","shell.execute_reply":"2024-01-03T19:00:05.621204Z","shell.execute_reply.started":"2024-01-03T18:57:55.031784Z"},"trusted":true},"outputs":[{"data":{"text/plain":["<bertopic._bertopic.BERTopic at 0x7fd30e721330>"]},"execution_count":10,"metadata":{},"output_type":"execute_result"}],"source":["# training bert model\n","# the raw document text is used on purpose: the heavy pre-processing is only meant for BM25, not for the embedding model\n","docs_for_bert = docs[\"text\"]\n","topic_model.fit(docs_for_bert)\n","topic_model_auto.fit(docs_for_bert)"]},{"cell_type":"code","execution_count":11,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T20:37:47.548151Z","iopub.status.busy":"2024-01-03T20:37:47.547570Z","iopub.status.idle":"2024-01-03T20:37:47.601583Z","shell.execute_reply":"2024-01-03T20:37:47.599861Z","shell.execute_reply.started":"2024-01-03T20:37:47.548107Z"},"trusted":true},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>Topic</th>\n"," <th>Count</th>\n"," <th>Name</th>\n"," <th>Representation</th>\n"," <th>Representative_Docs</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>-1</td>\n"," <td>225</td>\n"," <td>-1_retrieval_document_system_project</td>\n"," <td>[retrieval, document, 
system, project, on, com...</td>\n"," <td>[Studies to Compare Retrieval Using Titles wit...</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>0</td>\n"," <td>416</td>\n"," <td>0_scientific_science_journals_scientists</td>\n"," <td>[scientific, science, journals, scientists, so...</td>\n"," <td>[Education and Training for Scientific and Tec...</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>1</td>\n"," <td>258</td>\n"," <td>1_library_libraries_university_academic</td>\n"," <td>[library, libraries, university, academic, pub...</td>\n"," <td>[Undergraduate Library The development of the ...</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>2</td>\n"," <td>96</td>\n"," <td>2_chemical_search_compounds_notation</td>\n"," <td>[chemical, search, compounds, notation, titles...</td>\n"," <td>[Experiences of IIT Research Institute in Oper...</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>3</td>\n"," <td>61</td>\n"," <td>3_automatic_indexing_index_classification</td>\n"," <td>[automatic, indexing, index, classification, d...</td>\n"," <td>[What Makes An Automatic Keyword Classificatio...</td>\n"," </tr>\n"," <tr>\n"," <th>5</th>\n"," <td>4</td>\n"," <td>46</td>\n"," <td>4_relevance_relevant_retrieval_documents</td>\n"," <td>[relevance, relevant, retrieval, documents, me...</td>\n"," <td>[On Relevance, Probabilistic Indexing and Info...</td>\n"," </tr>\n"," <tr>\n"," <th>6</th>\n"," <td>5</td>\n"," <td>42</td>\n"," <td>5_bases_data_bibliographic_line</td>\n"," <td>[bases, data, bibliographic, line, readable, s...</td>\n"," <td>[Survey of Commercially Available Computer-Rea...</td>\n"," </tr>\n"," <tr>\n"," <th>7</th>\n"," <td>6</td>\n"," <td>38</td>\n"," <td>6_catalog_catalogs_card_cataloging</td>\n"," <td>[catalog, catalogs, card, cataloging, cards, d...</td>\n"," <td>[Prejudices and Antipathies: A tract on the LC...</td>\n"," </tr>\n"," <tr>\n"," <th>8</th>\n"," <td>7</td>\n"," <td>28</td>\n"," <td>7_language_linguistics_linguistic_semantic</td>\n"," <td>[language, 
linguistics, linguistic, semantic, ...</td>\n"," <td>[Functional Approach The present book sums up ...</td>\n"," </tr>\n"," <tr>\n"," <th>9</th>\n"," <td>8</td>\n"," <td>25</td>\n"," <td>8_medical_health_hospital_manpower</td>\n"," <td>[medical, health, hospital, manpower, hospital...</td>\n"," <td>[Library Practice in Hospitals According to a ...</td>\n"," </tr>\n"," <tr>\n"," <th>10</th>\n"," <td>9</td>\n"," <td>21</td>\n"," <td>9_classification_dewey_decimal_schemes</td>\n"," <td>[classification, dewey, decimal, schemes, cata...</td>\n"," <td>[Progress in Documentation Thirty years or mor...</td>\n"," </tr>\n"," <tr>\n"," <th>11</th>\n"," <td>10</td>\n"," <td>21</td>\n"," <td>10_automation_processing_library_telefacsimile</td>\n"," <td>[automation, processing, library, telefacsimil...</td>\n"," <td>[Application of Computer Technology to Library...</td>\n"," </tr>\n"," <tr>\n"," <th>12</th>\n"," <td>11</td>\n"," <td>18</td>\n"," <td>11_fuzzy_sets_classification_hedge</td>\n"," <td>[fuzzy, sets, classification, hedge, membershi...</td>\n"," <td>[Prospects for a New General Classification In...</td>\n"," </tr>\n"," <tr>\n"," <th>13</th>\n"," <td>12</td>\n"," <td>18</td>\n"," <td>12_retrieval_user_dialog_interaction</td>\n"," <td>[retrieval, user, dialog, interaction, systems...</td>\n"," <td>[Information Retrieval Systems This book is co...</td>\n"," </tr>\n"," <tr>\n"," <th>14</th>\n"," <td>13</td>\n"," <td>17</td>\n"," <td>13_evaluation_costs_cost_scale</td>\n"," <td>[evaluation, costs, cost, scale, systems, appr...</td>\n"," <td>[Standard Costing for Information Systems: Bac...</td>\n"," </tr>\n"," <tr>\n"," <th>15</th>\n"," <td>14</td>\n"," <td>16</td>\n"," <td>14_thesaurus_thesauri_vocabulary_construction</td>\n"," <td>[thesaurus, thesauri, vocabulary, construction...</td>\n"," <td>[Theoretical Foundations of Thesaurus-Construc...</td>\n"," </tr>\n"," <tr>\n"," <th>16</th>\n"," <td>15</td>\n"," <td>16</td>\n"," <td>15_compression_coding_length_base</td>\n"," 
<td>[compression, coding, length, base, grams, inv...</td>\n"," <td>[An Information-Theoretic Approach to Text Sea...</td>\n"," </tr>\n"," <tr>\n"," <th>17</th>\n"," <td>16</td>\n"," <td>15</td>\n"," <td>16_medlars_medline_twx_medicus</td>\n"," <td>[medlars, medline, twx, medicus, medicine, nlm...</td>\n"," <td>[MEDLARS: A Summary Review and Evaluation of T...</td>\n"," </tr>\n"," <tr>\n"," <th>18</th>\n"," <td>17</td>\n"," <td>15</td>\n"," <td>17_network_networks_cable_television</td>\n"," <td>[network, networks, cable, television, communi...</td>\n"," <td>[The National Biomedical Communications Networ...</td>\n"," </tr>\n"," <tr>\n"," <th>19</th>\n"," <td>18</td>\n"," <td>15</td>\n"," <td>18_centers_telephone_services_information</td>\n"," <td>[centers, telephone, services, information, an...</td>\n"," <td>[The Annual Review of Information Science and ...</td>\n"," </tr>\n"," <tr>\n"," <th>20</th>\n"," <td>19</td>\n"," <td>15</td>\n"," <td>19_microfiche_microforms_microform_microfilm</td>\n"," <td>[microfiche, microforms, microform, microfilm,...</td>\n"," <td>[The Microform Revolution Librarians have trie...</td>\n"," </tr>\n"," <tr>\n"," <th>21</th>\n"," <td>20</td>\n"," <td>14</td>\n"," <td>20_serials_isbd_international_aacr</td>\n"," <td>[serials, isbd, international, aacr, rules, se...</td>\n"," <td>[Serial Cataloging Problems: Rules of Entry an...</td>\n"," </tr>\n"," <tr>\n"," <th>22</th>\n"," <td>21</td>\n"," <td>13</td>\n"," <td>21_marc_records_readable_pilot</td>\n"," <td>[marc, records, readable, pilot, ii, machine, ...</td>\n"," <td>[The Marc II Format: A ...</td>\n"," </tr>\n"," <tr>\n"," <th>23</th>\n"," <td>22</td>\n"," <td>11</td>\n"," <td>22_awareness_sdi_current_notices</td>\n"," <td>[awareness, sdi, current, notices, disseminati...</td>\n"," <td>[The Implementation, Evaluation, and Refinemen...</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" Topic Count Name \\\n","0 -1 225 -1_retrieval_document_system_project \n","1 0 
416 0_scientific_science_journals_scientists \n","2 1 258 1_library_libraries_university_academic \n","3 2 96 2_chemical_search_compounds_notation \n","4 3 61 3_automatic_indexing_index_classification \n","5 4 46 4_relevance_relevant_retrieval_documents \n","6 5 42 5_bases_data_bibliographic_line \n","7 6 38 6_catalog_catalogs_card_cataloging \n","8 7 28 7_language_linguistics_linguistic_semantic \n","9 8 25 8_medical_health_hospital_manpower \n","10 9 21 9_classification_dewey_decimal_schemes \n","11 10 21 10_automation_processing_library_telefacsimile \n","12 11 18 11_fuzzy_sets_classification_hedge \n","13 12 18 12_retrieval_user_dialog_interaction \n","14 13 17 13_evaluation_costs_cost_scale \n","15 14 16 14_thesaurus_thesauri_vocabulary_construction \n","16 15 16 15_compression_coding_length_base \n","17 16 15 16_medlars_medline_twx_medicus \n","18 17 15 17_network_networks_cable_television \n","19 18 15 18_centers_telephone_services_information \n","20 19 15 19_microfiche_microforms_microform_microfilm \n","21 20 14 20_serials_isbd_international_aacr \n","22 21 13 21_marc_records_readable_pilot \n","23 22 11 22_awareness_sdi_current_notices \n","\n"," Representation \\\n","0 [retrieval, document, system, project, on, com... \n","1 [scientific, science, journals, scientists, so... \n","2 [library, libraries, university, academic, pub... \n","3 [chemical, search, compounds, notation, titles... \n","4 [automatic, indexing, index, classification, d... \n","5 [relevance, relevant, retrieval, documents, me... \n","6 [bases, data, bibliographic, line, readable, s... \n","7 [catalog, catalogs, card, cataloging, cards, d... \n","8 [language, linguistics, linguistic, semantic, ... \n","9 [medical, health, hospital, manpower, hospital... \n","10 [classification, dewey, decimal, schemes, cata... \n","11 [automation, processing, library, telefacsimil... \n","12 [fuzzy, sets, classification, hedge, membershi... \n","13 [retrieval, user, dialog, interaction, systems... 
\n","14 [evaluation, costs, cost, scale, systems, appr... \n","15 [thesaurus, thesauri, vocabulary, construction... \n","16 [compression, coding, length, base, grams, inv... \n","17 [medlars, medline, twx, medicus, medicine, nlm... \n","18 [network, networks, cable, television, communi... \n","19 [centers, telephone, services, information, an... \n","20 [microfiche, microforms, microform, microfilm,... \n","21 [serials, isbd, international, aacr, rules, se... \n","22 [marc, records, readable, pilot, ii, machine, ... \n","23 [awareness, sdi, current, notices, disseminati... \n","\n"," Representative_Docs \n","0 [Studies to Compare Retrieval Using Titles wit... \n","1 [Education and Training for Scientific and Tec... \n","2 [Undergraduate Library The development of the ... \n","3 [Experiences of IIT Research Institute in Oper... \n","4 [What Makes An Automatic Keyword Classificatio... \n","5 [On Relevance, Probabilistic Indexing and Info... \n","6 [Survey of Commercially Available Computer-Rea... \n","7 [Prejudices and Antipathies: A tract on the LC... \n","8 [Functional Approach The present book sums up ... \n","9 [Library Practice in Hospitals According to a ... \n","10 [Progress in Documentation Thirty years or mor... \n","11 [Application of Computer Technology to Library... \n","12 [Prospects for a New General Classification In... \n","13 [Information Retrieval Systems This book is co... \n","14 [Standard Costing for Information Systems: Bac... \n","15 [Theoretical Foundations of Thesaurus-Construc... \n","16 [An Information-Theoretic Approach to Text Sea... \n","17 [MEDLARS: A Summary Review and Evaluation of T... \n","18 [The National Biomedical Communications Networ... \n","19 [The Annual Review of Information Science and ... \n","20 [The Microform Revolution Librarians have trie... \n","21 [Serial Cataloging Problems: Rules of Entry an... \n","22 [The Marc II Format: A ... \n","23 [The Implementation, Evaluation, and Refinemen... 
"]},"execution_count":11,"metadata":{},"output_type":"execute_result"}],"source":["topic_model.get_topic_info()"]},{"cell_type":"code","execution_count":12,"metadata":{},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>Topic</th>\n"," <th>Count</th>\n"," <th>Name</th>\n"," <th>Representation</th>\n"," <th>Representative_Docs</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>-1</td>\n"," <td>309</td>\n"," <td>-1_and_to_in_for</td>\n"," <td>[and, to, in, for, of, on, system, are, inform...</td>\n"," <td>[Studies to Compare Retrieval Using Titles wit...</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>0</td>\n"," <td>998</td>\n"," <td>0_in_to_and_of</td>\n"," <td>[in, to, and, of, is, the, for, that, informat...</td>\n"," <td>[Information Needs and Uses in Science and Tec...</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>1</td>\n"," <td>65</td>\n"," <td>1_medical_health_language_medlars</td>\n"," <td>[medical, health, language, medlars, hospital,...</td>\n"," <td>[Library Practice in Hospitals According to a ...</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>2</td>\n"," <td>34</td>\n"," <td>2_chemical_notation_compounds_structures</td>\n"," <td>[chemical, notation, compounds, structures, at...</td>\n"," <td>[The Chemical Abstracts Service Chemical Regis...</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>3</td>\n"," <td>18</td>\n"," <td>3_fuzzy_classification_sets_relations</td>\n"," <td>[fuzzy, classification, sets, relations, membe...</td>\n"," <td>[Prospects for a New General Classification In...</td>\n"," </tr>\n"," <tr>\n"," <th>5</th>\n"," <td>4</td>\n"," <td>15</td>\n"," 
<td>4_microfiche_microforms_microform_microfilm</td>\n"," <td>[microfiche, microforms, microform, microfilm,...</td>\n"," <td>[The Microform Revolution Librarians have trie...</td>\n"," </tr>\n"," <tr>\n"," <th>6</th>\n"," <td>5</td>\n"," <td>11</td>\n"," <td>5_awareness_current_sdi_dissemination</td>\n"," <td>[awareness, current, sdi, dissemination, notic...</td>\n"," <td>[The Implementation, Evaluation, and Refinemen...</td>\n"," </tr>\n"," <tr>\n"," <th>7</th>\n"," <td>6</td>\n"," <td>10</td>\n"," <td>6_chemical_gremas_idc_chemists</td>\n"," <td>[chemical, gremas, idc, chemists, information,...</td>\n"," <td>[French National Policy for Chemical Informati...</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" Topic Count Name \\\n","0 -1 309 -1_and_to_in_for \n","1 0 998 0_in_to_and_of \n","2 1 65 1_medical_health_language_medlars \n","3 2 34 2_chemical_notation_compounds_structures \n","4 3 18 3_fuzzy_classification_sets_relations \n","5 4 15 4_microfiche_microforms_microform_microfilm \n","6 5 11 5_awareness_current_sdi_dissemination \n","7 6 10 6_chemical_gremas_idc_chemists \n","\n"," Representation \\\n","0 [and, to, in, for, of, on, system, are, inform... \n","1 [in, to, and, of, is, the, for, that, informat... \n","2 [medical, health, language, medlars, hospital,... \n","3 [chemical, notation, compounds, structures, at... \n","4 [fuzzy, classification, sets, relations, membe... \n","5 [microfiche, microforms, microform, microfilm,... \n","6 [awareness, current, sdi, dissemination, notic... \n","7 [chemical, gremas, idc, chemists, information,... \n","\n"," Representative_Docs \n","0 [Studies to Compare Retrieval Using Titles wit... \n","1 [Information Needs and Uses in Science and Tec... \n","2 [Library Practice in Hospitals According to a ... \n","3 [The Chemical Abstracts Service Chemical Regis... \n","4 [Prospects for a New General Classification In... \n","5 [The Microform Revolution Librarians have trie... 
\n","6 [The Implementation, Evaluation, and Refinemen... \n","7 [French National Policy for Chemical Informati... "]},"execution_count":12,"metadata":{},"output_type":"execute_result"}],"source":["topic_model_auto.get_topic_info()"]},{"cell_type":"code","execution_count":13,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T19:00:05.785848Z","iopub.status.busy":"2024-01-03T19:00:05.785233Z","iopub.status.idle":"2024-01-03T19:00:05.894032Z","shell.execute_reply":"2024-01-03T19:00:05.892648Z","shell.execute_reply.started":"2024-01-03T19:00:05.785783Z"},"trusted":true},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>Document</th>\n"," <th>Topic</th>\n"," <th>Name</th>\n"," <th>Representation</th>\n"," <th>Representative_Docs</th>\n"," <th>Top_n_words</th>\n"," <th>Probability</th>\n"," <th>Representative_document</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>18 Editions of the Dewey Decimal Classificatio...</td>\n"," <td>9</td>\n"," <td>9_classification_dewey_decimal_schemes</td>\n"," <td>[classification, dewey, decimal, schemes, cata...</td>\n"," <td>[Progress in Documentation Thirty years or mor...</td>\n"," <td>classification - dewey - decimal - schemes - c...</td>\n"," <td>0.641904</td>\n"," <td>False</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>Use Made of Technical Libraries This report is...</td>\n"," <td>1</td>\n"," <td>1_library_libraries_university_academic</td>\n"," <td>[library, libraries, university, academic, pub...</td>\n"," <td>[Undergraduate Library The development of the ...</td>\n"," <td>library - libraries - university - academic - ...</td>\n"," 
<td>0.121461</td>\n"," <td>False</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>Two Kinds of Power An Essay on Bibliographic C...</td>\n"," <td>-1</td>\n"," <td>-1_retrieval_document_system_project</td>\n"," <td>[retrieval, document, system, project, on, com...</td>\n"," <td>[Studies to Compare Retrieval Using Titles wit...</td>\n"," <td>retrieval - document - system - project - on -...</td>\n"," <td>0.314410</td>\n"," <td>False</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>Systems Analysis of a University Library; fina...</td>\n"," <td>1</td>\n"," <td>1_library_libraries_university_academic</td>\n"," <td>[library, libraries, university, academic, pub...</td>\n"," <td>[Undergraduate Library The development of the ...</td>\n"," <td>library - libraries - university - academic - ...</td>\n"," <td>0.171636</td>\n"," <td>False</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>A Library Management Game: a report on a resea...</td>\n"," <td>-1</td>\n"," <td>-1_retrieval_document_system_project</td>\n"," <td>[retrieval, document, system, project, on, com...</td>\n"," <td>[Studies to Compare Retrieval Using Titles wit...</td>\n"," <td>retrieval - document - system - project - on -...</td>\n"," <td>0.543388</td>\n"," <td>False</td>\n"," </tr>\n"," <tr>\n"," <th>...</th>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," </tr>\n"," <tr>\n"," <th>1455</th>\n"," <td>World Dynamics Over the last several decades i...</td>\n"," <td>0</td>\n"," <td>0_scientific_science_journals_scientists</td>\n"," <td>[scientific, science, journals, scientists, so...</td>\n"," <td>[Education and Training for Scientific and Tec...</td>\n"," <td>scientific - science - journals - scientists -...</td>\n"," <td>1.000000</td>\n"," <td>False</td>\n"," </tr>\n"," <tr>\n"," <th>1456</th>\n"," <td>World Trends in Library Education One of the m...</td>\n"," <td>1</td>\n"," 
<td>1_library_libraries_university_academic</td>\n"," <td>[library, libraries, university, academic, pub...</td>\n"," <td>[Undergraduate Library The development of the ...</td>\n"," <td>library - libraries - university - academic - ...</td>\n"," <td>1.000000</td>\n"," <td>False</td>\n"," </tr>\n"," <tr>\n"," <th>1457</th>\n"," <td>Legal Restrictions on Exploitation of the Pate...</td>\n"," <td>0</td>\n"," <td>0_scientific_science_journals_scientists</td>\n"," <td>[scientific, science, journals, scientists, so...</td>\n"," <td>[Education and Training for Scientific and Tec...</td>\n"," <td>scientific - science - journals - scientists -...</td>\n"," <td>0.148646</td>\n"," <td>False</td>\n"," </tr>\n"," <tr>\n"," <th>1458</th>\n"," <td>Language and Thought This book considers the b...</td>\n"," <td>7</td>\n"," <td>7_language_linguistics_linguistic_semantic</td>\n"," <td>[language, linguistics, linguistic, semantic, ...</td>\n"," <td>[Functional Approach The present book sums up ...</td>\n"," <td>language - linguistics - linguistic - semantic...</td>\n"," <td>1.000000</td>\n"," <td>False</td>\n"," </tr>\n"," <tr>\n"," <th>1459</th>\n"," <td>Modern Integral Information Systems for Chemis...</td>\n"," <td>2</td>\n"," <td>2_chemical_search_compounds_notation</td>\n"," <td>[chemical, search, compounds, notation, titles...</td>\n"," <td>[Experiences of IIT Research Institute in Oper...</td>\n"," <td>chemical - search - compounds - notation - tit...</td>\n"," <td>0.024206</td>\n"," <td>False</td>\n"," </tr>\n"," </tbody>\n","</table>\n","<p>1460 rows × 8 columns</p>\n","</div>"],"text/plain":[" Document Topic \\\n","0 18 Editions of the Dewey Decimal Classificatio... 9 \n","1 Use Made of Technical Libraries This report is... 1 \n","2 Two Kinds of Power An Essay on Bibliographic C... -1 \n","3 Systems Analysis of a University Library; fina... 1 \n","4 A Library Management Game: a report on a resea... -1 \n","... ... ... 
\n","1455 World Dynamics Over the last several decades i... 0 \n","1456 World Trends in Library Education One of the m... 1 \n","1457 Legal Restrictions on Exploitation of the Pate... 0 \n","1458 Language and Thought This book considers the b... 7 \n","1459 Modern Integral Information Systems for Chemis... 2 \n","\n"," Name \\\n","0 9_classification_dewey_decimal_schemes \n","1 1_library_libraries_university_academic \n","2 -1_retrieval_document_system_project \n","3 1_library_libraries_university_academic \n","4 -1_retrieval_document_system_project \n","... ... \n","1455 0_scientific_science_journals_scientists \n","1456 1_library_libraries_university_academic \n","1457 0_scientific_science_journals_scientists \n","1458 7_language_linguistics_linguistic_semantic \n","1459 2_chemical_search_compounds_notation \n","\n"," Representation \\\n","0 [classification, dewey, decimal, schemes, cata... \n","1 [library, libraries, university, academic, pub... \n","2 [retrieval, document, system, project, on, com... \n","3 [library, libraries, university, academic, pub... \n","4 [retrieval, document, system, project, on, com... \n","... ... \n","1455 [scientific, science, journals, scientists, so... \n","1456 [library, libraries, university, academic, pub... \n","1457 [scientific, science, journals, scientists, so... \n","1458 [language, linguistics, linguistic, semantic, ... \n","1459 [chemical, search, compounds, notation, titles... \n","\n"," Representative_Docs \\\n","0 [Progress in Documentation Thirty years or mor... \n","1 [Undergraduate Library The development of the ... \n","2 [Studies to Compare Retrieval Using Titles wit... \n","3 [Undergraduate Library The development of the ... \n","4 [Studies to Compare Retrieval Using Titles wit... \n","... ... \n","1455 [Education and Training for Scientific and Tec... \n","1456 [Undergraduate Library The development of the ... \n","1457 [Education and Training for Scientific and Tec... 
\n","1458 [Functional Approach The present book sums up ... \n","1459 [Experiences of IIT Research Institute in Oper... \n","\n"," Top_n_words Probability \\\n","0 classification - dewey - decimal - schemes - c... 0.641904 \n","1 library - libraries - university - academic - ... 0.121461 \n","2 retrieval - document - system - project - on -... 0.314410 \n","3 library - libraries - university - academic - ... 0.171636 \n","4 retrieval - document - system - project - on -... 0.543388 \n","... ... ... \n","1455 scientific - science - journals - scientists -... 1.000000 \n","1456 library - libraries - university - academic - ... 1.000000 \n","1457 scientific - science - journals - scientists -... 0.148646 \n","1458 language - linguistics - linguistic - semantic... 1.000000 \n","1459 chemical - search - compounds - notation - tit... 0.024206 \n","\n"," Representative_document \n","0 False \n","1 False \n","2 False \n","3 False \n","4 False \n","... ... \n","1455 False \n","1456 False \n","1457 False \n","1458 False \n","1459 False \n","\n","[1460 rows x 8 columns]"]},"execution_count":13,"metadata":{},"output_type":"execute_result"}],"source":["doc_info = topic_model.get_document_info(docs[\"text\"])\n","doc_info"]},
{"cell_type":"markdown","metadata":{},"source":["## Transforming queries to topics"]},
{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":[
"all_query_topics = []\n",
"for index, row in queries_cleaned.iterrows():\n",
"    all_query_topics.append(topic_model.find_topics(row[\"text\"], top_n=5))\n",
"all_query_topics"]},
{"cell_type":"code","execution_count":18,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T20:38:18.212878Z","iopub.status.busy":"2024-01-03T20:38:18.212296Z","iopub.status.idle":"2024-01-03T20:42:37.435531Z","shell.execute_reply":"2024-01-03T20:42:37.433913Z","shell.execute_reply.started":"2024-01-03T20:38:18.212829Z"},"trusted":true},"outputs":[],"source":[
"# transform queries to topics: one topic id per query, for each of the two topic models\n",
"query_topics = {}\n",
"query_topics_auto = {}\n",
"\n",
"for _, row in queries_cleaned.iterrows():\n",
"    # positional access to the first column; plain row[0] is deprecated on label-indexed rows\n",
"    query_id = row.iloc[0]\n",
"\n",
"    # find_topics is asked for 5 candidates; only the first returned topic is kept\n",
"    topics, _ = topic_model.find_topics(row[\"text\"], top_n=5)\n",
"    query_topics[query_id] = topics[0]\n",
"\n",
"    topics, _ = topic_model_auto.find_topics(row[\"text\"], top_n=5)\n",
"    query_topics_auto[query_id] = topics[0]"]},
{"cell_type":"code","execution_count":21,"metadata":{},"outputs":[],"source":[
"with open('./results/query_topics.pkl', 'wb') as f:\n",
"    pickle.dump(query_topics, f)\n",
"with open('./results/query_topics_auto.pkl', 'wb') as f:\n",
"    pickle.dump(query_topics_auto, f)"]},
{"cell_type":"code","execution_count":28,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T20:42:37.438390Z","iopub.status.busy":"2024-01-03T20:42:37.437898Z","iopub.status.idle":"2024-01-03T20:42:37.454106Z","shell.execute_reply":"2024-01-03T20:42:37.452690Z","shell.execute_reply.started":"2024-01-03T20:42:37.438329Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["Query Topics: {1: -1, 2: 5, 3: 18, 4: 15, 5: 12, 6: 7, 7: -1, 8: 3, 9: 3, 10: 4, 11: 4, 12: 19, 13: 4, 14: 16, 15: 5, 16: 12, 17: 12, 18: 2, 19: 15, 20: 13, 21: 18, 22: 16, 23: 10, 24: 18, 25: 18, 26: 13, 27: 12, 28: 2, 29: 3, 30: 0, 31: 18, 32: 3, 33: 12, 34: 15, 35: 18, 37: 3, 39: 12, 41: -1, 42: 12, 43: 12, 44: 0, 45: 10, 46: 10, 49: 12, 50: 9, 52: 16, 54: 10, 55: 16, 56: 6, 57: 6, 58: 17, 61: 4, 62: 4, 65: 3, 66: 17, 67: 3, 69: 14, 71: 3, 76: 17, 79: 15, 81: 14, 82: 12, 84: 4, 90: 0, 92: 5, 95: 4, 96: 4, 97: 4, 98: 12, 99: 12, 100: 12, 101: 12, 102: 3, 104: 4, 109: 0, 111: 3}\n","-1 3\n","Query Topics: {1: -1, 2: -1, 3: 0, 4: 4, 5: -1, 6: -1, 7: -1, 8: -1, 9: -1, 10: 3, 11: 0, 12: 4, 13: -1, 14: 1, 15: 0, 16: -1, 17: -1, 18: 2, 19: 2, 20: -1, 21: 0, 22: 1, 23: 0, 24: -1, 25: 5, 26: -1, 27: -1, 28: 6, 29: -1, 30: 0, 31: 6, 32: -1, 33: -1, 34: 2, 35: 5, 37: -1, 39: -1, 41: -1, 42: -1, 43: -1, 44: 0, 45: 0, 46: 
-1, 49: -1, 50: -1, 52: 1, 54: 0, 55: 1, 56: -1, 57: -1, 58: 0, 61: -1, 62: 3, 65: -1, 66: 0, 67: -1, 69: -1, 71: -1, 76: 0, 79: -1, 81: -1, 82: -1, 84: -1, 90: -1, 92: 0, 95: -1, 96: -1, 97: -1, 98: -1, 99: -1, 100: -1, 101: -1, 102: -1, 104: -1, 109: 0, 111: -1}\n","-1 47\n"]}],"source":["print(\"Query Topics:\", query_topics)\n","print(\"-1\", list(query_topics.values()).count(-1))\n","print(\"Query Topics:\", query_topics_auto)\n","print(\"-1\", list(query_topics_auto.values()).count(-1))"]},
{"cell_type":"markdown","metadata":{},"source":["<h2> Re-ranking with BERTopic</h2>"]},
{"cell_type":"code","execution_count":29,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T20:12:01.716284Z","iopub.status.busy":"2024-01-03T20:12:01.715582Z","iopub.status.idle":"2024-01-03T20:12:01.726826Z","shell.execute_reply":"2024-01-03T20:12:01.725962Z","shell.execute_reply.started":"2024-01-03T20:12:01.716244Z"},"trusted":true},"outputs":[],"source":[
"def bertopic_reranker(q_topics, initial_retrieval, query_id, k, lam=0.2):\n",
"    \"\"\"Re-rank one query's initial retrieval by BERTopic topic agreement.\n",
"\n",
"    The first-stage score of every document assigned to the same topic as the\n",
"    query is multiplied by lam; all other scores are kept. A query whose topic\n",
"    is -1 leaves every score unchanged.\n",
"\n",
"    Args:\n",
"        q_topics: dict mapping query id -> topic id of that query.\n",
"        initial_retrieval: dict mapping document id -> sequence whose first\n",
"            element is the first-stage score.\n",
"        query_id: key into q_topics.\n",
"        k: maximum number of documents to return (non-negative).\n",
"        lam: multiplier for topic-matching documents. Values above 1 promote\n",
"            them, which is what the sweep in this notebook uses; the default\n",
"            of 0.2 demotes them when scores are positive.\n",
"\n",
"    Returns:\n",
"        dict mapping document id -> [score], ordered by descending score and\n",
"        truncated to the first k entries.\n",
"\n",
"    Reads the notebook-level doc_info frame; document id d is looked up at\n",
"    position d - 1 (assumes ids are 1-based and follow the row order of\n",
"    doc_info -- TODO confirm).\n",
"    \"\"\"\n",
"    query_topic = q_topics[query_id]\n",
"    # Read the topic column once instead of building a row Series per document.\n",
"    doc_topics = doc_info[\"Topic\"].to_numpy()\n",
"\n",
"    rescored = {}\n",
"    for doc_id, score in initial_retrieval.items():\n",
"        same_topic = query_topic != -1 and doc_topics[doc_id - 1] == query_topic\n",
"        rescored[doc_id] = [score[0] * lam if same_topic else score[0]]\n",
"\n",
"    # sorted() is stable, so documents with equal scores keep their first-stage order.\n",
"    ranked = sorted(rescored.items(), key=lambda item: item[1][0], reverse=True)\n",
"    return dict(ranked[:k])"]},
{"cell_type":"code","execution_count":30,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T20:52:22.053797Z","iopub.status.busy":"2024-01-03T20:52:22.053248Z","iopub.status.idle":"2024-01-03T20:52:22.084399Z","shell.execute_reply":"2024-01-03T20:52:22.082985Z","shell.execute_reply.started":"2024-01-03T20:52:22.053751Z"},"trusted":true},"outputs":[{"data":{"text/plain":["{145: [15.270515725754914],\n"," 1399: [11.776525442346998],\n"," 597: [11.654505315568294],\n"," 166: [10.781292227767148],\n"," 1071: [10.763945625290491],\n"," 546: [10.405693755713079],\n"," 626: [9.676309340155163],\n"," 1096: [9.519484116329025],\n"," 728: [8.859895015786782],\n"," 1197: [8.78125771012449]}"]},"execution_count":30,"metadata":{},"output_type":"execute_result"}],"source":[
"# test for one query (id 2)\n",
"bertopic_reranker(query_topics, initial_retrieval_with_bm25_scores[2], 2, 10, lam=2.0)"]},
{"cell_type":"code","execution_count":31,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T21:23:21.816142Z","iopub.status.busy":"2024-01-03T21:23:21.815690Z","iopub.status.idle":"2024-01-03T21:23:29.221491Z","shell.execute_reply":"2024-01-03T21:23:29.220201Z","shell.execute_reply.started":"2024-01-03T21:23:21.816101Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["Currently retrieving for lam: 1.2\n","Currently retrieving for lam: 1.5\n","Currently retrieving for lam: 2\n","Currently retrieving for lam: 2.5\n","Currently retrieving for lam: 3\n","Currently retrieving for lam: 10\n"]}],"source":[
"# re-rank documents for all queries\n",
"lam_values = [1.2, 1.5, 2, 2.5, 3, 10]\n",
"\n",
"\n",
"def rerank_all_queries(q_topics, output_path, lams, k=50):\n",
"    \"\"\"Re-rank every query once per value in lams and pickle the result.\n",
"\n",
"    Args:\n",
"        q_topics: dict mapping query id -> topic id, passed to bertopic_reranker.\n",
"        output_path: file the result dict is pickled to.\n",
"        lams: iterable of lam multipliers to try.\n",
"        k: number of documents kept per query.\n",
"\n",
"    Returns:\n",
"        dict lam -> {query id -> list of document ids, best first}.\n",
"    \"\"\"\n",
"    results = {}\n",
"    for lam_value in lams:\n",
"        print(f'Currently retrieving for lam: {lam_value}')\n",
"        reranked_by_query = {}\n",
"        for _, row in queries_cleaned.iterrows():\n",
"            # the first column is used as the query id (positional access)\n",
"            query_id = row.iloc[0]\n",
"            reranked = bertopic_reranker(\n",
"                q_topics, initial_retrieval_with_bm25_scores[query_id], query_id, k, lam=lam_value\n",
"            )\n",
"            reranked_by_query[query_id] = list(reranked)\n",
"        results[lam_value] = reranked_by_query\n",
"\n",
"    with open(output_path, 'wb') as f:\n",
"        pickle.dump(results, f)\n",
"    return results\n",
"\n",
"\n",
"results_for_different_lams = rerank_all_queries(\n",
"    query_topics, './results/reranker_bertopic_results_topic_model.pkl', lam_values\n",
")"]},
{"cell_type":"code","execution_count":32,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Currently retrieving for lam: 1.2\n","Currently retrieving for lam: 1.5\n","Currently retrieving for lam: 2\n","Currently retrieving for lam: 2.5\n","Currently retrieving for lam: 3\n","Currently retrieving for lam: 10\n"]}],"source":[
"# same sweep with the query topics from topic_model_auto\n",
"results_for_different_lams = rerank_all_queries(\n",
"    query_topics_auto, './results/reranker_bertopic_results_topic_model_auto.pkl', lam_values\n",
")"]}],"metadata":{"kaggle":{"accelerator":"none","dataSources":[{"datasetId":6763,"sourceId":9801,"sourceType":"datasetVersion"},{"datasetId":576263,"sourceId":1043323,"sourceType":"datasetVersion"},{"datasetId":4135603,"sourceId":7160356,"sourceType":"datasetVersion"},{"datasetId":4137237,"sourceId":7162602,"sourceType":"datasetVersion"}],"dockerImageVersionId":30120,"isGpuEnabled":false,"isInternetEnabled":true,"language":"python","sourceType":"notebook"},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.12"}},"nbformat":4,"nbformat_minor":4}