Skip to content
Snippets Groups Projects
reranker-bertopic.ipynb 52.5 KiB
Newer Older
{"cells":[{"cell_type":"markdown","metadata":{},"source":["<h2> Imports </h2>"]},{"cell_type":"code","execution_count":1,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T18:52:18.410005Z","iopub.status.busy":"2024-01-03T18:52:18.409290Z","iopub.status.idle":"2024-01-03T18:56:01.555478Z","shell.execute_reply":"2024-01-03T18:56:01.553732Z","shell.execute_reply.started":"2024-01-03T18:52:18.409894Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["Defaulting to user installation because normal site-packages is not writeable\n","\u001b[31mERROR: Could not find a version that satisfies the requirement tensorflow==2.5 (from versions: 2.8.0rc0, 2.8.0rc1, 2.8.0, 2.8.1, 2.8.2, 2.8.3, 2.8.4, 2.9.0rc0, 2.9.0rc1, 2.9.0rc2, 2.9.0, 2.9.1, 2.9.2, 2.9.3, 2.10.0rc0, 2.10.0rc1, 2.10.0rc2, 2.10.0rc3, 2.10.0, 2.10.1, 2.11.0rc0, 2.11.0rc1, 2.11.0rc2, 2.11.0, 2.11.1, 2.12.0rc0, 2.12.0rc1, 2.12.0, 2.12.1, 2.13.0rc0, 2.13.0rc1, 2.13.0rc2, 2.13.0, 2.13.1, 2.14.0rc0, 2.14.0rc1, 2.14.0, 2.14.1, 2.15.0rc0, 2.15.0rc1, 2.15.0, 2.15.0.post1)\u001b[0m\u001b[31m\n","\u001b[0m\u001b[31mERROR: No matching distribution found for tensorflow==2.5\u001b[0m\u001b[31m\n","\u001b[0m\n","\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.1.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.3.2\u001b[0m\n","\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n","Note: you may need to restart the kernel to use updated packages.\n","\n","\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.1.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.3.2\u001b[0m\n","\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: 
\u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n","Note: you may need to restart the kernel to use updated packages.\n"]}],"source":["# NOTE(review): the recorded output of this cell shows 'No matching distribution found for tensorflow==2.5';\n","# confirm whether TensorFlow is needed at all and, if so, pin an available version.\n","%pip install tensorflow==2.5\n","# The version specifier is quoted because %pip runs through the shell, where an unquoted '>' redirects output to a file.\n","%pip install -Uq sentence-transformers faiss-cpu accelerate hdbscan bertopic evaluate kaleido \"datasets>=2.11\""]},{"cell_type":"code","execution_count":2,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T18:56:01.558915Z","iopub.status.busy":"2024-01-03T18:56:01.558368Z","iopub.status.idle":"2024-01-03T18:56:32.924438Z","shell.execute_reply":"2024-01-03T18:56:32.919977Z","shell.execute_reply.started":"2024-01-03T18:56:01.558854Z"},"trusted":true},"outputs":[],"source":["import pickle\n","import re\n","\n","import nltk\n","import pandas as pd\n","from bertopic import BERTopic\n","from bertopic.vectorizers import ClassTfidfTransformer\n","from tqdm import tqdm\n","\n","# Register tqdm's progress_apply on pandas objects.\n","tqdm.pandas()\n","# English stop-word list used by data_clean (needs the NLTK 'stopwords' corpus to be available).\n","stopword_list = nltk.corpus.stopwords.words('english')"]},{"cell_type":"markdown","metadata":{},"source":["<h2> Load datasets and model</h2>"]},{"cell_type":"code","execution_count":3,"metadata":{},"outputs":[],"source":["# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.\n","with open('./results/initial_retrieval_with_bm25_scores.pkl', 'rb') as f:\n","    initial_retrieval_with_bm25_scores = pickle.load(f)"]},{"cell_type":"code","execution_count":4,"metadata":{"_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","execution":{"iopub.execute_input":"2024-01-03T18:56:32.931742Z","iopub.status.busy":"2024-01-03T18:56:32.929029Z","iopub.status.idle":"2024-01-03T18:56:33.045085Z","shell.execute_reply":"2024-01-03T18:56:33.043862Z","shell.execute_reply.started":"2024-01-03T18:56:32.931643Z"},"trusted":true},"outputs":[],"source":["queries = pd.read_csv(\"./data/cisi-csv/queries.csv\")\n","docs = pd.read_csv(\"./data/cisi-csv/docs.csv\")\n","rels = pd.read_csv(\"./data/cisi-csv/rels.csv\")\n","\n","full_doc = docs['text'].to_list()\n","full_query = 
queries['text'].to_list()"]},{"cell_type":"code","execution_count":5,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T18:57:01.149061Z","iopub.status.busy":"2024-01-03T18:57:01.148640Z","iopub.status.idle":"2024-01-03T18:57:01.166124Z","shell.execute_reply":"2024-01-03T18:57:01.164536Z","shell.execute_reply.started":"2024-01-03T18:57:01.149026Z"},"trusted":true},"outputs":[],"source":["def data_clean(text):\n","    \"\"\"Join the items of `text`, drop non-alphanumeric characters and remove English stop words.\n","\n","    NOTE(review): `text` is passed to ' '.join(text), so it is presumably a list of strings;\n","    a plain str would be split into single characters - confirm against callers.\n","    \"\"\"\n","    pattern = r'[^a-zA-Z0-9\\s]'\n","    text = re.sub(pattern, '', ' '.join(text))\n","    # str.split() already discards surrounding whitespace, so no per-token strip is needed.\n","    filtered = [token for token in text.split() if token.lower() not in stopword_list]\n","    return ' '.join(filtered)\n","\n","\n","def data_clean_df(text):\n","    \"\"\"Drop non-alphanumeric characters from a single string and collapse whitespace.\n","\n","    Unlike data_clean, this takes one string (not an iterable) and keeps stop words.\n","    Used to clean the DataFrame texts for BM25.\n","    \"\"\"\n","    # Regex pattern to keep only alphanumeric characters and whitespace\n","    pattern = r'[^a-zA-Z0-9\\s]'\n","    text = re.sub(pattern, '', text)\n","    return ' '.join(text.split())\n","\n","\n","def clean_query(text):\n","    \"\"\"Remove the '.T' marker that some queries have at the beginning and collapse whitespace.\"\"\"\n","    pattern = r'^\\.T\\s'\n","    text = re.sub(pattern, '', text)\n","    return ' '.join(text.split())"]},{"cell_type":"code","execution_count":6,"metadata":{},"outputs":[],"source":["queries_cleaned = queries.copy()\n","# Strip the leading '.T' marker first: data_clean_df removes the '.', after which\n","# clean_query's r'^\\.T\\s' pattern can no longer match and a stray 'T' token is left behind.\n","queries_cleaned['text'] = queries_cleaned['text'].apply(clean_query).apply(data_clean_df)"]},{"cell_type":"markdown","metadata":{},"source":["## Initializing and fitting BERTopic model"]},{"cell_type":"code","execution_count":7,"metadata":{},"outputs":[],"source":["docs_for_bert = docs[\"text\"]\n","# As removing stop words as a preprocessing step is not advised, we use unpreprocessed documents to fit BERTopic. 
\n","# Frequent words (which include most stop words) are down-weighted by the ClassTfidfTransformer using \"reduce_frequent_words=True\".\n","# https://maartengr.github.io/BERTopic/getting_started/tips_and_tricks/tips_and_tricks.html#removing-stop-words"]},{"cell_type":"code","execution_count":8,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T18:57:52.932829Z","iopub.status.busy":"2024-01-03T18:57:52.932037Z","iopub.status.idle":"2024-01-03T18:57:52.940868Z","shell.execute_reply":"2024-01-03T18:57:52.939770Z","shell.execute_reply.started":"2024-01-03T18:57:52.932765Z"},"trusted":true},"outputs":[],"source":["from umap import UMAP\n","\n","model_name = 'sentence-transformers/all-MiniLM-L6-v2'\n","# Using this model truncates all input at 256 word pieces/tokens.\n","# The CISI dataset that we're using contains only very few documents and queries exceeding that limit, so\n","# we capture most of their information. When using a collection with longer documents or queries, the model\n","# cannot capture all the information, so the results might not be as good as expected.\n","# https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2\n","\n","# UMAP is stochastic, so to reproduce results you need to set the random_state for UMAP and pass this UMAP model to BERTopic:\n","# HOW TO SET UMAP MODEL: topic_model = BERTopic(umap_model=umap_model)\n","umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=4224)\n","\n","# You can also use this representation model to reduce/remove stop words.\n","# from bertopic.representation import KeyBERTInspired\n","# representation_model = KeyBERTInspired()\n","\n","# Build the BERTopic models on the default pipeline (the representation model above is not used):\n","# one with the default number of topics and one with automatic topic reduction (nr_topics=\"auto\").\n","topic_model = BERTopic(umap_model=umap_model, embedding_model=model_name, ctfidf_model=ClassTfidfTransformer(reduce_frequent_words=True), calculate_probabilities=True)\n","topic_model_auto = BERTopic(umap_model=umap_model, embedding_model=model_name, 
ctfidf_model=ClassTfidfTransformer(reduce_frequent_words=True), nr_topics=\"auto\", calculate_probabilities=True)"]},{"cell_type":"code","execution_count":9,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T18:57:55.031840Z","iopub.status.busy":"2024-01-03T18:57:55.031137Z","iopub.status.idle":"2024-01-03T19:00:05.623076Z","shell.execute_reply":"2024-01-03T19:00:05.621204Z","shell.execute_reply.started":"2024-01-03T18:57:55.031784Z"},"trusted":true},"outputs":[{"data":{"text/plain":["<bertopic._bertopic.BERTopic at 0x7f0ab6013f10>"]},"execution_count":9,"metadata":{},"output_type":"execute_result"}],"source":["# Fitting BERTopic\n","topic_model.fit(docs_for_bert)\n","topic_model_auto.fit(docs_for_bert)"]},{"cell_type":"code","execution_count":10,"metadata":{"execution":{"iopub.execute_input":"2024-01-03T20:37:47.548151Z","iopub.status.busy":"2024-01-03T20:37:47.547570Z","iopub.status.idle":"2024-01-03T20:37:47.601583Z","shell.execute_reply":"2024-01-03T20:37:47.599861Z","shell.execute_reply.started":"2024-01-03T20:37:47.548107Z"},"trusted":true},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>Topic</th>\n","      <th>Count</th>\n","      <th>Name</th>\n","      <th>Representation</th>\n","      <th>Representative_Docs</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>-1</td>\n","      <td>415</td>\n","      <td>-1_literature_titles_search_abstracts</td>\n","      <td>[literature, titles, search, abstracts, data, ...</td>\n","      <td>[Future Developments in Telecommunications Com...</td>\n","    </tr>\n","    
<tr>\n","      <th>1</th>\n","      <td>0</td>\n","      <td>256</td>\n","      <td>0_libraries_library_university_academic</td>\n","      <td>[libraries, library, university, academic, pub...</td>\n","      <td>[Measuring Readers' Failure at the Shelf in Th...</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>1</td>\n","      <td>98</td>\n","      <td>1_social_communication_scientists_scientific</td>\n","      <td>[social, communication, scientists, scientific...</td>\n","      <td>[The Coming Crisis of Western Sociology The cr...</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>2</td>\n","      <td>85</td>\n","      <td>2_information_science_needs_theory</td>\n","      <td>[information, science, needs, theory, flow, sc...</td>\n","      <td>[Medical Libraries and the Assessment of User ...</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>3</td>\n","      <td>66</td>\n","      <td>3_indexing_automatic_index_classification</td>\n","      <td>[indexing, automatic, index, classification, d...</td>\n","      <td>[What Makes An Automatic Keyword Classificatio...</td>\n","    </tr>\n","    <tr>\n","      <th>5</th>\n","      <td>4</td>\n","      <td>42</td>\n","      <td>4_relevance_relevant_measure_retrieval</td>\n","      <td>[relevance, relevant, measure, retrieval, docu...</td>\n","      <td>[On Relevance, Probabilistic Indexing and Info...</td>\n","    </tr>\n","    <tr>\n","      <th>6</th>\n","      <td>5</td>\n","      <td>38</td>\n","      <td>5_bases_data_line_bibliographic</td>\n","      <td>[bases, data, line, bibliographic, services, r...</td>\n","      <td>[User Assessment of Computer-Based Bibliograph...</td>\n","    </tr>\n","    <tr>\n","      <th>7</th>\n","      <td>6</td>\n","      <td>35</td>\n","      <td>6_citation_citations_coupling_citing</td>\n","      <td>[citation, citations, coupling, citing, cross,...</td>\n","      <td>[Citation Indexes for Science In this paper I ...</td>\n","    
</tr>\n","    <tr>\n","      <th>8</th>\n","      <td>7</td>\n","      <td>35</td>\n","      <td>7_notation_chemical_compounds_atom</td>\n","      <td>[notation, chemical, compounds, atom, structur...</td>\n","      <td>[The Chemical Abstracts Service Chemical Regis...</td>\n","    </tr>\n","    <tr>\n","      <th>9</th>\n","      <td>8</td>\n","      <td>30</td>\n","      <td>8_catalog_catalogs_card_divided</td>\n","      <td>[catalog, catalogs, card, divided, cataloging,...</td>\n","      <td>[The Potential Usefulness of Catalog Access Po...</td>\n","    </tr>\n","    <tr>\n","      <th>10</th>\n","      <td>9</td>\n","      <td>26</td>\n","      <td>9_compression_inverted_length_coding</td>\n","      <td>[compression, inverted, length, coding, keys, ...</td>\n","      <td>[An Information-Theoretic Approach to Text Sea...</td>\n","    </tr>\n","    <tr>\n","      <th>11</th>\n","      <td>10</td>\n","      <td>25</td>\n","      <td>10_medical_health_hospital_manpower</td>\n","      <td>[medical, health, hospital, manpower, hospital...</td>\n","      <td>[Library Practice in Hospitals According to a ...</td>\n","    </tr>\n","    <tr>\n","      <th>12</th>\n","      <td>11</td>\n","      <td>24</td>\n","      <td>11_journals_coverage_periodical_articles</td>\n","      <td>[journals, coverage, periodical, articles, mon...</td>\n","      <td>[Citation Patterns fo the Cardiovascular Seria...</td>\n","    </tr>\n","    <tr>\n","      <th>13</th>\n","      <td>12</td>\n","      <td>21</td>\n","      <td>12_classification_dewey_decimal_schemes</td>\n","      <td>[classification, dewey, decimal, schemes, cata...</td>\n","      <td>[Progress in Documentation Thirty years or mor...</td>\n","    </tr>\n","    <tr>\n","      <th>14</th>\n","      <td>13</td>\n","      <td>20</td>\n","      <td>13_automation_telefacsimile_automated_processing</td>\n","      <td>[automation, telefacsimile, automated, process...</td>\n","      <td>[HDB of Data Processing for Libraries The 
four...</td>\n","    </tr>\n","    <tr>\n","      <th>15</th>\n","      <td>14</td>\n","      <td>20</td>\n","      <td>14_retrieval_user_systems_isrs</td>\n","      <td>[retrieval, user, systems, isrs, interaction, ...</td>\n","      <td>[Human Factors in the Design of an Interactive...</td>\n","    </tr>\n","    <tr>\n","      <th>16</th>\n","      <td>15</td>\n","      <td>18</td>\n","      <td>15_fuzzy_sets_classification_hedge</td>\n","      <td>[fuzzy, sets, classification, hedge, membershi...</td>\n","      <td>[Prospects for a New General Classification In...</td>\n","    </tr>\n","    <tr>\n","      <th>17</th>\n","      <td>16</td>\n","      <td>17</td>\n","      <td>16_evaluation_costs_cost_scale</td>\n","      <td>[evaluation, costs, cost, scale, appraisal, sy...</td>\n","      <td>[Standard Costing for Information Systems: Bac...</td>\n","    </tr>\n","    <tr>\n","      <th>18</th>\n","      <td>17</td>\n","      <td>16</td>\n","      <td>17_serials_isbd_rules_international</td>\n","      <td>[serials, isbd, rules, international, entry, a...</td>\n","      <td>[No Special Rules for Entry of Serials One of ...</td>\n","    </tr>\n","    <tr>\n","      <th>19</th>\n","      <td>18</td>\n","      <td>16</td>\n","      <td>18_centers_services_systems_annual</td>\n","      <td>[centers, services, systems, annual, chapters,...</td>\n","      <td>[Annual Review of Information Science and Tech...</td>\n","    </tr>\n","    <tr>\n","      <th>20</th>\n","      <td>19</td>\n","      <td>16</td>\n","      <td>19_microfiche_microforms_microform_microfilm</td>\n","      <td>[microfiche, microforms, microform, microfilm,...</td>\n","      <td>[The Microform Revolution Librarians have trie...</td>\n","    </tr>\n","    <tr>\n","      <th>21</th>\n","      <td>20</td>\n","      <td>15</td>\n","      <td>20_network_networks_cable_television</td>\n","      <td>[network, networks, cable, television, communi...</td>\n","      <td>[The National Biomedical Communications 
Networ...</td>\n","    </tr>\n","    <tr>\n","      <th>22</th>\n","      <td>21</td>\n","      <td>15</td>\n","      <td>21_bradford_law_zipf_distribution</td>\n","      <td>[bradford, law, zipf, distribution, straight, ...</td>\n","      <td>[A New Look at Reference Scattering It was fir...</td>\n","    </tr>\n","    <tr>\n","      <th>23</th>\n","      <td>22</td>\n","      <td>15</td>\n","      <td>22_marc_records_faculty_readable</td>\n","      <td>[marc, records, faculty, readable, pilot, hit,...</td>\n","      <td>[The MARC Pilot Project: The Final Report The ...</td>\n","    </tr>\n","    <tr>\n","      <th>24</th>\n","      <td>23</td>\n","      <td>14</td>\n","      <td>23_medlars_medline_twx_medicus</td>\n","      <td>[medlars, medline, twx, medicus, medicine, nlm...</td>\n","      <td>[MEDLARS: A Summary Review and Evaluation of T...</td>\n","    </tr>\n","    <tr>\n","      <th>25</th>\n","      <td>24</td>\n","      <td>14</td>\n","      <td>24_thesaurus_thesauri_construction_updating</td>\n","      <td>[thesaurus, thesauri, construction, updating, ...</td>\n","      <td>[Automatic Construction of Thesauri and of Con...</td>\n","    </tr>\n","    <tr>\n","      <th>26</th>\n","      <td>25</td>\n","      <td>13</td>\n","      <td>25_linguistic_automata_language_linguistics</td>\n","      <td>[linguistic, automata, language, linguistics, ...</td>\n","      <td>[Adventures in Theory of Languages In trying t...</td>\n","    </tr>\n","    <tr>\n","      <th>27</th>\n","      <td>26</td>\n","      <td>12</td>\n","      <td>26_organizations_organizational_business_enter...</td>\n","      <td>[organizations, organizational, business, ente...</td>\n","      <td>[Principles of Operations Research with Applic...</td>\n","    </tr>\n","    <tr>\n","      <th>28</th>\n","      <td>27</td>\n","      <td>12</td>\n","      <td>27_journal_primary_museum_journals</td>\n","      <td>[journal, primary, museum, journals, abstract,...</td>\n","      <td>[Tests on 
Abstracts Journals The amount of sci...</td>\n","    </tr>\n","    <tr>\n","      <th>29</th>\n","      <td>28</td>\n","      <td>11</td>\n","      <td>28_awareness_sdi_current_notices</td>\n","      <td>[awareness, sdi, current, notices, disseminati...</td>\n","      <td>[The Implementation, Evaluation, and Refinemen...</td>\n","    </tr>\n","    <tr>\n","      <th>30</th>\n","      <td>29</td>\n","      <td>10</td>\n","      <td>29_chemical_gremas_idc_nerac</td>\n","      <td>[chemical, gremas, idc, nerac, darc, chemists,...</td>\n","      <td>[A Chemical Search System for a Small Computer...</td>\n","    </tr>\n","    <tr>\n","      <th>31</th>\n","      <td>30</td>\n","      <td>10</td>\n","      <td>30_semantic_text_language_russian</td>\n","      <td>[semantic, text, language, russian, assertions...</td>\n","      <td>[Linguistics and Information Science This pape...</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>"],"text/plain":["    Topic  Count                                               Name  \\\n","0      -1    415              -1_literature_titles_search_abstracts   \n","1       0    256            0_libraries_library_university_academic   \n","2       1     98       1_social_communication_scientists_scientific   \n","3       2     85                 2_information_science_needs_theory   \n","4       3     66          3_indexing_automatic_index_classification   \n","5       4     42             4_relevance_relevant_measure_retrieval   \n","6       5     38                    5_bases_data_line_bibliographic   \n","7       6     35               6_citation_citations_coupling_citing   \n","8       7     35                 7_notation_chemical_compounds_atom   \n","9       8     30                    8_catalog_catalogs_card_divided   \n","10      9     26               9_compression_inverted_length_coding   \n","11     10     25                10_medical_health_hospital_manpower   \n","12     11     24           
11_journals_coverage_periodical_articles   \n","13     12     21            12_classification_dewey_decimal_schemes   \n","14     13     20   13_automation_telefacsimile_automated_processing   \n","15     14     20                     14_retrieval_user_systems_isrs   \n","16     15     18                 15_fuzzy_sets_classification_hedge   \n","17     16     17                     16_evaluation_costs_cost_scale   \n","18     17     16                17_serials_isbd_rules_international   \n","19     18     16                 18_centers_services_systems_annual   \n","20     19     16       19_microfiche_microforms_microform_microfilm   \n","21     20     15               20_network_networks_cable_television   \n","22     21     15                  21_bradford_law_zipf_distribution   \n","23     22     15                   22_marc_records_faculty_readable   \n","24     23     14                     23_medlars_medline_twx_medicus   \n","25     24     14        24_thesaurus_thesauri_construction_updating   \n","26     25     13        25_linguistic_automata_language_linguistics   \n","27     26     12  26_organizations_organizational_business_enter...   \n","28     27     12                 27_journal_primary_museum_journals   \n","29     28     11                   28_awareness_sdi_current_notices   \n","30     29     10                       29_chemical_gremas_idc_nerac   \n","31     30     10                  30_semantic_text_language_russian   \n","\n","                                       Representation  \\\n","0   [literature, titles, search, abstracts, data, ...   \n","1   [libraries, library, university, academic, pub...   \n","2   [social, communication, scientists, scientific...   \n","3   [information, science, needs, theory, flow, sc...   \n","4   [indexing, automatic, index, classification, d...   \n","5   [relevance, relevant, measure, retrieval, docu...   \n","6   [bases, data, line, bibliographic, services, r...   
\n","7   [citation, citations, coupling, citing, cross,...   \n","8   [notation, chemical, compounds, atom, structur...   \n","9   [catalog, catalogs, card, divided, cataloging,...   \n","10  [compression, inverted, length, coding, keys, ...   \n","11  [medical, health, hospital, manpower, hospital...   \n","12  [journals, coverage, periodical, articles, mon...   \n","13  [classification, dewey, decimal, schemes, cata...   \n","14  [automation, telefacsimile, automated, process...   \n","15  [retrieval, user, systems, isrs, interaction, ...   \n","16  [fuzzy, sets, classification, hedge, membershi...   \n","17  [evaluation, costs, cost, scale, appraisal, sy...   \n","18  [serials, isbd, rules, international, entry, a...   \n","19  [centers, services, systems, annual, chapters,...   \n","20  [microfiche, microforms, microform, microfilm,...   \n","21  [network, networks, cable, television, communi...   \n","22  [bradford, law, zipf, distribution, straight, ...   \n","23  [marc, records, faculty, readable, pilot, hit,...   \n","24  [medlars, medline, twx, medicus, medicine, nlm...   \n","25  [thesaurus, thesauri, construction, updating, ...   \n","26  [linguistic, automata, language, linguistics, ...   \n","27  [organizations, organizational, business, ente...   \n","28  [journal, primary, museum, journals, abstract,...   \n","29  [awareness, sdi, current, notices, disseminati...   \n","30  [chemical, gremas, idc, nerac, darc, chemists,...   \n","31  [semantic, text, language, russian, assertions...   \n","\n","                                  Representative_Docs  \n","0   [Future Developments in Telecommunications Com...  \n","1   [Measuring Readers' Failure at the Shelf in Th...  \n","2   [The Coming Crisis of Western Sociology The cr...  \n","3   [Medical Libraries and the Assessment of User ...  \n","4   [What Makes An Automatic Keyword Classificatio...  \n","5   [On Relevance, Probabilistic Indexing and Info...  
\n","6   [User Assessment of Computer-Based Bibliograph...  \n","7   [Citation Indexes for Science In this paper I ...  \n","8   [The Chemical Abstracts Service Chemical Regis...  \n","9   [The Potential Usefulness of Catalog Access Po...  \n","10  [An Information-Theoretic Approach to Text Sea...  \n","11  [Library Practice in Hospitals According to a ...  \n","12  [Citation Patterns fo the Cardiovascular Seria...  \n","13  [Progress in Documentation Thirty years or mor...  \n","14  [HDB of Data Processing for Libraries The four...  \n","15  [Human Factors in the Design of an Interactive...  \n","16  [Prospects for a New General Classification In...  \n","17  [Standard Costing for Information Systems: Bac...  \n","18  [No Special Rules for Entry of Serials One of ...  \n","19  [Annual Review of Information Science and Tech...  \n","20  [The Microform Revolution Librarians have trie...  \n","21  [The National Biomedical Communications Networ...  \n","22  [A New Look at Reference Scattering It was fir...  \n","23  [The MARC Pilot Project: The Final Report The ...  \n","24  [MEDLARS: A Summary Review and Evaluation of T...  \n","25  [Automatic Construction of Thesauri and of Con...  \n","26  [Adventures in Theory of Languages In trying t...  \n","27  [Principles of Operations Research with Applic...  \n","28  [Tests on Abstracts Journals The amount of sci...  \n","29  [The Implementation, Evaluation, and Refinemen...  \n","30  [A Chemical Search System for a Small Computer...  \n","31  [Linguistics and Information Science This pape...  
"]},"execution_count":10,"metadata":{},"output_type":"execute_result"}],"source":["topic_model.get_topic_info()"]},{"cell_type":"code","execution_count":11,"metadata":{},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>Topic</th>\n","      <th>Count</th>\n","      <th>Name</th>\n","      <th>Representation</th>\n","      <th>Representative_Docs</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>-1</td>\n","      <td>415</td>\n","      <td>-1_are_in_on_for</td>\n","      <td>[are, in, on, for, to, information, be, and, w...</td>\n","      <td>[Current Awareness Searches on CT, CBAS and AS...</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>0</td>\n","      <td>691</td>\n","      <td>0_library_libraries_information_for</td>\n","      <td>[library, libraries, information, for, is, to,...</td>\n","      <td>[Information Needs and Uses in Science and Tec...</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>1</td>\n","      <td>98</td>\n","      <td>1_social_communication_scientific_science</td>\n","      <td>[social, communication, scientific, science, s...</td>\n","      <td>[Communication among Scientists and Engineers ...</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>2</td>\n","      <td>71</td>\n","      <td>2_journals_citation_citations_journal</td>\n","      <td>[journals, citation, citations, journal, liter...</td>\n","      <td>[Citation Patterns fo the Cardiovascular Seria...</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>3</td>\n","      
<td>35</td>\n","      <td>3_chemical_notation_compounds_structures</td>\n","      <td>[chemical, notation, compounds, structures, at...</td>\n","      <td>[The Chemical Abstracts Service Chemical Regis...</td>\n","    </tr>\n","    <tr>\n","      <th>5</th>\n","      <td>4</td>\n","      <td>26</td>\n","      <td>4_compression_inverted_length_coding</td>\n","      <td>[compression, inverted, length, coding, file, ...</td>\n","      <td>[An Information-Theoretic Approach to Text Sea...</td>\n","    </tr>\n","    <tr>\n","      <th>6</th>\n","      <td>5</td>\n","      <td>18</td>\n","      <td>5_fuzzy_classification_sets_membership</td>\n","      <td>[fuzzy, classification, sets, membership, rela...</td>\n","      <td>[Prospects for a New General Classification In...</td>\n","    </tr>\n","    <tr>\n","      <th>7</th>\n","      <td>6</td>\n","      <td>16</td>\n","      <td>6_microfiche_microforms_microform_microfilm</td>\n","      <td>[microfiche, microforms, microform, microfilm,...</td>\n","      <td>[The Microform Revolution Librarians have trie...</td>\n","    </tr>\n","    <tr>\n","      <th>8</th>\n","      <td>7</td>\n","      <td>15</td>\n","      <td>7_bradford_law_zipf_distribution</td>\n","      <td>[bradford, law, zipf, distribution, references...</td>\n","      <td>[Progress in Documentation Empirical Hyperboli...</td>\n","    </tr>\n","    <tr>\n","      <th>9</th>\n","      <td>8</td>\n","      <td>15</td>\n","      <td>8_network_networks_cable_television</td>\n","      <td>[network, networks, cable, television, communi...</td>\n","      <td>[The National Biomedical Communications Networ...</td>\n","    </tr>\n","    <tr>\n","      <th>10</th>\n","      <td>9</td>\n","      <td>14</td>\n","      <td>9_medlars_medline_twx_medicus</td>\n","      <td>[medlars, medline, twx, medicus, medicine, nlm...</td>\n","      <td>[MEDLARS: A Summary Review and Evaluation of T...</td>\n","    </tr>\n","    <tr>\n","      <th>11</th>\n","      <td>10</td>\n","      
<td>13</td>\n","      <td>10_language_linguistic_automata_linguistics</td>\n","      <td>[language, linguistic, automata, linguistics, ...</td>\n","      <td>[Adventures in Theory of Languages In trying t...</td>\n","    </tr>\n","    <tr>\n","      <th>12</th>\n","      <td>11</td>\n","      <td>12</td>\n","      <td>11_organizations_organizational_business_enter...</td>\n","      <td>[organizations, organizational, business, ente...</td>\n","      <td>[Principles of Operations Research with Applic...</td>\n","    </tr>\n","    <tr>\n","      <th>13</th>\n","      <td>12</td>\n","      <td>11</td>\n","      <td>12_awareness_current_sdi_dissemination</td>\n","      <td>[awareness, current, sdi, dissemination, notic...</td>\n","      <td>[The Implementation, Evaluation, and Refinemen...</td>\n","    </tr>\n","    <tr>\n","      <th>14</th>\n","      <td>13</td>\n","      <td>10</td>\n","      <td>13_chemical_gremas_idc_darc</td>\n","      <td>[chemical, gremas, idc, darc, nerac, chemists,...</td>\n","      <td>[A Chemical Search System for a Small Computer...</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>"],"text/plain":["    Topic  Count                                               Name  \\\n","0      -1    415                                   -1_are_in_on_for   \n","1       0    691                0_library_libraries_information_for   \n","2       1     98          1_social_communication_scientific_science   \n","3       2     71              2_journals_citation_citations_journal   \n","4       3     35           3_chemical_notation_compounds_structures   \n","5       4     26               4_compression_inverted_length_coding   \n","6       5     18             5_fuzzy_classification_sets_membership   \n","7       6     16        6_microfiche_microforms_microform_microfilm   \n","8       7     15                   7_bradford_law_zipf_distribution   \n","9       8     15                8_network_networks_cable_television   \n","10      9     14        
              9_medlars_medline_twx_medicus   \n","11     10     13        10_language_linguistic_automata_linguistics   \n","12     11     12  11_organizations_organizational_business_enter...   \n","13     12     11             12_awareness_current_sdi_dissemination   \n","14     13     10                        13_chemical_gremas_idc_darc   \n","\n","                                       Representation  \\\n","0   [are, in, on, for, to, information, be, and, w...   \n","1   [library, libraries, information, for, is, to,...   \n","2   [social, communication, scientific, science, s...   \n","3   [journals, citation, citations, journal, liter...   \n","4   [chemical, notation, compounds, structures, at...   \n","5   [compression, inverted, length, coding, file, ...   \n","6   [fuzzy, classification, sets, membership, rela...   \n","7   [microfiche, microforms, microform, microfilm,...   \n","8   [bradford, law, zipf, distribution, references...   \n","9   [network, networks, cable, television, communi...   \n","10  [medlars, medline, twx, medicus, medicine, nlm...   \n","11  [language, linguistic, automata, linguistics, ...   \n","12  [organizations, organizational, business, ente...   \n","13  [awareness, current, sdi, dissemination, notic...   \n","14  [chemical, gremas, idc, darc, nerac, chemists,...   \n","\n","                                  Representative_Docs  \n","0   [Current Awareness Searches on CT, CBAS and AS...  \n","1   [Information Needs and Uses in Science and Tec...  \n","2   [Communication among Scientists and Engineers ...  \n","3   [Citation Patterns fo the Cardiovascular Seria...  \n","4   [The Chemical Abstracts Service Chemical Regis...  \n","5   [An Information-Theoretic Approach to Text Sea...  \n","6   [Prospects for a New General Classification In...  \n","7   [The Microform Revolution Librarians have trie...  \n","8   [Progress in Documentation Empirical Hyperboli...  \n","9   [The National Biomedical Communications Networ...  
# %%
# Per-document topic assignments, one table per fitted model.
# Each table must come from the model whose topic ids it will be compared
# against: `bertopic_reranker(..., auto=True)` matches topics taken from
# `query_topics_auto` (produced by `topic_model_auto`) against `doc_info_auto`,
# so building `doc_info_auto` from `topic_model` would compare ids from two
# unrelated topic numberings.
doc_info = topic_model.get_document_info(docs["text"])
doc_info_auto = topic_model_auto.get_document_info(docs["text"])
28, 27],\n","  [0.394485, 0.28473014, 0.2793713, 0.27368593, 0.27084613]),\n"," ([5, 13, 0, -1, 18],\n","  [0.66717654, 0.60948914, 0.605474, 0.60479736, 0.55022144]),\n"," ([14, 5, 18, -1, 13],\n","  [0.6695314, 0.64486283, 0.59228134, 0.57756793, 0.531935]),\n"," ([14, 5, 4, -1, 16],\n","  [0.6451344, 0.6150092, 0.61491966, 0.59280485, 0.5479083]),\n"," ([7, 29, 15, 9, 30],\n","  [0.65809774, 0.39408958, 0.30392945, 0.26923907, 0.24304281]),\n"," ([9, 3, 7, 14, 5],\n","  [0.5720229, 0.45131856, 0.44804746, 0.43973652, 0.41046298]),\n"," ([16, 14, 18, 13, 2],\n","  [0.55782694, 0.45003995, 0.4401703, 0.43288237, 0.35791236]),\n"," ([2, 18, 16, 5, 14],\n","  [0.67245257, 0.6078949, 0.54909295, 0.5179424, 0.49667463]),\n"," ([23, -1, 2, 3, 29],\n","  [0.57611, 0.45642027, 0.4525363, 0.42393416, 0.42241752]),\n"," ([13, 0, 18, 5, -1], [0.7885022, 0.7457253, 0.6765168, 0.6661721, 0.6611547]),\n"," ([2, 18, 16, 14, 20],\n","  [0.60421497, 0.54797477, 0.5194111, 0.4528476, 0.41852435]),\n"," ([18, 2, 20, 28, 16],\n","  [0.5248172, 0.4718421, 0.44392508, 0.44164604, 0.43669853]),\n"," ([16, 14, 13, 2, 18],\n","  [0.69062257, 0.57122266, 0.50894046, 0.5032575, 0.4928528]),\n"," ([3, 14, 9, -1, 4],\n","  [0.71340716, 0.69933736, 0.6340102, 0.60500455, 0.6034043]),\n"," ([29, 7, 5, -1, 18],\n","  [0.7544701, 0.5773044, 0.5170183, 0.51580286, 0.5039904]),\n"," ([3, 14, 9, 5, -1], [0.5976268, 0.5747107, 0.5592833, 0.5336425, 0.5292728]),\n"," ([27, 11, -1, 2, 18],\n","  [0.7268735, 0.59747267, 0.58568096, 0.5685785, 0.54794854]),\n"," ([2, 18, 16, 29, 27],\n","  [0.58573735, 0.54946905, 0.4939779, 0.45500967, 0.43335396]),\n"," ([9, 3, -1, 5, 22],\n","  [0.6762944, 0.66291255, 0.6535969, 0.5946059, 0.56346834]),\n"," ([14, 4, 5, 9, -1],\n","  [0.7240089, 0.60280347, 0.58723843, 0.5539646, 0.5367634]),\n"," ([9, 3, 7, 14, -1],\n","  [0.63498914, 0.58494914, 0.48357353, 0.46796748, 0.4317798]),\n"," ([18, 5, 28, 20, 2],\n","  [0.5713387, 0.49572966, 0.49325383, 0.48507428, 
0.47862524]),\n"," ([30, 25, 24, -1, 18],\n","  [0.52570724, 0.4609796, 0.3537985, 0.31831554, 0.29878813]),\n"," ([3, 24, -1, 12, 9],\n","  [0.6645347, 0.60953087, 0.5462487, 0.5459014, 0.50182223]),\n"," ([3, 24, 14, -1, 4], [0.64134264, 0.6391661, 0.63513565, 0.6339748, 0.59182]),\n"," ([14, 18, 5, 13, -1], [0.6797056, 0.6126902, 0.5904062, 0.5702255, 0.561237]),\n"," ([30, 13, -1, 22, 5],\n","  [0.47840467, 0.43768466, 0.42415893, 0.42225903, 0.40975153]),\n"," ([-1, 3, 9, 12, 8],\n","  [0.5098518, 0.50381076, 0.49647248, 0.4958871, 0.47943395]),\n"," ([14, 4, -1, 18, 2],\n","  [0.71822476, 0.6543433, 0.64678645, 0.6105839, 0.6019604]),\n"," ([14, -1, 4, 3, 5],\n","  [0.7416566, 0.67859066, 0.6320094, 0.60466164, 0.59392494]),\n"," ([27, 11, -1, 6, 5],\n","  [0.7159792, 0.6242793, 0.55492496, 0.52134204, 0.45982027]),\n"," ([13, 0, 5, 20, 18],\n","  [0.66124797, 0.4436018, 0.33302006, 0.33021757, 0.3279614]),\n"," ([13, 5, 0, 22, 18],\n","  [0.7598246, 0.53286564, 0.52860546, 0.5117869, 0.4979099]),\n"," ([-1, 5, 4, 14, 18],\n","  [0.5376029, 0.51596075, 0.49152702, 0.48902744, 0.48768216]),\n"," ([14, 5, 4, -1, 18],\n","  [0.57128096, 0.5528879, 0.54887265, 0.5370271, 0.49455565]),\n"," ([14, -1, 4, 5, 2], [0.6658204, 0.6513024, 0.6200825, 0.6149922, 0.565948]),\n"," ([12, 4, 14, -1, 3],\n","  [0.5731784, 0.5267728, 0.52332544, 0.5044216, 0.50299174]),\n"," ([3, 9, 14, 4, -1],\n","  [0.5672143, 0.50181603, 0.48238015, 0.48028436, 0.46403727]),\n"," ([23, 11, -1, 10, 6],\n","  [0.796619, 0.59637135, 0.5574412, 0.5481226, 0.4925297]),\n"," ([23, 13, 5, -1, 10],\n","  [0.5235565, 0.48535678, 0.44240212, 0.42456418, 0.41912878]),\n"," ([13, 0, 18, -1, 5],\n","  [0.72563654, 0.5774363, 0.4999836, 0.49953014, 0.4985675]),\n"," ([23, -1, 11, 4, 3],\n","  [0.78510076, 0.5536922, 0.5091709, 0.48407567, 0.4652769]),\n"," ([8, 12, -1, 3, 9],\n","  [0.68968606, 0.6349219, 0.58376145, 0.5704099, 0.56006896]),\n"," ([17, 12, 3, 8, 22],\n","  [0.5310513, 0.5119928, 
0.4982757, 0.49805814, 0.47424167]),\n"," ([20, 13, 18, 0, 5], [0.7841825, 0.61952144, 0.5801623, 0.556049, 0.5343365]),\n"," ([4, 3, 14, -1, 9],\n","  [0.7910535, 0.73730683, 0.66456765, 0.66065866, 0.6074703]),\n"," ([4, 6, 3, -1, 5], [0.6828742, 0.62897843, 0.60692346, 0.5633592, 0.5235776]),\n"," ([4, 14, 3, -1, 5], [0.7216079, 0.6466953, 0.6209685, 0.5954999, 0.5826736]),\n"," ([4, 3, 15, 14, 9], [0.5961854, 0.5541898, 0.5418922, 0.525858, 0.47196358]),\n"," ([14, 5, 4, 3, -1],\n","  [0.76300657, 0.68121135, 0.6755144, 0.67414314, 0.6524559]),\n"," ([2, 18, 16, -1, 4],\n","  [0.62062484, 0.4620458, 0.46013168, 0.44602284, 0.43323788]),\n"," ([3, 4, 14, -1, 9], [0.65397364, 0.6097267, 0.5654292, 0.518716, 0.51716286]),\n"," ([20, 18, 5, 13, 0],\n","  [0.8496496, 0.68585455, 0.68538654, 0.6273924, 0.6232301]),\n"," ([3, 24, -1, 9, 30],\n","  [0.7020184, 0.6532426, 0.62343764, 0.61624086, 0.61549336]),\n"," ([3, 4, 9, -1, 6],\n","  [0.73147357, 0.67798525, 0.6278658, 0.59919214, 0.5528325]),\n"," ([24, 3, -1, 30, 9], [0.7576981, 0.5957972, 0.5850156, 0.54154, 0.5213249]),\n"," ([16, 14, 2, 18, -1],\n","  [0.5931117, 0.5319371, 0.52351785, 0.5175258, 0.45663232]),\n"," ([3, -1, 30, 24, 9],\n","  [0.72187364, 0.6885881, 0.6628084, 0.6276779, 0.5984311]),\n"," ([4, 3, -1, 9, 14],\n","  [0.71987563, 0.5929074, 0.58418965, 0.54974097, 0.5274082]),\n"," ([9, 3, 14, 4, 5], [0.721241, 0.56219536, 0.494873, 0.4892523, 0.46870685]),\n"," ([20, 0, 5, 18, 13],\n","  [0.7657826, 0.594528, 0.55305135, 0.55194545, 0.52247596]),\n"," ([30, 25, 24, 15, 12],\n","  [0.6220827, 0.57177144, 0.35192466, 0.34379962, 0.3258144]),\n"," ([20, 0, 18, 13, 5], [0.7472915, 0.6888541, 0.5747257, 0.5620423, 0.5394141]),\n"," ([9, 3, 4, -1, 30],\n","  [0.66901183, 0.6572521, 0.5920358, 0.55391014, 0.54544646]),\n"," ([25, 30, 24, 15, 12],\n","  [0.585333, 0.5620806, 0.42489907, 0.40980315, 0.31456697]),\n"," ([9, 3, 4, 14, 30], [0.6640452, 0.5684121, 0.5123369, 0.46819523, 0.4440838]),\n"," 
([21, 11, 6, 27, -1],\n","  [0.65961766, 0.45226324, 0.44113496, 0.43925506, 0.40345058]),\n"," ([24, 30, 14, 9, -1],\n","  [0.5934275, 0.54141045, 0.51478636, 0.49261615, 0.4837809]),\n"," ([4, 14, 5, -1, 9],\n","  [0.5755979, 0.55964714, 0.5157747, 0.51070285, 0.50819844]),\n"," ([18, 13, 14, 5, 22],\n","  [0.59549654, 0.5647844, 0.52086574, 0.5111462, 0.47077027]),\n"," ([4, 3, 14, -1, 9], [0.8162118, 0.71314937, 0.6604308, 0.61543566, 0.6099]),\n"," ([30, 24, 25, 3, 15],\n","  [0.6665852, 0.4976496, 0.4638757, 0.44847465, 0.42205635]),\n"," ([9, 3, 30, 4, 7], [0.5720993, 0.4860065, 0.42191255, 0.39000756, 0.3822816]),\n"," ([5, 19, -1, 18, 14],\n","  [0.68209416, 0.6696117, 0.5830734, 0.5684111, 0.5637687]),\n"," ([14, 30, 4, 5, -1],\n","  [0.59358275, 0.57526284, 0.515999, 0.48664922, 0.4850039]),\n"," ([0, 5, -1, 16, 13],\n","  [0.68301666, 0.6321294, 0.5743218, 0.53619343, 0.5227872]),\n"," ([6, -1, 30, 12, 3],\n","  [0.6289332, 0.5093118, 0.4571755, 0.45250112, 0.44566888]),\n"," ([6, -1, 21, 5, 11], [0.648805, 0.5740442, 0.5150868, 0.50268537, 0.4968233]),\n"," ([5, -1, 11, 22, 6],\n","  [0.539591, 0.4807731, 0.46246946, 0.43677282, 0.4345572]),\n"," ([3, 9, 4, 14, -1],\n","  [0.6784905, 0.63434297, 0.62956864, 0.5518484, 0.5447004]),\n"," ([14, 5, 13, -1, 4],\n","  [0.66169286, 0.639245, 0.5921581, 0.59066737, 0.5851741]),\n"," ([4, 3, 14, 9, -1], [0.64783704, 0.5598002, 0.5315668, 0.531181, 0.5185517]),\n"," ([4, 3, 14, 9, -1],\n","  [0.655092, 0.56767666, 0.53512454, 0.51735973, 0.48987702]),\n"," ([4, 14, 3, 9, -1],\n","  [0.695344, 0.6204849, 0.61488724, 0.56612134, 0.48218632]),\n"," ([14, 5, -1, 4, 9],\n","  [0.7659029, 0.65595627, 0.6319759, 0.62529624, 0.56728673]),\n"," ([14, 5, 4, -1, 3],\n","  [0.7901465, 0.7182028, 0.7002049, 0.68956196, 0.67775786]),\n"," ([14, 3, -1, 4, 5], [0.62957495, 0.6035372, 0.5865491, 0.58431584, 0.573501]),\n"," ([14, 9, 4, 3, 5], [0.6247927, 0.60066414, 0.56634873, 0.5615567, 0.5076549]),\n"," ([3, 4, 9, -1, 24], 
# %%
# Top-5 (topics, similarities) pairs per query for the full model, in query order.
all_query_topics = [
    topic_model.find_topics(query_text, top_n=5) for query_text in queries["text"]
]
# Preview only: the full list has one entry per query and floods the output.
all_query_topics[:5]

# %%
def best_topic_per_query(model, query_frame, top_n=5):
    """Map each query id to the single best-matching topic of ``model``.

    Parameters
    ----------
    model : object exposing ``find_topics(text, top_n=...)`` that returns a
        ``(topics, similarities)`` pair ordered best-first.
    query_frame : pandas.DataFrame
        One row per query; the first column is used as the query id and the
        ``"text"`` column as the query text.
    top_n : int
        Number of candidate topics requested from the model; only the first
        one is kept.

    Returns
    -------
    dict
        ``{query_id: topic_id}``.
    """
    best = {}
    for _, row in query_frame.iterrows():
        topics, _similarities = model.find_topics(row["text"], top_n=top_n)
        # `.iloc[0]` is the explicit form of the positional `row[0]`, which is
        # deprecated on label-indexed Series.
        best[row.iloc[0]] = topics[0]
    return best


# transform queries to topics
query_topics = best_topic_per_query(topic_model, queries)
query_topics_auto = best_topic_per_query(topic_model_auto, queries)

# %%
with open('./results/query_topics.pkl', 'wb') as f:
    pickle.dump(query_topics, f)
with open('./results/query_topics_auto.pkl', 'wb') as f:
    pickle.dump(query_topics_auto, f)
# %%
def bertopic_reranker(q_topics, initial_retrieval, query_id, k, lam=0.2, auto=False, doc_topics=None):
    """Re-rank an initial retrieval by rescaling documents that share the query's topic.

    Parameters
    ----------
    q_topics : dict
        ``{query_id: topic_id}``.
    initial_retrieval : dict
        ``{doc_id: score}`` where ``score[0]`` is the first-stage score.
    query_id : hashable
        Key into ``q_topics``.
    k : int
        Number of documents to keep. A negative ``k`` keeps every document.
    lam : float
        Factor applied to the score of a document whose topic equals the
        query's topic. Values above 1 promote such documents; values below 1
        (including the default 0.2) demote them.
    auto : bool
        When ``doc_topics`` is not given, selects the notebook-level
        ``doc_info_auto`` table (truthy) or ``doc_info`` (falsy).
    doc_topics : pandas.DataFrame, optional
        Table with a ``"Topic"`` column. When given it is used instead of the
        notebook-level tables, so the function does not depend on hidden state.

    Returns
    -------
    dict
        ``{doc_id: [score]}`` for the top ``k`` documents, in descending score
        order; ties keep their order from ``initial_retrieval``.

    Raises
    ------
    KeyError
        If ``query_id`` is missing from ``q_topics``.
    """
    query_topic = q_topics[query_id]
    if doc_topics is None:
        doc_topics = doc_info_auto if auto else doc_info
    # Hoisted out of the loop: only the topic column is ever read.
    topic_column = doc_topics["Topic"]
    # Topic -1 is never rescaled; a match on it says nothing about relatedness.
    rescale_matches = query_topic != -1

    rescored = {}
    for doc_id, score in initial_retrieval.items():
        # NOTE(review): assumes doc ids are 1-based and aligned with row
        # position in the table; a doc id of 0 would wrap to the last row.
        same_topic = rescale_matches and topic_column.iloc[doc_id - 1] == query_topic
        rescored[doc_id] = [score[0] * lam] if same_topic else [score[0]]

    # sorted() is stable, so equal scores keep their first-stage order.
    ranked = sorted(rescored.items(), key=lambda item: item[1][0], reverse=True)
    if k >= 0:
        ranked = ranked[:k]
    return dict(ranked)
# %%
# re-rank documents for all queries
lam_values = [1.2, 1.5, 2, 2.5, 3, 10]
TOP_K = 50  # number of re-ranked documents kept per query


def rerank_all_queries(q_topics, lam_values, k=TOP_K, auto=False):
    """Run ``bertopic_reranker`` for every query and every ``lam`` value.

    Parameters
    ----------
    q_topics : dict
        ``{query_id: topic_id}`` passed through to ``bertopic_reranker``.
    lam_values : iterable of float
        Scaling factors to sweep.
    k : int
        Number of documents kept per query.
    auto : bool
        Passed through to ``bertopic_reranker``.

    Returns
    -------
    dict
        ``{lam: {query_id: [doc_id, ...]}}`` with document ids in re-ranked order.

    Reads the notebook-level ``queries_cleaned`` (first column used as the
    query id) and ``initial_retrieval_with_bm25_scores``.
    """
    # `.iloc[0]` is the explicit form of the deprecated positional `row[0]`;
    # the ids do not depend on lam, so collect them once.
    query_ids = [row.iloc[0] for _, row in queries_cleaned.iterrows()]

    results = {}
    for lam_value in lam_values:
        print(f'Currently retrieving for lam: {lam_value}')
        reranked = {}
        for query_id in query_ids:
            retrieved_documents = initial_retrieval_with_bm25_scores[query_id]
            top_documents = bertopic_reranker(
                q_topics, retrieved_documents, query_id, k, lam=lam_value, auto=auto
            )
            reranked[query_id] = list(top_documents.keys())
        results[lam_value] = reranked
    return results


# %%
# Full topic model.
results_for_different_lams = rerank_all_queries(query_topics, lam_values)

with open('./results/reranker_bertopic_results_topic_model.pkl', 'wb') as f:
    pickle.dump(results_for_different_lams, f)

# %%
# Auto-reduced topic model; kept under its own name so the full-model results
# above are not overwritten.
results_for_different_lams_auto = rerank_all_queries(query_topics_auto, lam_values, auto=True)

with open('./results/reranker_bertopic_results_topic_model_auto.pkl', 'wb') as f:
    pickle.dump(results_for_different_lams_auto, f)