{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "BERTopic Probabilities vs LDA Probabilities: https://github.com/MaartenGr/BERTopic/issues/763 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "UsageError: Line magic function `%pip3` not found.\n"
     ]
    }
   ],
   "source": [
    "%pip3 install -Uq sentence-transformers faiss-cpu accelerate hdbscan bertopic evaluate kaleido datasets>=2.11\n",
    "%pip install bertopic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification\n",
    "from transformers import pipeline\n",
    "import numpy as np\n",
    "import torch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from bertopic import BERTopic\n",
    "from bertopic.vectorizers import ClassTfidfTransformer\n",
    "import plotly\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "\n",
    "model_name = 'sentence-transformers/all-MiniLM-L6-v2'\n",
    "\n",
    "docs = pd.read_csv(\"data/cisi-csv/docs.csv\")\n",
    "queries = pd.read_csv(\"data/cisi-csv/queries.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of documents: 1460\n",
      "Number of queries: 112\n"
     ]
    }
   ],
   "source": [
    "print(\"Number of documents:\", len(docs))\n",
    "print(\"Number of queries:\", len(queries))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def word_len(x):\n",
    "    return len(x.split(\" \"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average length of documents in characters: 833.1465753424658\n",
      "Min length of documents in characters: 70\n",
      "Max length of documents in characters: 3886\n",
      "Average length of documents in words: 127.7904109589041\n",
      "Min length of documents in words: 10\n",
      "Max length of documents in words: 567\n",
      "Average length of queries in characters: 580.4642857142857\n",
      "Min length of queries in characters: 38\n",
      "Max length of queries in characters: 2178\n",
      "Average length of queries in words: 89.10714285714286\n",
      "Min length of queries in words: 4\n",
      "Max length of queries in words: 362\n"
     ]
    }
   ],
   "source": [
    "docs[\"length\"] = docs[\"text\"].apply(len)\n",
    "docs[\"length_words\"] = docs[\"text\"].apply(word_len)\n",
    "print(\"Average length of documents in characters:\", docs.loc[:, 'length'].mean())\n",
    "print(\"Min length of documents in characters:\", docs.loc[:, 'length'].min())\n",
    "print(\"Max length of documents in characters:\", docs.loc[:, 'length'].max())\n",
    "print(\"Average length of documents in words:\", docs.loc[:, 'length_words'].mean())\n",
    "print(\"Min length of documents in words:\", docs.loc[:, 'length_words'].min())\n",
    "print(\"Max length of documents in words:\", docs.loc[:, 'length_words'].max())\n",
    "queries[\"length\"] = queries[\"text\"].apply(len)\n",
    "queries[\"length_words\"] = queries[\"text\"].apply(word_len)\n",
    "print(\"Average length of queries in characters:\", queries.loc[:, 'length'].mean())\n",
    "print(\"Min length of queries in characters:\", queries.loc[:, 'length'].min())\n",
    "print(\"Max length of queries in characters:\", queries.loc[:, 'length'].max())\n",
    "print(\"Average length of queries in words:\", queries.loc[:, 'length_words'].mean())\n",
    "print(\"Min length of queries in words:\", queries.loc[:, 'length_words'].min())\n",
    "print(\"Max length of queries in words:\", queries.loc[:, 'length_words'].max())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "",
      "text/plain": [
       "<Figure size 640x480 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "docs['length_words'].plot(kind='hist', bins=30, edgecolor='black', alpha=0.7)\n",
    "plt.xlabel('Length')\n",
    "plt.ylabel('Frequency')\n",
    "plt.title('Length of Documents (in words)')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "",
      "text/plain": [
       "<Figure size 640x480 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "queries['length_words'].plot(kind='hist', bins=30, edgecolor='black', alpha=0.7)\n",
    "plt.xlabel('Length')\n",
    "plt.ylabel('Frequency')\n",
    "plt.title('Length of Queries (in words)')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Topic</th>\n",
       "      <th>Count</th>\n",
       "      <th>Name</th>\n",
       "      <th>Representation</th>\n",
       "      <th>Representative_Docs</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-1</td>\n",
       "      <td>389</td>\n",
       "      <td>-1_literature_on_with_an</td>\n",
       "      <td>[literature, on, with, an, be, scientific, sub...</td>\n",
       "      <td>[Library Optimum Sir,-In his recent article B....</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>259</td>\n",
       "      <td>0_libraries_library_university_academic</td>\n",
       "      <td>[libraries, library, university, academic, pub...</td>\n",
       "      <td>[Cooperation Between Types of Libraries This b...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>88</td>\n",
       "      <td>1_chemical_compounds_notation_search</td>\n",
       "      <td>[chemical, compounds, notation, search, ca, ti...</td>\n",
       "      <td>[Experiences of IIT Research Institute in Oper...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2</td>\n",
       "      <td>82</td>\n",
       "      <td>2_information_science_theory_needs</td>\n",
       "      <td>[information, science, theory, needs, flow, in...</td>\n",
       "      <td>[Science and Information Theory A new scientif...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3</td>\n",
       "      <td>63</td>\n",
       "      <td>3_automatic_indexing_index_terms</td>\n",
       "      <td>[automatic, indexing, index, terms, document, ...</td>\n",
       "      <td>[What Makes An Automatic Keyword Classificatio...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>4</td>\n",
       "      <td>61</td>\n",
       "      <td>4_social_psychology_sociology_science</td>\n",
       "      <td>[social, psychology, sociology, science, behav...</td>\n",
       "      <td>[Is a Scientific Revolution Taking Place in Ps...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>5</td>\n",
       "      <td>45</td>\n",
       "      <td>5_relevance_answer_relevant_retrieval</td>\n",
       "      <td>[relevance, answer, relevant, retrieval, docum...</td>\n",
       "      <td>[On Relevance, Probabilistic Indexing and Info...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>6</td>\n",
       "      <td>39</td>\n",
       "      <td>6_bases_data_bibliographic_line</td>\n",
       "      <td>[bases, data, bibliographic, line, readable, s...</td>\n",
       "      <td>[Survey of Commercially Available Computer-Rea...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>7</td>\n",
       "      <td>37</td>\n",
       "      <td>7_citation_citations_papers_citing</td>\n",
       "      <td>[citation, citations, papers, citing, cross, s...</td>\n",
       "      <td>[Improvement of the Selectivity of Citation In...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>8</td>\n",
       "      <td>32</td>\n",
       "      <td>8_catalog_catalogs_card_cataloging</td>\n",
       "      <td>[catalog, catalogs, card, cataloging, divided,...</td>\n",
       "      <td>[The Recording of Library of Congress Bibliogr...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>9</td>\n",
       "      <td>29</td>\n",
       "      <td>9_communication_informal_scientists_social</td>\n",
       "      <td>[communication, informal, scientists, social, ...</td>\n",
       "      <td>[Research Studies in Patterns of Scientific Co...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>10</td>\n",
       "      <td>28</td>\n",
       "      <td>10_journals_coverage_journal_articles</td>\n",
       "      <td>[journals, coverage, journal, articles, period...</td>\n",
       "      <td>[Citation Patterns fo the Cardiovascular Seria...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>11</td>\n",
       "      <td>26</td>\n",
       "      <td>11_language_linguistics_linguistic_text</td>\n",
       "      <td>[language, linguistics, linguistic, text, sema...</td>\n",
       "      <td>[Functional Approach The present book sums up ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>12</td>\n",
       "      <td>25</td>\n",
       "      <td>12_medical_health_hospital_manpower</td>\n",
       "      <td>[medical, health, hospital, manpower, hospital...</td>\n",
       "      <td>[Library Practice in Hospitals According to a ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>13</td>\n",
       "      <td>24</td>\n",
       "      <td>13_retrieval_user_interactive_system</td>\n",
       "      <td>[retrieval, user, interactive, system, systems...</td>\n",
       "      <td>[Human Factors in the Design of an Interactive...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>14</td>\n",
       "      <td>21</td>\n",
       "      <td>14_classification_decimal_dewey_schemes</td>\n",
       "      <td>[classification, decimal, dewey, schemes, udc,...</td>\n",
       "      <td>[Adopting the Library of Congress Classificati...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>15</td>\n",
       "      <td>21</td>\n",
       "      <td>15_automation_telefacsimile_processing_automated</td>\n",
       "      <td>[automation, telefacsimile, processing, automa...</td>\n",
       "      <td>[Application of Computer Technology to Library...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>16</td>\n",
       "      <td>18</td>\n",
       "      <td>16_fuzzy_sets_classification_hedge</td>\n",
       "      <td>[fuzzy, sets, classification, hedge, membershi...</td>\n",
       "      <td>[Prospects for a New General Classification In...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>17</td>\n",
       "      <td>17</td>\n",
       "      <td>17_bradford_law_references_zipf</td>\n",
       "      <td>[bradford, law, references, zipf, distribution...</td>\n",
       "      <td>[Progress in Documentation Empirical Hyperboli...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>18</td>\n",
       "      <td>17</td>\n",
       "      <td>18_evaluation_costs_cost_scale</td>\n",
       "      <td>[evaluation, costs, cost, scale, appraisal, sy...</td>\n",
       "      <td>[Standard Costing for Information Systems: Bac...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>19</td>\n",
       "      <td>16</td>\n",
       "      <td>19_centers_services_annual_systems</td>\n",
       "      <td>[centers, services, annual, systems, informati...</td>\n",
       "      <td>[Annual Review of Information Science and Tech...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>20</td>\n",
       "      <td>16</td>\n",
       "      <td>20_serials_isbd_international_entry</td>\n",
       "      <td>[serials, isbd, international, entry, rules, s...</td>\n",
       "      <td>[No Special Rules for Entry of Serials One of ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>21</td>\n",
       "      <td>15</td>\n",
       "      <td>21_thesaurus_thesauri_construction_vocabulary</td>\n",
       "      <td>[thesaurus, thesauri, construction, vocabulary...</td>\n",
       "      <td>[Automatic Construction of Thesauri and of Con...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>22</td>\n",
       "      <td>15</td>\n",
       "      <td>22_microfiche_microforms_microform_microfilm</td>\n",
       "      <td>[microfiche, microforms, microform, microfilm,...</td>\n",
       "      <td>[The Microform Revolution Librarians have trie...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>23</td>\n",
       "      <td>14</td>\n",
       "      <td>23_medlars_medline_twx_medicus</td>\n",
       "      <td>[medlars, medline, twx, medicus, medicine, nlm...</td>\n",
       "      <td>[MEDLARS: A Summary Review and Evaluation of T...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>24</td>\n",
       "      <td>14</td>\n",
       "      <td>24_network_networks_cable_television</td>\n",
       "      <td>[network, networks, cable, television, implies...</td>\n",
       "      <td>[State of the Nation in Networking There is li...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>25</td>\n",
       "      <td>14</td>\n",
       "      <td>25_compression_length_coding_grams</td>\n",
       "      <td>[compression, length, coding, grams, character...</td>\n",
       "      <td>[An Information-Theoretic Approach to Text Sea...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>26</td>\n",
       "      <td>13</td>\n",
       "      <td>26_organizations_organizational_business_enter...</td>\n",
       "      <td>[organizations, organizational, business, ente...</td>\n",
       "      <td>[Principles of Operations Research with Applic...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>27</td>\n",
       "      <td>11</td>\n",
       "      <td>27_physics_viniti_journal_museum</td>\n",
       "      <td>[physics, viniti, journal, museum, abstract, m...</td>\n",
       "      <td>[Tests on Abstracts Journals The amount of sci...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>28</td>\n",
       "      <td>11</td>\n",
       "      <td>28_marc_records_pilot_cobol</td>\n",
       "      <td>[marc, records, pilot, cobol, ii, readable, re...</td>\n",
       "      <td>[The Marc II Format:                        A ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Topic  Count                                               Name  \\\n",
       "0      -1    389                           -1_literature_on_with_an   \n",
       "1       0    259            0_libraries_library_university_academic   \n",
       "2       1     88               1_chemical_compounds_notation_search   \n",
       "3       2     82                 2_information_science_theory_needs   \n",
       "4       3     63                   3_automatic_indexing_index_terms   \n",
       "5       4     61              4_social_psychology_sociology_science   \n",
       "6       5     45              5_relevance_answer_relevant_retrieval   \n",
       "7       6     39                    6_bases_data_bibliographic_line   \n",
       "8       7     37                 7_citation_citations_papers_citing   \n",
       "9       8     32                 8_catalog_catalogs_card_cataloging   \n",
       "10      9     29         9_communication_informal_scientists_social   \n",
       "11     10     28              10_journals_coverage_journal_articles   \n",
       "12     11     26            11_language_linguistics_linguistic_text   \n",
       "13     12     25                12_medical_health_hospital_manpower   \n",
       "14     13     24               13_retrieval_user_interactive_system   \n",
       "15     14     21            14_classification_decimal_dewey_schemes   \n",
       "16     15     21   15_automation_telefacsimile_processing_automated   \n",
       "17     16     18                 16_fuzzy_sets_classification_hedge   \n",
       "18     17     17                    17_bradford_law_references_zipf   \n",
       "19     18     17                     18_evaluation_costs_cost_scale   \n",
       "20     19     16                 19_centers_services_annual_systems   \n",
       "21     20     16                20_serials_isbd_international_entry   \n",
       "22     21     15      21_thesaurus_thesauri_construction_vocabulary   \n",
       "23     22     15       22_microfiche_microforms_microform_microfilm   \n",
       "24     23     14                     23_medlars_medline_twx_medicus   \n",
       "25     24     14               24_network_networks_cable_television   \n",
       "26     25     14                 25_compression_length_coding_grams   \n",
       "27     26     13  26_organizations_organizational_business_enter...   \n",
       "28     27     11                   27_physics_viniti_journal_museum   \n",
       "29     28     11                        28_marc_records_pilot_cobol   \n",
       "\n",
       "                                       Representation  \\\n",
       "0   [literature, on, with, an, be, scientific, sub...   \n",
       "1   [libraries, library, university, academic, pub...   \n",
       "2   [chemical, compounds, notation, search, ca, ti...   \n",
       "3   [information, science, theory, needs, flow, in...   \n",
       "4   [automatic, indexing, index, terms, document, ...   \n",
       "5   [social, psychology, sociology, science, behav...   \n",
       "6   [relevance, answer, relevant, retrieval, docum...   \n",
       "7   [bases, data, bibliographic, line, readable, s...   \n",
       "8   [citation, citations, papers, citing, cross, s...   \n",
       "9   [catalog, catalogs, card, cataloging, divided,...   \n",
       "10  [communication, informal, scientists, social, ...   \n",
       "11  [journals, coverage, journal, articles, period...   \n",
       "12  [language, linguistics, linguistic, text, sema...   \n",
       "13  [medical, health, hospital, manpower, hospital...   \n",
       "14  [retrieval, user, interactive, system, systems...   \n",
       "15  [classification, decimal, dewey, schemes, udc,...   \n",
       "16  [automation, telefacsimile, processing, automa...   \n",
       "17  [fuzzy, sets, classification, hedge, membershi...   \n",
       "18  [bradford, law, references, zipf, distribution...   \n",
       "19  [evaluation, costs, cost, scale, appraisal, sy...   \n",
       "20  [centers, services, annual, systems, informati...   \n",
       "21  [serials, isbd, international, entry, rules, s...   \n",
       "22  [thesaurus, thesauri, construction, vocabulary...   \n",
       "23  [microfiche, microforms, microform, microfilm,...   \n",
       "24  [medlars, medline, twx, medicus, medicine, nlm...   \n",
       "25  [network, networks, cable, television, implies...   \n",
       "26  [compression, length, coding, grams, character...   \n",
       "27  [organizations, organizational, business, ente...   \n",
       "28  [physics, viniti, journal, museum, abstract, m...   \n",
       "29  [marc, records, pilot, cobol, ii, readable, re...   \n",
       "\n",
       "                                  Representative_Docs  \n",
       "0   [Library Optimum Sir,-In his recent article B....  \n",
       "1   [Cooperation Between Types of Libraries This b...  \n",
       "2   [Experiences of IIT Research Institute in Oper...  \n",
       "3   [Science and Information Theory A new scientif...  \n",
       "4   [What Makes An Automatic Keyword Classificatio...  \n",
       "5   [Is a Scientific Revolution Taking Place in Ps...  \n",
       "6   [On Relevance, Probabilistic Indexing and Info...  \n",
       "7   [Survey of Commercially Available Computer-Rea...  \n",
       "8   [Improvement of the Selectivity of Citation In...  \n",
       "9   [The Recording of Library of Congress Bibliogr...  \n",
       "10  [Research Studies in Patterns of Scientific Co...  \n",
       "11  [Citation Patterns fo the Cardiovascular Seria...  \n",
       "12  [Functional Approach The present book sums up ...  \n",
       "13  [Library Practice in Hospitals According to a ...  \n",
       "14  [Human Factors in the Design of an Interactive...  \n",
       "15  [Adopting the Library of Congress Classificati...  \n",
       "16  [Application of Computer Technology to Library...  \n",
       "17  [Prospects for a New General Classification In...  \n",
       "18  [Progress in Documentation Empirical Hyperboli...  \n",
       "19  [Standard Costing for Information Systems: Bac...  \n",
       "20  [Annual Review of Information Science and Tech...  \n",
       "21  [No Special Rules for Entry of Serials One of ...  \n",
       "22  [Automatic Construction of Thesauri and of Con...  \n",
       "23  [The Microform Revolution Librarians have trie...  \n",
       "24  [MEDLARS: A Summary Review and Evaluation of T...  \n",
       "25  [State of the Nation in Networking There is li...  \n",
       "26  [An Information-Theoretic Approach to Text Sea...  \n",
       "27  [Principles of Operations Research with Applic...  \n",
       "28  [Tests on Abstracts Journals The amount of sci...  \n",
       "29  [The Marc II Format:                        A ...  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "docs_for_analysis = docs[\"text\"]\n",
    "\n",
    "# topic_model = BERTopic(embedding_model=model_name, ctfidf_model=ClassTfidfTransformer(reduce_frequent_words=True), nr_topics=\"auto\", calculate_probabilities=True)\n",
    "# topic_model = BERTopic(embedding_model=model_name, ctfidf_model=ClassTfidfTransformer(reduce_frequent_words=True), calculate_probabilities=True)\n",
    "topic_model = BERTopic(embedding_model=model_name, ctfidf_model=ClassTfidfTransformer(reduce_frequent_words=True))\n",
    "topic_model.fit(docs_for_analysis)\n",
    "topic_info = topic_model.get_topic_info()\n",
    "topic_info\n",
    "\n",
    "# Topic \"-1\": When using HDBSCAN, DBSCAN, or OPTICS, a number of outlier documents might be created\n",
    "# that do not fall within any of the created topics. These are labeled as -1."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.plotly.v1+json": {
       "config": {
        "plotlyServerURL": "https://plot.ly"
       },
       "data": [
        {
         "coloraxis": "coloraxis",
         "hovertemplate": "x: %{x}<br>y: %{y}<br>Similarity Score: %{z}<extra></extra>",
         "name": "0",
         "type": "heatmap",
         "x": [
          "0_libraries_library_univers...",
          "1_chemical_compounds_notation",
          "2_information_science_theory",
          "3_automatic_indexing_index",
          "4_social_psychology_sociology",
          "5_relevance_answer_relevant",
          "6_bases_data_bibliographic",
          "7_citation_citations_papers",
          "8_catalog_catalogs_card",
          "9_communication_informal_sc...",
          "10_journals_coverage_journal",
          "11_language_linguistics_lin...",
          "12_medical_health_hospital",
          "13_retrieval_user_interactive",
          "14_classification_decimal_d...",
          "15_automation_telefacsimile...",
          "16_fuzzy_sets_classification",
          "17_bradford_law_references",
          "18_evaluation_costs_cost",
          "19_centers_services_annual",
          "20_serials_isbd_international",
          "21_thesaurus_thesauri_const...",
          "22_microfiche_microforms_mi...",
          "23_medlars_medline_twx",
          "24_network_networks_cable",
          "25_compression_length_coding",
          "26_organizations_organizati...",
          "27_physics_viniti_journal",
          "28_marc_records_pilot"
         ],
         "xaxis": "x",
         "y": [
          "0_libraries_library_univers...",
          "1_chemical_compounds_notation",
          "2_information_science_theory",
          "3_automatic_indexing_index",
          "4_social_psychology_sociology",
          "5_relevance_answer_relevant",
          "6_bases_data_bibliographic",
          "7_citation_citations_papers",
          "8_catalog_catalogs_card",
          "9_communication_informal_sc...",
          "10_journals_coverage_journal",
          "11_language_linguistics_lin...",
          "12_medical_health_hospital",
          "13_retrieval_user_interactive",
          "14_classification_decimal_d...",
          "15_automation_telefacsimile...",
          "16_fuzzy_sets_classification",
          "17_bradford_law_references",
          "18_evaluation_costs_cost",
          "19_centers_services_annual",
          "20_serials_isbd_international",
          "21_thesaurus_thesauri_const...",
          "22_microfiche_microforms_mi...",
          "23_medlars_medline_twx",
          "24_network_networks_cable",
          "25_compression_length_coding",
          "26_organizations_organizati...",
          "27_physics_viniti_journal",
          "28_marc_records_pilot"
         ],
         "yaxis": "y",
         "z": [
          [
           1,
           0.32666999101638794,
           0.4869450628757477,
           0.3665448725223541,
           0.42433327436447144,
           0.44375643134117126,
           0.6907793283462524,
           0.5039362907409668,
           0.7190819382667542,
           0.407442182302475,
           0.5825252532958984,
           0.264842689037323,
           0.780087947845459,
           0.5123202204704285,
           0.6962786912918091,
           0.7724934220314026,
           0.16009075939655304,
           0.5454391837120056,
           0.46579509973526,
           0.6647471189498901,
           0.4859533905982971,
           0.3213660418987274,
           0.5801035761833191,
           0.4600299596786499,
           0.6126845479011536,
           0.31949400901794434,
           0.3454984128475189,
           0.5179004669189453,
           0.601757824420929
          ],
          [
           0.32666999101638794,
           1.0000001192092896,
           0.4808499813079834,
           0.6821379661560059,
           0.14306548237800598,
           0.6096744537353516,
           0.6788442134857178,
           0.5145554542541504,
           0.5480311512947083,
           0.2988084554672241,
           0.4969031810760498,
           0.4109286963939667,
           0.3315742015838623,
           0.6151208281517029,
           0.5268439650535583,
           0.44448450207710266,
           0.4459976553916931,
           0.3640647530555725,
           0.4059246778488159,
           0.48379915952682495,
           0.4948916733264923,
           0.5384268164634705,
           0.4405169188976288,
           0.5732780694961548,
           0.38101643323898315,
           0.6500089764595032,
           0.12779511511325836,
           0.5266450643539429,
           0.5768140554428101
          ],
          [
           0.4869450628757477,
           0.4808499813079834,
           0.9999997615814209,
           0.4709702134132385,
           0.5332375168800354,
           0.5937291383743286,
           0.6257171034812927,
           0.5465813279151917,
           0.4676480293273926,
           0.7655487656593323,
           0.4530123472213745,
           0.5153947472572327,
           0.37513208389282227,
           0.6438735127449036,
           0.493108868598938,
           0.5101791620254517,
           0.464409738779068,
           0.47803547978401184,
           0.7094599604606628,
           0.7604990601539612,
           0.3964885473251343,
           0.47464171051979065,
           0.4089500904083252,
           0.4371412694454193,
           0.5600941777229309,
           0.4011305868625641,
           0.4173075258731842,
           0.6383822560310364,
           0.4191518723964691
          ],
          [
           0.3665448725223541,
           0.6821379661560059,
           0.4709702134132385,
           1.0000003576278687,
           0.16050265729427338,
           0.8013572692871094,
           0.6394741535186768,
           0.5953236222267151,
           0.5366408824920654,
           0.2632567882537842,
           0.47150325775146484,
           0.5477594137191772,
           0.30301353335380554,
           0.7237895131111145,
           0.6303850412368774,
           0.4487031400203705,
           0.5259124040603638,
           0.4426112473011017,
           0.43282249569892883,
           0.48391374945640564,
           0.48176899552345276,
           0.7214059829711914,
           0.35228481888771057,
           0.5441791415214539,
           0.3156748116016388,
           0.703843355178833,
           0.12561683356761932,
           0.4334876239299774,
           0.5132316946983337
          ],
          [
           0.42433327436447144,
           0.14306548237800598,
           0.5332375168800354,
           0.16050265729427338,
           1.0000004768371582,
           0.21987202763557434,
           0.2843267023563385,
           0.5127263069152832,
           0.2774016261100769,
           0.7173872590065002,
           0.40031445026397705,
           0.32431235909461975,
           0.28359782695770264,
           0.18127897381782532,
           0.3380293846130371,
           0.26238080859184265,
           0.29123449325561523,
           0.40741634368896484,
           0.21149218082427979,
           0.28673067688941956,
           0.19751210510730743,
           0.22732007503509521,
           0.2043216973543167,
           0.2374611645936966,
           0.2846197783946991,
           0.011603796854615211,
           0.5323345065116882,
           0.512387752532959,
           0.2014160454273224
          ],
          [
           0.44375643134117126,
           0.6096744537353516,
           0.5937291383743286,
           0.8013572692871094,
           0.21987202763557434,
           1.000000238418579,
           0.6509302258491516,
           0.5537909865379333,
           0.5332594513893127,
           0.3273804187774658,
           0.4686620533466339,
           0.4943982660770416,
           0.3252916932106018,
           0.8035279512405396,
           0.5849902033805847,
           0.4314064383506775,
           0.5023263096809387,
           0.4709468483924866,
           0.5072644352912903,
           0.5030626654624939,
           0.4467049539089203,
           0.5982650518417358,
           0.3566688597202301,
           0.5393038392066956,
           0.3207283914089203,
           0.5765545964241028,
           0.15856945514678955,
           0.4680105745792389,
           0.4734441041946411
          ],
          [
           0.6907793283462524,
           0.6788442134857178,
           0.6257171034812927,
           0.6394741535186768,
           0.2843267023563385,
           0.6509302258491516,
           1.000000238418579,
           0.6573249101638794,
           0.715074896812439,
           0.45878469944000244,
           0.6006381511688232,
           0.3481522500514984,
           0.5825737118721008,
           0.7843406796455383,
           0.6356508135795593,
           0.7284486293792725,
           0.30154332518577576,
           0.5315589308738708,
           0.6312822103500366,
           0.7910773754119873,
           0.5966969728469849,
           0.5047165751457214,
           0.6133990287780762,
           0.665797233581543,
           0.6172369122505188,
           0.6361223459243774,
           0.2504993677139282,
           0.6073734760284424,
           0.7702684998512268
          ],
          [
           0.5039362907409668,
           0.5145554542541504,
           0.5465813279151917,
           0.5953236222267151,
           0.5127263069152832,
           0.5537909865379333,
           0.6573249101638794,
           0.9999998211860657,
           0.480895459651947,
           0.5663712620735168,
           0.7620363831520081,
           0.3513656258583069,
           0.445541650056839,
           0.41896718740463257,
           0.5344904065132141,
           0.38951680064201355,
           0.3248389959335327,
           0.6870846748352051,
           0.3506940007209778,
           0.4700528681278229,
           0.44369715452194214,
           0.4061499834060669,
           0.43878209590911865,
           0.5301366448402405,
           0.4503916800022125,
           0.3917107880115509,
           0.19470623135566711,
           0.7455843091011047,
           0.4982333481311798
          ],
          [
           0.7190819382667542,
           0.5480311512947083,
           0.4676480293273926,
           0.5366408824920654,
           0.2774016261100769,
           0.5332594513893127,
           0.715074896812439,
           0.480895459651947,
           0.9999997615814209,
           0.3056805729866028,
           0.556000292301178,
           0.31986236572265625,
           0.5661015510559082,
           0.5893541574478149,
           0.7665104269981384,
           0.6807854771614075,
           0.34138283133506775,
           0.5204916596412659,
           0.45162075757980347,
           0.5986390709877014,
           0.6982650756835938,
           0.49238479137420654,
           0.6185477375984192,
           0.506739616394043,
           0.47557705640792847,
           0.4793750047683716,
           0.2431187480688095,
           0.5164980888366699,
           0.7207087874412537
          ],
          [
           0.407442182302475,
           0.2988084554672241,
           0.7655487656593323,
           0.2632567882537842,
           0.7173872590065002,
           0.3273804187774658,
           0.45878469944000244,
           0.5663712620735168,
           0.3056805729866028,
           0.9999999403953552,
           0.47826912999153137,
           0.38302528858184814,
           0.33218809962272644,
           0.3390390872955322,
           0.33111196756362915,
           0.3417498469352722,
           0.2701106369495392,
           0.44717055559158325,
           0.4107925593852997,
           0.5349986553192139,
           0.2829767167568207,
           0.2873861491680145,
           0.3471759557723999,
           0.384666383266449,
           0.5435906052589417,
           0.17419765889644623,
           0.36971431970596313,
           0.6634758710861206,
           0.3188733756542206
          ],
          [
           0.5825252532958984,
           0.4969031810760498,
           0.4530123472213745,
           0.47150325775146484,
           0.40031445026397705,
           0.4686620533466339,
           0.6006381511688232,
           0.7620363831520081,
           0.556000292301178,
           0.47826912999153137,
           0.999999463558197,
           0.261245995759964,
           0.7068846225738525,
           0.3554992973804474,
           0.567395806312561,
           0.3825688064098358,
           0.1829172968864441,
           0.6380084753036499,
           0.3099691867828369,
           0.44534608721733093,
           0.5074808597564697,
           0.32283860445022583,
           0.5465129017829895,
           0.664476215839386,
           0.42472654581069946,
           0.3304610848426819,
           0.15248964726924896,
           0.7856336832046509,
           0.4799654483795166
          ],
          [
           0.264842689037323,
           0.4109286963939667,
           0.5153947472572327,
           0.5477594137191772,
           0.32431235909461975,
           0.4943982660770416,
           0.3481522500514984,
           0.3513656258583069,
           0.31986236572265625,
           0.38302528858184814,
           0.261245995759964,
           0.9999998211860657,
           0.20305554568767548,
           0.42737817764282227,
           0.44585955142974854,
           0.34356924891471863,
           0.5757694244384766,
           0.3067765533924103,
           0.3044542074203491,
           0.3787672519683838,
           0.32560330629348755,
           0.6716551780700684,
           0.1873178631067276,
           0.2778730094432831,
           0.25928065180778503,
           0.447827011346817,
           0.2012946903705597,
           0.3675667345523834,
           0.3426460921764374
          ],
          [
           0.780087947845459,
           0.3315742015838623,
           0.37513208389282227,
           0.30301353335380554,
           0.28359782695770264,
           0.3252916932106018,
           0.5825737118721008,
           0.445541650056839,
           0.5661015510559082,
           0.33218809962272644,
           0.7068846225738525,
           0.20305554568767548,
           1.0000005960464478,
           0.36803537607192993,
           0.5935381054878235,
           0.5900580883026123,
           0.11693933606147766,
           0.4114462435245514,
           0.31961187720298767,
           0.5149164795875549,
           0.3992270529270172,
           0.24458743631839752,
           0.5133451819419861,
           0.6396361589431763,
           0.5499785542488098,
           0.257185161113739,
           0.19611456990242004,
           0.48193642497062683,
           0.47983744740486145
          ],
          [
           0.5123202204704285,
           0.6151208281517029,
           0.6438735127449036,
           0.7237895131111145,
           0.18127897381782532,
           0.8035279512405396,
           0.7843406796455383,
           0.41896718740463257,
           0.5893541574478149,
           0.3390390872955322,
           0.3554992973804474,
           0.42737817764282227,
           0.36803537607192993,
           0.9999997019767761,
           0.5425456166267395,
           0.6553784012794495,
           0.3970632553100586,
           0.35048922896385193,
           0.6896361112594604,
           0.7129610180854797,
           0.4161604642868042,
           0.5729182958602905,
           0.44342342019081116,
           0.5488120913505554,
           0.4745330810546875,
           0.6112532615661621,
           0.2326623797416687,
           0.43351513147354126,
           0.5690170526504517
          ],
          [
           0.6962786912918091,
           0.5268439650535583,
           0.493108868598938,
           0.6303850412368774,
           0.3380293846130371,
           0.5849902033805847,
           0.6356508135795593,
           0.5344904065132141,
           0.7665104269981384,
           0.33111196756362915,
           0.567395806312561,
           0.44585955142974854,
           0.5935381054878235,
           0.5425456166267395,
           1,
           0.5972938537597656,
           0.5475121140480042,
           0.5430302619934082,
           0.3837525248527527,
           0.5752511620521545,
           0.6597582101821899,
           0.5781320929527283,
           0.5277068018913269,
           0.48944762349128723,
           0.46849411725997925,
           0.4965769052505493,
           0.22755120694637299,
           0.5174821615219116,
           0.6348517537117004
          ],
          [
           0.7724934220314026,
           0.44448450207710266,
           0.5101791620254517,
           0.4487031400203705,
           0.26238080859184265,
           0.4314064383506775,
           0.7284486293792725,
           0.38951680064201355,
           0.6807854771614075,
           0.3417498469352722,
           0.3825688064098358,
           0.34356924891471863,
           0.5900580883026123,
           0.6553784012794495,
           0.5972938537597656,
           0.9999995827674866,
           0.22452981770038605,
           0.38402894139289856,
           0.57470703125,
           0.7481255531311035,
           0.49575918912887573,
           0.3710801899433136,
           0.5661441683769226,
           0.479623019695282,
           0.6172865033149719,
           0.44173964858055115,
           0.2895088493824005,
           0.45847445726394653,
           0.701738715171814
          ],
          [
           0.16009075939655304,
           0.4459976553916931,
           0.464409738779068,
           0.5259124040603638,
           0.29123449325561523,
           0.5023263096809387,
           0.30154332518577576,
           0.3248389959335327,
           0.34138283133506775,
           0.2701106369495392,
           0.1829172968864441,
           0.5757694244384766,
           0.11693933606147766,
           0.3970632553100586,
           0.5475121140480042,
           0.22452981770038605,
           0.9999996423721313,
           0.2756603956222534,
           0.2739267647266388,
           0.301395982503891,
           0.3095504343509674,
           0.5979945063591003,
           0.0945306196808815,
           0.19346128404140472,
           0.25156038999557495,
           0.40975475311279297,
           0.27178776264190674,
           0.24501389265060425,
           0.2734578847885132
          ],
          [
           0.5454391837120056,
           0.3640647530555725,
           0.47803547978401184,
           0.4426112473011017,
           0.40741634368896484,
           0.4709468483924866,
           0.5315589308738708,
           0.6870846748352051,
           0.5204916596412659,
           0.44717055559158325,
           0.6380084753036499,
           0.3067765533924103,
           0.4114462435245514,
           0.35048922896385193,
           0.5430302619934082,
           0.38402894139289856,
           0.2756603956222534,
           1.000000238418579,
           0.2906114161014557,
           0.41968342661857605,
           0.46200236678123474,
           0.3352690041065216,
           0.42180103063583374,
           0.33992546796798706,
           0.38590604066848755,
           0.3715161085128784,
           0.19526897370815277,
           0.640178918838501,
           0.43629124760627747
          ],
          [
           0.46579509973526,
           0.4059246778488159,
           0.7094599604606628,
           0.43282249569892883,
           0.21149218082427979,
           0.5072644352912903,
           0.6312822103500366,
           0.3506940007209778,
           0.45162075757980347,
           0.4107925593852997,
           0.3099691867828369,
           0.3044542074203491,
           0.31961187720298767,
           0.6896361112594604,
           0.3837525248527527,
           0.57470703125,
           0.2739267647266388,
           0.2906114161014557,
           0.9999998211860657,
           0.7593415975570679,
           0.3617699146270752,
           0.4159215986728668,
           0.3857450783252716,
           0.40776076912879944,
           0.48156213760375977,
           0.38406315445899963,
           0.4507569968700409,
           0.39460843801498413,
           0.4332655072212219
          ],
          [
           0.6647471189498901,
           0.48379915952682495,
           0.7604990601539612,
           0.48391374945640564,
           0.28673067688941956,
           0.5030626654624939,
           0.7910773754119873,
           0.4700528681278229,
           0.5986390709877014,
           0.5349986553192139,
           0.44534608721733093,
           0.3787672519683838,
           0.5149164795875549,
           0.7129610180854797,
           0.5752511620521545,
           0.7481255531311035,
           0.301395982503891,
           0.41968342661857605,
           0.7593415975570679,
           1.000000238418579,
           0.5102816224098206,
           0.481174111366272,
           0.5973252654075623,
           0.5053600072860718,
           0.7182367444038391,
           0.4815911650657654,
           0.3770318329334259,
           0.5549424290657043,
           0.6067115664482117
          ],
          [
           0.4859533905982971,
           0.4948916733264923,
           0.3964885473251343,
           0.48176899552345276,
           0.19751210510730743,
           0.4467049539089203,
           0.5966969728469849,
           0.44369715452194214,
           0.6982650756835938,
           0.2829767167568207,
           0.5074808597564697,
           0.32560330629348755,
           0.3992270529270172,
           0.4161604642868042,
           0.6597582101821899,
           0.49575918912887573,
           0.3095504343509674,
           0.46200236678123474,
           0.3617699146270752,
           0.5102816224098206,
           0.9999997615814209,
           0.4424283802509308,
           0.5172606706619263,
           0.4614444077014923,
           0.38016098737716675,
           0.5066508650779724,
           0.24105387926101685,
           0.46022748947143555,
           0.6836757659912109
          ],
          [
           0.3213660418987274,
           0.5384268164634705,
           0.47464171051979065,
           0.7214059829711914,
           0.22732007503509521,
           0.5982650518417358,
           0.5047165751457214,
           0.4061499834060669,
           0.49238479137420654,
           0.2873861491680145,
           0.32283860445022583,
           0.6716551780700684,
           0.24458743631839752,
           0.5729182958602905,
           0.5781320929527283,
           0.3710801899433136,
           0.5979945063591003,
           0.3352690041065216,
           0.4159215986728668,
           0.481174111366272,
           0.4424283802509308,
           0.9999996423721313,
           0.28842973709106445,
           0.4072701036930084,
           0.2924222946166992,
           0.5321410298347473,
           0.23966963589191437,
           0.3374651074409485,
           0.42194244265556335
          ],
          [
           0.5801035761833191,
           0.4405169188976288,
           0.4089500904083252,
           0.35228481888771057,
           0.2043216973543167,
           0.3566688597202301,
           0.6133990287780762,
           0.43878209590911865,
           0.6185477375984192,
           0.3471759557723999,
           0.5465129017829895,
           0.1873178631067276,
           0.5133451819419861,
           0.44342342019081116,
           0.5277068018913269,
           0.5661441683769226,
           0.0945306196808815,
           0.42180103063583374,
           0.3857450783252716,
           0.5973252654075623,
           0.5172606706619263,
           0.28842973709106445,
           0.9999999403953552,
           0.5002884268760681,
           0.4474589228630066,
           0.410129576921463,
           0.14490985870361328,
           0.5587202906608582,
           0.5919551253318787
          ],
          [
           0.4600299596786499,
           0.5732780694961548,
           0.4371412694454193,
           0.5441791415214539,
           0.2374611645936966,
           0.5393038392066956,
           0.665797233581543,
           0.5301366448402405,
           0.506739616394043,
           0.384666383266449,
           0.664476215839386,
           0.2778730094432831,
           0.6396361589431763,
           0.5488120913505554,
           0.48944762349128723,
           0.479623019695282,
           0.19346128404140472,
           0.33992546796798706,
           0.40776076912879944,
           0.5053600072860718,
           0.4614444077014923,
           0.4072701036930084,
           0.5002884268760681,
           1.0000001192092896,
           0.4493337869644165,
           0.390693336725235,
           0.14043812453746796,
           0.5104014277458191,
           0.5311428904533386
          ],
          [
           0.6126845479011536,
           0.38101643323898315,
           0.5600941777229309,
           0.3156748116016388,
           0.2846197783946991,
           0.3207283914089203,
           0.6172369122505188,
           0.4503916800022125,
           0.47557705640792847,
           0.5435906052589417,
           0.42472654581069946,
           0.25928065180778503,
           0.5499785542488098,
           0.4745330810546875,
           0.46849411725997925,
           0.6172865033149719,
           0.25156038999557495,
           0.38590604066848755,
           0.48156213760375977,
           0.7182367444038391,
           0.38016098737716675,
           0.2924222946166992,
           0.4474589228630066,
           0.4493337869644165,
           0.9999995231628418,
           0.3441801369190216,
           0.2865138053894043,
           0.48598143458366394,
           0.522725522518158
          ],
          [
           0.31949400901794434,
           0.6500089764595032,
           0.4011305868625641,
           0.703843355178833,
           0.011603796854615211,
           0.5765545964241028,
           0.6361223459243774,
           0.3917107880115509,
           0.4793750047683716,
           0.17419765889644623,
           0.3304610848426819,
           0.447827011346817,
           0.257185161113739,
           0.6112532615661621,
           0.4965769052505493,
           0.44173964858055115,
           0.40975475311279297,
           0.3715161085128784,
           0.38406315445899963,
           0.4815911650657654,
           0.5066508650779724,
           0.5321410298347473,
           0.410129576921463,
           0.390693336725235,
           0.3441801369190216,
           0.9999998807907104,
           0.042668648064136505,
           0.3205097019672394,
           0.5997856259346008
          ],
          [
           0.3454984128475189,
           0.12779511511325836,
           0.4173075258731842,
           0.12561683356761932,
           0.5323345065116882,
           0.15856945514678955,
           0.2504993677139282,
           0.19470623135566711,
           0.2431187480688095,
           0.36971431970596313,
           0.15248964726924896,
           0.2012946903705597,
           0.19611456990242004,
           0.2326623797416687,
           0.22755120694637299,
           0.2895088493824005,
           0.27178776264190674,
           0.19526897370815277,
           0.4507569968700409,
           0.3770318329334259,
           0.24105387926101685,
           0.23966963589191437,
           0.14490985870361328,
           0.14043812453746796,
           0.2865138053894043,
           0.042668648064136505,
           0.9999999403953552,
           0.2140374481678009,
           0.19057883322238922
          ],
          [
           0.5179004669189453,
           0.5266450643539429,
           0.6383822560310364,
           0.4334876239299774,
           0.512387752532959,
           0.4680105745792389,
           0.6073734760284424,
           0.7455843091011047,
           0.5164980888366699,
           0.6634758710861206,
           0.7856336832046509,
           0.3675667345523834,
           0.48193642497062683,
           0.43351513147354126,
           0.5174821615219116,
           0.45847445726394653,
           0.24501389265060425,
           0.640178918838501,
           0.39460843801498413,
           0.5549424290657043,
           0.46022748947143555,
           0.3374651074409485,
           0.5587202906608582,
           0.5104014277458191,
           0.48598143458366394,
           0.3205097019672394,
           0.2140374481678009,
           0.9999998211860657,
           0.5146251916885376
          ],
          [
           0.601757824420929,
           0.5768140554428101,
           0.4191518723964691,
           0.5132316946983337,
           0.2014160454273224,
           0.4734441041946411,
           0.7702684998512268,
           0.4982333481311798,
           0.7207087874412537,
           0.3188733756542206,
           0.4799654483795166,
           0.3426460921764374,
           0.47983744740486145,
           0.5690170526504517,
           0.6348517537117004,
           0.701738715171814,
           0.2734578847885132,
           0.43629124760627747,
           0.4332655072212219,
           0.6067115664482117,
           0.6836757659912109,
           0.42194244265556335,
           0.5919551253318787,
           0.5311428904533386,
           0.522725522518158,
           0.5997856259346008,
           0.19057883322238922,
           0.5146251916885376,
           1.0000003576278687
          ]
         ]
        }
       ],
       "layout": {
        "coloraxis": {
         "colorbar": {
          "title": {
           "text": "Similarity Score"
          }
         },
         "colorscale": [
          [
           0,
           "rgb(247,252,240)"
          ],
          [
           0.125,
           "rgb(224,243,219)"
          ],
          [
           0.25,
           "rgb(204,235,197)"
          ],
          [
           0.375,
           "rgb(168,221,181)"
          ],
          [
           0.5,
           "rgb(123,204,196)"
          ],
          [
           0.625,
           "rgb(78,179,211)"
          ],
          [
           0.75,
           "rgb(43,140,190)"
          ],
          [
           0.875,
           "rgb(8,104,172)"
          ],
          [
           1,
           "rgb(8,64,129)"
          ]
         ]
        },
        "height": 800,
        "hoverlabel": {
         "bgcolor": "white",
         "font": {
          "family": "Rockwell",
          "size": 16
         }
        },
        "legend": {
         "title": {
          "text": "Trend"
         }
        },
        "margin": {
         "t": 60
        },
        "showlegend": true,
        "template": {
         "data": {
          "bar": [
           {
            "error_x": {
             "color": "#2a3f5f"
            },
            "error_y": {
             "color": "#2a3f5f"
            },
            "marker": {
             "line": {
              "color": "#E5ECF6",
              "width": 0.5
             },
             "pattern": {
              "fillmode": "overlay",
              "size": 10,
              "solidity": 0.2
             }
            },
            "type": "bar"
           }
          ],
          "barpolar": [
           {
            "marker": {
             "line": {
              "color": "#E5ECF6",
              "width": 0.5
             },
             "pattern": {
              "fillmode": "overlay",
              "size": 10,
              "solidity": 0.2
             }
            },
            "type": "barpolar"
           }
          ],
          "carpet": [
           {
            "aaxis": {
             "endlinecolor": "#2a3f5f",
             "gridcolor": "white",
             "linecolor": "white",
             "minorgridcolor": "white",
             "startlinecolor": "#2a3f5f"
            },
            "baxis": {
             "endlinecolor": "#2a3f5f",
             "gridcolor": "white",
             "linecolor": "white",
             "minorgridcolor": "white",
             "startlinecolor": "#2a3f5f"
            },
            "type": "carpet"
           }
          ],
          "choropleth": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "type": "choropleth"
           }
          ],
          "contour": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "contour"
           }
          ],
          "contourcarpet": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "type": "contourcarpet"
           }
          ],
          "heatmap": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "heatmap"
           }
          ],
          "heatmapgl": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "heatmapgl"
           }
          ],
          "histogram": [
           {
            "marker": {
             "pattern": {
              "fillmode": "overlay",
              "size": 10,
              "solidity": 0.2
             }
            },
            "type": "histogram"
           }
          ],
          "histogram2d": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "histogram2d"
           }
          ],
          "histogram2dcontour": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "histogram2dcontour"
           }
          ],
          "mesh3d": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "type": "mesh3d"
           }
          ],
          "parcoords": [
           {
            "line": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "parcoords"
           }
          ],
          "pie": [
           {
            "automargin": true,
            "type": "pie"
           }
          ],
          "scatter": [
           {
            "fillpattern": {
             "fillmode": "overlay",
             "size": 10,
             "solidity": 0.2
            },
            "type": "scatter"
           }
          ],
          "scatter3d": [
           {
            "line": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scatter3d"
           }
          ],
          "scattercarpet": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scattercarpet"
           }
          ],
          "scattergeo": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scattergeo"
           }
          ],
          "scattergl": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scattergl"
           }
          ],
          "scattermapbox": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scattermapbox"
           }
          ],
          "scatterpolar": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scatterpolar"
           }
          ],
          "scatterpolargl": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scatterpolargl"
           }
          ],
          "scatterternary": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scatterternary"
           }
          ],
          "surface": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "surface"
           }
          ],
          "table": [
           {
            "cells": {
             "fill": {
              "color": "#EBF0F8"
             },
             "line": {
              "color": "white"
             }
            },
            "header": {
             "fill": {
              "color": "#C8D4E3"
             },
             "line": {
              "color": "white"
             }
            },
            "type": "table"
           }
          ]
         },
         "layout": {
          "annotationdefaults": {
           "arrowcolor": "#2a3f5f",
           "arrowhead": 0,
           "arrowwidth": 1
          },
          "autotypenumbers": "strict",
          "coloraxis": {
           "colorbar": {
            "outlinewidth": 0,
            "ticks": ""
           }
          },
          "colorscale": {
           "diverging": [
            [
             0,
             "#8e0152"
            ],
            [
             0.1,
             "#c51b7d"
            ],
            [
             0.2,
             "#de77ae"
            ],
            [
             0.3,
             "#f1b6da"
            ],
            [
             0.4,
             "#fde0ef"
            ],
            [
             0.5,
             "#f7f7f7"
            ],
            [
             0.6,
             "#e6f5d0"
            ],
            [
             0.7,
             "#b8e186"
            ],
            [
             0.8,
             "#7fbc41"
            ],
            [
             0.9,
             "#4d9221"
            ],
            [
             1,
             "#276419"
            ]
           ],
           "sequential": [
            [
             0,
             "#0d0887"
            ],
            [
             0.1111111111111111,
             "#46039f"
            ],
            [
             0.2222222222222222,
             "#7201a8"
            ],
            [
             0.3333333333333333,
             "#9c179e"
            ],
            [
             0.4444444444444444,
             "#bd3786"
            ],
            [
             0.5555555555555556,
             "#d8576b"
            ],
            [
             0.6666666666666666,
             "#ed7953"
            ],
            [
             0.7777777777777778,
             "#fb9f3a"
            ],
            [
             0.8888888888888888,
             "#fdca26"
            ],
            [
             1,
             "#f0f921"
            ]
           ],
           "sequentialminus": [
            [
             0,
             "#0d0887"
            ],
            [
             0.1111111111111111,
             "#46039f"
            ],
            [
             0.2222222222222222,
             "#7201a8"
            ],
            [
             0.3333333333333333,
             "#9c179e"
            ],
            [
             0.4444444444444444,
             "#bd3786"
            ],
            [
             0.5555555555555556,
             "#d8576b"
            ],
            [
             0.6666666666666666,
             "#ed7953"
            ],
            [
             0.7777777777777778,
             "#fb9f3a"
            ],
            [
             0.8888888888888888,
             "#fdca26"
            ],
            [
             1,
             "#f0f921"
            ]
           ]
          },
          "colorway": [
           "#636efa",
           "#EF553B",
           "#00cc96",
           "#ab63fa",
           "#FFA15A",
           "#19d3f3",
           "#FF6692",
           "#B6E880",
           "#FF97FF",
           "#FECB52"
          ],
          "font": {
           "color": "#2a3f5f"
          },
          "geo": {
           "bgcolor": "white",
           "lakecolor": "white",
           "landcolor": "#E5ECF6",
           "showlakes": true,
           "showland": true,
           "subunitcolor": "white"
          },
          "hoverlabel": {
           "align": "left"
          },
          "hovermode": "closest",
          "mapbox": {
           "style": "light"
          },
          "paper_bgcolor": "white",
          "plot_bgcolor": "#E5ECF6",
          "polar": {
           "angularaxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           },
           "bgcolor": "#E5ECF6",
           "radialaxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           }
          },
          "scene": {
           "xaxis": {
            "backgroundcolor": "#E5ECF6",
            "gridcolor": "white",
            "gridwidth": 2,
            "linecolor": "white",
            "showbackground": true,
            "ticks": "",
            "zerolinecolor": "white"
           },
           "yaxis": {
            "backgroundcolor": "#E5ECF6",
            "gridcolor": "white",
            "gridwidth": 2,
            "linecolor": "white",
            "showbackground": true,
            "ticks": "",
            "zerolinecolor": "white"
           },
           "zaxis": {
            "backgroundcolor": "#E5ECF6",
            "gridcolor": "white",
            "gridwidth": 2,
            "linecolor": "white",
            "showbackground": true,
            "ticks": "",
            "zerolinecolor": "white"
           }
          },
          "shapedefaults": {
           "line": {
            "color": "#2a3f5f"
           }
          },
          "ternary": {
           "aaxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           },
           "baxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           },
           "bgcolor": "#E5ECF6",
           "caxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           }
          },
          "title": {
           "x": 0.05
          },
          "xaxis": {
           "automargin": true,
           "gridcolor": "white",
           "linecolor": "white",
           "ticks": "",
           "title": {
            "standoff": 15
           },
           "zerolinecolor": "white",
           "zerolinewidth": 2
          },
          "yaxis": {
           "automargin": true,
           "gridcolor": "white",
           "linecolor": "white",
           "ticks": "",
           "title": {
            "standoff": 15
           },
           "zerolinecolor": "white",
           "zerolinewidth": 2
          }
         }
        },
        "title": {
         "font": {
          "color": "Black",
          "size": 22
         },
         "text": "<b>Similarity Matrix</b>",
         "x": 0.55,
         "xanchor": "center",
         "y": 0.95,
         "yanchor": "top"
        },
        "width": 800,
        "xaxis": {
         "anchor": "y",
         "constrain": "domain",
         "domain": [
          0,
          1
         ],
         "scaleanchor": "y"
        },
        "yaxis": {
         "anchor": "x",
         "autorange": "reversed",
         "constrain": "domain",
         "domain": [
          0,
          1
         ]
        }
       }
      }
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "topic_model.visualize_heatmap()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "What problems and concerns are there in making up descriptive titles? What difficulties are involved in automatically retrieving articles from approximate titles? What is the usual relevance of the content of articles to their titles?\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "([-1], array([0.]))"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# returns: Topic predictions for each documents probabilities: The topic probability distribution \n",
    "# which is returned by default. If calculate_probabilities in BERTopic is set to False, then the \n",
    "# probabilities are not calculated to speed up computation and decrease memory usage.\n",
    "print(queries.iloc[0][\"text\"])\n",
    "topic_model.transform(queries.iloc[0][\"text\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "([-1,\n",
       "  6,\n",
       "  2,\n",
       "  -1,\n",
       "  -1,\n",
       "  -1,\n",
       "  -1,\n",
       "  -1,\n",
       "  -1,\n",
       "  -1,\n",
       "  -1,\n",
       "  -1,\n",
       "  -1,\n",
       "  23,\n",
       "  6,\n",
       "  13,\n",
       "  -1,\n",
       "  1,\n",
       "  -1,\n",
       "  -1,\n",
       "  2,\n",
       "  -1,\n",
       "  15,\n",
       "  2,\n",
       "  19,\n",
       "  18,\n",
       "  3,\n",
       "  1,\n",
       "  -1,\n",
       "  -1,\n",
       "  2,\n",
       "  -1,\n",
       "  13,\n",
       "  -1,\n",
       "  19,\n",
       "  -1,\n",
       "  -1,\n",
       "  -1,\n",
       "  13,\n",
       "  -1,\n",
       "  -1,\n",
       "  13,\n",
       "  13,\n",
       "  -1,\n",
       "  15,\n",
       "  15,\n",
       "  -1,\n",
       "  13,\n",
       "  13,\n",
       "  -1,\n",
       "  -1,\n",
       "  23,\n",
       "  23,\n",
       "  15,\n",
       "  23,\n",
       "  -1,\n",
       "  -1,\n",
       "  24,\n",
       "  5,\n",
       "  5,\n",
       "  5,\n",
       "  -1,\n",
       "  13,\n",
       "  2,\n",
       "  -1,\n",
       "  24,\n",
       "  -1,\n",
       "  -1,\n",
       "  21,\n",
       "  18,\n",
       "  -1,\n",
       "  5,\n",
       "  25,\n",
       "  24,\n",
       "  11,\n",
       "  -1,\n",
       "  -1,\n",
       "  11,\n",
       "  25,\n",
       "  17,\n",
       "  -1,\n",
       "  -1,\n",
       "  -1,\n",
       "  5,\n",
       "  -1,\n",
       "  -1,\n",
       "  -1,\n",
       "  -1,\n",
       "  0,\n",
       "  -1,\n",
       "  7,\n",
       "  -1,\n",
       "  -1,\n",
       "  -1,\n",
       "  5,\n",
       "  5,\n",
       "  5,\n",
       "  13,\n",
       "  -1,\n",
       "  -1,\n",
       "  -1,\n",
       "  3,\n",
       "  -1,\n",
       "  -1,\n",
       "  3,\n",
       "  -1,\n",
       "  3,\n",
       "  6,\n",
       "  -1,\n",
       "  -1,\n",
       "  -1,\n",
       "  3],\n",
       " array([0.        , 0.99014559, 1.        , 0.        , 0.        ,\n",
       "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
       "        0.        , 0.        , 0.        , 0.70978273, 1.        ,\n",
       "        1.        , 0.        , 0.50214361, 0.        , 0.        ,\n",
       "        0.89330545, 0.        , 0.75838699, 0.88150771, 0.89719769,\n",
       "        0.98151705, 0.8019127 , 0.39128054, 0.        , 0.        ,\n",
       "        0.83138753, 0.        , 1.        , 0.        , 0.89556598,\n",
       "        0.        , 0.        , 0.        , 0.97160578, 0.        ,\n",
       "        0.        , 1.        , 0.8553521 , 0.        , 1.        ,\n",
       "        0.93709397, 0.        , 0.96472697, 1.        , 0.        ,\n",
       "        0.        , 0.54859745, 0.32511822, 0.85615953, 0.92204904,\n",
       "        0.        , 0.        , 0.86990035, 0.50261834, 0.77983412,\n",
       "        0.77491146, 0.        , 0.9493261 , 1.        , 0.        ,\n",
       "        0.9821218 , 0.        , 0.        , 0.78804419, 0.44879896,\n",
       "        0.        , 0.69746855, 0.90628964, 0.57861926, 0.80843619,\n",
       "        0.        , 0.        , 0.82284085, 0.91885856, 0.96898165,\n",
       "        0.        , 0.        , 0.        , 0.58760933, 0.        ,\n",
       "        0.        , 0.        , 0.        , 0.73926762, 0.        ,\n",
       "        0.83435261, 0.        , 0.        , 0.        , 0.66787187,\n",
       "        0.7418057 , 0.77786368, 0.90647409, 0.        , 0.        ,\n",
       "        0.        , 0.81630941, 0.        , 0.        , 0.90813486,\n",
       "        0.        , 0.75429479, 0.86396501, 0.        , 0.        ,\n",
       "        0.        , 0.74939692]))"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_query_topics = topic_model.transform(queries[\"text\"])\n",
    "all_query_topics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "([-1, 20, 5, 3, 14],\n",
       " [0.42508858, 0.40581542, 0.40525758, 0.39575997, 0.39218175])"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "topic_model.find_topics(queries.iloc[0][\"text\"], top_n=5)\n",
    "# topics, similarity = topic_model.find_topics(queries.iloc[0][\"text\"], top_n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Query Topics: {1: -1, 2: 6, 3: 2, 4: 25, 5: 13, 6: 11, 7: -1, 8: 3, 9: 3, 10: 5, 11: -1, 12: 27, 13: 5, 14: 23, 15: 6, 16: 13, 17: 13, 18: 1, 19: 25, 20: 18, 21: 2, 22: 23, 23: 15, 24: 2, 25: 19, 26: 18, 27: 3, 28: 1, 29: 3, 30: 27, 31: 2, 32: 3, 33: 13, 34: 25, 35: 19, 36: 11, 37: 3, 38: 3, 39: 13, 40: 11, 41: 3, 42: 13, 43: 13, 44: 27, 45: 15, 46: 15, 47: -1, 48: 13, 49: 13, 50: 14, 51: 3, 52: 23, 53: 23, 54: 15, 55: 23, 56: 8, 57: 20, 58: 24, 59: 5, 60: 5, 61: 5, 62: 5, 63: 13, 64: 2, 65: 3, 66: 24, 67: 1, 68: 3, 69: 21, 70: 18, 71: 3, 72: 5, 73: 25, 74: 24, 75: 11, 76: 24, 77: 3, 78: 11, 79: 25, 80: 17, 81: 21, 82: 5, 83: 19, 84: 5, 85: 11, 86: 25, 87: 6, 88: 13, 89: 0, 90: 7, 91: 7, 92: 6, 93: 3, 94: 13, 95: 5, 96: 5, 97: 5, 98: 13, 99: 13, 100: 13, 101: 13, 102: 3, 103: 6, 104: 5, 105: 3, 106: 3, 107: 3, 108: 6, 109: 7, 110: -1, 111: 3, 112: 3}\n"
     ]
    }
   ],
   "source": [
    "# transform queries to topics\n",
    "query_topics = {}\n",
    "query_topics_with_probs = {}\n",
    "\n",
    "for index, row in queries.iterrows():\n",
    "    query_id = row[0]\n",
    "    topics, probs = topic_model.find_topics(row[\"text\"], top_n=5)\n",
    "    query_topics[query_id] = topics[0]\n",
    "    query_topics_with_probs[query_id] = topic_model.find_topics(row[\"text\"], top_n=5)\n",
    "print(\"Query Topics:\", query_topics)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-1 5\n",
      "0 1\n",
      "1 3\n",
      "2 5\n",
      "3 20\n",
      "4 0\n",
      "5 13\n",
      "6 6\n",
      "7 3\n"
     ]
    }
   ],
   "source": [
    "print(\"-1\", list(query_topics.values()).count(-1))\n",
    "print(\"0\", list(query_topics.values()).count(0))\n",
    "print(\"1\", list(query_topics.values()).count(1))\n",
    "print(\"2\", list(query_topics.values()).count(2))\n",
    "print(\"3\", list(query_topics.values()).count(3))\n",
    "print(\"4\", list(query_topics.values()).count(4))\n",
    "print(\"5\", list(query_topics.values()).count(5))\n",
    "print(\"6\", list(query_topics.values()).count(6))\n",
    "print(\"7\", list(query_topics.values()).count(7))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Document</th>\n",
       "      <th>Topic</th>\n",
       "      <th>Name</th>\n",
       "      <th>Representation</th>\n",
       "      <th>Representative_Docs</th>\n",
       "      <th>Top_n_words</th>\n",
       "      <th>Probability</th>\n",
       "      <th>Representative_document</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>18 Editions of the Dewey Decimal Classificatio...</td>\n",
       "      <td>14</td>\n",
       "      <td>14_classification_decimal_dewey_schemes</td>\n",
       "      <td>[classification, decimal, dewey, schemes, udc,...</td>\n",
       "      <td>[Adopting the Library of Congress Classificati...</td>\n",
       "      <td>classification - decimal - dewey - schemes - u...</td>\n",
       "      <td>0.741910</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Use Made of Technical Libraries This report is...</td>\n",
       "      <td>-1</td>\n",
       "      <td>-1_literature_on_with_an</td>\n",
       "      <td>[literature, on, with, an, be, scientific, sub...</td>\n",
       "      <td>[Library Optimum Sir,-In his recent article B....</td>\n",
       "      <td>literature - on - with - an - be - scientific ...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Two Kinds of Power An Essay on Bibliographic C...</td>\n",
       "      <td>-1</td>\n",
       "      <td>-1_literature_on_with_an</td>\n",
       "      <td>[literature, on, with, an, be, scientific, sub...</td>\n",
       "      <td>[Library Optimum Sir,-In his recent article B....</td>\n",
       "      <td>literature - on - with - an - be - scientific ...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Systems Analysis of a University Library; fina...</td>\n",
       "      <td>0</td>\n",
       "      <td>0_libraries_library_university_academic</td>\n",
       "      <td>[libraries, library, university, academic, pub...</td>\n",
       "      <td>[Cooperation Between Types of Libraries This b...</td>\n",
       "      <td>libraries - library - university - academic - ...</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>A Library Management Game: a report on a resea...</td>\n",
       "      <td>15</td>\n",
       "      <td>15_automation_telefacsimile_processing_automated</td>\n",
       "      <td>[automation, telefacsimile, processing, automa...</td>\n",
       "      <td>[Application of Computer Technology to Library...</td>\n",
       "      <td>automation - telefacsimile - processing - auto...</td>\n",
       "      <td>0.487765</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1455</th>\n",
       "      <td>World Dynamics Over the last several decades i...</td>\n",
       "      <td>4</td>\n",
       "      <td>4_social_psychology_sociology_science</td>\n",
       "      <td>[social, psychology, sociology, science, behav...</td>\n",
       "      <td>[Is a Scientific Revolution Taking Place in Ps...</td>\n",
       "      <td>social - psychology - sociology - science - be...</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1456</th>\n",
       "      <td>World Trends in Library Education One of the m...</td>\n",
       "      <td>0</td>\n",
       "      <td>0_libraries_library_university_academic</td>\n",
       "      <td>[libraries, library, university, academic, pub...</td>\n",
       "      <td>[Cooperation Between Types of Libraries This b...</td>\n",
       "      <td>libraries - library - university - academic - ...</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1457</th>\n",
       "      <td>Legal Restrictions on Exploitation of the Pate...</td>\n",
       "      <td>-1</td>\n",
       "      <td>-1_literature_on_with_an</td>\n",
       "      <td>[literature, on, with, an, be, scientific, sub...</td>\n",
       "      <td>[Library Optimum Sir,-In his recent article B....</td>\n",
       "      <td>literature - on - with - an - be - scientific ...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1458</th>\n",
       "      <td>Language and Thought This book considers the b...</td>\n",
       "      <td>11</td>\n",
       "      <td>11_language_linguistics_linguistic_text</td>\n",
       "      <td>[language, linguistics, linguistic, text, sema...</td>\n",
       "      <td>[Functional Approach The present book sums up ...</td>\n",
       "      <td>language - linguistics - linguistic - text - s...</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1459</th>\n",
       "      <td>Modern Integral Information Systems for Chemis...</td>\n",
       "      <td>1</td>\n",
       "      <td>1_chemical_compounds_notation_search</td>\n",
       "      <td>[chemical, compounds, notation, search, ca, ti...</td>\n",
       "      <td>[Experiences of IIT Research Institute in Oper...</td>\n",
       "      <td>chemical - compounds - notation - search - ca ...</td>\n",
       "      <td>0.271702</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1460 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                               Document  Topic  \\\n",
       "0     18 Editions of the Dewey Decimal Classificatio...     14   \n",
       "1     Use Made of Technical Libraries This report is...     -1   \n",
       "2     Two Kinds of Power An Essay on Bibliographic C...     -1   \n",
       "3     Systems Analysis of a University Library; fina...      0   \n",
       "4     A Library Management Game: a report on a resea...     15   \n",
       "...                                                 ...    ...   \n",
       "1455  World Dynamics Over the last several decades i...      4   \n",
       "1456  World Trends in Library Education One of the m...      0   \n",
       "1457  Legal Restrictions on Exploitation of the Pate...     -1   \n",
       "1458  Language and Thought This book considers the b...     11   \n",
       "1459  Modern Integral Information Systems for Chemis...      1   \n",
       "\n",
       "                                                  Name  \\\n",
       "0              14_classification_decimal_dewey_schemes   \n",
       "1                             -1_literature_on_with_an   \n",
       "2                             -1_literature_on_with_an   \n",
       "3              0_libraries_library_university_academic   \n",
       "4     15_automation_telefacsimile_processing_automated   \n",
       "...                                                ...   \n",
       "1455             4_social_psychology_sociology_science   \n",
       "1456           0_libraries_library_university_academic   \n",
       "1457                          -1_literature_on_with_an   \n",
       "1458           11_language_linguistics_linguistic_text   \n",
       "1459              1_chemical_compounds_notation_search   \n",
       "\n",
       "                                         Representation  \\\n",
       "0     [classification, decimal, dewey, schemes, udc,...   \n",
       "1     [literature, on, with, an, be, scientific, sub...   \n",
       "2     [literature, on, with, an, be, scientific, sub...   \n",
       "3     [libraries, library, university, academic, pub...   \n",
       "4     [automation, telefacsimile, processing, automa...   \n",
       "...                                                 ...   \n",
       "1455  [social, psychology, sociology, science, behav...   \n",
       "1456  [libraries, library, university, academic, pub...   \n",
       "1457  [literature, on, with, an, be, scientific, sub...   \n",
       "1458  [language, linguistics, linguistic, text, sema...   \n",
       "1459  [chemical, compounds, notation, search, ca, ti...   \n",
       "\n",
       "                                    Representative_Docs  \\\n",
       "0     [Adopting the Library of Congress Classificati...   \n",
       "1     [Library Optimum Sir,-In his recent article B....   \n",
       "2     [Library Optimum Sir,-In his recent article B....   \n",
       "3     [Cooperation Between Types of Libraries This b...   \n",
       "4     [Application of Computer Technology to Library...   \n",
       "...                                                 ...   \n",
       "1455  [Is a Scientific Revolution Taking Place in Ps...   \n",
       "1456  [Cooperation Between Types of Libraries This b...   \n",
       "1457  [Library Optimum Sir,-In his recent article B....   \n",
       "1458  [Functional Approach The present book sums up ...   \n",
       "1459  [Experiences of IIT Research Institute in Oper...   \n",
       "\n",
       "                                            Top_n_words  Probability  \\\n",
       "0     classification - decimal - dewey - schemes - u...     0.741910   \n",
       "1     literature - on - with - an - be - scientific ...     0.000000   \n",
       "2     literature - on - with - an - be - scientific ...     0.000000   \n",
       "3     libraries - library - university - academic - ...     1.000000   \n",
       "4     automation - telefacsimile - processing - auto...     0.487765   \n",
       "...                                                 ...          ...   \n",
       "1455  social - psychology - sociology - science - be...     1.000000   \n",
       "1456  libraries - library - university - academic - ...     1.000000   \n",
       "1457  literature - on - with - an - be - scientific ...     0.000000   \n",
       "1458  language - linguistics - linguistic - text - s...     1.000000   \n",
       "1459  chemical - compounds - notation - search - ca ...     0.271702   \n",
       "\n",
       "      Representative_document  \n",
       "0                       False  \n",
       "1                       False  \n",
       "2                       False  \n",
       "3                       False  \n",
       "4                       False  \n",
       "...                       ...  \n",
       "1455                    False  \n",
       "1456                    False  \n",
       "1457                    False  \n",
       "1458                    False  \n",
       "1459                    False  \n",
       "\n",
       "[1460 rows x 8 columns]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "doc_info = topic_model.get_document_info(docs[\"text\"])\n",
    "doc_info"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{1: ([-1, 20, 5, 3, 14], [0.42508858, 0.40581542, 0.40525758, 0.39575997, 0.39218175]), 2: ([6, 5, -1, 13, 2], [0.5578636, 0.54351616, 0.5202661, 0.47465622, 0.46867073]), 3: ([2, 19, 18, 9, -1], [0.7133019, 0.564875, 0.54921937, 0.45864588, 0.43095225]), 4: ([25, 1, 22, 15, 28], [0.39171955, 0.36059082, 0.35910475, 0.3572228, 0.34390008]), 5: ([13, 6, -1, 19, 2], [0.64703333, 0.5929646, 0.59010124, 0.58086085, 0.57909113]), 6: ([11, 9, 21, 2, 13], [0.44849384, 0.4383688, 0.34508163, 0.32840043, 0.27865595]), 7: ([-1, 6, 19, 13, 27], [0.58543944, 0.55294424, 0.5477898, 0.5403626, 0.5302058]), 8: ([3, -1, 5, 13, 11], [0.7271405, 0.67743355, 0.67520845, 0.60538036, 0.60110706]), 9: ([3, 5, -1, 11, 21], [0.64982533, 0.6252445, 0.57588446, 0.57279235, 0.55609095]), 10: ([5, -1, 16, 13, 2], [0.5014358, 0.47784606, 0.4759782, 0.45857334, 0.4403532]), 11: ([-1, 2, 5, 7, 6], [0.59934133, 0.56825256, 0.5672239, 0.5394591, 0.5232169]), 12: ([27, 10, 7, -1, 22], [0.66234237, 0.5415038, 0.51516885, 0.5111377, 0.46366423]), 13: ([5, 13, -1, 19, 18], [0.6804598, 0.66106427, 0.6306013, 0.59457284, 0.58618534]), 14: ([23, 12, -1, 10, 27], [0.394485, 0.28473017, 0.28226718, 0.27444452, 0.258744]), 15: ([6, -1, 15, 0, 19], [0.65373707, 0.6073693, 0.60587513, 0.605869, 0.55832195]), 16: ([13, 6, 19, -1, 15], [0.66722643, 0.6369458, 0.59328675, 0.5707141, 0.5344864]), 17: ([13, 5, 6, -1, 18], [0.65762126, 0.6139238, 0.60728157, 0.58866525, 0.54790837]), 18: ([1, 25, 16, 11, 28], [0.50468534, 0.3250762, 0.30392936, 0.24357203, 0.23400743]), 19: ([25, 1, 3, 13, 6], [0.55194914, 0.48832184, 0.45208088, 0.44120616, 0.4190611]), 20: ([18, 19, 15, 13, 2], [0.557827, 0.43977207, 0.43789688, 0.43548745, 0.35338974]), 21: ([2, 19, 18, -1, 6], [0.6691359, 0.60644364, 0.5490931, 0.50098073, 0.49998498]), 22: ([23, -1, 2, 3, 12], [0.57611, 0.45073602, 0.4504422, 0.41900182, 0.41770333]), 23: ([15, 0, 19, -1, 6], [0.7930567, 0.7465304, 0.679661, 0.6681056, 0.6579131]), 24: ([2, 19, 18, 13, 24], [0.60151064, 0.55189604, 0.5194111, 0.44959784, 0.42051235]), 25: ([19, 2, -1, 24, 18], [0.5265802, 0.4686581, 0.44991255, 0.43844196, 0.43669844]), 26: ([18, 13, 15, 2, 19], [0.69062257, 0.5640918, 0.50924456, 0.4998741, 0.49338037]), 27: ([3, 13, 5, 6, -1], [0.7216635, 0.7042624, 0.6044115, 0.59886986, 0.5840592]), 28: ([1, 6, -1, 19, 2], [0.6556672, 0.5127008, 0.4916587, 0.49149424, 0.48611024]), 29: ([3, 13, 6, -1, 5], [0.6038191, 0.5734424, 0.5320734, 0.5162424, 0.5033302]), 30: ([27, -1, 10, 2, 19], [0.6936959, 0.61174935, 0.601683, 0.566511, 0.5466056]), 31: ([2, 19, 18, 9, 27], [0.5819382, 0.54143655, 0.49397776, 0.48814607, 0.48383033]), 32: ([3, -1, 6, 1, 28], [0.65872884, 0.635707, 0.608444, 0.5829582, 0.5752769]), 33: ([13, 5, 6, 3, 25], [0.7292961, 0.60775256, 0.5884522, 0.5407063, 0.53832656]), 34: ([25, 3, 1, 13, -1], [0.637954, 0.5884554, 0.49361923, 0.46198586, 0.41789788]), 35: ([19, 6, 24, 2, -1], [0.57167906, 0.47881758, 0.47715357, 0.47672853, 0.46109694]), 36: ([11, 21, -1, 19, 3], [0.5342965, 0.3540604, 0.31396708, 0.30614287, 0.28725663]), 37: ([3, 21, 14, 1, -1], [0.66179967, 0.6216998, 0.5244014, 0.52273166, 0.5224961]), 38: ([3, 21, 13, -1, 5], [0.64652365, 0.64499676, 0.63917685, 0.6109181, 0.5972434]), 39: ([13, 19, 6, 15, -1], [0.6811728, 0.6171901, 0.5795578, 0.57925373, 0.5643325]), 40: ([11, 15, 28, 19, -1], [0.4830264, 0.4367027, 0.43307197, 0.42501056, 0.42436323]), 41: ([3, -1, 14, 8, 1], [0.50050914, 0.4963848, 0.49332908, 0.4827861, 0.47361302]), 42: ([13, 5, -1, 19, 6], [0.7165978, 0.6552596, 0.63915944, 0.61551917, 0.6041337]), 43: ([13, -1, 5, 3, 18], [0.7352257, 0.6764289, 0.6328855, 0.60559034, 0.58686656]), 44: ([27, 10, -1, 7, 17], [0.6888843, 0.62323713, 0.5650345, 0.5245141, 0.47283512]), 45: ([15, 0, 19, -1, 12], [0.65599155, 0.4443518, 0.34762046, 0.3418584, 0.3243569]), 46: ([15, 6, 0, 28, 19], [0.75873804, 0.53309697, 0.5304099, 0.5202215, 0.5133062]), 47: ([-1, 6, 19, 13, 5], [0.5350419, 0.50903296, 0.50136626, 0.49770844, 0.4896045]), 48: ([13, 5, 6, -1, 19], [0.57852226, 0.5445132, 0.5434073, 0.5272524, 0.50155413]), 49: ([13, -1, 5, 6, 2], [0.66542816, 0.63717955, 0.6197554, 0.6055248, 0.5629763]), 50: ([14, 13, 5, 3, 16], [0.5940044, 0.5292071, 0.52515036, 0.5050933, 0.49033603]), 51: ([3, 5, 13, 1, 25], [0.5697411, 0.4871588, 0.48252225, 0.47071904, 0.45116085]), 52: ([23, 10, 12, -1, 7], [0.79661906, 0.5908638, 0.54812264, 0.5466073, 0.48742852]), 53: ([23, 15, -1, 6, 19], [0.5235566, 0.49062318, 0.4245513, 0.423828, 0.4199304]), 54: ([15, 0, 19, -1, 6], [0.7263851, 0.577638, 0.5167371, 0.51661485, 0.47870922]), 55: ([23, -1, 10, 5, 3], [0.7851007, 0.54155165, 0.50885713, 0.48478395, 0.46920654]), 56: ([8, 14, 1, 3, -1], [0.692508, 0.6453519, 0.5852873, 0.5707865, 0.55951554]), 57: ([20, 14, 8, 3, 28], [0.5258478, 0.5169902, 0.50255233, 0.49264035, 0.47976863]), 58: ([24, 15, 19, 0, 6], [0.7765574, 0.6140851, 0.5888598, 0.55736446, 0.53679776]), 59: ([5, 3, 13, -1, 6], [0.7858389, 0.7399318, 0.6759767, 0.6443262, 0.5526908]), 60: ([5, 7, 3, -1, 6], [0.68206596, 0.6210891, 0.6093588, 0.55330384, 0.5320042]), 61: ([5, 13, 3, -1, 6], [0.71731764, 0.6623226, 0.6215801, 0.58288157, 0.57490253]), 62: ([5, 3, 16, 13, -1], [0.59848094, 0.558551, 0.5418923, 0.5328593, 0.44477707]), 63: ([13, 3, 6, 5, -1], [0.7733216, 0.6852923, 0.67714024, 0.6762121, 0.63050437]), 64: ([2, 18, -1, 19, 5], [0.6221134, 0.46013168, 0.4596297, 0.45897162, 0.43903035]), 65: ([3, 5, 13, -1, 6], [0.65923905, 0.6097661, 0.57920605, 0.5002142, 0.49116856]), 66: ([24, 19, 6, 0, 15], [0.84010744, 0.6925629, 0.6767347, 0.6234027, 0.62311643]), 67: ([1, 3, 21, -1, 5], [0.6935794, 0.68932253, 0.6536973, 0.59040225, 0.5753021]), 68: ([3, 5, 25, -1, 7], [0.7354659, 0.6835431, 0.5855718, 0.58312124, 0.5477498]), 69: ([21, 3, -1, 1, 13], [0.76007116, 0.59630394, 0.56643784, 0.5219722, 0.50971353]), 70: ([18, 2, 13, 19, -1], [0.59311163, 0.519495, 0.51878923, 0.5115966, 0.45841444]), 71: ([3, -1, 21, 5, 1], [0.71684957, 0.6713434, 0.6336597, 0.60011536, 0.5873736]), 72: ([5, 3, -1, 13, 6], [0.7219941, 0.58950275, 0.56725645, 0.5431076, 0.52340853]), 73: ([25, 3, 1, 5, 6], [0.70677423, 0.5736163, 0.51880807, 0.498132, 0.49795717]), 74: ([24, 0, 19, 6, 15], [0.7587775, 0.59496886, 0.55387163, 0.5441983, 0.5205356]), 75: ([11, 21, 16, -1, 5], [0.6377183, 0.35087854, 0.3437996, 0.31165475, 0.29002064]), 76: ([24, 0, 19, 15, 6], [0.7386719, 0.69045925, 0.5792686, 0.56098914, 0.54014987]), 77: ([3, 5, 25, 1, 21], [0.659086, 0.60165024, 0.57904446, 0.5477652, 0.53125393]), 78: ([11, 21, 16, 3, -1], [0.6086948, 0.41993862, 0.40980315, 0.2869692, 0.28065425]), 79: ([25, 3, 5, 13, 1], [0.6384877, 0.5772792, 0.52260494, 0.47974944, 0.4618316]), 80: ([17, 10, 7, 27, -1], [0.6607008, 0.46085802, 0.44260493, 0.42012513, 0.40370655]), 81: ([21, 13, 25, 11, 3], [0.5969976, 0.5061964, 0.49686044, 0.4905312, 0.4867716]), 82: ([5, 13, 6, 3, -1], [0.58330023, 0.5655685, 0.53061336, 0.49870867, 0.48886782]), 83: ([19, 15, 13, 6, 28], [0.59379804, 0.5576351, 0.50826627, 0.5056901, 0.47991803]), 84: ([5, 3, 13, -1, 6], [0.81649184, 0.7189138, 0.67844373, 0.5948365, 0.53950953]), 85: ([11, 21, 3, 16, 13], [0.60725665, 0.49742848, 0.44739795, 0.42205644, 0.40310383]), 86: ([25, 3, 1, 5, 21], [0.5579259, 0.48615286, 0.43963972, 0.40077454, 0.37448668]), 87: ([6, 22, 28, 8, -1], [0.6848248, 0.6714087, 0.5774374, 0.5734037, 0.5719103]), 88: ([13, 11, 5, 6, 3], [0.5971294, 0.524037, 0.51919204, 0.48642057, 0.4809844]), 89: ([0, 6, -1, 18, 19], [0.68423676, 0.6241167, 0.5730027, 0.5361933, 0.5286046]), 90: ([7, -1, 11, 16, 5], [0.62654305, 0.5201823, 0.45111668, 0.4399878, 0.43930742]), 91: ([7, -1, 17, 10, 6], [0.6455877, 0.5784802, 0.53981113, 0.50081, 0.4972272]), 92: ([6, 10, -1, 7, 23], [0.54935217, 0.47624493, 0.46047568, 0.4275682, 0.42179346]), 93: ([3, 5, 25, 13, 21], [0.68364143, 0.6335913, 0.5721427, 0.56449735, 0.5362524]), 94: ([13, 6, 15, 5, -1], [0.654279, 0.6488178, 0.5932056, 0.58846515, 0.5776601]), 95: ([5, 3, 13, -1, 16], [0.64904284, 0.5616013, 0.5369834, 0.5023572, 0.499998]), 96: ([5, 3, 13, -1, 16], [0.6559278, 0.57432604, 0.54575133, 0.46943372, 0.46132255]), 97: ([5, 13, 3, 25, 6], [0.6956768, 0.642449, 0.62465465, 0.49174324, 0.4749214]), 98: ([13, 6, 5, -1, 3], [0.7661036, 0.65524805, 0.6301204, 0.6186588, 0.5668917]), 99: ([13, 6, 5, 3, -1], [0.8071288, 0.71757805, 0.701682, 0.68321455, 0.66584325]), 100: ([13, 3, 5, 6, 22], [0.6334363, 0.61054826, 0.5863518, 0.5740575, 0.5719327]), 101: ([13, 3, 5, 25, 6], [0.6283176, 0.57015043, 0.56985795, 0.55417484, 0.5221219]), 102: ([3, 5, -1, 21, 13], [0.8247145, 0.7191614, 0.6067312, 0.56410027, 0.54289997]), 103: ([6, 19, 13, 15, 28], [0.55669475, 0.5566424, 0.5294384, 0.52737856, 0.51921844]), 104: ([5, 3, 13, -1, 21], [0.70998865, 0.6981659, 0.59811026, 0.48538405, 0.47510195]), 105: ([3, 5, -1, 13, 6], [0.6857774, 0.6194889, 0.5255419, 0.5245912, 0.47831774]), 106: ([3, 5, 13, -1, 25], [0.69759953, 0.67195874, 0.6058873, 0.56095135, 0.5485104]), 107: ([3, 5, 13, -1, 21], [0.7235756, 0.70292735, 0.6856854, 0.66497403, 0.644892]), 108: ([6, 13, -1, 1, 28], [0.80061454, 0.719189, 0.66364586, 0.6288589, 0.61534774]), 109: ([7, -1, 9, 4, 10], [0.65451354, 0.5201517, 0.52002954, 0.5019696, 0.46231374]), 110: ([-1, 19, 18, 21, 26], [0.52580774, 0.49323654, 0.47214228, 0.45704564, 0.43273914]), 111: ([3, 25, -1, 5, 6], [0.6579413, 0.5295661, 0.51791054, 0.49622053, 0.4748402]), 112: ([3, 25, 5, 14, 16], [0.7041523, 0.57188284, 0.5666177, 0.53845775, 0.52973])}\n"
     ]
    }
   ],
   "source": [
    "print(query_topics_with_probs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Query 1 has topic -1\n",
      "Relevant doc topics: \n",
      "5,-1,-1,-1,5,1,-1,10,1,1,12,-1,10,10,12,0,-1,11,-1,-1,1,18,-1,-1,25,20,-1,1,1,-1,1,-1,1,1,1,-1,1,-1,1,1,3,-1,1,5,-1,-1,\n",
      "PCTG:  0.391304347826087\n",
      "Query 2 has topic 6\n",
      "Relevant doc topics: \n",
      "2,5,23,-1,24,-1,5,-1,-1,-1,5,1,1,1,1,1,1,1,1,1,-1,-1,1,1,1,-1,\n",
      "PCTG:  0.0\n",
      "Query 3 has topic 2\n",
      "Relevant doc topics: \n",
      "2,2,13,6,19,2,1,19,19,19,2,0,2,2,-1,5,13,13,-1,2,-1,2,2,-1,2,0,2,3,2,2,5,13,2,-1,0,0,11,-1,2,2,2,2,-1,-1,\n",
      "PCTG:  0.4090909090909091\n",
      "Query 4 has topic 25\n",
      "Relevant doc topics: \n",
      "6,-1,25,25,-1,-1,28,-1,\n",
      "PCTG:  0.25\n",
      "Query 5 has topic 13\n",
      "Relevant doc topics: \n",
      "-1,22,-1,13,13,2,12,12,23,2,0,0,-1,13,2,13,-1,6,-1,13,1,-1,0,6,\n",
      "PCTG:  0.20833333333333334\n",
      "Query 6 has topic 11\n",
      "Relevant doc topics: \n",
      "-1,\n",
      "PCTG:  0.0\n",
      "Query 7 has topic -1\n",
      "Relevant doc topics: \n",
      "6,11,-1,19,19,6,22,-1,\n",
      "PCTG:  0.25\n",
      "Query 8 has topic 3\n",
      "Relevant doc topics: \n",
      "-1,-1,-1,6,6,1,-1,21,28,-1,-1,2,11,11,-1,-1,11,21,\n",
      "PCTG:  0.0\n",
      "Query 9 has topic 3\n",
      "Relevant doc topics: \n",
      "-1,-1,-1,-1,21,11,-1,25,3,11,3,-1,2,-1,11,-1,-1,11,2,-1,1,-1,11,2,-1,11,11,-1,-1,11,-1,-1,-1,11,\n",
      "PCTG:  0.058823529411764705\n",
      "Query 10 has topic 5\n",
      "Relevant doc topics: \n",
      "16,-1,2,2,-1,0,11,13,2,2,2,-1,13,5,24,2,-1,2,16,16,-1,16,11,2,-1,-1,\n",
      "PCTG:  0.038461538461538464\n",
      "Query 11 has topic -1\n",
      "Relevant doc topics: \n",
      "-1,9,-1,-1,-1,7,-1,13,12,-1,9,-1,7,1,-1,0,2,2,-1,-1,-1,4,-1,2,12,12,0,-1,1,1,0,-1,-1,2,6,-1,4,-1,27,2,-1,19,9,-1,10,9,12,9,9,2,2,9,2,9,23,-1,9,-1,2,-1,-1,3,1,-1,-1,-1,6,-1,1,-1,1,1,-1,-1,2,-1,-1,10,-1,2,0,0,2,0,-1,-1,-1,-1,-1,1,2,2,18,-1,2,2,2,2,27,-1,2,9,-1,2,2,2,-1,-1,-1,4,0,-1,27,0,-1,0,0,9,9,-1,23,2,9,-1,0,0,-1,\n",
      "PCTG:  0.3858267716535433\n",
      "Query 12 has topic 27\n",
      "Relevant doc topics: \n",
      "3,1,19,-1,18,19,2,2,-1,-1,27,9,-1,\n",
      "PCTG:  0.07692307692307693\n",
      "Query 13 has topic 5\n",
      "Relevant doc topics: \n",
      "18,5,-1,-1,-1,1,3,-1,5,13,3,5,23,3,5,13,-1,18,18,2,2,3,0,-1,0,-1,3,10,10,0,0,0,6,-1,9,2,3,18,3,13,-1,13,5,3,6,6,3,-1,5,5,-1,-1,13,3,5,3,3,5,18,2,2,3,1,1,1,-1,6,-1,5,5,5,5,-1,5,18,5,-1,5,25,5,-1,5,3,3,5,5,-1,19,3,5,5,\n",
      "PCTG:  0.24175824175824176\n",
      "Query 14 has topic 23\n",
      "Relevant doc topics: \n",
      "3,-1,1,\n",
      "PCTG:  0.0\n",
      "Query 15 has topic 6\n",
      "Relevant doc topics: \n",
      "-1,18,-1,-1,22,-1,13,18,-1,19,18,-1,0,-1,24,0,0,0,15,0,0,8,6,8,-1,19,0,0,0,0,-1,2,19,3,13,-1,1,18,18,-1,1,18,6,1,6,13,6,-1,18,18,1,-1,-1,22,1,6,-1,13,18,-1,0,8,15,-1,8,8,2,2,-1,-1,0,-1,-1,-1,6,22,6,-1,6,6,6,15,\n",
      "PCTG:  0.12195121951219512\n",
      "Query 16 has topic 13\n",
      "Relevant doc topics: \n",
      "6,19,-1,23,19,-1,18,6,6,6,-1,6,24,24,-1,-1,24,24,21,-1,24,-1,15,24,2,-1,\n",
      "PCTG:  0.0\n",
      "Query 17 has topic 13\n",
      "Relevant doc topics: \n",
      "3,3,1,22,19,19,-1,-1,18,-1,6,6,-1,1,-1,-1,22,22,-1,22,22,22,22,22,22,-1,\n",
      "PCTG:  0.0\n",
      "Query 18 has topic 1\n",
      "Relevant doc topics: \n",
      "13,8,-1,1,1,1,1,1,1,1,1,\n",
      "PCTG:  0.7272727272727273\n",
      "Query 19 has topic 25\n",
      "Relevant doc topics: \n",
      "3,-1,7,5,-1,3,3,-1,6,-1,-1,-1,11,-1,-1,3,-1,25,3,-1,5,3,21,6,25,5,1,25,13,3,3,3,5,-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,3,3,21,-1,-1,-1,25,6,25,28,25,25,1,1,-1,-1,-1,-1,1,-1,3,-1,5,-1,-1,1,25,-1,25,-1,-1,2,-1,\n",
      "PCTG:  0.1111111111111111\n",
      "Query 20 has topic 18\n",
      "Relevant doc topics: \n",
      "18,3,5,23,3,-1,18,18,1,-1,23,23,-1,-1,0,15,15,19,23,3,3,-1,3,25,13,6,-1,1,-1,7,-1,3,-1,13,-1,5,5,3,5,3,6,-1,21,-1,5,23,-1,13,23,-1,3,3,-1,7,23,5,3,3,1,-1,5,5,18,1,1,1,-1,6,1,5,-1,10,5,5,-1,1,5,1,-1,1,18,-1,-1,3,5,-1,5,25,0,25,28,3,28,-1,-1,-1,-1,28,28,-1,-1,-1,3,5,-1,23,23,5,1,1,2,-1,18,18,3,3,5,-1,5,19,-1,1,-1,-1,-1,2,-1,3,-1,-1,5,-1,5,-1,-1,-1,19,6,6,15,-1,3,3,-1,\n",
      "PCTG:  0.04861111111111111\n",
      "Query 21 has topic 2\n",
      "Relevant doc topics: \n",
      "-1,0,0,2,0,12,12,0,0,0,-1,2,0,12,0,0,0,12,0,-1,0,12,0,12,6,\n",
      "PCTG:  0.08\n",
      "Query 22 has topic 23\n",
      "Relevant doc topics: \n",
      "1,-1,-1,3,23,1,23,23,23,-1,10,23,-1,2,24,-1,24,24,0,0,-1,1,19,23,-1,10,12,12,-1,23,1,-1,9,-1,23,23,1,23,10,10,1,-1,12,3,3,5,-1,-1,23,1,23,23,23,\n",
      "PCTG:  0.2641509433962264\n",
      "Query 23 has topic 15\n",
      "Relevant doc topics: \n",
      "-1,0,15,0,0,-1,0,0,-1,-1,10,10,2,10,-1,-1,10,0,0,14,0,0,0,-1,15,0,0,0,22,0,15,0,-1,12,-1,2,0,15,0,0,0,0,0,-1,-1,0,8,0,\n",
      "PCTG:  0.08333333333333333\n",
      "Query 24 has topic 2\n",
      "Relevant doc topics: \n",
      "-1,0,0,0,19,15,0,0,12,23,0,-1,0,15,0,0,0,0,0,0,0,0,2,-1,2,2,6,0,0,28,-1,12,0,0,0,0,0,0,-1,12,-1,12,0,-1,0,-1,0,6,0,0,11,0,\n",
      "PCTG:  0.057692307692307696\n",
      "Query 25 has topic 19\n",
      "Relevant doc topics: \n",
      "0,-1,-1,-1,19,-1,14,-1,-1,19,19,-1,23,18,6,-1,-1,20,20,20,2,-1,-1,-1,0,27,23,-1,-1,-1,-1,20,0,\n",
      "PCTG:  0.09090909090909091\n",
      "Query 26 has topic 18\n",
      "Relevant doc topics: \n",
      "-1,18,-1,13,18,-1,19,18,-1,-1,24,24,6,19,0,0,2,19,3,13,-1,-1,-1,18,-1,1,6,18,-1,6,5,-1,0,13,13,-1,18,1,1,-1,1,13,1,18,8,15,0,-1,15,2,-1,-1,6,-1,6,6,\n",
      "PCTG:  0.125\n",
      "Query 27 has topic 3\n",
      "Relevant doc topics: \n",
      "18,3,3,1,3,-1,5,3,3,3,-1,3,13,-1,6,19,3,1,-1,-1,-1,3,-1,12,23,23,21,2,14,-1,-1,-1,-1,-1,5,25,-1,2,25,7,13,3,3,-1,3,3,-1,3,3,-1,-1,-1,-1,3,3,3,3,3,13,3,3,3,3,-1,-1,7,-1,3,1,1,1,-1,7,1,1,1,1,1,1,1,3,16,-1,3,-1,10,7,3,-1,7,-1,-1,3,-1,3,1,3,1,-1,3,3,5,-1,-1,-1,3,-1,3,3,7,-1,11,-1,3,3,\n",
      "PCTG:  0.33043478260869563\n",
      "Query 28 has topic 1\n",
      "Relevant doc topics: \n",
      "1,1,1,-1,-1,10,-1,1,2,19,1,-1,1,1,1,1,7,1,1,1,-1,1,1,7,1,1,1,1,1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,1,6,1,1,1,1,1,1,-1,3,-1,1,1,1,1,-1,1,\n",
      "PCTG:  0.7\n",
      "Query 29 has topic 3\n",
      "Relevant doc topics: \n",
      "3,-1,5,3,3,3,-1,3,-1,-1,-1,-1,-1,3,10,10,21,14,-1,7,3,3,-1,-1,3,3,3,3,3,1,5,3,1,1,1,-1,1,1,1,1,7,3,1,3,3,3,3,3,\n",
      "PCTG:  0.4166666666666667\n",
      "Query 30 has topic 27\n",
      "Relevant doc topics: \n",
      "-1,0,9,-1,-1,10,3,12,7,-1,4,-1,-1,-1,19,-1,0,-1,12,12,10,0,10,9,-1,-1,-1,17,0,10,0,10,-1,6,10,10,7,7,10,7,7,-1,2,27,27,10,22,22,-1,1,-1,-1,17,-1,17,10,-1,17,-1,17,10,-1,27,7,-1,17,17,-1,2,17,-1,-1,17,-1,-1,7,7,-1,10,0,-1,10,10,17,17,-1,-1,27,-1,2,27,-1,2,-1,2,2,17,2,-1,-1,17,2,-1,17,2,-1,-1,-1,0,-1,27,0,7,17,7,7,9,7,7,7,9,27,7,7,7,9,-1,-1,-1,0,-1,10,7,0,\n",
      "PCTG:  0.05223880597014925\n",
      "Query 31 has topic 2\n",
      "Relevant doc topics: \n",
      "9,1,2,1,6,1,-1,19,17,1,-1,7,18,-1,-1,19,1,-1,10,7,-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,-1,1,1,6,-1,-1,-1,7,-1,-1,-1,2,-1,3,3,1,2,2,27,-1,1,27,23,-1,20,1,\n",
      "PCTG:  0.06557377049180328\n",
      "Query 32 has topic 3\n",
      "Relevant doc topics: \n",
      "15,-1,-1,-1,22,-1,15,-1,10,10,12,-1,10,21,12,-1,0,-1,24,0,14,-1,-1,14,0,0,0,15,-1,-1,0,8,6,-1,-1,-1,8,8,-1,19,0,15,19,14,15,15,15,12,0,13,6,5,13,1,22,1,6,6,6,8,15,-1,24,8,-1,-1,-1,8,-1,-1,28,28,24,8,8,15,15,-1,8,0,20,-1,24,-1,8,15,-1,-1,8,19,28,28,28,15,-1,-1,24,22,-1,-1,8,-1,3,-1,-1,-1,19,6,22,6,6,6,6,15,-1,0,8,\n",
      "PCTG:  0.008547008547008548\n",
      "Query 33 has topic 13\n",
      "Relevant doc topics: \n",
      "19,6,-1,19,19,24,24,-1,-1,-1,24,24,-1,24,-1,24,23,19,6,15,\n",
      "PCTG:  0.0\n",
      "Query 34 has topic 25\n",
      "Relevant doc topics: \n",
      "3,3,3,3,5,8,-1,6,-1,11,7,-1,3,3,3,3,-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,3,3,\n",
      "PCTG:  0.0\n",
      "Query 35 has topic 19\n",
      "Relevant doc topics: \n",
      "0,-1,-1,23,-1,24,19,-1,-1,2,15,12,22,-1,19,-1,0,0,14,15,23,1,6,-1,3,2,23,1,1,1,1,1,2,0,-1,19,-1,24,2,2,-1,-1,0,\n",
      "PCTG:  0.06976744186046512\n",
      "Query 37 has topic 3\n",
      "Relevant doc topics: \n",
      "14,14,21,3,-1,3,3,1,3,-1,3,23,3,21,10,21,1,-1,1,1,-1,7,21,21,-1,-1,21,1,21,3,-1,21,3,21,-1,-1,-1,21,21,-1,\n",
      "PCTG:  0.2\n",
      "Query 39 has topic 13\n",
      "Relevant doc topics: \n",
      "-1,0,0,4,13,15,0,-1,27,0,2,0,0,0,-1,-1,2,-1,\n",
      "PCTG:  0.05555555555555555\n",
      "Query 41 has topic 3\n",
      "Relevant doc topics: \n",
      "3,1,14,3,1,1,-1,-1,-1,14,-1,\n",
      "PCTG:  0.18181818181818182\n",
      "Query 42 has topic 13\n",
      "Relevant doc topics: \n",
      "2,2,2,-1,13,6,19,2,-1,-1,-1,0,6,2,0,-1,2,-1,-1,-1,13,-1,6,1,13,-1,2,2,19,2,-1,2,2,2,22,2,-1,15,11,-1,-1,2,2,2,2,-1,-1,2,13,-1,\n",
      "PCTG:  0.08\n",
      "Query 43 has topic 13\n",
      "Relevant doc topics: \n",
      "13,-1,18,19,-1,-1,-1,-1,25,-1,1,25,25,\n",
      "PCTG:  0.07692307692307693\n",
      "Query 44 has topic 27\n",
      "Relevant doc topics: \n",
      "-1,0,-1,-1,3,-1,-1,7,-1,-1,12,-1,9,7,0,1,-1,-1,15,-1,10,-1,2,10,-1,-1,24,12,12,10,0,0,1,-1,-1,18,-1,27,19,19,19,-1,10,12,2,-1,-1,18,-1,1,0,3,1,6,19,-1,10,1,10,-1,19,1,10,10,7,1,10,-1,-1,1,-1,3,1,-1,-1,22,-1,6,-1,10,-1,-1,27,-1,-1,7,-1,17,7,10,0,0,20,-1,10,-1,15,-1,0,0,0,10,0,0,-1,0,22,1,-1,-1,-1,1,-1,2,-1,2,-1,-1,2,0,0,2,-1,0,0,27,0,6,0,0,7,9,7,27,9,1,-1,27,9,0,-1,0,10,-1,-1,-1,0,3,-1,-1,10,0,-1,0,1,\n",
      "PCTG:  0.03225806451612903\n",
      "Query 45 has topic 15\n",
      "Relevant doc topics: \n",
      "0,15,15,0,-1,0,-1,0,22,-1,18,-1,0,19,15,0,15,15,0,-1,0,0,19,5,13,22,-1,22,-1,6,-1,1,0,0,8,-1,0,8,20,0,15,0,0,8,15,0,0,0,0,20,20,0,0,-1,-1,8,0,0,19,28,0,0,0,-1,0,0,-1,0,0,6,0,19,6,-1,0,14,0,\n",
      "PCTG:  0.09090909090909091\n",
      "Query 46 has topic 15\n",
      "Relevant doc topics: \n",
      "15,0,-1,0,0,0,-1,0,-1,18,-1,0,19,15,15,23,-1,8,0,-1,15,0,8,15,-1,8,-1,19,2,15,22,15,15,15,-1,0,21,23,-1,13,6,-1,-1,0,0,22,22,6,0,-1,6,0,8,15,-1,28,24,8,-1,-1,0,-1,-1,-1,28,-1,8,-1,-1,-1,28,0,-1,28,8,8,22,8,15,15,-1,8,0,-1,8,15,-1,-1,-1,8,19,28,28,28,15,22,-1,-1,0,8,-1,3,-1,-1,-1,19,6,6,6,6,0,15,-1,0,8,8,\n",
      "PCTG:  0.12931034482758622\n",
      "Query 49 has topic 13\n",
      "Relevant doc topics: \n",
      "-1,8,9,-1,6,21,2,14,6,19,-1,9,2,6,18,2,-1,1,1,-1,-1,-1,-1,-1,-1,2,0,16,9,9,6,-1,0,19,\n",
      "PCTG:  0.0\n",
      "Query 50 has topic 14\n",
      "Relevant doc topics: \n",
      "14,14,3,8,14,8,14,-1,-1,14,14,14,14,8,8,-1,16,-1,14,14,14,14,-1,3,3,3,3,3,-1,3,3,1,1,16,3,14,1,-1,-1,-1,3,16,3,1,1,28,14,8,0,-1,-1,-1,-1,20,7,14,1,-1,-1,2,16,16,16,1,16,-1,-1,-1,14,-1,8,8,3,7,-1,-1,17,14,-1,14,8,-1,3,3,-1,-1,14,-1,-1,\n",
      "PCTG:  0.21348314606741572\n",
      "Query 52 has topic 23\n",
      "Relevant doc topics: \n",
      "-1,-1,3,23,23,23,10,23,23,23,-1,-1,23,23,23,\n",
      "PCTG:  0.6\n",
      "Query 54 has topic 15\n",
      "Relevant doc topics: \n",
      "-1,15,0,0,0,-1,13,15,12,0,15,0,0,-1,0,27,0,0,0,-1,2,19,0,0,15,28,-1,0,0,0,15,0,0,0,0,0,0,0,-1,0,-1,24,0,0,0,-1,6,19,0,0,0,\n",
      "PCTG:  0.09803921568627451\n",
      "Query 55 has topic 23\n",
      "Relevant doc topics: \n",
      "-1,3,23,23,23,10,23,23,23,-1,3,1,5,-1,-1,23,23,23,\n",
      "PCTG:  0.5\n",
      "Query 56 has topic 8\n",
      "Relevant doc topics: \n",
      "14,0,14,-1,0,-1,1,-1,14,-1,14,-1,3,3,14,-1,3,16,28,14,-1,-1,16,16,16,1,16,-1,-1,14,-1,8,8,3,-1,-1,14,-1,14,8,-1,3,14,-1,-1,\n",
      "PCTG:  0.06666666666666667\n",
      "Query 57 has topic 20\n",
      "Relevant doc topics: \n",
      "14,-1,1,15,14,14,-1,3,3,-1,3,16,28,8,8,-1,-1,-1,\n",
      "PCTG:  0.0\n",
      "Query 58 has topic 24\n",
      "Relevant doc topics: \n",
      "0,15,0,8,-1,9,9,24,19,19,19,19,15,-1,24,24,24,0,19,19,-1,6,6,24,0,-1,28,24,24,8,-1,24,-1,24,24,28,24,0,-1,24,8,6,23,-1,0,19,\n",
      "PCTG:  0.2608695652173913\n",
      "Query 61 has topic 5\n",
      "Relevant doc topics: \n",
      "2,-1,5,5,-1,-1,-1,-1,13,11,2,\n",
      "PCTG:  0.18181818181818182\n",
      "Query 62 has topic 5\n",
      "Relevant doc topics: \n",
      "-1,3,5,16,16,5,5,5,16,16,16,2,\n",
      "PCTG:  0.3333333333333333\n",
      "Query 65 has topic 3\n",
      "Relevant doc topics: \n",
      "3,3,3,3,3,-1,-1,-1,3,3,7,7,5,\n",
      "PCTG:  0.5384615384615384\n",
      "Query 66 has topic 24\n",
      "Relevant doc topics: \n",
      "0,0,8,-1,24,19,19,19,6,-1,4,24,24,24,0,19,19,-1,6,6,24,24,24,-1,-1,24,28,-1,24,-1,24,8,23,-1,19,\n",
      "PCTG:  0.2857142857142857\n",
      "Query 67 has topic 1\n",
      "Relevant doc topics: \n",
      "-1,-1,3,3,-1,1,1,6,1,-1,-1,-1,-1,25,20,-1,1,-1,3,3,3,1,1,-1,1,1,1,3,1,1,-1,1,\n",
      "PCTG:  0.375\n",
      "Query 69 has topic 21\n",
      "Relevant doc topics: \n",
      "21,-1,3,3,21,-1,21,-1,21,21,21,-1,1,1,21,\n",
      "PCTG:  0.4666666666666667\n",
      "Query 71 has topic 3\n",
      "Relevant doc topics: \n",
      "11,21,3,-1,3,1,3,-1,3,1,-1,-1,25,-1,3,-1,-1,3,3,-1,-1,-1,3,3,3,-1,-1,\n",
      "PCTG:  0.37037037037037035\n",
      "Query 76 has topic 24\n",
      "Relevant doc topics: \n",
      "0,15,0,8,-1,9,9,24,19,19,15,10,24,12,24,0,0,27,19,0,0,-1,0,0,-1,0,24,10,0,-1,24,24,8,-1,0,0,0,0,-1,24,24,-1,0,-1,24,24,24,0,-1,24,0,0,8,6,23,0,-1,19,0,0,\n",
      "PCTG:  0.2\n",
      "Query 79 has topic 25\n",
      "Relevant doc topics: \n",
      "5,1,-1,25,13,3,-1,1,1,1,25,\n",
      "PCTG:  0.18181818181818182\n",
      "Query 81 has topic 21\n",
      "Relevant doc topics: \n",
      "-1,5,3,19,-1,25,25,21,3,11,-1,\n",
      "PCTG:  0.09090909090909091\n",
      "Query 82 has topic 5\n",
      "Relevant doc topics: \n",
      "16,5,16,4,-1,21,2,2,-1,-1,\n",
      "PCTG:  0.1\n",
      "Query 84 has topic 5\n",
      "Relevant doc topics: \n",
      "5,-1,3,5,5,5,5,-1,13,2,5,\n",
      "PCTG:  0.5454545454545454\n",
      "Query 90 has topic 7\n",
      "Relevant doc topics: \n",
      "-1,7,7,-1,7,7,10,7,9,9,-1,9,0,4,-1,9,4,9,-1,-1,-1,4,4,-1,9,9,-1,9,-1,9,-1,9,10,7,7,-1,7,7,-1,-1,4,-1,4,-1,10,7,9,4,-1,4,-1,7,7,7,7,7,9,4,7,-1,7,-1,7,-1,7,9,-1,-1,17,7,\n",
      "PCTG:  0.2857142857142857\n",
      "Query 92 has topic 6\n",
      "Relevant doc topics: \n",
      "-1,-1,-1,6,6,6,6,2,1,6,6,6,19,25,-1,6,1,6,6,-1,13,1,1,18,-1,-1,-1,-1,-1,-1,18,2,6,-1,6,6,6,6,\n",
      "PCTG:  0.39473684210526316\n",
      "Query 95 has topic 5\n",
      "Relevant doc topics: \n",
      "-1,5,16,16,5,5,5,16,16,16,-1,\n",
      "PCTG:  0.36363636363636365\n",
      "Query 96 has topic 5\n",
      "Relevant doc topics: \n",
      "5,16,16,5,5,5,16,16,16,\n",
      "PCTG:  0.4444444444444444\n",
      "Query 97 has topic 5\n",
      "Relevant doc topics: \n",
      "-1,3,3,-1,3,5,\n",
      "PCTG:  0.16666666666666666\n",
      "Query 98 has topic 13\n",
      "Relevant doc topics: \n",
      "-1,13,13,4,13,2,13,1,6,1,6,6,-1,13,13,6,13,6,6,-1,-1,13,-1,-1,-1,28,-1,6,6,\n",
      "PCTG:  0.27586206896551724\n",
      "Query 99 has topic 13\n",
      "Relevant doc topics: \n",
      "18,-1,18,-1,13,18,2,2,2,13,4,13,13,6,1,6,6,-1,13,13,6,13,6,6,-1,13,2,13,-1,-1,28,-1,6,6,\n",
      "PCTG:  0.2647058823529412\n",
      "Query 100 has topic 13\n",
      "Relevant doc topics: \n",
      "22,0,19,-1,22,19,-1,1,-1,-1,28,22,22,22,22,22,22,22,\n",
      "PCTG:  0.0\n",
      "Query 101 has topic 13\n",
      "Relevant doc topics: \n",
      "-1,\n",
      "PCTG:  0.0\n",
      "Query 102 has topic 3\n",
      "Relevant doc topics: \n",
      "11,3,3,-1,3,3,3,3,-1,3,3,3,3,3,3,-1,3,3,5,1,3,-1,3,3,\n",
      "PCTG:  0.7083333333333334\n",
      "Query 104 has topic 5\n",
      "Relevant doc topics: \n",
      "11,3,5,3,3,3,-1,3,2,3,5,\n",
      "PCTG:  0.18181818181818182\n",
      "Query 109 has topic 7\n",
      "Relevant doc topics: \n",
      "-1,7,7,-1,7,7,13,10,-1,7,9,9,-1,4,-1,-1,9,4,4,9,-1,-1,7,-1,-1,4,4,-1,9,9,-1,9,-1,9,-1,9,10,7,7,-1,7,-1,-1,4,-1,4,7,9,4,-1,4,-1,7,7,7,7,7,9,4,7,-1,7,-1,7,-1,7,9,-1,-1,17,7,\n",
      "PCTG:  0.28169014084507044\n",
      "Query 111 has topic 3\n",
      "Relevant doc topics: \n",
      "3,3,3,-1,-1,3,\n",
      "PCTG:  0.6666666666666666\n",
      "avg_pctg:  0.14244139328771147\n",
      "min_pctg:  0.0\n",
      "max_pctg:  0.7272727272727273\n"
     ]
    }
   ],
   "source": [
    "# analyse if queries and relevant documents have similar topics\n",
    "rels = pd.read_csv(\"data/cisi-csv/rels.csv\")\n",
    "\n",
    "doc_topics = doc_info[\"Topic\"].tolist()\n",
    "avg_pctg = 0\n",
    "max_pctg = 0\n",
    "min_pctg = 10000\n",
    "\n",
    "for index, row in queries.iterrows():\n",
    "  queryID = row[\"id\"]\n",
    "  truths = rels.loc[rels['queryID'] == queryID]\n",
    "  true_docs = truths['docID'].tolist()\n",
    "  if len(true_docs) == 0:\n",
    "    continue\n",
    "  q_topic = query_topics_with_probs[queryID][0][0]\n",
    "  print(\"Query \" + str(queryID) + \" has topic \" + str(q_topic))\n",
    "  print(\"Relevant doc topics: \")\n",
    "  same_topic = 0\n",
    "  for d in true_docs:\n",
    "    print(doc_topics[d-1], end=\",\")\n",
    "    if doc_topics[d-1] == q_topic:\n",
    "      same_topic += 1\n",
    "  pctg = same_topic / len(true_docs)\n",
    "  print(\"\")\n",
    "  print(\"PCTG: \", pctg)\n",
    "  avg_pctg += pctg\n",
    "  if pctg < min_pctg:\n",
    "    min_pctg = pctg\n",
    "  if pctg > max_pctg:\n",
    "    max_pctg = pctg\n",
    "\n",
    "print(\"avg_pctg: \", avg_pctg/len(queries))\n",
    "print(\"min_pctg: \", min_pctg)\n",
    "print(\"max_pctg: \", max_pctg)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Check how many relevant documents there are per query"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "",
      "text/plain": [
       "<Figure size 3000x700 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import plotly\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "df = pd.read_csv('data/cisi-csv/rels.csv')\n",
    "\n",
    "# Count the number of docID's for each queryID\n",
    "query_doc_count = df.groupby('queryID').size()\n",
    "\n",
    "# Reset index to convert the series to a dataframe for plotting\n",
    "query_doc_count = query_doc_count.reset_index(name='doc_count')\n",
    "\n",
    "# Plot the number of docID's for each queryID\n",
    "plt.figure(figsize=(30, 7))\n",
    "sns.barplot(data=query_doc_count, x='queryID', y='doc_count', )\n",
    "plt.title('Number of docIDs per queryID')\n",
    "plt.xlabel('Query ID')\n",
    "plt.ylabel('Number of docIDs')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count     76.000000\n",
       "mean      40.973684\n",
       "std       36.171434\n",
       "min        1.000000\n",
       "25%       13.000000\n",
       "50%       30.500000\n",
       "75%       52.250000\n",
       "max      155.000000\n",
       "dtype: float64"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "query_doc_count = df.groupby('queryID').size()\n",
    "query_doc_count.describe()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}