add masked political balance metric and optimal political balance for optimal IR pipeline metric

decd89eb · Thomas Jahoda · e038928d · decd89eb · decd89eb
Commit decd89eb authored 1 year ago by Thomas Jahoda
--- a/src/IR_pipeline.ipynb
+++ b/src/IR_pipeline.ipynb
--- a/src/political_bias_correction.py
+++ b/src/political_bias_correction.py
@@ -8,7 +8,11 @@ import torch
 from numpy.typing import NDArray
 from torch.types import Device
-from src.political_bias import POLITICAL_BIAS_CLASS_INDICES
+from src.political_bias import (
+    POLITICAL_BIAS_CLASS_INDICES,
+    extract_political_bias_classes_probs,
+    calculate_political_balance_metric,
+)
 from src.util.numpy_util import select_top_k_indices_desc, one_hot_encode
@@ -83,11 +87,31 @@ def interleave_docs_by_political_bias_class(sorted_candidate_docs: pd.DataFrame)
    # filter out unused indices
    interleaved_selection_indices = interleaved_selection_indices[interleaved_selection_indices >= 0]
-    # add remaining documents in original order
+    # add remaining documents in original order (in case there is not a balanced number of documents per class)
    remaining_docs = sorted_candidate_docs.drop(interleaved_selection_indices)
    final_selection = pd.concat([sorted_candidate_docs.loc[interleaved_selection_indices], remaining_docs])
    final_selection.reset_index(inplace=True)
    return final_selection
-# TODO some other method that has a tunable parameter for how important political balance is?
+# TODO: add another method with a tunable parameter for how important political balance is, and fold that importance into the relevance scores?
+def calculate_optimal_political_balance_metric_for_optimal_rankings(relevant_docs: pd.DataFrame):
+    if len(relevant_docs) == 0:
+        # return whatever the metric returns for empty rankings
+        return calculate_political_balance_metric(relevant_docs)
+    # TODO make it work for arbitrary probs
+    political_bias_classes_probs = extract_political_bias_classes_probs(relevant_docs)
+    assert np.all(
+        (political_bias_classes_probs == 0) | (political_bias_classes_probs == 1)
+    ), "implementation only works if probs are all either 0 or 1"
+    relevant_docs_with_relevance_scores = relevant_docs.copy()
+    relevant_docs_with_relevance_scores.reset_index(inplace=True, drop=True)
+    relevant_docs_with_relevance_scores["relevance_score"] = pd.Series(
+        np.ones(len(relevant_docs_with_relevance_scores), dtype=np.float64)
+    )
+    optimally_interleaved_rankings = interleave_docs_by_political_bias_class(relevant_docs_with_relevance_scores)
+    return calculate_political_balance_metric(optimally_interleaved_rankings)