Skip to content
Snippets Groups Projects
Commit decd89eb authored by Thomas Jahoda's avatar Thomas Jahoda
Browse files

add masked political balance metric and optimal political balance for optimal IR pipeline metric

parent e038928d
No related branches found
No related tags found
No related merge requests found
Source diff could not be displayed: it is too large. Options to address this: view the blob.
...@@ -8,7 +8,11 @@ import torch ...@@ -8,7 +8,11 @@ import torch
from numpy.typing import NDArray from numpy.typing import NDArray
from torch.types import Device from torch.types import Device
from src.political_bias import POLITICAL_BIAS_CLASS_INDICES from src.political_bias import (
POLITICAL_BIAS_CLASS_INDICES,
extract_political_bias_classes_probs,
calculate_political_balance_metric,
)
from src.util.numpy_util import select_top_k_indices_desc, one_hot_encode from src.util.numpy_util import select_top_k_indices_desc, one_hot_encode
...@@ -83,11 +87,31 @@ def interleave_docs_by_political_bias_class(sorted_candidate_docs: pd.DataFrame) ...@@ -83,11 +87,31 @@ def interleave_docs_by_political_bias_class(sorted_candidate_docs: pd.DataFrame)
# filter out unused indices # filter out unused indices
interleaved_selection_indices = interleaved_selection_indices[interleaved_selection_indices >= 0] interleaved_selection_indices = interleaved_selection_indices[interleaved_selection_indices >= 0]
# add remaining documents in original order # add remaining documents in original order (in case there is not a balanced number of documents per class)
remaining_docs = sorted_candidate_docs.drop(interleaved_selection_indices) remaining_docs = sorted_candidate_docs.drop(interleaved_selection_indices)
final_selection = pd.concat([sorted_candidate_docs.loc[interleaved_selection_indices], remaining_docs]) final_selection = pd.concat([sorted_candidate_docs.loc[interleaved_selection_indices], remaining_docs])
final_selection.reset_index(inplace=True) final_selection.reset_index(inplace=True)
return final_selection return final_selection
# TODO: add another method with a tunable parameter that controls how important political balance is and incorporates that importance into the relevance scores?
def calculate_optimal_political_balance_metric_for_optimal_rankings(relevant_docs: pd.DataFrame):
    """Compute the political balance metric of an interleaved ranking of ``relevant_docs``.

    Every document is given the same relevance score (1.0), the documents are
    reordered with ``interleave_docs_by_political_bias_class``, and the
    resulting ranking is scored with ``calculate_political_balance_metric``.

    Args:
        relevant_docs: the relevant documents; must carry whatever columns
            ``extract_political_bias_classes_probs`` reads (not visible here).

    Returns:
        Whatever ``calculate_political_balance_metric`` returns for the
        interleaved ranking, or for ``relevant_docs`` itself when it is empty.

    Raises:
        AssertionError: if any extracted class probability is neither 0 nor 1.
    """
    # An empty ranking is delegated directly, so the result is whatever the
    # metric defines for empty rankings.
    if len(relevant_docs) == 0:
        return calculate_political_balance_metric(relevant_docs)

    # TODO make it work for arbitrary probs
    class_probs = extract_political_bias_classes_probs(relevant_docs)
    is_zero = class_probs == 0
    is_one = class_probs == 1
    assert np.all(is_zero | is_one), "implementation only works if probs are all either 0 or 1"

    # Work on a copy with a fresh 0..n-1 index so the caller's frame is left
    # untouched and the Series assigned below aligns row by row.
    scored_docs = relevant_docs.copy().reset_index(drop=True)
    uniform_scores = np.ones(len(scored_docs), dtype=np.float64)
    scored_docs["relevance_score"] = pd.Series(uniform_scores)

    interleaved_ranking = interleave_docs_by_political_bias_class(scored_docs)
    return calculate_political_balance_metric(interleaved_ranking)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment