Skip to content
Snippets Groups Projects
Commit 1c55ed8b authored by Kienzl, Julian's avatar Kienzl, Julian
Browse files

Merge branch 'main' of gitlab.tugraz.at:julik/air-project

parents 57db0244 c4db4339
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Semantic Search with Sentiment Comparison for a Given Query # Semantic Search with Sentiment Comparison for a Given Query
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
**Group 29** **Group 29**
| Name | Matriculation Number | | Name | Matriculation Number |
|--------------------|----------------------| |--------------------|----------------------|
| Benjamin Jost | 11912846 | | Benjamin Jost | 11912846 |
| Julian Kienzel | | | Julian Kienzel | |
| Fabio Maierbrugger | 11908625 | | Fabio Maierbrugger | 11908625 |
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
import json import json
import textwrap import textwrap
import torch import torch
import warnings import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd import pandas as pd
pd.options.mode.chained_assignment = None pd.options.mode.chained_assignment = None
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from collections import Counter from collections import Counter
from sentence_transformers import SentenceTransformer, util from sentence_transformers import SentenceTransformer, util
from transformers import pipeline, logging from transformers import pipeline, logging
logging.set_verbosity_warning() logging.set_verbosity_warning()
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### Preprocessing ### Preprocessing
Here the already shortened [train-dataset](https://www.kaggle.com/datasets/kritanjalijain/amazon-reviews/data) (from 34.686.770 to 200.000 rows) is preprocessed with the following steps:
- Concatenating the title and text columns
- Dropping null values
- Dropping the title column
- Dropping the sentiment column
- Converting everything to lowercase
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Read the shortened training file and keep only its first 10,000 rows; the
# column names are supplied explicitly via ``names``.
raw_reviews = pd.read_csv(
    "data/train_short_200000.csv",
    names=["sentiment", "title", "text"],
    sep=",",
).head(10000)

# Prepend the title to the review body so both end up in the searched text.
raw_reviews.text = raw_reviews.title + " " + raw_reviews.text

# Discard rows with missing values, then keep nothing but the text column.
dataset = raw_reviews.dropna().drop(["title", "sentiment"], axis=1)
dataset.text = dataset.text.str.lower()
dataset.head()
``` ```
%% Output %% Output
text text
0 stuning even for the non-gamer this sound trac... 0 stuning even for the non-gamer this sound trac...
1 the best soundtrack ever to anything. i'm read... 1 the best soundtrack ever to anything. i'm read...
2 amazing! this soundtrack is my favorite music ... 2 amazing! this soundtrack is my favorite music ...
3 excellent soundtrack i truly like this soundtr... 3 excellent soundtrack i truly like this soundtr...
4 remember, pull your jaw off the floor after he... 4 remember, pull your jaw off the floor after he...
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### BERT class model ### BERT class model
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Sentence-embedding checkpoint used for the semantic search.
# https://www.sbert.net/docs/pretrained_models.html
bert_sentence_model = "all-MiniLM-L12-v2"
# Sentiment-classification checkpoint used for the query and its results.
# https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english
bert_sentiment_model = "distilbert-base-uncased-finetuned-sst-2-english"
class BERT:
    """Semantic search plus sentiment classification over a text corpus.

    On construction every text of the corpus is embedded once with a
    sentence-transformer model; queries are then matched against those
    embeddings by cosine similarity. A separate sentiment-analysis pipeline
    labels queries and retrieved texts.
    """

    def __init__(self, dataset: pd.DataFrame) -> None:
        """Load both models and embed all texts of ``dataset``.

        Args:
            dataset: Frame with a ``text`` column containing the documents
                that should be searchable.
        """
        self.dataset = dataset
        self.sentence_transformer = SentenceTransformer(
            bert_sentence_model, device="cpu"
        )
        self.sentiment_analyzer = pipeline(
            "sentiment-analysis", model=bert_sentiment_model
        )
        torch.cuda.empty_cache()
        # NOTE: a ``torch.device(...)`` object used to be created here and
        # immediately discarded, which had no effect. The sentence
        # transformer is pinned to the CPU above, so nothing is selected here.
        self.content_embeddings = self.sentence_transformer.encode(
            self.dataset.text.tolist(), convert_to_tensor=True
        )

    def retrieve_top_k_entries_for_query(self, query: str, k: int) -> pd.DataFrame:
        """Return the ``k`` corpus rows most similar to ``query``.

        Args:
            query: Text to search for.
            k: Maximum number of rows to return.

        Returns:
            Rows of the corpus ordered from most to least similar.
        """
        query_embedding = self.sentence_transformer.encode(
            query, convert_to_tensor=True
        )
        # Row 0 holds the similarities of the single query to every document.
        similarities = util.pytorch_cos_sim(query_embedding, self.content_embeddings)[0]
        top_k_indices = similarities.argsort(descending=True)[:k].cpu().numpy()
        # The indices are positions into the embedding tensor, hence ``iloc``.
        return self.dataset.iloc[top_k_indices]

    def get_sentiment_for_each_result(self, results: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``results`` with a ``sentiment`` label column.

        All texts are classified in one pipeline call instead of one call per
        row, and the caller's frame is left unmodified.

        Args:
            results: Frame with a ``text`` column.

        Returns:
            A new frame with the same rows and index plus a ``sentiment``
            column holding the predicted label of each text.
        """
        labelled = results.copy()
        texts = labelled.text.tolist()
        # ``truncation=True`` is forwarded to the tokenizer so that texts
        # longer than the model's input limit are cut instead of failing.
        predictions = self.sentiment_analyzer(texts, truncation=True) if texts else []
        # Assign by position so duplicate index labels cannot cross-assign.
        labelled["sentiment"] = [prediction["label"] for prediction in predictions]
        return labelled

    def get_sentiment_of_query(self, query: str) -> tuple:
        """Classify ``query`` and return ``(label, score)``.

        Args:
            query: Text to classify.

        Returns:
            The predicted label and its score as reported by the pipeline.
        """
        prediction = self.sentiment_analyzer(query, truncation=True)[0]
        return prediction["label"], prediction["score"]
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Constructing the model embeds every text of the corpus, so this takes a while...
bert_model = BERT(dataset)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### Semantic Search with BERT ### Semantic Search with BERT
Initialization
- First, the texts of the given dataset are converted to a list
- The list is then encoded into embeddings
- Those embeddings are returned as a tensor
For the query, the same steps are applied at the beginning of `retrieve_top_k_entries_for_query`.
Afterwards, we calculate the similarity between the query and every document using PyTorch cosine similarity.
We sort the documents by similarity in descending order and return the top k entries.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# The corpus texts were lowercased during preprocessing, so the query is too.
query = (
    "Purchased the Lenovo Notebook, and it's been a reliable companion. "
    "The design is sleek, and it handles tasks effortlessly. "
    "Impressed with the decent battery life, making it suitable for daily use. "
    "Overall, a good value for the money."
).lower()

# Fetch the ten reviews closest to the query and show them.
query_result = bert_model.retrieve_top_k_entries_for_query(query, k=10)
print(query_result)
``` ```
%% Output %% Output
text text
7675 great product product is light weight, comfort... 7675 great product product is light weight, comfort...
5077 great value - now these once cost nearly $3000... 5077 great value - now these once cost nearly $3000...
7678 love this, easy to use and makes working from ... 7678 love this, easy to use and makes working from ...
7676 we like them i purchased two lapinator's plus,... 7676 we like them i purchased two lapinator's plus,...
7677 much, much better available... i suggest you k... 7677 much, much better available... i suggest you k...
7679 it works great! it really works! this is my se... 7679 it works great! it really works! this is my se...
7683 great i bought the lapinator and mousitizer af... 7683 great i bought the lapinator and mousitizer af...
1142 good replacement for old charger this is a goo... 1142 good replacement for old charger this is a goo...
8969 ladies citizen my wife loves it. got it for he... 8969 ladies citizen my wife loves it. got it for he...
3394 best computer acessory a year and a half ago, ... 3394 best computer acessory a year and a half ago, ...
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### Sentiment Analysis with BERT ### Sentiment Analysis with BERT
TODO: Description Initialization
- We create a sentiment-analysis pipeline with our pretrained model `bert_sentiment_model`.
To determine the sentiment of a given query and of the reviews most similar to that query, we have two functions:
- `get_sentiment_of_query` for the query
- `get_sentiment_for_each_result` for the similar reviews
In `get_sentiment_of_query` we return a tuple that contains the sentiment label and the sentiment score. Both values are returned by the sentiment-analysis pipeline.
In `get_sentiment_for_each_result` we return the given `pd.DataFrame` with a new `sentiment` column that holds the predicted label of each review (the score is not stored).
The higher the sentiment score, the more confident the model is in the predicted sentiment.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Classify the query itself and every review retrieved for it.
query_sentiment = bert_model.get_sentiment_of_query(query)
query_result_with_sentiment = bert_model.get_sentiment_for_each_result(query_result)

# The query sentiment is a (label, score) pair; the score is shown in percent.
query_label, query_score = query_sentiment
short_query = textwrap.shorten(query, width=30)
print(f"[*] Query ({short_query}): {query_label} [{round(query_score * 100, 4)}%]")

# Count how many of the retrieved reviews received each label.
label_counts = dict(Counter(query_result_with_sentiment.sentiment.tolist()))
print(f"[*] Sentiment distribution of results : {label_counts}")
``` ```
%% Output %% Output
[*] Query (purchased the lenovo [...]): POSITIVE [99.9848%] [*] Query (purchased the lenovo [...]): POSITIVE [99.9848%]
[*] Sentiment distribution of results : {'POSITIVE': 8, 'NEGATIVE': 2} [*] Sentiment distribution of results : {'POSITIVE': 8, 'NEGATIVE': 2}
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### Evaluation of the Model ### Evaluation of the Model
TODO: Description TODO: Description
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
def create_bar_chart(positive_count, negative_count, query_text, score, sentiment):
    """Display a two-bar chart of positive versus negative results.

    Args:
        positive_count: Height of the green "Positive" bar.
        negative_count: Height of the red "Negative" bar.
        query_text: Query text shown in the sub-heading.
        score: Sentiment score of the query, shown with a ``%`` suffix.
        sentiment: Sentiment label of the query, shown in the sub-heading.
    """
    labels = ["Positive", "Negative"]
    heights = [positive_count, negative_count]
    x_positions = [0, 1]

    figure, axes = plt.subplots()
    axes.set_title("Sentiment Analysis\n", fontsize=12, weight="bold")

    # The sub-heading is placed in data coordinates, 8 units above the
    # taller of the two bars and centred between them.
    axes.text(
        0.5,
        max(heights) + 8,
        f"Query: {query_text} | {sentiment} | Score: {score}%",
        ha="center",
        va="center",
        fontsize=10,
        weight="light",
    )

    # Write each bar's value just above where its top will be.
    for x_position, height in zip(x_positions, heights):
        axes.text(
            x_position, height + 1, str(height), ha="center", va="bottom", fontsize=8
        )

    axes.bar(x_positions, heights, color=["green", "red"], width=0.6, align="center")
    axes.set_xlabel("Sentiment")
    axes.set_ylabel("Frequency")
    axes.set_xticks(x_positions)
    axes.set_xticklabels(labels)

    plt.subplots_adjust(bottom=0.2)
    plt.show()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# queries = [ # queries = [
# "Cruel and Unusual is the first Patricia Cornwell book I have got read and I for one loved it and I can't wait the read more of her books my boyfriend told me about Patricia Cornwell books he said I need to read Hornet's Nest and Southern Cross and those will be my next two book I will be reading and I will be reading them on my new IPad I got for Christmas I can't wait keep up the great writing Patricia and thanks to my boyfriend for telling me about this awesome writer.........RKsbabydoll", # "Cruel and Unusual is the first Patricia Cornwell book I have got read and I for one loved it and I can't wait the read more of her books my boyfriend told me about Patricia Cornwell books he said I need to read Hornet's Nest and Southern Cross and those will be my next two book I will be reading and I will be reading them on my new IPad I got for Christmas I can't wait keep up the great writing Patricia and thanks to my boyfriend for telling me about this awesome writer.........RKsbabydoll",
# "Actually a good TV. Unfortunately, after a few months the picture is only white. It also cannot be adjusted. The television is therefore junk.", # "Actually a good TV. Unfortunately, after a few months the picture is only white. It also cannot be adjusted. The television is therefore junk.",
# "I don't really understand the positive reviews here. Yes, the RGB light is good and makes it easier to use in a dark room. The additional buttons are also practical... but seriously...What year is it? The keys on this keyboard remind me of my C64 breadbox: see high and loud keystrokes. The feeling is 1:1 the same.I like flat keyboards myself and still gave this Logitech a chance. But after 30 minutes it was over... I'll definitely never get used to that! Therefore, unfortunately, a reach into the toilet.If there ever is a flat version with a quiet stop, I would be happy to test it again.", # "I don't really understand the positive reviews here. Yes, the RGB light is good and makes it easier to use in a dark room. The additional buttons are also practical... but seriously...What year is it? The keys on this keyboard remind me of my C64 breadbox: see high and loud keystrokes. The feeling is 1:1 the same.I like flat keyboards myself and still gave this Logitech a chance. But after 30 minutes it was over... I'll definitely never get used to that! Therefore, unfortunately, a reach into the toilet.If there ever is a flat version with a quiet stop, I would be happy to test it again.",
# "I recently got the [Laptop Brand/Model] and it's been a game-changer. The sleek design caught my eye, and it performs like a champ—smooth multitasking, vibrant display. Battery life is decent, lasting through my workday. Overall, a solid buy for the price!", # "I recently got the [Laptop Brand/Model] and it's been a game-changer. The sleek design caught my eye, and it performs like a champ—smooth multitasking, vibrant display. Battery life is decent, lasting through my workday. Overall, a solid buy for the price!",
# "Purchased the Lenovo Notebook, and it's been a reliable companion. The design is sleek, and it handles tasks effortlessly. Impressed with the decent battery life, making it suitable for daily use. Overall, a good value for the money." # "Purchased the Lenovo Notebook, and it's been a reliable companion. The design is sleek, and it handles tasks effortlessly. Impressed with the decent battery life, making it suitable for daily use. Overall, a good value for the money."
# ] # ]
# The last 100 rows of the same file serve as evaluation queries.
test_dataset = pd.read_csv(
    "data/train_short_200000.csv", names=["sentiment", "title", "text"], sep=","
).tail(100)
test_dataset = test_dataset.dropna()
test_dataset = test_dataset.drop(["title", "sentiment"], axis=1)
test_dataset.text = test_dataset.text.str.lower()

number_of_results = 50

# Maps the row label of each query to
# (shortened query text, (sentiment label, score), agreement percentage),
# where the agreement percentage is the share of retrieved reviews whose
# sentiment label equals the label of the query itself.
percentages = {}
for index, query in test_dataset.iterrows():
    query_sentiment = bert_model.get_sentiment_of_query(query=query.text)
    query_results = bert_model.retrieve_top_k_entries_for_query(
        query=query.text, k=number_of_results
    )
    query_results_with_sentiment = bert_model.get_sentiment_for_each_result(
        results=query_results
    )

    # Divide by the number of rows actually returned: the search yields fewer
    # than ``number_of_results`` rows when the corpus is smaller than that.
    retrieved_count = len(query_results_with_sentiment)
    if retrieved_count == 0:
        continue

    sentiment_distribution = Counter(query_results_with_sentiment.sentiment.tolist())
    percentage = (sentiment_distribution[query_sentiment[0]] / retrieved_count) * 100
    percentages[index] = (
        textwrap.shorten(query.text, width=30),
        query_sentiment,
        percentage,
    )
# print(json.dumps(percentages, indent=4))
# TODO what to do with all these percentages? How to evaluate/plot them?
``` ```
%% Output %% Output
--------------------------------------------------------------------------- ---------------------------------------------------------------------------
AttributeError Traceback (most recent call last) KeyboardInterrupt Traceback (most recent call last)
/tmp/ipykernel_4859/1683762815.py in ?() Cell In[27], line 25
17 number_of_results = 50 21 query_sentiment = bert_model.get_sentiment_of_query(query=query.text)
18 percentages = dict() 22 query_results = bert_model.retrieve_top_k_entries_for_query(
19 23 query=query.text, k=number_of_results
20 for index, query in test_dataset.iterrows(): 24 )
---> 21 query_sentiment = bert_model.get_sentiment_of_query(query=query.title) ---> 25 query_results_with_sentiment = bert_model.get_sentiment_for_each_result(
22 query_results = bert_model.retrieve_top_k_entries_for_query( 26 results=query_results
23 query=query.text, k=number_of_results 27 )
24 ) 29 sentiment_distribution = Counter(query_results_with_sentiment.sentiment.tolist())
~/.local/lib/python3.11/site-packages/pandas/core/generic.py in ?(self, name) 30 percentage = (sentiment_distribution[query_sentiment[0]] / number_of_results) * 100
6200 and name not in self._accessors Cell In[15], line 33, in BERT.get_sentiment_for_each_result(self, results)
6201 and self._info_axis._can_hold_identifiers_and_holds_name(name) 31 def get_sentiment_for_each_result(self, results: pd.DataFrame) -> pd.DataFrame:
6202 ): 32 for index, row in results.iterrows():
6203 return self[name] ---> 33 r = self.sentiment_analyzer(row.text)
-> 6204 return object.__getattribute__(self, name) 34 results.loc[index, "sentiment"] = r[0]["label"]
35 return results
AttributeError: 'Series' object has no attribute 'title' File ~/.local/lib/python3.11/site-packages/transformers/pipelines/text_classification.py:156, in TextClassificationPipeline.__call__(self, *args, **kwargs)
122 def __call__(self, *args, **kwargs):
123 """
124 Classify the text(s) given as inputs.
125
(...)
154 If `top_k` is used, one such dictionary is returned per label.
155 """
--> 156 result = super().__call__(*args, **kwargs)
157 # TODO try and retrieve it in a nicer way from _sanitize_parameters.
158 _legacy = "top_k" not in kwargs
File ~/.local/lib/python3.11/site-packages/transformers/pipelines/base.py:1140, in Pipeline.__call__(self, inputs, num_workers, batch_size, *args, **kwargs)
1132 return next(
1133 iter(
1134 self.get_iterator(
(...)
1137 )
1138 )
1139 else:
-> 1140 return self.run_single(inputs, preprocess_params, forward_params, postprocess_params)
File ~/.local/lib/python3.11/site-packages/transformers/pipelines/base.py:1147, in Pipeline.run_single(self, inputs, preprocess_params, forward_params, postprocess_params)
1145 def run_single(self, inputs, preprocess_params, forward_params, postprocess_params):
1146 model_inputs = self.preprocess(inputs, **preprocess_params)
-> 1147 model_outputs = self.forward(model_inputs, **forward_params)
1148 outputs = self.postprocess(model_outputs, **postprocess_params)
1149 return outputs
File ~/.local/lib/python3.11/site-packages/transformers/pipelines/base.py:1046, in Pipeline.forward(self, model_inputs, **forward_params)
1044 with inference_context():
1045 model_inputs = self._ensure_tensor_on_device(model_inputs, device=self.device)
-> 1046 model_outputs = self._forward(model_inputs, **forward_params)
1047 model_outputs = self._ensure_tensor_on_device(model_outputs, device=torch.device("cpu"))
1048 else:
File ~/.local/lib/python3.11/site-packages/transformers/pipelines/text_classification.py:187, in TextClassificationPipeline._forward(self, model_inputs)
185 if "use_cache" in inspect.signature(model_forward).parameters.keys():
186 model_inputs["use_cache"] = False
--> 187 return self.model(**model_inputs)
File ~/.local/lib/python3.11/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
-> 1518 return self._call_impl(*args, **kwargs)
File ~/.local/lib/python3.11/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
1522 # If we don't have any hooks, we want to skip the rest of the logic in
1523 # this function, and just call forward.
1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1525 or _global_backward_pre_hooks or _global_backward_hooks
1526 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527 return forward_call(*args, **kwargs)
1529 try:
1530 result = None
File ~/.local/lib/python3.11/site-packages/transformers/models/distilbert/modeling_distilbert.py:1000, in DistilBertForSequenceClassification.forward(self, input_ids, attention_mask, head_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict)
992 r"""
993 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
994 Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
995 config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
996 `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
997 """
998 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-> 1000 distilbert_output = self.distilbert(
1001 input_ids=input_ids,
1002 attention_mask=attention_mask,
1003 head_mask=head_mask,
1004 inputs_embeds=inputs_embeds,
1005 output_attentions=output_attentions,
1006 output_hidden_states=output_hidden_states,
1007 return_dict=return_dict,
1008 )
1009 hidden_state = distilbert_output[0] # (bs, seq_len, dim)
1010 pooled_output = hidden_state[:, 0] # (bs, dim)
File ~/.local/lib/python3.11/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
-> 1518 return self._call_impl(*args, **kwargs)
File ~/.local/lib/python3.11/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
1522 # If we don't have any hooks, we want to skip the rest of the logic in
1523 # this function, and just call forward.
1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1525 or _global_backward_pre_hooks or _global_backward_hooks
1526 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527 return forward_call(*args, **kwargs)
1529 try:
1530 result = None
File ~/.local/lib/python3.11/site-packages/transformers/models/distilbert/modeling_distilbert.py:820, in DistilBertModel.forward(self, input_ids, attention_mask, head_mask, inputs_embeds, output_attentions, output_hidden_states, return_dict)
817 if attention_mask is None:
818 attention_mask = torch.ones(input_shape, device=device) # (bs, seq_length)
--> 820 return self.transformer(
821 x=embeddings,
822 attn_mask=attention_mask,
823 head_mask=head_mask,
824 output_attentions=output_attentions,
825 output_hidden_states=output_hidden_states,
826 return_dict=return_dict,
827 )
File ~/.local/lib/python3.11/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
-> 1518 return self._call_impl(*args, **kwargs)
File ~/.local/lib/python3.11/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
1522 # If we don't have any hooks, we want to skip the rest of the logic in
1523 # this function, and just call forward.
1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1525 or _global_backward_pre_hooks or _global_backward_hooks
1526 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527 return forward_call(*args, **kwargs)
1529 try:
1530 result = None
File ~/.local/lib/python3.11/site-packages/transformers/models/distilbert/modeling_distilbert.py:585, in Transformer.forward(self, x, attn_mask, head_mask, output_attentions, output_hidden_states, return_dict)
577 layer_outputs = self._gradient_checkpointing_func(
578 layer_module.__call__,
579 hidden_state,
(...)
582 output_attentions,
583 )
584 else:
--> 585 layer_outputs = layer_module(
586 hidden_state,
587 attn_mask,
588 head_mask[i],
589 output_attentions,
590 )
592 hidden_state = layer_outputs[-1]
594 if output_attentions:
File ~/.local/lib/python3.11/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
-> 1518 return self._call_impl(*args, **kwargs)
File ~/.local/lib/python3.11/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
1522 # If we don't have any hooks, we want to skip the rest of the logic in
1523 # this function, and just call forward.
1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1525 or _global_backward_pre_hooks or _global_backward_hooks
1526 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527 return forward_call(*args, **kwargs)
1529 try:
1530 result = None
File ~/.local/lib/python3.11/site-packages/transformers/models/distilbert/modeling_distilbert.py:530, in TransformerBlock.forward(self, x, attn_mask, head_mask, output_attentions)
528 # Feed Forward Network
529 ffn_output = self.ffn(sa_output) # (bs, seq_length, dim)
--> 530 ffn_output: torch.Tensor = self.output_layer_norm(ffn_output + sa_output) # (bs, seq_length, dim)
532 output = (ffn_output,)
533 if output_attentions:
File ~/.local/lib/python3.11/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
-> 1518 return self._call_impl(*args, **kwargs)
File ~/.local/lib/python3.11/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
1522 # If we don't have any hooks, we want to skip the rest of the logic in
1523 # this function, and just call forward.
1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1525 or _global_backward_pre_hooks or _global_backward_hooks
1526 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527 return forward_call(*args, **kwargs)
1529 try:
1530 result = None
File ~/.local/lib/python3.11/site-packages/torch/nn/modules/normalization.py:196, in LayerNorm.forward(self, input)
195 def forward(self, input: Tensor) -> Tensor:
--> 196 return F.layer_norm(
197 input, self.normalized_shape, self.weight, self.bias, self.eps)
File ~/.local/lib/python3.11/site-packages/torch/nn/functional.py:2543, in layer_norm(input, normalized_shape, weight, bias, eps)
2539 if has_torch_function_variadic(input, weight, bias):
2540 return handle_torch_function(
2541 layer_norm, (input, weight, bias), input, normalized_shape, weight=weight, bias=bias, eps=eps
2542 )
-> 2543 return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled)
KeyboardInterrupt:
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Each entry of ``percentages`` is
# (shortened query text, (sentiment label, score), agreement percentage).
# Split the entries by the sentiment label of the query.
positive_entries = [
    entry for entry in percentages.values() if entry[1][0] == "POSITIVE"
]
negative_entries = [
    entry for entry in percentages.values() if entry[1][0] != "POSITIVE"
]


def _agreement(entry):
    """Return the agreement percentage stored in an entry."""
    return entry[2]


# ``max``/``min`` consider every entry for both extremes. The previous
# if/elif chain skipped the minimum check whenever an entry raised the
# maximum, and its 0/100 start values could never be matched by real entries
# with exactly 0 % or 100 % agreement. The placeholders are only used when a
# sentiment group is empty.
highest_positive_query = max(positive_entries, key=_agreement, default=("", "", 0))
highest_negative_query = max(negative_entries, key=_agreement, default=("", "", 0))
smallest_positive_query = min(positive_entries, key=_agreement, default=("", "", 100))
smallest_negative_query = min(negative_entries, key=_agreement, default=("", "", 100))

print(f"Highest positive query: {highest_positive_query}")
print(f"Highest negative query: {highest_negative_query}")
print(f"Lowest positive query: {smallest_positive_query}")
print(f"Lowest negative query: {smallest_negative_query}")

for selected_entry in (
    highest_positive_query,
    highest_negative_query,
    smallest_positive_query,
    smallest_negative_query,
):
    if not selected_entry[1]:
        # Placeholder: no query with this sentiment exists, so there is
        # nothing to plot (indexing into the empty string would fail).
        continue

    selected_label, selected_score = selected_entry[1]
    matching_share = selected_entry[2]
    # The agreement percentage belongs to the bar of the query's own
    # sentiment; the other bar receives the remainder.
    if selected_label == "POSITIVE":
        positive_share, negative_share = matching_share, 100 - matching_share
    else:
        positive_share, negative_share = 100 - matching_share, matching_share

    create_bar_chart(
        positive_count=positive_share,
        negative_count=negative_share,
        query_text=selected_entry[0],
        score=round(selected_score * 100, 4),
        sentiment=selected_label,
    )
``` ```
%% Output %% Output
Highest positive query: ('very nice movie , it [...]', ('POSITIVE', 0.9998714923858643), 90.0) Highest positive query: ('very nice movie , it [...]', ('POSITIVE', 0.9998714923858643), 90.0)
Highest negative query: ('i am returning my copy. [...]', ('NEGATIVE', 0.9997848868370056), 94.0) Highest negative query: ('i am returning my copy. [...]', ('NEGATIVE', 0.9997848868370056), 94.0)
Lowest positive query: ('this was so terrible, i [...]', ('POSITIVE', 0.997624933719635), 6.0) Lowest positive query: ('this was so terrible, i [...]', ('POSITIVE', 0.997624933719635), 6.0)
Lowest negative query: ("i'm preparing the capes [...]", ('NEGATIVE', 0.5031839609146118), 22.0) Lowest negative query: ("i'm preparing the capes [...]", ('NEGATIVE', 0.5031839609146118), 22.0)
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Mean of the third field (the agreement percentage) over all evaluated queries.
agreement_sum = 0
for entry in percentages.values():
    agreement_sum += entry[2]
mean_agreement = agreement_sum / len(percentages)
print(mean_agreement)
``` ```
%% Output %% Output
61.28 61.28
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment