Skip to content

Instantly share code, notes, and snippets.

@rohitgarud
Last active March 10, 2023 04:02
Show Gist options
  • Save rohitgarud/0260fb79a9f65d424a04197b79e656ca to your computer and use it in GitHub Desktop.
Gist for calculating cosine similarity between resultants of different groups of feature vectors (ASReview screening)
from pathlib import Path
import matplotlib.pyplot as plt
import pandas as pd
from asreview import ASReviewData, ASReviewProject, open_state
from asreview.models.balance import DoubleBalance
from asreview.models.classifiers import NaiveBayesClassifier
from asreview.models.feature_extraction import Tfidf
from asreview.models.query import MaxQuery
from asreview.review import ReviewSimulate
from asreviewcontrib.insights.metrics import time_to_discovery
from sklearn.metrics.pairwise import cosine_similarity
import shutil

# Start from a clean slate: remove any leftovers from a previous simulation
# run, then (re)create the working directory. ignore_errors covers the case
# where the directory does not exist yet.
project_path = Path("tmp_data")
shutil.rmtree(project_path, ignore_errors=True)
project_path.mkdir(exist_ok=True)
# Create a project object and folder for a simulation-mode review.
project = ASReviewProject.create(
    project_path=project_path / "api_simulation",
    project_id="api_example",
    project_mode="simulate",
    project_name="api_example",
)

dataset = "van_de_Schoot_2017.csv"
# dataset = "van_Dis_2020_raw.csv"

# Copy the dataset into the project's data folder and register it.
filepath = Path("tmp_data", "api_simulation", "data", dataset)
data_df = pd.read_csv(dataset)
# index=False keeps the copied CSV identical in schema to the source file;
# the pandas default would prepend an unnamed index column.
data_df.to_csv(filepath, index=False)
project.add_dataset(dataset)
# Model configuration for the simulation: TF-IDF features, a Naive Bayes
# classifier, maximum-probability querying and double balancing.
feature_model = Tfidf()
train_model = NaiveBayesClassifier()
balance_model = DoubleBalance()
query_model = MaxQuery()

# Load the dataset copy that was placed in the project's data folder.
data_obj = ASReviewData.from_file(filepath)

# Simulated review: one relevant and one irrelevant prior record,
# retraining after every 10 screened records, with a fixed seed so the
# prior selection is reproducible.
reviewer = ReviewSimulate(
    as_data=data_obj,
    model=train_model,
    query_model=query_model,
    balance_model=balance_model,
    feature_model=feature_model,
    n_instances=10,
    project=project,
    n_prior_included=1,
    n_prior_excluded=1,
    init_seed=165,
)
# Start the review process; flag the project as errored if the simulation
# itself fails, and only mark it finished on success.
project.update_review(status="review")
try:
    reviewer.review()
except Exception:
    project.update_review(status="error")
    # Bare raise preserves the original traceback unchanged.
    raise
else:
    project.mark_review_finished()

# Finish and export the project
project.export(Path("tmp_data", "api_example.asreview"))

# Time-to-discovery of each relevant record; keep just the record ids.
with open_state("tmp_data/api_example.asreview") as s:
    tds = time_to_discovery(s)
rids = [rid for rid, _ in tds]
# Cosine Similarity calculations
with open_state("tmp_data/api_example.asreview") as state:
    df = state.get_dataset()
    df["labeling_order"] = df.index
    labels = state.get_labels(priors=True)
    labeling_order = df.record_id
    td_last = time_to_discovery(state)[-1][1]

feature_extraction_id = project.feature_matrices[0]["id"]
print(feature_extraction_id)
feature_matrix = project.get_feature_matrix(feature_extraction_id)
tfidf_features = feature_matrix.toarray()

# Combined (resultant) vector of each group -- relevant, irrelevant and
# unlabelled -- obtained by summing the group's feature vectors. Averaging
# would give the same direction, hence identical cosine similarities.
# The first two screened records are the priors: one relevant and one
# irrelevant (n_prior_included=1, n_prior_excluded=1 above).
relevant = tfidf_features[labeling_order[0]].reshape(1, -1)
irrelevant = tfidf_features[labeling_order[1]].reshape(1, -1)
# Sum all rows at C speed (axis=0) instead of Python-level sum() over rows.
unlabelled = tfidf_features.sum(axis=0).reshape(1, -1) - relevant - irrelevant

unlabelled_relevant = []
unlabelled_irrelevant = []
relevant_irrelevant = []

# Replay the labeling order, moving each record's vector out of the
# unlabelled resultant and into the matching labeled group.
for i, record in enumerate(labeling_order[2:]):
    print(i)
    vector = tfidf_features[record].reshape(1, -1)
    if labels[record] == 1:
        relevant += vector
    elif labels[record] == 0:
        irrelevant += vector
    unlabelled -= vector
    # Calculate cosine similarities between the current group resultants
    unlabelled_relevant.append(cosine_similarity(unlabelled, relevant)[0])
    unlabelled_irrelevant.append(cosine_similarity(unlabelled, irrelevant)[0])
    relevant_irrelevant.append(cosine_similarity(relevant, irrelevant)[0])
# Plotting the cosine similarities between different groups
# Vertical markers at the discovery time of each relevant record.
for _, discovery_time in tds:
    plt.axvline(x=discovery_time, color="y")

# One curve per group pair, plotted in a fixed order so the default
# matplotlib color cycle assigns the same colors as before.
for series, series_label in (
    (unlabelled_relevant, "unlabelled_relevant"),
    (unlabelled_irrelevant, "unlabelled_irrelevant"),
    (relevant_irrelevant, "relevant_irrelevant"),
):
    plt.plot(series, label=series_label)

plt.title(
    f"Dataset: {dataset.split('.')[0]} (Total: {len(data_df)}, Relevant: {len(tds)+1})"
)
plt.xlabel("Screened Records")
plt.ylabel("Cosine Similarity")
plt.legend()
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment