Skip to content

Instantly share code, notes, and snippets.

@lppier
Created December 27, 2020 06:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lppier/68700e91c9472660998af2ecd193763e to your computer and use it in GitHub Desktop.
from pathlib import Path
from sklearn.model_selection import train_test_split
# IMDB Dataset can be found here
# wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
# tar -xf aclImdb_v1.tar.gz
def read_imdb_split(split_dir):
    """Load one split (train or test) of the aclImdb sentiment dataset.

    Expects ``split_dir`` to contain ``pos`` and ``neg`` subdirectories with
    one plain-text review per file.

    Args:
        split_dir: path (str or Path) to the split root, e.g. "data/aclImdb/train".

    Returns:
        (texts, labels): parallel lists — labels[i] is 1 for a "pos" review,
        0 for a "neg" review, matching texts[i].
    """
    split_dir = Path(split_dir)
    texts = []
    labels = []
    for label_dir in ["pos", "neg"]:
        # Compare with == (value equality). The original used `is`, which
        # compares object identity and only worked via string interning.
        label = 0 if label_dir == "neg" else 1
        for text_file in (split_dir / label_dir).iterdir():
            texts.append(text_file.read_text(encoding="utf8"))
            labels.append(label)
    return texts, labels
# Load both dataset splits from disk.
train_texts, train_labels = read_imdb_split("data/aclImdb/train")
test_texts, test_labels = read_imdb_split("data/aclImdb/test")

# NOTE: a further train/validation split via train_test_split is deliberately
# skipped here — Comprehend performs its own validation split internally.

import pandas as pd

# Persist the test split as CSV so it can be submitted to Amazon Comprehend
# for batch classification.
df_test = pd.DataFrame({"Label": test_labels, "Text": test_texts})
df_test.to_csv("data/comprehendimdbtest.csv")
# put to comprehend
import json

# Comprehend batch output is JSON Lines: one JSON document per input line,
# each carrying a "Classes" list ordered by descending confidence.
with open("data/comprehend_predictions.jsonl", "r") as json_file:
    json_list = list(json_file)

# Keep only the top-ranked class for each prediction.
test_answers = [json.loads(json_str)["Classes"][0] for json_str in json_list]
df_pred = pd.DataFrame(test_answers)

# BUG FIX: the CSV was written by DataFrame.to_csv with its DEFAULT header row
# and index column.  Reading it back with header=None turned the literal
# "Label,Text" header into a data row and made column 0 the row index rather
# than the label, misaligning every comparison below.  Read it with the header
# honored and the first column treated as the index instead.
df_labels = pd.read_csv("data/comprehendimdbtest.csv", index_col=0)

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Ground truth is the "Label" column; predictions are the top class names
# from Comprehend, coerced to numeric so they compare against 0/1 labels.
y_true = df_labels["Label"]
y_pred = pd.to_numeric(df_pred["Name"])

precision, recall, f1, _ = precision_recall_fscore_support(
    y_true, y_pred, average="binary"
)
acc = accuracy_score(y_true, y_pred)
metrics = {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}
print(metrics)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment