Created
December 27, 2020 06:47
-
-
Save lppier/68700e91c9472660998af2ecd193763e to your computer and use it in GitHub Desktop.
Comprehend_SentimentCustomClassifier https://lppier.github.io/NLP-PK!-HuggingFace-vs-AWS-ML-Services
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pathlib import Path | |
from sklearn.model_selection import train_test_split | |
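# The IMDB dataset can be downloaded and unpacked as follows (run inside
# the data/ directory, because the script reads from data/aclImdb/...):
#   wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
#   tar -xf aclImdb_v1.tar.gz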
def read_imdb_split(split_dir):
    """Load one split of a pos/neg sentiment dataset from disk.

    Args:
        split_dir: Path (``str`` or ``Path``) of a directory that contains a
            ``pos`` and a ``neg`` subdirectory, each holding one UTF-8 text
            file per review.

    Returns:
        A ``(texts, labels)`` tuple of two parallel lists. ``texts`` holds the
        file contents; ``labels`` holds ``1`` for files under ``pos`` and
        ``0`` for files under ``neg``. All ``pos`` entries come first.

    Raises:
        FileNotFoundError: If ``split_dir/pos`` or ``split_dir/neg`` is
            missing.
    """
    split_dir = Path(split_dir)
    texts = []
    labels = []
    for label_dir in ["pos", "neg"]:
        # Compare by value: `is` tests object identity and only appeared to
        # work for string literals because of interning.
        label = 0 if label_dir == "neg" else 1
        # iterdir() yields entries in arbitrary, filesystem-dependent order;
        # sorting makes the output reproducible across runs and machines.
        for text_file in sorted((split_dir / label_dir).iterdir()):
            texts.append(text_file.read_text(encoding="utf8"))
            labels.append(label)
    return texts, labels
# Load both splits; expects the extracted archive under data/aclImdb.
train_texts, train_labels = read_imdb_split("data/aclImdb/train")
test_texts, test_labels = read_imdb_split("data/aclImdb/test")

# # Further split training set to get a validation set (Skip this step, assume comprehend does this internally)
# train_texts, val_texts, train_labels, val_labels = train_test_split(
#     train_texts, train_labels, test_size=0.1
# )

# train_data = { 'Label' : train_labels, 'Text': train_texts}
import pandas as pd

# df = pd.DataFrame(train_data)
# df.to_csv('data/comprehendimdb2.csv')

# Export the test split as a two-column CSV: label first, then review text.
test_data = {"Label": test_labels, "Text": test_texts}
df_test = pd.DataFrame(test_data)
# Write neither the header row nor the index column: the evaluation step
# reads this file back with header=None and treats column 0 as the label,
# so the default to_csv output (index in column 0, plus a header line)
# would make the metrics compare predictions against row numbers.
# NOTE(review): Comprehend's custom-classifier CSV format is presumably also
# header-less with the label in the first column - confirm against AWS docs.
df_test.to_csv("data/comprehendimdbtest.csv", index=False, header=False)
# put to comprehend
# (Manual step: run the Comprehend classification job on the exported test
# set and save its output as data/comprehend_predictions.jsonl.)
import json

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Parse the jsonl output from Comprehend: one JSON document per line.
test_answers = []
with open("data/comprehend_predictions.jsonl", "r") as json_file:
    # Stream the file line by line instead of loading it into a list first.
    for json_str in json_file:
        if not json_str.strip():
            # Tolerate blank or trailing empty lines; json.loads would raise.
            continue
        result = json.loads(json_str)
        # Keep the first entry of "Classes" as the predicted class.
        # NOTE(review): presumably Comprehend lists classes by descending
        # score, and output lines follow the input row order - confirm.
        test_answers.append(result["Classes"][0])

df_pred = pd.DataFrame(test_answers)

# The ground-truth file is read as header-less; column 0 is taken as the label.
df_labels = pd.read_csv("data/comprehendimdbtest.csv", header=None)

# Compute both series once so every metric sees exactly the same inputs.
y_true = df_labels.iloc[:, 0]
y_pred = pd.to_numeric(df_pred["Name"])  # class names arrive as strings

precision, recall, f1, _ = precision_recall_fscore_support(
    y_true, y_pred, average="binary"
)
acc = accuracy_score(y_true, y_pred)

metrics = {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}
print(metrics)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment