
Create CSV files from the Stanford Sentiment Treebank
"""
Put all the Stanford Sentiment Treebank phrase data into test, training, and dev CSVs.
Socher, R., Perelygin, A., Wu, J. Y., Chuang, J., Manning, C. D., Ng, A. Y., & Potts, C. (2013). Recursive Deep Models
for Semantic Compositionality Over a Sentiment Treebank. Presented at the Conference on Empirical Methods in Natural
Language Processing EMNLP.
https://nlp.stanford.edu/sentiment/
"""
import os
import sys

import pandas


def get_phrase_sentiments(base_directory):
    def group_labels(label):
        if label in ["very negative", "negative"]:
            return "negative"
        elif label in ["positive", "very positive"]:
            return "positive"
        else:
            return "neutral"

    # dictionary.txt has no header row, so the column names are supplied here.
    dictionary = pandas.read_csv(os.path.join(base_directory, "dictionary.txt"), sep="|", header=None)
    dictionary.columns = ["phrase", "id"]
    dictionary = dictionary.set_index("id")

    sentiment_labels = pandas.read_csv(os.path.join(base_directory, "sentiment_labels.txt"), sep="|")
    sentiment_labels.columns = ["id", "sentiment"]
    sentiment_labels = sentiment_labels.set_index("id")

    phrase_sentiments = dictionary.join(sentiment_labels)
    # Bin the [0, 1] sentiment scores into the five standard fine-grained classes,
    # then collapse them into three coarse classes.
    phrase_sentiments["fine"] = pandas.cut(phrase_sentiments.sentiment, [0, 0.2, 0.4, 0.6, 0.8, 1.0],
                                           include_lowest=True,
                                           labels=["very negative", "negative", "neutral", "positive",
                                                   "very positive"])
    phrase_sentiments["coarse"] = phrase_sentiments.fine.apply(group_labels)
    return phrase_sentiments
def get_sentence_partitions(base_directory):
    sentences = pandas.read_csv(os.path.join(base_directory, "datasetSentences.txt"), index_col="sentence_index",
                                sep="\t")
    splits = pandas.read_csv(os.path.join(base_directory, "datasetSplit.txt"), index_col="sentence_index")
    return sentences.join(splits).set_index("sentence")
def partition(base_directory):
    phrase_sentiments = get_phrase_sentiments(base_directory)
    sentence_partitions = get_sentence_partitions(base_directory)
    # noinspection PyUnresolvedReferences
    data = phrase_sentiments.join(sentence_partitions, on="phrase")
    # Phrases that are not complete sentences get no split assignment from the join;
    # default them to the training set (splitset_label 1).
    data["splitset_label"] = data["splitset_label"].fillna(1).astype(int)
    # Reattach contraction tokens, e.g. "do n't" -> "don't". regex=True is required
    # for a callable replacement in recent pandas versions.
    data["phrase"] = data["phrase"].str.replace(r"\s('s|'d|'re|'ll|'m|'ve|n't)\b", lambda m: m.group(1),
                                                regex=True)
    return data.groupby("splitset_label")
if __name__ == "__main__":
    base_directory, output_directory = sys.argv[1:3]
    os.makedirs(output_directory, exist_ok=True)
    # splitset_label values: 1 = train, 2 = test, 3 = dev.
    for splitset, split_data in partition(base_directory):
        split_name = {1: "train", 2: "test", 3: "dev"}[splitset]
        filename = os.path.join(output_directory, "stanford-sentiment-treebank.%s.csv" % split_name)
        del split_data["splitset_label"]
        split_data.to_csv(filename)
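
After the script runs, a quick sanity check is to load the three CSVs and compare the split sizes. A minimal sketch; the output directory name below is illustrative:

import pandas

# Print the number of phrases written to each split.
for name in ["train", "test", "dev"]:
    frame = pandas.read_csv("csv/stanford-sentiment-treebank.%s.csv" % name)
    print("%s: %d phrases" % (name, len(frame)))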
valexby commented Mar 27, 2018

This script does something unusual: it pushes all the non-sentence phrases from the dictionary into the training sample, so you end up with a training set of about 230K trees. I spent some time before noticing this. Be careful.
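
If you want sentence-level training data only, one option is to filter the generated train CSV against datasetSentences.txt. This is a minimal sketch with illustrative file paths; because the script detokenizes contractions, sentences containing them may not match exactly:

import pandas

# Keep only the training rows whose phrase is a complete sentence.
train = pandas.read_csv("csv/stanford-sentiment-treebank.train.csv")
sentences = pandas.read_csv("sst/datasetSentences.txt", sep="\t")
sentence_set = set(sentences["sentence"])
sentence_level = train[train["phrase"].isin(sentence_set)]
sentence_level.to_csv("csv/stanford-sentiment-treebank.train.sentences-only.csv", index=False)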

charlesdognin commented Oct 26, 2018

Do not use this script for research purposes: 5% of the training set is in the test set, and state-of-the-art approaches use much less data for training.
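
One rough way to quantify this kind of leakage is to check whether longer phrases from the generated train CSV occur verbatim inside test sentences. A sketch only, with illustrative paths and an arbitrary five-token minimum span length:

import pandas

train = pandas.read_csv("csv/stanford-sentiment-treebank.train.csv")
test = pandas.read_csv("csv/stanford-sentiment-treebank.test.csv")
train_phrases = set(train["phrase"].astype(str))

def shares_long_train_phrase(sentence):
    # Enumerate all sub-spans of at least five tokens and look them up in the train set.
    tokens = str(sentence).split()
    spans = (" ".join(tokens[i:j])
             for i in range(len(tokens))
             for j in range(i + 5, len(tokens) + 1))
    return any(span in train_phrases for span in spans)

leaked = test["phrase"].map(shares_long_train_phrase).sum()
print("%d of %d test sentences contain a phrase from the train CSV" % (leaked, len(test)))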
