Skip to content

Instantly share code, notes, and snippets.

@dalequark
Last active February 13, 2019 17:19
Show Gist options
  • Save dalequark/0c7084724cf6cf94d1abc46eb2bd371b to your computer and use it in GitHub Desktop.
Save dalequark/0c7084724cf6cf94d1abc46eb2bd371b to your computer and use it in GitHub Desktop.
Download Movie Review
from tensorflow import keras
import os
import re
# Load all files from a directory in a DataFrame.
def load_directory_data(directory):
data = {}
data["sentence"] = []
data["sentiment"] = []
for file_path in os.listdir(directory):
with tf.gfile.GFile(os.path.join(directory, file_path), "r") as f:
data["sentence"].append(f.read())
data["sentiment"].append(re.match("\d+_(\d+)\.txt", file_path).group(1))
return pd.DataFrame.from_dict(data)
# Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(directory):
pos_df = load_directory_data(os.path.join(directory, "pos"))
neg_df = load_directory_data(os.path.join(directory, "neg"))
pos_df["polarity"] = 1
neg_df["polarity"] = 0
return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)
# Download and process the dataset files.
def download_and_load_datasets(force_download=False):
dataset = tf.keras.utils.get_file(
fname="aclImdb.tar.gz",
origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
extract=True)
train_df = load_dataset(os.path.join(os.path.dirname(dataset),
"aclImdb", "train"))
test_df = load_dataset(os.path.join(os.path.dirname(dataset),
"aclImdb", "test"))
return train_df, test_df
train, test = download_and_load_datasets()
# Downsample so our model trains faster
train = train.sample(5000)
test = test.sample(5000)
# Our input data is stored in DATA_COLUMN; it's sentiment label
# is stored in LABEL_COLUMN as a 0 or 1
DATA_COLUMN = 'sentence'
LABEL_COLUMN = 'polarity'
# label_list is the list of labels, i.e. True, False or 0, 1 or 'dog', 'cat'
label_list = [0, 1]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment