# dalequark/download.py

## download.py
import os
import re

import pandas as pd
import tensorflow as tf
from tensorflow import keras

# Load all files from a directory in a DataFrame.
def load_directory_data(directory):
  """Read every review file in `directory` into a DataFrame.

  File names are expected to start with ``<digits>_<digits>.txt``; the second
  group of digits is stored, as a string, in the "sentiment" column and the
  file's text in the "sentence" column. Directory entries whose names do not
  match that pattern are skipped instead of raising.

  Args:
    directory: Path of a local directory containing the review files.

  Returns:
    A pandas DataFrame with the columns "sentence" and "sentiment", one row
    per matching file, ordered by file name.
  """
  # Raw string so that "\d" and "\." reach the regex engine untouched.
  name_pattern = re.compile(r"\d+_(\d+)\.txt")
  data = {"sentence": [], "sentiment": []}
  # os.listdir returns entries in arbitrary order; sort for reproducibility.
  for file_name in sorted(os.listdir(directory)):
    match = name_pattern.match(file_name)
    if match is None:
      # Not a review file (e.g. a stray hidden file): nothing to parse.
      continue
    file_path = os.path.join(directory, file_name)
    # os.listdir only handles local paths, so the builtin open() is enough;
    # newline="" returns the text without any line-ending translation.
    with open(file_path, "r", encoding="utf-8", newline="") as f:
      data["sentence"].append(f.read())
    data["sentiment"].append(match.group(1))
  return pd.DataFrame.from_dict(data)

# Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(directory):
  """Load the "pos" and "neg" subdirectories of `directory` as one DataFrame.

  Rows loaded from "pos" get polarity 1 and rows from "neg" get polarity 0.
  The two frames are concatenated, shuffled with `sample(frac=1)` and
  re-indexed from zero.

  Args:
    directory: Directory that contains the "pos" and "neg" subdirectories.

  Returns:
    The shuffled, re-indexed DataFrame including a "polarity" column.
  """
  labelled_frames = []
  for subdir, polarity in (("pos", 1), ("neg", 0)):
    frame = load_directory_data(os.path.join(directory, subdir))
    frame["polarity"] = polarity
    labelled_frames.append(frame)
  shuffled = pd.concat(labelled_frames).sample(frac=1)
  return shuffled.reset_index(drop=True)

# Download and process the dataset files.
def download_and_load_datasets(force_download=False):
  """Fetch the IMDB review archive and load its train and test splits.

  Args:
    force_download: Kept for backward compatibility; this function does not
      currently read it.

  Returns:
    A `(train_df, test_df)` tuple of DataFrames built by `load_dataset`.
  """
  # Use the `keras` name imported at the top of this file.
  dataset = keras.utils.get_file(
      fname="aclImdb.tar.gz",
      origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
      extract=True)

  # The splits are looked up in an "aclImdb" directory located next to the
  # path returned by get_file.
  data_root = os.path.join(os.path.dirname(dataset), "aclImdb")
  train_df = load_dataset(os.path.join(data_root, "train"))
  test_df = load_dataset(os.path.join(data_root, "test"))

  return train_df, test_df

train, test = download_and_load_datasets()

# Keep a random 5000-row subset of each split so the model trains faster.
train, test = (frame.sample(5000) for frame in (train, test))

# DATA_COLUMN names the column holding the input text; LABEL_COLUMN names
# the column holding its sentiment label, stored as 0 or 1.
DATA_COLUMN = 'sentence'
LABEL_COLUMN = 'polarity'
# label_list enumerates the possible labels (it could equally be
# True/False or 'dog'/'cat' for another task).
label_list = [0, 1]
	from tensorflow import keras
	import os
	import re

	# Load all files from a directory in a DataFrame.
	def load_directory_data(directory):
		"""Read every review file in `directory` into a DataFrame.

		File names are expected to start with ``<digits>_<digits>.txt``; the
		second group of digits is stored, as a string, in the "sentiment"
		column and the file's text in the "sentence" column. Directory entries
		whose names do not match that pattern are skipped instead of raising.

		Args:
			directory: Path of a local directory containing the review files.

		Returns:
			A pandas DataFrame with the columns "sentence" and "sentiment",
			one row per matching file, ordered by file name.
		"""
		# Raw string so that "\d" and "\." reach the regex engine untouched.
		name_pattern = re.compile(r"\d+_(\d+)\.txt")
		data = {"sentence": [], "sentiment": []}
		# os.listdir returns entries in arbitrary order; sort for reproducibility.
		for file_name in sorted(os.listdir(directory)):
			match = name_pattern.match(file_name)
			if match is None:
				# Not a review file (e.g. a stray hidden file): nothing to parse.
				continue
			file_path = os.path.join(directory, file_name)
			# os.listdir only handles local paths, so the builtin open() is
			# enough; newline="" returns the text without line-ending translation.
			with open(file_path, "r", encoding="utf-8", newline="") as f:
				data["sentence"].append(f.read())
			data["sentiment"].append(match.group(1))
		return pd.DataFrame.from_dict(data)

	# Merge positive and negative examples, add a polarity column and shuffle.
	def load_dataset(directory):
		"""Load the "pos" and "neg" subdirectories of `directory` as one DataFrame.

		Rows loaded from "pos" get polarity 1 and rows from "neg" get
		polarity 0. The two frames are concatenated, shuffled with
		`sample(frac=1)` and re-indexed from zero.

		Args:
			directory: Directory containing the "pos" and "neg" subdirectories.

		Returns:
			The shuffled, re-indexed DataFrame including a "polarity" column.
		"""
		pos_df = load_directory_data(os.path.join(directory, "pos"))
		neg_df = load_directory_data(os.path.join(directory, "neg"))
		pos_df["polarity"] = 1
		neg_df["polarity"] = 0
		return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)

	# Download and process the dataset files.
	def download_and_load_datasets(force_download=False):
		"""Fetch the IMDB review archive and load its train and test splits.

		Args:
			force_download: Kept for backward compatibility; this function
				does not currently read it.

		Returns:
			A `(train_df, test_df)` tuple of DataFrames built by `load_dataset`.
		"""
		# Use the `keras` name imported alongside this code.
		dataset = keras.utils.get_file(
				fname="aclImdb.tar.gz",
				origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
				extract=True)

		# The splits are looked up in an "aclImdb" directory located next to
		# the path returned by get_file.
		data_root = os.path.join(os.path.dirname(dataset), "aclImdb")
		train_df = load_dataset(os.path.join(data_root, "train"))
		test_df = load_dataset(os.path.join(data_root, "test"))

		return train_df, test_df

	train, test = download_and_load_datasets()

	# Keep a random 5000-row subset of each split so the model trains faster.
	train, test = (frame.sample(5000) for frame in (train, test))

	# DATA_COLUMN names the column holding the input text; LABEL_COLUMN names
	# the column holding its sentiment label, stored as 0 or 1.
	DATA_COLUMN = 'sentence'
	LABEL_COLUMN = 'polarity'
	# label_list enumerates the possible labels (it could equally be
	# True/False or 'dog'/'cat' for another task).
	label_list = [0, 1]