"""
Created October 13, 2015 01:20
Preprocessing of Pos/Neg review data
"""
import h5py
import numpy as np
import random
import re
import pickle
import pdb
from sklearn.utils import resample
# from
# Ordered (compiled pattern, replacement) rewrite rules applied by clean_str.
# Order matters: unsupported characters are blanked first, clitics and
# punctuation are then split into their own tokens, and whitespace runs are
# collapsed last.
_CLEAN_STR_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        (r"\'s", " 's"),
        (r"\'ve", " 've"),
        (r"n\'t", " n't"),
        (r"\'re", " 're"),
        (r"\'d", " 'd"),
        (r"\'ll", " 'll"),
        (r",", " , "),
        (r"!", " ! "),
        # The replacements below are plain text, not regex: escaping the
        # punctuation there would leak a literal backslash into the tokens.
        (r"\(", " ( "),
        (r"\)", " ) "),
        (r"\?", " ? "),
        (r"\s{2,}", " "),
    )
)


def clean_str(string, TREC=False):
    """Tokenization/string cleaning for all datasets except for SST.

    Every character other than ASCII letters, digits and ``( ) , ! ? ' ```
    is replaced by a space; the clitics ``'s 've n't 're 'd 'll`` and the
    punctuation marks ``, ! ( ) ?`` are separated into their own
    space-delimited tokens; runs of whitespace are collapsed to one space.

    Args:
        string: Raw text to clean.
        TREC: When true, the original casing is kept; otherwise the result
            is lower cased (every dataset is lower cased except for TREC).

    Returns:
        The cleaned, stripped string with tokens separated by single spaces.
    """
    for pattern, replacement in _CLEAN_STR_RULES:
        string = pattern.sub(replacement, string)
    string = string.strip()
    return string if TREC else string.lower()
def fill_and_zero(lol, length, pad_value=1):
    """Right-pad each row of a list of lists and stack the rows into a matrix.

    Despite the function's name, the default fill value is 1, not 0.

    Args:
        lol: Iterable of rows, each a sequence of values.
        length: Width every row is padded to.
        pad_value: Value appended to short rows (defaults to 1).

    Returns:
        A numpy array built from the padded rows; for non-empty input it has
        one row per input row and ``length`` columns.

    Raises:
        ValueError: If a row is longer than ``length``; padding cannot
            shorten it and the result would not be rectangular.
    """
    padded_rows = []
    for row in lol:
        row = list(row)
        missing = length - len(row)
        if missing < 0:
            raise ValueError(
                "row of length %d exceeds target length %d" % (len(row), length)
            )
        padded_rows.append(row + [pad_value] * missing)
    return np.array(padded_rows)
def _split_bounds(count):
    """Return the (val_end, test_end) cut points of the 0.1/0.1/0.8 split."""
    return count // 10, 2 * count // 10


def main():
    """Turn the negative/positive review files into padded index matrices.

    Reads 'reviews/rt-polarity.neg' and 'reviews/rt-polarity.pos', cleans
    every line with ``clean_str``, maps each word to an integer index, splits
    each class into validation (first tenth), test (second tenth) and train
    (remainder), pads all rows to the longest review and writes shuffled
    ``<split>_X`` / ``<split>_Y`` datasets to 'reviews2.h5'.

    Labels are 1 for negative and 2 for positive reviews. Index 1 is the
    padding index, so real words are numbered from 2.
    """
    words = set()
    dataset = []  # dataset[0] holds negative lines, dataset[1] positive ones
    for filename in ['reviews/rt-polarity.neg', 'reviews/rt-polarity.pos']:
        # should be simple plain text file
        # NOTE(review): the platform default text encoding is used here;
        # confirm the corpus decodes cleanly when run under Python 3.
        with open(filename, 'r') as handle:
            rawdata = handle.read()
        cleaned = [clean_str(line) for line in rawdata.split('\n')]
        # Drop lines with no tokens (e.g. the empty string after a trailing
        # newline); they would otherwise become all-padding samples.
        lines = [line for line in cleaned if line]
        # Collect the vocabulary from the very lines that get encoded below,
        # so every lookup in word_to_idx is guaranteed to succeed.
        for line in lines:
            words.update(line.split())
        dataset.append(lines)

    data_size, vocab_size = len(dataset[0]) + len(dataset[1]), len(words)
    print('data has %d lines, %d unique words.' % (data_size, vocab_size))

    # Index 1 is what fill_and_zero pads with by default, so it is reserved
    # for the empty token. Sorting makes the mapping reproducible across runs.
    word_to_idx = {word: i + 2 for i, word in enumerate(sorted(words))}
    idx_to_word = {i: word for word, i in word_to_idx.items()}
    idx_to_word[1] = ""
    # NOTE: word_to_idx / idx_to_word are built here but not written to the
    # HDF5 file.

    # let's just assume for now that 0.1/0.1/0.8 is a good split for validation
    split_names = ('train', 'test', 'val')
    data = {name: {'X': [], 'Y': []} for name in split_names}
    for label, lines in enumerate(dataset, start=1):
        val_end, test_end = _split_bounds(len(lines))
        parts = (
            ('val', lines[:val_end]),
            ('test', lines[val_end:test_end]),
            ('train', lines[test_end:]),
        )
        for name, part in parts:
            for line in part:
                data[name]['X'].append(
                    [word_to_idx[word] for word in line.split()])
                data[name]['Y'].append(label)

    # Pad every split to the longest review found in any of them.
    lenmax = max(
        len(row) for name in split_names for row in data[name]['X'])

    with h5py.File('reviews2.h5', 'w') as f:
        for name in split_names:
            X = fill_and_zero(data[name]['X'], lenmax)
            Y = np.array(data[name]['Y'], dtype=np.float64)
            # replace=False makes resample a joint random permutation of X
            # and Y; its default (replace=True) would draw a bootstrap sample,
            # duplicating some reviews and dropping others.
            f[name + '_X'], f[name + '_Y'] = resample(X, Y, replace=False)


if __name__ == '__main__':
    main()
