Skip to content

Instantly share code, notes, and snippets.

@lazuxd
Last active February 7, 2020 18:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lazuxd/ee16a420ee8ccc8542244c44e6b537fc to your computer and use it in GitHub Desktop.
Save lazuxd/ee16a420ee8ccc8542244c44e6b537fc to your computer and use it in GitHub Desktop.
Building a Sentiment Classifier using Scikit-Learn
import re
from os import listdir, makedirs, system
from os.path import isfile, join
from random import shuffle

import pandas as pd
# Fetch and unpack the IMDB review archive.
# The download is skipped when the archive is already present: wget would
# otherwise store a duplicate under a new name (e.g. 'aclImdb_v1.tar.gz.1')
# while tar keeps extracting the original file.
# NOTE: if a previous download was interrupted, delete the truncated archive
# by hand before re-running.
if not isfile('aclImdb_v1.tar.gz'):
    if system('wget "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"') != 0:
        raise RuntimeError('Failed to download aclImdb_v1.tar.gz')
# os.system reports failure only through its return value, so check it
# instead of carrying on with a missing or partial dataset.
if system('tar -xzf "aclImdb_v1.tar.gz"') != 0:
    raise RuntimeError('Failed to extract aclImdb_v1.tar.gz')
def create_data_frame(folder: str) -> pd.DataFrame:
    '''
    Builds a shuffled DataFrame of labelled reviews from one dataset split.

    folder - the root folder of train or test dataset; it must contain
             'pos' and 'neg' sub-folders with one review per file
    Returns: a DataFrame with the combined data from the input folder,
             with columns 'text' (review text, runs of <br> tags replaced
             by a single space) and 'label' (1 = positive, 0 = negative).
             The frame is empty, but still has both columns, when the
             sub-folders contain no files.
    Raises: FileNotFoundError if a 'pos' or 'neg' sub-folder is missing
    '''
    pos_folder = f'{folder}/pos'  # positive reviews
    neg_folder = f'{folder}/neg'  # negative reviews

    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    # Compiled once because it is applied to every review.
    line_breaks = re.compile(r'(<br\s*/?>)+')

    def get_files(fld: str) -> list:
        '''
        fld - positive or negative reviews folder
        Returns: a list with all files in input folder
        '''
        return [join(fld, f) for f in listdir(fld) if isfile(join(fld, f))]

    def append_files_data(data_list: list, files: list, label: int) -> None:
        '''
        Appends to 'data_list' tuples of form (file content, label)
        for each file in 'files' input list
        '''
        for file_path in files:
            # Explicit encoding: without it open() uses the platform default,
            # which differs between systems. The reviews are assumed to be
            # UTF-8 encoded -- TODO confirm against the dataset README.
            with open(file_path, 'r', encoding='utf-8') as f:
                data_list.append((f.read(), label))

    data_list = []
    append_files_data(data_list, get_files(pos_folder), 1)
    append_files_data(data_list, get_files(neg_folder), 0)

    # zip(*[]) yields nothing to unpack, so handle the empty case explicitly.
    if not data_list:
        return pd.DataFrame({'text': [], 'label': []})

    # Mix positive and negative samples so they are not grouped by class.
    shuffle(data_list)
    text, label = zip(*data_list)

    # replacing line breaks with spaces
    text = [line_breaks.sub(' ', txt) for txt in text]
    return pd.DataFrame({'text': text, 'label': label})
# Build the train and test frames from the extracted dataset folders.
imdb_train = create_data_frame('aclImdb/train')
imdb_test = create_data_frame('aclImdb/test')

# Cache both frames as CSV. makedirs(..., exist_ok=True) replaces the shell
# 'mkdir' call: it does not depend on the shell's quoting rules and does not
# fail when the folder already exists from a previous run.
makedirs('csv', exist_ok=True)
imdb_train.to_csv('csv/imdb_train.csv', index=False)
imdb_test.to_csv('csv/imdb_test.csv', index=False)

# To reuse the cached CSV files instead of rebuilding the frames, load them
# with the two lines below:
# imdb_train = pd.read_csv('csv/imdb_train.csv')
# imdb_test = pd.read_csv('csv/imdb_test.csv')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment