reviews_train = []
for line in open('../data/movie_data/full_train.txt', 'r'):
    reviews_train.append(line.strip())

reviews_test = []
for line in open('../data/movie_data/full_test.txt', 'r'):
    reviews_test.append(line.strip())
#!/bin/bash
# unzip and unpack the tar file
gunzip -c aclImdb_v1.tar.gz | tar xopf -
cd aclImdb && mkdir movie_data
# puts four files in the movie_data directory:
# full_train.txt, full_test.txt, original_train_ratings.txt, and original_test_ratings.txt
for split in train test; do
    for sentiment in pos neg; do
        for file in $split/$sentiment/*.txt; do  # review files are named <id>_<rating>.txt
            cat "$file" >> movie_data/full_${split}.txt && echo >> movie_data/full_${split}.txt
            basename "$file" .txt | cut -d_ -f2 >> movie_data/original_${split}_ratings.txt
        done
    done
done
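# Usage note (not part of the original script): assumes aclImdb_v1.tar.gz has already
# been downloaded, e.g. from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz,
# and that the script is run from the directory containing the archive.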

import re

# strip punctuation entirely and replace HTML line breaks, hyphens, and slashes with spaces
REPLACE_NO_SPACE = re.compile(r"[.;:!'?,\"()\[\]]")
REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(-)|(/)")

def preprocess_reviews(reviews):
    reviews = [REPLACE_NO_SPACE.sub("", line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(" ", line) for line in reviews]
    return reviews

reviews_train_clean = preprocess_reviews(reviews_train)
reviews_test_clean = preprocess_reviews(reviews_test)
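
# quick sanity check on a made-up review (illustrative example, not from the original gist)
print(preprocess_reviews(["This movie was great!<br /><br />Loved it."]))
# ['this movie was great loved it']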

from sklearn.feature_extraction.text import CountVectorizer

# binary=True gives 0/1 word-presence features rather than raw counts
cv = CountVectorizer(binary=True)
cv.fit(reviews_train_clean)
X = cv.transform(reviews_train_clean)
X_test = cv.transform(reviews_test_clean)
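# sanity check (illustrative): rows are reviews, columns are vocabulary terms
print(X.shape, X_test.shape)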

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# the first 12,500 reviews in each file are positive, the last 12,500 negative
target = [1 if i < 12500 else 0 for i in range(25000)]

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size=0.75
)
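# The held-out split above is presumably meant for tuning the regularization
# strength C; a minimal sketch of that sweep (assumed values, not from the gist):
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print("Accuracy for C=%s: %s" % (c, accuracy_score(y_val, lr.predict(X_val))))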

# retrain on the full training set with the chosen C and score on the held-out test set
final_model = LogisticRegression(C=0.05)
final_model.fit(X, target)
print("Final Accuracy: %s"
      % accuracy_score(target, final_model.predict(X_test)))
# Final Accuracy: 0.88128
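
# Illustrative usage (not in the original gist): score a new, made-up review
# with the fitted vectorizer and model; 1 means positive, 0 means negative.
new_review = ["What a wonderful, heartfelt film. I loved every minute of it."]
print(final_model.predict(cv.transform(preprocess_reviews(new_review))))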

# map each vocabulary term to its learned coefficient
# (get_feature_names_out() in scikit-learn >= 1.0; older versions used get_feature_names())
feature_to_coef = {
    word: coef for word, coef in zip(
        cv.get_feature_names_out(), final_model.coef_[0]
    )
}

# five words with the largest positive coefficients
for best_positive in sorted(
        feature_to_coef.items(),
        key=lambda x: x[1],
        reverse=True)[:5]:
    print(best_positive)
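
# the same idea for the most negative words (a natural counterpart, sorted ascending)
for best_negative in sorted(
        feature_to_coef.items(),
        key=lambda x: x[1])[:5]:
    print(best_negative)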

def get_stemmed_text(corpus):
    # reduce each word to its Porter stem, e.g. "loved" -> "love"
    from nltk.stem.porter import PorterStemmer
    stemmer = PorterStemmer()
    return [' '.join([stemmer.stem(word) for word in review.split()]) for review in corpus]

stemmed_reviews = get_stemmed_text(reviews_train_clean)
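# the test set would presumably be transformed the same way before vectorizing
# (assumption, not shown in the original gist)
stemmed_reviews_test = get_stemmed_text(reviews_test_clean)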

def get_lemmatized_text(corpus):
    # lemmatize each word with WordNet (requires nltk.download('wordnet') once)
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    return [' '.join([lemmatizer.lemmatize(word) for word in review.split()]) for review in corpus]

lemmatized_reviews = get_lemmatized_text(reviews_train_clean)

from nltk.corpus import stopwords

# requires nltk.download('stopwords') once
english_stop_words = stopwords.words('english')

def remove_stop_words(corpus):
    removed_stop_words = []
    for review in corpus:
        removed_stop_words.append(
            ' '.join([word for word in review.split()
                      if word not in english_stop_words])
        )
    return removed_stop_words

no_stop_words = remove_stop_words(reviews_train_clean)
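
# Alternative approach (not from the original gist): scikit-learn can drop English
# stop words at vectorization time instead of pre-filtering the text.
cv_no_stop = CountVectorizer(binary=True, stop_words='english')
cv_no_stop.fit(reviews_train_clean)
X_no_stop = cv_no_stop.transform(reviews_train_clean)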