This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Unzip and unpack the tar file.
gunzip -c aclImdb_v1.tar.gz | tar xopf -
# Work inside the extracted dataset and create the output directory.
cd aclImdb && mkdir movie_data
# The loop that follows puts four files in the movie_data directory:
# full_train.txt, full_test.txt, original_train_ratings.txt, and original_test_ratings.txt
for split in train test; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the raw reviews, one review per line, with surrounding whitespace
# (including the trailing newline) stripped from each line.
# `with` guarantees each file handle is closed as soon as it has been read.
with open('../data/movie_data/full_train.txt', 'r') as train_file:
    reviews_train = [line.strip() for line in train_file]

with open('../data/movie_data/full_test.txt', 'r') as test_file:
    reviews_test = [line.strip() for line in test_file]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.feature_extraction.text import CountVectorizer

# binary=True makes every feature a 0/1 presence flag instead of a count.
cv = CountVectorizer(binary=True)
# The vocabulary is learned from the training reviews only; the test reviews
# are then encoded with that same vocabulary.
cv.fit(reviews_train_clean)
X = cv.transform(reviews_train_clean)
X_test = cv.transform(reviews_test_clean)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fit the final classifier on the full training matrix.
final_model = LogisticRegression(C=0.05)
final_model.fit(X, target)
# NOTE(review): `target` is reused here as the test labels. That is only valid
# if the test set has the same size and label ordering as the training set -
# confirm against how full_test.txt is assembled.
print("Final Accuracy: %s"
      % accuracy_score(target, final_model.predict(X_test)))
# Final Accuracy: 0.88128
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2 in
# favour of get_feature_names_out(); prefer the new method and fall back to
# the old one so the snippet runs on either side of that change.
_get_feature_names = (
    getattr(cv, "get_feature_names_out", None) or cv.get_feature_names
)

# Map every vocabulary word to the coefficient the model learned for it.
feature_to_coef = {
    word: coef for word, coef in zip(
        _get_feature_names(), final_model.coef_[0]
    )
}

# Print the five (word, coefficient) pairs with the largest coefficients.
for best_positive in sorted(
        feature_to_coef.items(),
        key=lambda x: x[1],
        reverse=True)[:5]:
    print(best_positive)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Labels for 25,000 reviews: the first 12,500 are labelled 1, the rest 0.
# NOTE(review): presumably the review files list all positive reviews before
# the negative ones - confirm against the script that builds them.
target = [1 if i < 12500 else 0 for i in range(25000)]

# Hold out 25% of the training data for validation. No random_state is
# passed, so the split differs from run to run.
X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size=0.75
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_lemmatized_text(corpus):
    """Return the reviews in *corpus* with every token lemmatized.

    Args:
        corpus: Iterable of review strings.

    Returns:
        A list with one string per review. Each review is split on
        whitespace, every token is passed through ``WordNetLemmatizer``,
        and the results are re-joined with single spaces.
    """
    # Imported inside the function, so NLTK is only required when it is called.
    from nltk.stem import WordNetLemmatizer

    lemmatizer = WordNetLemmatizer()
    return [
        ' '.join([lemmatizer.lemmatize(word) for word in review.split()])
        for review in corpus
    ]


lemmatized_reviews = get_lemmatized_text(reviews_train_clean)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from nltk.corpus import stopwords

# NLTK's English stop-word list, used by the stop-word filter that follows.
english_stop_words = stopwords.words('english')
def remove_stop_words(corpus): | |
removed_stop_words = [] | |
for review in corpus: | |
removed_stop_words.append( | |
' '.join([word for word in review.split() | |
if word not in english_stop_words]) | |
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# ngram_range=(1, 2) adds two-word sequences (bigrams) to the single-word
# features; binary=True keeps each feature a 0/1 presence flag.
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
# The vocabulary is learned from the training reviews only and reused for test.
ngram_vectorizer.fit(reviews_train_clean)
X = ngram_vectorizer.transform(reviews_train_clean)
X_test = ngram_vectorizer.transform(reviews_test_clean)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# binary=False keeps real word counts rather than 0/1 presence flags.
# NOTE(review): CountVectorizer is not imported in this snippet; it relies on
# an earlier import being in scope.
wc_vectorizer = CountVectorizer(binary=False)
# The vocabulary is learned from the training reviews only and reused for test.
wc_vectorizer.fit(reviews_train_clean)
X = wc_vectorizer.transform(reviews_train_clean)
X_test = wc_vectorizer.transform(reviews_test_clean)
X_train, X_val, y_train, y_val = train_test_split( |
OlderNewer