Skip to content

Instantly share code, notes, and snippets.

View aaronkub's full-sized avatar

Aaron Kub aaronkub

View GitHub Profile
# Fit the final logistic-regression classifier (C=0.05) on the whole
# vectorised training matrix X; fit() returns the estimator itself.
final_model = LogisticRegression(C=0.05).fit(X, target)

# Score predictions for X_test against the same `target` list used for
# training -- NOTE(review): this presumes the test reviews share the
# training label layout; confirm against the data files.
print("Final Accuracy: %s" % accuracy_score(target, final_model.predict(X_test)))
# Final Accuracy: 0.88128
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
# Labels for 25000 reviews: the first 12500 entries are 1, the remaining 12500 are 0.
target = [1] * 12500 + [0] * 12500
# Hold out a validation set: 75% of the rows go to the training split,
# the rest to the validation split. No random_state is passed, so the
# partition differs from run to run.
X_train, X_val, y_train, y_val = train_test_split(X, target, train_size=0.75)
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the cleaned training reviews only; fit()
# returns the vectorizer itself. binary=True asks for presence/absence
# indicators instead of raw term counts.
cv = CountVectorizer(binary=True).fit(reviews_train_clean)

# Encode both corpora with the vocabulary learned above.
X = cv.transform(reviews_train_clean)
X_test = cv.transform(reviews_test_clean)
import re
# Raw strings keep the regex escapes (\[, \s, \-, \/) intact without
# relying on Python passing unknown string escapes through, which emits
# a SyntaxWarning on recent interpreters.

# Punctuation characters that are deleted outright.
REPLACE_NO_SPACE = re.compile(r"""[.;:!'?,"()\[\]]""")
# Double HTML line breaks, hyphens and slashes that are turned into a space.
REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")


def preprocess_reviews(reviews):
    """Return lower-cased, punctuation-stripped copies of *reviews*.

    Each review is lower-cased and has every REPLACE_NO_SPACE match
    removed; afterwards every REPLACE_WITH_SPACE match is replaced by a
    single space.

    Args:
        reviews: iterable of review strings.

    Returns:
        A new list of cleaned strings, in the same order as the input.
    """
    # Pass 1: lower-case and drop punctuation.
    reviews = [REPLACE_NO_SPACE.sub("", line.lower()) for line in reviews]
    # Pass 2: separators become a space so adjacent words do not fuse.
    reviews = [REPLACE_WITH_SPACE.sub(" ", line) for line in reviews]
    return reviews
#!/bin/bash
# Unpack aclImdb_v1.tar.gz and create a movie_data directory inside aclImdb.
# unzip and unpack the tar file
# (gunzip -c writes the decompressed stream to stdout; tar reads it from stdin via "-")
gunzip -c aclImdb_v1.tar.gz | tar xopf -
# mkdir only runs if the cd succeeds.
# NOTE(review): presumes the archive unpacks into ./aclImdb -- confirm.
cd aclImdb && mkdir movie_data
# puts four files in the movie_data directory:
# full_train.txt, full_test.txt, original_train_ratings.txt, and original_test_ratings.txt
for split in train test;
# Read the training and test reviews, one review per line, with the
# surrounding whitespace (including the trailing newline) stripped.
# The `with` blocks close each file as soon as it has been read; the
# bare open() in a for-loop header left the handles open.
with open('../data/movie_data/full_train.txt', 'r') as train_file:
    reviews_train = [line.strip() for line in train_file]

with open('../data/movie_data/full_test.txt', 'r') as test_file:
    reviews_test = [line.strip() for line in test_file]