Created
June 14, 2015 22:16
-
-
Save Andrew62/86959d6549f12f97cc2f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Walkthrough on Kaggle | |
See https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-1-for-beginners-bag-of-words | |
for more information. | |
""" | |
import re
import time

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
# Matches every character that is not an ASCII letter.  The upper bound of the
# second range must be "Z": the range "A-z" would also admit the punctuation
# characters that sit between "Z" and "a" in ASCII ([ \ ] ^ _ `).
_NON_LETTERS = re.compile(r"[^a-zA-Z]")

# Filled on first use so the NLTK corpus is read once per process instead of
# once per review.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them once."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE["english"]


def review_to_words(raw_review):
    """Convert one raw review into a space-separated string of cleaned words.

    Steps: strip HTML markup, replace every non-letter character with a
    space, lower-case, split on whitespace and drop English stop words.

    Args:
        raw_review: Review text, possibly containing HTML markup.

    Returns:
        The remaining words joined by single spaces (an empty string when
        nothing survives the filtering).
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()
    letters_only = _NON_LETTERS.sub(" ", review_text)
    stops = _english_stopwords()
    return " ".join(
        word for word in letters_only.lower().split() if word not in stops
    )
def clean_all_reviews(reviews):
    """Clean every review with ``review_to_words``.

    Args:
        reviews: Iterable of raw review strings (for example a pandas Series
            or a list).

    Returns:
        A list of cleaned review strings, in the same order as ``reviews``.
    """
    print("Cleaning all reviews...\n")
    # Iterate over the values themselves rather than looking each one up by
    # integer key: keyed lookup on a Series is label-based and fails when the
    # index is not 0..n-1, and plain sequences have no ``.shape``.
    return [review_to_words(raw_review) for raw_review in reviews]
def vectorize_words(words):
    """Build bag-of-words count features from cleaned review strings.

    Args:
        words: Iterable of documents, each a single space-separated string.

    Returns:
        A ``(vectorizer, features)`` tuple: the fitted ``CountVectorizer``
        and the document-term counts converted to a dense array.
    """
    print("Vectorizing words...\n")
    # The vocabulary is capped at 5000 terms.
    vectorizer = CountVectorizer(analyzer="word", max_features=5000)
    data_features = vectorizer.fit_transform(words)
    # fit_transform returns a sparse matrix; callers expect a dense array.
    return vectorizer, data_features.toarray()
def randon_forest_sentiment(train_data_feats, sentiment):
    """Fit a random forest classifier on the given features and labels.

    The misspelled function name is kept as-is because other code in this
    file calls it by that name.

    Args:
        train_data_feats: Feature matrix, one row per training sample.
        sentiment: Target labels, one per row of ``train_data_feats``.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    print("Training random forest classifier...\n")
    # 100 trees; n_jobs=-1 lets scikit-learn use every available core.
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    forest.fit(train_data_feats, sentiment)
    return forest
def validate(reviews, sentiment, n_validations=5):
    """Estimate classifier accuracy with stratified k-fold cross validation.

    A fresh random forest is trained on each training split and scored on
    the matching held-out split. Progress is printed after every fold and
    the mean accuracy (rounded to two decimals) is printed at the end.

    Args:
        reviews: Feature matrix supporting positional row selection with an
            index array (such as the dense array from ``vectorize_words``).
        sentiment: Labels, one per row of ``reviews``.
        n_validations: Number of folds.

    Returns:
        The unrounded mean accuracy across all folds.
    """
    print("Cross validating...\n")
    # The fold indices are positional.  Converting the labels to an ndarray
    # makes ``labels[train_index]`` positional as well; indexing a pandas
    # Series that way is label-based and only works for a default index.
    labels = np.asarray(sentiment)
    scores = []
    # NOTE: this is the legacy ``sklearn.cross_validation`` signature (labels
    # passed to the constructor, the object itself iterated), matching the
    # import used by this file.
    folds = StratifiedKFold(labels, n_folds=n_validations, shuffle=True)
    for fold_number, (train_index, test_index) in enumerate(folds, 1):
        feat_train, feat_test = reviews[train_index], reviews[test_index]
        sent_train, sent_test = labels[train_index], labels[test_index]
        forest = randon_forest_sentiment(feat_train, sent_train)
        predicted = forest.predict(feat_test)
        scores.append(accuracy_score(sent_test, predicted))
        # float() keeps the percentage correct under Python 2 integer division.
        print("{0}%".format(round(float(fold_number) / n_validations * 100, 2)))
    mean_accuracy = sum(scores) / float(len(scores))
    print("Completed cross validation.\nAccuracy: {0}".format(round(mean_accuracy, 2)))
    return mean_accuracy
if __name__ == "__main__":
    # Time the whole pipeline: load, clean, vectorize, cross-validate.
    start = time.time()
    # Path is relative to the current working directory.
    in_file = "labeledTrainData.tsv/labeledTrainData.tsv"
    # quoting=3 is csv.QUOTE_NONE, so quote characters inside the review text
    # are treated as ordinary characters.
    train = pd.read_csv(in_file, header=0, quoting=3, delimiter='\t')
    cleaned_reviews = clean_all_reviews(train.review)
    # Only the feature matrix is needed here; the fitted vectorizer is dropped.
    _, vectorized_reviews = vectorize_words(cleaned_reviews)
    validate(vectorized_reviews, train.sentiment)
    print("Completed in {0} seconds.".format(round(time.time() - start, 2)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment