Skip to content

Instantly share code, notes, and snippets.

@Andrew62
Created June 14, 2015 22:16
Show Gist options
  • Save Andrew62/86959d6549f12f97cc2f to your computer and use it in GitHub Desktop.
Save Andrew62/86959d6549f12f97cc2f to your computer and use it in GitHub Desktop.
"""
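Bag-of-words sentiment classification, following the Kaggle walkthrough.

Cleans the labeled reviews, turns them into word-count features, and
cross-validates a random forest classifier on them.
See https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-1-for-beginners-bag-of-words
for more information.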
"""
import re
import time
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
# Matches every character that is not an ASCII letter. The range must be
# "A-Z": "A-z" would also cover the punctuation between 'Z' and 'a'
# ([ \ ] ^ _ `) and let it through the filter.
_NON_LETTER_RE = re.compile(r"[^a-zA-Z]")

# English stop words, loaded from NLTK on first use and reused afterwards so
# the corpus is not re-read for every review.
_STOPWORD_CACHE = {}


def review_to_words(raw_review):
    """Convert one raw review into a string of meaningful lower-case words.

    The text is extracted with BeautifulSoup, every non-letter character is
    replaced by a space, the result is lower-cased and split on whitespace,
    and NLTK's English stop words are removed.

    Args:
        raw_review: The review text as read from the data file.

    Returns:
        The remaining words joined by single spaces.
    """
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words('english'))
    stops = _STOPWORD_CACHE["english"]

    review_text = BeautifulSoup(raw_review).get_text()
    letters_only = _NON_LETTER_RE.sub(" ", review_text)
    words = letters_only.lower().split()
    return " ".join(w for w in words if w not in stops)
def clean_all_reviews(reviews):
    """Clean every review with review_to_words, preserving input order.

    Args:
        reviews: Iterable of raw review strings (for example a pandas
            Series column).

    Returns:
        A list holding one cleaned string per input review.
    """
    # A parenthesized single-argument print behaves the same on Python 2 and 3.
    print("Cleaning all reviews...\n")
    # Iterate over the values directly: reviews[i] is a label lookup on a
    # pandas Series and fails when the index is not exactly 0..n-1.
    return [review_to_words(review) for review in reviews]
def vectorize_words(words):
    """Build bag-of-words count features for the given documents.

    Fits a CountVectorizer (word analyzer, at most 5000 features) on
    ``words``.

    Args:
        words: The documents to vectorize, as accepted by
            ``CountVectorizer.fit_transform``.

    Returns:
        A tuple of the fitted vectorizer and the result of calling
        ``toarray()`` on the transformed features.
    """
    print("Vectorizing words...\n")
    bag_of_words = CountVectorizer(max_features=5000, analyzer="word")
    counts = bag_of_words.fit_transform(words)
    dense_counts = counts.toarray()
    return bag_of_words, dense_counts
def randon_forest_sentiment(train_data_feats, sentiment):
    """Train a random forest on the given features and labels.

    Args:
        train_data_feats: Training feature matrix passed to ``fit``.
        sentiment: Target labels passed to ``fit``.

    Returns:
        The fitted RandomForestClassifier (100 estimators, n_jobs=-1).
    """
    print("Training random forest classifier...\n")
    classifier = RandomForestClassifier(n_jobs=-1, n_estimators=100)
    classifier.fit(train_data_feats, sentiment)
    return classifier
def validate(reviews, sentiment, n_validations=5):
    """Cross-validate the random forest with stratified, shuffled folds.

    For each fold a fresh classifier is trained via randon_forest_sentiment
    and scored with accuracy_score on the held-out part. Progress is printed
    as a percentage after every fold and the mean accuracy at the end.

    Args:
        reviews: Feature matrix that can be indexed by an array of row
            positions.
        sentiment: Labels, one per row of ``reviews``.
        n_validations: Number of folds.

    Returns:
        The mean accuracy over all folds, as a float.
    """
    # Local import so this function does not rely on a module-level numpy
    # import being present.
    import numpy as np

    print("Cross validating...\n")
    # StratifiedKFold yields positional indices. Indexing a pandas Series with
    # them is a label lookup, so work on a plain array instead.
    labels = np.asarray(sentiment)
    scores = []
    folds = StratifiedKFold(labels, n_folds=n_validations, shuffle=True)
    for fold_number, (train_index, test_index) in enumerate(folds, 1):
        feat_train, feat_test = reviews[train_index], reviews[test_index]
        sent_train, sent_test = labels[train_index], labels[test_index]
        forest = randon_forest_sentiment(feat_train, sent_train)
        predicted = forest.predict(feat_test)
        scores.append(accuracy_score(sent_test, predicted))
        print("{0}%".format(round(float(fold_number) / n_validations * 100, 2)))
    mean_accuracy = sum(scores) / float(len(scores))
    print("Completed cross validation.\nAccuracy: {0}".format(round(mean_accuracy, 2)))
    return mean_accuracy
if __name__ == "__main__":
    # Time the whole pipeline: load, clean, vectorize, cross-validate.
    started_at = time.time()

    # Tab-separated file with a header row; quoting=3 is csv.QUOTE_NONE.
    train = pd.read_csv(
        "labeledTrainData.tsv/labeledTrainData.tsv",
        header=0,
        quoting=3,
        delimiter='\t',
    )

    cleaned = clean_all_reviews(train.review)
    # Only the feature array is needed here; the fitted vectorizer is dropped.
    features = vectorize_words(cleaned)[1]
    validate(features, train.sentiment)

    elapsed = round(time.time() - started_at, 2)
    print("Completed in {0} seconds.".format(elapsed))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment