This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import unicode_literals | |
import re, string, nltk | |
from nltk.corpus.reader.wordnet import NOUN | |
from nltk.corpus import wordnet | |
from nltk import word_tokenize | |
from nltk.corpus import stopwords | |
from nltk.stem import WordNetLemmatizer | |
# English stop words from the NLTK corpus, held in a set for fast membership tests.
stop = set(stopwords.words('english'))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn import metrics | |
from sklearn.metrics import accuracy_score | |
from sklearn.metrics import precision_recall_fscore_support | |
def flatten(lst):
    """Yield the items of *lst*, expanding nested lists by one level.

    An element that is a ``list`` is replaced by its own items; every other
    element is yielded unchanged. Only one level is expanded, so a list
    nested inside an expanded list comes out as a list, and non-list
    iterables such as tuples are never expanded.
    """
    for el in lst:
        if isinstance(el, list):
            yield from el
        else:
            yield el
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def flatten(lst):
    """Generate a one-level-flattened view of *lst*.

    ``list`` elements contribute their items individually; anything else
    (scalars, strings, tuples, ...) is passed through as a single item.
    Nesting deeper than one level is left intact.
    """
    for el in lst:
        if isinstance(el, list):
            yield from el
        else:
            yield el
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def trainClassifier(trainData):
    """Train a linear SVM on *trainData* and return the trained classifier.

    A scikit-learn ``Pipeline`` with a single ``LinearSVC`` step is wrapped
    in NLTK's ``SklearnClassifier``; the value returned is whatever its
    ``train`` call returns.

    Args:
        trainData: labelled feature sets handed straight to
            ``SklearnClassifier.train`` (presumably ``(feature_dict, label)``
            pairs -- confirm against the caller).
    """
    print("Training Classifier...")
    pipeline = Pipeline([('svc', LinearSVC())])
    return SklearnClassifier(pipeline).train(trainData)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def crossValidate(dataset, folds):
    """Shuffle *dataset* in place and split it into train/validation folds.

    Args:
        dataset: a list of samples; it is shuffled in place.
        folds: requested number of folds, used to derive the fold size.

    The fold size is ``len(dataset) / folds`` truncated to an integer and
    clamped to at least 1. When the length is not a multiple of the fold
    size, the final slice is a shorter extra fold.
    """
    shuffle(dataset)
    predictions = []
    ground_truth = []
    # range() rejects a step of 0, which is what the plain truncated
    # division produced for an empty dataset or when len(dataset) < folds.
    foldSize = max(1, int(len(dataset) / folds))
    for i in range(0, len(dataset), foldSize):
        # Everything outside the current window trains; the window validates.
        trainFolds = dataset[:i] + dataset[i + foldSize:]
        validationFold = dataset[i:i + foldSize]
        # NOTE(review): the visible source stops here. Presumably each fold is
        # trained and scored, filling `predictions` and `ground_truth` --
        # restore that part from the full file before relying on this function.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def predictLabels(reviewSamples, classifier):
    """Return the classifier's predictions for every sample in *reviewSamples*.

    Args:
        reviewSamples: iterable of indexable samples whose element at
            index 1 is the raw review text.
        classifier: object exposing ``classify_many``.

    Each text goes through ``preProcess`` and then ``toFeatureVector``; the
    resulting feature sets are passed to ``classifier.classify_many`` in one
    call and its result is returned.
    """
    featureSets = [
        toFeatureVector(preProcess(sample[1])) for sample in reviewSamples
    ]
    return classifier.classify_many(featureSets)
def predictLabel(reviewSample, classifier):
    """Return the classifier's prediction for a single review.

    Args:
        reviewSample: the value handed directly to ``preProcess`` (the raw
            review text itself, not a tuple containing it).
        classifier: object exposing ``classify``.
    """
    return classifier.classify(toFeatureVector(preProcess(reviewSample)))
def trainClassifier(trainData):
    """Fit a ``LinearSVC`` on *trainData* and return the trained classifier.

    The SVM is the only step (named ``'svc'``) of a scikit-learn
    ``Pipeline`` that NLTK's ``SklearnClassifier`` wraps; the result of
    ``SklearnClassifier.train`` is returned.

    Args:
        trainData: labelled feature sets forwarded unchanged to
            ``SklearnClassifier.train`` (presumably ``(feature_dict, label)``
            pairs -- confirm against the caller).
    """
    print("Training Classifier...")
    pipeline = Pipeline([('svc', LinearSVC())])
    return SklearnClassifier(pipeline).train(trainData)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# load data from a file and append it to the rawData
def loadData(path, Text=None):
    """Read a tab-separated review file into the module-level data lists.

    The first row is treated as a header and skipped. Each remaining row is
    parsed by ``parseReview`` into ``(Id, Text, Label)``. That triple is
    appended to ``rawData``, and ``(Id, feature_vector, Label)`` is appended
    to ``preprocessedData``, where the feature vector is
    ``toFeatureVector(preProcess(Text))``.

    Args:
        path: location of the tab-separated file.
        Text: unused; kept so existing call sites keep working.
    """
    # The csv module asks for newline='' so that line breaks embedded in
    # quoted fields are handled by the reader rather than the file object.
    with open(path, newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader, None)  # skip the headers
        for line in reader:
            reviewId, reviewText, label = parseReview(line)
            rawData.append((reviewId, reviewText, label))
            preprocessedData.append(
                (reviewId, toFeatureVector(preProcess(reviewText)), label)
            )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
featureDict = {}  # A global dictionary of features


def toFeatureVector(tokens):
    """Return a ``{token: count}`` feature dictionary for one token list.

    Side effect: every token also increments its running total in the
    module-level ``featureDict``, which accumulates across calls.

    Args:
        tokens: iterable of hashable tokens.

    Returns:
        A new dict mapping each distinct token to how many times it occurs
        in *tokens*.

    NOTE(review): the visible source created ``v`` but never filled or
    returned it; raw in-document counts are used as the weights here --
    confirm this matches the intended weighting scheme.
    """
    v = {}
    for t in tokens:
        featureDict[t] = featureDict.get(t, 0) + 1
        v[t] = v.get(t, 0) + 1
    return v
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Input: a string of one review | |
import re | |
from nltk import word_tokenize | |
def preProcess(text):
    """Normalise one review string and return its list of tokens.

    Steps, in order: insert a space between a word character and an
    adjacent punctuation mark, lower-case the text, then tokenise it with
    NLTK's ``word_tokenize``.

    Args:
        text: the review as a single string.

    Returns:
        The list of tokens produced by ``word_tokenize``.
    """
    # Word character followed by punctuation / closing quote or bracket.
    text = re.sub(r"(\w)([.,;:!?'\"”\)])", r"\1 \2", text)
    # Punctuation / opening quote or bracket followed by a word character.
    text = re.sub(r"([.,;:!?'\"“\(])(\w)", r"\1 \2", text)
    text = text.lower()
    tokens = word_tokenize(text)
    # Callers feed the result straight into toFeatureVector, so the tokens
    # must actually be returned.
    return tokens
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
from sklearn.svm import LinearSVC | |
from nltk.classify import SklearnClassifier | |
from random import shuffle | |
from sklearn.pipeline import Pipeline |