# Skip to content
#
# Instantly share code, notes, and snippets.
from __future__ import unicode_literals
import re, string, nltk
from nltk.corpus.reader.wordnet import NOUN
from nltk.corpus import wordnet
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
stop = set(stopwords.words('english'))
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
def flatten(lst):
    """Yield the items of *lst*, expanding list items by one level.

    Non-list items are yielded unchanged; an item that is itself a
    list has its elements yielded individually (one level deep only).
    """
    for item in lst:
        if not isinstance(item, list):
            yield item
        else:
            for sub in item:
                yield sub
def flatten(lst):
    """Recursively flatten arbitrarily nested lists, yielding leaf items.

    Generalization: the previous version only expanded one level of
    nesting, so ``[1, [2, [3]]]`` yielded ``1, 2, [3]``.  Recursing on
    list elements flattens to any depth while producing identical
    results for inputs nested at most one level deep.
    """
    for el in lst:
        if isinstance(el, list):
            # Recurse instead of `yield from el` so deeper nesting is
            # flattened too, not passed through as list objects.
            yield from flatten(el)
        else:
            yield el
def trainClassifier(trainData):
    """Fit a linear SVM on *trainData* and return the trained classifier.

    Wraps a LinearSVC in a scikit-learn Pipeline and adapts it to the
    NLTK classifier API via SklearnClassifier.
    """
    print("Training Classifier...")
    svm_pipeline = Pipeline([('svc', LinearSVC())])
    classifier = SklearnClassifier(svm_pipeline)
    return classifier.train(trainData)
def crossValidate(dataset, folds):
    """Run *folds*-fold cross-validation over *dataset*.

    Bug fix: the original built the train/validation splits on every
    iteration but never trained a classifier, never predicted, left
    ``predictions``/``ground_truth`` empty, and returned None.  Each
    fold now trains on the remaining data, predicts the held-out fold,
    and the aggregated weighted (precision, recall, f1, accuracy) over
    all folds is returned.

    dataset: list of review samples; text at index 1 (see predictLabels).
    folds:   number of cross-validation folds (must be <= len(dataset)).
    """
    shuffle(dataset)  # in-place shuffle so folds are randomized
    predictions = []
    ground_truth = []
    foldSize = int(len(dataset) / folds)
    if foldSize == 0:
        # range() with step 0 would raise anyway; fail with a clear message.
        raise ValueError("folds must not exceed the dataset size")
    for i in range(0, len(dataset), foldSize):
        trainFolds = dataset[:i] + dataset[i + foldSize:]
        validationFold = dataset[i:i + foldSize]
        classifier = trainClassifier(trainFolds)
        predictions.extend(predictLabels(validationFold, classifier))
        # NOTE(review): assumes the gold label is the last element of each
        # sample tuple (matches loadData's (Id, Text, Label)) — confirm.
        ground_truth.extend(sample[-1] for sample in validationFold)
    precision, recall, f1, _ = precision_recall_fscore_support(
        ground_truth, predictions, average='weighted')
    accuracy = accuracy_score(ground_truth, predictions)
    return precision, recall, f1, accuracy
def predictLabels(reviewSamples, classifier):
    """Classify a batch of samples; each sample's raw text is at index 1."""
    featureVectors = [toFeatureVector(preProcess(sample[1]))
                      for sample in reviewSamples]
    return classifier.classify_many(featureVectors)
def predictLabel(reviewSample, classifier):
    """Classify one raw review string and return its predicted label."""
    features = toFeatureVector(preProcess(reviewSample))
    return classifier.classify(features)
def trainClassifier(trainData):
    """Train an SVM-based NLTK classifier on *trainData* and return it."""
    print("Training Classifier...")
    model = SklearnClassifier(Pipeline([('svc', LinearSVC())]))
    return model.train(trainData)
# load data from a file and append it to the rawData
def loadData(path, Text=None):
    """Read the tab-separated review file at *path*.

    Each parsed row is appended to the module-level ``rawData`` list as
    (Id, Text, Label), and its feature vector to ``preprocessedData``.
    """
    with open(path) as fh:
        rows = csv.reader(fh, delimiter='\t')
        next(rows, None)  # discard the header row
        for row in rows:
            Id, Text, Label = parseReview(row)
            rawData.append((Id, Text, Label))
            features = toFeatureVector(preProcess(Text))
            preprocessedData.append((Id, features, Label))
featureDict = {}  # A global dictionary of features (token -> corpus-wide count)


def toFeatureVector(tokens):
    """Return a dict mapping each token in *tokens* to its count.

    Bug fix: the original created an empty local dict ``v``, never
    filled it, mutated only the global ``featureDict``, and fell off
    the end returning None — despite its own comment promising a
    feature->weight dict.  Per-review counts are now accumulated in
    ``v`` and returned, while ``featureDict`` still records
    corpus-wide token frequencies as a side effect.
    """
    v = {}
    for t in tokens:
        v[t] = v.get(t, 0) + 1
        featureDict[t] = featureDict.get(t, 0) + 1
    return v
# Input: a string of one review
import re
from nltk import word_tokenize


def preProcess(text):
    """Lower-case *text*, pad punctuation with spaces, and tokenize it.

    Bug fix: the original computed ``tokens`` but had no return
    statement, so every caller received None.

    Returns: list of lower-cased token strings.
    """
    # Insert a space between a word character and trailing punctuation
    # so the tokenizer splits them cleanly.
    text = re.sub(r"(\w)([.,;:!?'\"”\)])", r"\1 \2", text)
    # Likewise between leading punctuation and a word character.
    text = re.sub(r"([.,;:!?'\"“\(])(\w)", r"\1 \2", text)
    text = text.lower()
    tokens = word_tokenize(text)
    return tokens
import csv
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from random import shuffle
from sklearn.pipeline import Pipeline