This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- python -*- | |
from xml.etree import cElementTree | |
import csv | |
# Copy/paste from the help section on SEDE | |
# http://data.stackexchange.com/stackoverflow/query/new | |
schema = [ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def parse_label(label):
    """Map a raw dataset label to a readable class name.

    Returns 'real' when ``label`` is exactly '__label2__'. Every other
    value, including unknown or malformed labels, maps to 'fake'.
    """
    return 'real' if label == '__label2__' else 'fake'
# Convert a line from the input file into an (id, text, label) tuple
def parseReview(reviewLine):
    """Extract the (id, text, label) triple from one parsed input row.

    ``reviewLine`` is an indexable row: column 0 is used as the id,
    column 1 as the raw label (translated by ``parse_label``) and
    column 8 as the review text.

    Returns a 3-tuple ``(id, text, label)``.
    """
    # NOTE(review): the original comment asks for an integer id, but the
    # field is returned exactly as read (a str when the row comes from
    # csv.reader). Confirm with callers before converting it with int().
    review_id = reviewLine[0]
    review_text = reviewLine[8]
    label = parse_label(reviewLine[1])
    return review_id, review_text, label
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
from sklearn.svm import LinearSVC | |
from nltk.classify import SklearnClassifier | |
from random import shuffle | |
from sklearn.pipeline import Pipeline |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Input: a string of one review | |
import re | |
from nltk import word_tokenize | |
def preProcess(text): | |
# Should return a list of tokens | |
text = re.sub(r"(\w)([.,;:!?'\"”\)])", r"\1 \2", text) | |
text = re.sub(r"([.,;:!?'\"“\(])(\w)", r"\1 \2", text) | |
text = text.lower() | |
tokens = word_tokenize(text) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
featureDict = {} # A global dictionary of features | |
def toFeatureVector(tokens): | |
# Should return a dictionary containing features as keys, and weights as values | |
v = {} | |
for t in tokens: | |
try: | |
featureDict[t] += 1 | |
except KeyError: | |
featureDict[t] = 1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load data from a file and append it to rawData and preprocessedData
def loadData(path, Text=None):
    """Read a tab-separated review file into the module-level data lists.

    The first row is treated as a header and skipped. For every remaining
    row, ``(id, text, label)`` is appended to ``rawData`` and
    ``(id, feature_vector, label)`` is appended to ``preprocessedData``.

    ``path`` is the file to read. ``Text`` is accepted for backward
    compatibility only; its value is never used.
    """
    # The csv module documents that files passed to csv.reader should be
    # opened with newline='' so that line breaks embedded in quoted fields
    # are handled by the reader rather than by universal-newline translation.
    with open(path, newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader, None)  # skip the headers
        for line in reader:
            # Distinct local names: the loop no longer rebinds the ``Text``
            # parameter.
            review_id, review_text, label = parseReview(line)
            rawData.append((review_id, review_text, label))
            features = toFeatureVector(preProcess(review_text))
            preprocessedData.append((review_id, features, label))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def predictLabels(reviewSamples, classifier):
    """Classify a batch of review samples.

    Each element of ``reviewSamples`` is indexed at position 1 to obtain
    the review text, which is run through ``preProcess`` and
    ``toFeatureVector``. Returns whatever
    ``classifier.classify_many`` returns for those feature vectors.
    """
    # Build a concrete list instead of passing a lazy map object:
    # classify_many is documented to take a list of featuresets.
    featuresets = [
        toFeatureVector(preProcess(sample[1])) for sample in reviewSamples
    ]
    return classifier.classify_many(featuresets)
def predictLabel(reviewSample, classifier):
    """Classify a single review.

    ``reviewSample`` is passed directly to ``preProcess``, so it is the
    review text itself rather than an (id, text, label) tuple. Returns
    the result of ``classifier.classify`` for its feature vector.
    """
    features = toFeatureVector(preProcess(reviewSample))
    return classifier.classify(features)
def trainClassifier(trainData):
    """Train a linear SVM on ``trainData`` and return the fitted classifier.

    A scikit-learn ``Pipeline`` with a single ``LinearSVC`` step (default
    parameters) is wrapped in NLTK's ``SklearnClassifier`` and trained on
    ``trainData``. The value returned is the result of ``train``.
    """
    # NOTE(review): presumably trainData is a list of
    # (feature_dict, label) pairs, as SklearnClassifier.train expects;
    # verify against the caller.
    print("Training Classifier...")
    pipeline = Pipeline([('svc', LinearSVC())])
    return SklearnClassifier(pipeline).train(trainData)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def crossValidate(dataset, folds): | |
shuffle(dataset) | |
predictions = [] | |
ground_truth = [] | |
foldSize = int(len(dataset)/folds) | |
for i in range(0,len(dataset), foldSize): | |
trainFolds = dataset[:i] + dataset[i+foldSize:] | |
validationFold = dataset[i: i+foldSize] | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def trainClassifier(trainData):
    """Train a linear SVM on ``trainData`` and return the fitted classifier.

    A scikit-learn ``Pipeline`` with a single ``LinearSVC`` step (default
    parameters) is wrapped in NLTK's ``SklearnClassifier`` and trained on
    ``trainData``. The value returned is the result of ``train``.
    """
    # NOTE(review): presumably trainData is a list of
    # (feature_dict, label) pairs, as SklearnClassifier.train expects;
    # verify against the caller.
    print("Training Classifier...")
    pipeline = Pipeline([('svc', LinearSVC())])
    return SklearnClassifier(pipeline).train(trainData)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def flatten(lst):
    """Yield the items of ``lst`` with one level of list nesting removed.

    Elements that are ``list`` instances are expanded in place; every
    other element (including tuples and strings) is yielded unchanged.
    Only one level is flattened, so a list nested inside a nested list
    is yielded as a list.
    """
    for el in lst:
        if isinstance(el, list):
            yield from el
        else:
            yield el
OlderNewer