Skip to content

Instantly share code, notes, and snippets.

@lievcin
lievcin / postsxml2csv.py
Created November 26, 2017 20:39 — forked from tripleee/postsxml2csv.py
postsxml2csv
#!/usr/bin/env python3
# -*- python -*-
from xml.etree import cElementTree
import csv
# Copy/paste from the help section on SEDE
# http://data.stackexchange.com/stackoverflow/query/new
schema = [
import csv
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from random import shuffle
from sklearn.pipeline import Pipeline
def parse_label(label):
    """Map a raw label string to a class name.

    Args:
        label: Raw label value read from the input file.

    Returns:
        ``'real'`` when *label* is exactly ``'__label2__'``; ``'fake'`` for
        every other value.
    """
    return 'real' if label == '__label2__' else 'fake'
# Convert line from input file into an id/text/label tuple
def parseReview(reviewLine):
    """Extract the (id, text, label) triple from one parsed input row.

    Args:
        reviewLine: Indexable sequence of column values for one row; only
            columns 0, 1 and 8 are read.

    Returns:
        A 3-tuple of (column 0 as read, column 8, ``parse_label(column 1)``).

    Raises:
        IndexError: If the row has fewer than nine columns.
    """
    # NOTE(review): an earlier comment here promised an *integer* id, but the
    # value is passed through exactly as read (no int() conversion) -- confirm
    # what callers expect before changing it.
    return reviewLine[0], reviewLine[8], parse_label(reviewLine[1])
# Input: a string of one review
import re
from nltk import word_tokenize
def preProcess(text):
    """Normalise one review string and split it into tokens.

    Punctuation that directly touches a word character is separated from it
    by a space, the text is lower-cased, and the result is tokenised with
    ``word_tokenize``.

    Args:
        text: The review text as a single string.

    Returns:
        The list of tokens produced by ``word_tokenize``.
    """
    # Detach closing punctuation from the word character before it ...
    text = re.sub(r"(\w)([.,;:!?'\"”\)])", r"\1 \2", text)
    # ... and opening punctuation from the word character after it.
    text = re.sub(r"([.,;:!?'\"“\(])(\w)", r"\1 \2", text)
    text = text.lower()
    # The tokens were previously computed and then dropped, so the function
    # returned None; hand them back to the caller.
    return word_tokenize(text)
featureDict = {}  # A global dictionary of features


def toFeatureVector(tokens):
    """Turn a token sequence into a feature -> weight dictionary.

    Each distinct token is a feature; its weight is the number of times it
    occurs in *tokens*. As a side effect, the module-level ``featureDict``
    tally is incremented once per token occurrence.

    Args:
        tokens: Iterable of hashable tokens (iterated exactly once).

    Returns:
        A new dict mapping each token to its occurrence count in this call.
        Empty input yields an empty dict.
    """
    v = {}
    for t in tokens:
        # Corpus-wide tally, shared across calls.
        featureDict[t] = featureDict.get(t, 0) + 1
        # Per-call vector: previously left empty and never returned, so
        # callers received None instead of a feature dictionary.
        v[t] = v.get(t, 0) + 1
    return v
# load data from a file and append it to the rawData
def loadData(path, Text=None):
    """Read a tab-separated review file and append its rows to the datasets.

    The first row is treated as a header and skipped. Every remaining row is
    parsed with ``parseReview``; the raw triple is appended to ``rawData`` and
    a (id, feature vector, label) triple is appended to ``preprocessedData``.

    Args:
        path: Path of the tab-delimited input file.
        Text: Unused; kept only so existing call sites keep working.

    Returns:
        None. Results are appended to ``rawData`` and ``preprocessedData``.
    """
    # NOTE(review): rawData and preprocessedData are not defined in this part
    # of the file -- presumably module-level lists; confirm.
    # newline='' is what the csv module asks for, so that line endings inside
    # quoted fields are handled by the reader rather than by the file object.
    with open(path, newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader, None)  # skip the headers
        for line in reader:
            # Distinct local names so the ``Text`` parameter is not shadowed.
            review_id, review_text, label = parseReview(line)
            rawData.append((review_id, review_text, label))
            preprocessedData.append(
                (review_id, toFeatureVector(preProcess(review_text)), label)
            )
def trainClassifier(trainData):
    """Train a linear SVM classifier on the given training data.

    Args:
        trainData: Training examples, passed unchanged to
            ``SklearnClassifier.train``.

    Returns:
        Whatever ``SklearnClassifier(pipeline).train(trainData)`` returns.
    """
    print("Training Classifier...")
    # Single-step pipeline wrapping LinearSVC with its default settings.
    pipeline = Pipeline([('svc', LinearSVC())])
    return SklearnClassifier(pipeline).train(trainData)
def predictLabels(reviewSamples, classifier):
    """Predict a label for each review sample.

    Args:
        reviewSamples: Iterable of indexable samples; element ``[1]`` of each
            sample is passed to ``preProcess`` as the review text.
        classifier: Object exposing ``classify_many(featuresets)``.

    Returns:
        The result of ``classifier.classify_many`` on the feature vectors,
        in the same order as *reviewSamples*.
    """
    # Build a real list rather than a one-shot ``map`` iterator, so the
    # classifier receives a sequence it can measure or iterate more than once.
    featuresets = [
        toFeatureVector(preProcess(sample[1])) for sample in reviewSamples
    ]
    return classifier.classify_many(featuresets)
def predictLabel(reviewSample, classifier):
    """Predict the label of a single review.

    Args:
        reviewSample: The review text; passed directly to ``preProcess``.
        classifier: Object exposing ``classify(featureset)``.

    Returns:
        The result of ``classifier.classify`` on the review's feature vector.
    """
    return classifier.classify(toFeatureVector(preProcess(reviewSample)))
# NOTE(review): this is a second, identical definition of trainClassifier; it
# rebinds the name defined earlier in the file. One of the two can be removed.
def trainClassifier(trainData):
    """Train a linear SVM classifier on the given training data.

    Args:
        trainData: Training examples, passed unchanged to
            ``SklearnClassifier.train``.

    Returns:
        Whatever ``SklearnClassifier(pipeline).train(trainData)`` returns.
    """
    print("Training Classifier...")
    # Single-step pipeline wrapping LinearSVC with its default settings.
    pipeline = Pipeline([('svc', LinearSVC())])
    return SklearnClassifier(pipeline).train(trainData)
def flatten(lst):
    """Lazily flatten an iterable by one level.

    Elements that are ``list`` instances are expanded in place (one level
    only -- lists nested deeper are yielded as-is); every other element,
    including tuples and strings, is yielded unchanged.

    Args:
        lst: Iterable whose elements may themselves be lists.

    Yields:
        The items of *lst*, with each direct ``list`` element replaced by
        its own items.
    """
    for el in lst:
        if isinstance(el, list):
            yield from el
        else:
            yield el
def crossValidate(dataset, folds):
shuffle(dataset)
predictions = []
ground_truth = []
foldSize = int(len(dataset)/folds)
for i in range(0,len(dataset), foldSize):
trainFolds = dataset[:i] + dataset[i+foldSize:]
validationFold = dataset[i: i+foldSize]