Liev lievcin

## postsxml2csv.py
#!/usr/bin/env python3
# -*- python -*-

from xml.etree import cElementTree
import csv


# Copy/paste from the help section on SEDE
# http://data.stackexchange.com/stackoverflow/query/new
schema = [

## importLibraries.py
import csv
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from random import shuffle
from sklearn.pipeline import Pipeline

## parseReview.py
def parse_label(label):
    """Translate a raw dataset label into a class name.

    Returns 'real' only when ``label`` equals '__label2__'; any other
    value maps to 'fake'.
    """
    return 'real' if label == '__label2__' else 'fake'

# Convert line from input file into an id/text/label tuple
def parseReview(reviewLine):
    """Pick the id, review text and label out of one parsed input row.

    Column 0 is returned as the id, column 8 as the review text, and
    column 1 is mapped through ``parse_label`` to 'real' or 'fake'.

    NOTE(review): the id is passed through exactly as found in the row;
    no conversion to int happens here.
    """
    review_id = reviewLine[0]
    review_text = reviewLine[8]
    label = parse_label(reviewLine[1])
    return review_id, review_text, label

## preProcess.py
# Input: a string of one review
import re
from nltk import word_tokenize

def preProcess(text):
    """Turn one review string into a list of lower-cased tokens.

    Punctuation glued to a word is first separated from it with a space,
    the text is lower-cased, and the result is split by ``word_tokenize``.

    Args:
        text: the raw review text.

    Returns:
        The token list produced by ``word_tokenize``.
    """
    # Detach punctuation that directly follows a word character: "good." -> "good ."
    text = re.sub(r"(\w)([.,;:!?'\"”\)])", r"\1 \2", text)
    # Detach punctuation that directly precedes a word character: "(good" -> "( good"
    text = re.sub(r"([.,;:!?'\"“\(])(\w)", r"\1 \2", text)
    text = text.lower()
    tokens = word_tokenize(text)
    # The tokens were previously computed and then dropped, so callers got None.
    return tokens

## toFeatureVector.py
featureDict = {} # A global dictionary of features

def toFeatureVector(tokens):
    """Build a bag-of-words feature vector for one tokenised review.

    Args:
        tokens: iterable of hashable tokens.

    Returns:
        A dict mapping each distinct token to the number of times it
        occurs in ``tokens`` (empty dict for no tokens).

    Side effect:
        Every occurrence is also added to the module-level ``featureDict``,
        which therefore accumulates counts across all calls.
    """
    v = {}
    for t in tokens:
        # Per-review weight: how often the token occurs in this review.
        v[t] = v.get(t, 0) + 1
        # Global tally across every review seen so far.
        featureDict[t] = featureDict.get(t, 0) + 1
    # The vector was previously never filled or returned, so callers got None.
    return v

## loadData.py
# load data from a file and append it to the rawData
def loadData(path, Text=None):
    """Read a tab-separated review file and append its rows to the data lists.

    The first row is treated as a header and skipped. Every following row is
    parsed with ``parseReview``; the raw (id, text, label) triple is appended
    to ``rawData`` and an (id, feature vector, label) triple is appended to
    ``preprocessedData``.

    Args:
        path: location of the tab-separated input file.
        Text: never read; kept only so existing callers keep working.

    NOTE(review): ``rawData`` and ``preprocessedData`` are not defined in the
    visible part of this file; they are assumed to be module-level lists.
    """
    # The csv docs require newline='' so the reader handles line endings itself.
    with open(path, newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader, None)  # skip the headers (no-op on an empty file)
        for line in reader:
            # A separate local name avoids clobbering the ``Text`` parameter.
            Id, reviewText, Label = parseReview(line)
            rawData.append((Id, reviewText, Label))
            preprocessedData.append((Id, toFeatureVector(preProcess(reviewText)), Label))

## trainClassifier.py
def trainClassifier(trainData):
    """Train a linear SVM on ``trainData`` and return the result of ``train``.

    A one-step Pipeline holding a LinearSVC is wrapped in SklearnClassifier,
    whose ``train`` method is then called with ``trainData``.

    NOTE(review): presumably ``trainData`` is a list of (feature dict, label)
    pairs - confirm against the caller. A second ``trainClassifier`` defined
    further down this file rebinds the name.
    """
    print("Training Classifier...")
    steps = [('svc', LinearSVC())]
    wrapped = SklearnClassifier(Pipeline(steps))
    return wrapped.train(trainData)

## predictLabel.py
def predictLabels(reviewSamples, classifier):
    """Classify a batch of samples with ``classifier.classify_many``.

    Element 1 of each sample is taken as the raw review text, run through
    ``preProcess`` and ``toFeatureVector``, and the resulting feature vectors
    are handed lazily to the classifier.
    """
    featureVectors = (
        toFeatureVector(preProcess(sample[1])) for sample in reviewSamples
    )
    return classifier.classify_many(featureVectors)

def predictLabel(reviewSample, classifier):
    """Classify a single review with ``classifier.classify``.

    ``reviewSample`` is passed straight to ``preProcess``, i.e. it is the raw
    text itself rather than a tuple that needs indexing.
    """
    tokens = preProcess(reviewSample)
    features = toFeatureVector(tokens)
    return classifier.classify(features)

# NOTE(review): second definition of trainClassifier in this file. Being the
# later one, it is the definition that stays bound to the name.
def trainClassifier(trainData):
    """Train a LinearSVC (wrapped in a Pipeline and SklearnClassifier).

    Prints a progress message, then returns whatever
    ``SklearnClassifier.train`` returns for ``trainData``.
    """
    print("Training Classifier...")
    svm_pipeline = Pipeline([('svc', LinearSVC())])
    nltk_wrapper = SklearnClassifier(svm_pipeline)
    return nltk_wrapper.train(trainData)

## flatten.py
def flatten(lst):
    """Yield the items of ``lst``, expanding nested lists by one level.

    Only elements that are ``list`` instances are expanded; tuples, strings
    and lists nested more deeply are yielded unchanged.
    """
    for item in lst:
        if not isinstance(item, list):
            yield item
            continue
        for inner in item:
            yield inner

## crossValidate.py
def crossValidate(dataset, folds):
    """Split ``dataset`` into consecutive folds for cross-validation.

    NOTE(review): the visible body stops after computing the train and
    validation slices; ``predictions`` and ``ground_truth`` are never filled
    and nothing is returned. The rest of the function appears to be missing.
    """
    # Reorders the caller's list in place (presumably random.shuffle - confirm import).
    shuffle(dataset)
    predictions = []
    ground_truth = []
    # Truncating division: a remainder produces one extra, shorter final fold,
    # and folds > len(dataset) gives 0, which range() rejects as a step.
    foldSize = int(len(dataset)/folds)

    for i in range(0,len(dataset), foldSize):
        # Everything outside the window [i, i + foldSize) is training data.
        trainFolds = dataset[:i] + dataset[i+foldSize:]
        # The window itself is held out for validation.
        validationFold = dataset[i: i+foldSize]
	#!/usr/bin/env python3
	# -- python --

	from xml.etree import cElementTree
	import csv


	# Copy/paste from the help section on SEDE
	# http://data.stackexchange.com/stackoverflow/query/new
	schema = [
	import csv
	from sklearn.svm import LinearSVC
	from nltk.classify import SklearnClassifier
	from random import shuffle
	from sklearn.pipeline import Pipeline
	def parse_label(label):
	    """Translate a raw dataset label into a class name.

	    Returns 'real' only when ``label`` equals '__label2__'; any other
	    value maps to 'fake'.
	    """
	    # Body indentation was lost in this copy; structure restored.
	    if label == '__label2__':
	        return 'real'
	    return 'fake'

	# Convert line from input file into an id/text/label tuple
	def parseReview(reviewLine):
	    """Pick the id, review text and label out of one parsed input row.

	    Column 0 is returned as the id, column 8 as the review text, and
	    column 1 is mapped through ``parse_label`` to 'real' or 'fake'.

	    NOTE(review): the id is passed through exactly as found in the row;
	    no conversion to int happens here.
	    """
	    # Body indentation was lost in this copy; structure restored.
	    return reviewLine[0], reviewLine[8], parse_label(reviewLine[1])
	# Input: a string of one review
	import re
	from nltk import word_tokenize

	def preProcess(text):
	    """Turn one review string into a list of lower-cased tokens.

	    Punctuation glued to a word is first separated from it with a space,
	    the text is lower-cased, and the result is split by ``word_tokenize``.

	    Args:
	        text: the raw review text.

	    Returns:
	        The token list produced by ``word_tokenize``.
	    """
	    # Detach punctuation that directly follows a word character: "good." -> "good ."
	    text = re.sub(r"(\w)([.,;:!?'\"”\)])", r"\1 \2", text)
	    # Detach punctuation that directly precedes a word character: "(good" -> "( good"
	    text = re.sub(r"([.,;:!?'\"“\(])(\w)", r"\1 \2", text)
	    text = text.lower()
	    tokens = word_tokenize(text)
	    # The tokens were previously computed and then dropped, so callers got None.
	    return tokens
	featureDict = {} # A global dictionary of features

	def toFeatureVector(tokens):
	    """Build a bag-of-words feature vector for one tokenised review.

	    Args:
	        tokens: iterable of hashable tokens.

	    Returns:
	        A dict mapping each distinct token to the number of times it
	        occurs in ``tokens`` (empty dict for no tokens).

	    Side effect:
	        Every occurrence is also added to the module-level ``featureDict``,
	        which therefore accumulates counts across all calls.
	    """
	    v = {}
	    for t in tokens:
	        # Per-review weight: how often the token occurs in this review.
	        v[t] = v.get(t, 0) + 1
	        # Global tally across every review seen so far.
	        featureDict[t] = featureDict.get(t, 0) + 1
	    # The vector was previously never filled or returned, so callers got None.
	    return v
	# load data from a file and append it to the rawData
	def loadData(path, Text=None):
	    """Read a tab-separated review file and append its rows to the data lists.

	    The first row is treated as a header and skipped. Every following row is
	    parsed with ``parseReview``; the raw (id, text, label) triple is appended
	    to ``rawData`` and an (id, feature vector, label) triple is appended to
	    ``preprocessedData``.

	    Args:
	        path: location of the tab-separated input file.
	        Text: never read; kept only so existing callers keep working.

	    NOTE(review): ``rawData`` and ``preprocessedData`` are not defined in the
	    visible part of this file; they are assumed to be module-level lists.
	    """
	    # The csv docs require newline='' so the reader handles line endings itself.
	    with open(path, newline='') as f:
	        reader = csv.reader(f, delimiter='\t')
	        next(reader, None)  # skip the headers (no-op on an empty file)
	        for line in reader:
	            # A separate local name avoids clobbering the ``Text`` parameter.
	            Id, reviewText, Label = parseReview(line)
	            rawData.append((Id, reviewText, Label))
	            preprocessedData.append((Id, toFeatureVector(preProcess(reviewText)), Label))
	def trainClassifier(trainData):
	    """Train a linear SVM on ``trainData`` and return the result of ``train``.

	    A one-step Pipeline holding a LinearSVC is wrapped in SklearnClassifier,
	    whose ``train`` method is then called with ``trainData``.

	    NOTE(review): presumably ``trainData`` is a list of (feature dict, label)
	    pairs - confirm against the caller.
	    """
	    # Body indentation was lost in this copy; structure restored.
	    print("Training Classifier...")
	    pipeline = Pipeline([('svc', LinearSVC())])
	    return SklearnClassifier(pipeline).train(trainData)
	def predictLabels(reviewSamples, classifier):
	    """Classify a batch of samples with ``classifier.classify_many``.

	    Element 1 of each sample is taken as the raw review text, run through
	    ``preProcess`` and ``toFeatureVector``, and the resulting feature vectors
	    are handed lazily to the classifier.
	    """
	    # Body indentation was lost in this copy; structure restored.
	    return classifier.classify_many(map(lambda t: toFeatureVector(preProcess(t[1])), reviewSamples))

	def predictLabel(reviewSample, classifier):
	    """Classify a single review with ``classifier.classify``.

	    ``reviewSample`` is passed straight to ``preProcess``, i.e. it is the raw
	    text itself rather than a tuple that needs indexing.
	    """
	    # Body indentation was lost in this copy; structure restored.
	    return classifier.classify(toFeatureVector(preProcess(reviewSample)))

	# NOTE(review): repeated definition of trainClassifier; being the later
	# one, it is the definition that stays bound to the name.
	def trainClassifier(trainData):
	    """Train a LinearSVC (wrapped in a Pipeline and SklearnClassifier).

	    Prints a progress message, then returns whatever
	    ``SklearnClassifier.train`` returns for ``trainData``.
	    """
	    # Body indentation was lost in this copy; structure restored.
	    print("Training Classifier...")
	    pipeline = Pipeline([('svc', LinearSVC())])
	    return SklearnClassifier(pipeline).train(trainData)
	def flatten(lst):
	    """Yield the items of ``lst``, expanding nested lists by one level.

	    Only elements that are ``list`` instances are expanded; tuples, strings
	    and lists nested more deeply are yielded unchanged.
	    """
	    # Body indentation was lost in this copy; structure restored.
	    for el in lst:
	        if isinstance(el, list):
	            yield from el
	        else:
	            yield el
	def crossValidate(dataset, folds):
	shuffle(dataset)
	predictions = []
	ground_truth = []
	foldSize = int(len(dataset)/folds)

	for i in range(0,len(dataset), foldSize):
	trainFolds = dataset[:i] + dataset[i+foldSize:]
	validationFold = dataset[i: i+foldSize]