jonahgeorge/main.py

## main.py
import sys
import re
import csv
from math import log

def parse_row(row):
    """Split one "sentence<TAB>score" record into a cleaned sentence and score.

    The sentence is lower-cased, every "/" is turned into a space (so
    "good/bad" yields two words) and every remaining character that is not
    a-z or whitespace is removed.

    Args:
        row: One record of the data file: the sentence, a tab, the score.

    Returns:
        A two-element list ``[cleaned_sentence, score]`` with ``score``
        converted to ``int``.

    Raises:
        ValueError: If the row contains no tab or the score is not an
            integer literal.
    """
    # Split on the LAST tab only, so a stray tab inside the sentence does not
    # break the two-way unpacking.
    sentence, score = row.rsplit('\t', 1)
    sentence = sentence.lower().replace('/', ' ')
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    sentence = re.sub(r'[^a-z\s]+', '', sentence)
    return [sentence, int(score)]

def parse_sentence(sentence):
    """Break a sentence into its whitespace-separated words.

    Returns a list of the non-empty tokens; an empty or all-whitespace
    sentence yields an empty list.
    """
    tokens = sentence.split()
    return tokens

def save_to_csv(matrix, filename):
    """Write ``matrix`` to ``filename`` as CSV, one inner sequence per record.

    Args:
        matrix: Iterable of rows, each row an iterable of cell values.
        filename: Path of the file to create or overwrite.
    """
    # newline='' hands line-ending control to the csv module; without it,
    # platforms that translate "\n" on write end every record with "\r\r\n".
    with open(filename, 'w', newline='') as fp:
        csv.writer(fp).writerows(matrix)

def preprocess(filename):
    """Turn a "sentence<TAB>score" file into a bag-of-words presence matrix.

    Args:
        filename: Path of a text file with one record per line.

    Returns:
        A list of rows. Row 0 is the header: the sorted distinct words of the
        whole file followed by "classlabel". Every later row describes one
        record: a 1 or 0 per header word (1 when the word occurs in that
        sentence) followed by the record's integer score.
    """
    with open(filename, 'r') as fp:
        # Iterating the file keeps a final record that has no trailing
        # newline, and blank lines are skipped instead of crashing parse_row.
        records = [parse_row(line.rstrip('\n')) for line in fp if line.strip()]

    # Tokenize each record once; sets make the membership tests below O(1).
    samples = [(set(parse_sentence(sentence)), score)
               for sentence, score in records]

    distinct_words = sorted(set().union(*(words for words, _ in samples)))

    matrix = [distinct_words]
    for words, score in samples:
        row = [1 if dw in words else 0 for dw in distinct_words]
        row.append(score)
        matrix.append(row)

    # Added only after the rows are built so the label column name is never
    # treated as a vocabulary word.
    distinct_words.append("classlabel")

    return matrix

def get_vocabulary(training_set):
    """Count, for every word, how often it is present/absent in each class.

    ``training_set`` is a matrix whose first row is the header (the words
    followed by one trailing label-column name) and whose remaining rows hold
    one feature value per word followed by the class label.

    Returns a dict mapping each header word to a four-slot list of counts:
        [0] feature == 0 and label == 0
        [1] feature == 0 and label != 0
        [2] feature != 0 and label == 0
        [3] feature != 0 and label != 0
    """
    header = training_set[0]

    vocabulary = {}
    for word in header[:-1]:
        vocabulary[word] = [0, 0, 0, 0]

    for sample in training_set[1:]:
        # The label picks the odd/even slot, the feature picks the low/high pair.
        label_offset = 0 if sample[-1] == 0 else 1
        for column, feature in enumerate(sample[:-1]):
            feature_offset = 0 if feature == 0 else 2
            vocabulary[header[column]][feature_offset + label_offset] += 1

    return vocabulary

# Slot indexes into each word's count list, named <word present>_<class positive>;
# e.g. TRUE_FALSE is the slot for "word present, class label 0".
FALSE_FALSE, FALSE_TRUE, TRUE_FALSE, TRUE_TRUE = range(4)

def predict(vocabulary, total, positive_total, negative_total, label_row, presence_row):
    """Classify one presence row with a Bernoulli naive Bayes model.

    Each class score is the log of its prior plus the log-likelihood of every
    known word being present or absent. Working with sums of logs avoids the
    floating-point underflow a long product of probabilities runs into, and
    add-one smoothing keeps a single unseen (word, class) combination from
    forcing a class score to zero.

    Args:
        vocabulary: Maps a word to its four-slot count list, indexed by
            FALSE_FALSE / FALSE_TRUE / TRUE_FALSE / TRUE_TRUE.
        total: Number of rows the class totals were counted over.
        positive_total: Number of those rows labelled 1.
        negative_total: Number of those rows labelled 0.
        label_row: Header row naming each column of ``presence_row``.
        presence_row: A 1 or 0 per header word, followed by the class label
            (the last element is ignored here).

    Returns:
        1 when the positive class scores strictly higher, otherwise 0.
    """
    def log_prior(class_total):
        # A class without examples can never win; returning -inf also avoids
        # log(0) and a division by zero.
        if class_total <= 0 or total <= 0:
            return float("-inf")
        return log(class_total / total)

    positive_result = log_prior(positive_total)
    negative_result = log_prior(negative_total)

    for idx, val in enumerate(presence_row[:-1]):  # all but the classlabel column
        counts = vocabulary.get(label_row[idx])
        if counts is None:
            continue  # word has no counts in the vocabulary: no evidence

        if val == 1:
            positive_count, negative_count = counts[TRUE_TRUE], counts[TRUE_FALSE]
        elif val == 0:
            positive_count, negative_count = counts[FALSE_TRUE], counts[FALSE_FALSE]
        else:
            continue  # only binary presence values carry a likelihood

        # Add-one smoothing over the two outcomes (present / absent).
        positive_result += log((positive_count + 1) / (positive_total + 2))
        negative_result += log((negative_count + 1) / (negative_total + 2))

    return 1 if positive_result > negative_result else 0

def run(classifier, training_set):
    """Return the percentage of rows in ``training_set`` predicted correctly.

    Despite the parameter name, ``training_set`` is whichever matrix is being
    evaluated; the script passes both the training and the test matrix.

    Args:
        classifier: Word -> four-slot count list, as built by get_vocabulary.
        training_set: Matrix whose first row is the header and whose other
            rows end with the true class label.

    Returns:
        Accuracy as a float percentage; 0.0 when there are no rows to score.
    """
    samples = training_set[1:]  # everything but the header row
    if not samples:
        return 0.0

    positive_total = negative_total = 0
    if classifier:
        # get_vocabulary adds exactly one count per training row to every
        # word, so any entry's slots add up to the training class totals.
        # Using them keeps priors and denominators consistent with the counts
        # even when a different set (e.g. the test set) is being evaluated.
        counts = next(iter(classifier.values()))
        negative_total = counts[FALSE_FALSE] + counts[TRUE_FALSE]
        positive_total = counts[FALSE_TRUE] + counts[TRUE_TRUE]
    total = positive_total + negative_total

    if total == 0:
        # No training counts available: fall back to the labels of the rows
        # being evaluated.
        positive_total = sum(1 for s in samples if s[-1] == 1)
        negative_total = sum(1 for s in samples if s[-1] == 0)
        total = len(samples)

    header = training_set[0]
    accurate_count = sum(
        1 for s in samples
        if predict(classifier, total, positive_total, negative_total, header, s) == s[-1]
    )

    return (accurate_count / len(samples)) * 100

def main():
    """Train on trainingSet.txt, score both data sets and write results.txt.

    Also writes the preprocessed matrices to preprocessed_train.txt and
    preprocessed_test.txt.
    """
    training_set = preprocess('trainingSet.txt')
    test_set = preprocess('testSet.txt')

    save_to_csv(training_set, 'preprocessed_train.txt')
    save_to_csv(test_set, 'preprocessed_test.txt')

    classifier = get_vocabulary(training_set)
    training_accuracy = run(classifier, training_set)
    test_accuracy = run(classifier, test_set)

    # Open results.txt only once every number is available, so a failure
    # above does not leave behind a truncated, empty report.
    with open('results.txt', 'w') as f:
        f.write("Training On: trainingSet.txt\n")
        f.write("Testing On: trainingSet.txt\n")
        f.write("Accuracy: {}%\n".format(training_accuracy))
        f.write("\n")
        f.write("Training On: trainingSet.txt\n")
        f.write("Testing On: testSet.txt\n")
        f.write("Accuracy: {}%\n".format(test_accuracy))


if __name__ == "__main__":
    main()
	import sys
	import re
	import csv
	from math import log

	def parse_row(row):
	    """Split one "sentence<TAB>score" record into a cleaned sentence and score.

	    The sentence is lower-cased, every "/" is turned into a space and every
	    remaining character that is not a-z or whitespace is removed.

	    Args:
	        row: One record of the data file: the sentence, a tab, the score.

	    Returns:
	        A two-element list ``[cleaned_sentence, score]`` with ``score``
	        converted to ``int``.

	    Raises:
	        ValueError: If the row contains no tab or the score is not an
	            integer literal.
	    """
	    # Split on the LAST tab only, so a stray tab inside the sentence does
	    # not break the two-way unpacking.
	    sentence, score = row.rsplit('\t', 1)
	    sentence = sentence.lower().replace('/', ' ')
	    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
	    sentence = re.sub(r'[^a-z\s]+', '', sentence)
	    return [sentence, int(score)]

	def parse_sentence(sentence):
	    """Return the whitespace-separated words of ``sentence`` as a list."""
	    return sentence.split()

	def save_to_csv(matrix, filename):
	    """Write ``matrix`` to ``filename`` as CSV, one inner sequence per record.

	    Args:
	        matrix: Iterable of rows, each row an iterable of cell values.
	        filename: Path of the file to create or overwrite.
	    """
	    # newline='' hands line-ending control to the csv module; without it,
	    # platforms that translate "\n" on write end every record with "\r\r\n".
	    with open(filename, 'w', newline='') as fp:
	        csv.writer(fp).writerows(matrix)

	def preprocess(filename):
	    """Turn a "sentence<TAB>score" file into a bag-of-words presence matrix.

	    Args:
	        filename: Path of a text file with one record per line.

	    Returns:
	        A list of rows. Row 0 is the header: the sorted distinct words of
	        the whole file followed by "classlabel". Every later row describes
	        one record: a 1 or 0 per header word (1 when the word occurs in
	        that sentence) followed by the record's integer score.
	    """
	    with open(filename, 'r') as fp:
	        # Iterating the file keeps a final record that has no trailing
	        # newline, and blank lines are skipped instead of crashing parse_row.
	        records = [parse_row(line.rstrip('\n')) for line in fp if line.strip()]

	    # Tokenize each record once; sets make the membership tests below O(1).
	    samples = [(set(parse_sentence(sentence)), score)
	               for sentence, score in records]

	    distinct_words = sorted(set().union(*(words for words, _ in samples)))

	    # Construct matrix
	    matrix = [distinct_words]
	    for words, score in samples:
	        row = [1 if dw in words else 0 for dw in distinct_words]
	        row.append(score)
	        matrix.append(row)

	    # Added only after the rows are built so the label column name is never
	    # treated as a vocabulary word.
	    distinct_words.append("classlabel")

	    return matrix

	def get_vocabulary(training_set):
	    """Count, for every word, how often it is present/absent in each class.

	    ``training_set`` is a matrix whose first row is the header (the words
	    followed by one trailing label-column name) and whose remaining rows
	    hold one feature value per word followed by the class label.

	    Returns a dict mapping each header word to a four-slot list of counts:
	        [0] feature == 0 and label == 0
	        [1] feature == 0 and label != 0
	        [2] feature != 0 and label == 0
	        [3] feature != 0 and label != 0
	    """
	    header = training_set[0]
	    vocabulary = {word: [0, 0, 0, 0] for word in header[:-1]}

	    for row in training_set[1:]:
	        # The label picks the odd/even slot, the feature the low/high pair.
	        label_offset = 0 if row[-1] == 0 else 1
	        for k, feat in enumerate(row[:-1]):
	            feature_offset = 0 if feat == 0 else 2
	            vocabulary[header[k]][feature_offset + label_offset] += 1

	    return vocabulary

	# Slot indexes into each word's count list, named <word present>_<class positive>;
	# e.g. TRUE_FALSE is the slot for "word present, class label 0".
	FALSE_FALSE, FALSE_TRUE, TRUE_FALSE, TRUE_TRUE = range(4)

	def predict(vocabulary, total, positive_total, negative_total, label_row, presence_row):
	    """Classify one presence row with a Bernoulli naive Bayes model.

	    Each class score is the log of its prior plus the log-likelihood of
	    every known word being present or absent. Sums of logs avoid the
	    floating-point underflow of a long product of probabilities, and
	    add-one smoothing keeps a single unseen (word, class) combination from
	    forcing a class score to zero.

	    Args:
	        vocabulary: Maps a word to its four-slot count list, indexed by
	            FALSE_FALSE / FALSE_TRUE / TRUE_FALSE / TRUE_TRUE.
	        total: Number of rows the class totals were counted over.
	        positive_total: Number of those rows labelled 1.
	        negative_total: Number of those rows labelled 0.
	        label_row: Header row naming each column of ``presence_row``.
	        presence_row: A 1 or 0 per header word, followed by the class label
	            (the last element is ignored here).

	    Returns:
	        1 when the positive class scores strictly higher, otherwise 0.
	    """
	    def log_prior(class_total):
	        # A class without examples can never win; returning -inf also
	        # avoids log(0) and a division by zero.
	        if class_total <= 0 or total <= 0:
	            return float("-inf")
	        return log(class_total / total)

	    positive_result = log_prior(positive_total)
	    negative_result = log_prior(negative_total)

	    for idx, val in enumerate(presence_row[:-1]):  # all but the classlabel column
	        counts = vocabulary.get(label_row[idx])
	        if counts is None:
	            continue  # word has no counts in the vocabulary: no evidence

	        if val == 1:
	            positive_count, negative_count = counts[TRUE_TRUE], counts[TRUE_FALSE]
	        elif val == 0:
	            positive_count, negative_count = counts[FALSE_TRUE], counts[FALSE_FALSE]
	        else:
	            continue  # only binary presence values carry a likelihood

	        # Add-one smoothing over the two outcomes (present / absent).
	        positive_result += log((positive_count + 1) / (positive_total + 2))
	        negative_result += log((negative_count + 1) / (negative_total + 2))

	    return 1 if positive_result > negative_result else 0

	def run(classifier, training_set):
	    """Return the percentage of rows in ``training_set`` predicted correctly.

	    Despite the parameter name, ``training_set`` is whichever matrix is
	    being evaluated; the script passes both the training and test matrix.

	    Args:
	        classifier: Word -> four-slot count list, as built by get_vocabulary.
	        training_set: Matrix whose first row is the header and whose other
	            rows end with the true class label.

	    Returns:
	        Accuracy as a float percentage; 0.0 when there are no rows to score.
	    """
	    samples = training_set[1:]  # everything but the header row
	    if not samples:
	        return 0.0

	    positive_total = negative_total = 0
	    if classifier:
	        # get_vocabulary adds exactly one count per training row to every
	        # word, so any entry's slots add up to the training class totals.
	        # Using them keeps priors and denominators consistent with the
	        # counts even when a different set is being evaluated.
	        counts = next(iter(classifier.values()))
	        negative_total = counts[FALSE_FALSE] + counts[TRUE_FALSE]
	        positive_total = counts[FALSE_TRUE] + counts[TRUE_TRUE]
	    total = positive_total + negative_total

	    if total == 0:
	        # No training counts available: fall back to the labels of the rows
	        # being evaluated.
	        positive_total = sum(1 for s in samples if s[-1] == 1)
	        negative_total = sum(1 for s in samples if s[-1] == 0)
	        total = len(samples)

	    header = training_set[0]
	    accurate_count = sum(
	        1 for s in samples
	        if predict(classifier, total, positive_total, negative_total, header, s) == s[-1]
	    )

	    return (accurate_count / len(samples)) * 100

	if __name__ == "__main__":
	    training_set = preprocess('trainingSet.txt')
	    test_set = preprocess('testSet.txt')

	    save_to_csv(training_set, 'preprocessed_train.txt')
	    save_to_csv(test_set, 'preprocessed_test.txt')

	    classifier = get_vocabulary(training_set)
	    training_accuracy = run(classifier, training_set)
	    test_accuracy = run(classifier, test_set)

	    # Open results.txt only once every number is available, so a failure
	    # above does not leave behind a truncated, empty report.
	    with open('results.txt', 'w') as f:
	        f.write("Training On: trainingSet.txt\n")
	        f.write("Testing On: trainingSet.txt\n")
	        f.write("Accuracy: {}%\n".format(training_accuracy))
	        f.write("\n")
	        f.write("Training On: trainingSet.txt\n")
	        f.write("Testing On: testSet.txt\n")
	        f.write("Accuracy: {}%\n".format(test_accuracy))