Skip to content

Instantly share code, notes, and snippets.

@jonahgeorge
Created June 8, 2017 01:03
Show Gist options
  • Save jonahgeorge/663ba459393fe558d7404456ceb61d11 to your computer and use it in GitHub Desktop.
Save jonahgeorge/663ba459393fe558d7404456ceb61d11 to your computer and use it in GitHub Desktop.
import sys
import re
import csv
from math import log
def parse_row(row):
    """Split one tab-separated data line into a cleaned sentence and a label.

    The sentence is lower-cased, every ``/`` is turned into a space (so
    ``good/bad`` yields two words), and every remaining character that is
    not ``a``-``z`` or whitespace is removed.

    Args:
        row: A line of the form ``<sentence><TAB><score>``.

    Returns:
        A two-element list ``[sentence, score]`` where ``score`` is an int.

    Raises:
        ValueError: If ``row`` contains no tab or the score is not an integer.
    """
    # Split on the LAST tab only: the score is always the final field, so a
    # stray tab inside the sentence no longer breaks the two-way unpacking.
    sentence, score = row.rsplit('\t', 1)
    sentence = sentence.lower().replace('/', ' ')
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    sentence = re.sub(r'[^a-z\s]+', '', sentence)
    return [sentence, int(score)]
def parse_sentence(sentence):
    """Return the whitespace-separated words of ``sentence`` as a list."""
    words = sentence.split()
    return words
def save_to_csv(matrix, filename):
    """Write ``matrix`` to ``filename`` as CSV, one inner sequence per line.

    Args:
        matrix: An iterable of rows; each row is an iterable of cell values.
        filename: Path of the file to create or overwrite.
    """
    # newline='' hands line-ending control to the csv module, as its
    # documentation requires; without it every row ends in '\r\r\n' on
    # platforms that translate '\n' on write.
    with open(filename, 'w', newline='') as fp:
        csv.writer(fp).writerows(matrix)
def preprocess(filename):
    """Turn a labelled sentence file into a bag-of-words presence matrix.

    Every non-blank line of the file is parsed with ``parse_row`` and its
    sentence is tokenised with ``parse_sentence``.

    Args:
        filename: Path of a text file with one ``<sentence><TAB><score>``
            sample per line.

    Returns:
        A list of lists. The first row is the header: all distinct words in
        sorted order followed by the string ``"classlabel"``. Each further
        row describes one sample: a 1 or 0 per header word (present or
        absent in the sentence) followed by the sample's integer score.
    """
    with open(filename, 'r') as fp:
        # Filter out blank lines rather than unconditionally deleting the
        # last element: that silently dropped the final sample whenever the
        # file did not end with a newline, and blank lines crashed parsing.
        lines = [line for line in fp.read().split('\n') if line.strip()]

    # Parse each line once; a set per sample makes the membership test in
    # the matrix construction below O(1) instead of a list scan.
    samples = []
    for line in lines:
        sentence, score = parse_row(line)
        samples.append((set(parse_sentence(sentence)), score))

    # Construct the sorted distinct word list (the feature columns).
    distinct_words = sorted(set().union(*(words for words, _ in samples)))

    # Construct the matrix: header first, then one presence row per sample.
    matrix = [distinct_words + ["classlabel"]]
    for words, score in samples:
        row = [1 if dw in words else 0 for dw in distinct_words]
        row.append(score)
        matrix.append(row)
    return matrix
def get_vocabulary(training_set):
    """Count, per word, how often each (presence, label) combination occurs.

    Args:
        training_set: Matrix whose first row is the header (word names
            followed by one trailing label column name) and whose other
            rows hold one feature value per word followed by the label.

    Returns:
        A dict mapping every header word to a list of four counters:
        index 0 - feature is 0 and label is 0,
        index 1 - feature is 0 and label is not 0,
        index 2 - feature is not 0 and label is 0,
        index 3 - feature is not 0 and label is not 0.
    """
    header = training_set[0]

    vocabulary = {}
    for word in header[:-1]:
        vocabulary[word] = [0, 0, 0, 0]

    for sample in training_set[1:]:
        for column, present in enumerate(sample[:-1]):
            # Low bit of the slot: the label; high bit: the feature value.
            label_bit = 0 if sample[-1] == 0 else 1
            feature_bit = 0 if present == 0 else 2
            vocabulary[header[column]][feature_bit + label_bit] += 1

    return vocabulary
# Positions inside the four-element count list stored for every word by
# get_vocabulary. Each name reads <word present?>_<class label positive?>.
FALSE_FALSE, FALSE_TRUE, TRUE_FALSE, TRUE_TRUE = range(4)
def predict(vocabulary, total, positive_total, negative_total, label_row, presence_row):
    """Classify one presence row with a Bernoulli naive Bayes model.

    Scores are accumulated as sums of logarithms with add-one (Laplace)
    smoothing. Multiplying raw ratios, as this function used to do, lets a
    single zero count wipe out a whole class and lets long products
    underflow to 0.0, after which both classes tie and 0 is returned.

    Args:
        vocabulary: Mapping from word to its four counts, indexed by
            FALSE_FALSE, FALSE_TRUE, TRUE_FALSE and TRUE_TRUE.
        total: Number of samples the class priors are based on.
        positive_total: Number of those samples with a positive label.
        negative_total: Number of those samples with a negative label.
        label_row: Header row; ``label_row[i]`` names column ``i`` of
            ``presence_row``.
        presence_row: Feature values (1 = word present, 0 = absent) followed
            by the class label, which is ignored here.

    Returns:
        1 if the positive class scores strictly higher, otherwise 0.
    """
    # A class without any examples can never win (and log(0) is undefined).
    if positive_total <= 0:
        return 0
    if negative_total <= 0:
        return 1

    positive_score = log(positive_total / total)
    negative_score = log(negative_total / total)

    for idx, val in enumerate(presence_row[:-1]):  # all but classlabel column
        counts = vocabulary.get(label_row[idx])
        if counts is None:
            continue  # word unknown to the classifier: no evidence either way
        if val == 1:
            positive_count, negative_count = counts[TRUE_TRUE], counts[TRUE_FALSE]
        elif val == 0:
            positive_count, negative_count = counts[FALSE_TRUE], counts[FALSE_FALSE]
        else:
            continue  # only 0/1 features carry information
        # Add-one smoothing over the two outcomes (present / absent) keeps
        # every estimate strictly positive, so log() is always defined.
        positive_score += log((positive_count + 1) / (positive_total + 2))
        negative_score += log((negative_count + 1) / (negative_total + 2))

    return 1 if positive_score > negative_score else 0
def run(classifier, training_set):
    """Classify every data row of a matrix and return the accuracy in percent.

    Despite its name, ``training_set`` is whatever matrix should be
    evaluated (the script also passes the test set).

    The class totals handed to ``predict`` are recovered from the counts in
    ``classifier`` rather than counted from the evaluated matrix. Counting
    them from the evaluated matrix normalised training counts by test-set
    class sizes and leaked the test labels into the priors.

    Args:
        classifier: Word-count mapping as returned by ``get_vocabulary``.
        training_set: Matrix whose first row is the header and whose other
            rows end with the true class label.

    Returns:
        The percentage of rows whose predicted label equals the true label,
        or 0.0 when the matrix has no data rows.
    """
    samples = training_set[1:]  # all but the label row
    if not samples:
        return 0.0
    label_row = training_set[0]

    # Assumes every training row contributed exactly once to every word's
    # counts (true for matrices built by preprocess), so any single entry
    # yields the training class sizes.
    counts = next(iter(classifier.values()), None)
    if counts is not None:
        negative_total = counts[FALSE_FALSE] + counts[TRUE_FALSE]
        positive_total = counts[FALSE_TRUE] + counts[TRUE_TRUE]
        total = positive_total + negative_total
    else:
        # Empty classifier: nothing to recover, fall back to the labels of
        # the evaluated rows.
        positive_total = sum(1 for s in samples if s[-1] == 1)
        negative_total = sum(1 for s in samples if s[-1] == 0)
        total = len(samples)

    accurate_count = sum(
        1
        for s in samples
        if predict(classifier, total, positive_total, negative_total, label_row, s) == s[-1]
    )
    return (accurate_count / len(samples)) * 100
if __name__ == "__main__":
    # Do all the work first and open results.txt only once the numbers
    # exist: opening it up front truncated any previous results and left an
    # empty file behind whenever preprocessing or classification failed.
    training_set = preprocess('trainingSet.txt')
    test_set = preprocess('testSet.txt')
    save_to_csv(training_set, 'preprocessed_train.txt')
    save_to_csv(test_set, 'preprocessed_test.txt')

    classifier = get_vocabulary(training_set)
    training_accuracy = run(classifier, training_set)
    test_accuracy = run(classifier, test_set)

    with open('results.txt', 'w') as f:
        f.writelines([
            "Training On: trainingSet.txt\n",
            "Testing On: trainingSet.txt\n",
            "Accuracy: {}%\n".format(training_accuracy),
            "\n",
            "Training On: trainingSet.txt\n",
            "Testing On: testSet.txt\n",
            "Accuracy: {}%\n".format(test_accuracy),
        ])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment