Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
import csv
import tensorflow as tf
import random
import math
import time
from sklearn.feature_extraction.text import CountVectorizer
# Hyper-parameters of the SVM objective and of the mini-batch training loop.
svm_c = 1          # multiplier applied to the hinge term of the loss
svm_margin = 30    # margin constant used inside the hinge term
num_trial = 100    # number of gradient-descent steps
batch_size = 100   # samples drawn (with replacement) for every step

# Every label the classifier can emit; the position in this list is the
# column index used for the target and prediction vectors.
tags = [
    "part-time-job",
    "full-time-job",
    "hourly-wage",
    "salary",
    "associate-needed",
    "bs-degree-needed",
    "ms-or-phd-needed",
    "licence-needed",
    "1-year-experience-needed",
    "2-4-years-experience-needed",
    "5-plus-years-experience-needed",
    "supervising-job",
]
# Per-tag occurrence counter, one slot per entry of `tags`.
tagcount = [0 for _ in tags]

# Reference points for the wall-clock / CPU-time reports printed later.
st_time = time.perf_counter()
st_proc_time = time.process_time()

# Accumulators filled while the training file is read.
word_set = set()   # every distinct token seen
word_list = []     # one token list per training row
tag_vec = []       # one target vector per training row
# Translation table for sentence_to_words(): '(', ')' and '/' act as word
# separators (become spaces); the remaining punctuation is deleted outright.
_PUNCT_TABLE = str.maketrans("()/", "   ", "!.?,:;*+")


# Preprocess a message and returns a list containing words.
def sentence_to_words(str):
    """Split a message into lower-cased, punctuation-free tokens.

    Args:
        str: The raw message text.  (The parameter name shadows the builtin;
            it is kept so existing callers are unaffected.)

    Returns:
        A list of tokens obtained by deleting ``! . ? , : ; * +``, turning
        ``( ) /`` into spaces, lower-casing and splitting on whitespace.
        An empty or all-punctuation message yields ``[]``.
    """
    # TODO ad-hoc replacements of punctuations
    # One translate() pass replaces the former chain of eleven replace() calls.
    return str.translate(_PUNCT_TABLE).lower().split()
# Read the training set.  Column 0 holds the space-separated tags of a row,
# column 1 its free-text description.
num_tags = len(tags)
with open('indeed_ml_dataset/train.tsv') as file:
    train = csv.reader(file, delimiter='\t')
    for row in train:
        # Skip rows without both columns (e.g. blank lines) and the header.
        if len(row) < 2 or row[0] == 'tags':
            continue

        # Target vector: -1 for every absent tag, a positive weight for every
        # present one.
        tv = [-1.0] * num_tags
        poscnt = 0
        for elem in row[0].split():
            index = tags.index(elem)  # ValueError on an unknown tag
            tagcount[index] += 1
            tv[index] = 1.0  # TODO ad-hoc
            poscnt += 1

        # Positive entries are rescaled to (#tags - k) / k, where k is the
        # number of tags on this row, so rows carrying few tags weigh more.
        # NOTE(review): a row carrying every tag gets weight 0 - confirm that
        # this is intended.
        if poscnt:
            weight = (num_tags - poscnt) / poscnt
            tv = [weight if v > 0 else v for v in tv]
        tag_vec.append(tv)

        wl = sentence_to_words(row[1])
        word_set.update(wl)
        word_list.append(wl)
# Count how often every token occurs over all training rows.
word_freq = {}
words = 0
for entry in word_list:
    words += len(entry)
    for v in entry:
        word_freq[v] = word_freq.get(v, 0) + 1

# Only tokens occurring at least `freq_threshold` times become features.
freq_threshold = 3
many_words = [w for w, count in word_freq.items() if count >= freq_threshold]
n = len(word_list)

# Debug
print('#words = %d' % words)
print('#unique words = %d' % len(word_set))
print('#unique words with occur >= %d = %d' % (freq_threshold, len(many_words)))
print('#rows = %d' % n)
m = len(many_words)

# Token -> feature column index.  NOTE: this name shadows the builtin `dict`;
# it is kept because words_to_occur() and words_to_vec() read it by this name.
dict = {w: i for i, w in enumerate(many_words)}

# Filled in later: per-row feature vectors, and the per-column counter table
# with its running total used for the idf factor.
word_vec = []
idt_tbl = [0.0] * m
idt_sum = 0.0
def words_to_occur(words):
    """Count vocabulary tokens in *words*.

    Args:
        words: Iterable of tokens.

    Returns:
        A list of ``m`` ints; slot ``dict[w]`` holds how many times the
        vocabulary token ``w`` occurs in *words*.  Tokens that are not in the
        module-level ``dict`` mapping are ignored.
    """
    entry = [0] * m
    for w in words:
        column = dict.get(w)
        if column is not None:
            entry[column] += 1
    return entry
def words_to_vec(words):
    """Turn a token list into a tf-idf style feature vector of length ``m``.

    Args:
        words: Iterable of tokens (as produced by ``sentence_to_words``).

    Returns:
        A list of ``m`` floats.  Slot ``i`` is the relative frequency of
        vocabulary token ``i`` among the counted tokens, multiplied by
        ``log(idt_sum / idt_tbl[i])``.  All zeros when no token is counted.
    """
    entry = [0.0] * m
    total = 0.0
    for w in words:
        column = dict.get(w)
        if column is not None:
            entry[column] += 1.0
            # NOTE(review): the source indentation was lost; the normaliser
            # is assumed to count in-vocabulary tokens only - confirm.
            total += 1.0
    if total != 0.0:
        # Zero slots stay zero, so only the touched columns need the log;
        # this also avoids dividing by an idt_tbl slot that is still 0.
        for i, count in enumerate(entry):
            if count:
                entry[i] = count / total * math.log(idt_sum / idt_tbl[i])  # idf
    return entry
# For every vocabulary column, count the training rows that contain it.
# NOTE(review): the source indentation was lost; idt_sum is assumed to be the
# running total of idt_tbl (incremented together with it) - confirm that it
# was not meant to be incremented once per row instead.
for wl in word_list:
    for i, count in enumerate(words_to_occur(wl)):
        if count > 0:
            idt_tbl[i] += 1.0
            idt_sum += 1.0

# Feature vectors for all training rows, in the same order as `word_list`.
word_vec.extend(words_to_vec(wl) for wl in word_list)

# TODO DEBUG
# Show the columns in which the first two rows differ noticeably; skipped
# when fewer than two rows were loaded.
if len(word_vec) >= 2:
    for i, (first, second) in enumerate(zip(word_vec[0], word_vec[1])):
        if abs(first - second) >= 2e-2:
            print("idx = %d, word_vec[0][i] = %f, word_vec[1][i] = %f" %
                  (i, first, second))
# Actual learning, using Tensorflow
# One independent linear classifier per tag: y = x W + b.
num_classes = len(tags)
x = tf.placeholder(tf.float32, [None, m])
W = tf.Variable(tf.zeros([m, num_classes]))
b = tf.Variable(tf.zeros([num_classes]))
y = tf.matmul(x, W) + b
# Targets: negative for an absent tag, a positive weight for a present one.
y_ = tf.placeholder(tf.float32, [None, num_classes])

# Reference: https://github.com/eakbas/tf-svm/blob/master/linear_svm.py
# Optimization.
regularization_loss = 0.5 * tf.reduce_sum(tf.square(W))
hinge_loss = tf.reduce_sum(tf.maximum(tf.zeros(tf.shape(y)),
                                      svm_margin - y * y_))
svm_loss = regularization_loss + svm_c * hinge_loss
train_step = tf.train.GradientDescentOptimizer(0.01).minimize(svm_loss)

# Evaluating Our Model
predicted_class = tf.sign(y)
# The targets are weighted (their magnitude is generally not 1), so compare
# signs: comparing the raw targets with tf.sign(y) could only match when a
# positive weight happened to equal exactly 1.  A row counts as correct only
# when every tag matches.
correct_prediction = tf.reduce_all(
    tf.equal(tf.sign(y_), predicted_class), 1)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float32"))

sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
# Actual training.
for i in range(num_trial):
    # Draw a mini-batch uniformly at random, with replacement.
    batch_rows = [random.randint(0, n - 1) for _ in range(batch_size)]
    tv = [tag_vec[k] for k in batch_rows]
    wv = [word_vec[k] for k in batch_rows]
    feed = {x: wv, y_: tv}

    # Progress report every 20 steps and on the final step, taken before the
    # update so that it reflects the current weights.
    if i % 20 == 0 or i == num_trial - 1:
        # A single run() evaluates all reported tensors in one pass.
        train_accuracy, batch_pred, batch_loss, bias = sess.run(
            [accuracy, predicted_class, svm_loss, b], feed_dict=feed)
        print("step %d, training accuracy %g"%(i, train_accuracy))
        print("predicted_class: %s" % batch_pred)
        print("expected_class: %s" % tv)
        print("svm loss: %g" % batch_loss)
        print("b: %s" % bias)
        print("elapsed: %g sec"%(time.perf_counter() - st_time))
        print("process: %g sec"%(time.process_time() - st_proc_time))

    train_step.run(session=sess, feed_dict=feed)
# Classify every test description and collect one space-separated tag string
# per row, in file order.
answer = []
with open('indeed_ml_dataset/test.tsv') as file:
    test = csv.reader(file, delimiter='\t')
    cnt = 0
    for row in test:
        # csv yields [] for an empty line; treat it as an empty description
        # so that the output stays aligned with the input rows.
        text = row[0] if row else ""
        if text == 'description':  # header row
            continue

        # Tokenise exactly as for the training rows.  Passing the raw string
        # would make words_to_vec() iterate over single characters.
        wv = words_to_vec(sentence_to_words(text))
        y_est = predicted_class.eval(session=sess, feed_dict={x: [wv]})[0]

        if cnt % 800 == 0:
            print("cnt = %d" % cnt)
            print("row[0][0:20] = %s" % text[0:20])
            print("wv[0:20] = %s" % wv[0:20])
            print("class = %s" % y_est)
            print(y.eval(session=sess, feed_dict={x: [wv]})[0])
            print("elapsed: %g sec"%(time.perf_counter() - st_time))
            print("process: %g sec"%(time.process_time() - st_proc_time))

        # A tag is emitted when its score has positive sign.
        answer.append(" ".join(
            tag for tag, sign in zip(tags, y_est) if sign > 0))
        cnt += 1

# Submission file: a header line followed by one prediction per test row.
with open('tags.tsv', 'w') as file:
    file.write("tags\n")
    file.writelines("%s\n" % v for v in answer)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment