Last active
April 13, 2017 04:19
-
-
Save koba-e964/b70d9b8de4dc0bcd71c32baee8d6726b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import tensorflow as tf | |
import random | |
import math | |
import time | |
from sklearn.feature_extraction.text import CountVectorizer | |
# Parameters
svm_c = 1          # multiplier applied to the hinge loss in the SVM objective
svm_margin = 30    # margin term used inside the hinge loss
num_trial = 100    # number of training steps
batch_size = 100   # rows sampled per training step

# Known tags; a tag's position in this list is its column in each label vector.
tags = [
    "part-time-job",
    "full-time-job",
    "hourly-wage",
    "salary",
    "associate-needed",
    "bs-degree-needed",
    "ms-or-phd-needed",
    "licence-needed",
    "1-year-experience-needed",
    "2-4-years-experience-needed",
    "5-plus-years-experience-needed",
    "supervising-job",
]
tagcount = [0 for _ in tags]  # one occurrence counter per tag

# Starting time (wall clock first, then CPU time) for the progress reports.
st_time = time.perf_counter()
st_proc_time = time.process_time()

# Accumulators filled while the training file is read.
word_set = set()
word_list = []
tag_vec = []
def sentence_to_words(str):
    """Preprocess a message and return the list of words it contains.

    The characters ``(``, ``)`` and ``/`` are replaced by a space (so they
    act as word separators); ``! . ? , : ; * +`` are removed outright. The
    text is then lower-cased and split on whitespace.

    The parameter keeps its original name ``str`` (shadowing the builtin
    inside this function) so that existing callers are unaffected.
    """
    # TODO ad-hoc replacements of punctuations
    table = {ord(ch): None for ch in "!.?,:;*+"}
    table.update({ord(ch): " " for ch in "()/"})
    cleaned = str.translate(table)
    return cleaned.lower().split()
# Read the training file. Column 0 of each row is a space-separated list of
# tags and column 1 is the text handed to sentence_to_words().
# The csv module documentation asks for files to be opened with newline=''.
with open('indeed_ml_dataset/train.tsv', newline='') as file:
    train = csv.reader(file, delimiter='\t')
    num_tags = len(tags)
    for row in train:
        # csv.reader yields [] for a blank line; skip those together with
        # any row whose first field is the literal 'tags' (the header).
        if not row or row[0] == 'tags':
            continue
        # Label vector: -1.0 for absent tags, a positive weight for present ones.
        tv = [-1.0] * num_tags
        poscnt = 0
        for elem in row[0].split():
            # Raises ValueError if the file contains a tag not listed in `tags`.
            index = tags.index(elem)
            tagcount[index] += 1
            tv[index] = 1.0  # TODO ad-hoc
            poscnt += 1
        # Re-weight positives by (#tags - #positives) / #positives.
        # No division happens when poscnt == 0 because no entry is positive.
        for i in range(num_tags):
            if tv[i] > 0:
                tv[i] = (float(num_tags) - poscnt) / poscnt
        tag_vec.append(tv)
        wl = sentence_to_words(row[1])
        word_set.update(wl)
        word_list.append(wl)
# Count every word occurrence over all training rows.
word_freq = {}
words = 0
for tokens in word_list:
    words += len(tokens)
    for token in tokens:
        word_freq[token] = word_freq.get(token, 0) + 1

# Keep only the words that occur at least `freq_threshold` times.
freq_threshold = 3
many_words = [word for word, count in word_freq.items()
              if count >= freq_threshold]
n = len(word_list)

# Debug
print('#words = %d' % words)
print('#unique words = %d' % len(word_set))
print('#unique words with occur >= %d = %d' % (freq_threshold, len(many_words)))
print('#rows = %d' % n)

m = len(many_words)
# Word -> feature-column index. The name `dict` shadows the builtin, but the
# helper functions in this file look the mapping up under that name.
dict = {word: column for column, word in enumerate(many_words)}

# Filled in later: per-row feature vectors and the per-word count tables.
word_vec = []
idt_tbl = [0.0] * m
idt_sum = 0.0
def words_to_occur(words):
    """Count how often each vocabulary word appears in `words`.

    Returns a list of `m` integers indexed through the module-level
    word -> index mapping `dict`; words absent from that mapping are ignored.
    """
    counts = [0] * m
    for w in words:
        column = dict.get(w)
        if column is None:
            continue
        counts[column] += 1
    return counts
def words_to_vec(words):
    """Turn a list of words into a weighted term-frequency vector.

    Each of the `m` slots holds (occurrences of that vocabulary word) /
    (number of in-vocabulary words in `words`), multiplied by
    log(idt_sum / idt_tbl[i]). If no word of `words` is in the vocabulary
    the all-zero vector is returned unscaled.

    NOTE(review): as accumulated at module level, `idt_sum` is the total of
    all per-word row counts rather than the number of rows -- confirm this
    is the intended idf variant.
    """
    vec = [0.0] * m
    total = 0.0
    for w in words:
        column = dict.get(w)
        if column is None:
            continue
        vec[column] += 1.0
        total += 1.0
    if total == 0.0:
        return vec
    for i in range(m):
        vec[i] = vec[i] / total * math.log(idt_sum / idt_tbl[i])  # idf
    return vec
# For every vocabulary word, count the training rows that contain it
# (idt_tbl) and keep the running total of all those counts (idt_sum).
for wl in word_list:
    occur = words_to_occur(wl)
    for i in range(m):
        if occur[i] > 0:
            idt_tbl[i] += 1.0
            idt_sum += 1.0

# Build the weighted feature vector of every training row. This must run
# after the tables above are complete because words_to_vec() reads them.
for wl in word_list:
    word_vec.append(words_to_vec(wl))

# TODO DEBUG
# Print the components where the first two rows differ noticeably. Guarded
# so a data set with fewer than two rows does not raise IndexError.
if len(word_vec) >= 2:
    for i in range(m):
        if abs(word_vec[0][i] - word_vec[1][i]) >= 2e-2:
            print("idx = %d, word_vec[0][i] = %f, word_vec[1][i] = %f" %
                  (i, word_vec[0][i], word_vec[1][i]))
# Actual learning, using Tensorflow
# One linear scorer per tag: y = x W + b, with one output column per tag.
num_tags = len(tags)
x = tf.placeholder(tf.float32, [None, m])
W = tf.Variable(tf.zeros([m, num_tags]))
b = tf.Variable(tf.zeros([num_tags]))
y = tf.matmul(x, W) + b
y_ = tf.placeholder(tf.float32, [None, num_tags])

# Reference: https://github.com/eakbas/tf-svm/blob/master/linear_svm.py
# Optimization.
regularization_loss = 0.5 * tf.reduce_sum(tf.square(W))
hinge_loss = tf.reduce_sum(tf.maximum(tf.zeros(tf.shape(y)),
                                      svm_margin - y * y_))
svm_loss = regularization_loss + svm_c * hinge_loss
train_step = tf.train.GradientDescentOptimizer(0.01).minimize(svm_loss)

# Evaluating Our Model
predicted_class = tf.sign(y)
# The fed labels are weighted (positive entries are scaled, negatives are
# -1.0), so compare the label's sign with the predicted sign; comparing the
# raw weights would only match when a positive weight happens to equal 1.0.
# A row counts as correct only if every tag matches.
correct_prediction = tf.reduce_all(
    tf.equal(tf.sign(y_), predicted_class), 1)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float32"))

sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
# Actual training.
for i in range(num_trial):
    # Sample a mini-batch of row indices uniformly, with replacement.
    picked = [random.randint(0, n - 1) for _ in range(batch_size)]
    tv = [tag_vec[k] for k in picked]
    wv = [word_vec[k] for k in picked]
    feed = {x: wv, y_: tv}

    # Report on every 20th step and on the final one, using the batch that
    # is about to be trained on (i.e. before this step's update).
    is_report_step = (i % 20 == 0) or (i == num_trial - 1)
    if is_report_step:
        train_accuracy = sess.run(accuracy, feed_dict=feed)
        print("step %d, training accuracy %g" % (i, train_accuracy))
        print("predicted_class: %s" % sess.run(predicted_class, feed_dict=feed))
        print("expected_class: %s" % tv)
        print("svm loss: %g" % sess.run(svm_loss, feed_dict=feed))
        print("b: %s" % sess.run(b))
        print("elapsed: %g sec" % (time.perf_counter() - st_time))
        print("process: %g sec" % (time.process_time() - st_proc_time))

    # One gradient-descent step on this batch.
    sess.run(train_step, feed_dict=feed)
# Predict tags for every row of the test file and write them to tags.tsv.
answer = []
# The csv module documentation asks for files to be opened with newline=''.
with open('indeed_ml_dataset/test.tsv', newline='') as file:
    test = csv.reader(file, delimiter='\t')
    cnt = 0
    for row in test:
        # Skip any row whose first field is the literal 'description' (header).
        if row[0] == 'description':
            continue
        # Tokenise exactly as during training. Passing the raw string to
        # words_to_vec() would make it iterate over single characters.
        wv = words_to_vec(sentence_to_words(row[0]))
        y_est = predicted_class.eval(session=sess, feed_dict={x: [wv]})[0]
        # Periodic progress / debug output.
        if cnt % 800 == 0:
            print("cnt = %d" % cnt)
            print("row[0][0:20] = %s" % row[0][0:20])
            print("wv[0:20] = %s" % wv[0:20])
            print("class = %s" % y_est)
            print(y.eval(session=sess, feed_dict={x: [wv]})[0])
            print("elapsed: %g sec" % (time.perf_counter() - st_time))
            print("process: %g sec" % (time.process_time() - st_proc_time))
        # A tag is emitted when its predicted sign is positive; tags are
        # joined with single spaces in the order of `tags`.
        answer.append(" ".join(
            tag for tag, sign in zip(tags, y_est) if sign > 0))
        cnt += 1

# One line per test row, preceded by the "tags" header line.
with open('tags.tsv', 'w') as file:
    file.write("tags\n")
    file.writelines("%s\n" % v for v in answer)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment