# koba-e964/indeed-contest-progress.py

## indeed-contest-progress.py
import csv
import tensorflow as tf
import random
import math
import time

from sklearn.feature_extraction.text import CountVectorizer


# Hyper-parameters of the SVM loss and of the mini-batch training loop.
svm_c = 1         # factor applied to the hinge loss in the total loss
svm_margin = 30   # margin constant inside the hinge loss
num_trial = 100   # number of training steps
batch_size = 100  # rows sampled for each training step

# Label names; a tag's position here is its column in every label vector.
tags = [
    "part-time-job",
    "full-time-job",
    "hourly-wage",
    "salary",
    "associate-needed",
    "bs-degree-needed",
    "ms-or-phd-needed",
    "licence-needed",
    "1-year-experience-needed",
    "2-4-years-experience-needed",
    "5-plus-years-experience-needed",
    "supervising-job",
]

# Number of training rows carrying each tag, in the order of `tags`.
tagcount = [0 for _ in tags]

# Reference points for the elapsed wall-clock / CPU time reports.
st_time, st_proc_time = time.perf_counter(), time.process_time()

# Containers filled while the training file is read.
word_set = set()  # every distinct token
word_list = []    # token list of each row
tag_vec = []      # label vector of each row

# Preprocesses a message and returns the list of words it contains.
def sentence_to_words(str):
    """Normalise one piece of text and split it into lower-case tokens.

    The characters ``( ) /`` are turned into spaces, so they separate
    tokens; the characters ``! . ? , : ; * +`` are dropped outright. The
    remaining text is lower-cased and split on whitespace.

    Returns the resulting list of tokens (empty for blank input).
    """
    # TODO ad-hoc replacements of punctuations
    table = {ord(ch): " " for ch in "()/"}
    table.update((ord(ch), None) for ch in "!.?,:;*+")
    return str.translate(table).lower().split()

# Read the training rows: column 0 holds the space-separated tags of a job,
# column 1 its description.
# newline='' is what the csv module asks for when it is handed a file object.
with open('indeed_ml_dataset/train.tsv', newline='') as file:
    train = csv.reader(file, delimiter='\t')
    num_tags = len(tags)

    for row in train:
        # Blank lines carry no data; the header line starts with 'tags'.
        if not row or row[0] == 'tags':
            continue
        tv = [-1.0] * num_tags
        poscnt = 0
        for elem in row[0].split():
            index = tags.index(elem)  # ValueError for a tag missing in `tags`
            tagcount[index] += 1
            tv[index] = 1.0 # TODO ad-hoc
            poscnt += 1
        # Present tags get the weight (#tags - #present) / #present instead
        # of 1.0; absent tags stay at -1.0.
        for i in range(num_tags):
            if tv[i] > 0:
                tv[i] = float(num_tags - poscnt) / poscnt
        tag_vec.append(tv)
        wl = sentence_to_words(row[1])
        word_set.update(wl)
        word_list.append(wl)

# Count how often every token occurs over all training rows.
word_freq = {}
words = 0
for tokens in word_list:
    words += len(tokens)
    for token in tokens:
        word_freq[token] = word_freq.get(token, 0) + 1

# The vocabulary keeps only tokens seen at least `freq_threshold` times.
freq_threshold = 3
many_words = [token for token, count in word_freq.items()
              if count >= freq_threshold]

n = len(word_list)   # number of training rows
m = len(many_words)  # vocabulary size = length of every feature vector

# Debug
print('#words = %d' % words)
print('#unique words = %d' % len(word_set))
print('#unique words with occur >= %d = %d' % (freq_threshold, len(many_words)))
print('#rows = %d' % n)

# Token -> column index of that token in the feature vectors.
dict = {token: column for column, token in enumerate(many_words)}
word_vec = []         # one feature vector per training row, filled later
idt_tbl = [0.0] * m   # per-column accumulator, starts at zero
idt_sum = 0.0         # scalar accumulator, starts at zero
idt_sum = 0.0
def words_to_occur(words):
    """Count how often each vocabulary column occurs in ``words``.

    Tokens that are not keys of the module-level ``dict`` lookup are
    ignored. Returns a list of ``m`` integer counts.
    """
    counts = [0] * m
    for token in words:
        column = dict.get(token)
        if column is not None:
            counts[column] += 1
    return counts

def words_to_vec(words):
    """Turn a sequence of tokens into a weighted feature vector.

    For every vocabulary column that occurs in ``words`` the value is
    ``count / len(words) * log(idt_sum / idt_tbl[column])`` (term frequency
    times the idf factor); every other column stays ``0.0``. Tokens outside
    the vocabulary still count towards ``len(words)``.

    Args:
        words: iterable of tokens, e.g. the result of ``sentence_to_words``.

    Returns:
        A list of ``m`` floats; all zeros when ``words`` is empty.

    Raises:
        ZeroDivisionError: if a token of ``words`` is in the vocabulary but
            its ``idt_tbl`` entry is still zero.
    """
    entry = [0.0] * m
    total = 0.0
    touched = set()
    for w in words:
        column = dict.get(w)
        if column is not None:
            entry[column] += 1.0
            touched.add(column)
        total += 1.0
    # Columns that never occurred stay 0.0 whatever their weight is, so only
    # the touched ones need the division and the logarithm.
    for column in touched:
        entry[column] /= total
        entry[column] *= math.log(idt_sum / idt_tbl[column]) # idf
    return entry

# Document frequencies: idt_tbl[i] ends up as the number of training rows
# containing vocabulary word i, idt_sum as the number of training rows.
for wl in word_list:
    # set() makes every distinct token of a row count once.
    for w in set(wl):
        if w in dict:
            idt_tbl[dict[w]] += 1.0
    idt_sum += 1.0

# Feature vector of every training row (needs the finished tables above).
for wl in word_list:
    word_vec.append(words_to_vec(wl))

# TODO DEBUG
# Show the columns in which the first two rows differ noticeably; skipped
# when there are fewer than two rows to compare.
if len(word_vec) >= 2:
    for i in range(m):
        if abs(word_vec[0][i] - word_vec[1][i]) >= 2e-2:
            print("idx = %d, word_vec[0][i] = %f, word_vec[1][i] = %f" %
                  (i, word_vec[0][i], word_vec[1][i]))


# Actual learning, using Tensorflow

# One output column per tag.
num_classes = len(tags)

# Linear model: y = x W + b, one score per tag and row.
x = tf.placeholder(tf.float32, [None, m])
W = tf.Variable(tf.zeros([m, num_classes]))
b = tf.Variable(tf.zeros([num_classes]))

y = tf.matmul(x, W) + b

# Targets: -1.0 for an absent tag, a positive weight for a present one.
y_ = tf.placeholder(tf.float32, [None, num_classes])

# Reference: https://github.com/eakbas/tf-svm/blob/master/linear_svm.py
# Optimization.
regularization_loss = 0.5 * tf.reduce_sum(tf.square(W))
hinge_loss = tf.reduce_sum(tf.maximum(tf.zeros(tf.shape(y)),
                                      svm_margin - y * y_))
svm_loss = regularization_loss + svm_c * hinge_loss
train_step = tf.train.GradientDescentOptimizer(0.01).minimize(svm_loss)

# Evaluating Our Model
predicted_class = tf.sign(y)
# The targets are weighted, so only their sign says whether a tag is present;
# comparing the raw targets with sign(y) would reject every row whose
# positive weight is not exactly 1.0.
correct_prediction = tf.reduce_all(
    tf.equal(tf.sign(y_), predicted_class), 1)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float32"))

sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)


# Actual training.
for i in range(num_trial):
    # Draw batch_size row indices independently (rows may repeat).
    batch = [random.randint(0, n - 1) for _ in range(batch_size)]
    tv = [tag_vec[idx] for idx in batch]
    wv = [word_vec[idx] for idx in batch]
    feed = {x: wv, y_: tv}
    if i % 20 == 0 or i == num_trial - 1:
        # Progress report, taken before this step's update. All three
        # diagnostics come from one evaluation of the graph on the batch.
        train_accuracy, batch_prediction, batch_loss = sess.run(
            [accuracy, predicted_class, svm_loss], feed_dict=feed)
        print("step %d, training accuracy %g"%(i, train_accuracy))
        print("predicted_class: %s" % batch_prediction)
        print("expected_class: %s" % tv)
        print("svm loss: %g" % batch_loss)
        print("b: %s" % b.eval(session = sess))
        print("elapsed: %g sec"%(time.perf_counter() - st_time))
        print("process: %g sec"%(time.process_time() - st_proc_time))
    train_step.run(session=sess, feed_dict=feed)

# Predict the tags of every description in the test file; `answer` receives
# one space-separated tag string per row.
answer = []
with open('indeed_ml_dataset/test.tsv', newline='') as file:
    test = csv.reader(file, delimiter = '\t')
    cnt = 0

    for row in test:
        # Header line.
        if row[0] == 'description':
            continue
        # Tokenise exactly like the training rows. Handing the raw string to
        # words_to_vec would make it iterate over single characters.
        wv = words_to_vec(sentence_to_words(row[0]))
        y_est = predicted_class.eval(session = sess, feed_dict = {x: [wv]})[0]
        if cnt % 800 == 0:
            # Periodic debug / progress output.
            print("cnt = %d" % cnt)
            print("row[0][0:20] = %s" % row[0][0:20])
            print("wv[0:20] = %s" % wv[0:20])
            print("class = %s" % y_est)
            print(y.eval(session = sess, feed_dict = {x: [wv]})[0])
            print("elapsed: %g sec"%(time.perf_counter() - st_time))
            print("process: %g sec"%(time.process_time() - st_proc_time))
        # Report every tag whose predicted sign is positive, in `tags` order.
        res = " ".join(tag for tag, sign in zip(tags, y_est) if sign > 0)
        answer.append(res)
        cnt += 1

# Write the header line followed by one line per entry of `answer`.
with open('tags.tsv', 'w') as file:
    file.writelines(["tags\n"] + ["%s\n" % v for v in answer])
	import csv
	import tensorflow as tf
	import random
	import math
	import time

	from sklearn.feature_extraction.text import CountVectorizer


	# Hyper-parameters of the SVM loss and of the mini-batch training loop.
	svm_c = 1         # factor applied to the hinge loss in the total loss
	svm_margin = 30   # margin constant inside the hinge loss
	num_trial = 100   # number of training steps
	batch_size = 100  # rows sampled for each training step

	# Label names; a tag's position here is its column in every label vector.
	tags = [
	    "part-time-job",
	    "full-time-job",
	    "hourly-wage",
	    "salary",
	    "associate-needed",
	    "bs-degree-needed",
	    "ms-or-phd-needed",
	    "licence-needed",
	    "1-year-experience-needed",
	    "2-4-years-experience-needed",
	    "5-plus-years-experience-needed",
	    "supervising-job",
	]

	# Number of training rows carrying each tag, in the order of `tags`.
	tagcount = [0 for _ in tags]

	# Reference points for the elapsed wall-clock / CPU time reports.
	st_time, st_proc_time = time.perf_counter(), time.process_time()

	# Containers filled while the training file is read.
	word_set = set()  # every distinct token
	word_list = []    # token list of each row
	tag_vec = []      # label vector of each row

	# Preprocesses a message and returns the list of words it contains.
	def sentence_to_words(str):
	    """Normalise one piece of text and split it into lower-case tokens.

	    The characters ``( ) /`` are turned into spaces, so they separate
	    tokens; the characters ``! . ? , : ; * +`` are dropped outright. The
	    remaining text is lower-cased and split on whitespace.

	    Returns the resulting list of tokens (empty for blank input).
	    """
	    # TODO ad-hoc replacements of punctuations
	    table = {ord(ch): " " for ch in "()/"}
	    table.update((ord(ch), None) for ch in "!.?,:;*+")
	    return str.translate(table).lower().split()

	# Read the training rows: column 0 holds the space-separated tags of a job,
	# column 1 its description.
	# newline='' is what the csv module asks for when it is handed a file object.
	with open('indeed_ml_dataset/train.tsv', newline='') as file:
	    train = csv.reader(file, delimiter='\t')
	    num_tags = len(tags)

	    for row in train:
	        # Blank lines carry no data; the header line starts with 'tags'.
	        if not row or row[0] == 'tags':
	            continue
	        tv = [-1.0] * num_tags
	        poscnt = 0
	        for elem in row[0].split():
	            index = tags.index(elem)  # ValueError for a tag missing in `tags`
	            tagcount[index] += 1
	            tv[index] = 1.0 # TODO ad-hoc
	            poscnt += 1
	        # Present tags get the weight (#tags - #present) / #present instead
	        # of 1.0; absent tags stay at -1.0.
	        for i in range(num_tags):
	            if tv[i] > 0:
	                tv[i] = float(num_tags - poscnt) / poscnt
	        tag_vec.append(tv)
	        wl = sentence_to_words(row[1])
	        word_set.update(wl)
	        word_list.append(wl)

	# Count how often every token occurs over all training rows.
	word_freq = {}
	words = 0
	for tokens in word_list:
	    words += len(tokens)
	    for token in tokens:
	        word_freq[token] = word_freq.get(token, 0) + 1

	# The vocabulary keeps only tokens seen at least `freq_threshold` times.
	freq_threshold = 3
	many_words = [token for token, count in word_freq.items()
	              if count >= freq_threshold]

	n = len(word_list)   # number of training rows
	m = len(many_words)  # vocabulary size = length of every feature vector

	# Debug
	print('#words = %d' % words)
	print('#unique words = %d' % len(word_set))
	print('#unique words with occur >= %d = %d' % (freq_threshold, len(many_words)))
	print('#rows = %d' % n)

	# Token -> column index of that token in the feature vectors.
	dict = {token: column for column, token in enumerate(many_words)}
	word_vec = []         # one feature vector per training row, filled later
	idt_tbl = [0.0] * m   # per-column accumulator, starts at zero
	idt_sum = 0.0         # scalar accumulator, starts at zero
	def words_to_occur(words):
	    """Count how often each vocabulary column occurs in ``words``.

	    Tokens that are not keys of the module-level ``dict`` lookup are
	    ignored. Returns a list of ``m`` integer counts.
	    """
	    counts = [0] * m
	    for token in words:
	        column = dict.get(token)
	        if column is not None:
	            counts[column] += 1
	    return counts

	def words_to_vec(words):
	    """Turn a sequence of tokens into a weighted feature vector.

	    For every vocabulary column that occurs in ``words`` the value is
	    ``count / len(words) * log(idt_sum / idt_tbl[column])`` (term frequency
	    times the idf factor); every other column stays ``0.0``. Tokens outside
	    the vocabulary still count towards ``len(words)``.

	    Args:
	        words: iterable of tokens, e.g. the result of ``sentence_to_words``.

	    Returns:
	        A list of ``m`` floats; all zeros when ``words`` is empty.

	    Raises:
	        ZeroDivisionError: if a token of ``words`` is in the vocabulary but
	            its ``idt_tbl`` entry is still zero.
	    """
	    entry = [0.0] * m
	    total = 0.0
	    touched = set()
	    for w in words:
	        column = dict.get(w)
	        if column is not None:
	            entry[column] += 1.0
	            touched.add(column)
	        total += 1.0
	    # Columns that never occurred stay 0.0 whatever their weight is, so only
	    # the touched ones need the division and the logarithm.
	    for column in touched:
	        entry[column] /= total
	        entry[column] *= math.log(idt_sum / idt_tbl[column]) # idf
	    return entry

	# Document frequencies: idt_tbl[i] ends up as the number of training rows
	# containing vocabulary word i, idt_sum as the number of training rows.
	for wl in word_list:
	    # set() makes every distinct token of a row count once.
	    for w in set(wl):
	        if w in dict:
	            idt_tbl[dict[w]] += 1.0
	    idt_sum += 1.0

	# Feature vector of every training row (needs the finished tables above).
	for wl in word_list:
	    word_vec.append(words_to_vec(wl))

	# TODO DEBUG
	# Show the columns in which the first two rows differ noticeably; skipped
	# when there are fewer than two rows to compare.
	if len(word_vec) >= 2:
	    for i in range(m):
	        if abs(word_vec[0][i] - word_vec[1][i]) >= 2e-2:
	            print("idx = %d, word_vec[0][i] = %f, word_vec[1][i] = %f" %
	                  (i, word_vec[0][i], word_vec[1][i]))


	# Actual learning, using Tensorflow

	# One output column per tag.
	num_classes = len(tags)

	# Linear model: y = x W + b, one score per tag and row.
	x = tf.placeholder(tf.float32, [None, m])
	W = tf.Variable(tf.zeros([m, num_classes]))
	b = tf.Variable(tf.zeros([num_classes]))

	y = tf.matmul(x, W) + b

	# Targets: -1.0 for an absent tag, a positive weight for a present one.
	y_ = tf.placeholder(tf.float32, [None, num_classes])

	# Reference: https://github.com/eakbas/tf-svm/blob/master/linear_svm.py
	# Optimization.
	regularization_loss = 0.5 * tf.reduce_sum(tf.square(W))
	hinge_loss = tf.reduce_sum(tf.maximum(tf.zeros(tf.shape(y)),
	                                      svm_margin - y * y_))
	svm_loss = regularization_loss + svm_c * hinge_loss
	train_step = tf.train.GradientDescentOptimizer(0.01).minimize(svm_loss)

	# Evaluating Our Model
	predicted_class = tf.sign(y)
	# The targets are weighted, so only their sign says whether a tag is present;
	# comparing the raw targets with sign(y) would reject every row whose
	# positive weight is not exactly 1.0.
	correct_prediction = tf.reduce_all(
	    tf.equal(tf.sign(y_), predicted_class), 1)
	accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float32"))

	sess = tf.Session()
	init = tf.global_variables_initializer()
	sess.run(init)


	# Actual training.
	for i in range(num_trial):
	    # Draw batch_size row indices independently (rows may repeat).
	    batch = [random.randint(0, n - 1) for _ in range(batch_size)]
	    tv = [tag_vec[idx] for idx in batch]
	    wv = [word_vec[idx] for idx in batch]
	    feed = {x: wv, y_: tv}
	    if i % 20 == 0 or i == num_trial - 1:
	        # Progress report, taken before this step's update. All three
	        # diagnostics come from one evaluation of the graph on the batch.
	        train_accuracy, batch_prediction, batch_loss = sess.run(
	            [accuracy, predicted_class, svm_loss], feed_dict=feed)
	        print("step %d, training accuracy %g"%(i, train_accuracy))
	        print("predicted_class: %s" % batch_prediction)
	        print("expected_class: %s" % tv)
	        print("svm loss: %g" % batch_loss)
	        print("b: %s" % b.eval(session = sess))
	        print("elapsed: %g sec"%(time.perf_counter() - st_time))
	        print("process: %g sec"%(time.process_time() - st_proc_time))
	    train_step.run(session=sess, feed_dict=feed)

	# Predict the tags of every description in the test file; `answer` receives
	# one space-separated tag string per row.
	answer = []
	with open('indeed_ml_dataset/test.tsv', newline='') as file:
	    test = csv.reader(file, delimiter = '\t')
	    cnt = 0

	    for row in test:
	        # Header line.
	        if row[0] == 'description':
	            continue
	        # Tokenise exactly like the training rows. Handing the raw string to
	        # words_to_vec would make it iterate over single characters.
	        wv = words_to_vec(sentence_to_words(row[0]))
	        y_est = predicted_class.eval(session = sess, feed_dict = {x: [wv]})[0]
	        if cnt % 800 == 0:
	            # Periodic debug / progress output.
	            print("cnt = %d" % cnt)
	            print("row[0][0:20] = %s" % row[0][0:20])
	            print("wv[0:20] = %s" % wv[0:20])
	            print("class = %s" % y_est)
	            print(y.eval(session = sess, feed_dict = {x: [wv]})[0])
	            print("elapsed: %g sec"%(time.perf_counter() - st_time))
	            print("process: %g sec"%(time.process_time() - st_proc_time))
	        # Report every tag whose predicted sign is positive, in `tags` order.
	        res = " ".join(tag for tag, sign in zip(tags, y_est) if sign > 0)
	        answer.append(res)
	        cnt += 1

	# Write the header line followed by one line per entry of `answer`.
	with open('tags.tsv', 'w') as file:
	    file.writelines(["tags\n"] + ["%s\n" % v for v in answer])