# word2vec for UTF-8 text: skip-gram with negative sampling, cosine similarity/analogy, t-SNE plot (gist last active June 30, 2017 05:09)
#-*- encoding: utf-8 -*-
from __future__ import division
import math
import struct
import numpy as np
from multiprocessing import Pool, Value, Array
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import scipy
import codecs
import argparse
# Command-line interface: a single positional argument naming the input
# text file.  NOTE: parsing runs at module level, so importing this module
# reads sys.argv.
parser = argparse.ArgumentParser()
parser.add_argument('input', help='the file of input text')
args = parser.parse_args()
class ivoc(object):
    """Vocabulary entry: a word together with its occurrence count.

    Attributes:
        word: the token text this entry stands for.
        freq: occurrence counter; starts at 0 and is incremented by callers.
    """

    def __init__(self, word):
        self.word = word
        self.freq = 0

    def __repr__(self):
        return 'ivoc(word=%r, freq=%r)' % (self.word, self.freq)
#process the vocabulary from the data
class Vocs(object):
    """Vocabulary and index-encoded corpus read from a UTF-8 text file.

    The text is lower-cased, split into lines on newline characters and into
    tokens on single spaces.  Words seen fewer than ``min_fq`` times are
    folded into one '<unk>' entry, and the vocabulary is sorted by
    descending frequency.

    Attributes:
        voc_list: list of ``ivoc`` entries (including '<unk>'), most
            frequent first.
        dict: maps word -> position in ``voc_list``.
        rdict: maps position in ``voc_list`` -> word.
        word_count: number of non-blank tokens read from the file.
        line_list: the corpus as a list of lines, each a list of positions
            into ``voc_list``.

    NOTE(review): several statements of this class were missing from the
    reviewed text (the file read/decode call, the ``continue`` after the
    blank-token test, the ``append`` calls, the ``else`` branches and the
    call to ``fq_sort``).  They were reconstructed from how the rest of the
    file uses this class -- confirm against the original source.
    """

    def __init__(self, input_file, min_fq):
        """Read ``input_file`` and build the frequency-filtered vocabulary.

        Args:
            input_file: path of a UTF-8 encoded text file.
            min_fq: words occurring fewer than this many times become '<unk>'.
        """
        # codecs.open decodes while reading; ``with`` closes the handle.
        with codecs.open(input_file, 'r', 'utf-8') as source:
            lines = source.read().lower().split('\n')

        voc_list = []   # ivoc entries in first-seen order
        voc_dict = {}   # word -> index into voc_list
        voc_rdict = {}  # index into voc_list -> word
        line_list = []  # corpus as lists of word indices
        word_count = 0
        for line in lines:
            word_list = []
            for word in line.split(' '):
                # The original test ``word == (' ' or '')`` only compared
                # against ' ', so empty tokens (from repeated spaces or
                # blank lines) ended up as vocabulary words.
                if not word.strip():
                    continue
                word_count += 1
                if word not in voc_dict:
                    new_index = len(voc_list)
                    voc_dict[word] = new_index
                    voc_rdict[new_index] = word
                    voc_list.append(ivoc(word))
                voc_list[voc_dict[word]].freq += 1
                word_list.append(voc_dict[word])
            line_list.append(word_list)

        self.voc_list = voc_list
        self.dict = voc_dict
        self.rdict = voc_rdict
        self.word_count = word_count
        self.line_list = line_list
        # The rest of the file expects a '<unk>' entry in ``dict``, which
        # only fq_sort creates.
        self.fq_sort(min_fq)

    def fq_sort(self, min_fq):
        """Fold rare words into '<unk>' and sort entries by frequency.

        Rebuilds ``voc_list``, ``dict``, ``rdict`` and re-encodes
        ``line_list`` with the new positions.

        Args:
            min_fq: entries with ``freq < min_fq`` are merged into '<unk>'.
        """
        unknown = ivoc('<unk>')
        kept = [unknown]
        rare_indices = set()  # old indices that now map to '<unk>'
        for entry in self.voc_list:
            # A pre-existing '<unk>' entry (literal token in the corpus, or
            # a previous fq_sort call) is merged so it never appears twice.
            if entry.freq < min_fq or entry.word == '<unk>':
                unknown.freq += entry.freq
                rare_indices.add(self.dict[entry.word])
            else:
                kept.append(entry)

        # Most frequent first; the sort is stable, so ties keep their order.
        kept.sort(key=lambda voc: voc.freq, reverse=True)

        new_dict = {}
        new_rdict = {}
        for position, entry in enumerate(kept):
            new_dict[entry.word] = position
            new_rdict[position] = entry.word

        # Translate every stored index from the old numbering to the new.
        unk_position = new_dict['<unk>']
        new_lines = []
        for line in self.line_list:
            new_lines.append([
                unk_position if old in rare_indices
                else new_dict[self.rdict[old]]
                for old in line
            ])

        self.line_list = new_lines
        self.voc_list = kept
        self.dict = new_dict
        self.rdict = new_rdict

    def index(self, voc):
        """Return the position of word ``voc``, or that of '<unk>' if unknown.

        The original tested ``voc in self.voc_list``, which compares a
        string against ``ivoc`` objects and therefore never matched.
        """
        if voc in self.dict:
            return self.dict[voc]
        return self.dict['<unk>']
#unigramtable for negative sampling, generated for vocs
class unigramtable(object):
    """Lookup table for drawing negative samples from a smoothed unigram
    distribution.

    Each vocabulary entry receives a share of the table proportional to
    ``freq ** power``; drawing a uniformly random slot then yields a word
    index with that probability.

    Attributes:
        table: 1-D ``np.uint32`` array of word indices
            (``vocs.dict[word]``).
    """

    def __init__(self, vocs, table_size=int(1e7), power=0.75):
        """Build the table.

        Args:
            vocs: object exposing ``voc_list`` (entries with ``word`` and
                ``freq``) and ``dict`` (word -> index).
            table_size: number of slots; defaults to the previously
                hard-coded 1e7.
            power: exponent applied to the raw frequencies; defaults to the
                previously hard-coded 3/4.
        """
        weights = [math.pow(voc.freq, power) for voc in vocs.voc_list]
        norm = sum(weights)  # normalising constant
        table = np.zeros(table_size, dtype=np.uint32)

        # With no mass at all (every freq is 0) the table stays zero-filled
        # instead of dividing by zero.
        if norm > 0:
            cumulative = 0.0  # cumulative probability of entries so far
            slot = 0
            for voc, weight in zip(vocs.voc_list, weights):
                cumulative += weight / norm
                # Give this word every slot up to its cumulative share.
                while (slot < table_size
                       and float(slot) / table_size < cumulative):
                    table[slot] = vocs.dict[voc.word]
                    slot += 1
        self.table = table

    def neg_sample(self, neg_num):
        """Return ``neg_num`` word indices drawn at random from the table."""
        picks = np.random.randint(0, len(self.table), size=neg_num)
        return [self.table[i] for i in picks]
#set initial weight for hidden layer
def init_w(dim, voc_size):
    """Create the initial weight matrices of the network.

    Args:
        dim: embedding dimensionality (must be non-zero).
        voc_size: number of vocabulary entries.

    Returns:
        Tuple ``(syn0, syn1)`` of ``(voc_size, dim)`` float arrays: ``syn0``
        drawn uniformly from ``[-0.5/dim, 0.5/dim)`` and ``syn1`` all zeros.
    """
    bound = 0.5 / dim
    syn0 = np.random.uniform(-bound, bound, (voc_size, dim))
    syn1 = np.zeros((voc_size, dim))
    return (syn0, syn1)
def sigm(z):
    """Return the logistic sigmoid of ``z``, saturated outside ``[-6, 6]``.

    Values above 6 return exactly 1.0 and values below -6 exactly 0.0.
    The lower bound used to be -1, which forced e.g. ``sigm(-2)`` to 0.0
    although the true value is about 0.12.
    """
    if z > 6:
        return 1.0
    if z < -6:
        return 0.0
    return 1.0 / (1 + math.exp(-z))
def train(input_file, min_freq=3, neg_num=20, dim=100, alpha=0.05,
          window_size=10):
    """Train skip-gram embeddings with negative sampling and save them.

    The vectors are written next to the input as ``<name>_vec.txt`` (the
    suffix ``_vec`` is inserted before the first '.txt', or appended when
    the name contains no '.txt').  The file starts with a
    ``"<rows> <dim>"`` header followed by one ``"<word> <v1> ... <vn>"``
    line per vocabulary entry.

    Args:
        input_file: path of the UTF-8 training text.
        min_freq: words rarer than this are folded into '<unk>'.
        neg_num: negative samples drawn per (word, context) pair.
        dim: embedding dimensionality.
        alpha: learning rate.
        window_size: maximum context distance on each side (>= 1).
        The defaults equal the values that used to be hard-coded.

    Returns:
        Tuple ``(vocs, syn0)``: the vocabulary object and the trained input
        embeddings.

    NOTE(review): the ``np.dot(syn0[...], ...)`` and
    ``codecs.open(output_file, ...)`` calls were truncated in the reviewed
    text and have been reconstructed -- confirm against the original.
    """
    vocs = Vocs(input_file, min_freq)
    voc_size = len(vocs.voc_list)

    # Initial network weights and the negative-sampling table.
    syn0, syn1 = init_w(dim, voc_size)
    table = unigramtable(vocs)

    print('making data')
    for line in vocs.line_list:
        last = len(line) - 1
        for pos, word in enumerate(line):
            # Random window in 1..window_size for this centre word.  The
            # original drew from 1..window_size-1 and then iterated
            # range(1, window), losing one position twice.
            current_window = np.random.randint(1, window_size + 1)
            contexts = []
            for offset in range(1, current_window + 1):
                if pos - offset >= 0:
                    contexts.append(line[pos - offset])
                if pos + offset <= last:
                    contexts.append(line[pos + offset])

            for context in contexts:
                # One positive target plus the negative samples; a negative
                # that equals the positive target is dropped so the same
                # pair is not trained with contradictory labels.
                classifiers = [(context, 1)]
                classifiers.extend(
                    (neg, 0) for neg in table.neg_sample(neg_num)
                    if neg != context)
                neule = np.zeros(dim)  # accumulated update for syn0[word]
                for target, tag in classifiers:
                    z = np.dot(syn0[word], syn1[target])
                    p = sigm(z)
                    g = alpha * (tag - p)  # error scaled by learning rate
                    # Accumulate with the old syn1 row before updating it.
                    neule += g * syn1[target]
                    syn1[target] += g * syn0[word]
                syn0[word] += neule

    # Without this guard, find() == -1 would splice '_vec' in front of the
    # last character of the file name.
    index = input_file.find('.txt')
    if index == -1:
        index = len(input_file)
    output_file = input_file[:index] + '_vec' + input_file[index:]
    with codecs.open(output_file, 'w', 'utf-8') as op:
        op.write('%d %d\n' % (len(syn0), dim))
        for entry, vector in zip(vocs.voc_list, syn0):
            vector_str = ' '.join([str(v) for v in vector])
            op.write('%s %s\n' % (entry.word, vector_str))
    print('done')
    return vocs, syn0
def tsne(voclist, vec,
         font_path='/home/yingminc/Documents/hw_stuff/notofonts/NotoSansCJKjp-hinted/NotoSansCJKjp-DemiLight.otf'):
    """Scatter-plot a 2-D t-SNE projection of ``vec`` labelled with words.

    Draws onto the current matplotlib figure; this function does not call
    ``plt.show()`` or save the figure.  It also sets numpy's print options
    to ``suppress=True`` as a side effect.

    Args:
        voclist: labels, one per row of ``vec``.
        vec: 2-D array of embeddings passed to ``TSNE.fit_transform``.
        font_path: font file used for the labels; defaults to the
            previously hard-coded path.
    """
    fp = FontProperties(fname=font_path, size=12)
    model = TSNE(n_components=2, random_state=0)
    np.set_printoptions(suppress=True)
    points = model.fit_transform(vec)
    plt.scatter(points[:, 0], points[:, 1])
    # Distinct loop names so the projected array is not shadowed.
    for label, px, py in zip(voclist, points[:, 0], points[:, 1]):
        plt.annotate(label, xy=(px, py), xytext=(0, 0),
                     textcoords='offset points', fontproperties=fp)
def pair_similar(w1, w2, voclist, vec):
    """Return the cosine similarity between the vectors of two words.

    Args:
        w1, w2: words to compare; both must be in ``voclist``.
        voclist: list of words, aligned with the rows of ``vec``.
        vec: indexable collection of numeric vectors.

    Raises:
        ValueError: if either word is not in ``voclist``.
    """
    # Imported here because a bare ``import scipy`` does not guarantee that
    # the ``scipy.spatial`` submodule is loaded.
    from scipy.spatial.distance import cosine

    nw1 = voclist.index(w1)
    nw2 = voclist.index(w2)
    return 1 - cosine(vec[nw1], vec[nw2])
def top_similar(inp, voclist, vec, num=20):
    """Return the ``num`` words most cosine-similar to ``inp``.

    Args:
        inp: query word; must be in ``voclist``.
        voclist: list of words, aligned with the rows of ``vec``.
        vec: 2-D array-like of numeric vectors.
        num: maximum number of results.

    Returns:
        List of ``(word, similarity)`` tuples, most similar first.  The
        query word itself is included.  Rows with zero norm are given
        similarity 0.0.

    Raises:
        ValueError: if ``inp`` is not in ``voclist``.
    """
    matrix = np.asarray(vec, dtype=float)
    query = matrix[voclist.index(inp)]
    # Cosine against every row at once instead of one scipy call per row.
    dots = matrix.dot(query)
    norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)
    sims = np.zeros(len(matrix))
    usable = norms > 0
    sims[usable] = dots[usable] / norms[usable]
    ranked = sorted(zip(voclist, sims), key=lambda w: w[1], reverse=True)
    return ranked[:num]
def word_analogy(w1, w2, w3, voclist, vec):
    """Print and return words near ``w1`` that are closer to ``w3`` than to ``w2``.

    Candidates are all words ranked by cosine similarity to ``w1``, most
    similar first, with the last-ranked one dropped (the original asked for
    ``len(voclist) - 1`` results).  A candidate is kept when its similarity
    to ``w3`` is strictly greater than its similarity to ``w2``.  The first
    20 kept words are printed.

    Args:
        w1, w2, w3: words that must all be in ``voclist``.
        voclist: list of words, aligned with the rows of ``vec``.
        vec: 2-D array-like of numeric vectors.

    Returns:
        The full list of kept words (the original returned nothing).

    Raises:
        ValueError: if any of the three words is not in ``voclist``.

    NOTE(review): the reviewed text never defined or filled ``xlist``;
    collecting the candidate words is a reconstruction -- confirm against
    the original.
    """
    matrix = np.asarray(vec, dtype=float)
    norms = np.linalg.norm(matrix, axis=1)
    norms[norms == 0] = 1.0  # zero rows stay zero -> similarity 0.0
    unit = matrix / norms[:, np.newaxis]

    # One matrix-vector product per reference word, instead of a
    # voclist.index() scan and two cosine calls for every candidate.
    sim1 = unit.dot(unit[voclist.index(w1)])
    sim2 = unit.dot(unit[voclist.index(w2)])
    sim3 = unit.dot(unit[voclist.index(w3)])

    count = min(len(voclist), len(matrix))
    order = sorted(range(count), key=lambda i: sim1[i], reverse=True)
    candidates = order[:len(voclist) - 1]
    xlist = [voclist[i] for i in candidates if sim3[i] > sim2[i]]
    print(xlist[:20])
    return xlist
def make_voclist(vocs):
    """Return the words of ``vocs.voc_list`` as a plain list, in order."""
    return [entry.word for entry in vocs.voc_list]
def load_vec(file):
    """Load word vectors from a UTF-8 text file.

    Expected layout: a ``"<rows> <dim>"`` header line, then one
    ``"<word> <v1> ... <vn>"`` line per word, fields separated by single
    spaces.  Blank lines (such as the one after a trailing newline) are
    skipped.

    Args:
        file: path of the vector file.

    Returns:
        Tuple ``(voclist, vec)``: the words in file order and a float array
        of shape ``(rows, dim)``.

    Raises:
        ValueError: if the header is malformed, a component is not a
            number, or the number of values does not equal ``rows * dim``.

    NOTE(review): the read call was truncated in the reviewed text and has
    been reconstructed -- confirm against the original.
    """
    # codecs.open decodes while reading, so the words come back as text
    # without a separate decode step; ``with`` closes the handle.
    with codecs.open(file, 'r', 'utf-8') as source:
        lines = source.read().split('\n')

    voc_num, dim = map(int, lines[0].split())
    voclist = []
    values = []
    for line in lines[1:]:
        if not line.strip():
            continue
        fields = line.split(' ')
        voclist.append(fields[0])
        values.extend(field for field in fields[1:] if field)

    # Convert to float explicitly; the original kept the raw string tokens,
    # which yields a string-typed array.
    vec = np.array(values, dtype=float).reshape((voc_num, dim))
    return voclist, vec
