KayneWest/word2vec_model.py

## word2vec_model.py
from string import translate,maketrans,punctuation
from itertools import chain
from nltk import PunktSentenceTokenizer
import datetime
import re

def log(msg):
    """Print ``msg`` to stdout, prefixed with the current local date and time."""
    stamp = str(datetime.datetime.now())
    print("{} {}".format(stamp, msg))
def removeNonAscii(s):
    """Return ``s`` with every character whose code point is 128 or higher dropped."""
    ascii_only = [ch for ch in s if ord(ch) < 128]
    return "".join(ascii_only)

# Drop '-', '+' and '#' from the punctuation set so that the translation
# table built below leaves those three characters untouched inside words.
# NOTE(review): the regex in spliter() also replaces '-' with a space, so
# hyphens do not actually survive tokenization -- confirm which is intended.
punctuation = punctuation.replace('-','').replace('+','').replace('#','')
# Translation table (string.maketrans) mapping every remaining punctuation
# character to a single space.
Trans = maketrans(punctuation, ' '*len(punctuation))
# Punkt sentence tokenizer; spliter() uses it to split a post into sentences.
tknr = PunktSentenceTokenizer()

#fast ngrammer if you end up using it for phrases
def ngrammer2(l,n):
    """Return every run of ``n`` consecutive items of ``l`` joined by spaces.

    Args:
        l: Sliceable sequence of string tokens.
        n: Number of tokens per n-gram.

    Returns:
        A list of space-joined n-grams in order of appearance. Start
        positions too close to the end to yield ``n`` tokens are skipped,
        so the result is empty when ``l`` holds fewer than ``n`` items.
    """
    # ``range`` instead of ``xrange`` keeps the helper usable on Python 3;
    # each window is sliced once and reused for both the length check and
    # the join.
    windows = (l[i:i + n] for i in range(len(l)))
    return [" ".join(window) for window in windows if len(window) == n]

print('Loading the post data')
import pickle
# SECURITY: unpickling can execute arbitrary code; only load a
# "title_and_job.p" that comes from a trusted source.
# The context manager closes the file as soon as loading is done.
with open("title_and_job.p","rb") as _posts_file:
    s=pickle.load(_posts_file)
# Each record is indexed as a one-entry dict (presumably {title: [posts]} --
# confirm against whatever produced the pickle). Only records holding at
# least 30 posts contribute to the corpus.
x_train_RAW=[]
for i in s:
    posts=list(i.values())[0]
    if len(posts)>=30:
        title=list(i.keys())[0]  # not read anywhere else in this script
        # Store the posts as UTF-8 encoded byte strings.
        x_train_RAW.extend(q.encode('utf-8') for q in posts)

#can use the ngrammer here if you want to look at phrase similarity
#I get rid of html characters from this corpus
def spliter(jobpost):
    """Tokenize one job post into a flat list of lowercase word tokens.

    The post is split into sentences with the module-level Punkt tokenizer
    ``tknr``. In every sentence, digits and the symbols matched by the
    regular expression below are replaced by spaces, the text is lowercased,
    the remaining punctuation is blanked out through ``Trans`` and the
    result is split on whitespace. The tokens of all sentences are
    concatenated in order.

    Args:
        jobpost: Text of a single job post.

    Returns:
        list: The word tokens of the whole post; empty when nothing survives
        the cleaning.
    """
    # NOTE(review): the pattern lists '\<' twice and also blanks '-', although
    # the punctuation setup says '-' should be kept in words -- confirm intent.
    tokens = []
    for sentence in tknr.tokenize(jobpost):
        cleaned = re.sub(r'[0-9]|\-|\\~|\`|\@|\$|\%|\^|\&|\*|\(|\)|\_|\=|\[|\]|\\|\<|\<|\>|\?|\/|\;|\\.',' ',sentence).lower()
        # Only text strings need encoding. The corpus is already UTF-8
        # encoded before it reaches this function, and on Python 2 calling
        # .encode() on a byte string first decodes it as ASCII, which raises
        # UnicodeDecodeError on any non-ASCII byte.
        if not isinstance(cleaned, bytes):
            cleaned = cleaned.encode('utf-8')
        tokens.extend(translate(cleaned, Trans).split())
    return tokens

# Shuffle the posts in place so their order does not follow the pickle's
# record order.
import random
random.shuffle(x_train_RAW)

# Tokenize in parallel, one worker process per CPU core.
# NOTE(review): the pool is created at import time without an
# ``if __name__ == '__main__'`` guard; that works with fork-based process
# start but not on platforms that spawn workers -- confirm target platform.
from multiprocessing import Pool,cpu_count
pool=Pool(cpu_count())
print('starting to sentence tokenize')
try:
    # One task per post; posts that produce no tokens are dropped.
    x_train_RAW=list(filter(None,pool.map(spliter,x_train_RAW)))
finally:
    # Shut the workers down so they do not stay alive (and hold memory)
    # for the rest of the script.
    pool.close()
    pool.join()

import gensim
from multiprocessing import cpu_count
# Train Word2Vec on the tokenized posts (each post is one token list), using
# one training worker per CPU core.
# NOTE(review): ``size`` is the pre-4.0 gensim keyword (renamed to
# ``vector_size`` in gensim 4) -- confirm the installed version.
model = gensim.models.Word2Vec(x_train_RAW, size=100, window=5, min_count=5, workers=cpu_count())
# The context manager guarantees the pickle is flushed and the file closed.
with open('model.p','wb') as _model_file:
    pickle.dump(model,_model_file)
	from string import translate,maketrans,punctuation
	from itertools import chain
	from nltk import PunktSentenceTokenizer
	import datetime
	import re

	def log(msg):
		"""Print ``msg`` to stdout, prefixed with the current local date and time."""
		stamp = str(datetime.datetime.now())
		print("{} {}".format(stamp, msg))
	def removeNonAscii(s):
		"""Return ``s`` with every character whose code point is 128 or higher dropped."""
		return "".join(ch for ch in s if ord(ch) < 128)

	# Drop '-', '+' and '#' from the punctuation set so that the translation
	# table built below leaves those three characters untouched inside words.
	punctuation = punctuation.replace('-','').replace('+','').replace('#','')
	# Translation table (string.maketrans) mapping every remaining punctuation
	# character to a single space.
	Trans = maketrans(punctuation, ' '*len(punctuation))
	# Punkt sentence tokenizer; spliter() uses it to split a post into sentences.
	tknr = PunktSentenceTokenizer()

	#fast ngrammer if you end up using it for phrases
	def ngrammer2(l,n):
		"""Return every run of ``n`` consecutive items of ``l`` joined by spaces.

		Args:
			l: Sliceable sequence of string tokens.
			n: Number of tokens per n-gram.

		Returns:
			A list of space-joined n-grams in order of appearance; empty
			when ``l`` holds fewer than ``n`` items.
		"""
		# ``range`` instead of ``xrange`` keeps the helper usable on Python 3;
		# each window is sliced once and reused for the length check and join.
		windows = (l[i:i + n] for i in range(len(l)))
		return [" ".join(window) for window in windows if len(window) == n]

	print('Loading the post data')
	import pickle
	# SECURITY: unpickling can execute arbitrary code; only load a
	# "title_and_job.p" that comes from a trusted source.
	# The context manager closes the file as soon as loading is done.
	with open("title_and_job.p","rb") as _posts_file:
		s=pickle.load(_posts_file)
	# Each record is indexed as a one-entry dict (presumably {title: [posts]} --
	# confirm against whatever produced the pickle). Only records holding at
	# least 30 posts contribute to the corpus.
	x_train_RAW=[]
	for i in s:
		posts=list(i.values())[0]
		if len(posts)>=30:
			title=list(i.keys())[0]  # not read anywhere else in this script
			# Store the posts as UTF-8 encoded byte strings.
			x_train_RAW.extend(q.encode('utf-8') for q in posts)

	#can use the ngrammer here if you want to look at phrase similarity
	#I get rid of html characters from this corpus
	def spliter(jobpost):
		"""Tokenize one job post into a flat list of lowercase word tokens.

		The post is split into sentences with the module-level Punkt tokenizer
		``tknr``. In every sentence, digits and the symbols matched by the
		regular expression below are replaced by spaces, the text is
		lowercased, the remaining punctuation is blanked out through ``Trans``
		and the result is split on whitespace. The tokens of all sentences are
		concatenated in order.

		Args:
			jobpost: Text of a single job post.

		Returns:
			list: The word tokens of the whole post; empty when nothing
			survives the cleaning.
		"""
		tokens = []
		for sentence in tknr.tokenize(jobpost):
			# The alternatives are separated by bare '|'; an escaped '\|' would
			# match a literal pipe and turn the pattern into one long literal.
			cleaned = re.sub(r'[0-9]|\-|\\~|\`|\@|\$|\%|\^|\&|\*|\(|\)|\_|\=|\[|\]|\\|\<|\<|\>|\?|\/|\;|\\.',' ',sentence).lower()
			# Only text strings need encoding. The corpus is already UTF-8
			# encoded before it reaches this function, and on Python 2 calling
			# .encode() on a byte string first decodes it as ASCII, which
			# raises UnicodeDecodeError on any non-ASCII byte.
			if not isinstance(cleaned, bytes):
				cleaned = cleaned.encode('utf-8')
			tokens.extend(translate(cleaned, Trans).split())
		return tokens

	# Shuffle the posts in place so their order does not follow the pickle's
	# record order.
	import random
	random.shuffle(x_train_RAW)

	# Tokenize in parallel, one worker process per CPU core.
	from multiprocessing import Pool,cpu_count
	pool=Pool(cpu_count())
	print('starting to sentence tokenize')
	try:
		# One task per post; posts that produce no tokens are dropped.
		x_train_RAW=list(filter(None,pool.map(spliter,x_train_RAW)))
	finally:
		# Shut the workers down so they do not stay alive (and hold memory)
		# for the rest of the script.
		pool.close()
		pool.join()

	import gensim
	from multiprocessing import cpu_count
	# Train Word2Vec on the tokenized posts (each post is one token list),
	# using one training worker per CPU core.
	# NOTE(review): ``size`` is the pre-4.0 gensim keyword (renamed to
	# ``vector_size`` in gensim 4) -- confirm the installed version.
	model = gensim.models.Word2Vec(x_train_RAW, size=100, window=5, min_count=5, workers=cpu_count())
	# The context manager guarantees the pickle is flushed and the file closed.
	with open('model.p','wb') as _model_file:
		pickle.dump(model,_model_file)