@KayneWest
Last active August 29, 2015 14:08
word2vec_model.py
from string import translate, maketrans, punctuation
from itertools import chain
from nltk.tokenize import PunktSentenceTokenizer
import datetime
import re
def log(msg):
    print("{} {}".format(datetime.datetime.now(), msg))
# helper to drop non-ASCII characters from a string
def removeNonAscii(s):
    return "".join(filter(lambda x: ord(x) < 128, s))
# keep -, +, and # inside words (so tokens like c++, c#, front-end survive)
punctuation = punctuation.replace('-', '').replace('+', '').replace('#', '')
# build a translation table mapping the remaining punctuation to spaces
Trans = maketrans(punctuation, ' ' * len(punctuation))
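# A quick sanity check of the table (hypothetical input):
#   translate("c++, c#, front-end!", Trans).split()
#   -> ['c++', 'c#', 'front-end']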
# Punkt tokenizer splits raw text into sentences
tknr = PunktSentenceTokenizer()
# fast n-grammer, handy if you later train on phrases
def ngrammer2(l, n):
    return [" ".join(l[i:i + n]) for i in xrange(len(l)) if len(l[i:i + n]) == n]
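# For example (hypothetical input):
#   ngrammer2(['python', 'web', 'developer'], 2)
#   -> ['python web', 'web developer']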
print 'Loading the post data'
import pickle
s = pickle.load(open("title_and_job.p", "rb"))
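# title_and_job.p is assumed to hold a list of single-key dicts,
# e.g. {u'Data Scientist': [u'post text 1', u'post text 2', ...]}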
x_train_RAW = []
for i in s:
    # only keep job titles with at least 30 associated posts
    if len(i.values()[0]) >= 30:
        title = i.keys()[0]
        for q in i.values()[0]:
            x_train_RAW.append(q.encode('utf-8'))
# the ngrammer could be plugged in here if you want phrase similarity
# strip digits and stray punctuation / html leftovers from this corpus
def spliter(jobpost):
    # sentence-tokenize, clean each sentence, lowercase, and map the
    # remaining punctuation to spaces via the Trans table
    sentences = tknr.tokenize(jobpost)
    # no hyphen in the character class, so hyphenated words survive,
    # matching the Trans table above; the posts are already utf-8 encoded
    cleaned_words = [translate(re.sub(r'[0-9~`@$%^&*()_=\[\]\\<>?/;.]', ' ', sentence).lower(), Trans).split()
                     for sentence in sentences]
    # two_three_ngrams = [ngrammer2(sent, num) for num in [1, 2, 3] for sent in cleaned_words]
    # flatten the per-sentence word lists into one token list per post
    return list(chain(*cleaned_words))
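# For example (hypothetical post text):
#   spliter("Senior C# Developer, full-time.")
#   -> ['senior', 'c#', 'developer', 'full-time']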
# shuffle the corpus (habit; sentence order shouldn't matter much to Word2Vec)
import random
random.shuffle(x_train_RAW)
# multiprocess the sentence tokenizer to speed things up
from multiprocessing import Pool, cpu_count
pool = Pool(cpu_count())
print 'starting to sentence tokenize'
# drop any posts that came back as empty token lists
x_train_RAW = filter(None, pool.map(spliter, x_train_RAW))
import gensim
model = gensim.models.Word2Vec(x_train_RAW, size=100, window=5, min_count=5, workers=cpu_count())
with open('model.p', 'wb') as f:
    pickle.dump(model, f)
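# Quick sanity check once training finishes (hypothetical query term):
#   log(model.most_similar('python', topn=5))
# gensim's native persistence also works and is usually preferable:
#   model.save('model.w2v')
#   model = gensim.models.Word2Vec.load('model.w2v')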