Skip to content

Instantly share code, notes, and snippets.

@jonathanlxy
Created August 22, 2016 05:44
Show Gist options
  • Save jonathanlxy/840948fc5088396652bb09f35539144a to your computer and use it in GitHub Desktop.
Save jonathanlxy/840948fc5088396652bb09f35539144a to your computer and use it in GitHub Desktop.
W2V Gist for Blogpost
# coding: utf-8
import multiprocessing
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
import gensim
def bind_desc_skill( job ):
    """Merge one job's description and skills into a single text document.

    Parameters
    ----------
    job : sequence or pandas.Series
        Row whose first element is the job description and whose second
        element is the skills text, which may be missing.

    Returns
    -------
    The first element alone when the skills field is missing; otherwise
    every element of the row joined with single spaces.
    """
    # Iterating yields the values in order for lists, tuples and Series alike,
    # which avoids integer indexing on a label-indexed Series.
    fields = list( job )
    # `x is np.nan` only matches that one singleton object; pd.isnull also
    # recognises None and NaN values that are distinct float objects, which
    # would otherwise reach ' '.join and raise TypeError.
    if pd.isnull( fields[ 1 ] ):
        return fields[ 0 ]
    return ' '.join( fields )
def make_sentence( doc, stem = True ):
    """Turn a raw text document into a list of cleaned word tokens.

    The text is lower-cased, split into runs of word characters (which
    drops punctuation), and filtered against NLTK's English stopwords plus
    the extra tokens 'nbsp' and 'amp' (presumably HTML entity residue).

    Parameters
    ----------
    doc : str
        Raw document text.
    stem : bool
        When truthy, each remaining token is reduced with the Porter stemmer.

    Returns
    -------
    list of str
        The surviving tokens, stemmed or not according to `stem`.
    """
    # Punctuation removal: only runs of word characters survive
    tokenizer = RegexpTokenizer( r'\w+' )
    tokens = tokenizer.tokenize( doc.lower() )
    # Stopwords removal; a set gives O(1) membership tests instead of a scan
    # of the whole stopword list for every token
    custom_stopwords = set( stopwords.words( 'english' ) )
    custom_stopwords.update( [ 'nbsp', 'amp' ] )
    tokens = [ token for token in tokens if token not in custom_stopwords ]
    # Stemming
    if stem:
        stemmer = PorterStemmer()
        return [ stemmer.stem( token ) for token in tokens ]
    return tokens
# Load the job postings and merge each row's description and skills text
jobs = pd.read_csv( 'unique_jobs.csv', encoding = 'utf-8' )
jobs[ 'desc_skill' ] = jobs[ [ 'job_description', 'skills' ] ].apply( bind_desc_skill, axis = 1 )
# Build a real list rather than calling map(): on Python 3 map() returns a
# one-shot iterator, and Word2Vec needs to pass over the corpus more than
# once (vocabulary scan, then training).
descriptions = [ make_sentence( doc, stem = False ) for doc in jobs[ 'desc_skill' ].tolist() ]
### Train Word2Vec Model
# min_count = 2 is passed so that words seen only once are left out
w2v_model = gensim.models.Word2Vec( descriptions, min_count = 2, workers = multiprocessing.cpu_count() )
# Save it
w2v_model.save( 'my_model.word2vec' )
# Demo banner: a title line framed by two 47-character rules.
# Single-argument print( ... ) behaves the same as the print statement.
print( '#' * 47 )
print( '#' * 5 + 'Word2Vec Skill Similarity Search Demo' + '#' * 5 )
print( '#' * 47 + '\n' )
print( 'Loading...\n' )
import gensim
def custom_skill_set( skill_set ):
    """Interactively extend a list of skill keywords.

    Prompts for one keyword per line until a blank line is entered, then
    prints and returns the resulting list.

    Parameters
    ----------
    skill_set : list of str
        Keywords to start from. The list is copied, not modified.

    Returns
    -------
    list of str
        The starting keywords followed by the keywords typed by the user.
    """
    # raw_input only exists on Python 2; input is its Python 3 equivalent
    try:
        read_line = raw_input
    except NameError:
        read_line = input
    keywords = list( skill_set )
    while True:
        # The training script lower-cases every document before tokenising,
        # so mixed-case or padded input could never match a model word.
        new_key = read_line( 'Please add new keyword:\n' ).strip().lower()
        if not new_key:
            break
        keywords.append( new_key )
    print( 'skill input:' )
    print( keywords )
    return keywords
def skill_find( custom_skill_set ):
    """Print the 20 words most similar to the given skill keywords.

    The keywords are looked up in the module-level `w2v_model` and the
    results are printed as a two-column table of word and similarity.

    Parameters
    ----------
    custom_skill_set : list of str
        Keywords used as the positive examples of the similarity query.

    Notes
    -----
    The calling loop catches KeyError and reports it as a keyword that did
    not appear in the training data, so that exception is left to propagate.
    """
    if not custom_skill_set:
        # An empty query cannot be averaged into a vector; gensim raises
        # rather than returning results (exception type varies by version
        # -- TODO confirm), and the caller only handles KeyError.
        print( 'No keywords given, nothing to search for.' )
        return
    # Newer gensim releases expose most_similar on `model.wv`; older ones
    # only have it on the model itself, so fall back to the model.
    vectors = getattr( w2v_model, 'wv', w2v_model )
    print( '{:^20} | {:^20}'.format( 'Word', 'Similarity' ) )
    print( '-' * 45 )
    for word, score in vectors.most_similar( custom_skill_set, topn = 20 ):
        print( '{:<20} | {:<20}'.format( word, score ) )
# Load the model file that the training script saved
w2v_model = gensim.models.Word2Vec.load( 'my_model.word2vec' )
# Keywords offered as the starting list when the user answers 'y' below
skill_set = [
    'dataiku', 'python', 'matplotlib', 'sql',
    'pandas', 'numpy', 'shiny', 'r',
    'regression', 'lasso', 'ridge', 'pca',
    'mongodb', 'bash', 'aws',
    'forest', 'knn', 'tableau',
]
print( 'Pre-set skills:\n%s\n' % ( skill_set, ) )
# raw_input only exists on Python 2; input is its Python 3 equivalent
try:
    _read_line = raw_input
except NameError:
    _read_line = input

# Interactive loop: ask whether to start from the pre-set skills, collect
# extra keywords, then print the most similar words. Repeats until the user
# ends the session with Ctrl-D or Ctrl-C.
while True:
    try:
        choice = _read_line( 'Include pre-set skills? ( y | n )' )
        if choice in ( 'y', 'n' ):
            # 'y' starts from the pre-set skills, 'n' from an empty list
            custom_skill = custom_skill_set( skill_set if choice == 'y' else [] )
            skill_find( custom_skill )
        else:
            print( 'Invalid Input' )
        print( '#' * 50 + '\n' )
    except KeyError:
        print( 'This keyword did not appear in training data.' )
        print( 'Please try a different word.' )
    except ( EOFError, KeyboardInterrupt ):
        # End of input or an interrupt leaves the loop without a traceback
        print( '' )
        break
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment