jonathanlxy/Train Word2Vec Model Secret

## Train Word2Vec Model
# coding: utf-8
import multiprocessing
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
import gensim

def bind_desc_skill( job ):
    """Merge one posting's description and skills into a single text.

    Parameters
    ----------
    job : sequence whose first item is the description and whose second
        item is the skills text (a list, a tuple or a pandas row).

    Returns
    -------
    The first item alone when the skills entry is missing, otherwise all
    items joined with single spaces.
    """
    # Materialise the row so plain positional access works for a
    # label-indexed pandas Series as well as for lists and tuples.
    fields = list( job )
    # pd.isnull recognises None and every float NaN; the identity test
    # `is np.nan` only matched the np.nan singleton object itself.
    if pd.isnull( fields[1] ):
        return fields[0]
    return ' '.join( fields )

def make_sentence( doc, stem = True ):
    """Turn a raw document into a list of cleaned, lower-cased tokens.

    Parameters
    ----------
    doc : text to tokenize; it is lower-cased first.
    stem : when truthy, every remaining token is Porter-stemmed.

    Returns
    -------
    List of tokens (runs of word characters) with English stop words and
    the extra words 'nbsp' / 'amp' removed.
    """
    # Punctuation removal: keep only runs of word characters
    tokens = RegexpTokenizer( r'\w+' ).tokenize( doc.lower() )
    # Stopwords removal. A set gives constant-time membership tests; the
    # former list was scanned once per token.
    # 'nbsp' / 'amp' are presumably leftovers of HTML entities in the data.
    custom_stopwords = set( stopwords.words( 'english' ) )
    custom_stopwords.update( [ 'nbsp', 'amp' ] )
    tokens = [ token for token in tokens if token not in custom_stopwords ]
    # Stemming
    if stem:
        stemmer = PorterStemmer()
        tokens = [ stemmer.stem( token ) for token in tokens ]

    return tokens

# Load the job postings and merge description + skills into one text column.
jobs = pd.read_csv( 'unique_jobs.csv', encoding = 'utf-8' )
jobs[ 'desc_skill' ] = jobs[ [ 'job_description', 'skills' ] ].apply( bind_desc_skill, axis = 1 )

# Tokenize every posting (unstemmed) into a real list. On Python 3 map() is a
# one-shot iterator, but Word2Vec walks the corpus more than once (vocabulary
# scan, then training), so the corpus must be re-iterable.
descriptions = [ make_sentence( doc, stem = False ) for doc in jobs[ 'desc_skill' ].tolist() ]

### Train Word2Vec Model
# One worker per CPU core; min_count = 2 is gensim's frequency cut-off.
w2v_model = gensim.models.Word2Vec( descriptions, min_count = 2, workers = multiprocessing.cpu_count() )
# Save it
w2v_model.save( 'my_model.word2vec' )

## Word2Vec Demo
# Opening banner of the interactive demo, written with a single print call.
# The parenthesised one-argument form prints the same on Python 2 and 3.
print( '\n'.join( [
    '#' * 47,
    '#' * 5 + 'Word2Vec Skill Similarity Search Demo' + '#' * 5,
    '#' * 47 + '\n',
    'Loading...\n',
] ) )

import gensim

def custom_skill_set( skill_set ):
	"""Interactively extend a list of skill keywords.

	Prompts for keywords until an empty line is entered, then prints and
	returns the resulting list. The list passed in is not modified.

	skill_set -- iterable of keywords to start from (may be empty).
	"""
	try:
		read_line = raw_input    # Python 2
	except NameError:
		read_line = input        # Python 3: input() replaces raw_input()

	chosen = list( skill_set )
	while True:
		# The training corpus is lower-cased by make_sentence, so normalise
		# the keyword the same way; stray spaces or capitals never match.
		keyword = read_line( 'Please add new keyword:\n' ).strip().lower()
		if not keyword:
			break
		chosen.append( keyword )
	print( 'skill input:' )
	print( chosen )

	return chosen

def skill_find( custom_skill_set ):
	"""Print the 20 words most similar to the given keywords.

	Uses the module-level `w2v_model`. A keyword the model does not know
	makes the lookup raise KeyError, which is left to the caller.

	custom_skill_set -- list of keywords used as the positive examples.
	"""
	if not custom_skill_set:
		# most_similar() rejects an empty query with ValueError, which the
		# demo loop does not catch; report it instead of crashing.
		print( 'No keywords given; nothing to search for.' )
		return
	# Newer gensim releases expose similarity queries on `model.wv`;
	# older ones only on the model object itself.
	vectors = getattr( w2v_model, 'wv', w2v_model )
	print( '{:^20} | {:^20}'.format( 'Word', 'Similarity' ) )
	print( '-' * 45 )
	for word, score in vectors.most_similar( custom_skill_set, topn=20 ):
		print( '{:<20} | {:<20}'.format( word, score ) )

# Default query keywords; the demo loop below either extends this list or
# starts from an empty one.
skill_set = [
    'dataiku', 'python', 'matplotlib', 'sql',
    'pandas', 'numpy', 'shiny', 'r',
    'regression', 'lasso', 'ridge', 'pca',
    'mongodb', 'bash', 'aws',
    'forest', 'knn', 'tableau',
]

# Read the trained model back from disk.
w2v_model = gensim.models.Word2Vec.load( 'my_model.word2vec' )

print( 'Pre-set skills:\n%s\n' % ( skill_set, ) )

# Interactive demo loop: ask whether to start from the pre-set skills, collect
# extra keywords, then print the most similar words. End of input or Ctrl-C
# leaves the loop cleanly instead of ending in a traceback.
try:
	read_choice = raw_input    # Python 2
except NameError:
	read_choice = input        # Python 3: input() replaces raw_input()

while True:
	try:
		choice = read_choice( 'Include pre-set skills? ( y | n )' ).strip().lower()
		if choice in ( 'y', 'n' ):
			# 'y' seeds the query with the pre-set skills, 'n' starts empty
			custom_skill = custom_skill_set( skill_set if choice == 'y' else [] )
			skill_find( custom_skill )
		else:
			print( 'Invalid Input' )
		print( '#' * 50 + '\n' )
	except KeyError:
		# the similarity lookup was given a word the model has never seen
		print( 'This keyword did not appear in training data.' )
		print( 'Please try a different word.' )
	except ( EOFError, KeyboardInterrupt ):
		print( '' )
		break
	# coding: utf-8
	import multiprocessing
	import pandas as pd
	import numpy as np
	from nltk.corpus import stopwords
	from nltk.tokenize import RegexpTokenizer
	from nltk.stem.porter import PorterStemmer
	import gensim

	def bind_desc_skill( job ):
		"""Merge one posting's description and skills into a single text.

		job -- sequence whose first item is the description and whose
		second item is the skills text (list, tuple or pandas row).

		Returns the first item alone when the skills entry is missing,
		otherwise all items joined with single spaces.
		"""
		# Materialise the row so plain positional access works for a
		# label-indexed pandas Series as well as for lists and tuples.
		fields = list( job )
		# pd.isnull recognises None and every float NaN; the identity test
		# `is np.nan` only matched the np.nan singleton object itself.
		if pd.isnull( fields[1] ):
			return fields[0]
		return ' '.join( fields )

	def make_sentence( doc, stem = True ):
		"""Turn a raw document into a list of cleaned, lower-cased tokens.

		doc  -- text to tokenize; it is lower-cased first.
		stem -- when truthy, every remaining token is Porter-stemmed.

		Returns the tokens (runs of word characters) with English stop
		words and the extra words 'nbsp' / 'amp' removed.
		"""
		# Punctuation removal: keep only runs of word characters
		tokens = RegexpTokenizer( r'\w+' ).tokenize( doc.lower() )
		# Stopwords removal. A set gives constant-time membership tests;
		# the former list was scanned once per token.
		# 'nbsp' / 'amp' are presumably leftovers of HTML entities.
		custom_stopwords = set( stopwords.words( 'english' ) )
		custom_stopwords.update( [ 'nbsp', 'amp' ] )
		tokens = [ token for token in tokens if token not in custom_stopwords ]
		# Stemming
		if stem:
			stemmer = PorterStemmer()
			tokens = [ stemmer.stem( token ) for token in tokens ]

		return tokens

	# Load the job postings and merge description + skills into one text column.
	jobs = pd.read_csv( 'unique_jobs.csv', encoding = 'utf-8' )
	jobs[ 'desc_skill' ] = jobs[ [ 'job_description', 'skills' ] ].apply( bind_desc_skill, axis = 1 )

	# Tokenize every posting (unstemmed) into a real list. On Python 3 map() is
	# a one-shot iterator, but Word2Vec walks the corpus more than once
	# (vocabulary scan, then training), so the corpus must be re-iterable.
	descriptions = [ make_sentence( doc, stem = False ) for doc in jobs[ 'desc_skill' ].tolist() ]

	### Train Word2Vec Model
	# One worker per CPU core; min_count = 2 is gensim's frequency cut-off.
	w2v_model = gensim.models.Word2Vec( descriptions, min_count = 2, workers = multiprocessing.cpu_count() )
	# Save it
	w2v_model.save( 'my_model.word2vec' )
	# Opening banner of the interactive demo, written with a single print call.
	# The parenthesised one-argument form prints the same on Python 2 and 3.
	print( '\n'.join( [
		'#' * 47,
		'#' * 5 + 'Word2Vec Skill Similarity Search Demo' + '#' * 5,
		'#' * 47 + '\n',
		'Loading...\n',
	] ) )

	import gensim

	def custom_skill_set( skill_set ):
		"""Interactively extend a list of skill keywords.

		Prompts for keywords until an empty line is entered, then prints
		and returns the resulting list. The list passed in is not modified.

		skill_set -- iterable of keywords to start from (may be empty).
		"""
		try:
			read_line = raw_input    # Python 2
		except NameError:
			read_line = input        # Python 3: input() replaces raw_input()

		chosen = list( skill_set )
		while True:
			# The training corpus is lower-cased by make_sentence, so
			# normalise the keyword the same way; stray spaces or capitals
			# never match.
			keyword = read_line( 'Please add new keyword:\n' ).strip().lower()
			if not keyword:
				break
			chosen.append( keyword )
		print( 'skill input:' )
		print( chosen )

		return chosen

	def skill_find( custom_skill_set ):
		"""Print the 20 words most similar to the given keywords.

		Uses the `w2v_model` defined alongside this function. A keyword the
		model does not know makes the lookup raise KeyError, which is left
		to the caller.

		custom_skill_set -- list of keywords used as the positive examples.
		"""
		if not custom_skill_set:
			# most_similar() rejects an empty query with ValueError, which
			# the demo loop does not catch; report it instead of crashing.
			print( 'No keywords given; nothing to search for.' )
			return
		# Newer gensim releases expose similarity queries on `model.wv`;
		# older ones only on the model object itself.
		vectors = getattr( w2v_model, 'wv', w2v_model )
		# The column separator is a plain '|'; the backslash in front of it
		# was an escaping artifact that would have been printed literally.
		print( '{:^20} | {:^20}'.format( 'Word', 'Similarity' ) )
		print( '-' * 45 )
		for word, score in vectors.most_similar( custom_skill_set, topn=20 ):
			print( '{:<20} | {:<20}'.format( word, score ) )

	# Default query keywords; the demo loop below either extends this list or
	# starts from an empty one.
	skill_set = [
		'dataiku', 'python', 'matplotlib', 'sql',
		'pandas', 'numpy', 'shiny', 'r',
		'regression', 'lasso', 'ridge', 'pca',
		'mongodb', 'bash', 'aws',
		'forest', 'knn', 'tableau',
	]

	# Read the trained model back from disk.
	w2v_model = gensim.models.Word2Vec.load( 'my_model.word2vec' )

	print( 'Pre-set skills:\n%s\n' % ( skill_set, ) )

	# Interactive demo loop: ask whether to start from the pre-set skills,
	# collect extra keywords, then print the most similar words. End of input
	# or Ctrl-C leaves the loop cleanly instead of ending in a traceback.
	try:
		read_choice = raw_input    # Python 2
	except NameError:
		read_choice = input        # Python 3: input() replaces raw_input()

	while True:
		try:
			# The prompt uses a plain '|'; the backslash in front of it was
			# an escaping artifact that would have been shown to the user.
			choice = read_choice( 'Include pre-set skills? ( y | n )' ).strip().lower()
			if choice in ( 'y', 'n' ):
				# 'y' seeds the query with the pre-set skills, 'n' starts empty
				custom_skill = custom_skill_set( skill_set if choice == 'y' else [] )
				skill_find( custom_skill )
			else:
				print( 'Invalid Input' )
			print( '#' * 50 + '\n' )
		except KeyError:
			# the similarity lookup was given a word the model has never seen
			print( 'This keyword did not appear in training data.' )
			print( 'Please try a different word.' )
		except ( EOFError, KeyboardInterrupt ):
			print( '' )
			break