mtdefelice/s+s-matching.py

## s+s-matching.py
'''
The 'students.csv' and 'startups.csv' files are the Typeform exports of the student and startup responses, respectively, and are saved in this script's directory. All student resumes have been downloaded and stored in the 'student_resumes/' folder.

'startups_x.csv' is a parsed version of 'startups.csv' that has been extended with the job descriptions & content from the supplied *.pdf

'''

import glob
import textract
import re
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Store all student & startup profiles here (source for tfidf matrix)
profiles = []

# Student profiles
df = pd.read_csv ('students.csv')
# Key every row by the trimmed, lower-cased value of column 5 (presumably the e-mail column; rows are looked up by e-mail address later on)
df.index = df.iloc[:,5].apply (lambda a: a.strip ().lower ())

# Majors/Minors; Brief biography?
# Blank answers are read as NaN and NaN + str is NaN, so fill them first; otherwise one missing field wipes out the other
df['text'] = df.iloc[:,8].fillna ('').astype ('str') + ' ' + df.iloc[:,11].fillna ('').astype ('str')
# Join lines with a space (not '') so the words on either side of a line break stay separate tokens
df['text'] = df.text.apply (lambda a: ' '.join (a.split ('\n')))

# Append attached resume text to the DataFrame ... before processing, each was downloaded from Typeform and placed in the 'student_resumes/' folder
# Upload Resume: One page saved as a PDF
bd = 'student_resumes/'
for ix, url in df.iloc[:,10].astype ('str').items ():
    cv = '{}{}'.format (bd, os.path.basename (url))
    try:
        text = ' '.join (textract.process (cv).decode ('utf8', 'ignore').split ('\n'))
    except Exception as e:
        # Best effort: a missing or unreadable resume must not stop the whole run
        print ('Error processing: {}. {}'.format (cv, e.args))
        continue

    # Prefer the row keyed by the first e-mail address found in the resume.
    # When none is found, or it is not a known key, fall back to the row that referenced this file.
    match = re.search (r'[\w\.-]+@[\w\.-]+', text)
    email = match.group (0).lower () if match else None
    key = email if email is not None and email in df.index else ix

    # Replace non-ascii characters with spaces
    text = ''.join ([c if ord (c) < 128 else ' ' for c in text])

    # One .loc[row, column] assignment: the chained df.loc[email]['text'] form wrote to a temporary copy of the row, so the resume text could be lost
    df.loc[key, 'text'] += ' ' + text


# Each student row contributes one (index label, text) profile
profiles.extend (df.text.items ())


# Index that separates students from startup profiles
cutoff = len (profiles)

# Startup profiles

# Disabled alternative, kept for reference. It is commented out line by line because the
# former four-quote delimiters left a stray quote behind and made the file unparsable.
#
# # First method - from the Typeform data but without job descriptions
#
# df = pd.read_csv ('startups.csv')
# df.index = df.iloc[:,1].apply (lambda a: a.strip ().lower ())
#
# # Tell us about your company; What can interns learn from you?; What is the top-priority area ...?
# df['text'] = df.iloc[:,8] + ' ' + df.iloc[:,10] + ' ' + df.iloc[:,11]
# df['text'] = df.text.astype ('str').apply (lambda a: ''.join (a.split ('\n')))


# Second method - parsed Typeform data with pasted job descriptions & content from the supplied *.pdf

df = pd.read_csv ('startups_x.csv')
# Key every row by the trimmed, lower-cased value of column 0, with spaces turned into underscores
df.index = df.iloc[:,0].apply (lambda a: a.strip ().lower ().replace (' ', '_'))

# Tell us about your company; What can interns learn from you?; What is the top-priority area ...?; PDF content?
# Blank cells are read as NaN and NaN + str is NaN, so fill each column first; otherwise one empty answer erases the whole profile
parts = [df.iloc[:,i].fillna ('').astype ('str') for i in (4, 5, 6, 7)]
df['text'] = parts[0] + ' ' + parts[1] + ' ' + parts[2] + ' ' + parts[3]
df['text'] = df.text.apply (lambda a: ' '.join (a.split ('\n')))

# Replace non-ascii characters with spaces
df['text'] = df.text.apply (lambda a: ''.join ([c if ord (c) < 128 else ' ' for c in a]))

# Each startup row contributes one (index label, text) profile, placed after the students
profiles.extend (df.text.items ())


# Matching
# TODO: add a stemming or lemmatization mechanism

# min_df = 1 keeps every term, exactly as the former min_df = 0 did (each vocabulary term occurs in at least one document).
# It is also accepted by scikit-learn releases that validate constructor parameters, where an integer min_df must be >= 1.
tf = TfidfVectorizer (analyzer = 'word', ngram_range = (1,3), min_df = 1, stop_words = 'english')
# One row per profile, in the same order as `profiles`
tfidf_matrix = tf.fit_transform ([content for _, content in profiles])

# Match students with startups
# Square frame labelled by every profile; only the startup-row x student-column block is filled, the rest stays NaN
labels = [label for label, _ in profiles]
similarities_matrix = pd.DataFrame (index = labels, columns = labels)

# Progress output: position and label of every startup profile
for ix in range (cutoff, len (profiles)):
    print (ix, profiles[ix][0])

# One kernel call for all startups against all students, instead of a per-cell Python loop
# that also computed (and discarded) startup-vs-startup scores.
# Skipped when either side is empty, since there is nothing to score.
if 0 < cutoff < len (profiles):
    similarities_matrix.iloc[cutoff:, :cutoff] = linear_kernel (tfidf_matrix[cutoff:], tfidf_matrix[:cutoff])


# Export to *.csv
similarities_matrix.to_csv ('matrix.csv')

# Sample: students ranked by similarity to one startup.
# A bare expression shows nothing when run as a script, so print it; skip quietly when the sample label is absent.
sample = 'codeup'
if sample in similarities_matrix.index:
    print (similarities_matrix.loc[sample].sort_values (ascending = False).dropna ())
	'''
	The 'students.csv' and 'startups.csv' files represent the Typeform exports for each and are saved to this script's directory. All student resumes have been downloaded and stored in the 'student_resumes/' folder.

	'startups_x.csv' is a parsed version of 'startups.csv' that has been extended with the job descriptions & content from the supplied *.pdf

	'''

	import glob
	import textract
	import re
	import pandas as pd
	import os
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import linear_kernel

	# NOTE(review): this region repeats the script above; its nested indentation had been flattened and is restored here.

	# Store all student & startup profiles here (source for tfidf matrix)
	profiles = []

	# Student profiles
	df = pd.read_csv ('students.csv')
	# Key every row by the trimmed, lower-cased value of column 5 (presumably the e-mail column)
	df.index = df.iloc[:,5].apply (lambda a: a.strip ().lower ())

	# Majors/Minors; Brief biography?
	# Blank answers are read as NaN and NaN + str is NaN, so fill them first
	df['text'] = df.iloc[:,8].fillna ('').astype ('str') + ' ' + df.iloc[:,11].fillna ('').astype ('str')
	# Join lines with a space so the words on either side of a line break stay separate tokens
	df['text'] = df.text.apply (lambda a: ' '.join (a.split ('\n')))

	# Append attached resume text to the DataFrame ... before processing, each was downloaded from Typeform and placed in the 'student_resumes/' folder
	# Upload Resume: One page saved as a PDF
	bd = 'student_resumes/'
	for ix, url in df.iloc[:,10].astype ('str').items ():
		cv = '{}{}'.format (bd, os.path.basename (url))
		try:
			text = ' '.join (textract.process (cv).decode ('utf8', 'ignore').split ('\n'))
		except Exception as e:
			# Best effort: a missing or unreadable resume must not stop the whole run
			print ('Error processing: {}. {}'.format (cv, e.args))
			continue

		# Prefer the row keyed by the first e-mail address found in the resume.
		# When none is found, or it is not a known key, fall back to the row that referenced this file.
		match = re.search (r'[\w\.-]+@[\w\.-]+', text)
		email = match.group (0).lower () if match else None
		key = email if email is not None and email in df.index else ix

		# Replace non-ascii characters with spaces
		text = ''.join ([c if ord (c) < 128 else ' ' for c in text])

		# One .loc[row, column] assignment: the chained df.loc[email]['text'] form wrote to a temporary copy of the row
		df.loc[key, 'text'] += ' ' + text


	# Each student row contributes one (index label, text) profile
	profiles.extend (df.text.items ())


	# Index that separates students from startup profiles
	cutoff = len (profiles)

	# Startup profiles

	# Disabled alternative, kept for reference. It is commented out line by line because the
	# former four-quote delimiters left a stray quote behind and could not be parsed.
	#
	# # First method - from the Typeform data but without job descriptions
	#
	# df = pd.read_csv ('startups.csv')
	# df.index = df.iloc[:,1].apply (lambda a: a.strip ().lower ())
	#
	# # Tell us about your company; What can interns learn from you?; What is the top-priority area ...?
	# df['text'] = df.iloc[:,8] + ' ' + df.iloc[:,10] + ' ' + df.iloc[:,11]
	# df['text'] = df.text.astype ('str').apply (lambda a: ''.join (a.split ('\n')))


	# Second method - parsed Typeform data with pasted job descriptions & content from the supplied *.pdf

	df = pd.read_csv ('startups_x.csv')
	# Key every row by the trimmed, lower-cased value of column 0, with spaces turned into underscores
	df.index = df.iloc[:,0].apply (lambda a: a.strip ().lower ().replace (' ', '_'))

	# Tell us about your company; What can interns learn from you?; What is the top-priority area ...?; PDF content?
	# Blank cells are read as NaN and NaN + str is NaN, so fill each column first
	parts = [df.iloc[:,i].fillna ('').astype ('str') for i in (4, 5, 6, 7)]
	df['text'] = parts[0] + ' ' + parts[1] + ' ' + parts[2] + ' ' + parts[3]
	df['text'] = df.text.apply (lambda a: ' '.join (a.split ('\n')))

	# Replace non-ascii characters with spaces
	df['text'] = df.text.apply (lambda a: ''.join ([c if ord (c) < 128 else ' ' for c in a]))

	# Each startup row contributes one (index label, text) profile, placed after the students
	profiles.extend (df.text.items ())


	# Matching
	# TODO: add a stemming or lemmatization mechanism

	# min_df = 1 keeps every term, exactly as the former min_df = 0 did.
	# It is also accepted by scikit-learn releases that validate constructor parameters, where an integer min_df must be >= 1.
	tf = TfidfVectorizer (analyzer = 'word', ngram_range = (1,3), min_df = 1, stop_words = 'english')
	tfidf_matrix = tf.fit_transform ([content for _, content in profiles])

	# Match students with startups
	# Square frame labelled by every profile; only the startup-row x student-column block is filled, the rest stays NaN
	labels = [label for label, _ in profiles]
	similarities_matrix = pd.DataFrame (index = labels, columns = labels)

	# Progress output: position and label of every startup profile
	for ix in range (cutoff, len (profiles)):
		print (ix, profiles[ix][0])

	# One kernel call for all startups against all students; skipped when either side is empty
	if 0 < cutoff < len (profiles):
		similarities_matrix.iloc[cutoff:, :cutoff] = linear_kernel (tfidf_matrix[cutoff:], tfidf_matrix[:cutoff])


	# Export to *.csv
	similarities_matrix.to_csv ('matrix.csv')

	# Sample: students ranked by similarity to one startup.
	# A bare expression shows nothing when run as a script, so print it; skip quietly when the sample label is absent.
	sample = 'codeup'
	if sample in similarities_matrix.index:
		print (similarities_matrix.loc[sample].sort_values (ascending = False).dropna ())