Skip to content

Instantly share code, notes, and snippets.

@mtdefelice
Last active March 21, 2019 23:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mtdefelice/b875d80c4e25472790ed9eb0539206cf to your computer and use it in GitHub Desktop.
Save mtdefelice/b875d80c4e25472790ed9eb0539206cf to your computer and use it in GitHub Desktop.
'''
The 'students.csv' and 'startups.csv' files represent the Typeform exports for each and are saved to this script's directory. All student resumes have been downloaded and stored in the 'student_resumes/' folder.
'startups_x.csv' is a parsed version of 'startups.csv' that has been extended with the job descriptions & content from the supplied *.pdf
'''
import glob
import textract
import re
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
# Store all student & startup profiles here (source for the tf-idf matrix).
# Each entry is an (index label, text) tuple; students are appended first,
# startups afterwards.
profiles = []

# --- Student profiles ---
df = pd.read_csv('students.csv')
# Index the rows by column 5 (stripped, lower-cased). The resume loop below
# looks rows up by the e-mail address found in each resume, so this is
# presumably the e-mail column -- confirm against the Typeform export.
df.index = df.iloc[:, 5].apply(lambda a: a.strip().lower())

# Majors/Minors; Brief biography?
# A missing answer is treated as empty text: NaN + str would otherwise turn
# the whole profile into NaN (and later into the literal string 'nan').
df['text'] = (
    df.iloc[:, 8].fillna('').astype('str')
    + ' '
    + df.iloc[:, 11].fillna('').astype('str')
)
# Flatten line breaks to spaces so that words on adjacent lines are not glued
# together (same treatment the startup text gets).
df['text'] = df.text.apply(lambda a: ' '.join(a.split('\n')))

# Append attached resume text to the DataFrame ... before processing, each was
# downloaded from Typeform and placed in the 'student_resumes/' folder.
# Upload Resume: One page saved as a PDF
bd = 'student_resumes/'
email_re = re.compile(r'[\w\.-]+@[\w\.-]+')
for url in df.iloc[:, 10].astype('str'):
    cv = '{}{}'.format(bd, os.path.basename(url))

    # Text extraction is best-effort: a missing or unreadable file is reported
    # and skipped rather than aborting the whole run.
    try:
        text = ' '.join(textract.process(cv).decode('utf8', 'ignore').split('\n'))
    except Exception as e:
        print('Error processing: {}. {}'.format(cv, e.args))
        continue

    # The resume is attached to the student whose e-mail address appears first
    # in the extracted text.
    match = email_re.search(text)
    if match is None:
        print('Error processing: {}. {}'.format(cv, 'no e-mail address found in resume'))
        continue
    email = match.group(0).lower()
    if email not in df.index:
        print('Error processing: {}. {}'.format(cv, 'no student with e-mail {}'.format(email)))
        continue

    # Replace non-ascii characters with spaces
    text = ''.join([c if ord(c) < 128 else ' ' for c in text])
    # Single .loc[row, column] access: the chained form df.loc[email]['text']
    # writes to a temporary copy of the row and leaves df unchanged.
    df.loc[email, 'text'] += ' ' + text

profiles.extend(df.text.items())

# Index that separates students from startup profiles
cutoff = len(profiles)
# --- Startup profiles ---
# First method (disabled) - from the Typeform data but without job descriptions.
# Kept as comments; the previous four-quote string delimiters around this
# section left an unterminated string literal behind.
# df = pd.read_csv ('startups.csv')
# df.index = df.iloc[:,1].apply (lambda a: a.strip ().lower ())
# # Tell us about your company; What can interns learn from you?; What is the top-priority area ...?
# df['text'] = df.iloc[:,8] + ' ' + df.iloc[:,10] + ' ' + df.iloc[:,11]
# df['text'] = df.text.astype ('str').apply (lambda a: ''.join (a.split ('\n')))

# Second method - parsed Typeform data with pasted job descriptions & content
# from the supplied *.pdf
df = pd.read_csv('startups_x.csv')
# Index the rows by column 0, normalised to lower case with underscores for
# spaces (e.g. 'Some Startup' -> 'some_startup').
df.index = df.iloc[:, 0].apply(lambda a: a.strip().lower().replace(' ', '_'))

# Tell us about your company; What can interns learn from you?; What is the
# top-priority area ...?; PDF content?
# A missing answer is treated as empty text: NaN + str would otherwise turn
# the whole profile into NaN (and later into the literal string 'nan').
df['text'] = (
    df.iloc[:, 4].fillna('').astype('str')
    + ' '
    + df.iloc[:, 5].fillna('').astype('str')
    + ' '
    + df.iloc[:, 6].fillna('').astype('str')
    + ' '
    + df.iloc[:, 7].fillna('').astype('str')
)
# Flatten line breaks to spaces.
df['text'] = df.text.apply(lambda a: ' '.join(a.split('\n')))
# Replace non-ascii characters with spaces
df['text'] = df.text.apply(lambda a: ''.join([c if ord(c) < 128 else ' ' for c in a]))

# Startups are appended after the students, i.e. from position `cutoff` on.
profiles.extend(df.text.items())
# --- Matching ---
# TODO: add a stemming or lemmatization mechanism
# Word-level tf-idf over 1- to 3-grams with English stop words removed.
# min_df = 1 keeps every term that occurs in at least one document, which is
# the same vocabulary min_df = 0 selected; newer scikit-learn versions reject
# an integer min_df below 1.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform([content for _, content in profiles])

# Match students with startups.
# The frame is square over all profile labels, but only the
# (startup row, student column) cells are filled; everything else stays NaN.
labels = [label for label, _ in profiles]
similarities_matrix = pd.DataFrame(index=labels, columns=labels, dtype=float)

# Log the startups being scored (position in `profiles`, label).
for ix, profile in enumerate(profiles[cutoff:], start=cutoff):
    print(ix, profile[0])

# One kernel call for the whole startup x student slice instead of one
# .iloc assignment per cell. Skipped when either side is empty.
if 0 < cutoff < len(profiles):
    similarities_matrix.iloc[cutoff:, :cutoff] = linear_kernel(
        tfidf_matrix[cutoff:], tfidf_matrix[:cutoff]
    )

# Export to *.csv
similarities_matrix.to_csv('matrix.csv')

# Sample: students ranked by similarity for one startup. Printed so that it is
# visible when run as a script, and guarded so a missing label does not raise.
sample = 'codeup'
if sample in similarities_matrix.index:
    print(similarities_matrix.loc[sample].sort_values(ascending=False).dropna())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment