Last active
March 21, 2019 23:13
-
-
Save mtdefelice/b875d80c4e25472790ed9eb0539206cf to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
'students.csv' and 'startups.csv' are the Typeform exports for students and startups, respectively; both are saved in this script's directory. All student resumes have been downloaded and stored in the 'student_resumes/' folder.
'startups_x.csv' is a parsed version of 'startups.csv' that has been extended with the job descriptions and the content of the supplied *.pdf files.
'''
import glob | |
import textract | |
import re | |
import pandas as pd | |
import os | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.metrics.pairwise import linear_kernel | |
# Store all student & startup profiles here (source for the tf-idf matrix).
# Each entry is an (identifier, text) tuple; students come first, startups after.
profiles = []

# Student profiles
df = pd.read_csv('students.csv')
# Index the rows by the stripped, lower-cased value of column 5. The resume
# loop below looks rows up by e-mail address, so this is presumably the e-mail
# column -- confirm against the Typeform export.
df.index = df.iloc[:, 5].apply(lambda a: a.strip().lower())

# Majors/Minors; Brief biography?
# Blank out missing answers before concatenating: NaN + str is NaN, which
# would discard the other answer and later be stringified as the word 'nan'.
df['text'] = (
    df.iloc[:, 8].fillna('').astype(str)
    + ' '
    + df.iloc[:, 11].fillna('').astype(str)
)
# Replace line breaks with spaces (joining on '' would fuse the last word of
# one line with the first word of the next and corrupt the tf-idf tokens).
df['text'] = df['text'].apply(lambda a: ' '.join(a.split('\n')))

# Append attached resume text to the DataFrame ... before processing, each was
# downloaded from Typeform and placed in the 'student_resumes/' folder.
# Upload Resume: One page saved as a PDF
bd = 'student_resumes/'
email_pattern = re.compile(r'[\w\.-]+@[\w\.-]+')
for ix, url in df.iloc[:, 10].astype('str').items():
    cv = '{}{}'.format(bd, os.path.basename(url))
    try:
        text = ' '.join(textract.process(cv).decode('utf8', 'ignore').split('\n'))
        match = email_pattern.search(text)
        email = match.group(0).lower() if match else None
        # Prefer the address found in the resume when it identifies a known
        # student; otherwise fall back to the row the resume was uploaded with.
        key = email if email is not None and email in df.index else ix
        # Replace non-ascii characters with spaces
        text = ''.join([c if ord(c) < 128 else ' ' for c in text])
        # Single .loc[row, column] access: the chained form
        # df.loc[row]['text'] += ... writes to a temporary copy of the row,
        # so the resume text would silently never reach the DataFrame.
        df.loc[key, 'text'] += ' ' + text
    except Exception as e:
        # Best effort: a missing or unreadable resume must not abort the run.
        print('Error processing: {}. {}'.format(cv, e.args))

profiles.extend(df['text'].items())

# Index that separates students from startup profiles
cutoff = len(profiles)
# Startup profiles

# First method (disabled) - from the Typeform data but without job descriptions.
# Kept as comments for reference. The previous ''''-delimited string left a
# stray quote after the closing ''' and therefore did not parse.
# df = pd.read_csv ('startups.csv')
# df.index = df.iloc[:,1].apply (lambda a: a.strip ().lower ())
# # Tell us about your company; What can interns learn from you?; What is the top-priority area ...?
# df['text'] = df.iloc[:,8] + ' ' + df.iloc[:,10] + ' ' + df.iloc[:,11]
# df['text'] = df.text.astype ('str').apply (lambda a: ''.join (a.split ('\n')))

# Second method - parsed Typeform data with pasted job descriptions & content
# from the supplied *.pdf
df = pd.read_csv('startups_x.csv')
# Index the rows by the normalised value of column 0 (stripped, lower-cased,
# spaces replaced by underscores).
df.index = df.iloc[:, 0].apply(lambda a: a.strip().lower().replace(' ', '_'))

# Tell us about your company; What can interns learn from you?; What is the
# top-priority area ...?; PDF content?
# Blank out missing answers before concatenating: NaN + str is NaN, so one
# empty column would otherwise wipe out the startup's whole profile.
answers = [df.iloc[:, col].fillna('').astype(str) for col in (4, 5, 6, 7)]
df['text'] = answers[0] + ' ' + answers[1] + ' ' + answers[2] + ' ' + answers[3]
# Replace line breaks with spaces
df['text'] = df['text'].apply(lambda a: ' '.join(a.split('\n')))
# Replace non-ascii characters with spaces
df['text'] = df['text'].apply(
    lambda a: ''.join([c if ord(c) < 128 else ' ' for c in a])
)

profiles.extend(df['text'].items())
# Matching
# TODO: add a stemming or lemmatization mechanism
# min_df=1 keeps every term, exactly as min_df=0 was meant to; an integer
# min_df below 1 is rejected by recent scikit-learn releases.
tf = TfidfVectorizer(
    analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english'
)
tfidf_matrix = tf.fit_transform([content for _, content in profiles])

# Match students with startups.
# The frame is labelled with every profile on both axes, but only the
# startup rows x student columns are filled in; all other cells stay NaN.
labels = [label for label, _ in profiles]
similarities_matrix = pd.DataFrame(index=labels, columns=labels)

# Progress output: position and identifier of every startup profile.
for ix, profile in enumerate(profiles[cutoff:], start=cutoff):
    print(ix, profile[0])

# One kernel call scores all startups against all students at once.
# Skipped when either group is empty, because there is nothing to compare.
if 0 < cutoff < len(profiles):
    scores = linear_kernel(tfidf_matrix[cutoff:], tfidf_matrix[:cutoff])
    similarities_matrix.iloc[cutoff:, :cutoff] = scores

# Export to *.csv
similarities_matrix.to_csv('matrix.csv')

# Sample: students ranked by similarity for one startup. Printed so it is
# visible when run as a script, and guarded so a missing label does not raise.
sample = 'codeup'
if sample in similarities_matrix.index:
    print(similarities_matrix.loc[sample].dropna().sort_values(ascending=False))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment