# Gist by @serenamm, created December 22, 2017 16:11

import pandas as pd
import numpy as np
import pickle
import re
import csv
from time import sleep  # So we don't request too much from the server
from collections import Counter  # Keep track of counts
from selenium import webdriver
from selenium.webdriver.common import action_chains, keys
from selenium.common.exceptions import NoSuchElementException
from sklearn.feature_extraction.text import TfidfVectorizer

def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

def save_obj(obj, name):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

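# Quick round trip for the pickle helpers (editor's sketch; the name "tmp_dict"
# is just an illustration, not from the original gist):
#   save_obj({"a": 1}, "tmp_dict")   # writes tmp_dict.pkl
#   load_obj("tmp_dict")             # -> {"a": 1}
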
'''
Initialize the Chrome driver
'''
def initialize_browser():
    my_options = webdriver.ChromeOptions()
    driver_path = "C:/Data/chromedriver.exe"
    my_options.add_argument("--disable-extensions")
    my_options.add_argument("--profile-directory=Default")
    my_options.add_argument("--incognito")
    my_options.add_argument("--disable-plugins-discovery")
    my_options.add_argument("--start-maximized")
    my_options.add_argument("--disable-notifications")
    browser = webdriver.Chrome(executable_path=driver_path, chrome_options=my_options)
    return browser

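# Note: the gist targets the Selenium 3-era API. On Selenium 4+ the equivalent
# construction (untested editor's sketch) would be:
#   from selenium.webdriver.chrome.service import Service
#   browser = webdriver.Chrome(service=Service(driver_path), options=my_options)
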
def get_pause():
    return np.random.choice(range(4, 6))  # Random 4- or 5-second pause between pages

'''
Get jobs by scraping Glassdoor with the Chrome driver
Returns:
job_dict: dict with job ID as key, [job title, company, city, link] as value
desc_dict: dict with job ID as key, job description as value
'''
def search_jobs(job_name, city, job_dict, desc_dict, num_pages):
    browser = initialize_browser()
    browser.get("https://www.glassdoor.ca/index.htm")
    job = browser.find_element_by_id("KeywordSearch")  # Get job field for job input
    location = browser.find_element_by_id("LocationSearch")  # Get location field for location input
    sleep(3)  # to not overwhelm the server
    job.send_keys(job_name)  # look for the specific job name in the search bar
    sleep(2)
    browser.execute_script("arguments[0].value = ''", location)  # clear the pre-filled location
    location.send_keys(city)  # look for the specific city
    sleep(2)
    browser.find_element_by_xpath("//*[@id='HeroSearchButton']").click()  # Click search
    for i in range(num_pages):  # Get first num_pages pages
        try:
            # Extract useful classes
            job_postings = browser.find_elements_by_class_name('jl')
            sleep(get_pause())
            for element in job_postings:
                j_id = element.get_attribute("data-id")  # job_id
                link_element = element.find_element_by_css_selector('a')
                link = link_element.get_attribute('href')  # job_link
                element.find_element_by_class_name("jobLink").click()  # Click onto JD to expand
                sleep(2)  # The key was to wait for it to load, yay!
                desc = browser.find_element_by_css_selector("#JobDesc" + j_id + " > div").text  # Get description
                # job_title, company name, location
                try:
                    job_title = browser.find_element_by_css_selector("div.empInfo.tbl").text
                    company = browser.find_element_by_class_name("empDetailsLink").text
                    if j_id not in job_dict:
                        desc_dict[j_id] = desc  # To work directly with this dict
                        job_dict[j_id] = [job_title, company, city, link]
                except NoSuchElementException:
                    pass
        except Exception as e:
            print(e)
        try:
            browser.find_element_by_class_name('next').click()  # Go to the next results page
        except NoSuchElementException:
            pass
    browser.close()
    return job_dict, desc_dict

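# Example invocation (editor's sketch; the search terms and page count are
# illustrative, the save paths are the ones get_best_csv loads from below):
#   job_dict, desc_dict = search_jobs("data scientist", "Toronto", {}, {}, 5)
#   save_obj(job_dict, "C:/Data/Projects/Glassdoor/job_dict")
#   save_obj(desc_dict, "C:/Data/Projects/Glassdoor/desc_dict")
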
'''
Remove "Easy", "Apply", "Save", etc. from a job title
'''
def clean_job_text(job_title, words):
    for word in words:
        job_title = job_title.replace(word, "")
    return job_title

'''
Clean up text
Input: Description from desc_dict
Output: Cleaned, lower-cased string with stop words removed
'''
from nltk.corpus import stopwords
cachedStopWords = set(stopwords.words("english"))  # Cache so the list isn't rebuilt per call

def text_cleaner(text_temp):
    text = text_temp.lower()  # Go to lower case
    # Drop anything that isn't a letter, '.', '+' or '3'
    # (keep '+' for C++, and '.' and '3' for d3.js)
    text = re.sub("[^a-zA-Z.+3]", " ", text)
    text = text.strip("\n")
    text = ' '.join([word for word in text.split() if word not in cachedStopWords])
    return text

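# What text_cleaner does to a sample string (editor's sketch; the sample is
# made up, not from the original gist):
#   text_cleaner("We use Python, C++ and d3.js for the analysis")
#   -> 'use python c++ d3.js analysis'
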
'''
Input a CV and a job description (both cleaned strings)
For each key, value in desc_dict.items():
sim[key] = get_sim(cv, value)
Returns the TF-IDF cosine similarity
'''
def get_sim(cv, job_desc):
    sim_vec = TfidfVectorizer(min_df=1)
    tfidf = sim_vec.fit_transform([cv, job_desc])  # TF-IDF vectorization
    # TfidfVectorizer L2-normalizes each row, so the dot product is the cosine similarity
    sim_array = (tfidf * tfidf.T).A
    sim = sim_array[0][1]
    return sim

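# Sanity check for get_sim (editor's sketch with made-up strings): identical
# texts score 1.0, texts with no shared terms score 0.0.
#   get_sim("python pandas numpy", "python pandas numpy")   # -> 1.0
#   get_sim("python pandas numpy", "marketing sales crm")   # -> 0.0
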
'''
Input a CV and the dict of all job descriptions
Return a sorted list of tuples (job_id, similarity), best match first
'''
def best_match(cv, d_dict):
    cv_cleaned = text_cleaner(cv)
    sim = {}
    new_desc_dict = clean_dict(d_dict)
    for key, value in new_desc_dict.items():
        sim[key] = get_sim(cv_cleaned, value)
    best_match_list = sorted(sim.items(), key=lambda x: x[1], reverse=True)
    return best_match_list

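# Example (editor's sketch; my_cv is a hypothetical CV string and the IDs and
# scores shown are made up):
#   best_match(my_cv, desc_dict)
#   -> [('2661613690', 0.31), ('2650042557', 0.12), ...]
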
def clean_dict(d_dict):
    new_dict = {}
    for key, value in d_dict.items():
        new_dict[key] = text_cleaner(value)
    return new_dict

'''
Input the sorted list of tuples (job_id, similarity), and
the job_dict containing the job info
Output a DataFrame with the top num_jobs matches
'''
def get_best(job_dict, match_list, num_jobs):
    best_matches = {}
    for i in range(num_jobs):
        j_id = match_list[i][0]
        best_matches[j_id] = job_dict[j_id]
    best_match_df = pd.DataFrame.from_dict(best_matches, orient="index")
    best_match_df.columns = ["Title", "Company", "Location", "Link"]
    return best_match_df

def get_best_csv(cv):
    try:
        job_dict = load_obj("C:/Data/Projects/Glassdoor/job_dict")
        desc_dict = load_obj("C:/Data/Projects/Glassdoor/desc_dict")
    except FileNotFoundError:
        print("No dictionaries have been found!")
        return  # Nothing to rank without the scraped data
    match_list = best_match(cv, desc_dict)
    best_match_df = get_best(job_dict, match_list, len(job_dict))
    # Strip the "Easy Apply" / "Save" button text scraped along with the title
    best_match_df['Title'] = best_match_df['Title'].apply(
        lambda x: clean_job_text(x, ["Apply", "Save", "Now", "Easy"]))
    path = "C:/Data/Projects/Glassdoor"
    return best_match_df.to_csv(path + "/best_match_df.csv")

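# A possible entry point tying the pieces together (editor's sketch; the CV
# file path is an assumption, not part of the original gist):
if __name__ == "__main__":
    with open("C:/Data/Projects/Glassdoor/cv.txt", "r") as f:
        my_cv = f.read()
    get_best_csv(my_cv)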