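# Scrape applicant details for a LinkedIn job posting with Selenium:
# log in, open the posting identified by `job_id`, then walk every applicant
# on every results page, recording name, e-mail, phone, CV and profile URL.
# Fill in `username`, `password` and `job_id` below before running; a local
# Chrome / chromedriver install is assumed.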
import datetime
import os
import re
import time
import traceback
import unicodedata

import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
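# NOTE: this script uses the Selenium 3 `find_element(s)_by_xpath` helpers.
# On Selenium 4.3+ those methods are removed; the equivalent calls are
# `driver.find_element(By.XPATH, ...)` / `driver.find_elements(By.XPATH, ...)`.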
username = 'your username'
password = 'your password'
job_id = 'your job id'
downloads_path = os.path.join(os.path.expanduser('~'), 'Downloads')
options = webdriver.ChromeOptions()
prefs = {"download.default_directory": downloads_path}
options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(options=options)
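# log in to LinkedIn with the credentials above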
driver.get('https://www.linkedin.com/')
un = driver.find_element_by_xpath("//*[@autocomplete='username']")
un.send_keys(username)
pw = driver.find_element_by_xpath("//*[@autocomplete='current-password']")
pw.send_keys(password)
pw.submit()
jobs_xpath = '//a[@class="app-aware-link global-nav__primary-link"]'
possible_jobs = driver.find_elements_by_xpath(jobs_xpath)
for jobs in possible_jobs:
    if 'jobs' in jobs.text.lower():
        break
else:
    # for/else: only reached if no nav link matched
    raise ValueError('No jobs button found!')
hover = ActionChains(driver).move_to_element(jobs)
hover.perform()
jobs.click()
jobs_home_xpath = "//*[contains(@href,'post-a-job')]"
jobs_home = driver.find_element_by_xpath(jobs_home_xpath)
hover = ActionChains(driver).move_to_element(jobs_home)
hover.perform()
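# parse the hovered link with BeautifulSoup and pull out its href
# (the getattr chain falls back to an empty dict if no <a> tag is found)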
attrs = getattr(
    getattr(BeautifulSoup(jobs_home.get_attribute('outerHTML'), 'html.parser'), 'a', None),
    'attrs', {},
)
driver.get(attrs['href'])
try:
    # LinkedIn sometimes asks for the password again when opening the job tools
    pw = driver.find_element_by_xpath("//*[@autocomplete='current-password']")
    pw.send_keys(password)
except Exception:
    print('no password needed!')
manage_jobs_xpath = "//*[contains(@href,'posted-jobs')]"
manage_jobs = WebDriverWait(driver, 5).until(
    EC.visibility_of_element_located((By.XPATH, manage_jobs_xpath))
)
hover = ActionChains(driver).move_to_element(manage_jobs)
hover.perform()
manage_jobs.click()
job_posting_xpath = f"//*[contains(@href,'{job_id}')]"
job_posting = driver.find_element_by_xpath(job_posting_xpath)
job_posting.click()
page_xpath = '//li[@data-test-pagination-page-btn]'
applicants_xpath = "//a[@classnames='hiring-applicants-list-item__link link-without-hover-visited']"
WebDriverWait(driver, 5).until(
    EC.visibility_of_element_located((By.XPATH, applicants_xpath))
)
def strip_accents(text):
    try:
        text = unicode(text, 'utf-8')
    except (TypeError, NameError):  # `unicode` does not exist on Python 3
        pass
    text = unicodedata.normalize('NFD', text)
    text = text.encode('ascii', 'ignore')
    text = text.decode('utf-8')
    return str(text)
def text_to_id(text):
    text = strip_accents(text.lower())
    text = text.split()
    text = text[0].lower() + ''.join(t.title() for t in text[1:])
    text = re.sub('[^0-9a-zA-Z_-]', '', text)
    return text
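# For illustration (assumed inputs): strip_accents('José') -> 'Jose' and
# text_to_id('van der Berg') -> 'vanDerBerg'; accents and punctuation are dropped.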
def download_cv(url, downloads_path, firstname, lastname):
    driver.get(url)

    def latest_download_file():
        os.chdir(downloads_path)
        files = sorted(os.listdir(os.getcwd()), key=os.path.getmtime)
        newest = files[-1]
        return os.path.join(downloads_path, newest)

    # Chrome writes an in-progress download as a .crdownload file;
    # poll until the newest file in the downloads folder loses that suffix
    fileends = "crdownload"
    start = time.time()
    while fileends == "crdownload":
        if time.time() - start > 5:
            raise ValueError("File took > 5 seconds to download!")
        time.sleep(0.1)
        newest_file = latest_download_file()
        if "crdownload" in newest_file:
            fileends = "crdownload"
        else:
            fileends = "none"

    # rename the download to YYYYMMDD_FirstnameLastname.<original extension>
    ext = newest_file.split('.')[-1]
    new_fname = f"{datetime.datetime.now().strftime('%Y%m%d')}_{firstname}{lastname.title()}"
    new_path = os.path.join(downloads_path, new_fname + '.' + ext)
    os.rename(newest_file, new_path)
    return new_path
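# e.g. (hypothetical values) download_cv(cv_url, downloads_path, 'Jane', 'Doe')
# moves the newest download to something like ~/Downloads/20220921_JaneDoe.pdf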
def check_exists_by_xpath(xpath):
    try:
        driver.find_element_by_xpath(xpath)
    except NoSuchElementException:
        return False
    return True
class VirusScannerError(Exception):
    """Raised when LinkedIn is still virus-scanning an applicant's CV."""
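# Scrape a single applicant: the passed-in `details` dict is populated with
# firstname, lastname, email, phone, cv (path to the renamed download) and
# linkedin (full profile URL).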
def get_applicant(applicant, details):
    firstname, lastname = applicant.text.split()[:2]
    details['firstname'] = text_to_id(firstname).lower().title()
    details['lastname'] = text_to_id(lastname).lower().title()
    applicant.click()

    # add a little random noise so the clicks look less robotic
    time.sleep(min(abs(np.random.normal(0.1, 0.25)), 1))

    print('waiting for more button')
    more_xpath = '//*[@class="artdeco-dropdown__trigger artdeco-dropdown__trigger--placement-bottom ember-view artdeco-button artdeco-button--secondary artdeco-button--muted artdeco-button--3"]'
    more_elm = WebDriverWait(driver, 5).until(
        EC.visibility_of_element_located((By.XPATH, more_xpath))
    )

    print('waiting for cv')
    check_cv_loaded = '//div[@class="p0 mt4 artdeco-card"]'
    WebDriverWait(driver, 5).until(
        EC.visibility_of_element_located((By.XPATH, check_cv_loaded))
    )
    if check_exists_by_xpath('//*[@class="hiring-resume-viewer__virus-scan-section ph5 pv4"]'):
        print('cv virus scan!')
        raise VirusScannerError

    cv_xpath = "//*[contains(@aria-label,'resume')]"
    cv_elm = driver.find_element_by_xpath(cv_xpath)
    driver.execute_script("arguments[0].click();", more_elm)

    print('waiting for details')
    details_xpath = '//div[@class="artdeco-dropdown__content-inner"]/ul//span[@class="hiring-applicant-header-actions__more-content-dropdown-item-text"]'
    WebDriverWait(driver, 5).until(
        EC.visibility_of_element_located((By.XPATH, details_xpath))
    )
    details_elm = driver.find_elements_by_xpath(details_xpath)
    for detail in details_elm:
        data = detail.text
        if "@" in data:
            details['email'] = data
        else:
            details['phone'] = data

    details['cv'] = download_cv(
        cv_elm.get_property('href'), downloads_path,
        details['firstname'], details['lastname'],
    )
    profile_ext = getattr(
        getattr(
            BeautifulSoup(
                driver.find_element_by_link_text('See full profile').get_attribute('outerHTML'),
                'html.parser',
            ),
            'a', None,
        ),
        'attrs', {},
    )['href']
    details['linkedin'] = 'https://www.linkedin.com' + profile_ext
def get_applicants(driver):
    WebDriverWait(driver, 5).until(
        EC.visibility_of_element_located((By.XPATH, applicants_xpath))
    )
    applicants = driver.find_elements_by_xpath(applicants_xpath)
    applicant_details = []
    for applicant in applicants:
        # check for new applicants that load late and append them to the queue
        new_applicants = driver.find_elements_by_xpath(applicants_xpath)
        applicants_curr_id = set(a.id for a in applicants)
        new_applicants_elms = [a for a in new_applicants if a.id not in applicants_curr_id]
        if new_applicants_elms:
            print('new applicants were loaded late...')
            applicants.extend(new_applicants_elms)

        details = {}
        try:
            get_applicant(applicant, details)
        except VirusScannerError:
            # refresh to get past the virus-scanning screen, then start over
            driver.refresh()
            return get_applicants(driver)
        except Exception:
            print(traceback.format_exc())
            continue
        applicant_details.append(details)
    return applicant_details
pages = driver.find_elements_by_xpath(page_xpath)
n_pages = len(pages)
applicant_details = []
for page_i in range(n_pages):
    print('getting page', page_i + 1)
    pages = driver.find_elements_by_xpath(page_xpath)
    page = pages[page_i]  # pagination buttons are numbered from 1
    _prev_page_id = page.id
    page.click()

    # wait for the applicant list to refresh: re-find the pagination button until
    # its Selenium element id changes (the buttons are re-rendered on page change)
    _new_page_id = str(page.id)
    start = time.time()
    if 'active' not in page.get_attribute('outerHTML'):
        while _prev_page_id == _new_page_id:
            if time.time() - start > 5:
                raise TimeoutError
            pages = driver.find_elements_by_xpath(page_xpath)
            _new_page_id = pages[page_i].id
            time.sleep(0.1)

    applicant_details.extend(get_applicants(driver))
    if page_i == 2:  # stop after the first three pages
        break
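# One possible way to persist the scraped records (a minimal sketch; the output
# filename is arbitrary): `applicant_details` is a list of plain dicts, so the
# standard-library json module is sufficient.
import json

with open(os.path.join(downloads_path, 'applicant_details.json'), 'w') as fh:
    json.dump(applicant_details, fh, indent=2)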