import os
import time
import datetime
import traceback
import re
import unicodedata

import numpy as np
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from selenium.webdriver.common.action_chains import ActionChains
username = 'your username'
password = 'your password'
job_id = 'your job id'

# send Chrome downloads to the local Downloads folder
downloads_path = os.path.join(os.path.expanduser('~'), 'Downloads')
options = webdriver.ChromeOptions()
prefs = {"download.default_directory": downloads_path}
options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(options=options)

# log in to LinkedIn
driver.get('https://www.linkedin.com/')
un = driver.find_element_by_xpath("//*[@autocomplete='username']")
un.send_keys(username)
pw = driver.find_element_by_xpath("//*[@autocomplete='current-password']")
pw.send_keys(password)
pw.submit()
# find the "Jobs" entry in the global nav and open it
jobs_xpath = '//a[@class="app-aware-link global-nav__primary-link"]'
possible_jobs = driver.find_elements_by_xpath(jobs_xpath)
for jobs in possible_jobs:
    if 'jobs' in jobs.text.lower():
        break
else:
    raise ValueError('No jobs button found!')
hover = ActionChains(driver).move_to_element(jobs)
hover.perform()
jobs.click()
# follow the "post a job" link to reach the jobs home page
jobs_home_xpath = "//*[contains(@href,'post-a-job')]"
jobs_home = driver.find_element_by_xpath(jobs_home_xpath)
hover = ActionChains(driver).move_to_element(jobs_home)
hover.perform()
attrs = getattr(getattr(BeautifulSoup(jobs_home.get_attribute('outerHTML'), 'html.parser'), 'a', None), 'attrs', {})
driver.get(attrs['href'])

# LinkedIn sometimes asks for the password again at this point
try:
    pw = driver.find_element_by_xpath("//*[@autocomplete='current-password']")
    pw.send_keys(password)
except Exception:
    print('no password needed!')
# open the posted-jobs page, then the posting we want to scrape
manage_jobs_xpath = "//*[contains(@href,'posted-jobs')]"
manage_jobs = WebDriverWait(driver, 5).until(EC.visibility_of_element_located(
    (By.XPATH, manage_jobs_xpath))
)
hover = ActionChains(driver).move_to_element(manage_jobs)
hover.perform()
manage_jobs.click()

job_posting_xpath = f"//*[contains(@href,'{job_id}')]"
job_posting = driver.find_element_by_xpath(job_posting_xpath)
job_posting.click()
page_xpath = '//li[@data-test-pagination-page-btn]'
applicants_xpath = "//a[@classnames='hiring-applicants-list-item__link link-without-hover-visited']"
WebDriverWait(driver, 5).until(EC.visibility_of_element_located(
    (By.XPATH, applicants_xpath))
)
def strip_accents(text):
    """Strip accents so names are safe to use in filenames."""
    try:
        text = unicode(text, 'utf-8')
    except (TypeError, NameError):  # unicode is a default on python 3
        pass
    text = unicodedata.normalize('NFD', text)
    text = text.encode('ascii', 'ignore')
    text = text.decode("utf-8")
    return str(text)


def text_to_id(text):
    """Turn free text into a camelCase identifier containing only safe characters."""
    text = strip_accents(text.lower())
    text = text.split()
    text = text[0].lower() + ''.join(t.title() for t in text[1:])
    text = re.sub('[^0-9a-zA-Z_-]', '', text)
    return text
def download_cv(url, downloads_path, firstname, lastname):
    """Download a CV and rename it to YYYYMMDD_FirstnameLastname.<ext>."""
    driver.get(url)

    def latest_download_file():
        os.chdir(downloads_path)
        files = sorted(os.listdir(os.getcwd()), key=os.path.getmtime)
        newest = files[-1]
        return os.path.join(downloads_path, newest)

    # poll until Chrome's partial .crdownload file has finished downloading
    fileends = "crdownload"
    start = time.time()
    while "crdownload" == fileends:
        if time.time() - start > 5:
            raise ValueError("File took > 5 seconds to download!")
        time.sleep(0.1)
        newest_file = latest_download_file()
        if "crdownload" in newest_file:
            fileends = "crdownload"
        else:
            fileends = "none"

    ext = newest_file.split('.')[-1]
    new_fname = f"{datetime.datetime.now().strftime('%Y%m%d')}_{firstname}{lastname.title()}"
    new_path = os.path.join(downloads_path, new_fname + '.' + ext)
    os.rename(newest_file, new_path)
    return new_path
def check_exists_by_xpath(xpath):
    try:
        driver.find_element_by_xpath(xpath)
    except NoSuchElementException:
        return False
    return True


class VirusScannerError(Exception):
    pass
def get_applicant(applicant, details):
    """Open a single applicant and fill `details` with name, contact, CV and profile link."""
    firstname, lastname = applicant.text.split()[:2]
    details['firstname'] = text_to_id(firstname).lower().title()
    details['lastname'] = text_to_id(lastname).lower().title()
    applicant.click()

    # noise
    time.sleep(min(abs(np.random.normal(0.1, 0.25)), 1))

    print('waiting for more button')
    more_xpath = '//*[@class="artdeco-dropdown__trigger artdeco-dropdown__trigger--placement-bottom ember-view artdeco-button artdeco-button--secondary artdeco-button--muted artdeco-button--3"]'
    more_elm = WebDriverWait(driver, 5).until(EC.visibility_of_element_located(
        (By.XPATH, more_xpath))
    )

    print('waiting for cv')
    check_cv_loaded = '//div[@class="p0 mt4 artdeco-card"]'
    WebDriverWait(driver, 5).until(EC.visibility_of_element_located(
        (By.XPATH, check_cv_loaded))
    )
    if check_exists_by_xpath('//*[@class="hiring-resume-viewer__virus-scan-section ph5 pv4"]'):
        print('cv virus scan!')
        raise VirusScannerError

    cv_xpath = "//*[contains(@aria-label,'resume')]"
    cv_elm = driver.find_element_by_xpath(cv_xpath)
    driver.execute_script("arguments[0].click();", more_elm)

    print('waiting for details')
    details_xpath = '//div[@class="artdeco-dropdown__content-inner"]/ul//span[@class="hiring-applicant-header-actions__more-content-dropdown-item-text"]'
    WebDriverWait(driver, 5).until(EC.visibility_of_element_located(
        (By.XPATH, details_xpath))
    )
    details_elm = driver.find_elements_by_xpath(details_xpath)
    for detail in details_elm:
        data = detail.text
        if "@" in data:
            details['email'] = data
        else:
            details['phone'] = data

    details['cv'] = download_cv(cv_elm.get_property('href'), downloads_path, details['firstname'], details['lastname'])
    profile_ext = getattr(getattr(BeautifulSoup(driver.find_element_by_link_text('See full profile').get_attribute('outerHTML'), 'html.parser'), 'a', None), 'attrs', {})['href']
    details['linkedin'] = 'https://www.linkedin.com' + profile_ext
def get_applicants(driver):
    """Scrape every applicant visible on the current page of results."""
    WebDriverWait(driver, 5).until(EC.visibility_of_element_located(
        (By.XPATH, applicants_xpath))
    )
    applicants = driver.find_elements_by_xpath(applicants_xpath)
    applicant_details = []
    for applicant in applicants:
        # check for new applicants that load late
        new_applicants = driver.find_elements_by_xpath(applicants_xpath)
        applicants_curr_id = set(a.id for a in applicants)
        new_applicants_elms = [a for a in new_applicants if a.id not in applicants_curr_id]
        if new_applicants_elms:
            print('new applicants were loaded late...')
            applicants.extend(new_applicants_elms)

        details = {}
        try:
            get_applicant(applicant, details)
        except VirusScannerError:
            driver.refresh()  # refresh page for virus scanning!
            return get_applicants(driver)
        except Exception:
            print(traceback.format_exc())
            continue
        applicant_details.append(details)
    return applicant_details
# walk through the pagination buttons; only the first three pages are scraped
pages = driver.find_elements_by_xpath(page_xpath)
n_pages = len(pages)
applicant_details = []
for page_i in range(n_pages):
    print('getting page', page_i + 1)
    pages = driver.find_elements_by_xpath(page_xpath)
    page = pages[page_i]  # starts at 1
    _prev_page_id = page.id
    page.click()

    # wait for refresh
    _new_page_id = str(page.id)
    start = time.time()
    if 'active' not in page.get_attribute('outerHTML'):
        while _prev_page_id == _new_page_id:
            if time.time() - start > 5:
                raise TimeoutError
            pages = driver.find_elements_by_xpath(page_xpath)
            _new_page_id = pages[page_i].id
            time.sleep(0.1)

    applicant_details.extend(get_applicants(driver))
    if page_i == 2:
        break
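
# Not part of the original gist: a minimal sketch of persisting the scraped records,
# assuming a CSV output named 'applicants.csv' in the downloads folder.
# Field names follow the keys that get_applicant() sets; missing keys are written blank.
import csv

csv_path = os.path.join(downloads_path, 'applicants.csv')
with open(csv_path, 'w', newline='') as fh:
    writer = csv.DictWriter(
        fh,
        fieldnames=['firstname', 'lastname', 'email', 'phone', 'cv', 'linkedin'],
        extrasaction='ignore',
    )
    writer.writeheader()
    for row in applicant_details:
        writer.writerow(row)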