import os
import time
import datetime
import traceback
import re
import unicodedata

import numpy as np
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from selenium.webdriver.common.action_chains import ActionChains
username = 'your username'
password = 'your password'
job_id = 'your job id'

# send Chrome downloads to the local Downloads folder
downloads_path = os.path.join(os.path.expanduser('~'), 'Downloads')
options = webdriver.ChromeOptions()
prefs = {"download.default_directory": downloads_path}
options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(options=options)

# log in to LinkedIn
driver.get('https://www.linkedin.com/')
un = driver.find_element_by_xpath("//*[@autocomplete='username']")
un.send_keys(username)
pw = driver.find_element_by_xpath("//*[@autocomplete='current-password']")
pw.send_keys(password)
pw.submit()
# find the "Jobs" entry in the global nav and open it
jobs_xpath = '//a[@class="app-aware-link global-nav__primary-link"]'
possible_jobs = driver.find_elements_by_xpath(jobs_xpath)
for jobs in possible_jobs:
    if 'jobs' in jobs.text.lower():
        break
else:
    raise ValueError('No jobs button found!')
hover = ActionChains(driver).move_to_element(jobs)
hover.perform()
jobs.click()
# follow the "post a job" link to reach the jobs home page
jobs_home_xpath = "//*[contains(@href,'post-a-job')]"
jobs_home = driver.find_element_by_xpath(jobs_home_xpath)
hover = ActionChains(driver).move_to_element(jobs_home)
hover.perform()
attrs = getattr(getattr(BeautifulSoup(jobs_home.get_attribute('outerHTML'), 'html.parser'), 'a', None), 'attrs', {})
driver.get(attrs['href'])

# LinkedIn sometimes asks for the password again at this point
try:
    pw = driver.find_element_by_xpath("//*[@autocomplete='current-password']")
    pw.send_keys(password)
except Exception:
    print('no password needed!')
# open the posted-jobs page, then the posting we want to scrape
manage_jobs_xpath = "//*[contains(@href,'posted-jobs')]"
manage_jobs = WebDriverWait(driver, 5).until(EC.visibility_of_element_located(
    (By.XPATH, manage_jobs_xpath))
)
hover = ActionChains(driver).move_to_element(manage_jobs)
hover.perform()
manage_jobs.click()

job_posting_xpath = f"//*[contains(@href,'{job_id}')]"
job_posting = driver.find_element_by_xpath(job_posting_xpath)
job_posting.click()
page_xpath = '//li[@data-test-pagination-page-btn]'
applicants_xpath = "//a[@classnames='hiring-applicants-list-item__link link-without-hover-visited']"
WebDriverWait(driver, 5).until(EC.visibility_of_element_located(
    (By.XPATH, applicants_xpath))
)
def strip_accents(text):
    """Strip accents so names are safe to use in filenames."""
    try:
        text = unicode(text, 'utf-8')
    except (TypeError, NameError):  # unicode is a default on python 3
        pass
    text = unicodedata.normalize('NFD', text)
    text = text.encode('ascii', 'ignore')
    text = text.decode("utf-8")
    return str(text)


def text_to_id(text):
    """Turn free text into a camelCase identifier containing only safe characters."""
    text = strip_accents(text.lower())
    text = text.split()
    text = text[0].lower() + ''.join(t.title() for t in text[1:])
    text = re.sub('[^0-9a-zA-Z_-]', '', text)
    return text
def download_cv(url, downloads_path, firstname, lastname):
    """Download a CV and rename it to YYYYMMDD_FirstnameLastname.<ext>."""
    driver.get(url)

    def latest_download_file():
        os.chdir(downloads_path)
        files = sorted(os.listdir(os.getcwd()), key=os.path.getmtime)
        newest = files[-1]
        return os.path.join(downloads_path, newest)

    # poll until Chrome's partial .crdownload file has finished downloading
    fileends = "crdownload"
    start = time.time()
    while "crdownload" == fileends:
        if time.time() - start > 5:
            raise ValueError("File took > 5 seconds to download!")
        time.sleep(0.1)
        newest_file = latest_download_file()
        if "crdownload" in newest_file:
            fileends = "crdownload"
        else:
            fileends = "none"

    ext = newest_file.split('.')[-1]
    new_fname = f"{datetime.datetime.now().strftime('%Y%m%d')}_{firstname}{lastname.title()}"
    new_path = os.path.join(downloads_path, new_fname + '.' + ext)
    os.rename(newest_file, new_path)
    return new_path
def check_exists_by_xpath(xpath):
    try:
        driver.find_element_by_xpath(xpath)
    except NoSuchElementException:
        return False
    return True


class VirusScannerError(Exception):
    pass
def get_applicant(applicant, details):
    """Open a single applicant and fill `details` with name, contact, CV and profile link."""
    firstname, lastname = applicant.text.split()[:2]
    details['firstname'] = text_to_id(firstname).lower().title()
    details['lastname'] = text_to_id(lastname).lower().title()
    applicant.click()

    # noise
    time.sleep(min(abs(np.random.normal(0.1, 0.25)), 1))

    print('waiting for more button')
    more_xpath = '//*[@class="artdeco-dropdown__trigger artdeco-dropdown__trigger--placement-bottom ember-view artdeco-button artdeco-button--secondary artdeco-button--muted artdeco-button--3"]'
    more_elm = WebDriverWait(driver, 5).until(EC.visibility_of_element_located(
        (By.XPATH, more_xpath))
    )

    print('waiting for cv')
    check_cv_loaded = '//div[@class="p0 mt4 artdeco-card"]'
    WebDriverWait(driver, 5).until(EC.visibility_of_element_located(
        (By.XPATH, check_cv_loaded))
    )
    if check_exists_by_xpath('//*[@class="hiring-resume-viewer__virus-scan-section ph5 pv4"]'):
        print('cv virus scan!')
        raise VirusScannerError

    cv_xpath = "//*[contains(@aria-label,'resume')]"
    cv_elm = driver.find_element_by_xpath(cv_xpath)
    driver.execute_script("arguments[0].click();", more_elm)

    print('waiting for details')
    details_xpath = '//div[@class="artdeco-dropdown__content-inner"]/ul//span[@class="hiring-applicant-header-actions__more-content-dropdown-item-text"]'
    WebDriverWait(driver, 5).until(EC.visibility_of_element_located(
        (By.XPATH, details_xpath))
    )
    details_elm = driver.find_elements_by_xpath(details_xpath)
    for detail in details_elm:
        data = detail.text
        if "@" in data:
            details['email'] = data
        else:
            details['phone'] = data

    details['cv'] = download_cv(cv_elm.get_property('href'), downloads_path, details['firstname'], details['lastname'])
    profile_ext = getattr(getattr(BeautifulSoup(driver.find_element_by_link_text('See full profile').get_attribute('outerHTML'), 'html.parser'), 'a', None), 'attrs', {})['href']
    details['linkedin'] = 'https://www.linkedin.com' + profile_ext
def get_applicants(driver):
    """Scrape every applicant visible on the current page of results."""
    WebDriverWait(driver, 5).until(EC.visibility_of_element_located(
        (By.XPATH, applicants_xpath))
    )
    applicants = driver.find_elements_by_xpath(applicants_xpath)
    applicant_details = []
    for applicant in applicants:
        # check for new applicants that load late
        new_applicants = driver.find_elements_by_xpath(applicants_xpath)
        applicants_curr_id = set(a.id for a in applicants)
        new_applicants_elms = [a for a in new_applicants if a.id not in applicants_curr_id]
        if new_applicants_elms:
            print('new applicants were loaded late...')
            applicants.extend(new_applicants_elms)

        details = {}
        try:
            get_applicant(applicant, details)
        except VirusScannerError:
            driver.refresh()  # refresh page for virus scanning!
            return get_applicants(driver)
        except Exception:
            print(traceback.format_exc())
            continue
        applicant_details.append(details)
    return applicant_details
# walk through the pagination buttons; only the first three pages are scraped
pages = driver.find_elements_by_xpath(page_xpath)
n_pages = len(pages)
applicant_details = []
for page_i in range(n_pages):
    print('getting page', page_i + 1)
    pages = driver.find_elements_by_xpath(page_xpath)
    page = pages[page_i]  # starts at 1
    _prev_page_id = page.id
    page.click()

    # wait for refresh
    _new_page_id = str(page.id)
    start = time.time()
    if 'active' not in page.get_attribute('outerHTML'):
        while _prev_page_id == _new_page_id:
            if time.time() - start > 5:
                raise TimeoutError
            pages = driver.find_elements_by_xpath(page_xpath)
            _new_page_id = pages[page_i].id
            time.sleep(0.1)

    applicant_details.extend(get_applicants(driver))
    if page_i == 2:
        break
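
# Not part of the original gist: a minimal sketch of persisting the scraped records,
# assuming a CSV output named 'applicants.csv' in the downloads folder.
# Field names follow the keys that get_applicant() sets; missing keys are written blank.
import csv

csv_path = os.path.join(downloads_path, 'applicants.csv')
with open(csv_path, 'w', newline='') as fh:
    writer = csv.DictWriter(
        fh,
        fieldnames=['firstname', 'lastname', 'email', 'phone', 'cv', 'linkedin'],
        extrasaction='ignore',
    )
    writer.writeheader()
    for row in applicant_details:
        writer.writerow(row)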