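"""Scrape LinkedIn job search results with Playwright and export them to CSV.

Logs in with the credentials from a YAML config file, runs one search per
entry in its "params" list, pages through the results, and writes every
scraped job to jobs_data.csv.
"""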
import yaml
from urllib.parse import urlencode, urljoin
from playwright.sync_api import sync_playwright
from scrapy import Selector
from dataclasses import dataclass
import pandas as pd
import logging
import re
from rich.logging import RichHandler
import click
import sys
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
rich_handler = RichHandler(rich_tracebacks=True)
logging.getLogger().handlers = [rich_handler]
PAGE_NUMBER = 1


@dataclass
class Job:
    url: str
    job_title: str
    job_id: int
    company_name: str
    company_image: str
    job_location: str


def login_to_linkedin(page, email, password, headless):
    page.goto("https://www.linkedin.com/uas/login")
    page.wait_for_load_state('load')
    page.get_by_label("Email o teléfono").click()
    page.get_by_label("Email o teléfono").fill(email)
    page.get_by_label("Contraseña").click()
    page.get_by_label("Contraseña").fill(password)
    page.locator("#organic-div form").get_by_role("button", name="Iniciar sesión").click()
    page.wait_for_load_state('load')
    if "checkpoint/challenge" in page.url:
        if headless:
            logger.error("Captcha page! Aborting due to headless mode...")
            sys.exit(1)
        logger.warning("Captcha page! Human intervention is needed!")
        # Poll until the captcha is solved manually
        while "checkpoint/challenge" in page.url:
            page.wait_for_timeout(2000)  # wait 2 seconds before polling again
        logger.info("Captcha solved. Continuing with the rest of the process.")
        page.wait_for_timeout(5000)


def scrape_jobs(page, params, last24h):
    global PAGE_NUMBER
    main_url = "https://www.linkedin.com/jobs/"
    base_url = 'https://www.linkedin.com/jobs/search/'
    url = f'{base_url}?{urlencode(params)}'
    # List to store job data
    job_list = []
    page.goto(url)
    page.wait_for_load_state('load')
    if last24h:
        page.get_by_role("button", name="Filtro «Fecha de publicación». Al hacer clic en este botón, se muestran todas las opciones del filtro «Fecha de publicación».").click()
        page.locator("label").filter(has_text="Últimas 24 horas Filtrar por «Últimas 24 horas»").click()
        pattern = r"Aplicar el filtro actual para mostrar (\d+\+?) resultados"
        page.get_by_role("button", name=re.compile(pattern, re.IGNORECASE)).click()
    while True:
        # Focus the results list, then scroll so the lazy-loaded job cards render
        page.locator("div.jobs-search-results-list").click()
        for _ in range(15):
            page.mouse.wheel(0, 250)
        page.wait_for_timeout(3000)
        response = Selector(text=page.content())
        jobs = response.css("ul.scaffold-layout__list-container li.ember-view")
        for job in jobs:
            job_info = Job(
                url=urljoin(main_url, job.css("a::attr(href)").get()) if job.css("a::attr(href)").get() else None,
                job_title=job.css("a::attr(aria-label)").get(),
                job_id=job.css("::attr(data-occludable-job-id)").get(),
                # Drop the first two words of the logo alt text, keeping just the company name
                company_name=" ".join(job.css("img ::attr(alt)").get().split(" ")[2:]) if job.css("img ::attr(alt)").get() else None,
                company_image=job.css("img ::attr(src)").get(),
                job_location=" ".join(job.css(".job-card-container__metadata-item ::text").getall()) if job.css(".job-card-container__metadata-item ::text").get() else None
            )
            job_list.append(job_info)
            logger.info(f"Scraped job: {job_info.job_title}")
        # Check if there is a "Next" button and click it
        try:
            PAGE_NUMBER += 1
            page.get_by_role("button", name=f"Página {PAGE_NUMBER}", exact=True).click(timeout=4000)
            page.wait_for_timeout(3000)  # wait for the next page to load
            logger.info(f"Moving to page {PAGE_NUMBER}")
        except Exception:
            logger.warning("No more pages to scrape")
            break  # No more pages
    PAGE_NUMBER = 1
    return job_list


@click.command()
@click.option('--config', type=click.Path(exists=True), default='config.yaml', help='Path to the YAML config file')
@click.option('--headless/--no-headless', default=True, help='Run the browser in headless mode or not')
@click.option('--last24h', is_flag=True, default=False, help='Only scrape jobs posted in the last 24 hours')
def main(config, headless, last24h):
    # Load the YAML file with the credentials and the list of search parameters
    with open(config, 'r') as f:
        data = yaml.safe_load(f)
    email = data.get("email")
    password = data.get("password")
    params_list = data.get("params")
    # Start the browser
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=headless)
        page = browser.new_page(locale="es-ES")  # Spanish locale, to match the UI strings used in the locators
        # Log in to LinkedIn once
        login_to_linkedin(page, email, password, headless)
        all_jobs = []
        for params in params_list:
            logger.info(f"Crawl starting... Params: {params}")
            jobs = scrape_jobs(page, params, last24h)
            all_jobs.extend(jobs)
        # Create a DataFrame from the combined job list
        df = pd.DataFrame([job.__dict__ for job in all_jobs])
        # Save the DataFrame to a CSV file
        csv_file_path = 'jobs_data.csv'
        df.to_csv(csv_file_path, index=False)
        # Log the number of jobs scraped and saved
        logger.info(f"Scraped {len(all_jobs)} jobs and saved to {csv_file_path}")
        browser.close()


if __name__ == '__main__':
    main()
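Assuming the script is saved as linkedin_jobs.py (a hypothetical filename), a typical invocation using the click options defined above would be:

    python linkedin_jobs.py --config config.yaml --no-headless --last24h

Passing --no-headless keeps the browser window visible, which is required for solving the captcha checkpoint by hand if LinkedIn raises one.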
VictorLG98 (author) commented on Jul 29, 2023:
Sample config.yaml:

email: linkedin_user
password: linkedin_pass
params:
  - currentJobId: 3648419920
    geoId: 105646813
    keywords: Desarrollador de Python
    location: España
    refresh: true
  - currentJobId: 3662600850
    geoId: 105646813
    keywords: Automatizacion QA
    location: España
    refresh: false
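
Each entry under params is passed straight to urlencode by scrape_jobs, so the YAML keys become query-string parameters of the LinkedIn search URL. A minimal sketch of that construction, using the first entry above:

    from urllib.parse import urlencode

    # First "params" entry from the sample config.yaml
    params = {
        "currentJobId": 3648419920,
        "geoId": 105646813,
        "keywords": "Desarrollador de Python",
        "location": "España",
        "refresh": True,
    }
    base_url = "https://www.linkedin.com/jobs/search/"
    print(f"{base_url}?{urlencode(params)}")
    # https://www.linkedin.com/jobs/search/?currentJobId=3648419920&geoId=105646813&keywords=Desarrollador+de+Python&location=Espa%C3%B1a&refresh=True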
