import yaml
from urllib.parse import urlencode, urljoin
from playwright.sync_api import sync_playwright
from scrapy import Selector
from dataclasses import dataclass
import pandas as pd
import logging
import re
from rich.logging import RichHandler
import click
import sys

# Configure logging; a Rich handler replaces the default one for readable, colorized output
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
rich_handler = RichHandler(rich_tracebacks=True)
logging.getLogger().handlers = [rich_handler]

@dataclass
class Job:
    url: str
    job_title: str
    job_id: str  # scraped from a DOM attribute, so it is a string
    company_name: str
    company_image: str
    job_location: str

def login_to_linkedin(page, email, password, headless):
    page.goto("https://www.linkedin.com/uas/login")
    page.wait_for_load_state('load')
    # The form labels are in Spanish because the page is opened with an es-ES locale
    page.get_by_label("Email o teléfono").click()
    page.get_by_label("Email o teléfono").fill(email)
    page.get_by_label("Contraseña").click()
    page.get_by_label("Contraseña").fill(password)
    page.locator("#organic-div form").get_by_role("button", name="Iniciar sesión").click()
    page.wait_for_load_state('load')
    if "checkpoint/challenge" in page.url:
        if headless:
            logger.error("Captcha page! Aborting due to headless mode...")
            sys.exit(1)
        logger.warning("Captcha page! Human intervention is needed!")
        # Polling loop: wait until the captcha has been solved manually
        while "checkpoint/challenge" in page.url:
            page.wait_for_timeout(2000)  # wait 2 seconds before polling again
        logger.info("Captcha solved. Continuing with the rest of the process.")
        page.wait_for_timeout(5000)

def scrape_jobs(page, params, last24h):
    main_url = "https://www.linkedin.com/jobs/"
    base_url = 'https://www.linkedin.com/jobs/search/'
    url = f'{base_url}?{urlencode(params)}'
    page_number = 1
    # List to store job data
    job_list = []
    page.goto(url)
    page.wait_for_load_state('load')
    if last24h:
        # Open the "date posted" filter and apply "last 24 hours" (Spanish UI labels)
        page.get_by_role("button", name="Filtro «Fecha de publicación». Al hacer clic en este botón, se muestran todas las opciones del filtro «Fecha de publicación».").click()
        page.locator("label").filter(has_text="Últimas 24 horas Filtrar por «Últimas 24 horas»").click()
        pattern = r"Aplicar el filtro actual para mostrar (\d+\+?) resultados"
        page.get_by_role("button", name=re.compile(pattern, re.IGNORECASE)).click()
    while True:
        # Scroll through the results list so the lazily loaded job cards render
        page.locator("div.jobs-search-results-list").click()
        for _ in range(15):
            page.mouse.wheel(0, 250)
        page.wait_for_timeout(3000)
        response = Selector(text=page.content())
        jobs = response.css("ul.scaffold-layout__list-container li.ember-view")
        for job in jobs:
            job_info = Job(
                url=urljoin(main_url, job.css("a::attr(href)").get()) if job.css("a::attr(href)").get() else None,
                job_title=job.css("a::attr(aria-label)").get(),
                job_id=job.css("::attr(data-occludable-job-id)").get(),
                # The logo alt text is assumed to start with two words (e.g. "Logotipo de"); keep the rest
                company_name=" ".join(job.css("img ::attr(alt)").get().split(" ")[2:]) if job.css("img ::attr(alt)").get() else None,
                company_image=job.css("img ::attr(src)").get(),
                job_location=" ".join(job.css(".job-card-container__metadata-item ::text").getall()) if job.css(".job-card-container__metadata-item ::text").get() else None,
            )
            job_list.append(job_info)
            logger.info(f"Scraped job: {job_info.job_title}")
        # Check if there is a "Next" button and click it
        try:
            page_number += 1
            page.get_by_role("button", name=f"Página {page_number}", exact=True).click(timeout=4000)
            page.wait_for_timeout(3000)  # wait for the next page to load
            logger.info(f"Moving to page {page_number}")
        except Exception:
            logger.warning("No more pages to scrape")
            break  # No more pages
    return job_list

@click.command()
@click.option('--config', type=click.Path(exists=True), default='config.yaml', help='Path to the YAML config file')
@click.option('--headless/--no-headless', default=True, help='Run the browser in headless mode or not')
@click.option('--last24h', is_flag=True, default=False, help='Make the browser go for last 24h jobs only')
def main(config, headless, last24h):
    # Load the YAML file with the credentials and the list of search parameters
    with open(config, 'r') as f:
        data = yaml.safe_load(f)
    email = data.get("email")
    password = data.get("password")
    params_list = data.get("params")
    # Start the browser
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=headless)
        page = browser.new_page(locale="es-ES")  # Spanish locale: the UI labels used above depend on it
        # Log in to LinkedIn once
        login_to_linkedin(page, email, password, headless)
        all_jobs = []
        for params in params_list:
            logger.info(f"Crawl starting... Params: {params}")
            jobs = scrape_jobs(page, params, last24h)
            all_jobs.extend(jobs)
        # Create a DataFrame from the combined job list
        df = pd.DataFrame([job.__dict__ for job in all_jobs])
        # Save DataFrame to a CSV file
        csv_file_path = 'jobs_data.csv'
        df.to_csv(csv_file_path, index=False)
        # Log the number of jobs scraped and saved
        logger.info(f"Scraped {len(all_jobs)} jobs and saved to {csv_file_path}")
        browser.close()

if __name__ == '__main__':
    main()
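Assuming the script is saved as linkedin_jobs.py (the gist's actual file name isn't shown, so that name is hypothetical), a typical invocation using the click options defined above might look like:

python linkedin_jobs.py --config config.yaml --no-headless --last24h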
Sample config.yaml:
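A minimal sketch of what that file could contain, given the keys the script reads (email, password, params); each entry under params is URL-encoded into a LinkedIn job-search query. The credentials and the keywords/location values below are illustrative placeholders, not values from the original gist:

email: you@example.com
password: your-linkedin-password
params:
  - keywords: python developer
    location: Madrid, España
  - keywords: data engineer
    location: España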