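"""Scrape LinkedIn job search results with Playwright and export them to CSV.

Logs in with the credentials from a YAML config file, runs one search per
entry in its "params" list, pages through the results, and writes every
scraped job to jobs_data.csv.
"""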
import yaml
from urllib.parse import urlencode, urljoin
from playwright.sync_api import sync_playwright
from scrapy import Selector
from dataclasses import dataclass
import pandas as pd
import logging
import re
from rich.logging import RichHandler
import click
import sys
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
rich_handler = RichHandler(rich_tracebacks=True)
logging.getLogger().handlers = [rich_handler]
PAGE_NUMBER = 1


@dataclass
class Job:
    url: str
    job_title: str
    job_id: int
    company_name: str
    company_image: str
    job_location: str


def login_to_linkedin(page, email, password, headless):
    page.goto("https://www.linkedin.com/uas/login")
    page.wait_for_load_state('load')
    page.get_by_label("Email o teléfono").click()
    page.get_by_label("Email o teléfono").fill(email)
    page.get_by_label("Contraseña").click()
    page.get_by_label("Contraseña").fill(password)
    page.locator("#organic-div form").get_by_role("button", name="Iniciar sesión").click()
    page.wait_for_load_state('load')
    if "checkpoint/challenge" in page.url:
        if headless:
            logger.error("Captcha page! Aborting due to headless mode...")
            sys.exit(1)
        logger.warning("Captcha page! Human intervention is needed!")
        # Poll until the captcha is solved manually
        while "checkpoint/challenge" in page.url:
            page.wait_for_timeout(2000)  # wait 2 seconds before polling again
        logger.info("Captcha solved. Continuing with the rest of the process.")
        page.wait_for_timeout(5000)


def scrape_jobs(page, params, last24h):
    global PAGE_NUMBER
    main_url = "https://www.linkedin.com/jobs/"
    base_url = 'https://www.linkedin.com/jobs/search/'
    url = f'{base_url}?{urlencode(params)}'
    # List to store job data
    job_list = []
    page.goto(url)
    page.wait_for_load_state('load')
    if last24h:
        page.get_by_role("button", name="Filtro «Fecha de publicación». Al hacer clic en este botón, se muestran todas las opciones del filtro «Fecha de publicación».").click()
        page.locator("label").filter(has_text="Últimas 24 horas Filtrar por «Últimas 24 horas»").click()
        pattern = r"Aplicar el filtro actual para mostrar (\d+\+?) resultados"
        page.get_by_role("button", name=re.compile(pattern, re.IGNORECASE)).click()
    while True:
        # Focus the results list, then scroll so the lazy-loaded job cards render
        page.locator("div.jobs-search-results-list").click()
        for _ in range(15):
            page.mouse.wheel(0, 250)
        page.wait_for_timeout(3000)
        response = Selector(text=page.content())
        jobs = response.css("ul.scaffold-layout__list-container li.ember-view")
        for job in jobs:
            job_info = Job(
                url=urljoin(main_url, job.css("a::attr(href)").get()) if job.css("a::attr(href)").get() else None,
                job_title=job.css("a::attr(aria-label)").get(),
                job_id=job.css("::attr(data-occludable-job-id)").get(),
                # Drop the first two words of the logo alt text, keeping just the company name
                company_name=" ".join(job.css("img ::attr(alt)").get().split(" ")[2:]) if job.css("img ::attr(alt)").get() else None,
                company_image=job.css("img ::attr(src)").get(),
                job_location=" ".join(job.css(".job-card-container__metadata-item ::text").getall()) if job.css(".job-card-container__metadata-item ::text").get() else None
            )
            job_list.append(job_info)
            logger.info(f"Scraped job: {job_info.job_title}")
        # Check if there is a "Next" button and click it
        try:
            PAGE_NUMBER += 1
            page.get_by_role("button", name=f"Página {PAGE_NUMBER}", exact=True).click(timeout=4000)
            page.wait_for_timeout(3000)  # wait for the next page to load
            logger.info(f"Moving to page {PAGE_NUMBER}")
        except Exception:
            logger.warning("No more pages to scrape")
            break  # No more pages
    PAGE_NUMBER = 1
    return job_list


@click.command()
@click.option('--config', type=click.Path(exists=True), default='config.yaml', help='Path to the YAML config file')
@click.option('--headless/--no-headless', default=True, help='Run the browser in headless mode or not')
@click.option('--last24h', is_flag=True, default=False, help='Only scrape jobs posted in the last 24 hours')
def main(config, headless, last24h):
    # Load the YAML file with the credentials and the list of search parameters
    with open(config, 'r') as f:
        data = yaml.safe_load(f)
    email = data.get("email")
    password = data.get("password")
    params_list = data.get("params")
    # Start the browser
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=headless)
        page = browser.new_page(locale="es-ES")  # Spanish locale, to match the UI strings used in the locators
        # Log in to LinkedIn once
        login_to_linkedin(page, email, password, headless)
        all_jobs = []
        for params in params_list:
            logger.info(f"Crawl starting... Params: {params}")
            jobs = scrape_jobs(page, params, last24h)
            all_jobs.extend(jobs)
        # Create a DataFrame from the combined job list
        df = pd.DataFrame([job.__dict__ for job in all_jobs])
        # Save the DataFrame to a CSV file
        csv_file_path = 'jobs_data.csv'
        df.to_csv(csv_file_path, index=False)
        # Log the number of jobs scraped and saved
        logger.info(f"Scraped {len(all_jobs)} jobs and saved to {csv_file_path}")
        browser.close()


if __name__ == '__main__':
    main()
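Assuming the script is saved as linkedin_jobs.py (a hypothetical filename), a typical invocation using the click options defined above would be:

    python linkedin_jobs.py --config config.yaml --no-headless --last24h

Passing --no-headless keeps the browser window visible, which is required for solving the captcha checkpoint by hand if LinkedIn raises one.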
VictorLG98 (author) commented on Jul 29, 2023:
Sample config.yaml:

email: linkedin_user
password: linkedin_pass
params:
  - currentJobId: 3648419920
    geoId: 105646813
    keywords: Desarrollador de Python
    location: España
    refresh: true
  - currentJobId: 3662600850
    geoId: 105646813
    keywords: Automatizacion QA
    location: España
    refresh: false
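
Each entry under params is passed straight to urlencode by scrape_jobs, so the YAML keys become query-string parameters of the LinkedIn search URL. A minimal sketch of that construction, using the first entry above:

    from urllib.parse import urlencode

    # First "params" entry from the sample config.yaml
    params = {
        "currentJobId": 3648419920,
        "geoId": 105646813,
        "keywords": "Desarrollador de Python",
        "location": "España",
        "refresh": True,
    }
    base_url = "https://www.linkedin.com/jobs/search/"
    print(f"{base_url}?{urlencode(params)}")
    # https://www.linkedin.com/jobs/search/?currentJobId=3648419920&geoId=105646813&keywords=Desarrollador+de+Python&location=Espa%C3%B1a&refresh=True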
