# AbhishekPednekar84/naukri_scraper.py

## naukri_scraper.py
import os
import csv
import asyncio
import pandas as pd
from bs4 import BeautifulSoup

async def run_scraper(html_file_list):
    """Parse candidate cards out of saved HTML pages and export them.

    Every file in ``html_file_list`` is parsed with BeautifulSoup (``lxml``
    parser) and each ``div`` with class ``tuple-card`` becomes one record.
    The records of all files are written to a single spreadsheet through
    ``generate_spreadsheet``; the output is named after the last file in the
    list. ``generate_csv`` takes the same arguments if a CSV is preferred.

    Args:
        html_file_list: Iterable of paths to the HTML files to parse.

    Returns:
        None. Nothing is written when ``html_file_list`` is empty.
    """
    full_candidate_list = []
    file_name = None

    for html_file_name in html_file_list:
        # splitext only removes the final extension, so names that contain
        # additional dots ("page.1.html") are handled as well.
        file_name, _ = os.path.splitext(html_file_name)
        with open(html_file_name, "r", encoding="utf-8") as f:
            soup = BeautifulSoup(f, "lxml")

        full_candidate_list.extend(
            _parse_candidate(card)
            for card in soup.find_all("div", class_="tuple-card")
        )

    if file_name is None:
        # No input files: there is no name to derive the output file from.
        return

    await generate_spreadsheet(full_candidate_list, file_name)


def _parse_candidate(candidate):
    """Build the record for one ``tuple-card`` element."""
    # A fresh record per card: a field that is missing on this card stays
    # None instead of silently keeping the previous candidate's value.
    record = {
        "name": None,
        "experience": None,
        "compensation": None,
        "location": None,
        "current_designation": None,
        "organization": None,
        "education": None,
    }

    name_link = candidate.find("a", class_="link ext candidate-name ellipsis")
    if name_link is not None:
        record["name"] = name_link.text.strip().title()

    # The meta-data blocks are positional: experience, compensation, location.
    # zip stops at the shorter side, so a card with fewer blocks keeps None
    # for the missing ones.
    meta_data = candidate.find_all("div", class_="meta-data")
    for key, node in zip(("experience", "compensation", "location"), meta_data):
        record[key] = node.text.strip()

    for detail in candidate.find_all("div", class_="detail"):
        label, value = detail.label, detail.span
        if label is None or value is None:
            continue

        if label.text == "Current":
            designation, organization = _split_current_role(value.text)
            record["current_designation"] = designation
            record["organization"] = organization
        elif label.text == "Education":
            record["education"] = value.text

    return record


def _split_current_role(role_text):
    """Split a "Current" detail into ``(designation, organization)``."""
    role_text = role_text.strip()
    # Only the first " at " is the separator, so an organization name that
    # itself contains " at " cannot break the unpacking.
    designation, separator, organization = role_text.partition(" at ")
    if not separator:
        return role_text, None
    return designation, organization


async def generate_csv(candidate_list, file_name):
    """Write the candidate records to ``<file_name>.csv``.

    Args:
        candidate_list: Iterable of dicts keyed by the column names below.
            A key that is absent from a row is written as an empty cell; a
            key outside the column list makes ``csv.DictWriter`` raise
            ``ValueError``.
        file_name: Output path without the ``.csv`` extension.
    """
    field_names = [
        "name",
        "experience",
        "compensation",
        "location",
        "current_designation",
        "organization",
        "education",
    ]

    # newline="" lets the csv module control line endings itself; without it
    # platforms that translate "\n" end up with an empty line after each row.
    with open(f"{file_name}.csv", "w", encoding="utf-8", newline="") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=field_names)
        writer.writeheader()
        writer.writerows(candidate_list)


async def generate_spreadsheet(candidate_list, file_name):
    """Write the candidate records to ``./output/<file_name>.xlsx``.

    Args:
        candidate_list: List of dicts, one per candidate; the dict keys
            become the spreadsheet columns.
        file_name: Output file name without the ``.xlsx`` extension.
    """
    output_path = f"./output/{file_name}.xlsx"
    # to_excel does not create missing directories, so make sure the target
    # folder exists before writing.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    df = pd.DataFrame(candidate_list)
    df.to_excel(output_path)


async def read_html_files():
    """Collect the ``.html`` files of the current directory and scrape them.

    ``os.listdir`` returns entries in arbitrary order and ``run_scraper``
    names its output after the last file it receives, so the list is sorted
    to keep the result reproducible between runs.
    """
    html_file_list = sorted(
        entry for entry in os.listdir(".") if entry.endswith(".html")
    )
    await run_scraper(html_file_list)


if __name__ == "__main__":
    # Script entry point: drive the whole scrape on a fresh event loop.
    entry_coroutine = read_html_files()
    asyncio.run(entry_coroutine)
	import os
	import csv
	import asyncio
	import pandas as pd
	from bs4 import BeautifulSoup

	async def run_scraper(html_file_list):
		"""Parse candidate cards out of saved HTML pages and export them.

		Each file is parsed with BeautifulSoup (``lxml`` parser); every
		``div`` with class ``tuple-card`` yields one record. All records are
		passed to ``generate_spreadsheet`` together with the name (without
		extension) of the last file in the list.

		Args:
			html_file_list: Iterable of paths to the HTML files to parse.

		Returns:
			None. Nothing is written when ``html_file_list`` is empty.
		"""
		full_candidate_list = []
		file_name = None

		for html_file_name in html_file_list:
			# splitext tolerates additional dots in the file name.
			file_name = os.path.splitext(html_file_name)[0]
			with open(html_file_name, "r", encoding="utf-8") as f:
				soup = BeautifulSoup(f, "lxml")

			for candidate in soup.find_all("div", class_="tuple-card"):
				# One fresh dict per card so that fields missing on this
				# card are None rather than the previous candidate's data.
				candidate_info = {
					"name": None,
					"experience": None,
					"compensation": None,
					"location": None,
					"current_designation": None,
					"organization": None,
					"education": None,
				}

				# Candidate name
				name = candidate.find(
					"a", class_="link ext candidate-name ellipsis"
				)
				if name is not None:
					candidate_info["name"] = name.text.strip().title()

				# Experience, compensation and location come positionally
				# from the meta-data blocks; zip leaves missing ones as None.
				meta_data = candidate.find_all("div", class_="meta-data")
				meta_keys = ("experience", "compensation", "location")
				for key, node in zip(meta_keys, meta_data):
					candidate_info[key] = node.text.strip()

				for detail in candidate.find_all("div", class_="detail"):
					if detail.label is None or detail.span is None:
						continue

					# Candidate designation and organization
					if detail.label.text == "Current":
						current = detail.span.text.strip()
						# Split on the first " at " only, so a second
						# occurrence cannot break the unpacking.
						designation, sep, organization = current.partition(
							" at "
						)
						candidate_info["current_designation"] = designation
						candidate_info["organization"] = (
							organization if sep else None
						)

					# Candidate educational qualifications
					if detail.label.text == "Education":
						candidate_info["education"] = detail.span.text

				full_candidate_list.append(candidate_info)

		if file_name is None:
			# No input files, hence no name for the output file.
			return

		# Generate the spreadsheet once all the data is parsed
		await generate_spreadsheet(full_candidate_list, file_name)


	async def generate_csv(candidate_list, file_name):
		"""Write the candidate records to ``<file_name>.csv``.

		Args:
			candidate_list: Iterable of dicts keyed by the column names
				listed in ``field_names``.
			file_name: Output path without the ``.csv`` extension.
		"""
		field_names = [
			"name",
			"experience",
			"compensation",
			"location",
			"current_designation",
			"organization",
			"education",
		]

		# newline="" is required by the csv module so that it alone decides
		# the line endings (avoids blank rows where "\n" is translated).
		with open(
			f"{file_name}.csv", "w", encoding="utf-8", newline=""
		) as csv_file:
			writer = csv.DictWriter(csv_file, fieldnames=field_names)
			writer.writeheader()
			writer.writerows(candidate_list)


	async def generate_spreadsheet(candidate_list, file_name):
		"""Write the candidate records to ``./output/<file_name>.xlsx``.

		Args:
			candidate_list: List of dicts, one per candidate; the keys
				become the spreadsheet columns.
			file_name: Output file name without the ``.xlsx`` extension.
		"""
		output_path = f"./output/{file_name}.xlsx"
		# Create the target folder first; to_excel will not do it.
		os.makedirs(os.path.dirname(output_path), exist_ok=True)

		df = pd.DataFrame(candidate_list)
		df.to_excel(output_path)


	async def read_html_files():
		"""Scrape every ``.html`` file found in the current directory.

		The listing is sorted because ``os.listdir`` gives no ordering
		guarantee and the file order is passed on to ``run_scraper``.
		"""
		html_file_list = sorted(
			entry for entry in os.listdir(".") if entry.endswith(".html")
		)
		await run_scraper(html_file_list)


	if __name__ == "__main__":
		# Script entry point: run the scrape on a fresh event loop.
		asyncio.run(read_html_files())