Script scraping LinkedIn job data into a CSV file according to city and profession.
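Requirements: the script depends on requests and beautifulsoup4 (installable with pip), and the change_ip fallback additionally assumes a Linux host where the current user can run ifconfig via sudo.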
#!/usr/bin/python3
import requests
import csv
import os
import datetime
import time
import random
from bs4 import BeautifulSoup as soup

profession = "Kotlin Developer"
city = "Sao Paulo"
def filename(job, city):
    # Create filename according to inputs and date
    current_date = datetime.datetime.now().strftime("%m-%d-%Y-%Hh%M")
    return f'{job.replace(" ", "_")}_{city.replace(" ", "_")}_{current_date}.csv'
def generate_file(csv_filename, job, city, links):
    # newline='' prevents the csv module from writing blank lines on Windows
    with open(csv_filename, 'w', encoding='utf-8', newline='') as f:
        headers = ['Source', 'Organization', 'Job Title', 'Location', 'Posted', 'Applicants Hired', 'Seniority Level', 'Employment Type', 'Job Function', 'Industry']
        writer = csv.writer(f, dialect='excel')
        writer.writerow(headers)
        for job_link in links:
            job_datas = get_datas(job, city, job_link)
            writer.writerow(job_datas)
def check_file(filename):
    # Walk the current directory to confirm the CSV file was created
    for _, _, files in os.walk(os.getcwd()):
        for data in files:
            if filename == data:
                print(f"\n\033[1m✅ Successfully generated \033[4m{filename}\033[0m\033[1m file!\033[0m")
def get_nums(string):
    # Return the first all-digit token in the string (e.g. "25" from "Over 25 applicants")
    for num in string.split():
        if num.isdigit():
            return num
def change_ip(number):
    # Note: this only works on a Linux host with an 'eth0' interface,
    # sudo rights, and a 192.168.1.x network; elsewhere the commands fail silently.
    print("Current IP Address")
    os.system('sudo hostname -I')
    os.system('sudo ifconfig eth0 down')
    os.system(f'sudo ifconfig eth0 192.168.1.{number}')
    os.system('sudo ifconfig eth0 up')
    print("New IP Address")
    os.system('sudo hostname -I')
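
# A more portable alternative to change_ip, sketched here as an untested
# assumption (not used by the script as written): plain exponential backoff
# instead of reassigning the interface address.
def backoff(retry):
    # Sleep 2, 4, 8, ... seconds plus jitter before the next attempt
    time.sleep(2 ** (retry + 1) + random.random())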
def get_datas(job, city, job_link):
    job_datas = [job_link]
    contents = []  # stays empty if all retries fail
    try:
        for retry in range(5):
            time.sleep(random.randint(1, 3))
            page_req = requests.get(
                url=job_link,
                headers={'User-agent': f'{job}_{city} bot'}
            )
            if page_req.status_code == 429:  # status_code is an int, not a string
                print("\033[1;36m\n⚠️ Too many requests - Retrying with other IP...\033[0m")
                change_ip(random.randint(1, 30))
                time.sleep(random.randint(1, 3))
                continue
            page_req.raise_for_status()
            # Parse HTML
            job_soup = soup(page_req.text, 'html.parser')
            contents = job_soup.findAll('div', {'class': 'topcard__content-left'})
            if len(contents) == 0:
                # Couldn't retrieve the job's data, retry
                time.sleep(random.randint(1, 3))
                continue
            break
        if len(contents) != 0:
            # Topcard scraping
            for content in contents:
                # Scraping Organization Names
                orgs = {'Default-Org': [org.text for org in content.findAll('a', {'class': 'topcard__org-name-link topcard__flavor--black-link'})],
                        'Flavor-Org': [org.text for org in content.findAll('span', {'class': 'topcard__flavor'})]}
                if not orgs['Default-Org']:
                    org = orgs['Flavor-Org'][0]
                    job_datas.append(org)
                else:
                    for org in orgs['Default-Org']:
                        job_datas.append(org)
                # Scraping Job Title
                for title in content.findAll('h1', {'class': 'topcard__title'}):
                    print(f'\n\033[0;32m📌 {title.text}\033[0m', f'\033[1;33m- {org}\033[0m')
                    job_datas.append(title.text.replace(',', '.'))
                # Scraping Job Location
                for location in content.findAll('span', {'class': 'topcard__flavor topcard__flavor--bullet'}):
                    job_datas.append(location.text.replace(',', '.'))
                # Scraping Job Time Posted
                posts = {'Old': [posted.text for posted in content.findAll('span', {'class': 'topcard__flavor--metadata posted-time-ago__text'})],
                         'New': [posted.text for posted in content.findAll('span', {'class': 'topcard__flavor--metadata posted-time-ago__text posted-time-ago__text--new'})]}
                if not posts['New']:
                    for text in posts['Old']:
                        job_datas.append(text)
                else:
                    for text in posts['New']:
                        job_datas.append(text)
                # Scraping Number of Applicants Hired
                applicants = {'More-Than': [applicant.text for applicant in content.findAll('figcaption', {'class': 'num-applicants__caption'})],
                              'Current': [applicant.text for applicant in content.findAll('span', {'class': 'topcard__flavor--metadata topcard__flavor--bullet num-applicants__caption'})]}
                if not applicants['Current']:
                    for applicant in applicants['More-Than']:
                        job_datas.append(f'{get_nums(applicant)}+ Applicants')
                else:
                    for applicant in applicants['Current']:
                        job_datas.append(f'{get_nums(applicant)} Applicants')
            # Criteria scraping (Seniority Level, Employment Type, Job Function, Industry)
            for criteria in job_soup.findAll('span', {'class': 'job-criteria__text job-criteria__text--criteria'})[:4]:
                job_datas.append(criteria.text)
        else:
            print("\n\033[1;36m⚠️ Saving (only) the job link on the CSV file.\033[0m")
        # print(f"\033[0;34mExtracted Datas: {job_datas} \033[0m")
    except requests.HTTPError as err:
        print('\033[0;31m❌ Something went wrong!\033[0m', err)
    # Pad the row with empty cells so it always matches the 10 header columns
    if len(job_datas) < 10:
        for _ in range(10 - len(job_datas)):
            job_datas.append('')
    return job_datas
# MAIN CODE
if city and profession:  # '[city, profession] is not None' was always True
    try:
        # Let requests URL-encode the query parameters (spaces in city/profession)
        response = requests.get(
            url='https://www.linkedin.com/jobs/search/',
            params={'keywords': profession, 'location': city, 'position': 1, 'pageNum': 0}
        )
        response.raise_for_status()
        page_soup = soup(response.text, 'html.parser')
        # Extract Job Links
        job_links = []
        for res_card in page_soup.findAll("li", {"class": "result-card"}):
            for link in res_card.findAll('a', {'class': 'result-card__full-card-link'}):
                job_links.append(link['href'])
        if '-' in profession:
            job = ' '.join(x.capitalize() for x in profession.split('-'))
        else:
            job = profession.capitalize()
        if len(job_links) == 0:
            print("\033[1;36m\n⚠️ Couldn't extract job links list from LinkedIn, try again later!\033[0m")
        else:
            print(f'\033[1;33m\n🕵️ {len(job_links)} recent {job} jobs identified in {city.capitalize()}.\n\033[0m')
            # Extract data into a CSV file
            csv_filename = filename(job, city)
            generate_file(csv_filename, job, city, job_links)
            check_file(csv_filename)
            print(f'\033[1;33m\n🕵️ Written all information in: {csv_filename}\033[0m')
    except requests.HTTPError as err:
        print('\033[0;31m❌ Something went wrong!\033[0m', err)
else:
    print(f'\033[0;31m❌ Invalid Inputs. City = {city} | Profession = {profession}!\033[0m')
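
As a quick sanity check, the generated file can be read back with csv.DictReader; a minimal sketch, where the filename is a hypothetical example of what filename() produces (the real name depends on the run date):

import csv

# Hypothetical output name; substitute the file printed by the script
with open('Kotlin_Developer_Sao_Paulo_01-01-2024-12h00.csv', encoding='utf-8', newline='') as f:
    for row in csv.DictReader(f):
        print(row['Job Title'], '-', row['Organization'])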