@GuillaumeFalourd
Last active September 21, 2023 10:56
Script scraping LinkedIn job data into a CSV file according to city and profession
#!/usr/bin/python3
import requests
import csv
import os
import datetime
import time
import random
from bs4 import BeautifulSoup as soup
# Search inputs
profession = "Kotlin Developer"
city = "Sao Paulo"
def filename(job, city):
    # Build the CSV filename from the inputs and the current date
    current_date = datetime.datetime.now()
    current_date_format = current_date.strftime("%m-%d-%Y-%Hh%M")
    return f'{job.replace(" ", "_")}_{city.replace(" ", "_")}_{current_date_format}.csv'
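# For illustration: filename("Kotlin Developer", "Sao Paulo") would return
# something like "Kotlin_Developer_Sao_Paulo_09-21-2023-10h56.csv"
# (the timestamp depends on when the script runs).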
def generate_file(csv_filename, job, city, links):
    # newline='' prevents the csv module from writing blank rows on Windows
    with open(csv_filename, 'w', encoding='utf-8', newline='') as f:
        headers = ['Source', 'Organization', 'Job Title', 'Location', 'Posted', 'Applicants', 'Seniority Level', 'Employment Type', 'Job Function', 'Industry']
        writer = csv.writer(f, dialect='excel')
        writer.writerow(headers)
        for job_link in links:
            writer.writerow(get_datas(job, city, job_link))
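# Each row mirrors the 10 headers above: the job link ("Source") followed by the
# scraped fields, padded with '' by get_datas when a field could not be retrieved.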
def check_file(filename):
    # Walk the working directory to confirm the CSV file was created
    for root, dirs, files in os.walk(os.getcwd()):
        if filename in files:
            print(f"\n\033[1m✅ Successfully generated \033[4m{filename}\033[0m\033[1m file!\033[0m")
            break
def get_nums(string):
    # Return the first whitespace-separated token that is purely numeric
    for num in string.split():
        if num.isdigit():
            return num
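# For illustration: get_nums("Be among the first 25 applicants") returns "25";
# it returns None when the string contains no purely numeric token.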
def change_ip(number):
    print("Current IP Address")
    os.system('sudo hostname -I')
    os.system('sudo ifconfig eth0 down')
    os.system(f'sudo ifconfig eth0 192.168.1.{number}')
    os.system('sudo ifconfig eth0 up')
    print("New IP Address")
    os.system('sudo hostname -I')
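# Note: this rotates the address on the local interface only; it assumes a Linux
# host with an eth0 interface, root privileges, and a 192.168.1.x LAN. Behind NAT
# the public IP address seen by LinkedIn will generally not change.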
def get_datas(job, city, job_link):
    job_datas = [job_link]
    contents = []
    try:
        for retry in range(5):
            time.sleep(random.randint(1, 3))
            page_req = requests.get(
                url=job_link,
                headers={'User-agent': f'{job}_{city} bot'}
            )
            # status_code is an int, so compare against 429, not "429"
            if page_req.status_code == 429:
                print("\033[1;36m\n⚠️ Too many requests - Retrying with another IP...\033[0m")
                change_ip(random.randint(1, 30))
                time.sleep(random.randint(1, 3))
                continue
            page_req.raise_for_status()
            # Parse HTML
            job_soup = soup(page_req.text, 'html.parser')
            contents = job_soup.find_all('div', {'class': 'topcard__content-left'})
            if len(contents) == 0:
                # Empty answer, retry after a short pause
                time.sleep(random.randint(1, 3))
                continue
            # Topcard retrieved, stop retrying
            break
        if len(contents) != 0:
            # Topcard scraping
            for content in contents:
                # Scraping organization names
                orgs = {'Default-Org': [org.text for org in content.find_all('a', {'class': 'topcard__org-name-link topcard__flavor--black-link'})],
                        'Flavor-Org': [org.text for org in content.find_all('span', {'class': 'topcard__flavor'})]}
                if orgs['Default-Org'] == []:
                    org = orgs['Flavor-Org'][0]
                    job_datas.append(org)
                else:
                    for org in orgs['Default-Org']:
                        job_datas.append(org)
                # Scraping job title
                for title in content.find_all('h1', {'class': 'topcard__title'}):
                    print(f'\n\033[0;32m📌 {title.text}\033[0m', f'\033[1;33m- {org}\033[0m')
                    job_datas.append(title.text.replace(',', '.'))
                # Scraping location
                for location in content.find_all('span', {'class': 'topcard__flavor topcard__flavor--bullet'}):
                    job_datas.append(location.text.replace(',', '.'))
                # Scraping the posting date
                posts = {'Old': [posted.text for posted in content.find_all('span', {'class': 'topcard__flavor--metadata posted-time-ago__text'})],
                         'New': [posted.text for posted in content.find_all('span', {'class': 'topcard__flavor--metadata posted-time-ago__text posted-time-ago__text--new'})]}
                if posts['New'] == []:
                    for text in posts['Old']:
                        job_datas.append(text)
                else:
                    for text in posts['New']:
                        job_datas.append(text)
                # Scraping the number of applicants
                applicants = {'More-Than': [applicant.text for applicant in content.find_all('figcaption', {'class': 'num-applicants__caption'})],
                              'Current': [applicant.text for applicant in content.find_all('span', {'class': 'topcard__flavor--metadata topcard__flavor--bullet num-applicants__caption'})]}
                if applicants['Current'] == []:
                    for applicant in applicants['More-Than']:
                        job_datas.append(f'{get_nums(applicant)}+ Applicants')
                else:
                    for applicant in applicants['Current']:
                        job_datas.append(f'{get_nums(applicant)} Applicants')
            # Criteria scraping (Seniority Level, Employment Type, Job Function, Industry)
            for criteria in job_soup.find_all('span', {'class': 'job-criteria__text job-criteria__text--criteria'})[:4]:
                job_datas.append(criteria.text)
        else:
            print("\n\033[1;36m⚠️ Couldn't retrieve all the data, saving (only) the job link on the CSV file.\033[0m")
        # Pad the row with empty strings so every CSV row has 10 columns
        if len(job_datas) < 10:
            for _ in range(10 - len(job_datas)):
                job_datas.append('')
    except requests.HTTPError as err:
        print('\033[0;31m❌ Something went wrong!\033[0m', err)
    return job_datas
# MAIN CODE
if city and profession:
    try:
        # Let requests URL-encode the query parameters (e.g. spaces in the inputs)
        response = requests.get(
            url='https://www.linkedin.com/jobs/search/',
            params={'keywords': profession, 'location': city, 'position': 1, 'pageNum': 0}
        )
        response.raise_for_status()
        page_soup = soup(response.text, 'html.parser')
        # Extract the job links
        job_links = []
        for res_card in page_soup.find_all('li', {'class': 'result-card'}):
            for link in res_card.find_all('a', {'class': 'result-card__full-card-link'}):
                job_links.append(link['href'])
        # Normalize the profession for display (e.g. "kotlin-developer" -> "Kotlin Developer")
        if '-' in profession:
            job = ' '.join(x.capitalize() for x in profession.split('-'))
        else:
            job = profession.capitalize()
        if len(job_links) == 0:
            print("\033[1;36m\n⚠️ Couldn't extract the job links list from LinkedIn, try again later!\033[0m")
        else:
            print(f'\033[1;33m\n🕵️ {len(job_links)} recent {job} jobs identified in {city.capitalize()}.\n\033[0m')
            # Extract the data into a CSV file
            csv_filename = filename(job, city)
            generate_file(csv_filename, job, city, job_links)
            check_file(csv_filename)
            print(f'\033[1;33m\n🕵️ Wrote all the information in: {csv_filename}\033[0m')
    except requests.HTTPError as err:
        print('\033[0;31m❌ Something went wrong!\033[0m', err)
else:
    print(f'\033[0;31m❌ Invalid inputs. City = {city} | Profession = {profession}!\033[0m')
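# Usage sketch (assumptions: Python 3 with `requests` and `beautifulsoup4`
# installed, e.g. via `pip install requests beautifulsoup4`). Edit the
# `profession` and `city` variables at the top, then run the script; the file
# name below is hypothetical, use whatever name you saved this gist as:
#   python3 linkedin_jobs_scraper.py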