Script scraping LinkedIn job data into a CSV file according to city and profession.
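Requirements: the script depends on requests and beautifulsoup4 (installable with pip), and the change_ip fallback additionally assumes a Linux host where the current user can run ifconfig via sudo.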
#!/usr/bin/python3
import requests
import csv
import os
import datetime
import time
import random
from bs4 import BeautifulSoup as soup

profession = "Kotlin Developer"
city = "Sao Paulo"
def filename(job, city):
    # Create filename according to inputs and date
    current_date = datetime.datetime.now().strftime("%m-%d-%Y-%Hh%M")
    return f'{job.replace(" ", "_")}_{city.replace(" ", "_")}_{current_date}.csv'
def generate_file(csv_filename, job, city, links):
    # newline='' prevents the csv module from writing blank lines on Windows
    with open(csv_filename, 'w', encoding='utf-8', newline='') as f:
        headers = ['Source', 'Organization', 'Job Title', 'Location', 'Posted', 'Applicants Hired', 'Seniority Level', 'Employment Type', 'Job Function', 'Industry']
        writer = csv.writer(f, dialect='excel')
        writer.writerow(headers)
        for job_link in links:
            job_datas = get_datas(job, city, job_link)
            writer.writerow(job_datas)
def check_file(filename):
    # Walk the current directory to confirm the CSV file was created
    for _, _, files in os.walk(os.getcwd()):
        for data in files:
            if filename == data:
                print(f"\n\033[1m✅ Successfully generated \033[4m{filename}\033[0m\033[1m file!\033[0m")
def get_nums(string):
    # Return the first all-digit token in the string (e.g. "25" from "Over 25 applicants")
    for num in string.split():
        if num.isdigit():
            return num
def change_ip(number):
    # Note: this only works on a Linux host with an 'eth0' interface,
    # sudo rights, and a 192.168.1.x network; elsewhere the commands fail silently.
    print("Current IP Address")
    os.system('sudo hostname -I')
    os.system('sudo ifconfig eth0 down')
    os.system(f'sudo ifconfig eth0 192.168.1.{number}')
    os.system('sudo ifconfig eth0 up')
    print("New IP Address")
    os.system('sudo hostname -I')
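
# A more portable alternative to change_ip, sketched here as an untested
# assumption (not used by the script as written): plain exponential backoff
# instead of reassigning the interface address.
def backoff(retry):
    # Sleep 2, 4, 8, ... seconds plus jitter before the next attempt
    time.sleep(2 ** (retry + 1) + random.random())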
def get_datas(job, city, job_link):
    job_datas = [job_link]
    contents = []  # stays empty if all retries fail
    try:
        for retry in range(5):
            time.sleep(random.randint(1, 3))
            page_req = requests.get(
                url=job_link,
                headers={'User-agent': f'{job}_{city} bot'}
            )
            if page_req.status_code == 429:  # status_code is an int, not a string
                print("\033[1;36m\n⚠️ Too many requests - Retrying with other IP...\033[0m")
                change_ip(random.randint(1, 30))
                time.sleep(random.randint(1, 3))
                continue
            page_req.raise_for_status()
            # Parse HTML
            job_soup = soup(page_req.text, 'html.parser')
            contents = job_soup.findAll('div', {'class': 'topcard__content-left'})
            if len(contents) == 0:
                # Couldn't retrieve the job's data, retry
                time.sleep(random.randint(1, 3))
                continue
            break
        if len(contents) != 0:
            # Topcard scraping
            for content in contents:
                # Scraping Organization Names
                orgs = {'Default-Org': [org.text for org in content.findAll('a', {'class': 'topcard__org-name-link topcard__flavor--black-link'})],
                        'Flavor-Org': [org.text for org in content.findAll('span', {'class': 'topcard__flavor'})]}
                if not orgs['Default-Org']:
                    org = orgs['Flavor-Org'][0]
                    job_datas.append(org)
                else:
                    for org in orgs['Default-Org']:
                        job_datas.append(org)
                # Scraping Job Title
                for title in content.findAll('h1', {'class': 'topcard__title'}):
                    print(f'\n\033[0;32m📌 {title.text}\033[0m', f'\033[1;33m- {org}\033[0m')
                    job_datas.append(title.text.replace(',', '.'))
                # Scraping Job Location
                for location in content.findAll('span', {'class': 'topcard__flavor topcard__flavor--bullet'}):
                    job_datas.append(location.text.replace(',', '.'))
                # Scraping Job Time Posted
                posts = {'Old': [posted.text for posted in content.findAll('span', {'class': 'topcard__flavor--metadata posted-time-ago__text'})],
                         'New': [posted.text for posted in content.findAll('span', {'class': 'topcard__flavor--metadata posted-time-ago__text posted-time-ago__text--new'})]}
                if not posts['New']:
                    for text in posts['Old']:
                        job_datas.append(text)
                else:
                    for text in posts['New']:
                        job_datas.append(text)
                # Scraping Number of Applicants Hired
                applicants = {'More-Than': [applicant.text for applicant in content.findAll('figcaption', {'class': 'num-applicants__caption'})],
                              'Current': [applicant.text for applicant in content.findAll('span', {'class': 'topcard__flavor--metadata topcard__flavor--bullet num-applicants__caption'})]}
                if not applicants['Current']:
                    for applicant in applicants['More-Than']:
                        job_datas.append(f'{get_nums(applicant)}+ Applicants')
                else:
                    for applicant in applicants['Current']:
                        job_datas.append(f'{get_nums(applicant)} Applicants')
            # Criteria scraping (Seniority Level, Employment Type, Job Function, Industry)
            for criteria in job_soup.findAll('span', {'class': 'job-criteria__text job-criteria__text--criteria'})[:4]:
                job_datas.append(criteria.text)
        else:
            print("\n\033[1;36m⚠️ Saving (only) the job link on the CSV file.\033[0m")
        # print(f"\033[0;34mExtracted Datas: {job_datas} \033[0m")
    except requests.HTTPError as err:
        print('\033[0;31m❌ Something went wrong!\033[0m', err)
    # Pad the row with empty cells so it always matches the 10 header columns
    if len(job_datas) < 10:
        for _ in range(10 - len(job_datas)):
            job_datas.append('')
    return job_datas
# MAIN CODE
if city and profession:  # '[city, profession] is not None' was always True
    try:
        # Let requests URL-encode the query parameters (spaces in city/profession)
        response = requests.get(
            url='https://www.linkedin.com/jobs/search/',
            params={'keywords': profession, 'location': city, 'position': 1, 'pageNum': 0}
        )
        response.raise_for_status()
        page_soup = soup(response.text, 'html.parser')
        # Extract Job Links
        job_links = []
        for res_card in page_soup.findAll("li", {"class": "result-card"}):
            for link in res_card.findAll('a', {'class': 'result-card__full-card-link'}):
                job_links.append(link['href'])
        if '-' in profession:
            job = ' '.join(x.capitalize() for x in profession.split('-'))
        else:
            job = profession.capitalize()
        if len(job_links) == 0:
            print("\033[1;36m\n⚠️ Couldn't extract job links list from LinkedIn, try again later!\033[0m")
        else:
            print(f'\033[1;33m\n🕵️ {len(job_links)} recent {job} jobs identified in {city.capitalize()}.\n\033[0m')
            # Extract data into a CSV file
            csv_filename = filename(job, city)
            generate_file(csv_filename, job, city, job_links)
            check_file(csv_filename)
            print(f'\033[1;33m\n🕵️ Written all information in: {csv_filename}\033[0m')
    except requests.HTTPError as err:
        print('\033[0;31m❌ Something went wrong!\033[0m', err)
else:
    print(f'\033[0;31m❌ Invalid Inputs. City = {city} | Profession = {profession}!\033[0m')
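
As a quick sanity check, the generated file can be read back with csv.DictReader; a minimal sketch, where the filename is a hypothetical example of what filename() produces (the real name depends on the run date):

import csv

# Hypothetical output name; substitute the file printed by the script
with open('Kotlin_Developer_Sao_Paulo_01-01-2024-12h00.csv', encoding='utf-8', newline='') as f:
    for row in csv.DictReader(f):
        print(row['Job Title'], '-', row['Organization'])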