Skip to content

Instantly share code, notes, and snippets.

@shashank-sharma
Created November 23, 2021 09:16
Show Gist options
  • Save shashank-sharma/6f6fddc6d6d01d92b8f3abf1d9880ca8 to your computer and use it in GitHub Desktop.
Save shashank-sharma/6f6fddc6d6d01d92b8f3abf1d9880ca8 to your computer and use it in GitHub Desktop.
Linkedin job scraper using Scrapy
# Small script that scrapes job postings through LinkedIn's jobs API.
# NOTE: It does not check for duplicates. That can be added here, or duplicates
# can be removed later on using job_id, which is unique per posting.
from urllib.parse import urlencode
from scrapy.http import HtmlResponse
import dateutil.relativedelta
import datetime
import scrapy
class QuotesSpider(scrapy.Spider):
    """Spider that collects job postings from LinkedIn's jobs API endpoints.

    It pages through the search endpoint, extracts the job id of every card
    on each results page, and then fetches and parses each posting.

    The search terms are plain class attributes, so they can be overridden
    per run with spider arguments, e.g.
    ``scrapy crawl job_spider -a keywords="data engineer" -a location="India"``.

    Duplicates are not removed: every request is sent with
    ``dont_filter=True``, so de-duplicate on ``job_id`` downstream.
    """

    name = "job_spider"

    # Default search terms; location can be set to a country name.
    keywords = "software engineer"
    location = "Earth"

    SEARCH_URL = "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search"
    JOB_URL = "https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{job_id}"
    PER_PAGE_COUNTER = 25
    # Paging stops once the start offset reaches this value (max 1000).
    MAX_START = 1000

    # Keyword names accepted by relativedelta for relative offsets.
    _DATE_UNITS = frozenset(
        ["years", "months", "weeks", "days", "hours", "minutes", "seconds"]
    )

    @staticmethod
    def _clean(value):
        """Return *value* stripped of surrounding whitespace, or None if empty."""
        return value.strip() if value else None

    @staticmethod
    def _first_int(text):
        """Return the first integer found in *text*, or -1 when there is none.

        Handles captions that do not start with the number, such as
        "Over 200 applicants" or "Be among the first 25 applicants".
        """
        for token in (text or "").split():
            digits = token.replace(",", "").rstrip("+")
            if digits.isdecimal():
                return int(digits)
        return -1

    def parse_date(self, date_str):
        """Convert a relative age such as "3 weeks ago" into a date string.

        Args:
            date_str: Text of the form "<count> <unit> ...", where unit is a
                singular or plural year/month/week/day/hour/minute/second.

        Returns:
            The current local time minus the given offset, formatted as
            ``dd/mm/yy``. When the text cannot be interpreted, a warning is
            logged and today's date is returned.
        """
        tokens = date_str.split()
        try:
            count = int(tokens[0])
            unit = tokens[1].lower()
        except (IndexError, ValueError):
            count, unit = 0, ""

        # Normalise "week" / "weeks" to the plural keyword relativedelta expects.
        key = unit if unit.endswith("s") else unit + "s"
        offset = {}
        if key in self._DATE_UNITS:
            offset[key] = -count
        else:
            self.logger.warning("Unrecognised posting age: %r", date_str)

        posted = datetime.datetime.now() + dateutil.relativedelta.relativedelta(**offset)
        return posted.strftime("%d/%m/%y")

    def get_requests_url(self, start):
        """Yield search-page URLs from offset *start* up to ``MAX_START``.

        Args:
            start: Offset of the first result to request; it advances by
                ``PER_PAGE_COUNTER`` for every generated URL.

        Yields:
            Fully encoded search URLs, one per results page.
        """
        for offset in range(start, self.MAX_START, self.PER_PAGE_COUNTER):
            params = {
                "keywords": self.keywords,
                "location": self.location,
                "position": "1",
                "pageNum": "0",
                "start": offset,
            }
            # Uncomment next 2 lines if you want recently posted job
            # Good to use when you run your scraper every 2 hours
            # params["sortBy"] = "R"
            # params["f_TPR"] = "r86400"
            yield self.SEARCH_URL + "?" + urlencode(params)

    def start_requests(self):
        """Schedule one request per search results page."""
        for url in self.get_requests_url(0):
            yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        """Extract job ids from a results page and follow each posting.

        List items without a usable ``data-entity-urn`` are skipped, so one
        malformed entry does not discard the rest of the page.
        """
        job_links = []
        for item in response.css("li"):
            # Cards are rendered either as <div class="base-card"> or <a class="base-card">.
            card = "div.base-card" if item.css("div.base-card") else "a.base-card"
            urn = item.css(card + "::attr(data-entity-urn)").get()
            parts = urn.split(":") if urn else []
            if len(parts) < 4:
                continue
            # The job id is the fourth colon-separated field of the URN.
            job_links.append(self.JOB_URL.format(job_id=parts[3]))
        yield from response.follow_all(job_links, self.parse_job, dont_filter=True)

    def parse_job(self, response):
        """Parse a single job posting into a flat dictionary.

        Fields whose markup is missing are emitted as ``None`` (``-1`` for
        ``job_applicants``, an empty string for ``job_content``) so the rest
        of the item is still produced.
        """
        # To get body in a specific encoding and do some formatting.
        # Ideally formatting should be done later on, while doing clean up.
        markup = (
            response.css("div.description__text")
            .css("div.show-more-less-html__markup")
            .get()
        )
        job_content = ""
        if markup:
            # Handle both "<br>" and "<br/>" spellings of a line break.
            markup = (
                markup.replace("<br/>", "\n")
                .replace("<br>", "\n")
                .replace("<li>", "\n- ")
                .replace("</li>", "")
            )
            description = HtmlResponse(url="", body=markup, encoding="utf-8")
            job_content = " ".join(
                description.css("div.show-more-less-html__markup *::text").extract()
            ).strip()

        # It's hard to get company name: try each known location in turn.
        company_name = (
            self._clean(response.css("span.topcard__flavor > *::text").get())
            or self._clean(
                response.css("p.unify-apply-page__company-name-location > *::text").get()
            )
            or self._clean(response.css("span.topcard__flavor::text").get())
        )

        # The company id is the fifth "/"-separated field of the first link on the page.
        first_href = response.css("a::attr(href)").get()
        href_parts = first_href.split("?")[0].split("/") if first_href else []
        company_id = href_parts[4] if len(href_parts) > 4 else None

        job_href = response.css("div.top-card-layout__entity-info > a::attr(href)").get()
        posted_text = self._clean(response.css("span.posted-time-ago__text::text").get())
        applicants_text = response.css("span.num-applicants__caption::text").get()

        criteria_names = [
            text.strip()
            for text in response.css("h3.description__job-criteria-subheader::text").getall()
        ]
        criteria_values = [
            text.strip()
            for text in response.css("span.description__job-criteria-text::text").getall()
        ]

        yield {
            "job_id": response.url.split("/")[-1],
            "company_id": company_id,
            "company_name": company_name,
            "job_url": job_href.split("?")[0] if job_href else None,
            "job_title": response.css("h2.top-card-layout__title::text").get(),
            "job_posting": self._clean(
                response.css("span.topcard__flavor--bullet::text").get()
            ),
            "job_date_string": posted_text,
            "job_date": self.parse_date(posted_text) if posted_text else None,
            "job_applicants": self._first_int(applicants_text),
            "job_content": job_content,
            "job_metadata": dict(zip(criteria_names, criteria_values)),
            "created_at": str(datetime.datetime.now()),
        }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment