-
-
Save adrianhorning08/8b1d7d98cb2548dd76031d3d3cb3537b to your computer and use it in GitHub Desktop.
LinkedIn Jobs Scrape
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Convert an array of flat JSON objects to CSV and trigger a browser download.
 *
 * Fixes over the naive version:
 *  - Escapes fields containing quotes or newlines, not just commas (RFC 4180);
 *    previously a value like `He said "hi"` produced corrupt CSV.
 *  - Coerces non-string values with String(); `value?.includes(",")` threw a
 *    TypeError for booleans/objects because `?.` only guards null/undefined.
 *  - Emits every row in header order, so objects whose keys differ in order
 *    or presence no longer shift values into the wrong columns.
 *  - Bails out gracefully on an empty array instead of crashing on jsonData[0].
 *
 * @param {Object[]} jsonData - Array of flat records; headers come from the first item.
 * @param {string} fileName - Suggested download file name.
 */
function createCSV(jsonData, fileName) {
  if (!Array.isArray(jsonData) || jsonData.length === 0) {
    console.warn("createCSV: nothing to export");
    return;
  }
  // Quote + double-up embedded quotes when the field contains a delimiter,
  // a quote, or a line break (RFC 4180). Nullish becomes an empty field.
  const escapeField = (value) => {
    if (typeof value === "number") return String(value);
    const text = value == null ? "" : String(value);
    return /[",\n\r]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text;
  };
  const headers = Object.keys(jsonData[0]);
  const csvData = [headers.join(",")];
  for (const item of jsonData) {
    // Map over headers (not the item's own keys) to keep columns aligned.
    csvData.push(headers.map((key) => escapeField(item[key])).join(","));
  }
  // Create a Blob containing the CSV data
  const csvBlob = new Blob([csvData.join("\n")], {
    type: "text/csv;charset=utf-8",
  });
  // Create a URL for the Blob
  const csvUrl = URL.createObjectURL(csvBlob);
  // Create a link element and click it to start the download
  const link = document.createElement("a");
  link.href = csvUrl;
  link.target = "_blank";
  link.download = fileName;
  document.body.appendChild(link);
  link.click();
  // Remove the link and revoke the Blob URL
  document.body.removeChild(link);
  URL.revokeObjectURL(csvUrl);
}
/**
 * Scroll the LinkedIn job-results list to the bottom, pausing so lazily
 * loaded cards can render. Resolves once a full pass over the list no
 * longer increases its scrollHeight (i.e. nothing new was loaded).
 */
async function scrollDown() {
  const list = document.querySelector(".jobs-search-results-list");
  await new Promise((resolve) => {
    const step = 1000;
    let scrolled = 0;
    const ticker = setInterval(() => {
      const heightBefore = list.scrollHeight;
      list.scrollBy(0, step);
      scrolled += step;
      // Keep scrolling until we have covered the list's current height.
      if (scrolled < heightBefore) return;
      scrolled = 0;
      if (list.scrollHeight > heightBefore) {
        // New content appeared; start another pass.
        return;
      }
      // Height stopped growing: we are done.
      clearInterval(ticker);
      resolve();
    }, 300);
  });
}
/**
 * Collapse a multi-line string onto a single line: trims each line, drops
 * blank lines, and joins the remainder with single spaces.
 *
 * @param {string|null|undefined} text - Raw text, possibly nullish.
 * @returns {string|undefined} Collapsed text, or undefined for nullish input.
 */
function getRidOfUnnecessaryLinesAndSpaces(text) {
  if (text == null) return undefined;
  const trimmedLines = text.split("\n").map((line) => line.trim());
  const nonEmptyLines = trimmedLines.filter((line) => line.length > 0);
  return nonEmptyLines.join(" ");
}
/**
 * Extract one job posting's fields from a result-list card (jobDiv) plus the
 * detail pane LinkedIn renders for the currently selected job.
 *
 * NOTE(review): positional child indexes (0, 3, 5) into the primary
 * description track LinkedIn's current markup and will drift with redesigns.
 * Missing fields fall back to "".
 *
 * @param {Element} jobDiv - A ".job-card-container" element.
 * @returns {Object} Flat record of the job's fields.
 */
function jsonify(jobDiv) {
  const titleEl = document.querySelector(
    "h2.job-details-jobs-unified-top-card__job-title"
  );
  const anchor = jobDiv.querySelector("a.ember-view");
  const locationText = jobDiv.querySelector("ul").textContent;
  const primary = document.querySelector(
    ".job-details-jobs-unified-top-card__primary-description div"
  );
  const primaryKids = primary?.children;
  const company = primaryKids?.[0]?.textContent;
  const companyLink = primaryKids?.[0]?.getAttribute("href");
  const posted = primaryKids?.[3]?.textContent;
  const applicants = primaryKids?.[5]?.textContent;
  const insightEls = document.querySelectorAll(
    ".job-details-jobs-unified-top-card__job-insight"
  );
  // First insight row reads "salary · employment type · level"; the salary
  // segment is optional, which shifts the remaining segments left by one.
  const firstInsight = insightEls?.[0]?.textContent?.split("·");
  const salary = firstInsight?.find((part) => part?.includes("$"));
  const employmentType = salary ? firstInsight?.[1] : firstInsight?.[0];
  const level = salary ? firstInsight?.[2] : firstInsight?.[1];
  // Second insight row reads "company size · industry".
  const secondInsight = insightEls?.[1]?.textContent?.split("·");
  const description = document.querySelector(
    ".jobs-description-content__text"
  ).textContent;
  return {
    title: titleEl?.textContent?.trim() || "",
    company: company?.trim() || "",
    link: `https://www.linkedin.com${anchor?.getAttribute("href")}`,
    jobLocation: locationText?.trim() || "",
    companyLink: companyLink || "",
    posted: getRidOfUnnecessaryLinesAndSpaces(posted?.trim()) || "",
    applicants: applicants?.trim() || "",
    salary: salary?.trim() || "",
    employmentType: employmentType?.trim() || "",
    level: level?.trim() || "",
    companySize: secondInsight?.[0]?.trim() || "",
    companyIndustry: secondInsight?.[1]?.trim() || "",
    description: description?.trim() || "",
  };
}
/**
 * Walk every page of a LinkedIn jobs search, clicking each job card so the
 * detail pane renders, scraping it with jsonify(), and finally downloading
 * all results as a CSV via createCSV().
 *
 * Must be started on page 1 of the search results (see TODO below): the
 * pagination logic advances by looking for the button labeled currentPage + 1.
 */
async function scrapeLinkedinJobs() {
  const allJobs = [];
  // Declared outside the pagination loop so the post-loop "was a next page
  // found?" check (i === pages.length) can read the final loop index.
  let i = 0;
  // TODO: important that they start this on page 1
  let currentPage = 1;
  let hasNextPage = true;
  while (hasNextPage) {
    console.log(
      `If you need jobs scraped, or any other data, email me: adrian@thewebscrapingguy.com`
    );
    // Fixed 3s wait for the page to load after a pagination click.
    // NOTE(review): timing-based, not event-based — slow networks may race.
    await new Promise((resolve, reject) => {
      setTimeout(() => {
        resolve();
      }, 3000);
    });
    // Scroll the result list so all lazily loaded cards exist in the DOM.
    await scrollDown();
    // collect the job data
    const jobDivs = document.querySelectorAll(".job-card-container");
    // Click each card sequentially; the 1s pause lets the detail pane render
    // before jsonify() reads it from the document.
    for (let index = 0; index < jobDivs.length; index++) {
      const jobDiv = jobDivs[index];
      jobDiv.click();
      await new Promise((resolve, reject) => setTimeout(resolve, 1000));
      const job = jsonify(jobDiv);
      allJobs.push(job);
    }
    // get the pages to paginate
    const pages = document.querySelectorAll(
      ".artdeco-pagination__pages--number li button"
    );
    // Find the button for the next page by its aria-label digits and click it.
    for (i = 0; i < pages.length; i++) {
      const page = pages[i];
      const pageNumber = page.getAttribute("aria-label").replace(/\D/g, "");
      // if the page number is equal to the current page + 1, then click it
      if (Number(pageNumber) === currentPage + 1) {
        console.log("advance to next page");
        hasNextPage = true;
        currentPage++;
        page.click();
        break;
      }
    }
    // Loop ran to completion without breaking => no next-page button existed
    // (also covers pages.length === 0), so stop paginating.
    if (i === pages.length) {
      hasNextPage = false;
    }
  }
  console.log("allJobs", allJobs);
  createCSV(allJobs, `linkedinJobs-${new Date().getTime()}.csv`);
  console.log(`Congrats! 🎉 You scraped ${allJobs.length} jobs!`);
  console.log(
    `If you need jobs scraped, or any other data, email me: adrian@thewebscrapingguy.com`
  );
}
// Kick off the scrape (top-level await: paste into the browser console).
await scrapeLinkedinJobs();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment