Skip to content

Instantly share code, notes, and snippets.

@adrianhorning08
Created October 10, 2023 21:58
Show Gist options
  • Save adrianhorning08/8b1d7d98cb2548dd76031d3d3cb3537b to your computer and use it in GitHub Desktop.
Save adrianhorning08/8b1d7d98cb2548dd76031d3d3cb3537b to your computer and use it in GitHub Desktop.
LinkedIn Jobs Scrape
/**
 * Serialises an array of flat records to CSV and triggers a browser download.
 *
 * Column order is taken from the keys of the first record, and every row is
 * written in that same order so records with missing or differently ordered
 * keys cannot shift values into the wrong column.
 *
 * @param {Array<Object>} jsonData - Records to export; nothing is downloaded
 *   when this is empty or not an array.
 * @param {string} fileName - File name suggested to the browser.
 * @returns {void}
 */
function createCSV(jsonData, fileName) {
  if (!Array.isArray(jsonData) || jsonData.length === 0) {
    console.warn("createCSV: no rows to export, skipping download");
    return;
  }

  // A field must be quoted when it contains a comma, a double quote or a line
  // break; embedded double quotes are escaped by doubling them.
  const toField = (value) => {
    if (value === null || value === undefined) {
      return "";
    }
    const text = String(value);
    return /[",\r\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text;
  };

  const headers = Object.keys(jsonData[0]);
  const lines = [headers.map(toField).join(",")];
  for (const item of jsonData) {
    lines.push(headers.map((key) => toField(item?.[key])).join(","));
  }

  // Create a Blob containing the CSV data and a temporary URL pointing at it
  const csvBlob = new Blob([lines.join("\n")], {
    type: "text/csv;charset=utf-8",
  });
  const csvUrl = URL.createObjectURL(csvBlob);

  // A detached <a download> is not clickable in every browser, so the link is
  // attached to the body for the duration of the click.
  const link = document.createElement("a");
  link.href = csvUrl;
  link.target = "_blank";
  link.download = fileName;
  document.body.appendChild(link);
  link.click();

  // Remove the link and revoke the Blob URL
  document.body.removeChild(link);
  URL.revokeObjectURL(csvUrl);
}
/**
 * Scrolls the job results list (`.jobs-search-results-list`) downwards in
 * fixed steps until its bottom is reached and its height has stopped growing.
 *
 * Resolves immediately, with a warning, when the list element is not on the
 * page; otherwise resolves once the list height was unchanged between two
 * consecutive bottom checks.
 *
 * @returns {Promise<void>}
 */
async function scrollDown() {
  const wrapper = document.querySelector(".jobs-search-results-list");
  if (!wrapper) {
    // Without this guard the interval below would throw on every tick and the
    // promise would never settle.
    console.warn(
      "scrollDown: .jobs-search-results-list not found, skipping scroll"
    );
    return;
  }

  const distance = 1000;
  const intervalMs = 300;

  await new Promise((resolve) => {
    let totalHeight = 0;
    // Height seen the last time the bottom was reached; -1 means "never".
    let heightAtLastBottom = -1;

    const timer = setInterval(() => {
      const currentHeight = wrapper.scrollHeight;
      wrapper.scrollBy(0, distance);
      totalHeight += distance;

      if (totalHeight < currentHeight) {
        // Still above the bottom of what is currently rendered.
        return;
      }

      if (currentHeight > heightAtLastBottom) {
        // First time at this height: wait one more tick so that any content
        // appended in response to the scroll can show up in scrollHeight.
        heightAtLastBottom = currentHeight;
        return;
      }

      // Height did not change since the previous bottom check: stop.
      clearInterval(timer);
      resolve();
    }, intervalMs);
  });
}
/**
 * Collapses multi-line text into a single line: each line is trimmed, blank
 * lines are dropped, and the remaining lines are joined with one space.
 *
 * @param {string|null|undefined} text - Text to normalise.
 * @returns {string|undefined} The single-line text, or `undefined` when
 *   `text` is `null` or `undefined`.
 */
function getRidOfUnnecessaryLinesAndSpaces(text) {
  if (text === null || text === undefined) {
    return undefined;
  }

  const keptLines = [];
  for (const rawLine of text.split("\n")) {
    const trimmedLine = rawLine.trim();
    if (trimmedLine !== "") {
      keptLines.push(trimmedLine);
    }
  }
  return keptLines.join(" ");
}
/**
 * Builds a flat record for one job from the job card and from the job details
 * currently present in the document.
 *
 * The link and location are read from `jobDiv`; every other field is read
 * from `document`, so the caller is expected to have selected this job before
 * calling. Any element that cannot be found yields an empty string rather
 * than an exception.
 *
 * @param {Element} jobDiv - The job card element.
 * @returns {{title: string, company: string, link: string,
 *   jobLocation: string, companyLink: string, posted: string,
 *   applicants: string, salary: string, employmentType: string,
 *   level: string, companySize: string, companyIndustry: string,
 *   description: string}}
 */
function jsonify(jobDiv) {
  const linkedinOrigin = "https://www.linkedin.com";

  const titleEl = document.querySelector(
    "h2.job-details-jobs-unified-top-card__job-title"
  );
  const linkEl = jobDiv?.querySelector("a.ember-view");
  const locationEl = jobDiv?.querySelector("ul");
  const primaryDescription = document.querySelector(
    ".job-details-jobs-unified-top-card__primary-description div"
  );
  const insights = document.querySelectorAll(
    ".job-details-jobs-unified-top-card__job-insight"
  );
  const descriptionEl = document.querySelector(
    ".jobs-description-content__text"
  );

  // Positional children of the primary description: 0 = company anchor,
  // 3 = posted, 5 = applicants.
  const companyEl = primaryDescription?.children?.[0];
  const posted = primaryDescription?.children?.[3]?.textContent;
  const applicants = primaryDescription?.children?.[5]?.textContent;

  // Resolve the card href against the LinkedIn origin. A missing href gives
  // an empty link instead of "https://www.linkedin.comundefined", and an
  // already-absolute href is kept as is.
  const href = linkEl?.getAttribute("href");
  let link = "";
  if (href) {
    try {
      link = new URL(href, linkedinOrigin).href;
    } catch {
      // Unparseable href: fall back to plain concatenation.
      link = `${linkedinOrigin}${href}`;
    }
  }

  // First insight line: "[salary ·] employment type · level". The salary is
  // the segment containing "$"; the remaining segments keep their order.
  const firstInsight = insights?.[0]?.textContent?.split("·") ?? [];
  const salaryIndex = firstInsight.findIndex((part) => part.includes("$"));
  const salary = salaryIndex === -1 ? undefined : firstInsight[salaryIndex];
  const [employmentType, level] = firstInsight.filter(
    (_, index) => index !== salaryIndex
  );

  // Second insight line: "company size · industry".
  const [companySize, companyIndustry] =
    insights?.[1]?.textContent?.split("·") ?? [];

  return {
    title: titleEl?.textContent?.trim() ?? "",
    company: companyEl?.textContent?.trim() ?? "",
    link,
    jobLocation: locationEl?.textContent?.trim() ?? "",
    companyLink: companyEl?.getAttribute?.("href") || "",
    posted: getRidOfUnnecessaryLinesAndSpaces(posted?.trim()) ?? "",
    applicants: applicants?.trim() ?? "",
    salary: salary?.trim() ?? "",
    employmentType: employmentType?.trim() ?? "",
    level: level?.trim() ?? "",
    companySize: companySize?.trim() ?? "",
    companyIndustry: companyIndustry?.trim() ?? "",
    description: descriptionEl?.textContent?.trim() ?? "",
  };
}
/**
 * Walks the job search result pages, collects one record per job card and
 * downloads the result as a CSV file.
 *
 * For each page it waits, scrolls the result list, clicks every
 * `.job-card-container` and extracts the record with `jsonify`, then clicks
 * the pagination button whose number is the current page + 1. It stops when
 * no such button exists.
 *
 * @returns {Promise<void>}
 */
async function scrapeLinkedinJobs() {
  const wait = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
  const promoMessage = `If you need jobs scraped, or any other data, email me: adrian@thewebscrapingguy.com`;

  const allJobs = [];
  // TODO: important that they start this on page 1
  let currentPage = 1;
  let hasNextPage = true;

  while (hasNextPage) {
    console.log(promoMessage);
    // wait for the page to load
    await wait(3000);
    await scrollDown();

    // click on every job card and read the record for it
    const jobDivs = document.querySelectorAll(".job-card-container");
    for (const jobDiv of jobDivs) {
      jobDiv.click();
      await wait(1000);
      try {
        allJobs.push(jsonify(jobDiv));
      } catch (err) {
        // One unparseable card must not discard everything collected so far.
        console.error("Skipping a job card that could not be parsed", err);
      }
    }

    // find the pagination button for the next page number
    const pageButtons = document.querySelectorAll(
      ".artdeco-pagination__pages--number li button"
    );
    const nextPageButton = Array.from(pageButtons).find((button) => {
      // Buttons without an aria-label are treated as having no page number.
      const label = button.getAttribute("aria-label") ?? "";
      return Number(label.replace(/\D/g, "")) === currentPage + 1;
    });

    if (nextPageButton) {
      console.log("advance to next page");
      currentPage += 1;
      nextPageButton.click();
    } else {
      // no button for currentPage + 1: this was the last page
      hasNextPage = false;
    }
  }

  console.log("allJobs", allJobs);
  if (allJobs.length === 0) {
    console.warn("No jobs were found; skipping CSV export.");
    return;
  }
  createCSV(allJobs, `linkedinJobs-${new Date().getTime()}.csv`);
  console.log(`Congrats! 🎉 You scraped ${allJobs.length} jobs!`);
  console.log(promoMessage);
}
// Entry point: start the scrape as soon as the script is evaluated. Top-level
// `await` requires a context that allows it (an ES module, or presumably the
// browser DevTools console this script is meant to be pasted into).
await scrapeLinkedinJobs();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment