Skip to content

Instantly share code, notes, and snippets.

@adrianhorning08
Created August 11, 2023 16:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save adrianhorning08/c3315115a808e22f17dbdb48534ae857 to your computer and use it in GitHub Desktop.
Save adrianhorning08/c3315115a808e22f17dbdb48534ae857 to your computer and use it in GitHub Desktop.
Google Jobs Scraper
import * as cheerio from "cheerio";
import fs from "graceful-fs";
import puppeteerExtra from "puppeteer-extra";
import stealthPlugin from "puppeteer-extra-plugin-stealth";
import chromium from "@sparticuz/chromium";
(async function () {
console.log("starting");
// Search phrase to look up on Google Jobs.
const term = "marketing";
// Start timestamp; the script logs the elapsed time from this value when it finishes.
const start = Date.now();
puppeteerExtra.use(stealthPlugin());
// Launch a visible (non-headless) Chrome from a fixed macOS install path.
const browser = await puppeteerExtra.launch({
  headless: false,
  // headless: "new",
  // devtools: true,
  executablePath:
    "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
});
// Alternative launch configuration that uses the @sparticuz/chromium build:
// const browser = await puppeteerExtra.launch({
//   args: chromium.args,
//   // defaultViewport: chromium.defaultViewport,
//   defaultViewport: null,
//   executablePath: await chromium.executablePath(),
//   headless: "new",
// });
const page = await browser.newPage();
// Percent-encode the term so characters such as "&", "#" or "+" cannot break
// the query string. Spaces are still sent as "+", exactly as before.
const encodedTerm = encodeURIComponent(term).replace(/%20/g, "+");
await page.goto(
  `https://www.google.com/search?q=${encodedTerm}&oq=google+jobs&aqs=chrome..69i57j69i64j0i271j69i60l3.1269j0j1&sourceid=chrome&ie=UTF-8&ibp=htl;jobs&sa=X&ved=2ahUKEwj83teW-MqAAxXTlWoFHSZKA4gQudcGKAF6BAgfECs#fpstate=tldetail&htivrt=jobs&htichips=date_posted:today&htischips=date_posted;today&htilrad=-1.0&htiltype=1&htidocid=VLlnA5-pzFMAAAAAAAAAAA%3D%3D`
);
console.log("got to page");
// Fixed 3 second pause before scrolling. A plain timer is used because
// `page.waitForTimeout` is deprecated and no longer exists in newer Puppeteer
// releases.
await new Promise((resolve) => setTimeout(resolve, 3000));
/**
 * Scrolls a scrollable container inside the page until it stops growing.
 *
 * The container is scrolled down in fixed steps. Each time the accumulated
 * scroll distance reaches the container's `scrollHeight`, the function waits
 * `scrollDelay` milliseconds and compares the height again: if it grew, the
 * scrolling continues; otherwise the function finishes.
 *
 * The steps run strictly one after another, so there is never more than one
 * pending "did it grow?" check at a time.
 *
 * @param {object} page - Object exposing `evaluate(fn, arg)`; `fn` is run in
 *   the page context with `arg` (a Puppeteer `Page` in this script).
 * @param {object} [options]
 * @param {string} [options.selector=".gws-plugins-horizon-jobs__tl-lvc"] -
 *   CSS selector of the container to scroll.
 * @param {number} [options.distance=1000] - Pixels scrolled per step.
 * @param {number} [options.scrollDelay=5000] - Milliseconds to wait for more
 *   content once the bottom has been reached.
 * @param {number} [options.stepInterval=100] - Milliseconds between steps.
 * @returns {Promise<void>} Resolves when the container stopped growing, or
 *   immediately (with a warning) when the container does not exist.
 */
async function autoScroll(page, options = {}) {
  const {
    selector = ".gws-plugins-horizon-jobs__tl-lvc",
    distance = 1000,
    scrollDelay = 5000,
    stepInterval = 100,
  } = options;

  // The callback is executed in the page, so it must not close over any
  // variable from this scope; everything it needs is passed as an argument.
  const containerFound = await page.evaluate(
    async (config) => {
      const wrapper = document.querySelector(config.selector);
      if (!wrapper) {
        // Nothing to scroll: report it instead of waiting forever.
        return false;
      }
      const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
      let totalHeight = 0;
      for (;;) {
        await sleep(config.stepInterval);
        const scrollHeightBefore = wrapper.scrollHeight;
        wrapper.scrollBy(0, config.distance);
        totalHeight += config.distance;
        if (totalHeight < scrollHeightBefore) {
          continue;
        }
        // Scrolled at least one full container height: give the page time to
        // append more items, then check whether the container grew.
        totalHeight = 0;
        await sleep(config.scrollDelay);
        if (wrapper.scrollHeight <= scrollHeightBefore) {
          // No more content loaded, stop scrolling.
          return true;
        }
      }
    },
    { selector, distance, scrollDelay, stepInterval }
  );

  if (!containerFound) {
    console.warn(`autoScroll: no element matches "${selector}", nothing was scrolled`);
  }
}
// Scroll the job list to its end and snapshot the rendered HTML. Cleanup sits
// in `finally` so the pages and the browser are closed even when scrolling or
// reading the page content throws.
let html;
try {
  await autoScroll(page);
  html = await page.content();
  console.log("here i am");
} finally {
  // Every open page is closed before the browser itself, as in the original flow.
  const openPages = await browser.pages();
  await Promise.all(openPages.map((openPage) => openPage.close()));
  await browser.close();
  console.log("browser closed");
}

// From here on everything works on the static HTML snapshot.
const $ = cheerio.load(html);

// --- Detail panels: one record per job details section ---------------------
const detailSections = $("#gws-plugins-horizon-jobs__job_details_page");
console.log("trying to find detailSections");
console.log("detailSections", detailSections.length);
const json = detailSections.toArray().map((section) => {
  const $section = $(section);
  const id = $section.attr("data-encoded-doc-id"); // TODO: not sure this is the actual id
  const title = $section.find("h2").first().text();

  // First link whose text starts with "Apply"; the query string is dropped.
  // Stays undefined when no such link (or no href) exists.
  const applyLink = $section
    .find("a")
    .filter((_, anchor) => $(anchor).text().trim().startsWith("Apply"))
    .attr("href")
    ?.split("?")[0];

  // Company and location are located purely by position in the markup.
  const companyAndLocation = $section
    .children()
    .first()
    .children()
    .first()
    .children()
    .first()
    .children()
    .eq(1)
    .children()
    .eq(1);
  const company = companyAndLocation.children().first().text().trim();
  const location = companyAndLocation.children().eq(1).text().trim();

  // Trimmed text of each child of the fourth child of the section's first child.
  const quickFacts = $section
    .children()
    .first()
    .children()
    .eq(3)
    .children()
    .toArray()
    .map((fact) => $(fact).text().trim());

  // The description is only extracted when the phrase "Job description"
  // appears somewhere in the section; otherwise it stays undefined.
  let description;
  if ($section.text().includes("Job description")) {
    // NOTE(review): `.text()` includes descendant text, so this filter also
    // keeps every ancestor <div> of the heading. The text below is therefore
    // the concatenation of the next sibling of each match's parent — confirm
    // against the live markup that this yields only the description.
    description = $section
      .find("div")
      .filter((_, div) => $(div).text().trim().includes("Job description"))
      .parent()
      .next()
      .text()
      .trim()
      .replace(/\s\s+/g, " ");
  }

  return {
    id, // TODO: not sure this is the actual id
    title,
    description,
    applyLink,
    company,
    location,
    quickFacts,
  };
});

// --- List entries: summary fields of each job in the scrollable list -------
const jobsInScrollBar = $("li.gws-plugins-horizon-jobs__li-ed")
  .toArray()
  .map((job) => {
    const card = $(job).find(".gws-plugins-horizon-jobs__tl-lif > div > div");
    const title = card.children().eq(1).text().trim();
    const cardContent = card.children().last().children().first();
    const company = cardContent.children().first().text().trim();
    const location = cardContent.children().eq(1).text().trim();
    const source = cardContent.children().eq(2).text().trim();
    return { title, company, location, source };
  });

// Merge by index: list entry i is combined with detail record i, and the
// detail record's fields win on conflict. A missing list entry spreads as
// nothing.
// NOTE(review): assumes both collections are in the same order — confirm.
const all = json.map((detail, index) => ({
  ...jobsInScrollBar[index],
  ...detail,
}));
console.log("all", all.length);
const end = Date.now();
// time in seconds
console.log("time", (end - start) / 1000);
})();
@adrianhorning08
Copy link
Author

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment