// Google Jobs Scraper
// Gist: martinbowling/3dda5483264d6d111a524d05a41eb0f8
// Note (from the GitHub gist viewer): this file contains bidirectional Unicode
// text that may be interpreted or compiled differently than it appears. To
// review, open the file in an editor that reveals hidden Unicode characters.
import { setTimeout as sleep } from "node:timers/promises";

import chromium from "@sparticuz/chromium";
import * as cheerio from "cheerio";
import fs from "graceful-fs";
import puppeteerExtra from "puppeteer-extra";
import stealthPlugin from "puppeteer-extra-plugin-stealth";
// Settings for scrolling the job list. `distance` is pixels per step,
// `stepDelay` is the pause between steps, and `settleDelay` is how long to wait
// for more results to load once the end of the list has been reached.
const SCROLL_OPTIONS = Object.freeze({
  selector: ".gws-plugins-horizon-jobs__tl-lvc",
  distance: 1000,
  stepDelay: 100,
  settleDelay: 5000,
});

/**
 * Builds the Google Jobs search URL for a search term.
 *
 * The term is percent-encoded so characters such as "&" or "#" cannot break
 * the query string; spaces are rendered as "+", as before.
 *
 * @param {string} term - Free-text search term, e.g. "marketing".
 * @returns {string} The full search URL (filtered to jobs posted today).
 */
function buildSearchUrl(term) {
  const query = encodeURIComponent(term ?? "").replaceAll("%20", "+");
  return `https://www.google.com/search?q=${query}&oq=google+jobs&aqs=chrome..69i57j69i64j0i271j69i60l3.1269j0j1&sourceid=chrome&ie=UTF-8&ibp=htl;jobs&sa=X&ved=2ahUKEwj83teW-MqAAxXTlWoFHSZKA4gQudcGKAF6BAgfECs#fpstate=tldetail&htivrt=jobs&htichips=date_posted:today&htischips=date_posted;today&htilrad=-1.0&htiltype=1&htidocid=VLlnA5-pzFMAAAAAAAAAAA%3D%3D`;
}

/**
 * Scrolls the job list inside the page until it stops growing.
 *
 * Runs in the browser context. The list is scrolled in fixed steps; each time
 * the scrolled distance reaches the list's current height, the function waits
 * for `settleDelay` and stops if the height did not increase.
 *
 * The steps run strictly one after another. (A timer with an async callback
 * would keep firing during the settle wait and produce overlapping checks.)
 * If the list element is not on the page, the function returns immediately
 * instead of hanging.
 *
 * @param {object} page - Puppeteer page showing the job search results.
 * @returns {Promise<void>} Resolves when no more results are being loaded.
 */
async function autoScroll(page) {
  await page.evaluate(
    async ({ selector, distance, stepDelay, settleDelay }) => {
      const wrapper = document.querySelector(selector);
      if (!wrapper) {
        return;
      }

      // Node's timer helpers are not available inside the page.
      const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

      let scrolled = 0;
      for (;;) {
        await pause(stepDelay);
        const heightBefore = wrapper.scrollHeight;
        wrapper.scrollBy(0, distance);
        scrolled += distance;
        if (scrolled < heightBefore) {
          continue;
        }

        // Reached the end of what is loaded: give the page time to load more.
        scrolled = 0;
        await pause(settleDelay);
        if (wrapper.scrollHeight <= heightBefore) {
          return;
        }
      }
    },
    SCROLL_OPTIONS
  );
}

/**
 * Opens the search page in a browser, scrolls through all results and returns
 * the resulting page HTML. The browser is always closed, even on failure.
 *
 * @param {string} term - Search term passed to {@link buildSearchUrl}.
 * @returns {Promise<string>} Full HTML of the page after scrolling.
 */
async function fetchJobsHtml(term) {
  // Uses a locally installed Chrome (macOS path). To run with
  // @sparticuz/chromium instead, launch with `args: chromium.args`,
  // `defaultViewport: null`, `executablePath: await chromium.executablePath()`
  // and `headless: "new"`.
  const browser = await puppeteerExtra.launch({
    headless: false,
    executablePath:
      "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
  });

  try {
    const page = await browser.newPage();
    await page.goto(buildSearchUrl(term));
    console.log("got to page");

    // Fixed pause before scrolling; `page.waitForTimeout` is no longer
    // available in current Puppeteer releases, so a plain timer is used.
    await sleep(3000);

    await autoScroll(page);
    const html = await page.content();
    console.log("here i am");
    return html;
  } finally {
    // Closing the browser also closes all of its pages.
    await browser.close();
    console.log("browser closed");
  }
}

/**
 * Extracts one record per job-details section of the page.
 *
 * Company, location and quick facts are located purely by position in the
 * markup, so they depend on the exact page structure.
 *
 * @param {object} $ - Cheerio instance loaded with the page HTML.
 * @returns {Array<object>} Records with `id`, `title`, `description`,
 *   `applyLink`, `company`, `location` and `quickFacts`.
 */
function parseJobDetails($) {
  const detailSections = $("#gws-plugins-horizon-jobs__job_details_page");
  console.log("trying to find detailSections");
  console.log("detailSections", detailSections.length);

  return detailSections.toArray().map((section) => {
    const $section = $(section);
    const id = $section.attr("data-encoded-doc-id"); // TODO: not sure this is the actual id
    const title = $section.find("h2").first().text();

    // First link whose text starts with "Apply", without its query string.
    const applyLink = $section
      .find("a")
      .filter((_, a) => $(a).text().trim().startsWith("Apply"))
      .attr("href")
      ?.split("?")[0];

    const header = $section.children().first();

    const companyAndLocation = header
      .children()
      .first()
      .children()
      .first()
      .children()
      .eq(1)
      .children()
      .eq(1);
    const company = companyAndLocation.children().first().text().trim();
    const location = companyAndLocation.children().eq(1).text().trim();

    const quickFacts = header
      .children()
      .eq(3)
      .children()
      .toArray()
      .map((div) => $(div).text().trim());

    // The description is the text of the element following the parent of the
    // div(s) containing "Job description"; it stays undefined when the section
    // has no such text.
    // NOTE(review): the filter also matches every ancestor div of the heading,
    // so text from several siblings may be concatenated — confirm against the
    // live markup.
    let description;
    if ($section.text().includes("Job description")) {
      description = $section
        .find("div")
        .filter((_, div) => $(div).text().trim().includes("Job description"))
        .parent()
        .next()
        .text()
        .trim()
        .replace(/\s\s+/g, " ");
    }

    return {
      id, // TODO: not sure this is the actual id
      title,
      description,
      applyLink,
      company,
      location,
      quickFacts,
    };
  });
}

/**
 * Extracts the summary entries from the scrollable job list.
 *
 * @param {object} $ - Cheerio instance loaded with the page HTML.
 * @returns {Array<{title: string, company: string, location: string, source: string}>}
 *   One entry per list item, in document order.
 */
function parseJobList($) {
  return $("li.gws-plugins-horizon-jobs__li-ed")
    .toArray()
    .map((job) => {
      const card = $(job).find(
        ".gws-plugins-horizon-jobs__tl-lif > div > div"
      );
      const title = card.children().eq(1).text().trim();
      const meta = card.children().last().children().first();
      const company = meta.children().first().text().trim();
      const location = meta.children().eq(1).text().trim();
      const source = meta.children().eq(2).text().trim();
      return { title, company, location, source };
    });
}

/**
 * Combines list entries and detail records by position. Fields from the
 * detail record take precedence over the list entry at the same index.
 *
 * @param {Array<object>} details - Records from {@link parseJobDetails}.
 * @param {Array<object>} listItems - Entries from {@link parseJobList}.
 * @returns {Array<object>} One merged record per detail record.
 */
function mergeJobs(details, listItems) {
  return details.map((detail, i) => ({ ...listItems[i], ...detail }));
}

/**
 * Entry point: scrapes Google Jobs for a fixed search term, parses the
 * results and logs how many jobs were found and how long the run took.
 */
(async function () {
  console.log("starting");
  const term = "marketing";
  const start = Date.now();
  puppeteerExtra.use(stealthPlugin());

  const html = await fetchJobsHtml(term);

  const $ = cheerio.load(html);
  const details = parseJobDetails($);
  const jobsInScrollBar = parseJobList($);
  const all = mergeJobs(details, jobsInScrollBar);
  console.log("all", all.length);

  const end = Date.now();
  // time in seconds
  console.log("time", (end - start) / 1000);
})().catch((error) => {
  // Report the failure and keep a non-zero exit status.
  console.error("Google Jobs scrape failed:", error);
  process.exitCode = 1;
});
// (GitHub page footer: "Sign up for free to join this conversation on GitHub.
// Already have an account? Sign in to comment.")