Skip to content

Instantly share code, notes, and snippets.

@thayton
Created February 12, 2019 16:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save thayton/330951c308bd525fc2abea49793d583c to your computer and use it in GitHub Desktop.
Save thayton/330951c308bd525fc2abea49793d583c to your computer and use it in GitHub Desktop.
/*
* Companion code for article at http://toddhayton.com/2019/02/12/revisiting-taleo-with-puppeteer/
*
* Setup:
* $ mkdir scraper/
* $ cd scraper/
* $ npm init -y
* $ npm install puppeteer --save
*
* Usage:
* $ node l3com_scraper.js
*/
const puppeteer = require('puppeteer');
const url = 'https://l3com.taleo.net/careersection/l3_ext_us/jobsearch.ftl';
/*
 * Wait until the current page of job results has finished loading.
 *
 * The text inside the reload message span changes once the next page of jobs
 * has been loaded. For the first page of jobs, the span starts out as empty.
 * The closure remembers the span text seen after the previous load so that
 * each call waits for the text to differ from it.
 *
 * @param {object} page - Puppeteer page showing the job search results.
 * @returns {Promise<void>} Resolves once the reload message text has changed.
 */
const waitForJobsToLoad = (function () {
    // Text of span#reloadMessage as of the last completed load
    let reloadMessage = '';

    return async function (page) {
        await page.waitForFunction(
            // Runs inside the browser page; oldText is the serialized reloadMessage
            oldText => {
                const span = document.querySelector('span#reloadMessage');
                // A missing span is treated as "not loaded yet" rather than
                // throwing on a null dereference
                return span !== null && span.innerText !== oldText;
            },
            {}, reloadMessage
        );
        reloadMessage = await page.$eval('span#reloadMessage', e => e.innerText);
    };
})();
/*
* Alternatively we could wait for the progress indicator to appear/disappear
*
async function waitForJobsToLoad(page) {
await page.waitFor(() => document.querySelector('div#progressIndicator').offsetWidth !== 0);
await page.waitFor(() => document.querySelector('div#progressIndicator').offsetWidth === 0);
}
*/
/*------------------------------------------------------------------------------
 * Advance the results to page `pageno` by clicking its link in the pager.
 * The link is located by its visible text, so for pageno 6 we'd look for:
 *
 *   <a href="#" title="Go to page 6" aria-disabled="false">6</a>
 *
 * Resolves to true when the pager has no link for `pageno` (no more pages),
 * and to false after the link was clicked and the next jobs finished loading.
 */
async function gotoNextPage(page, pageno) {
    const pagerLinkXp = `//ul[@class='pager']/li[@class='pagerlink']/a[text()='${pageno}']`;
    const matches = await page.$x(pagerLinkXp);

    // No link for the requested page number: we are on the last page
    if (matches.length === 0) {
        return true;
    }

    const [pagerLink] = matches;
    await pagerLink.click();
    await waitForJobsToLoad(page);
    return false;
}
/*
 * Scrape the job rows currently present in the jobs table.
 *
 * @param {object} page - Puppeteer page showing the job search results.
 * @returns {Promise<Array<{title: string, href: string, location: string, postingDate: string}>>}
 *   One entry per row matching 'table#jobs tr[id^="job"]'. The title and link
 *   come from the row's <th>; location and posting date come from the second
 *   and third <td> cells.
 */
async function getJobs(page) {
    // The callback is evaluated inside the browser page, so it must be
    // self-contained; the selector is passed in as an argument.
    return page.evaluate(jobSelector => {
        return Array.from(document.querySelectorAll(jobSelector), tr => {
            // Declared locally so nothing leaks onto the page's global object
            const th = tr.querySelector('th');
            const td = tr.querySelectorAll('td');

            return {
                title: th.innerText.trim(),
                href: th.querySelector('a').href,
                location: td[1].innerText.trim(),
                postingDate: td[2].innerText.trim()
            };
        });
    }, 'table#jobs tr[id^="job"]');
}
/*
 * Entry point: open the job search page, then print the jobs of every result
 * page as JSON, following the pager until no further page link exists.
 *
 * The browser is closed even when a step fails, so a scraping error does not
 * leave the launched browser running. The error still propagates to the caller.
 *
 * @returns {Promise<void>}
 */
async function main() {
    //const browser = await puppeteer.launch({ slowMo: 250, headless: false, devtools: true });
    const browser = await puppeteer.launch();

    try {
        const page = await browser.newPage();

        await page.goto(url);
        await waitForJobsToLoad(page);

        // pageno is the page to navigate to NEXT; the page on screen is pageno - 1
        let pageno = 2;

        while (true) {
            console.log(`Getting jobs on page ${pageno - 1}`);

            const jobs = await getJobs(page);
            jobs.forEach(j => console.log( JSON.stringify(j, null, 2) ));

            const noMorePages = await gotoNextPage(page, pageno++);
            if (noMorePages) {
                break;
            }

            /* Don't hit the server too quickly... */
            await page.waitFor(1000);
        }
    } finally {
        await browser.close();
    }
}
// Run the scraper; on failure, report the error and exit with a non-zero status
main()
    .then(() => console.log('Complete!'))
    .catch(err => {
        console.error(err);
        // Set the exit code instead of calling process.exit() so pending output is flushed
        process.exitCode = 1;
    });
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment