Skip to content

Instantly share code, notes, and snippets.

@thayton
Last active Feb 12, 2019
Embed
What would you like to do?
node_modules/
/*
* Companion code for article at http://toddhayton.com/...
*
* Setup:
* $ mkdir scraper/
* $ cd scraper/
* $ npm init -y
* $ npm install puppeteer --save
*
* Usage:
* $ node l3com_scraper.js
*/
const puppeteer = require('puppeteer');
const url = 'https://l3com.taleo.net/careersection/l3_ext_us/jobsearch.ftl';
/*
 * Wait until the next page of job results has finished loading.
 *
 * The text inside the reload message span changes once the next page of jobs
 * has been loaded. For the first page of jobs, the span starts out as empty,
 * which is why the remembered text is initialised to ''. The last text seen
 * is kept in a closure so that every call waits for a change relative to the
 * text observed by the previous call.
 *
 * @param {object} page - Puppeteer page showing the job search results.
 * @returns {Promise<void>} Resolves once the reload message text has changed.
 */
const waitForJobsToLoad = (function () {
    let reloadMessage = '';

    return async function (page) {
        await page.waitForFunction(
            (oldText) => {
                // The span may not be in the DOM yet; report "not ready" so
                // polling continues instead of throwing on a null element.
                const span = document.querySelector('span#reloadMessage');
                return span !== null && span.innerText !== oldText;
            },
            {}, reloadMessage
        );
        // Remember the new text so the next call waits for the next change.
        reloadMessage = await page.$eval('span#reloadMessage', e => e.innerText);
    };
})();
/* Alternatively we could wait for the progress indicator to appear/disappear
*
async function waitForJobsToLoad(page) {
await page.waitFor(() => document.querySelector('div#progressIndicator').offsetWidth !== 0);
await page.waitFor(() => document.querySelector('div#progressIndicator').offsetWidth === 0);
}
*/
/*------------------------------------------------------------------------------
 * Look for the link for pageno in the pager and follow it. The link is matched
 * on its text, so if pageno was 6 we'd look for an anchor like:
 *
 * <a href="#" title="Go to page 6" aria-disabled="false">6</a>
 *
 * When the link exists it is clicked and we wait for the next page of jobs to
 * load before returning.
 *
 * Returns true when there are no more pages (no link for pageno was found),
 * false when the next page was loaded.
 */
async function gotoNextPage(page, pageno) {
    const pagerLinkXpath = `//ul[@class='pager']/li[@class='pagerlink']/a[text()='${pageno}']`;
    const matches = await page.$x(pagerLinkXpath);

    // No pager link for this page number: we are already on the last page.
    if (matches.length === 0) {
        return true;
    }

    const [link] = matches;
    await link.click();
    await waitForJobsToLoad(page);
    return false;
}
/*
 * Extract the jobs listed on the currently displayed results page.
 *
 * The extraction runs inside the browser via page.evaluate(), so the callback
 * must be self-contained: it can only use its own arguments and the page's
 * DOM, not variables from this Node.js module.
 *
 * @param {object} page - Puppeteer page showing the job search results.
 * @returns {Promise<Array<{title: string, href: string, location: string,
 *          postingDate: string}>>} One entry per matching job row.
 */
async function getJobs(page) {
    return page.evaluate(jobSelector => {
        //debugger;
        return Array.from(document.querySelectorAll(jobSelector), (tr) => {
            // Declared locally so nothing leaks onto the page's global object.
            const th = tr.querySelector('th');
            const td = tr.querySelectorAll('td');

            // Location and posting date are read from the 2nd and 3rd <td>;
            // presumably the 1st <td> holds something else -- verify against
            // the live table markup.
            return {
                'title': th.innerText.trim(),
                'href': th.querySelector('a').href,
                'location': td[1].innerText.trim(),
                'postingDate': td[2].innerText.trim()
            };
        });
    }, 'table#jobs tr[id^="job"]');
}
/*
 * Scrape every page of job results and print each job as pretty-printed JSON.
 *
 * Launches a browser, opens the job search page, then repeatedly prints the
 * jobs on the current page and moves to the next one until the pager has no
 * link for the next page number. The browser is closed even when a step
 * fails, so a failed run does not leave a browser process behind.
 *
 * @returns {Promise<void>} Resolves when all pages have been printed; rejects
 *          with the first error raised while scraping.
 */
async function main() {
    /* For debugging, launch a visible, slowed-down browser instead: */
    //const browser = await puppeteer.launch({ slowMo: 250, headless: false, devtools: true });
    const browser = await puppeteer.launch();

    try {
        const page = await browser.newPage();
        await page.goto(url);
        await waitForJobsToLoad(page);

        // pageno is the NEXT page to request, so the page currently being
        // printed is pageno - 1.
        let pageno = 2;
        while (true) {
            console.log(`Getting jobs on page ${pageno - 1}`);
            const jobs = await getJobs(page);
            jobs.forEach(j => console.log( JSON.stringify(j, null, 2) ));

            const noMorePages = await gotoNextPage(page, pageno++);
            if (noMorePages) {
                break;
            }

            /* Don't hit the server too quickly... */
            await page.waitFor(1000);
        }
    } finally {
        await browser.close();
    }
}
/* Run the scraper; report any failure and signal it through the exit code. */
main()
    .then(() => console.log('Complete!'))
    .catch((err) => {
        console.error(err);
        process.exitCode = 1;
    });
{
"name": "l3com_scraper",
"version": "1.0.0",
"lockfileVersion": 1,
"requires": true,
"dependencies": {
"agent-base": {
"version": "4.2.1",
"resolved": "https://registry.npmjs.org/agent-base/-/agent-base-4.2.1.tgz",
"integrity": "sha512-JVwXMr9nHYTUXsBFKUqhJwvlcYU/blreOEUkhNR2eXZIvwd+c+o5V4MgDPKWnMS/56awN3TRzIP+KoPn+roQtg==",
"requires": {
"es6-promisify": "5.0.0"
}
},
"async-limiter": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/async-limiter/-/async-limiter-1.0.0.tgz",
"integrity": "sha512-jp/uFnooOiO+L211eZOoSyzpOITMXx1rBITauYykG3BRYPu8h0UcxsPNB04RR5vo4Tyz3+ay17tR6JVf9qzYWg=="
},
"balanced-match": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.0.tgz",
"integrity": "sha1-ibTRmasr7kneFk6gK4nORi1xt2c="
},
"brace-expansion": {
"version": "1.1.11",
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz",
"integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==",
"requires": {
"balanced-match": "1.0.0",
"concat-map": "0.0.1"
}
},
"buffer-from": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.1.tgz",
"integrity": "sha512-MQcXEUbCKtEo7bhqEs6560Hyd4XaovZlO/k9V3hjVUF/zwW7KBVdSK4gIt/bzwS9MbR5qob+F5jusZsb0YQK2A=="
},
"concat-map": {
"version": "0.0.1",
"resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz",
"integrity": "sha1-2Klr13/Wjfd5OnMDajug1UBdR3s="
},
"concat-stream": {
"version": "1.6.2",
"resolved": "https://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz",
"integrity": "sha512-27HBghJxjiZtIk3Ycvn/4kbJk/1uZuJFfuPEns6LaEvpvG1f0hTea8lilrouyo9mVc2GWdcEZ8OLoGmSADlrCw==",
"requires": {
"buffer-from": "1.1.1",
"inherits": "2.0.3",
"readable-stream": "2.3.6",
"typedarray": "0.0.6"
}
},
"core-util-is": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz",
"integrity": "sha1-tf1UIgqivFq1eqtxQMlAdUUDwac="
},
"debug": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/debug/-/debug-3.1.0.tgz",
"integrity": "sha512-OX8XqP7/1a9cqkxYw2yXss15f26NKWBpDXQd0/uK/KPqdQhxbPa994hnzjcE2VqQpDslf55723cKPUOGSmMY3g==",
"requires": {
"ms": "2.0.0"
}
},
"es6-promise": {
"version": "4.2.4",
"resolved": "https://registry.npmjs.org/es6-promise/-/es6-promise-4.2.4.tgz",
"integrity": "sha512-/NdNZVJg+uZgtm9eS3O6lrOLYmQag2DjdEXuPaHlZ6RuVqgqaVZfgYCepEIKsLqwdQArOPtC3XzRLqGGfT8KQQ=="
},
"es6-promisify": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/es6-promisify/-/es6-promisify-5.0.0.tgz",
"integrity": "sha1-UQnWLz5W6pZ8S2NQWu8IKRyKUgM=",
"requires": {
"es6-promise": "4.2.4"
}
},
"extract-zip": {
"version": "1.6.7",
"resolved": "https://registry.npmjs.org/extract-zip/-/extract-zip-1.6.7.tgz",
"integrity": "sha1-qEC0uK9kAyZMjbV/Txp0Mz74H+k=",
"requires": {
"concat-stream": "1.6.2",
"debug": "2.6.9",
"mkdirp": "0.5.1",
"yauzl": "2.4.1"
},
"dependencies": {
"debug": {
"version": "2.6.9",
"resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz",
"integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==",
"requires": {
"ms": "2.0.0"
}
}
}
},
"fd-slicer": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/fd-slicer/-/fd-slicer-1.0.1.tgz",
"integrity": "sha1-i1vL2ewyfFBBv5qwI/1nUPEXfmU=",
"requires": {
"pend": "1.2.0"
}
},
"fs.realpath": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz",
"integrity": "sha1-FQStJSMVjKpA20onh8sBQRmU6k8="
},
"glob": {
"version": "7.1.3",
"resolved": "https://registry.npmjs.org/glob/-/glob-7.1.3.tgz",
"integrity": "sha512-vcfuiIxogLV4DlGBHIUOwI0IbrJ8HWPc4MU7HzviGeNho/UJDfi6B5p3sHeWIQ0KGIU0Jpxi5ZHxemQfLkkAwQ==",
"requires": {
"fs.realpath": "1.0.0",
"inflight": "1.0.6",
"inherits": "2.0.3",
"minimatch": "3.0.4",
"once": "1.4.0",
"path-is-absolute": "1.0.1"
}
},
"https-proxy-agent": {
"version": "2.2.1",
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-2.2.1.tgz",
"integrity": "sha512-HPCTS1LW51bcyMYbxUIOO4HEOlQ1/1qRaFWcyxvwaqUS9TY88aoEuHUY33kuAh1YhVVaDQhLZsnPd+XNARWZlQ==",
"requires": {
"agent-base": "4.2.1",
"debug": "3.1.0"
}
},
"inflight": {
"version": "1.0.6",
"resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz",
"integrity": "sha1-Sb1jMdfQLQwJvJEKEHW6gWW1bfk=",
"requires": {
"once": "1.4.0",
"wrappy": "1.0.2"
}
},
"inherits": {
"version": "2.0.3",
"resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.3.tgz",
"integrity": "sha1-Yzwsg+PaQqUC9SRmAiSA9CCCYd4="
},
"isarray": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz",
"integrity": "sha1-u5NdSFgsuhaMBoNJV6VKPgcSTxE="
},
"mime": {
"version": "2.3.1",
"resolved": "https://registry.npmjs.org/mime/-/mime-2.3.1.tgz",
"integrity": "sha512-OEUllcVoydBHGN1z84yfQDimn58pZNNNXgZlHXSboxMlFvgI6MXSWpWKpFRra7H1HxpVhHTkrghfRW49k6yjeg=="
},
"minimatch": {
"version": "3.0.4",
"resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.4.tgz",
"integrity": "sha512-yJHVQEhyqPLUTgt9B83PXu6W3rx4MvvHvSUvToogpwoGDOUQ+yDrR0HRot+yOCdCO7u4hX3pWft6kWBBcqh0UA==",
"requires": {
"brace-expansion": "1.1.11"
}
},
"minimist": {
"version": "0.0.8",
"resolved": "http://registry.npmjs.org/minimist/-/minimist-0.0.8.tgz",
"integrity": "sha1-hX/Kv8M5fSYluCKCYuhqp6ARsF0="
},
"mkdirp": {
"version": "0.5.1",
"resolved": "http://registry.npmjs.org/mkdirp/-/mkdirp-0.5.1.tgz",
"integrity": "sha1-MAV0OOrGz3+MR2fzhkjWaX11yQM=",
"requires": {
"minimist": "0.0.8"
}
},
"ms": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
"integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g="
},
"once": {
"version": "1.4.0",
"resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz",
"integrity": "sha1-WDsap3WWHUsROsF9nFC6753Xa9E=",
"requires": {
"wrappy": "1.0.2"
}
},
"path-is-absolute": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz",
"integrity": "sha1-F0uSaHNVNP+8es5r9TpanhtcX18="
},
"pend": {
"version": "1.2.0",
"resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz",
"integrity": "sha1-elfrVQpng/kRUzH89GY9XI4AelA="
},
"process-nextick-args": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.0.tgz",
"integrity": "sha512-MtEC1TqN0EU5nephaJ4rAtThHtC86dNN9qCuEhtshvpVBkAW5ZO7BASN9REnF9eoXGcRub+pFuKEpOHE+HbEMw=="
},
"progress": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/progress/-/progress-2.0.0.tgz",
"integrity": "sha1-ihvjZr+Pwj2yvSPxDG/pILQ4nR8="
},
"proxy-from-env": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.0.0.tgz",
"integrity": "sha1-M8UDmPcOp+uW0h97gXYwpVeRx+4="
},
"puppeteer": {
"version": "1.7.0",
"resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-1.7.0.tgz",
"integrity": "sha512-f+1DxKHPqce6CXUBz2eVO2WcATeVeQSOPG9GYaGObEZDCiCEUwG+gogjMsrvn7he2wHTqNVb5p6RUrwmr8XFBA==",
"requires": {
"debug": "3.1.0",
"extract-zip": "1.6.7",
"https-proxy-agent": "2.2.1",
"mime": "2.3.1",
"progress": "2.0.0",
"proxy-from-env": "1.0.0",
"rimraf": "2.6.2",
"ws": "5.2.2"
}
},
"readable-stream": {
"version": "2.3.6",
"resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz",
"integrity": "sha512-tQtKA9WIAhBF3+VLAseyMqZeBjW0AHJoxOtYqSUZNJxauErmLbVm2FW1y+J/YA9dUrAC39ITejlZWhVIwawkKw==",
"requires": {
"core-util-is": "1.0.2",
"inherits": "2.0.3",
"isarray": "1.0.0",
"process-nextick-args": "2.0.0",
"safe-buffer": "5.1.2",
"string_decoder": "1.1.1",
"util-deprecate": "1.0.2"
}
},
"rimraf": {
"version": "2.6.2",
"resolved": "https://registry.npmjs.org/rimraf/-/rimraf-2.6.2.tgz",
"integrity": "sha512-lreewLK/BlghmxtfH36YYVg1i8IAce4TI7oao75I1g245+6BctqTVQiBP3YUJ9C6DQOXJmkYR9X9fCLtCOJc5w==",
"requires": {
"glob": "7.1.3"
}
},
"safe-buffer": {
"version": "5.1.2",
"resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz",
"integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g=="
},
"string_decoder": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz",
"integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==",
"requires": {
"safe-buffer": "5.1.2"
}
},
"typedarray": {
"version": "0.0.6",
"resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz",
"integrity": "sha1-hnrHTjhkGHsdPUfZlqeOxciDB3c="
},
"util-deprecate": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz",
"integrity": "sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8="
},
"wrappy": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz",
"integrity": "sha1-tSQ9jz7BqjXxNkYFvA0QNuMKtp8="
},
"ws": {
"version": "5.2.2",
"resolved": "https://registry.npmjs.org/ws/-/ws-5.2.2.tgz",
"integrity": "sha512-jaHFD6PFv6UgoIVda6qZllptQsMlDEJkTQcybzzXDYM1XO9Y8em691FGMPmM46WGyLU4z9KMgQN+qrux/nhlHA==",
"requires": {
"async-limiter": "1.0.0"
}
},
"yauzl": {
"version": "2.4.1",
"resolved": "https://registry.npmjs.org/yauzl/-/yauzl-2.4.1.tgz",
"integrity": "sha1-lSj0QtqxsihOWLQ3m7GU4i4MQAU=",
"requires": {
"fd-slicer": "1.0.1"
}
}
}
}
{
"name": "l3com_scraper",
"version": "1.0.0",
"description": "",
"main": "l3com_scraper.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"repository": {
"type": "git",
"url": "git+ssh://git@gist.github.com/0c703f7a18ae1fe2d90db04910ff1434.git"
},
"keywords": [],
"author": "",
"license": "ISC",
"bugs": {
"url": "https://gist.github.com/0c703f7a18ae1fe2d90db04910ff1434"
},
"homepage": "https://gist.github.com/0c703f7a18ae1fe2d90db04910ff1434",
"dependencies": {
"puppeteer": "^1.7.0"
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment