Skip to content

Instantly share code, notes, and snippets.

@hellonearthis
Last active March 13, 2021 04:44
Show Gist options
  • Save hellonearthis/e1a418e7e84399a92b8177cfb92bda81 to your computer and use it in GitHub Desktop.
Save hellonearthis/e1a418e7e84399a92b8177cfb92bda81 to your computer and use it in GitHub Desktop.
node + puppeteer scraping and processing of a paginated table.
{
"name": "pup_scrapper",
"version": "1.0.0",
"description": "puppeteer vs govt",
"main": "ps.js",
"scripts": {
"start": "node ps.js"
},
"author": "",
"license": "ISC",
"dependencies": {
"d3": "^6.6.0",
"jsonfile": "^5.0.0",
"lodash": "^4.17.21",
"puppeteer": "^8.0.0"
}
}
/*
https://www.youtube.com/watch?v=IvaJ5n5xFqU looping through page content
for testing the scraper against a local copy of the page, without bugging the live site:
https://stackoverflow.com/questions/47587352/opening-local-html-file-using-puppeteer
await page.goto('file://C:/Users/compoundeye/test.html');
to view https://www.youtube.com/watch?v=crKJ2hGcQ3Q&list=PLw5h0DiJ-9PDTSsOmwZ0DhzPt2yQ6RY9z
*/
const _ = require("lodash");
const jsonfile = require("jsonfile");
const puppeteer = require("puppeteer");
/**
 * Converts the inner HTML of one table row into a petition-signature record.
 *
 * The row is split on `</td>`; the first six cells are reduced to whatever
 * follows the first `">` in the cell (the end of the opening tag's attribute
 * list), and the seventh cell is cut from its first `<a` onward.
 *
 * @param {string} rowHtml - innerHTML of a `<tr>` element.
 * @returns {?Object} The parsed record, or `null` when the row does not split
 *   into enough pieces to address the seventh cell.
 */
function parseRow(rowHtml) {
  const cells = rowHtml.trim().split('</td>');
  // Fewer than 7 pieces means cells[6] does not exist; skip the row instead
  // of throwing and aborting the whole scrape.
  if (cells.length < 7) {
    return null;
  }

  // Keep everything after the first `">` in the cell, trimmed.
  const cellContent = (cell) => cell.slice(cell.search('">') + 2).trim();

  const [
    sheet,
    surname,
    givenNames,
    originalAddress,
    consistentTownSuburb,
    consistentCityRegion,
  ] = cells.slice(0, 6).map(cellContent);

  // The signature cell is kept from its first `<a` onward. Without this guard
  // a cell with no `<a` would yield slice(-1), i.e. its last character.
  const signatureCell = cells[6];
  const anchorStart = signatureCell.search('<a');
  const sigIMG = anchorStart === -1 ? '' : signatureCell.slice(anchorStart).trim();

  return {
    sheetURL: sheet,
    surname,
    givenNames,
    originalAddress,
    consistentTownSuburb,
    consistentCityRegion,
    sigIMG,
  };
}

/**
 * Scrapes the paginated petition table page by page and writes every parsed
 * row to `teJson.json` as a single JSON array.
 *
 * Any error is logged rather than rethrown, and the browser is always closed
 * before the function returns.
 *
 * @param {number} [pageCount=600] - Number of pages to read; pages are
 *   requested as `?page=0` through `?page=pageCount - 1`.
 * @returns {Promise<void>} Resolves once the file is written or an error has
 *   been logged.
 */
async function run(pageCount = 600) {
  let browser;
  try {
    browser = await puppeteer.launch({
      headless: true,
      timeout: 0,
    });
    const page = await browser.newPage();
    // NOTE(review): this user-agent string starts with "5.0 (" rather than the
    // usual "Mozilla/5.0 (" — left untouched here; confirm whether that is intended.
    await page.setUserAgent('5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36');
    await page.setViewport({ width: 1280, height: 1080 });
    const baseURL = 'https://nzhistory.govt.nz/politics/womens-suffrage/petition?page=';
    // 0 disables the navigation timeout.
    page.setDefaultNavigationTimeout(0);

    const PG = []; // records accumulated across all pages

    for (let pageNum = 0; pageNum < pageCount; pageNum++) {
      console.log(`reading ${baseURL}${pageNum} \n`);
      await page.goto(`${baseURL}${pageNum}`, { waitUntil: 'networkidle2' });

      // Runs in the browser context: collect the raw HTML of each body row.
      const rows = await page.evaluate(() =>
        Array.from(
          document.querySelectorAll('div.view-content > table > tbody > tr'),
          (tr) => tr.innerHTML,
        ),
      );

      for (const rowHtml of rows) {
        const record = parseRow(rowHtml);
        if (record !== null) {
          PG.push(record);
        }
      }

      // Pause for a second between page requests.
      await page.waitForTimeout(1000);
      console.log(`Next page ${pageNum + 1}`);
    }

    // Awaited (no callback) so that FINISHED is only reported after the file
    // is on disk, and so a write failure reaches the catch below.
    await jsonfile.writeFile('teJson.json', PG);
    console.log('FINISHED!');
  } catch (e) {
    console.log('the error ', e);
  } finally {
    // Always release the browser; otherwise the Chromium child process is
    // left running after the scrape ends or fails.
    if (browser) {
      try {
        await browser.close();
      } catch (closeError) {
        console.error('failed to close browser', closeError);
      }
    }
  }
}
// Start the scrape. run() wraps its body in try/catch and logs failures
// itself, so the returned promise is not awaited or chained here.
run();
// run().then(console.log).catch(console.error);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment