Puppeteer scraper for https://campaignfinancemd.us/Public/ViewReceipts?theme=vista
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Companion code for article at http://toddhayton.com/...
 *
 * Setup:
 *   $ mkdir scraper/
 *   $ cd scraper/
 *   $ npm init -y
 *   $ npm install puppeteer --save
 *   $ npm install node-fetch --save
 *   $ npm install progress-stream --save
 *
 * Usage:
 *   $ node campaignfinancemd.js
 */
const fs = require('fs');
const path = require('path');

const fetch = require('node-fetch');
const progressStream = require('progress-stream');
const puppeteer = require('puppeteer');
// Address of the public receipts search page that submitSearch() opens.
const url = 'https://campaignfinancemd.us/Public/ViewReceipts';
/**
 * Wrap a stream's lifecycle events in a promise.
 *
 * @param {object} stream - Any emitter exposing `on()`; in this file it is
 *   the file write stream at the end of the download pipe chain.
 * @returns {Promise<void>} Resolves on the first 'end' or 'finish' event,
 *   rejects with the emitted error on 'error'.
 */
const streamCompletion = (stream) =>
  new Promise((resolve, reject) => {
    // Listen for both completion events so the helper works for readable
    // ('end') and writable ('finish') streams alike.
    for (const eventName of ['end', 'finish']) {
      stream.on(eventName, resolve);
    }
    stream.on('error', reject);
  });
/**
 * Drive a headless browser through the receipts search form and collect what
 * is needed to download the results outside the browser.
 *
 * @returns {Promise<{url: string, sessionCookie: string}>} The CSV export
 *   link's href and the session cookie formatted as `name=value`.
 * @throws {Error} If the CSV export link or the ASP.NET_SessionId cookie
 *   cannot be found after the search completes.
 */
const submitSearch = async () => {
  const browser = await puppeteer.launch();

  // Always shut the browser down, even when a step below throws; otherwise
  // the launched browser process is left running.
  try {
    const page = await browser.newPage();
    await page.goto(url);

    // Search filters. 'CL' is another ContributionType value that can be
    // substituted for 'CK' here.
    await page.select('select#ContributorType', 'I');
    await page.select('select#ContributionType', 'CK');

    await page.click('input#btnSearch');
    await page.waitForSelector('div#GridResults');

    // The export anchor is located via its CSV icon child.
    const [csvLink] = await page.$x('//a[@id="export"]/img[@title="CSV"]/..');
    if (!csvLink) {
      throw new Error('CSV export link not found on the results page');
    }
    const href = await (await csvLink.getProperty('href')).jsonValue();
    console.log(`href: ${href}`);

    /*
     * We need the session cookie in order to retrieve the results of the
     * current search. Otherwise, all you get back are the headers with no
     * data.
     *
     * We load the cookies from puppeteer into fetch as shown at
     * https://github.com/mickdekkers/episodecalendar-exporter/blob/master/index.js
     */
    const session = (await page.cookies()).find(
      (c) => c.name === 'ASP.NET_SessionId'
    );
    if (!session) {
      throw new Error('ASP.NET_SessionId cookie not found');
    }

    return {
      url: href,
      sessionCookie: `${session.name}=${session.value}`,
    };
  } finally {
    // Closing the browser also disposes of its pages.
    await browser.close();
  }
};
/**
 * Derive a safe local file name from a Content-Disposition header value.
 *
 * @param {?string} header - Raw header value, or null/undefined when absent.
 * @param {string} [fallback='results.csv'] - Name used when the header has
 *   no usable `filename=` parameter.
 * @returns {string} A bare file name with no directory components.
 */
const filenameFromContentDisposition = (header, fallback = 'results.csv') => {
  const match = /filename="?([^";]+)"?/i.exec(header || '');
  if (!match) {
    return fallback;
  }
  // Keep only the final path segment so a malformed or hostile header cannot
  // make us write outside the current directory.
  const name = path.basename(match[1].trim().replace(/\\/g, '/'));
  return name === '' || name === '.' || name === '..' ? fallback : name;
};

/**
 * Download the search results and save them in the current directory while
 * printing progress to stdout.
 *
 * @param {string} url - Export link returned by the search step.
 * @param {string} sessionCookie - `name=value` cookie sent with the request
 *   so the server returns the results of the current search.
 * @returns {Promise<void>} Resolves once the file has been fully written.
 * @throws {Error} If the server answers with a non-2xx status, or if any
 *   stream in the download chain fails.
 */
const downloadResults = async (url, sessionCookie) => {
  const resp = await fetch(url, {
    headers: {
      Cookie: sessionCookie,
    },
  });
  if (!resp.ok) {
    throw new Error(`Download failed: HTTP ${resp.status} ${resp.statusText}`);
  }

  // A missing content-length yields 0, i.e. "size unknown".
  const filesize = Number(resp.headers.get('content-length')) || 0;
  const filename = filenameFromContentDisposition(
    resp.headers.get('content-disposition')
  );

  const progress = progressStream({
    length: filesize,
    time: 100,
  });
  progress.on('progress', (p) => {
    process.stdout.write(`download progress: ${Math.round(p.percentage)}%\r`);
  });

  const output = fs.createWriteStream(filename);
  // Subscribe before piping so no completion or error event can be missed.
  const completion = streamCompletion(output);

  // pipe() does not forward errors downstream; without this a failure in the
  // response body or progress stream would leave `completion` pending forever.
  const abort = (err) => output.destroy(err);
  resp.body.on('error', abort);
  progress.on('error', abort);

  resp.body.pipe(progress).pipe(output);
  await completion;
};
/**
 * Run the search in the browser, then download its results using the export
 * link and session cookie obtained from it.
 *
 * @returns {Promise<void>} Resolves when the download has completed.
 */
const main = async () => {
  // Renamed on destructuring so the module-level `url` is not shadowed.
  const { url: exportUrl, sessionCookie } = await submitSearch();
  await downloadResults(exportUrl, sessionCookie);
};
// Script entry point: report success, or log the failure and exit non-zero
// instead of leaving the rejection unhandled.
main()
  .then(() => {
    console.log('\nComplete');
  })
  .catch((err) => {
    console.error(err);
    process.exitCode = 1;
  });
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment