Skip to content

Instantly share code, notes, and snippets.

@thayton
Last active September 14, 2018 20:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thayton/5fa2b22def4ac43e777660321081814b to your computer and use it in GitHub Desktop.
Save thayton/5fa2b22def4ac43e777660321081814b to your computer and use it in GitHub Desktop.
/*
* Companion code for article at http://toddhayton.com/...
*
* Setup:
* $ mkdir scraper/
* $ cd scraper/
* $ npm init -y
* $ npm install puppeteer --save
* $ npm install node-fetch --save
* $ npm install progress-stream --save
*
* Usage:
* $ node campaignfinancemd.js
*/
const progressStream = require('progress-stream');
const puppeteer = require('puppeteer');
const fetch = require('node-fetch');
const fs = require('fs');
const url = 'https://campaignfinancemd.us/Public/ViewReceipts';
/**
 * Wrap a stream's lifecycle events in a promise.
 *
 * @param {object} stream - Event emitter exposing `on(event, handler)`.
 * @returns {Promise<*>} Settles with whichever of these fires first: resolves
 *   on 'end' or 'finish' (with the event's first argument, if any), rejects
 *   with the emitted error on 'error'.
 */
const streamCompletion = function (stream) {
  return new Promise(function (resolve, reject) {
    // Readable streams signal completion with 'end', writable ones with
    // 'finish'; listen for both so either kind of stream can be awaited.
    for (const completionEvent of ['end', 'finish']) {
      stream.on(completionEvent, resolve);
    }
    stream.on('error', function (failure) {
      reject(failure);
    });
  });
};
/**
 * Drive a headless browser through the receipts search form and collect what
 * is needed to download the results outside the browser.
 *
 * @returns {Promise<{url: string, sessionCookie: string}>} The CSV export
 *   link from the results page and the session cookie as a `name=value`
 *   string.
 * @throws {Error} If the CSV export link or the ASP.NET_SessionId cookie
 *   cannot be found, or if any browser step fails.
 */
const submitSearch = async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url);

    // Form filter values. NOTE(review): 'I' / 'CK' are option values of the
    // site's <select> elements; their meaning is not visible from this file.
    // 'CL' is an alternative ContributionType value the author also tried.
    await page.select('select#ContributorType', 'I');
    await page.select('select#ContributionType', 'CK');

    await page.click('input#btnSearch');
    await page.waitForSelector('div#GridResults');

    // The export anchor is located through its CSV icon, then we step back up
    // to the <a> element to read its href.
    const [exportLink] = await page.$x('//a[@id="export"]/img[@title="CSV"]/..');
    if (!exportLink) {
      throw new Error('CSV export link not found on the results page');
    }
    const href = await (await exportLink.getProperty('href')).jsonValue();
    console.log(`href: ${href}`);

    /*
     * We need the session cookie in order to retrieve the results of the
     * current search. Otherwise, all you get back are the headers with no
     * data.
     *
     * We load the cookies from puppeteer into fetch as shown at
     * https://github.com/mickdekkers/episodecalendar-exporter/blob/master/index.js
     */
    const session = (await page.cookies()).find(
      c => c.name === 'ASP.NET_SessionId'
    );
    if (!session) {
      throw new Error('ASP.NET_SessionId cookie not found; cannot download results');
    }

    return {
      url: href,
      sessionCookie: `${session.name}=${session.value}`,
    };
  } finally {
    // Always shut the browser down, even when a step above throws; closing
    // the browser also closes its pages.
    await browser.close();
  }
};
/**
 * Download the export for a previously submitted search and save it in the
 * current working directory, printing progress to stdout.
 *
 * @param {string} url - Export link to fetch.
 * @param {string} sessionCookie - `name=value` cookie string sent in the
 *   Cookie request header.
 * @returns {Promise<void>} Resolves once streamCompletion() reports that the
 *   output stream is done.
 * @throws {Error} If the response status is not OK, or if any stage of the
 *   download stream fails.
 */
const downloadResults = async (url, sessionCookie) => {
  const resp = await fetch(url, {
    headers: {
      Cookie: sessionCookie,
    },
  });
  if (!resp.ok) {
    // Without this check an error page would be saved as if it were the data.
    throw new Error(`Download failed: HTTP ${resp.status} ${resp.statusText}`);
  }

  // Content-Length can be missing; treat anything unusable as "unknown" (0).
  const declaredLength = Number.parseInt(resp.headers.get('content-length'), 10);
  const totalBytes =
    Number.isFinite(declaredLength) && declaredLength > 0 ? declaredLength : 0;

  // The filename comes from a response header, so sanitize it: drop trailing
  // parameters and surrounding quotes, and keep only the last path component
  // so the file can never be written outside the working directory.
  const disposition = resp.headers.get('content-disposition') || '';
  const headerName = (disposition.split('filename=')[1] || '')
    .split(';')[0]
    .trim()
    .replace(/^"|"$/g, '');
  const baseName = headerName.split(/[\\/]/).pop();
  const filename =
    baseName && baseName !== '.' && baseName !== '..' ? baseName : 'results.csv';

  const progress = progressStream({
    length: totalBytes,
    time: 100,
  });
  progress.on('progress', p => {
    // A percentage is only meaningful when the total size is known.
    const status =
      totalBytes > 0 ? `${Math.round(p.percentage)}%` : `${p.transferred} bytes`;
    process.stdout.write(`download progress: ${status}\r`);
  });

  const output = fs.createWriteStream(filename);

  // pipe() does not forward errors downstream. Destroy the output stream with
  // the upstream error so the awaited promise rejects instead of hanging.
  resp.body.on('error', err => output.destroy(err));
  progress.on('error', err => output.destroy(err));

  const stream = resp.body.pipe(progress).pipe(output);
  await streamCompletion(stream);
};
/**
 * Entry point: submit the search in the browser, then download the results
 * using the export link and session cookie it returns.
 *
 * @returns {Promise<void>} Resolves when the download has completed.
 */
const main = async () => {
  // Renamed locally so it does not shadow the module-level search `url`.
  const { url: exportUrl, sessionCookie } = await submitSearch();
  await downloadResults(exportUrl, sessionCookie);
};

main()
  .then(() => {
    console.log('\nComplete');
  })
  .catch(err => {
    // Report the failure and signal it through the exit code rather than
    // leaving an unhandled promise rejection.
    console.error(err);
    process.exitCode = 1;
  });
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment