Skip to content

Instantly share code, notes, and snippets.

@thayton
Last active October 9, 2020 17:51
Show Gist options
  • Save thayton/3185339aa43b6bb49ddafc611102e90a to your computer and use it in GitHub Desktop.
Save thayton/3185339aa43b6bb49ddafc611102e90a to your computer and use it in GitHub Desktop.
Scraper for ASPX search form at https://myaccount.rid.org/Public/Search/Member.aspx
/*
* Companion code for article at http://toddhayton.com/2018/08/01/scraping-with-puppeteer/
*
* Setup:
* $ mkdir scraper/
* $ cd scraper/
* $ npm init -y
* $ npm install puppeteer --save
*
* Usage:
* $ node rid_scraper.js
*/
const puppeteer = require('puppeteer');
/* Address of the ASPX member search form that main() opens with page.goto() */
const url = 'https://myaccount.rid.org/Public/Search/Member.aspx';
/*
 * Block until `elem` is no longer contained in its owner document.
 *
 * page - object exposing waitForFunction() (a puppeteer page)
 * elem - element handle forwarded as the argument of the in-page predicate
 *
 * The predicate is re-evaluated with 'raf' polling until it returns true.
 * Resolves with no value.
 */
async function waitUntilStale(page, elem) {
    const isDetached = node => !node.ownerDocument.contains(node);
    const waitOptions = { polling: 'raf' };

    await page.waitForFunction(isDetached, waitOptions, elem);
}
/*
 * Collect the <option> elements matched by `selector` inside the page.
 *
 * page     - object exposing evaluate() (a puppeteer page)
 * selector - CSS selector that matches the option elements themselves
 *
 * Returns an array of { name, value } objects built from each option's
 * `text` and `value`. Options whose value is falsy (e.g. an empty
 * placeholder entry) are left out.
 */
async function getSelectOptions(page, selector) {
    return page.evaluate(optionSelector => {
        const collected = [];

        for (const opt of document.querySelectorAll(optionSelector)) {
            if (!opt.value) {
                continue;
            }
            collected.push({ name: opt.text, value: opt.value });
        }

        return collected;
    }, selector);
}
/*
 * Fetch the entries of the state drop-down on the search form.
 *
 * Returns whatever getSelectOptions() yields for the state list's
 * option selector.
 */
async function getStates(page) {
    const stateOptionSelector = 'select#FormContentPlaceHolder_Panel_stateDropDownList > option';
    const states = await getSelectOptions(page, stateOptionSelector);

    return states;
}
/*
 * Switch the results grid to 50 rows per page.
 *
 * The page-size <select> has a generated name, so it is located by searching
 * the page HTML for a name of the form
 * ctl00$FormContentPlaceHolder$Panel$resultsGrid$ctlNN$ctlNN.
 *
 * Returns without doing anything when no such name is present in the page.
 */
async function setMaxPageSize(page) {
    const html = await page.content();
    const pageSizeNameRe =
        /ctl00\$FormContentPlaceHolder\$Panel\$resultsGrid\$ctl\d+\$ctl\d+/;
    const match = pageSizeNameRe.exec(html);

    /* exec() yields null (not an empty array) when nothing matches */
    if (match === null) {
        return;
    }

    const pageSizeName = match[0];

    /* Grab the current table before changing the size so we can detect its replacement */
    const resultsTable = await page.$('#FormContentPlaceHolder_Panel_resultsGrid');

    await page.select(`select[name="${pageSizeName}"]`, '50');

    /*
     * Selecting the page size triggers an ajax request for the new table results.
     * We need to wait until that new table data gets loaded before trying to scrape.
     * So we wait until the old member table gets detached from the DOM as the signal
     * that the new table has been loaded
     */
    await waitUntilStale(page, resultsTable);
}
/*------------------------------------------------------------------------------
 * Look for link for pageno in pager. So if pageno was 6 we'd look for 'Page$6'
 * in href:
 *
 * <a href="javascript:__doPostBack('ctl00$FormContentPlaceHolder$Panel$resultsGrid','Page$6')">...</a>
 *
 * The match includes the surrounding single quotes of the postback argument so
 * that looking for page 2 cannot pick up a link to page 20, 21, ...
 *
 * After the next page link gets clicked and the new page is loaded the pager
 * will show the current page within a span (not as a link). So we wait until
 * pageno appears within a span to indicate that the next page has finished
 * loading.
 *
 * Returns true when there is no link for pageno (no more pages), false after
 * the link was clicked and the new page has become current.
 */
async function gotoNextPage(page, pageno) {
    const pagerCellXp = "//tr[@class='PagerStyle']/td/table/tbody/tr/td";
    /* XPath 1.0 has no escaping, so the literal holding single quotes is double-quoted */
    const nextPageXp = `${pagerCellXp}/a[contains(@href,"'Page$${pageno}'")]`;
    const currPageXp = `${pagerCellXp}/span[text()='${pageno}']`;

    const [nextPageLink] = await page.$x(nextPageXp);

    if (nextPageLink === undefined) {
        return true;
    }

    console.log(`Going to page ${pageno}`);
    await nextPageLink.click();
    await page.waitForXPath(currPageXp);

    return false;
}
/*------------------------------------------------------------------------------
 * Go back to the first page of results in order to reset the pager. Once the
 * first page link is clicked and becomes the current page the page 1 link will
 * appear inside of <span>1</span>. So we can determine once page 1 has finished
 * loading by waiting until page 1 appears inside of this span.
 *
 * Note that there might not be a page 1 link because there was only one page of
 * results. In that case the page will still show up as <span>1</span> element.
 *
 * The link is matched on the quoted postback argument 'Page$1' so that links
 * to pages 10-19 (whose href also contains the text Page$1) are never clicked
 * by mistake.
 *
 * NOTE(review): if the pager has moved past its first window of page links
 * there may be neither a page 1 link nor a <span>1</span>; the wait below would
 * then run until it times out - confirm against the site's pager layout.
 */
async function gotoFirstPage(page) {
    const pagerCellXp = "//tr[@class='PagerStyle']/td/table/tbody/tr/td";
    /* XPath 1.0 has no escaping, so the literal holding single quotes is double-quoted */
    const firstPageLinkXp = `${pagerCellXp}/a[contains(@href,"'Page$1'")]`;
    const firstPageCurrXp = `${pagerCellXp}/span[text()='1']`;

    const [firstPageLink] = await page.$x(firstPageLinkXp);

    if (firstPageLink !== undefined) {
        await firstPageLink.click();
    }

    await page.waitForXPath(firstPageCurrXp);
}
/*
 * Scrape the rows currently shown in the page's table(s).
 *
 * Inside the page, the text of every `table th` is used as the list of column
 * names and every `table tr.RowStyle` is turned into an object that maps each
 * column name to the text of the <td> at the same position.
 *
 * Returns the array of row objects produced by page.evaluate().
 */
async function scrapeMemberTable(page) {
    const records = await page.evaluate(() => {
        const columnNames = Array.from(
            document.querySelectorAll('table th'),
            th => th.innerText
        );
        const rows = Array.from(document.querySelectorAll('table tr.RowStyle'));

        console.log(`${rows.length} rows in member table!`);

        return rows.map(row => {
            const cells = Array.from(row.querySelectorAll('td'), td => td.innerText);
            const record = {};

            columnNames.forEach((columnName, idx) => {
                record[columnName] = cells[idx];
            });

            return record;
        });
    });

    console.log(`Got ${records.length} records`);

    return records;
}
/*
 * Scrape the current results page and every following page.
 *
 * Starting from the page that is currently shown, the table is scraped and
 * gotoNextPage() is asked for page 2, 3, ... until it reports that there are
 * no more pages. Afterwards the pager is sent back to page 1.
 *
 * Returns the rows of all pages in the order they were scraped.
 */
async function scrapeAllPages(page) {
    let collected = [];
    let reachedEnd = false;

    for (let nextPageNo = 2; !reachedEnd; nextPageNo += 1) {
        console.log(`Scraping page ${nextPageNo - 1}`);

        const rows = await scrapeMemberTable(page);
        collected = collected.concat(rows);

        reachedEnd = await gotoNextPage(page, nextPageNo);
    }

    /*
     * The pager won't reset back to page 1 on its own so we have to explicitly
     * click on the page 1 link
     */
    await gotoFirstPage(page);

    return collected;
}
/*
 * Entry point: open the search form, run one search per state and print the
 * scraped records as JSON.
 *
 * The browser is closed in a finally block so that a failure in any step
 * (navigation, a selector wait timing out, ...) does not leave a browser
 * process running. The error itself still propagates to the caller.
 */
async function main() {
    /* Only grab the first three states for demo purposes */
    const maxDemoStates = 3;

    const browser = await puppeteer.launch({ slowMo: 250 });

    try {
        const page = await browser.newPage();

        /* Forward console output of the page (e.g. from page.evaluate) to our console */
        page.on('console', msg => console.log('PAGE LOG:', msg.text()));

        await page.goto(url);

        const states = await getStates(page);

        for (const [ i, state ] of states.entries()) {
            console.log(`[${i+1}/${states.length}] Scraping data for ${state.name}`);

            await page.select('#FormContentPlaceHolder_Panel_stateDropDownList', state.value);
            await page.select('#FormContentPlaceHolder_Panel_freelanceDropDownList', '1');

            /*
             * The first time we run a search we can wait for the table to appear to determine
             * once the search has loaded the results. However, with subsequent searches the
             * table already exists and what we need to determine is when the table contents have
             * been updated. To do that we fetch a reference to the table here and then wait for
             * it to become stale (detached) as an indication that the new table data has loaded.
             */
            const resultsTable = await page.$('table#FormContentPlaceHolder_Panel_resultsGrid');

            await page.click('#FormContentPlaceHolder_Panel_searchButtonStrip_searchButton');

            if (resultsTable) {
                await waitUntilStale(page, resultsTable);
            } else {
                await page.waitForSelector('#FormContentPlaceHolder_Panel_resultsGrid');
            }

            /*
             * The page size is retained after the first time it's set, so we only
             * need to call this once
             */
            if (i === 0) {
                await setMaxPageSize(page);
            }

            const data = await scrapeAllPages(page);

            console.log(`Got ${data.length} records for state ${state.name}`);
            console.log(JSON.stringify(data, null, 2));

            if (i + 1 >= maxDemoStates) {
                break;
            }
        }

        await page.close();
    } finally {
        await browser.close();
    }
}
/* Run the scraper; report any failure and exit non-zero instead of leaving the rejection unhandled */
main().catch(err => {
    console.error(err);
    process.exitCode = 1;
});
@zahardev
Copy link

zahardev commented Feb 2, 2019

Thanks for the great article and examples 👍

@dominikhm
Copy link

Such a great article and tutorial, thanks! Had trouble wrapping my head about this topic and you really helped!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment