// Scraper for the ASPX search form at https://myaccount.rid.org/Public/Search/Member.aspx
/*
 * Companion code for the article at http://toddhayton.com/2018/08/01/scraping-with-puppeteer/
 *
 * Setup:
 *   $ mkdir scraper/
 *   $ cd scraper/
 *   $ npm init -y
 *   $ npm install puppeteer --save
 *
 * Usage:
 *   $ node rid_scraper.js
 */
const puppeteer = require('puppeteer'); | |
/* URL of the ASPX member search form that main() navigates to. */
const url = 'https://myaccount.rid.org/Public/Search/Member.aspx';
/*
 * Resolve once `elem` is no longer contained in its owner document, i.e. it
 * has been detached from the DOM.
 *
 * page: object whose waitForFunction() polls the predicate.
 * elem: element handle forwarded to the predicate as its argument.
 */
async function waitUntilStale(page, elem) {
  // The predicate only touches its argument, so it can be shipped to the page.
  const isDetached = (node) => !node.ownerDocument.contains(node);
  const pollingOptions = { polling: 'raf' };
  await page.waitForFunction(isDetached, pollingOptions, elem);
}
/*
 * Evaluate `selector` inside the page and return one { name, value } object
 * per matched element, taken from its `text` and `value` properties.
 * Elements with a falsy `value` are left out.
 */
async function getSelectOptions(page, selector) {
  // Runs in the page context; must not reference anything from this scope.
  const collectOptions = (optionSelector) => {
    const found = [];
    for (const option of Array.from(document.querySelectorAll(optionSelector))) {
      if (option.value) {
        found.push({ name: option.text, value: option.value });
      }
    }
    return found;
  };
  return page.evaluate(collectOptions, selector);
}
/* Return the { name, value } entries of the state drop-down list. */
async function getStates(page) {
  const stateOptionSelector = 'select#FormContentPlaceHolder_Panel_stateDropDownList > option';
  return getSelectOptions(page, stateOptionSelector);
}
/*
 * Set the results grid's page-size drop-down to '50'.
 *
 * The drop-down's name is not fixed, so it is discovered by searching the page
 * HTML for a control name of the form
 * ctl00$FormContentPlaceHolder$Panel$resultsGrid$ctlNN$ctlNN.
 * If no such name is present the function returns without doing anything.
 */
async function setMaxPageSize(page) {
  const html = await page.content();
  const pageSizeNameRe = /ctl00\$FormContentPlaceHolder\$Panel\$resultsGrid\$ctl\d+\$ctl\d+/;
  const match = pageSizeNameRe.exec(html);

  // exec() yields null (not an empty array) when nothing matches.
  if (match === null) {
    return;
  }

  const [pageSizeName] = match;

  // Grab the current table before changing the page size so we can tell when
  // it has been replaced.
  const resultsTable = await page.$('#FormContentPlaceHolder_Panel_resultsGrid');
  await page.select(`select[name="${pageSizeName}"]`, '50');

  /*
   * Selecting the page size triggers an ajax request for the new table results.
   * We need to wait until that new table data gets loaded before trying to scrape.
   * So we wait until the old member table gets detached from the DOM as the signal
   * that the new table has been loaded.
   */
  await waitUntilStale(page, resultsTable);
}
/*------------------------------------------------------------------------------
 * Look for the link for pageno in the pager. So if pageno was 6 we'd look for
 * 'Page$6' in the href:
 *
 * <a href="javascript:__doPostBack('ctl00$FormContentPlaceHolder$Panel$resultsGrid','Page$6')">...</a>
 *
 * The match includes the surrounding single quotes of the postback argument so
 * that looking for page 2 cannot pick up a link to page 20, 21, ...
 *
 * After the next page link gets clicked and the new page is loaded the pager
 * will show the current page within a span (not as a link). So we wait until
 * pageno appears within a span to indicate that the next page has finished
 * loading.
 *
 * Returns true when there is no link for pageno (no more pages), false after
 * the page has been navigated to.
 */
async function gotoNextPage(page, pageno) {
  const pagerCellXp = "//tr[@class='PagerStyle']/td/table/tbody/tr/td";
  // The XPath literal is double-quoted so it can contain the single quotes.
  const nextPageXp = `${pagerCellXp}/a[contains(@href,"'Page$${pageno}'")]`;
  const currPageXp = `${pagerCellXp}/span[text()='${pageno}']`;

  const [nextPageLink] = await page.$x(nextPageXp);
  if (nextPageLink === undefined) {
    return true;
  }

  console.log(`Going to page ${pageno}`);
  await nextPageLink.click();
  await page.waitForXPath(currPageXp);
  return false;
}
/*------------------------------------------------------------------------------
 * Go back to the first page of results in order to reset the pager. Once the
 * first page link is clicked and becomes the current page, page 1 will appear
 * inside of <span>1</span>. So we can determine once page 1 has finished
 * loading by waiting until page 1 appears inside of this span.
 *
 * Note that there might not be a page 1 link because there was only one page of
 * results. In that case page 1 will still show up as a <span>1</span> element.
 *
 * The link is matched on the quoted postback argument 'Page$1' so that links to
 * pages 10-19 are never clicked by mistake.
 *
 * NOTE(review): if the pager is showing a window of pages that no longer
 * includes a page 1 link, nothing is clicked and the wait below would only end
 * by timing out -- confirm whether result sets can get that long.
 */
async function gotoFirstPage(page) {
  const pagerCellXp = "//tr[@class='PagerStyle']/td/table/tbody/tr/td";
  // The XPath literal is double-quoted so it can contain the single quotes.
  const firstPageLinkXp = `${pagerCellXp}/a[contains(@href,"'Page$1'")]`;
  const firstPageCurrXp = `${pagerCellXp}/span[text()='1']`;

  const [firstPageLink] = await page.$x(firstPageLinkXp);
  if (firstPageLink !== undefined) {
    await firstPageLink.click();
  }

  await page.waitForXPath(firstPageCurrXp);
}
/*
 * Read the rows currently shown on the page and return them as an array of
 * objects, one per `table tr.RowStyle` row, keyed by the text of the
 * `table th` header cells (header i maps to the row's i-th <td>).
 */
async function scrapeMemberTable(page) {
  // Runs in the page context; must not reference anything from this scope.
  const extractRows = () => {
    const headerCells = Array.from(document.querySelectorAll('table th'));
    const rowElems = Array.from(document.querySelectorAll('table tr.RowStyle'));
    const columnNames = headerCells.map((cell) => cell.innerText);

    console.log(`${rowElems.length} rows in member table!`);

    return rowElems.map((rowElem) => {
      const cellTexts = Array.from(rowElem.querySelectorAll('td'), (cell) => cell.innerText);
      const record = {};
      columnNames.forEach((columnName, idx) => {
        record[columnName] = cellTexts[idx];
      });
      return record;
    });
  };

  const data = await page.evaluate(extractRows);
  console.log(`Got ${data.length} records`);
  return data;
}
/*
 * Scrape the member table on the current page, then keep advancing through the
 * pager and scraping until gotoNextPage() reports that no further page exists.
 * Returns the records of all pages in the order they were scraped.
 */
async function scrapeAllPages(page) {
  const collected = [];
  let reachedEnd = false;

  // pageno is the page we try to move to next; the page being scraped is pageno - 1.
  for (let pageno = 2; !reachedEnd; pageno += 1) {
    console.log(`Scraping page ${pageno - 1}`);
    const rows = await scrapeMemberTable(page);
    for (const row of rows) {
      collected.push(row);
    }
    reachedEnd = await gotoNextPage(page, pageno);
  }

  /*
   * The pager won't reset back to page 1 on its own so we have to explicitly
   * click on the page 1 link
   */
  await gotoFirstPage(page);
  return collected;
}
/*
 * Launch a browser, open the search form and, for each state in the state
 * drop-down (stopping after the first three), run a search and print the
 * scraped records as JSON.
 *
 * The browser is closed in a finally block so it does not stay running when
 * any step throws; the error itself still propagates to the caller.
 */
async function main() {
  const browser = await puppeteer.launch({ slowMo: 250 });

  try {
    const page = await browser.newPage();

    // Forward console output from the page to this process' console.
    page.on('console', (msg) => console.log('PAGE LOG:', msg.text()));

    await page.goto(url);

    const states = await getStates(page);

    for (const [i, state] of states.entries()) {
      console.log(`[${i + 1}/${states.length}] Scraping data for ${state.name}`);

      await page.select('#FormContentPlaceHolder_Panel_stateDropDownList', state.value);
      await page.select('#FormContentPlaceHolder_Panel_freelanceDropDownList', '1');

      /*
       * The first time we run a search we can wait for the table to appear to determine
       * once the search has loaded the results. However, with subsequent searches the
       * table already exists and what we need to determine is when the table contents have
       * been updated. To do that we fetch a reference to the table here and then wait for
       * it to become stale (detached) as an indication that the new table data has loaded.
       */
      const resultsTable = await page.$('table#FormContentPlaceHolder_Panel_resultsGrid');

      await page.click('#FormContentPlaceHolder_Panel_searchButtonStrip_searchButton');

      if (resultsTable) {
        await waitUntilStale(page, resultsTable);
      } else {
        await page.waitForSelector('#FormContentPlaceHolder_Panel_resultsGrid');
      }

      /*
       * The page size is retained after the first time it's set, so we only
       * need to call this once
       */
      if (i === 0) {
        await setMaxPageSize(page);
      }

      const data = await scrapeAllPages(page);

      console.log(`Got ${data.length} records for state ${state.name}`);
      console.log(JSON.stringify(data, null, 2));

      /* Only grab the first three states for demo purposes */
      if (i >= 2) {
        break;
      }
    }

    await page.close();
  } finally {
    await browser.close();
  }
}
// Run the scraper; report a failure and exit non-zero instead of leaving the
// rejected promise unhandled.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
This comment has been minimized.
This comment has been minimized.
Such a great article and tutorial, thanks! I had trouble wrapping my head around this topic, and you really helped!
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This comment has been minimized.
Thanks for the great article and examples👍