Skip to content

Instantly share code, notes, and snippets.

@alexjyong
Last active April 13, 2021 13:11
Show Gist options
  • Save alexjyong/726fd0af5b6ec35e218ab1151a84277a to your computer and use it in GitHub Desktop.
Save alexjyong/726fd0af5b6ec35e218ab1151a84277a to your computer and use it in GitHub Desktop.
'use strict';
const puppeteer = require('puppeteer');
const createCsvWriter = require('csv-writer').createObjectCsvWriter;
/**
 * Scrapes the Indiana food pantry listing on foodpantries.org and writes one
 * row per pantry (name, telephone, street address, Facebook, website) to
 * `out.csv`, using `;` as the field delimiter.
 *
 * Pantries whose pages cannot be loaded or parsed are logged and skipped, so
 * a single bad page does not abort the whole export.
 *
 * @returns {Promise<void>} Never resolves in practice: the process exits
 *   with code 0 once the CSV has been written and the browser is closed.
 */
async function main() {
  const entries = []; // one record per successfully scraped pantry

  // headless: true runs the browser without a visible window; switch it to
  // false to watch the navigation while debugging.
  const browser = await puppeteer.launch({
    headless: true,
  });

  try {
    const pages = await browser.pages();
    const page = pages[0];

    // Forward console output from the page to the Node console.
    page.on('console', (consoleObj) => console.log(consoleObj.text()));

    await page.goto('https://www.foodpantries.org/st/indiana');
    const foodPantryLinks = await page.evaluate(() =>
      Array.from(document.querySelectorAll('td > a'), (el) => el.href)
    );

    for (const link of foodPantryLinks) {
      // Navigation happens inside the try (via scrapePantry) so that a page
      // that fails to load is skipped instead of ending the whole run.
      try {
        entries.push(await scrapePantry(page, link));
      } catch (err) {
        console.log(err);
        console.log(`Failed to get records for ${link}`);
      }
    }

    console.log(entries);

    const csvWriter = createCsvWriter({
      path: 'out.csv',
      fieldDelimiter: ';',
      header: [
        { id: 'name', title: 'Name' },
        { id: 'telephone', title: 'Telephone' },
        { id: 'streetAddress', title: 'Street Address' },
        { id: 'facebook', title: 'Facebook' },
        { id: 'website', title: 'Website' },
      ],
    });
    await csvWriter.writeRecords(entries);
    console.log('The CSV file was written successfully');
  } finally {
    // Always release the browser process, even when scraping or writing fails.
    await browser.close();
  }

  console.log('Export complete');
  // process.exit() takes an integer exit code; passing a message string is
  // deprecated (DEP0164) and rejected by newer Node.js releases.
  process.exit(0);
}

/**
 * Collects the record for a single pantry by visiting its listing page and
 * then the detail page linked from it.
 *
 * @param {object} page - Puppeteer page used for navigation; it is left on
 *   the pantry's detail page afterwards.
 * @param {string} link - URL of the pantry's listing page.
 * @returns {Promise<{streetAddress: *, name: *, telephone: *, facebook: *, website: *}>}
 *   The pantry record; `facebook` and `website` are undefined when the
 *   detail page has no such link.
 * @throws {Error} If navigation fails or the expected page structure
 *   (JSON-LD block, detail link, address) is missing.
 */
async function scrapePantry(page, link) {
  await page.goto(link);

  // NOTE: this callback runs inside the browser page, so it cannot reference
  // any variable from the Node scope.
  const listing = await page.evaluate(() => {
    // The pantry data is read from the fourth JSON-LD script on the page
    // (positional, so it depends on the site's current markup).
    const ldScript = document.querySelectorAll(
      'script[type*="application/ld+json"]'
    )[3];
    if (!ldScript) {
      throw new Error('Expected JSON-LD script block was not found');
    }
    // Strip raw newlines so they cannot break JSON parsing.
    const jsonString = ldScript.innerText.replace(/(\r\n|\n|\r)/gm, '');
    const blob = JSON.parse(jsonString);

    const matches = document.evaluate(
      "//a[contains(., 'View Website and Full Address')]",
      document,
      null,
      XPathResult.ANY_TYPE,
      null
    );
    const detailAnchor = matches.iterateNext();
    if (!detailAnchor) {
      throw new Error("'View Website and Full Address' link was not found");
    }
    blob.innerPage = detailAnchor.href;
    return blob;
  });

  await page.goto(listing.innerPage);

  const social = await page.evaluate(() => {
    const found = {};
    // Some pantries list neither a website nor a Facebook page; the loop then
    // simply leaves `found` empty.
    const anchors = document.querySelectorAll(
      'div[class*="widget widget_tags"] > li > a'
    );
    for (const anchor of anchors) {
      if (anchor.innerText === 'Website') {
        found.website = anchor.href;
      }
      if (anchor.innerText === 'Facebook') {
        found.facebook = anchor.href;
      }
    }
    return found;
  });

  return {
    streetAddress: listing.address.streetAddress,
    name: listing.name,
    telephone: listing.telephone,
    facebook: social.facebook,
    website: social.website,
  };
}
// Run the export. Without a rejection handler the promise would be left
// floating; log any failure and mark the process as failed.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment