Skip to content

Instantly share code, notes, and snippets.

@martian17
Last active October 6, 2022 13:05
Show Gist options
  • Save martian17/0bbd3117d7df9ece8a1b8d18ff305d04 to your computer and use it in GitHub Desktop.
Save martian17/0bbd3117d7df9ece8a1b8d18ff305d04 to your computer and use it in GitHub Desktop.
scrape contacts
/*
init commands to run
$ npm init
$ npm install jsdom
$ npm install node-fetch
*/
const {JSDOM} = require("jsdom");
// Lazy bridge to node-fetch, which is loaded with dynamic import() instead of require()
// (presumably because the installed version is ESM-only - confirm against package.json).
// The import is kicked off once, immediately; every call awaits that same promise and then
// forwards all of its arguments to the module's default export.
const fetch = (()=>{
    const pendingModule = import("node-fetch");
    return async (...args)=>{
        const loaded = await pendingModule;
        return await loaded.default(...args);
    };
})();
/**
 * Splits the child nodes of one contact card into trimmed text lines.
 *
 * A BR node ends the current line. A DIV node is appended to the current line
 * and then ends it. Any other node is appended to the current line. Lines that
 * collected no nodes at all are dropped; the remaining lines are the trimmed
 * text of their nodes joined with single spaces.
 *
 * @param {{childNodes: Iterable<{nodeName: string, textContent: string}>}} card
 * @returns {string[]} the card's text lines, in document order
 */
function splitCardLines(card){
    const lines = [[]];
    for(const node of card.childNodes){
        if(node.nodeName === "BR"){
            lines.push([]);
            continue;
        }
        lines[lines.length-1].push(node);
        if(node.nodeName === "DIV"){
            lines.push([]);
        }
    }
    return lines
        .filter(nodes=>nodes.length>0)
        .map(nodes=>nodes.map(node=>node.textContent.trim()).join(" ").trim());
}

/**
 * Turns the text lines of one card into a contact record.
 *
 * The first four lines are taken positionally as general/company/name/title.
 * Of the remaining lines, everything before the first "Email:" or "Phone:"
 * line is the address; from that line on, "Email:"/"Phone:" lines set the
 * matching field and anything else is reported as unexpected.
 *
 * @param {string[]} lines
 * @returns {{general?: string, company?: string, name?: string, title?: string,
 *            email?: string, phone?: string, address: string}}
 */
function parseCardLines(lines){
    const [general, company, name, title, ...rest] = lines;
    const info = {general, company, name, title};
    const address = [];
    let inContactFields = false;
    for(const line of rest){
        const isEmail = line.startsWith("Email:");
        const isPhone = !isEmail && line.startsWith("Phone:");
        if(isEmail || isPhone){
            inContactFields = true;
            // Keep everything after the FIRST colon: the value itself may
            // contain colons (e.g. "ext: 12"), which split(":")[1] would cut off.
            const value = line.slice(line.indexOf(":")+1).trim();
            if(isEmail){
                info.email = value;
            }else{
                info.phone = value;
            }
        }else if(inContactFields){
            console.log("unexpected line:",line);
        }else{
            address.push(line);
        }
    }
    info.address = address.join("\n");
    return info;
}

/**
 * Downloads a directory page and extracts one contact record per ".CompanyBox"
 * element found inside the element with id "ctlMailList_lstList".
 *
 * @param {string} url - address of the page to scrape
 * @returns {Promise<Array<object>>} the parsed contact records, in page order
 * @throws {Error} if the response status is not OK, or if the contact list
 *                 container is missing from the page
 */
const scrapeContacts = async function(url){
    const response = await fetch(url);
    if(!response.ok){
        // Fail here instead of parsing an error page as if it were the directory.
        throw new Error(`Failed to fetch ${url}: HTTP ${response.status}`);
    }
    const dom = new JSDOM(await response.text());
    const contacts = dom.window.document.getElementById("ctlMailList_lstList");
    if(contacts === null){
        throw new Error(`Contact list element #ctlMailList_lstList not found at ${url}`);
    }
    return Array.from(
        contacts.querySelectorAll(".CompanyBox"),
        card=>parseCardLines(splitCardLines(card))
    );
};
/**
 * Script entry point: scrapes the hard-coded directory page and prints the
 * resulting contact records with console.log.
 */
const main = async function(){
    console.log(await scrapeContacts("https://www.puc.texas.gov/industry/electric/directories/brk/report_brk.aspx?ID=BRSQL01DB1245872600005"));
};
// Do not leave the promise floating: report any failure on stderr and make the
// process exit with a non-zero status so callers can detect the error.
main().catch(err=>{
    console.error(err);
    process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment