@rolfen · Last active March 29, 2024
JavaScript console (in-browser) crawler
let crawlDom, getCats, rowify, save, crawlUrl, crawlUrls; // functions
let urlBase, pages; // input data
// config
let waitInterval = 500; // delay between page fetches, in milliseconds

// regex capturing the productcategories array embedded in a script tag
let catRe = /\{\s*productcategories\s*:\s*(\[\s*\{[\s\S]*\}\s*\])\s*\}/m;

// CSS selectors for the fields to extract from each page
let selectors = {
    'name'  : '#showroomTopContentDiv .showroom-header h1',
    'desc'  : '#scroll-description .showroom-about',
    'booth' : '#scroll-boothlinks #newfloorplanlink strong',
};
// collects crawl results, counters, and log messages for one run
class Output {
    data = [];
    counters = {};
    timers = [];
    theLog = [];
    stop = () => {
        // cancel any fetches still queued (timers hold setTimeout handles)
        this.timers.forEach(t => clearTimeout(t));
    }
    increment = (counter) => {
        (this.counters[counter] === undefined) && (this.counters[counter] = 0);
        this.counters[counter]++;
    }
    log = (msg) => {
        this.theLog.push(msg);
    }
    append = (obj) => this.data.push(obj);
    get = () => this.data;
}
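// Sketch of how Output accumulates state during a run (values illustrative):
// let o = new Output();
// o.increment('pages_found');  // o.counters -> { pages_found: 1 }
// o.append({ url: '...' });    // o.get() -> [{ url: '...' }]
// o.stop();                    // cancels any fetches still queued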
// runs each selector against a parsed document; returns [key, matchedTexts[]] pairs
crawlDom = (selectors, dom) => Object.keys(selectors).map(k =>
    [k, [...dom.querySelectorAll(selectors[k])].map(e => e.textContent.trim())]
);
getCats = (doc, re) => {
    // extracts the category data embedded in one of the page's script tags
    let cat;
    let scriptTags = [...doc.querySelectorAll('script')];
    let matches = scriptTags.map(e => e.innerText).map(s => s.match(re)).filter(m => m !== null);
    if (matches.length > 0 && matches[0].length > 1) {
        // the capture group holds a JS array literal, not strict JSON, hence eval
        eval("cat=" + matches[0][1]);
    } else {
        cat = [null];
    }
    return cat;
}
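// Assumed shape of the captured array, inferred from how rowify (below) reads
// it; not verified against the live site:
// [ { category: 'Software', subcategory: 'Analytics' }, ... ]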
rowify = (obj) => {
    // explodes parsed data into one row per category entry,
    // also flattening the array values gathered by crawlDom
    return obj.categories.map((cat) => ({
        url: obj.url,
        category: cat?.category,
        subcategory: cat?.subcategory,
        name: obj.name[0],
        desc: obj.desc[0],
        booth: obj.booth.join(', ')
    }));
}
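// Example (hypothetical input): a page with two category entries becomes two rows:
// rowify({ url: '/ex?id=1', name: ['Acme'], desc: ['...'], booth: ['1234'],
//          categories: [{category: 'A', subcategory: 'B'}, {category: 'A', subcategory: 'C'}] })
// -> [ { url: '/ex?id=1', category: 'A', subcategory: 'B', name: 'Acme', desc: '...', booth: '1234' },
//      { url: '/ex?id=1', category: 'A', subcategory: 'C', name: 'Acme', desc: '...', booth: '1234' } ]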
save = (obj, filename = 'data.json') => {
    // serializes obj and triggers a browser download
    let data = JSON.stringify(obj, undefined, 4);
    let blob = new Blob([data], { type: 'text/json' });
    let a = document.createElement('a');
    a.download = filename;
    a.href = window.URL.createObjectURL(blob);
    a.dataset.downloadurl = ['text/json', a.download, a.href].join(':');
    // click() replaces the deprecated createEvent/initMouseEvent dance
    a.click();
}
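// e.g. after a crawl: save(o.get(), 'exhibitors.json')  // filename illustrative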
crawlUrl = async (url) => {
    // fetches one page, parses it, and returns the extracted fields;
    // note: 'no-cors' only yields a readable body for same-origin requests,
    // so run this from a console open on the target site itself;
    // errors propagate to crawlUrls, which counts and logs them
    let res = await fetch(url, {mode: 'no-cors'});
    let txt = await res.text();
    let parser = new DOMParser();
    let dom = parser.parseFromString(txt, "text/html");
    let gatheredData = Object.fromEntries(crawlDom(selectors, dom));
    gatheredData.url = url;
    gatheredData.categories = getCats(dom, catRe);
    return gatheredData;
}
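// Quick single-page smoke test (the path below is hypothetical):
// let d = await crawlUrl(urlBase + '/ex/pages/?id=3424');
// console.table(rowify(d));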
crawlUrls = (pages) => {
    // schedules one fetch per unique URL, spaced waitInterval ms apart
    let o = new Output();
    [...new Set(pages)].forEach((url, index) => {
        o.increment('pages_found');
        o.timers.push(setTimeout(async () => {
            try {
                let d = await crawlUrl(url);
                rowify(d).forEach(r => o.append(r));
                o.increment('pages_processed');
            } catch (e) {
                o.increment('errors');
                o.log(e);
            }
        }, index * waitInterval));
    });
    return o;
}
/* getting pages:
   go to https://himss24.mapyourshow.com/8_0/explore/exhibitor-gallery.cfm?featured=false,
   switch to list mode, and keep clicking "load more" at the bottom until the list is exhausted;
   then, in the console:

   pages = Array.from(document.querySelectorAll('section#exhibitor-results table.results-table > tbody > tr.js-List > td:nth-child(2) a[href]')).map(e => e.getAttribute('href'));
   // then deduplicate
   pages = [...new Set(pages)];
*/
// running the crawler:
// define urlBase, then define pages as an array of URLs relative to urlBase
// (e.g. pages = ['/ex/pages/?id=3424', '/ex/pages/?id=5523'], etc.)
// let o = crawlUrls(pages.map(url => urlBase + url));
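// While the crawl runs, progress can be checked from the console
// (counter values below are illustrative):
// o.counters   // e.g. { pages_found: 120, pages_processed: 37, errors: 1 }
// o.theLog     // errors captured so far
// once pages_processed + errors reaches pages_found:
// save(o.get(), 'data.json');
// o.stop();    // or abort early, cancelling pending fetches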