@rolfen · Last active March 29, 2024
JavaScript console (in-browser) crawler
let crawlDom, getCats, rowify, save, crawlUrl, crawlUrls; // functions
let urlBase, pages; // input data
// config
let waitInterval = 500; // delay between page fetches, in milliseconds

// regex capturing the productcategories array embedded in a script tag
let catRe = /\{\s*productcategories\s*:\s*(\[\s*\{[\s\S]*\}\s*\])\s*\}/m;

// CSS selectors for the fields to extract from each page
let selectors = {
    'name'  : '#showroomTopContentDiv .showroom-header h1',
    'desc'  : '#scroll-description .showroom-about',
    'booth' : '#scroll-boothlinks #newfloorplanlink strong',
};
// collects crawl results, counters, and log messages for one run
class Output {
    data = [];
    counters = {};
    timers = [];
    theLog = [];
    stop = () => {
        // cancel any fetches still queued (timers hold setTimeout handles)
        this.timers.forEach(t => clearTimeout(t));
    }
    increment = (counter) => {
        (this.counters[counter] === undefined) && (this.counters[counter] = 0);
        this.counters[counter]++;
    }
    log = (msg) => {
        this.theLog.push(msg);
    }
    append = (obj) => this.data.push(obj);
    get = () => this.data;
}
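// Sketch of how Output accumulates state during a run (values illustrative):
// let o = new Output();
// o.increment('pages_found');  // o.counters -> { pages_found: 1 }
// o.append({ url: '...' });    // o.get() -> [{ url: '...' }]
// o.stop();                    // cancels any fetches still queued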
// runs each selector against a parsed document; returns [key, matchedTexts[]] pairs
crawlDom = (selectors, dom) => Object.keys(selectors).map(k =>
    [k, [...dom.querySelectorAll(selectors[k])].map(e => e.textContent.trim())]
);
getCats = (doc, re) => {
    // extracts the category data embedded in one of the page's script tags
    let cat;
    let scriptTags = [...doc.querySelectorAll('script')];
    let matches = scriptTags.map(e => e.innerText).map(s => s.match(re)).filter(m => m !== null);
    if (matches.length > 0 && matches[0].length > 1) {
        // the capture group holds a JS array literal, not strict JSON, hence eval
        eval("cat=" + matches[0][1]);
    } else {
        cat = [null];
    }
    return cat;
}
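// Assumed shape of the captured array, inferred from how rowify (below) reads
// it; not verified against the live site:
// [ { category: 'Software', subcategory: 'Analytics' }, ... ]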
rowify = (obj) => {
    // explodes parsed data into one row per category entry,
    // also flattening the array values gathered by crawlDom
    return obj.categories.map((cat) => ({
        url: obj.url,
        category: cat?.category,
        subcategory: cat?.subcategory,
        name: obj.name[0],
        desc: obj.desc[0],
        booth: obj.booth.join(', ')
    }));
}
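// Example (hypothetical input): a page with two category entries becomes two rows:
// rowify({ url: '/ex?id=1', name: ['Acme'], desc: ['...'], booth: ['1234'],
//          categories: [{category: 'A', subcategory: 'B'}, {category: 'A', subcategory: 'C'}] })
// -> [ { url: '/ex?id=1', category: 'A', subcategory: 'B', name: 'Acme', desc: '...', booth: '1234' },
//      { url: '/ex?id=1', category: 'A', subcategory: 'C', name: 'Acme', desc: '...', booth: '1234' } ]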
save = (obj, filename = 'data.json') => {
    // serializes obj and triggers a browser download
    let data = JSON.stringify(obj, undefined, 4);
    let blob = new Blob([data], { type: 'text/json' });
    let a = document.createElement('a');
    a.download = filename;
    a.href = window.URL.createObjectURL(blob);
    a.dataset.downloadurl = ['text/json', a.download, a.href].join(':');
    // click() replaces the deprecated createEvent/initMouseEvent dance
    a.click();
}
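// e.g. after a crawl: save(o.get(), 'exhibitors.json')  // filename illustrative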
crawlUrl = async (url) => {
    // fetches one page, parses it, and returns the extracted fields;
    // note: 'no-cors' only yields a readable body for same-origin requests,
    // so run this from a console open on the target site itself;
    // errors propagate to crawlUrls, which counts and logs them
    let res = await fetch(url, {mode: 'no-cors'});
    let txt = await res.text();
    let parser = new DOMParser();
    let dom = parser.parseFromString(txt, "text/html");
    let gatheredData = Object.fromEntries(crawlDom(selectors, dom));
    gatheredData.url = url;
    gatheredData.categories = getCats(dom, catRe);
    return gatheredData;
}
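// Quick single-page smoke test (the path below is hypothetical):
// let d = await crawlUrl(urlBase + '/ex/pages/?id=3424');
// console.table(rowify(d));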
crawlUrls = (pages) => {
    // schedules one fetch per unique URL, spaced waitInterval ms apart
    let o = new Output();
    [...new Set(pages)].forEach((url, index) => {
        o.increment('pages_found');
        o.timers.push(setTimeout(async () => {
            try {
                let d = await crawlUrl(url);
                rowify(d).forEach(r => o.append(r));
                o.increment('pages_processed');
            } catch (e) {
                o.increment('errors');
                o.log(e);
            }
        }, index * waitInterval));
    });
    return o;
}
/* getting pages:
   go to https://himss24.mapyourshow.com/8_0/explore/exhibitor-gallery.cfm?featured=false,
   switch to list mode, and keep clicking "load more" at the bottom until the list is exhausted;
   then, in the console:

   pages = Array.from(document.querySelectorAll('section#exhibitor-results table.results-table > tbody > tr.js-List > td:nth-child(2) a[href]')).map(e => e.getAttribute('href'));
   // then deduplicate
   pages = [...new Set(pages)];
*/
// running the crawler:
// define urlBase, then define pages as an array of URLs relative to urlBase
// (e.g. pages = ['/ex/pages/?id=3424', '/ex/pages/?id=5523'], etc.)
// let o = crawlUrls(pages.map(url => urlBase + url));
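// While the crawl runs, progress can be checked from the console
// (counter values below are illustrative):
// o.counters   // e.g. { pages_found: 120, pages_processed: 37, errors: 1 }
// o.theLog     // errors captured so far
// once pages_processed + errors reaches pages_found:
// save(o.get(), 'data.json');
// o.stop();    // or abort early, cancelling pending fetches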