Skip to content

Instantly share code, notes, and snippets.

@jaggli
Last active February 25, 2022 12:34
Show Gist options
  • Save jaggli/a1f0c7d86efe74ce6aceab7542acacb5 to your computer and use it in GitHub Desktop.
Save jaggli/a1f0c7d86efe74ce6aceab7542acacb5 to your computer and use it in GitHub Desktop.
SIA address spider
(async () => {
/**
 * Loads jQuery 3.4.1 from the jQuery CDN by appending a <script> element to
 * the document head.
 *
 * @returns {Promise<void>} Resolves once the script has loaded and the global
 *   `$` exposes `$.fn`. Rejects with an Error if the script cannot be loaded,
 *   or if it loads without defining `$.fn`.
 */
const injectJquery = () =>
  new Promise((resolve, reject) => {
    const src = "//code.jquery.com/jquery-3.4.1.min.js";
    const script = document.createElement("script");
    script.src = src;
    // Rely on the script's own load/error events instead of polling a bare `$`:
    // reading an undeclared `$` throws a ReferenceError on every poll, and a
    // blocked or failed download would leave the promise pending forever.
    script.onload = () => {
      if (window.$ && window.$.fn) {
        resolve();
        return;
      }
      reject(
        new Error(`jQuery script loaded from ${src} but $.fn is not defined`)
      );
    };
    script.onerror = () => {
      reject(new Error(`Failed to load jQuery from ${src}`));
    };
    document.head.appendChild(script);
  });
/**
 * Triggers a browser download of `text` as a JSON file.
 *
 * A hidden <a> element carrying a `data:` URL is attached to the document
 * body, clicked programmatically, and removed again.
 *
 * @param {string} filename - Value of the anchor's `download` attribute.
 * @param {string} text - File content; it is URI-encoded into the data URL.
 * @returns {void}
 */
const downloadJson = (filename, text) => {
  const anchor = document.createElement("a");
  const attributes = {
    href: `data:application/json;charset=utf-8,${encodeURIComponent(text)}`,
    download: filename,
  };
  Object.entries(attributes).forEach(([name, value]) => {
    anchor.setAttribute(name, value);
  });
  anchor.style.display = "none";

  // The anchor is attached to the body for the duration of the click only.
  const { body } = document;
  body.appendChild(anchor);
  anchor.click();
  body.removeChild(anchor);
};
// Wait until jQuery is available; the code below relies on the global `$`.
await injectJquery();
// Number of list pages to request; lower it (e.g. to 1) for a quick trial run.
const totalPages = 53;

/**
 * Builds the URL of one search-result page of the company-member directory.
 *
 * @param {number} id - Value placed in the `pointer` query parameter.
 * @returns {string} Absolute URL of that result page.
 */
const getListUrl = (id) =>
  `https://www.sia.ch/de/mitgliedschaft/verzeichnis/firmenmitglieder/nc/1/?tx_updsiafeuseradmin_pi1%5BdisplaySearchResult%5D=1&tx_updsiafeuseradmin_pi1%5Bpointer%5D=${id}`;

// One URL per list page, using pointer values 0 .. totalPages - 1.
const urls = Array.from({ length: totalPages }, (unused, pageIndex) =>
  getListUrl(pageIndex)
);
// Replace the page body with a progress bar and a status area.
// `output` shows status messages; `bar` is the inner element whose width
// reflects progress.
const progress = $("<div><div class='progress-bar'></div></div>").css({
  borderRadius: 3,
  border: "1px solid #2565AE",
});
const output = $("<pre></pre>").css({
  position: "absolute",
  width: "100%",
  height: "100%",
});
$(document.body).css({ padding: 20 }).empty().append(progress, output);

const bar = progress.find(".progress-bar").css({
  height: "20px",
  background: "#2565AE",
  width: "0%",
  // Width changes are animated over 100ms.
  transition: "width 100ms",
});

output.html(`Getting ${urls.length} list pages...`);
/**
 * Fetches `url` and resolves with the response body as text.
 *
 * @param {string} url - Address to request.
 * @returns {Promise<string>} The response body.
 * @throws {Error} If the server answers with a non-2xx status, so an error
 *   page is never parsed as if it were a result page.
 */
const fetchText = async (url) => {
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(
      `Request to ${url} failed with HTTP status ${response.status}`
    );
  }
  return response.text();
};

// Download all list pages in parallel.
const serps = await Promise.all(urls.map((url) => fetchText(url)));

// Collect the href of every link found in a first-child table cell inside
// `.table-list`. Anchors without an href are skipped so no request is made
// for "undefined".
const detailUrls = [];
for (const page of serps) {
  const anchors = $($.parseHTML(page))
    .find(".table-list td:first-child a")
    .get();
  for (const anchor of anchors) {
    const href = $(anchor).attr("href");
    if (href) {
      detailUrls.push(href);
    }
  }
}

// Wrap every detail URL in a function so the request only starts when the
// function is called. For a quick trial run, map over detailUrls.slice(-4).
const detailLinks = detailUrls.map((url) => () => fetchText(url));
/**
 * Calls promise-returning functions strictly one after another and collects
 * their resolved values in call order.
 *
 * Before each call the status text and the progress bar are updated via the
 * surrounding `output` and `bar` jQuery objects. The label is 1-based
 * ("page 1 / N"), and both label and bar are derived from `funcs` itself
 * rather than from an outer variable. The bar width reflects the share of
 * calls already finished.
 *
 * @param {Array<function(): Promise<*>>} funcs - Functions to call in order.
 * @returns {Promise<Array<*>>} Resolved values, one per function. Rejects with
 *   the first error; later functions are then not called.
 */
const serial = async (funcs) => {
  const results = [];
  for (const [i, func] of funcs.entries()) {
    output.html(`Fetching detail page ${i + 1} / ${funcs.length}`);
    bar.css({ width: `${(i / funcs.length) * 100}%` });
    results.push(await func());
  }
  return results;
};
// Run the spider: fetch all detail pages one after another.
output.html(`Fetching ${detailLinks.length} pages`);
const detailPages = await serial(detailLinks);
bar.css({ width: "100%" });

// Turn every detail page into an { address, fields } record.
const data = detailPages.map((html) => {
  const page = $($.parseHTML(html));
  // .html() yields undefined when no cell matches. Fall back to an empty
  // string so a single unexpected page does not abort the whole export.
  const addressHtml = page.find("table > tbody > tr > td.col1").html() || "";
  // Line breaks in the cell markup become newlines in the plain-text address.
  const address = addressHtml.trim().replace(/<br>/gi, "\n").trim();
  // One line per list item found in the page's table cells.
  const fields = page
    .find("table > tbody > tr > td > ul > li")
    .get()
    .map((li) => li.innerText)
    .join("\n");
  return { address, fields };
});

// Log the result and offer it as a file download.
const text = JSON.stringify(data, null, 2);
console.log(text);
output.html("done");
downloadJson("firmenmitglieder.json", text);
})();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment