- Navigate to https://www.sia.ch/de/mitgliedschaft/verzeichnis/firmenmitglieder/
- Open the browser's developer console and paste the spider code below into it
- Press Enter
The page shows the scraping progress; when it finishes, the result is downloaded as `firmenmitglieder.json` and is also printed to the console as JSON.
/**
 * Browser-console spider for the SIA company-member directory.
 *
 * Must be run from the console of a page on https://www.sia.ch so that the
 * relative detail links and same-origin `fetch` calls resolve correctly.
 *
 * Steps:
 *   1. Inject jQuery 3.4.1 into the page.
 *   2. Replace the page body with a progress bar and a status line.
 *   3. Fetch every list page in parallel and collect the detail-page links.
 *   4. Fetch the detail pages one at a time and extract address + fields.
 *   5. Log the result as JSON and download it as `firmenmitglieder.json`.
 */
(async () => {
  // Number of list pages to request (pointer values 0 .. totalPages - 1).
  // Hard-coded: adjust when the directory grows or shrinks.
  const totalPages = 53;

  /**
   * Loads jQuery 3.4.1 from code.jquery.com into the current document.
   *
   * @returns {Promise<Function>} Resolves with the jQuery function once the
   *   script has loaded; rejects if the script cannot be loaded.
   */
  const injectJquery = () =>
    new Promise((resolve, reject) => {
      const script = document.createElement("script");
      script.src = "https://code.jquery.com/jquery-3.4.1.min.js";
      // Use the load/error events instead of polling a bare `$`, which throws
      // a ReferenceError while undefined and never settles on a failed load.
      script.onload = () => {
        if (window.jQuery && window.jQuery.fn) {
          resolve(window.jQuery);
        } else {
          reject(new Error("jQuery script loaded but window.jQuery is missing"));
        }
      };
      script.onerror = () => reject(new Error(`Failed to load ${script.src}`));
      document.head.appendChild(script);
    });

  /**
   * Triggers a browser download of `text` as a JSON file.
   *
   * @param {string} filename - Name suggested to the browser for the file.
   * @param {string} text - File contents.
   */
  const downloadJson = (filename, text) => {
    const link = document.createElement("a");
    link.setAttribute(
      "href",
      "data:application/json;charset=utf-8," + encodeURIComponent(text)
    );
    link.setAttribute("download", filename);
    link.style.display = "none";
    // The anchor has to be attached to the document for click() to start the
    // download; it is removed again right afterwards.
    document.body.appendChild(link);
    link.click();
    document.body.removeChild(link);
  };

  /**
   * Builds the URL of one page of the directory search result.
   *
   * @param {number} id - Zero-based page pointer.
   * @returns {string} Absolute list-page URL.
   */
  const getListUrl = (id) =>
    `https://www.sia.ch/de/mitgliedschaft/verzeichnis/firmenmitglieder/nc/1/?tx_updsiafeuseradmin_pi1%5BdisplaySearchResult%5D=1&tx_updsiafeuseradmin_pi1%5Bpointer%5D=${id}`;

  /**
   * Fetches a URL and returns its body as text.
   *
   * @param {string} url - URL to request.
   * @returns {Promise<string>} Response body.
   * @throws {Error} When the server answers with a non-2xx status.
   */
  const fetchText = async (url) => {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`Request failed with HTTP ${response.status}: ${url}`);
    }
    return response.text();
  };

  const $ = await injectJquery();

  /**
   * Extracts the address block and the list of fields from a detail page.
   *
   * @param {string} html - Raw HTML of one detail page.
   * @returns {{address: string, fields: string}} `address` is the inner HTML
   *   of the first `td.col1` cell with `<br>` turned into newlines (empty
   *   string if the cell is missing); `fields` is the text of all matching
   *   `<li>` items joined by newlines.
   */
  const parseDetailPage = (html) => {
    const page = $($.parseHTML(html));
    const addressHtml = page.find("table > tbody > tr > td.col1").html();
    // .html() returns undefined when nothing matched; do not let one odd page
    // abort the whole run.
    const address =
      typeof addressHtml === "string"
        ? addressHtml.replace(/<br>/gi, "\n").trim()
        : "";
    const fields = page
      .find("table > tbody > tr > td > ul > li")
      .get()
      .map((li) => li.innerText)
      .join("\n");
    return { address, fields };
  };

  // Replace the page content with a minimal progress UI.
  const output = $("<pre></pre>").css({
    position: "absolute",
    width: "100%",
    height: "100%",
  });
  const progress = $("<div><div class='progress-bar'></div></div>").css({
    borderRadius: 3,
    border: "1px solid #2565AE",
  });
  $(document.body).css({ padding: 20 }).empty().append(progress).append(output);
  const bar = progress.find(".progress-bar").css({
    height: "20px",
    background: "#2565AE",
    width: "0%",
    transition: "width 100ms",
  });

  try {
    // Step 1: fetch all list pages in parallel.
    const listUrls = Array.from({ length: totalPages }, (_, i) => getListUrl(i));
    output.text(`Getting ${listUrls.length} list pages...`);
    const listPages = await Promise.all(listUrls.map((url) => fetchText(url)));

    // Step 2: collect the detail-page link of every row.
    const detailUrls = [];
    for (const html of listPages) {
      $($.parseHTML(html))
        .find(".table-list td:first-child a")
        .each((_, anchor) => {
          const href = $(anchor).attr("href");
          if (href) {
            detailUrls.push(href);
          }
        });
    }

    // Step 3: fetch the detail pages strictly one after another.
    output.text(`Fetching ${detailUrls.length} pages`);
    const data = [];
    for (let i = 0; i < detailUrls.length; i += 1) {
      output.text(`Fetching detail page ${i + 1} / ${detailUrls.length}`);
      bar.css({ width: `${(i / detailUrls.length) * 100}%` });
      data.push(parseDetailPage(await fetchText(detailUrls[i])));
    }
    bar.css({ width: "100%" });

    // Step 4: emit the result.
    const text = JSON.stringify(data, null, 2);
    console.log(text);
    output.text("done");
    downloadJson("firmenmitglieder.json", text);
  } catch (err) {
    output.text(`Spider failed: ${err.message}`);
    throw err;
  }
})().catch((err) => {
  // Surface failures explicitly instead of leaving a floating rejection.
  console.error("Spider failed:", err);
});