Skip to content

Instantly share code, notes, and snippets.

@hejmsdz
Created August 9, 2022 18:38
Show Gist options
  • Save hejmsdz/e7545030075736a0964f85f61c6134de to your computer and use it in GitHub Desktop.
Save hejmsdz/e7545030075736a0964f85f61c6134de to your computer and use it in GitHub Desktop.
import fs from 'fs';
import fetch from 'node-fetch';
import { JSDOM } from 'jsdom';
import YAML from 'yaml'
/**
 * Downloads a page and parses its HTML into a DOM document.
 *
 * @param {string} url - Address of the page to download. It is also passed to
 *   JSDOM as the document URL, which is what relative links are resolved against.
 * @returns {Promise<Document>} The parsed document.
 * @throws {Error} When the server answers with a non-success HTTP status.
 */
async function loadDocument(url) {
  const response = await fetch(url);
  // An error page still parses as HTML, but none of the scraper's selectors
  // would match it, so the data would silently go missing. Fail loudly instead.
  if (!response.ok) {
    throw new Error(`Request to ${url} failed: ${response.status} ${response.statusText}`);
  }
  const body = await response.text();
  const { document } = new JSDOM(body, { url }).window;
  return document;
}
/**
 * Scrapes the index page, follows every link found in its table body, and
 * writes everything that was collected to a YAML file.
 *
 * @param {string} url - Address of the index page.
 * @param {string} target - Path of the YAML file to write.
 * @returns {Promise<Object>} Scraped data keyed by the (whitespace-normalised)
 *   link text, in the order the links appear on the index page.
 */
async function scrap(url, target) {
  const document = await loadDocument(url);
  const links = [...document.querySelectorAll('tbody tr a')];
  // Promise.all preserves input order, so building the object from its result
  // keeps the key order (and therefore the YAML output) independent of which
  // request happens to finish first.
  const entries = await Promise.all(links.map(async (a) => {
    const period = a.textContent.trim().replace(/\s+/g, ' ');
    return [period, await scrapPeriod(a.href)];
  }));
  const antiphonsByPeriod = Object.fromEntries(entries);
  fs.writeFileSync(target, YAML.stringify(antiphonsByPeriod));
  return antiphonsByPeriod;
}
/**
 * Scrapes all antiphons listed for one period, following pagination links
 * until there is no further page.
 *
 * Each table row contributes one entry: the row's first cell is ignored, the
 * second is used as the antiphon type and the third as the liturgical time.
 * Rows without a link (nothing to follow) are skipped.
 *
 * @param {string} url - Address of the first listing page of the period.
 * @returns {Promise<Object>} Antiphons nested as `result[time][type]`, filled
 *   in the order the rows appear on the pages.
 */
async function scrapPeriod(url) {
  const antiphonsByTime = {};
  // Guards against a pagination link that points back to a page already seen,
  // which would otherwise make the loop run forever.
  const visited = new Set();
  let nextUrl = url;
  while (nextUrl && !visited.has(nextUrl)) {
    visited.add(nextUrl);
    console.log(nextUrl);
    const document = await loadDocument(nextUrl);
    const rows = [...document.querySelectorAll('tbody tr')];
    // Fetch the rows concurrently, but only merge them once all are done so
    // the resulting key order follows the table, not the network timing.
    const scraped = await Promise.all(rows.map(async (row) => {
      const link = row.querySelector('a');
      if (!link) {
        return null;
      }
      const [, type, time] = [...row.querySelectorAll('td')].map((td) => td.textContent.trim());
      return { type, time, antiphon: await scrapAntiphon(link.href) };
    }));
    for (const entry of scraped) {
      if (entry === null) {
        continue;
      }
      if (!antiphonsByTime[entry.time]) {
        antiphonsByTime[entry.time] = {};
      }
      antiphonsByTime[entry.time][entry.type] = entry.antiphon;
    }
    // The pagination item right after the active one holds the next page link, if any.
    nextUrl = document.querySelector('.page-item.active + .page-item a:first-of-type')?.href;
  }
  return antiphonsByTime;
}
/**
 * Scrapes a single antiphon page and pairs up its Latin and Polish paragraphs.
 *
 * The first matching column block is taken as the Latin text and the second as
 * the Polish text; paragraphs are paired by position.
 *
 * @param {string} url - Address of the antiphon page.
 * @returns {Promise<Array<{latin: string, polish: (string|undefined)}>>} One
 *   entry per Latin paragraph; `polish` is `undefined` when there is no
 *   paragraph at the same position. Empty when the page has no text columns.
 */
async function scrapAntiphon(url) {
  console.log(' > ', url);
  const document = await loadDocument(url);
  // Default both columns to empty lists so that a page with a missing column
  // yields partial (or no) data instead of a TypeError.
  const [latin = [], polish = []] = [...document.querySelectorAll('.col-lg-6.col-sm-12')]
    .map((block) => [...block.querySelectorAll('p.text-justify')].map((node) => node.textContent.trim()));
  return latin.map((text, i) => ({ latin: text, polish: polish[i] }));
}
/**
 * Entry point: scrapes the antiphon index and writes the result to
 * `antiphons.yml` in the current working directory.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const indexUrl = 'https://musicamsacram.pl/antyfony';
  await scrap(indexUrl, 'antiphons.yml');
}

// Handle the rejection explicitly rather than leaving the promise floating:
// log the failure and signal it to the caller through the exit code.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment