Skip to content

Instantly share code, notes, and snippets.

@julianburr
Last active May 7, 2024 11:36
Show Gist options
  • Save julianburr/f253ea3536907f87af56c9252bc07269 to your computer and use it in GitHub Desktop.
Save julianburr/f253ea3536907f87af56c9252bc07269 to your computer and use it in GitHub Desktop.
IGNRW Crawler
# Stell sicher, dass du Node.js installiert hast
# Siehe https://nodejs.org/en/learn/getting-started/how-to-install-nodejs

# Installier npm dependencies
npm install

# Beispiel script ausführung
LOGIN_USERNAME=foo LOGIN_PASSWORD=bar node index.mjs

# Die CSV-Datei wird im selben Verzeichnis erstellt, in dem das Skript liegt

###
###

# Weitere optionale Parameter
FROM="01.01.2024" TO="01.02.2024"   # wenn nicht angegeben liest das script die letzten 30 Tage vom aktuellen Datum aus
FILENAME="foo.csv"                  # falls du einen anderen datei namen willst, oder du passt einfach Z.68 in `index.mjs` an
DEBUG=true                          # falls irgendwas nicht funktioniert, kannst du im Debug-Modus sehen, was der Crawler macht und wo er vielleicht hängen bleibt

# Zum beispiel
DEBUG=true FROM="01.01.2024" TO="01.02.2024" LOGIN_USERNAME=foo LOGIN_PASSWORD=bar node index.mjs
import * as fs from "node:fs";
import * as url from "node:url";
import * as path from "node:path";
import puppeteer from "puppeteer";
const __dirname = url.fileURLToPath(new URL(".", import.meta.url));
/**
 * Parses a `FROM`/`TO` parameter into a Date at UTC midnight.
 *
 * Accepts the German `DD.MM.YYYY` format as well as ISO `YYYY-MM-DD`.
 * The string is parsed by hand because `new Date(string)` is
 * implementation-defined for non-ISO input (V8 reads "01.02.2024" as
 * January 2nd instead of February 1st).
 *
 * @param {string} value - Raw value of the environment variable.
 * @param {string} name - Variable name, used in error messages.
 * @returns {Date} The calendar date at 00:00 UTC.
 * @throws {Error} If the value has an unsupported format or is not a real date.
 */
function parseDateParam(value, name) {
  const text = String(value).trim();
  const german = /^(\d{1,2})\.(\d{1,2})\.(\d{4})$/.exec(text);
  const iso = /^(\d{4})-(\d{1,2})-(\d{1,2})$/.exec(text);

  let year;
  let month;
  let day;
  if (german) {
    [day, month, year] = german.slice(1).map(Number);
  } else if (iso) {
    [year, month, day] = iso.slice(1).map(Number);
  } else {
    throw new Error(
      `${name} must be formatted as DD.MM.YYYY or YYYY-MM-DD, got "${value}"`
    );
  }

  const date = new Date(Date.UTC(year, month - 1, day));
  // Date.UTC silently rolls over impossible dates (e.g. 31.02.), so verify
  // that the components survived the round trip.
  const isRealDate =
    date.getUTCFullYear() === year &&
    date.getUTCMonth() === month - 1 &&
    date.getUTCDate() === day;
  if (!isRealDate) {
    throw new Error(`${name} is not a valid calendar date: "${value}"`);
  }
  return date;
}

/**
 * Returns today's local calendar date as a Date at UTC midnight, so it can be
 * handled the same way as the parsed `FROM`/`TO` values.
 *
 * @returns {Date} Today's date at 00:00 UTC.
 */
function todayAsUtcDate() {
  const now = new Date();
  return new Date(Date.UTC(now.getFullYear(), now.getMonth(), now.getDate()));
}

/**
 * Splits a UTC-midnight Date into zero-padded string components.
 *
 * @param {Date} date - Date created by `parseDateParam` or `todayAsUtcDate`.
 * @returns {{ year: string, month: string, day: string }} Padded components.
 */
function toDateParts(date) {
  const [year, month, day] = date.toISOString().slice(0, 10).split("-");
  return { year, month, day };
}

/**
 * Logs in, opens the report page, enters the date range, and writes the rows
 * of the `#apliste` table to a semicolon-separated CSV file next to this
 * script.
 *
 * Environment variables:
 * - LOGIN_USERNAME / LOGIN_PASSWORD (required): login credentials.
 * - FROM / TO (optional): date range as DD.MM.YYYY or YYYY-MM-DD. TO defaults
 *   to today, FROM defaults to 30 days before TO.
 * - FILENAME (optional): output file name, defaults to `<from>_<to>.csv`.
 * - DEBUG (optional): any value other than "", "0" or "false" runs the
 *   browser with a visible window and leaves it open afterwards.
 *
 * @returns {Promise<void>}
 * @throws {Error} If credentials are missing or a date parameter is invalid.
 */
async function main() {
  // Parameters
  const USERNAME = process.env.LOGIN_USERNAME;
  const PASSWORD = process.env.LOGIN_PASSWORD;
  const FROM = process.env.FROM;
  const TO = process.env.TO;
  const FILENAME = process.env.FILENAME;
  const rawDebug = process.env.DEBUG ?? "";
  // Environment variables are strings, so "false" and "0" would otherwise
  // count as truthy and switch debug mode on.
  const DEBUG = !["", "0", "false"].includes(rawDebug.trim().toLowerCase());

  // Fail early with a clear message instead of an obscure error from
  // `page.type` receiving `undefined`.
  if (!USERNAME || !PASSWORD) {
    throw new Error("LOGIN_USERNAME and LOGIN_PASSWORD must both be set");
  }

  // Date range and file name
  const toDate = TO ? parseDateParam(TO, "TO") : todayAsUtcDate();
  const fromDate = FROM
    ? parseDateParam(FROM, "FROM")
    : new Date(toDate.getTime() - 30 * 24 * 60 * 60 * 1000);
  const from = toDateParts(fromDate);
  const to = toDateParts(toDate);

  // Launch the browser and open a new blank page
  const browser = await puppeteer.launch({ headless: !DEBUG });
  try {
    const page = await browser.newPage();

    // Login
    await page.goto("https://www.ig.nrw.de/IGNRW-Internet/");
    await page.type('input[name="kennung"]', USERNAME);
    await page.type('input[name="kennwort"]', PASSWORD);
    // Start waiting for the navigation before clicking so it cannot be missed.
    await Promise.all([
      page.waitForNavigation(),
      page.click('input[type="submit"]'),
    ]);

    // Get data
    await page.goto(
      "https://www.ig.nrw.de/IGNRW-Internet/prepareAuswertungKHVerwalterMeldungen.action?ziel=h13m03u02"
    );
    await page.type("#txtfDatumStart", `${from.day}.${from.month}.${from.year}`);
    await page.type("#txtfDatumEnde", `${to.day}.${to.month}.${to.year}`);

    // HACK: timeout needed because the page loads weirdly
    await page.waitForNetworkIdle({ idleTime: 4000 });

    // Serialize the table rows. This callback runs inside the browser page,
    // so it must not reference anything from the Node.js scope.
    const csv = await page.evaluate(() => {
      // Quote fields that contain the delimiter, quotes or line breaks so
      // they cannot break the CSV structure.
      const escapeField = (text) =>
        /[";\r\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text;

      return [...document.querySelectorAll("#apliste tbody tr")]
        .map((row) => {
          // textContent yields the visible text without markup or entities.
          const fields = [...row.querySelectorAll("td")].map((cell) =>
            escapeField(cell.textContent.trim())
          );
          return `${fields.join(";")}\n`;
        })
        .join("");
    });

    const fileName =
      FILENAME ||
      `${from.year}${from.month}${from.day}_${to.year}${to.month}${to.day}.csv`;
    const filePath = path.resolve(__dirname, fileName);
    fs.writeFileSync(filePath, csv);
  } finally {
    // Always release the browser, also when a step above failed. In debug
    // mode it stays open on purpose so the page can be inspected.
    if (!DEBUG) {
      await browser.close();
    }
  }
}

main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
{
"name": "ignrw-crawler",
"version": "1.0.0",
"description": "",
"keywords": [],
"author": "",
"license": "ISC",
"dependencies": {
"puppeteer": "^22.7.1"
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment