Last active
September 7, 2019 09:10
-
-
Save michalbcz/f2b6a6ae364611dc98aeec3cbae06606 to your computer and use it in GitHub Desktop.
Puppeteer-based Node.js scraper of https://zbranekvalitne.cz/zbrojni-prukaz/testove-otazky. Questions are not categorized by group; all questions are scraped into a single list.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const puppeteer = require('puppeteer')
const fs = require('fs')

/*
 * Scrapes every question found under `div#questions` on the target page and
 * writes the result as JSON to ./questions.json.
 *
 * Output shape (one entry per question container):
 *   { order: <zero-based index>, question: { text }, answers: [{ answerText, isCorrect }] }
 *
 * The async function below is invoked immediately; `void` marks the returned
 * promise as intentionally not awaited. All failures are logged, never rethrown.
 */
void(async () => {
    const url = 'https://zbranekvalitne.cz/zbrojni-prukaz/testove-otazky'

    try {
        console.log("I am scraping questions from " + url)

        const browser = await puppeteer.launch({
            /*
            headless: false, // launch headful mode - good for debugging purposes (you will see what happened inside browser)
            slowMo: 250, // slow down puppeteer script so that it's easier to follow visually
            */
        })

        let questions
        try {
            const page = await browser.newPage()

            // Forward console output produced inside page.evaluate() to the Node.js console.
            // The handler must be the listener itself (not a function returning a function),
            // and it uses the public ConsoleMessage accessors instead of private `_type`/`_text`.
            page.on('console', (consoleMessage) => {
                const type = consoleMessage.type()
                // NOTE(review): the warning type is spelled 'warning' or 'warn' depending on
                // the Puppeteer version - both are filtered out; confirm against the installed version.
                if (type !== 'warning' && type !== 'warn') {
                    console.debug(consoleMessage.text())
                }
            })

            await page.goto(url)

            console.log("Browser opened, starting to evaluate scraping script...")

            // Everything inside this callback runs in the browser page context.
            questions = await page.evaluate(() => {
                console.log("Evaluating scraping script")

                const questionsParentDiv = document.querySelectorAll('div#questions > div')

                return Array.from(questionsParentDiv).map((el, index) => {
                    console.log(el)

                    // First row holds the question text, every following row is one answer.
                    const rows = el.querySelectorAll('div.row')
                    const questionText = rows[0].innerText.trim()

                    const question = {
                        order: index,
                        question: {
                            text: questionText
                        },
                        answers: []
                    }

                    // answers
                    for (let i = 1; i < rows.length; i++) {
                        const answerRow = rows[i]
                        const isCorrect = answerRow.className.includes("correct-answer")
                        // Strip the leading "a)", "b)", ... label from the answer text.
                        const answerText = answerRow.innerText.replace(/^[a-z]\)/, "").trim()

                        question.answers.push({
                            answerText: answerText,
                            isCorrect: isCorrect
                        })
                    }

                    return question
                })
            })
        } finally {
            // Always release the browser process, even when navigation or scraping throws.
            await browser.close()
        }

        console.log("Scraping is done. Browser is closed. We scraped", questions.length, "questions")

        const fileUri = './questions.json'
        console.log("Writing questions to file:", fileUri)

        try {
            await fs.promises.writeFile(fileUri, JSON.stringify(questions))
            // Report success only when the write actually succeeded.
            console.info("File saved! Goodbye!")
        } catch (err) {
            console.error("Cannot write file questions.json", err)
        }
    } catch (error) {
        console.error(error)
    }
})();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment