Skip to content

Instantly share code, notes, and snippets.

@michalbcz
Last active September 7, 2019 09:10
Show Gist options
  • Save michalbcz/f2b6a6ae364611dc98aeec3cbae06606 to your computer and use it in GitHub Desktop.
Save michalbcz/f2b6a6ae364611dc98aeec3cbae06606 to your computer and use it in GitHub Desktop.
Puppeteer-based Node.js scraper of https://zbranekvalitne.cz/zbrojni-prukaz/testove-otazky. Questions are not categorized by group; all questions are scraped into a single list.
const puppeteer = require('puppeteer')
const fs = require('fs')
// Immediately-invoked async function: lets the script use `await` at the top
// level. `void` marks the returned promise as intentionally not awaited; all
// failures are handled by the try/catch inside.
//
// What the script does:
//   1. launches a browser via puppeteer and opens the questions page,
//   2. runs a DOM-scraping function inside the page to collect every question
//      together with its answers,
//   3. closes the browser (even when scraping fails),
//   4. writes the result as JSON to ./questions.json.
void(async () => {
  const url = 'https://zbranekvalitne.cz/zbrojni-prukaz/testove-otazky'

  try {
    console.log("I am scraping questions from " + url)

    const browser = await puppeteer.launch({
      /*
      headless: false, // launch headful mode - good for debugging purposes (you will see what happened inside browser)
      slowMo: 250, // slow down puppeteer script so that it's easier to follow visually
      */
    })

    let questions
    try {
      const page = await browser.newPage()

      // Forward console output produced inside page.evaluate() to the Node.js
      // console. The listener must do the work itself: returning another
      // function from it (as the previous version did) means nothing is ever
      // logged. The public type()/text() accessors are used instead of the
      // private _type/_text fields.
      page.on('console', (consoleMessage) => {
        if (consoleMessage.type() !== 'warning') {
          console.debug(consoleMessage.text())
        }
      })

      await page.goto(url)
      console.log("Browser opened, starting to evaluate scraping script...")

      // Everything inside this callback runs in the browser page, not in
      // Node.js, so it can only use DOM APIs and must return serializable data.
      questions = await page.evaluate(() => {
        console.log("Evaluating scraping script")

        const questionElements = document.querySelectorAll('div#questions > div')

        return Array.from(questionElements).map((el, index) => {
          console.log(el)

          // The first row holds the question text; every following row is an answer.
          const rows = el.querySelectorAll('div.row')
          const questionText = rows[0].innerText.trim()

          const question = {
            order: index,
            question: {
              text: questionText
            },
            answers: []
          }

          // answers
          for (let i = 1; i < rows.length; i++) {
            const answerRow = rows[i]
            const isCorrect = answerRow.className.includes("correct-answer")
            // Strip the leading "a)", "b)", ... label from the answer text.
            const answerText = answerRow.innerText.replace(/^[a-z]\)/, "").trim()

            question.answers.push({
              answerText: answerText,
              isCorrect: isCorrect
            })
          }

          return question
        })
      })
    } finally {
      // Always release the browser process, including when navigation or
      // scraping throws; otherwise the browser would be left running.
      await browser.close()
    }

    console.log("Scraping is done. Browser is closed. We scraped", questions.length, "questions")

    const fileUri = './questions.json'
    console.log("Writing questions to file:", fileUri)

    fs.writeFile(fileUri, JSON.stringify(questions), (err) => {
      if (err) {
        console.error("Cannot write file questions.json", err)
        // Do not announce success when the write failed.
        return
      }

      console.info("File saved! Goodbye!")
    })
  } catch (error) {
    console.error(error)
  }
})();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment