Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save michalbcz/0fd5e3249ad541376412eb63ac39a0e6 to your computer and use it in GitHub Desktop.
Save michalbcz/0fd5e3249ad541376412eb63ac39a0e6 to your computer and use it in GitHub Desktop.
Puppeteer-based Node.js scraper of https://zbranekvalitne.cz/zbrojni-prukaz/testove-otazky, including categorization of the questions into groups
const puppeteer = require('puppeteer')
const fs = require('fs')
// this wrapper is an async function that is invoked immediately, so the script can use await
/**
 * Script entry point (immediately invoked async function).
 *
 * Steps:
 *   1. Launch a browser and open the question catalogue page.
 *   2. Read the group menu via extractGroup().
 *   3. For every group that has no subgroups, click its menu item, wait for
 *      the navigation and collect the questions via extractQuestions().
 *   4. Write `{ groups, questions }` as pretty-printed JSON to ./questions.json.
 *
 * Any error is logged to stderr; the browser is closed whether or not the
 * scraping succeeded.
 */
void (async () => {
  const url = 'https://zbranekvalitne.cz/zbrojni-prukaz/testove-otazky'
  try {
    console.log("I am scraping questions from " + url)
    const browser = await puppeteer.launch({
      // headless: false, // launch headful mode - good for debugging purposes (you will see what happened inside browser)
      // slowMo: 50, // slow down puppeteer script so that it's easier to follow visually
    })

    let groups = []
    const allQuestions = []
    try {
      const page = await browser.newPage()
      // Debugging tip: to see console output produced inside page.evaluate(),
      // register page.on('console', (msg) => console.log(msg.text())) here.
      await page.goto(url)
      console.log("Browser opened, starting to evaluate scraping script...")
      console.log("Scraping groups and its links")
      groups = await extractGroup(page)

      for (const group of groups) {
        if (groupHasSubgroups(group, groups)) {
          // A parent group's link lists the questions of all its subgroups,
          // so scraping it as well would duplicate every question.
          console.log(
            `\nSkipping group ${group.groupId} with sub groups. Otherwise we would have duplicated\nquestions (from parent group and its subgroups), because parent group link leads to all questions for it's subgroups.\n`
          )
          continue
        }

        console.log("Extracting questions for group", group)
        // Start waiting for the navigation together with the click that
        // triggers it, so the navigation cannot be missed.
        await Promise.all([
          page.waitForNavigation(),
          page.evaluate((groupName) => {
            const groupItems = document.querySelectorAll('form#setup div.menu div.item')
            const groupItemElement = Array.from(groupItems).find((groupItem) => groupItem.textContent.includes(groupName))
            if (!groupItemElement) {
              // Fail with a meaningful message instead of a TypeError on undefined.click()
              throw new Error('Menu item for group "' + groupName + '" was not found')
            }
            groupItemElement.click()
          }, group.name)
        ])

        const questions = await page.evaluate(extractQuestions)
        // Tag every question with the group it was scraped from.
        const questionsForGroup = questions.map((question) => ({ ...question, groupId: group.groupId }))
        allQuestions.push(...questionsForGroup)
      }
    } finally {
      // Close the browser even when scraping failed, otherwise the browser
      // process stays alive and keeps the Node process from exiting.
      await browser.close()
    }

    console.log('Scraping is done. Browser is closed. We scraped', allQuestions.length, 'questions')
    const resultJson = {
      groups: groups,
      questions: allQuestions
    }
    const fileUri = './questions.json'
    console.log('Writing questions to file:', fileUri)
    fs.writeFile(fileUri, JSON.stringify(resultJson, null, '\t' /* pretty-print */), (err) => {
      if (err) {
        console.error("Cannot write file questions.json", err)
        return // do not report success when the write failed
      }
      console.info("File saved! Goodbye!")
    })
  } catch (error) {
    console.error(error)
  }
})();
/**
 * Collects all questions that are currently rendered on the page.
 *
 * This function is passed to page.evaluate(), so it runs inside the browser:
 * `document` is the page's document, console output goes to the browser
 * console, and the function must stay self-contained (it cannot reference
 * anything defined elsewhere in this file).
 *
 * Every direct child `div` of `div#questions` is treated as one question.
 * Its first `div.row` holds the question text, every following `div.row` is
 * one answer; a row whose class name contains "correct-answer" is marked as
 * the correct one.
 *
 * @returns {Array<{order: number, question: {text: string}, answers: Array<{answerText: string, isCorrect: boolean}>}>}
 *   questions in document order; `order` is the zero-based position on the page
 */
function extractQuestions() {
  console.log('Extracting questions...')
  const questionsParentDiv = document.querySelectorAll('div#questions > div')
  const questions = Array.from(questionsParentDiv).map((el, index) => {
    const rows = el.querySelectorAll('div.row')
    const questionText = rows[0].innerText.trim()

    const answers = []
    for (let i = 1; i < rows.length; i++) {
      const answerRow = rows[i]
      // Trim BEFORE removing the "a)" style prefix: the pattern is anchored at
      // the start of the string, so leading whitespace or a newline in
      // innerText would otherwise leave the prefix in the answer text.
      const answerText = answerRow.innerText.trim().replace(/^[a-z]\)/, "").trim()
      answers.push({
        answerText: answerText,
        isCorrect: answerRow.className.includes("correct-answer")
      })
    }

    return {
      order: index,
      question: {
        text: questionText
      },
      answers: answers
    }
  })
  return questions
}
/**
 * Reads the group menu (`form#setup div.menu div.item`) of the given page
 * and returns one descriptor per menu item.
 *
 * The item at index 0 is not included in the result.
 * NOTE(review): presumably it is an "all questions" entry - confirm against the page.
 *
 * An item that contains an `i.level` element is treated as a subgroup of the
 * closest preceding item that has no such element.
 *
 * @param page Puppeteer page that is already showing the question catalogue
 * @returns {Promise<Array<{groupId: number, parentGroupId: (number|null), name: string}>>}
 *   groups in menu order; `groupId` is the item's index in the menu
 */
async function extractGroup(page) {
  const menuItems = await page.$$('form#setup div.menu div.item')
  console.log("Group links size", menuItems.length)

  const result = []
  let currentParentId = null
  let position = 1
  while (position < menuItems.length) {
    const item = menuItems[position]
    const levelMarker = await item.$("i.level")
    const isTopLevel = !levelMarker

    // Prefer the text of the first span; fall back to the whole item's text.
    let rawName
    if (await item.$("span") != null) {
      rawName = await item.$eval("span", (node) => node.textContent)
    } else {
      rawName = await page.evaluate((node) => node.textContent, item)
    }

    const descriptor = {
      groupId: position,
      parentGroupId: isTopLevel ? null : currentParentId,
      name: rawName.replace(/\n/, '').trim()
    }
    result.push(descriptor)

    // A top-level item becomes the parent of the subgroups that follow it.
    if (isTopLevel) {
      currentParentId = descriptor.groupId
    }
    position += 1
  }
  return result
}
/**
 * Tells whether the given group is a parent of at least one other group.
 *
 * @param group group descriptor to check (its `groupId` is used)
 * @param groups all group descriptors (their `parentGroupId` is compared)
 * @returns {boolean} true when some entry of `groups` has `parentGroupId` strictly equal to `group.groupId`
 */
function groupHasSubgroups(group, groups) {
  const isChildOfGroup = (candidate) => candidate.parentGroupId === group.groupId
  return groups.some(isChildOfGroup)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment