Skip to content

Instantly share code, notes, and snippets.

@aimuhire
Created October 15, 2019 09:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aimuhire/4ba4c656137c33b7319830c081aaf73c to your computer and use it in GitHub Desktop.
Save aimuhire/4ba4c656137c33b7319830c081aaf73c to your computer and use it in GitHub Desktop.
Code Sample from An Extension Project
/**
* Welcome to the official Scrapper class
*
* Author: Arsene I. Muhire
* email:marsanline@gmail.com
*
* Project Description:
* I was building a browser extension that relies on data from a server.
* Basically, the extension lets users see answer tips pop up on each question page during their CISCO/CCNA assessment.
* I made this as a proof of concept; it is the first browser extension I worked on.
*
* Class Details:
* Used to scrape exam data from https://www.ccna7.com/
* Because not all answer pages are formatted alike, we use strategies, a.k.a. selectors.
* We try one selector; if it returns a null or unusable object, we try selector NUMBER TWO.
* This class is used to extract all answers from an answer page.
* We keep trying selectors until one of them works.
* */
const cheerio = require('cheerio')
var request = require('request');
/**
 * Scrapes an exam answer page and turns it into a plain JavaScript object.
 *
 * Answer pages are not all formatted alike, so every lookup is driven by an
 * ordered list of CSS selectors: the first selector that matches at least one
 * element is used, the others are ignored.
 */
class Scrapper {
  /**
   * Creates the Scrapper object.
   * @param {String} URL required, eg: "https://www.ccna7.com/ccna2-v5-03/ccna2-practice-final-exam-v5-03/"
   * @throws {Error} when URL is null or undefined.
   */
  constructor(URL) {
    // `== null` deliberately matches both null and undefined.
    if (URL == null) {
      throw new Error("URL required!");
    }
    this.pageUrl = URL;

    // Selectors are used to retrieve data from an HTML page.
    // Each list is tried in order; see getElement / getChoicesElement.
    this.titleSelectors = ["h3", 'div[class="ai-stem"]>strong', "strong"];
    this.questionsSelectors = [
      'ol[class="wpProQuiz_list"] > li',
      "div.entry-content > ol > li",
    ];
    this.choicesSelectors = [
      'li[class="wpProQuiz_questionListItem"]',
      "ul > li",
    ];
  }

  /**
   * Downloads the page at `this.pageUrl` and extracts the exam from it.
   * @return {Promise<Object>} Resolves to `{ name, pageUrl, version, questions }`.
   *   Each question has a `title`, an `identifier` (the title without spaces) and
   *   either `solution` or `htmlSolution`. Questions without a title are skipped.
   *   The promise rejects if the page cannot be downloaded.
   */
  async getExam() {
    // The page as an HTML string, loaded into cheerio for easy data extraction.
    const pageStr = await this.getPageString();
    const $ = cheerio.load(pageStr);

    const exam = {
      name: this.prettifyString($("div.entry-content > h2").text()),
      pageUrl: this.pageUrl,
      version: "",
      questions: [],
    };

    // A cheerio selection of question elements, or false when nothing matched.
    const questionsEl = this.getElement($, this.questionsSelectors, $("body"));
    if (!questionsEl) {
      return exam;
    }

    questionsEl.each((questionIndex, questionEl) => {
      // Extract the question title, eg: "What are the different Routing Protocols?"
      // Anything from "Explanation:" onwards is not part of the title.
      const titleEl = this.getElement($, this.titleSelectors, questionEl);
      const rawTitle = titleEl ? titleEl.text().split("Explanation:")[0] : "";
      const title = this.prettifyString(rawTitle);

      // Do not save questions without a title. Returning undefined (not false)
      // lets cheerio's each() carry on with the next question.
      if (!title) {
        return;
      }

      const question = { title };

      // Solutions are either multiple choices (CHOICES) or raw HTML
      // (HTML_SOLUTION), which is what we fall back to when no choice list is found.
      const solutionResult = this.getSolution($, questionEl);
      if (solutionResult.state === "CHOICES") {
        question.solution = {
          choices: solutionResult.choices,
          explanation: solutionResult.explanation,
        };
      } else if (solutionResult.state === "HTML_SOLUTION") {
        question.htmlSolution = solutionResult.htmlSolution;
      } else {
        // getSolution returns {} when no choice is marked as the answer.
        question.solution = solutionResult;
      }

      question.identifier = title.replace(/ /g, "");
      exam.questions.push(question);
    });

    return exam;
  }

  /**
   * Retrieves the HTML page as a string, using the request module.
   * @return {Promise<String>} Resolves with the response body, rejects with the
   *   error reported by request.
   */
  getPageString() {
    return new Promise((resolve, reject) => {
      request(this.pageUrl, (error, response, body) => {
        if (error) {
          reject(error);
          return;
        }
        resolve(body);
      });
    });
  }

  /**
   * Retrieves a question solution from a cheerio element.
   * @param {Object} $ cheerio function
   * @param {Object} questionEl the cheerio question element object
   * @return {Object} One of:
   *   - `{ choices: [], state: "HTML_SOLUTION", htmlSolution }` when no choice list is found,
   *   - `{ choices, state: "CHOICES", explanation? }` when at least one choice is marked as the answer,
   *   - `{}` when choices were found but none is marked as the answer.
   */
  getSolution($, questionEl) {
    const choicesResult = this.getChoicesElement($, this.choicesSelectors, questionEl);
    if (choicesResult.state !== "CHOICES_ELEMENT") {
      return {
        choices: [],
        state: "HTML_SOLUTION",
        htmlSolution: choicesResult.htmlSolution,
      };
    }

    const solution = { choices: [], state: "CHOICES" };
    let hasAnswer = false;

    choicesResult.element.each((choiceIndex, choiceEl) => {
      // A choice may carry its explanation inline, after an "Explanation:" marker.
      const parts = this.prettifyString($(choiceEl).text()).split("Explanation:");

      // Prefer the dedicated feedback element; fall back to the inline text.
      const feedback = $('div[class="itemfeedback"]', choiceEl).text();
      const explanation = feedback || parts[1];
      if (explanation) {
        solution.explanation = this.prettifyString(explanation);
      }

      // The correct choice is the one containing a span with an inline color style.
      const isAnswer = Boolean($('span[style*="color"]', choiceEl).text());
      hasAnswer = hasAnswer || isAnswer;

      solution.choices.push({ name: parts[0].trim(), isAnswer });
    });

    return hasAnswer ? solution : {};
  }

  /**
   * Removes "Question ID <digits>" markers, collapses every run of whitespace
   * into a single space and trims the result.
   * @param {String} input The string input
   * @return {String} The prettified string, or "" when input is not a string.
   */
  prettifyString(input) {
    if (typeof input !== "string") {
      return "";
    }
    return input
      .replace(/Question\sID\s\d+/g, "")
      .replace(/\s+/g, " ")
      .trim();
  }

  /**
   * Returns the selection of the first selector that matches something.
   * @param {Object} $ The querying function
   * @param {Array} selectors Array of selectors, tried in order
   * @param {Object} root Element to search within
   * @return {Object|Boolean} The matching selection, or false when no selector matched.
   */
  getElement($, selectors, root) {
    let element = null;
    for (const selector of selectors) {
      element = $(selector, root);
      if (element && element.length > 0) {
        return element;
      }
    }
    if (element) {
      console.log("####SelectorError__________", selectors, $(element).html());
    }
    return false;
  }

  /**
   * Returns the choices cheerio element, or the raw HTML of `root` when no
   * selector matches.
   * @param {Object} $ cheerio function
   * @param {Array} selectors choice selectors, tried in order
   * @param {Object} root root element, from which to select through
   * @return {Object} `{ state: "CHOICES_ELEMENT", element }` or
   *   `{ state: "HTML_SOLUTION", htmlSolution }`.
   */
  getChoicesElement($, selectors, root) {
    for (const selector of selectors) {
      const element = $(selector, root);
      if (element && element.length > 0) {
        return { state: "CHOICES_ELEMENT", element };
      }
    }
    /**
     * @todo Extract the images save them to disk. stop relying on their image links.
     */
    return { state: "HTML_SOLUTION", htmlSolution: $(root).html() };
  }
}
module.exports = Scrapper
// CODE below is only used for demo purposes. It's not part of the original class file.
// It can be run with node, with the cheerio and request modules installed.

/**
 * Demo: scrapes one exam page and prints the resulting object.
 * Any failure (construction or download) is logged instead of being thrown,
 * so the returned promise never rejects.
 * @return {Promise<void>}
 */
async function useClass() {
  try {
    const ScObj = new Scrapper("https://www.ccna7.com/ccna2-v5-03/ccna2-practice-final-exam-v5-03/");
    const Exam = await ScObj.getExam();
    console.log(Exam);
  } catch (error) {
    console.log("error while retrieving results",error);
  }
  /**
   * Expected Output:
   *
   * { name: 'CCNA2 Practice Final Exam Answer v5.03 2016',
   *   pageUrl: 'https://www.ccna7.com/ccna2-v5-03/ccna2-practice-final-exam-v5-03/',
   *   version: '',
   *   questions:
   *    [ { title: 'A client is using SLAAC to obtain an IPv6 address for its interface. After an address has been generated and applied to the interface, what must the client do before it can begin to use this IPv6 address?',
   *        solution: [Object],
   *        identifier: 'AclientisusingSLAACtoobtainanIPv6addressforitsinterface.Afteranaddresshasbeengeneratedandappliedtotheinterface,whatmusttheclientdobeforeitcanbegintousethisIPv6address?' }
   *    , //other questions...]}
   */
}

// Run the demo only when this file is executed directly (`node <file>`),
// not when it is loaded with require() by another module.
if (typeof require !== "undefined" && typeof module !== "undefined" && require.main === module) {
  useClass();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment