Skip to content

Instantly share code, notes, and snippets.

@ZeroX-DG
Last active December 15, 2019 22:11
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save ZeroX-DG/5fadc377fb19f19fada40a6187006425 to your computer and use it in GitHub Desktop.
Save ZeroX-DG/5fadc377fb19f19fada40a6187006425 to your computer and use it in GitHub Desktop.
const puppeteer = require('puppeteer');
const firebase = require('firebase');
// Register the default Firebase app only when none exists yet, so running
// this module a second time in the same process does not initialise it twice.
// The "xxxx" values are placeholders for the real project credentials.
if (firebase.apps.length === 0) {
  firebase.initializeApp({
    apiKey: "xxxxxxxxxxxxxxxxxxxxx",
    authDomain: "xxxxxxxxxxxxxxxxxxxxxxxx",
    databaseURL: "xxxxxxxxxxxxxxxxxxxxx",
    projectId: "xxxxxxxxx",
    storageBucket: "xxxxxxxxxxxxxxxxxxxx",
    messagingSenderId: "xxxxxxxxxxxxxxx"
  });
}

// Realtime Database handle shared by the rest of the script.
const db = firebase.database();
/**
 * Opens one index page and returns the article links listed on it.
 *
 * @param {object} page - Puppeteer page used for navigation.
 * @param {string} list_url - URL of the index page to open.
 * @returns {Promise<string[]>} The `href` of every anchor in the list table.
 */
async function collectCueCardLinks(page, list_url) {
  await page.goto(list_url, {waitUntil: 'networkidle2'});
  // Extract the list of links from the result page
  return page.evaluate((selector) => {
    const anchors = [...document.querySelectorAll(selector)];
    return anchors.map(anchor => anchor.href);
  }, '#adminForm > table > tbody > tr > td.list-title > a');
}

/**
 * Opens one cue-card article and extracts its question and cue list.
 *
 * The question is looked up under an <h2> first and an <h3> second, because
 * the heading level differs between articles.
 *
 * @param {object} page - Puppeteer page used for navigation.
 * @param {string} link - URL of the article to open.
 * @returns {Promise<{question: string, cues: string[]}|null>} The extracted
 *   data, or `null` when the question node or the cue list is not found.
 */
async function scrapeCueCard(page, link) {
  await page.goto(link, {waitUntil: 'networkidle2'});
  return page.evaluate((h2_selector, h3_selector, cue_list_selector) => {
    const question_dom =
      document.querySelector(h2_selector) || document.querySelector(h3_selector);
    const cue_list_dom = document.querySelector(cue_list_selector);
    // Report a layout mismatch to the caller instead of throwing inside the page.
    if (!question_dom || !cue_list_dom) {
      return null;
    }
    return {
      question: question_dom.textContent.trim(),
      cues: [...cue_list_dom.children].map(cue => cue.textContent.trim())
    };
  },
  "#main > article > h2:nth-child(5) > span",
  "#main > article > h3:nth-child(5) > span",
  "#main > article > ul:nth-child(8)"
  );
}

// Entry point: walks the index pages page_start..max_page, scrapes every
// linked cue card and stores each one through saveToFirebase.
(async () => {
  const page_start = 2;
  const max_page = 31;
  const item_per_page = 20;
  // Index of the first link to process on page_start; raise it to resume a
  // run that stopped part-way through that page. Later pages start at 0.
  const link_start = 0;

  const browser = await puppeteer.launch({headless: false, timeout: 0});
  browser.on('disconnected', () => {console.log("disconnected !")});

  try {
    const page = await browser.newPage();

    for (let current_page = page_start; current_page <= max_page; current_page++) {
      // The site paginates by item offset, not by page number.
      const start = (current_page - 1) * item_per_page;
      const list_url = 'https://www.ielts-mentor.com/cue-card-sample?start=' + start;
      console.log(`-------- PAGE ${current_page} ( ${list_url} ) --------`);

      console.log("[#] Getting links...");
      const cue_card_links = await collectCueCardLinks(page, list_url);
      console.log("[#] Done getting links\n");

      const first_link = current_page === page_start ? link_start : 0;
      for (let i = first_link; i < cue_card_links.length; i++) {
        const link = cue_card_links[i];
        console.log("\n[*] Trying: " + link);

        const cue_card = await scrapeCueCard(page, link);
        if (cue_card === null) {
          // One article with an unexpected layout should not abort the run.
          console.log("[!] Skipped (question or cue list not found): " + link);
          continue;
        }

        // Await the save so the browser is not closed, and "DONE" is not
        // printed, while a write may still be in flight.
        await saveToFirebase(cue_card.question, cue_card.cues);
      }
    }

    await page.close();
  } finally {
    // Always release the browser process, even when scraping fails.
    await browser.close();
  }

  console.log("------ DONE ------");
})().catch((error) => {
  console.error("[!] Scraping aborted:", error);
  process.exitCode = 1;
});
/**
 * Stores one cue card as a new child of `/questions` in the module-level
 * Realtime Database handle `db`.
 *
 * The success line is logged only after the write promise resolves, so a
 * failed write is never reported as a success. A failure rejects the
 * returned promise.
 *
 * @param {string} question - The cue-card question text.
 * @param {string[]} cues - The cue lines that belong to the question.
 * @returns {Promise<string>} Resolves with the generated key of the new record.
 */
const saveToFirebase = async (question, cues) => {
  const newQuestionRef = db.ref('/questions').push();
  // push() assigns the key on the client, so it is known before the write.
  const newQuestionKey = newQuestionRef.key;
  await newQuestionRef.set({
    question: question,
    cues: cues
  });
  console.log(`[#] Success => Id: ${newQuestionKey} | Title: ${question}`);
  return newQuestionKey;
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment