Created
April 10, 2018 16:18
-
-
Save IgorDePaula/28b4c06a64733822e44b1fe8afd07b3e to your computer and use it in GitHub Desktop.
Scrape authenticated pages
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Scrapes a page that is only available to a logged-in packtpub.com user.
 *
 * Flow (every request shares one cookie jar so the session carries over):
 *   1. GET the home page to collect the initial cookies and, if the page
 *      exposes one, the hidden `_token` form field.
 *   2. POST the credentials to the same URL.
 *   3. GET the URL the login response redirects to.
 *   4. GET the "my-ebooks" page and print the first info-box number found.
 *
 * Any failing step is reported on stderr and sets a non-zero exit code
 * instead of crashing on an undefined response/body.
 */

// Currently unused; kept for the planned PDF download step (see TODO below).
const fs = require("fs");
const { URL } = require("url");
const request = require("request");
const cheerioAdv = require("cheerio-advanced-selectors");
// The wrapper adds selectors such as `:first` (used below) to cheerio.
const cheerio = cheerioAdv.wrap(require("cheerio"));

// Placeholder credentials: replace with a real account before running and
// avoid committing real ones.
const id = "admin@admin.com";
const mdp = "123456";

// The URLs we will scrape from.
const connexionUrl = "https://www.packtpub.com/";
const ebooksUrl = "https://www.packtpub.com/account/my-ebooks";

// A clean jar to store the cookies across all requests.
const j = request.jar();

/**
 * Reports a failed step and marks the process as failed.
 *
 * @param {string} step - Human-readable description of the step.
 * @param {Error} error - The error that interrupted the step.
 * @returns {void}
 */
function fail(step, error) {
  console.error(`${step} failed:`, error);
  process.exitCode = 1;
}

/**
 * Step 1: loads the home page to seed the cookie jar and read `_token`.
 *
 * @returns {void}
 */
function fetchLoginPage() {
  request({ url: connexionUrl, jar: j }, (err, httpResponse, html) => {
    if (err) {
      fail("Loading the login page", err);
      return;
    }
    // Cheerio parses the HTML so the hidden token field can be looked up.
    const $ = cheerio.load(html);
    const token = $('[name="_token"]').val();
    login(token);
  });
}

/**
 * Step 2: posts the login form to the same URL with browser-like headers.
 *
 * @param {string|undefined} token - Value of the `_token` field, if the page had one.
 * @returns {void}
 */
function login(token) {
  const form = {
    email: id,
    password: mdp,
  };
  // NOTE(review): the original extracted this token but never sent it.
  // It is forwarded only when the page actually contains the field, the
  // way a browser submitting the form would - confirm the server expects it.
  if (token) {
    form._token = token;
  }

  request.post(
    {
      url: connexionUrl,
      jar: j,
      headers: {
        Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        // The body of this response is never read, only its headers.
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "fr-FR,fr;q=0.8,en-US;q=0.6,en;q=0.4",
        "Cache-Control": "no-cache",
        "Content-Type": "application/x-www-form-urlencoded",
        // An origin is scheme + host (+ port) with no trailing slash.
        Origin: "https://www.packtpub.com",
        Host: "www.packtpub.com",
        "Upgrade-Insecure-Requests": 1,
        // Was misspelled "User-Agents", which is not the User-Agent header.
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
        Connection: "keep-alive",
        Pragma: "no-cache",
        Referer: "https://www.packtpub.com/",
      },
      form,
    },
    (err, response) => {
      if (err) {
        fail("Logging in", err);
        return;
      }
      const location = response.headers.location;
      if (!location) {
        fail(
          "Logging in",
          new Error(`expected a redirect after login, got HTTP ${response.statusCode}`)
        );
        return;
      }
      // The Location header may be relative; resolve it against the login URL.
      followLoginRedirect(new URL(location, connexionUrl).href);
    }
  );
}

/**
 * Step 3: requests the URL the login response redirected to.
 * The returned HTML is not used; the request is made with the shared jar
 * before moving on to the account page.
 *
 * @param {string} redirectUrl - Absolute URL taken from the login response.
 * @returns {void}
 */
function followLoginRedirect(redirectUrl) {
  request({ url: redirectUrl, jar: j }, (err) => {
    if (err) {
      fail("Following the login redirect", err);
      return;
    }
    fetchEbooks();
  });
}

/**
 * Step 4: loads the "my-ebooks" page and prints the first info-box number.
 *
 * @returns {void}
 */
function fetchEbooks() {
  request({ url: ebooksUrl, jar: j }, (err, httpResponse, html) => {
    if (err) {
      fail("Loading the e-books page", err);
      return;
    }
    const $ = cheerio.load(html);
    // TODO: read the returned HTML and download the PDFs.
    console.log($(".info-box:first .info-box-number").html());
  });
}

fetchLoginPage();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment