Skip to content

Instantly share code, notes, and snippets.

@IgorDePaula
Created April 10, 2018 16:18
Show Gist options
  • Save IgorDePaula/28b4c06a64733822e44b1fe8afd07b3e to your computer and use it in GitHub Desktop.
Save IgorDePaula/28b4c06a64733822e44b1fe8afd07b3e to your computer and use it in GitHub Desktop.
Scrape authenticated pages
const fs = require("fs");
const request = require("request");
let cheerio = require("cheerio");
var cheerioAdv = require('cheerio-advanced-selectors')
cheerio = cheerioAdv.wrap(cheerio)
// Prepare all the variables needed later
// NOTE(review): count, timeout and obj are not referenced by the visible part
// of this script — confirm before removing them.
let count = 0;
let timeout = 0;
// Login credentials. They can be supplied through the PACKT_EMAIL and
// PACKT_PASSWORD environment variables so that real credentials never have to
// be written into this file; the placeholder values are used otherwise.
const id = process.env.PACKT_EMAIL || 'admin@admin.com';
const mdp = process.env.PACKT_PASSWORD || '123456';
let obj;
// Base URL of the site: it is requested first with GET and then receives the
// login POST.
const connexionUrl = "https://www.packtpub.com/";
// Clean cookie jar, passed to every request below so that cookies are shared
// between the calls.
const j = request.jar();
// Login flow, in order:
//   1. GET the base URL to fill the cookie jar and read the hidden "_token" field.
//   2. POST the credentials to the same URL.
//   3. GET the URL given by the Location header of the POST response.
//   4. GET the "my-ebooks" page and print the first info-box number.
// Every step logs its error and stops (or degrades) instead of throwing from
// inside a callback.

/**
 * Step 4: requests the "my-ebooks" page with the shared cookie jar and prints
 * the inner HTML of the first `.info-box` element's `.info-box-number`.
 */
function fetchEbooks() {
  request(
    {
      url: 'https://www.packtpub.com/account/my-ebooks',
      jar: j,
    },
    (ebooksError, ebooksResponse, ebooksHtml) => {
      if (ebooksError) {
        console.error('Could not fetch the my-ebooks page:', ebooksError);
        return;
      }
      const $ = cheerio.load(ebooksHtml);
      // Next step (not implemented yet): read the returned HTML and download the PDFs.
      console.log($('.info-box:first .info-box-number').html());
    }
  );
}

/**
 * Step 3: requests the URL the login POST redirected to, then moves on to the
 * e-books page. The body of this response is not used.
 *
 * @param {string} redirectUrl - Absolute URL taken from the Location header.
 */
function followLoginRedirect(redirectUrl) {
  request(
    {
      url: redirectUrl,
      jar: j,
    },
    (redirectError) => {
      if (redirectError) {
        // Best effort: the e-books page is still requested, as it was before.
        console.error('Could not follow the login redirect:', redirectError);
      }
      fetchEbooks();
    }
  );
}

/**
 * Step 2: posts the login form to the base URL with browser-like headers.
 *
 * @param {string|undefined} token - Value of the `_token` field found on the
 *   page, or undefined when the page has no such field.
 */
function login(token) {
  // Construction of the form required in the POST request to log in
  const form = {
    email: id,
    password: mdp,
  };
  if (token !== undefined) {
    // Presumably a CSRF token expected back under the same field name — confirm.
    form._token = token;
  }

  request.post(
    {
      url: connexionUrl,
      jar: j,
      headers: {
        Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        // Only the Location header of this response is read, never its body.
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "fr-FR,fr;q=0.8,en-US;q=0.6,en;q=0.4",
        "Cache-Control": "no-cache",
        "Content-Type": "application/x-www-form-urlencoded",
        Origin: "https://www.packtpub.com/",
        Host: "www.packtpub.com",
        "Upgrade-Insecure-Requests": 1,
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
        Connection: "keep-alive",
        Pragma: "no-cache",
        Referer: "https://www.packtpub.com/",
      },
      form,
    },
    (loginError, loginResponse) => {
      if (loginError) {
        console.error('Login request failed:', loginError);
        return;
      }
      const location = loginResponse.headers.location;
      if (!location) {
        // Without a Location header there is nothing to follow; requesting an
        // undefined URL would throw, so go straight to the e-books page.
        console.error('Login response carried no Location header; skipping the redirect step.');
        fetchEbooks();
        return;
      }
      // A Location header may be relative, so resolve it against the base URL.
      followLoginRedirect(new URL(location, connexionUrl).href);
    }
  );
}

// Step 1: first GET request, which fills the cookie jar and exposes the token.
request(
  {
    url: connexionUrl,
    jar: j,
  },
  (pageError, pageResponse, pageHtml) => {
    if (pageError) {
      console.error('Could not load the login page:', pageError);
      return;
    }
    // Cheerio loads the HTML so the hidden "_token" input can be read.
    const $ = cheerio.load(pageHtml);
    login($('[name="_token"]').val());
  }
);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment