Skip to content

Instantly share code, notes, and snippets.

@abhinavKeshri07
Created November 25, 2019 16:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save abhinavKeshri07/11a62145222fc091b83875e2c2863f31 to your computer and use it in GitHub Desktop.
Save abhinavKeshri07/11a62145222fc091b83875e2c2863f31 to your computer and use it in GitHub Desktop.
This file shows you how to scrape data from a website. Thank me later.
const cheerio = require('cheerio');
const fs = require('fs');
const readline = require('readline');
const get_post_data = require('./get_POST_data');
const get_get_data = require('./get_GET_data');
// Base URL of the site the scraper posts its search form to.
const callback_url = 'https://xyz.com';

// Query payload sent with the POST request; this object describes what to search for.
const form_data = {
    'm_hc': '01',
    'm_sideflg': 'C',
    'm_sr': 'R',
    'm_skey': 'AO',
    'frmdate': '01-11-2018', // start of the date range being queried; change as needed
    'todate': '21-11-2018',  // end of the date range being queried; change as needed
    'submit11': 'List By Case Type' // the endpoint requires this field to be present
};

// Headers required for the POST request to casequery_action.php.
const headers = {
    'Accept': '*/*',
    'Content-Type': 'application/x-www-form-urlencoded',
    'User-Agent': 'My post Script'
};

let $; // cheerio handle; re-assigned each time a page is fetched

// Every scraped link is appended to this CSV file as a "text,url" row.
const writeStream = fs.createWriteStream('allLinks.csv');

// Number of link rows written by the current query.
let LinkCounter = 0;
// POST the search form and scrape every `<font><a>` link out of the returned
// HTML into "allLinks.csv" as "text,url" rows, then kick off the detail pass.
// Returns a Promise resolving to true on success / false on failure.
// (The original returned undefined — the chain was never returned — which made
// funcH's check on the return value always fail.)
let loadDataInFile = function() {
    return get_post_data(callback_url, form_data, headers)
        .then((response) => {
            $ = cheerio.load(response);
            $('font a').each((i, ele) => {
                const item = $(ele).text();
                const link = $(ele).attr('href');
                // hrefs in the page are relative; prefix the host.
                writeStream.write(`${item},https://xyz.com/${link}\n`);
                LinkCounter++;
            });
            console.log("done fetching data and stored it in csv file");
            // All the case links are now in "allLinks.csv";
            // fetch per-case details into the details file.
            loadDetailInFile();
            return true;
        })
        .catch((error) => {
            console.log("Error while loading data from server");
            console.log(error);
            return false;
        });
};
// Read "allLinks.csv" line by line, GET each case URL, and append the scraped
// details (petitioner / respondent / advocate) as one JSON object per line to
// "detailCases.csv". Closes the reader once every known link is processed.
let loadDetailInFile = function() {
    let DetailCounter = 0; // how many detail rows have been written so far
    const readStream = fs.createReadStream('allLinks.csv');
    const detailWriteStream = fs.createWriteStream('detailCases.csv');
    // NOTE: the original passed `preserveCursor`, which is an option of
    // rl.prompt(), not createInterface(), so it was silently ignored.
    const rl = readline.createInterface({
        input: readStream,
        terminal: false
    });
    rl.on('line', function(line) {
        // Each line is "caseText,url" — the URL is the second field.
        get_get_data(line.split(',')[1])
            .then((response) => {
                console.log(response);
                console.log("\n\n\n\n\n");
                DetailCounter++;
                $ = cheerio.load(response);
                const detail = {}; // was an implicit global; keep it local
                detail['Petitioner'] = $('select[name="m_petno"] option').text();
                detail['Respondent'] = $('select[name="m_resno"] option').text();
                detail['Pent.Adv'] = $('select[name="m_padv"] option').text();
                // Similarly other details can be extracted.
                detailWriteStream.write(JSON.stringify(detail) + "\n");
                console.log(DetailCounter + " \n\n\n");
                // Once every link has been processed, stop reading.
                if (DetailCounter >= LinkCounter) { rl.close(); return; }
            })
            .catch((error) => {
                // get_get_data rejects with this exact message on a non-200
                // response — presumably the session/auth expired, so all the
                // links must be fetched again before retrying.
                if (error.message === "not 200 statuscode") {
                    console.log("Error getting case details. Again refreshing Links");
                    LinkCounter = 0;
                    rl.pause();
                    funcH()
                        .then(() => {
                            // Resume emitting 'line' events. The original called
                            // rl.prompt(), which only writes a prompt string on
                            // interactive terminals and never resumed the input.
                            rl.resume();
                        })
                        .catch(() => {
                            console.log("error occurred while refetching all the links");
                        });
                } else {
                    console.log(error);
                }
            });
    });
};
// Re-run the link fetch and settle when it finishes.
// The original tested the truthiness of loadDataInFile()'s return value inside
// `new Promise(...)`; that value was undefined, so funcH ALWAYS rejected
// immediately, even when the refetch was under way. Promise.resolve() follows
// the Promise loadDataInFile returns (and avoids the explicit-construction
// anti-pattern), so callers settle when the fetch actually completes.
let funcH = function() {
    return Promise.resolve(loadDataInFile());
};
// Entry point: fetch all case links, which in turn triggers the detail pass.
loadDataInFile();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment