Skip to content

Instantly share code, notes, and snippets.

@denis-cto
Created September 19, 2017 11:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save denis-cto/74f69562ffc55bb3ec480b7b6671e47c to your computer and use it in GitHub Desktop.
Save denis-cto/74f69562ffc55bb3ec480b7b6671e47c to your computer and use it in GitHub Desktop.
dom.gosuslugi.ru parser for companies names and emails
const args = require("node-args-parser")(process.argv)
var number = args.number;
if (number === 'undefined') process.exit();
console.log("\007");
var posix = require('posix');
posix.setrlimit('nofile', {soft: 10000}); // multiple connections require a lot of open session files
var RateLimiter = require('request-rate-limiter');
var limiter = new RateLimiter({
rate: 200 // requests per interval,
// defaults to 60
, interval: 30 // interval for the rate, x
// requests per interval,
// defaults to 60
, backoffCode: 429 // back off when this status is
// returned, defaults to 429
, backoffTime: 20 // back off for n seconds,
// defauts to rate/5
, maxWaitingTime: 3000 // return errors for requests
// that will have to wait for
// n seconds or more. defaults
// to 5 minutes
});
var fs = require('fs'),
JSONStream = require('JSONStream'), // we will use Streams beacuse of large files
es = require('event-stream');
var request = require('request');
var getStream = function () {
var jsonData = 'all' + number + '.json',
stream = fs.createReadStream(jsonData, {encoding: 'utf8'}),
parser = JSONStream.parse('*');
return stream.pipe(parser);
};
var csvWriter = require('csv-write-stream') // we will write to Stream also
var writer = csvWriter(
{
separator: '\t',
newline: '\n',
headers: undefined,
sendHeaders: false
}
);
writer.pipe(fs.createWriteStream('all' + number + '.csv'));
getStream()
.pipe(es.mapSync(function (data) {
// console.log(data);
for (var i in data) {
(function (i) { // create anonymoues function to collect data from file and corresponding request (
var firmaUrl = 'https://dom.gosuslugi.ru/ppa/api/rest/services/ppa/public/organizations/orgByGuid?organizationGuid=' + data[i].guid;
limiter.request(function (err, backoff) { // use request Rate limiter, beacuse nodejs tries to open all requests at once, and the last ones - fail due to timeout.
if (err) {
console.log('the err object is set if the limiter is overflowing or is not able to execute your request in time');
}
else {
request(
{
json: true,
url: firmaUrl,
headers: {
'Content-Type': 'application/json',
'user-agent': 'node.js',
}
}, function (error, response, body) {
// Do more stuff with 'body' here
console.log(i);
if (err) {
console.log(error);
}
else if (response.statusCode === 429) {
// we have to back off. this callback will be called again as soon as the remote enpoint
// should accept requests again. no need to queue your callback another time on the limiter.
backoff();
}
else {
if ((body !== null) && (typeof(body) !== 'undefined')) {
if (data[i].factualAddress.city !== null) factualAdress = data[i].factualAddress.city.offName; else factualAdress = '';
if (body.postalAddress.city !== null) cityPostal = body.postalAddress.city.offName; else cityPostal = '';
if (data[i].factualAddress.region !== null) factualRegion = data[i].factualAddress.region.offName; else factualRegion = '';
if (body.postalAddress.region !== null) oblastPostal = body.postalAddress.region.offName; else oblastPostal = '';
var org = {
'fullName': data[i].fullName,
'shortName': data[i].shortName,
'phone': data[i].phone,
'inn': data[i].inn,
'url': data[i].url,
'orgAddress': data[i].orgAddress, /// юр
'physicalAdress': data[i].factualAddress.formattedAddress, /// юр
'oblastFact': factualRegion,
'cityFact': factualAdress,
'postalAdress': body.postalAddress.formattedAddress, /// юр
'oblastPostal': oblastPostal,
'cityPostal': cityPostal,
'guid': data[i].guid,
'firstName': body.chiefFirstName,
'lastName': body.chiefLastName,
'middleName': body.chiefMiddleName,
'email': body.orgEmail,
};
writer.write(org);
}
else {
console.log('body undefined in', i);
console.log(response);
console.log(error);
console.log(body);
}
}
});
}
});
})(i, data);
}
}
));
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment