Last active
April 15, 2017 22:22
-
-
Save rafapolo/06fd70cf38e0334a81a8f31034b9e16a to your computer and use it in GitHub Desktop.
pega geral em json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby
#require 'mysql2'
require 'csv'
require 'fileutils'
require 'json'

require 'awesome_print'
require 'byebug'
require 'colorize'
require 'mechanize'
require 'parallel'
# Proxy pool: one "host:port" entry per line of proxies.txt.
# Source list: https://raw.githubusercontent.com/opsxcq/proxy-list/master/list.txt
# Blank lines and surrounding whitespace (e.g. "\r" from CRLF files) are dropped
# so they never turn into a bogus proxy entry.
PROXIES = File.read('proxies.txt').split("\n").map(&:strip).reject(&:empty?).freeze

# Proxies that were rate-limited or failed to connect; get_from_api skips them.
@used_proxies = []
# Prints a message to stdout, colored with the colorize gem.
#
# @param msg [#to_s] text to print
# @param error [Boolean] when truthy the message is red, otherwise blue
# @return [nil]
def log(msg, error = false)
  color = error ? :red : :blue
  puts msg.to_s.colorize(color)
end
# Downloads the raw response body for +cnpj+ from the ReceitaWS endpoint.
#
# Each attempt builds a fresh Mechanize agent with a random user agent and,
# when +use_proxy+ is set, a random proxy that is not in @used_proxies.
# A rate-limit reply or a "Failed" connection error retires the current proxy
# and tries again; a timeout just tries again. Retries happen in a bounded
# loop here instead of re-entering scrap, so a dead network cannot recurse
# without limit.
#
# @param cnpj [String] company registration number to look up
# @param use_proxy [Boolean] route the request through an entry of PROXIES
# @param max_attempts [Integer] upper bound on requests made for this cnpj
# @return [String, false] the response body, or false when nothing usable
#   was obtained (no proxies left, unexpected error, attempts exhausted)
def get_from_api(cnpj, use_proxy = true, max_attempts: 20)
  locked = "Too many requests, please try again later."
  @used_proxies ||= []

  max_attempts.times do
    proxy_now = nil
    begin
      http = Mechanize.new
      if use_proxy
        proxy_now = (PROXIES - @used_proxies).sample
        unless proxy_now
          log "=> no unused proxies left for #{cnpj}", true
          return false
        end
        host, port = proxy_now.split(':')
        http.set_proxy host, port
      end
      http.user_agent_alias = Mechanize::AGENT_ALIASES.keys.sample # random
      http.keep_alive = false
      http.open_timeout = 5
      http.read_timeout = 5
      log "=> #{cnpj} with #{proxy_now};\n"

      json = http.get("http://www.receitaws.com.br/v1/cnpj/#{cnpj}").body
      return false unless json

      unless json == locked
        ap json # awesome print
        return json
      end

      # Rate limited: retire this proxy and go around again.
      log locked, true
      @used_proxies << proxy_now if proxy_now
    rescue StandardError => e
      log e.message, true
      if e.message.include?("Failed")
        # Connection failed: retire this proxy (if any) and go around again.
        @used_proxies << proxy_now if proxy_now
      elsif e.message.include?("expired") || e.message.include?("Timeout")
        log "=> timeout for #{cnpj}", true
      else
        # Unknown error: retrying is unlikely to help.
        return false
      end
    end
  end

  false
end
# Fetches the record for +cnpj+ and stores it as data/<cnpj>.json.
#
# Does nothing when the file already exists, so an interrupted run can be
# resumed. Errors are logged rather than raised so that one bad entry does not
# stop the other workers.
#
# @param cnpj [String] company registration number
# @return [Integer, nil] bytes written, or nil when nothing was written
def scrap(cnpj)
  output = "data/#{cnpj}.json"
  return if File.exist?(output)

  json = get_from_api(cnpj)
  return unless json

  # mkdir_p is a no-op when the directory is already there.
  FileUtils.mkdir_p(File.dirname(output))
  File.write(output, json)
rescue StandardError => e
  log e.message, true
end
# Collects the CNPJs found in the fifth column of a CSV file.
#
# Everything except digits is stripped from each value; cells that are empty
# or contain no digits are skipped.
#
# @param path [String] CSV file to read
# @return [Array<String>] sorted, de-duplicated digit strings (always an
#   Array: `sort.uniq!` used to return nil whenever there were no duplicates)
def cnpjs_from_csv(path = "nodes.csv")
  pjs = []
  CSV.foreach(path) do |row|
    digits = row[4].to_s.gsub(/\D/, '')
    pjs << digits unless digits.empty?
  end
  pjs.uniq.sort
end
# Column names of result.csv, in the order export_row emits the values.
EXPORT_COLUMNS = %w[
  cnpj atividade nome telefone responsavel bairro logradouro numero cep
  abertura capital atividade_codigo situacao natureza_juridica municipio
].freeze

# Builds one result.csv row from a parsed data/<cnpj>.json document.
# A document with status "ERROR" yields the cnpj followed by empty cells.
def export_row(pj, json)
  return [pj] + Array.new(EXPORT_COLUMNS.size - 1) if json['status'] == 'ERROR' # invalid cnpj

  atividade = Array(json['atividade_principal']).first || {}
  socio = Array(json['qsa']).first || {}
  [
    pj, atividade['text'], json['nome'], json['telefone'], socio['nome'],
    json['bairro'], json['logradouro'], json['numero'], json['cep'],
    json['abertura'], json['capital_social'], atividade['code'],
    json['situacao'], json['natureza_juridica'], json['municipio']
  ]
end

# Writes result.csv (semicolon separated) with one row per entry of +list+
# that has a data/<cnpj>.json file, then prints a summary.
#
# The header now matches the rows (same separator, same 15 columns) and the
# CSV library quotes values that contain the separator. Files that are not
# valid JSON are logged and skipped instead of aborting the export.
#
# @param list [Array<String>] CNPJs to export
# @return [nil]
def export_results(list)
  total = 0
  csv = CSV.generate(col_sep: ';') do |out|
    out << EXPORT_COLUMNS
    list.each do |pj|
      data = "data/#{pj}.json"
      next unless File.exist?(data)

      puts pj
      begin
        json = JSON.parse(File.read(data))
      rescue JSON::ParserError => e
        log "=> skipping #{data}: #{e.message}", true
        next
      end
      # Blank values become nil so they are written as empty cells, not "".
      out << export_row(pj, json).map { |value| value.to_s.empty? ? nil : value }
      total += 1
    end
  end
  File.write("result.csv", csv)

  puts "=" * 20
  puts "PJs: #{list.count}".blue
  puts "Exported: #{total}".green
  puts "=" * 20
end
# Entry point: scrape every CNPJ listed in pjs.txt (one per line) using eight
# worker threads, then export whatever was collected to result.csv.
# Blank lines and duplicates are dropped so no CNPJ is fetched twice at once.
pjs = File.read('pjs.txt').split("\n").map(&:strip).reject(&:empty?).uniq
# pjs = cnpjs_from_csv
Parallel.each(pjs, in_threads: 8) do |pj|
  scrap(pj)
end
export_results(pjs)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment