@rafapolo
Last active April 15, 2017 22:22
Fetches CNPJ data as JSON from the ReceitaWS API through rotating proxies and exports the results to a CSV file.
#!/usr/bin/ruby
#require 'mysql2'
require 'mechanize'
require 'parallel'
require 'byebug'
require 'awesome_print'
require 'colorize'
require 'csv'
require 'json'
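# Assumes the gems above are installed, e.g.:
#   gem install mechanize parallel byebug awesome_print colorize
# (csv and json come with Ruby's standard library)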
# https://raw.githubusercontent.com/opsxcq/proxy-list/master/list.txt
PROXIES = File.read('proxies.txt').split("\n") # one "host:port" proxy per line
@used_proxies = []

def log(msg, error=false)
  puts msg.colorize(error ? :red : :blue)
end

def get_from_api(cnpj, use_proxy=true)
  begin
    http = Mechanize.new
    if use_proxy
      # pick a proxy that has not been marked as used/failed yet
      proxy_now = (PROXIES - @used_proxies).sample
      proxy = proxy_now.split(':')
      http.set_proxy proxy[0], proxy[1].to_i
    end
    http.user_agent_alias = Mechanize::AGENT_ALIASES.keys.sample # random user agent
    http.keep_alive = false
    http.open_timeout = 5
    http.read_timeout = 5
    log "=> #{cnpj} with #{proxy_now}"
    page = http.get "http://www.receitaws.com.br/v1/cnpj/#{cnpj}"
    locked = "Too many requests, please try again later."
    if (json = page.body)
      if json == locked
        log locked, true
        # rate limited: change proxy & try again
        @used_proxies << proxy_now if use_proxy
        scrap cnpj
      else
        ap json # awesome_print
        return json
      end
    end
  rescue StandardError => e
    log e.message, true
    # puts "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
    if e.message.index("Failed")
      # connection failed: change proxy & try again
      @used_proxies << proxy_now if proxy_now
      scrap cnpj
    elsif e.message.index("expired") || e.message.index("Timeout")
      log "=> timeout for #{cnpj}", true
      scrap cnpj # again!
    end
  end
  false
end

def scrap(cnpj)
  output = "data/#{cnpj}.json"
  return if File.exist? output # already fetched
  begin
    # con = db # renew db for every request since it runs in Parallel
    if (json = get_from_api(cnpj))
      # safe_json = con.escape(json)
      File.write(output, json)
    end
  rescue StandardError => e
    log e.message, true
  end
end

def cnpjs_from_csv
  pjs = []
  CSV.foreach("nodes.csv") do |row|
    pj = row[4]
    if pj && pj != ''
      pjs << pj.gsub(/\D/, '') # keep digits only
    end
  end
  pjs.sort.uniq # not uniq!, which returns nil when there are no duplicates
end

def export_results(list)
  # header matches the 15 semicolon-separated fields written below
  csv = "cnpj;atividade;nome;telefone;responsavel;bairro;logradouro;numero;cep;abertura;capital;atividade_codigo;situacao;natureza_juridica;municipio\n"
  total = 0
  list.each do |pj|
    data = "data/#{pj}.json"
    if File.exist? data
      puts pj
      json = JSON.parse(File.read(data))
      if json['status'] == 'ERROR' # invalid CNPJ
        d1 = d2 = d3 = d4 = d5 = d6 = d7 = d8 = d9 = d10 = d11 = d12 = d13 = d14 = ""
      else
        d1 = json['atividade_principal'][0]['text']
        d2 = json['nome']
        d3 = json['telefone']
        d4 = ''
        d4 = json['qsa'][0]['nome'] if json['qsa'] && json['qsa'][0]
        d5 = json['bairro']
        d6 = json['logradouro']
        d7 = json['numero']
        d8 = json['cep']
        d9 = json['abertura']
        d10 = json['capital_social']
        d11 = json['atividade_principal'][0]['code']
        d12 = json['situacao']
        d13 = json['natureza_juridica']
        d14 = json['municipio']
      end
      csv += "#{pj};#{d1};#{d2};#{d3};#{d4};#{d5};#{d6};#{d7};#{d8};#{d9};#{d10};#{d11};#{d12};#{d13};#{d14}\n"
      total += 1
    end
  end
  File.write("result.csv", csv)
  puts "=" * 20
  puts "PJs: #{list.count}".blue
  puts "Exported: #{total}".green
  puts "=" * 20
end

pjs = File.read('pjs.txt').split("\n")
# pjs = cnpjs_from_csv
Parallel.each(pjs, in_threads: 8) do |pj|
  scrap(pj)
end
export_results(pjs)
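
# Optional re-read of the export (a minimal sketch, assuming the semicolon
# separator and header written by export_results above); uncomment to use:
#
# CSV.foreach("result.csv", col_sep: ";", headers: true) do |row|
#   puts "#{row['cnpj']} => #{row['nome']}"
# end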