Skip to content

Instantly share code, notes, and snippets.

@rafapolo
Last active February 23, 2017 13:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rafapolo/8dc81701fa19bb129bd7fae0cf0ca7f0 to your computer and use it in GitHub Desktop.
Save rafapolo/8dc81701fa19bb129bd7fae0cf0ca7f0 to your computer and use it in GitHub Desktop.
pega um pega geral
#!/usr/bin/ruby
require 'mysql2'
require 'mechanize'
require 'parallel'
require 'byebug'
require 'awesome_print'
require 'colorize'
require 'csv'
# Proxy pool read from scrap/proxies.txt, one "host:port" entry per line.
# Entries are stripped and blank lines dropped so that a stray empty line or a
# CRLF line ending cannot produce an unusable proxy; the list is frozen because
# burnt proxies are tracked separately in @used_proxies, never removed from here.
PROXIES = File.read('scrap/proxies.txt')
              .split("\n")
              .map(&:strip)
              .reject(&:empty?)
              .freeze
# Proxies that were rate-limited or failed to connect. get_from_api appends to
# this list and excludes its members when it picks the next proxy.
@used_proxies = []
# Opens a new connection to the local `tse` MySQL database.
#
# @return [Mysql2::Client] a fresh client; every call creates another connection
def db
  settings = {
    host: 'localhost',
    username: 'root',
    password: 'semsenha',
    database: 'tse'
  }
  Mysql2::Client.new(settings)
end
# Prints a message to stdout in colour: red for errors, blue otherwise.
#
# @param msg [String] text to print
# @param error [Boolean] when truthy the message is rendered in red
# @return [nil] the value returned by `puts`
def log(msg, error=false)
  tint =
    if error
      :red
    else
      :blue
    end
  puts msg.colorize(tint)
end
# Looks up one CNPJ on the receitaws.com.br API, optionally through a randomly
# chosen proxy that has not been marked as used yet.
#
# On a rate-limit reply, or on an error whose message contains "Failed", the
# proxy is added to @used_proxies and the lookup is re-run through `scrap`,
# which stores any result it obtains itself; this call still returns false.
# On an error whose message contains "expired" or "Timeout" the row is flagged
# with timeout=1 in doadores_meta.
#
# @param cnpj [String, Integer] identifier to look up (matched against the `cpf` column)
# @param use_proxy [Boolean] route the request through an entry of PROXIES
# @return [String, false] the raw response body on success, false otherwise
def get_from_api(cnpj, use_proxy=true)
  proxy_now = nil
  http = Mechanize.new
  if use_proxy
    proxy_now = (PROXIES - @used_proxies).sample
    # Every proxy is burnt: stop here instead of crashing on nil.split and
    # having that crash treated like a network error.
    unless proxy_now
      log "=> no unused proxy left for #{cnpj}", true
      return false
    end
    host, port = proxy_now.split(':')
    http.set_proxy host, port
  end
  http.user_agent_alias = Mechanize::AGENT_ALIASES.to_a.sample[0] # random
  http.keep_alive = false
  http.open_timeout = 5
  http.read_timeout = 5
  log "=> #{cnpj} with #{proxy_now};\n"
  page = http.get "http://www.receitaws.com.br/v1/cnpj/#{cnpj}"
  locked = "Too many requests, please try again later."
  if (json = page.body)
    unless json == locked
      ap json # awesome print
      return json
    end
    log locked, true
    # change proxy & do again
    @used_proxies << proxy_now if proxy_now
    scrap cnpj
  end
  false
rescue StandardError => e # not Exception: Ctrl-C and exit must still stop the script
  log e.message, true
  if e.message.include?("Failed")
    # change proxy & do again; never record nil when no proxy was in use
    @used_proxies << proxy_now if proxy_now
    scrap cnpj
  elsif e.message.include?("expired") || e.message.include?("Timeout")
    con = db
    begin
      con.query("UPDATE doadores_meta SET timeout=1 WHERE cpf='#{con.escape(cnpj.to_s)}'")
    ensure
      con.close # db opens a new connection per call; release it right away
    end
    log "=> timeout for #{cnpj}", true
  end
  false
end
# Fetches the API record for one CNPJ and, when a result comes back, stores the
# raw response in doadores_meta.result for the matching row.
# Any StandardError is logged instead of raised, so a failing row does not
# propagate out of the worker that called this.
#
# @param cnpj [String, Integer] identifier to look up (matched against the `cpf` column)
# @return [Object, nil] the value of the UPDATE query, or nil when nothing was stored
def scrap(cnpj)
  json = get_from_api(cnpj)
  return unless json

  # Connect only once there is something to store. A connection of our own is
  # used for every request since this runs in Parallel threads.
  con = db
  begin
    safe_json = con.escape(json)
    safe_cnpj = con.escape(cnpj.to_s)
    con.query("UPDATE doadores_meta SET result='#{safe_json}' WHERE cpf='#{safe_cnpj}'")
  ensure
    con.close # db never reuses clients, so an unclosed one is a leaked connection
  end
rescue StandardError => e # not Exception: Ctrl-C and exit must still stop the script
  log e.message, true
end
# Entry point: gather the identifier of every valid row that has no stored
# result yet, then scrape them concurrently on 32 threads.
pending_rows = db.query('SELECT cpf from doadores_meta where valid=1 and result is NULL')
pending_cnpjs = pending_rows.map { |row| row['cpf'] }
Parallel.each(pending_cnpjs, in_threads: 32) { |cnpj| scrap(cnpj) }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment