Last active
February 23, 2017 13:11
-
-
Save rafapolo/8dc81701fa19bb129bd7fae0cf0ca7f0 to your computer and use it in GitHub Desktop.
pega um pega geral
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby | |
require 'mysql2' | |
require 'mechanize' | |
require 'parallel' | |
require 'byebug' | |
require 'awesome_print' | |
require 'colorize' | |
require 'csv' | |
# Proxy pool: one "host:port" entry per line of scrap/proxies.txt.
# Entries are stripped and blank lines dropped, because an empty entry splits
# into no host and no port, so get_from_api would call set_proxy with nils.
PROXIES = File.read('scrap/proxies.txt').split("\n").map(&:strip).reject(&:empty?).freeze
# Proxies already discarded (rate limited or failed); get_from_api skips them.
@used_proxies = []
# Opens a new MySQL connection to the donors database.
#
# Every setting defaults to the value this script has always used and can be
# overridden with the TSE_DB_HOST, TSE_DB_USER, TSE_DB_PASSWORD and
# TSE_DB_NAME environment variables, so credentials need not live in the source.
#
# @return [Mysql2::Client] a fresh connection on every call; the caller owns it
def db
  Mysql2::Client.new(
    host: ENV.fetch('TSE_DB_HOST', 'localhost'),
    username: ENV.fetch('TSE_DB_USER', 'root'),
    password: ENV.fetch('TSE_DB_PASSWORD', 'semsenha'),
    database: ENV.fetch('TSE_DB_NAME', 'tse')
  )
end
# Prints a message to stdout, in blue for information and red for errors.
#
# @param msg [#to_s] text to print; coerced with to_s so nil or a non-String
#   value can be logged without raising
# @param error [Boolean] when true the message is printed in red
# @return [nil]
def log(msg, error=false)
  puts msg.to_s.colorize(error ? :red : :blue)
end
# Fetches the record for a CNPJ from the receitaws.com.br API.
#
# When use_proxy is true the request goes through a random entry of PROXIES
# that is not yet listed in @used_proxies. A proxy that gets the rate-limit
# reply, or whose request raises an error containing "Failed", is added to
# @used_proxies and the lookup is handed back to scrap, which retries and
# stores the result itself; this call then returns false. Errors containing
# "expired" or "Timeout" are recorded in doadores_meta (timeout=1).
#
# @param cnpj [#to_s] identifier to look up; also matched against doadores_meta.cpf
# @param use_proxy [Boolean] route the request through a proxy (default true)
# @return [String, false] the raw response body, or false when nothing usable
#   was fetched by this call
def get_from_api(cnpj, use_proxy=true)
  locked = "Too many requests, please try again later."
  proxy_now = nil
  http = Mechanize.new
  if use_proxy
    proxy_now = (PROXIES - @used_proxies).sample
    unless proxy_now
      # Every proxy has been discarded: report it instead of crashing on nil.split.
      log "=> no unused proxy left for #{cnpj}", true
      return false
    end
    host, port = proxy_now.split(':')
    http.set_proxy host, port
  end
  http.user_agent_alias = Mechanize::AGENT_ALIASES.keys.sample # random
  http.keep_alive = false
  http.open_timeout = 5
  http.read_timeout = 5

  log "=> #{cnpj} with #{proxy_now};\n"
  json = http.get("http://www.receitaws.com.br/v1/cnpj/#{cnpj}").body
  return false unless json

  if json == locked
    log locked, true
    # change proxy & do again
    @used_proxies << proxy_now if use_proxy
    scrap cnpj
    return false
  end

  ap json # awesome print
  json
rescue StandardError => e
  # StandardError rather than Exception, so Ctrl-C and exit are not swallowed.
  log e.message, true
  if e.message.include?("Failed")
    # change proxy & do again (never record nil when no proxy was in use)
    @used_proxies << proxy_now if proxy_now
    scrap cnpj
  elsif e.message.include?("expired") || e.message.include?("Timeout")
    con = db
    begin
      con.query("UPDATE doadores_meta SET timeout=1 WHERE cpf='#{con.escape(cnpj.to_s)}'")
    ensure
      con.close
    end
    log "=> timeout for #{cnpj}", true
  end
  false
end
# Looks up one CNPJ through get_from_api and stores the raw response in
# doadores_meta.result for the row whose cpf matches.
#
# Nothing is written when get_from_api returns a falsy value. Errors are
# logged, not raised.
#
# @param cnpj [#to_s] identifier to fetch and to match against doadores_meta.cpf
# @return [Object, nil] not meaningful; the callers in this file ignore it
def scrap(cnpj)
  con = nil
  json = get_from_api(cnpj)
  return unless json

  # A new connection per request, since this runs in parallel threads. It is
  # opened only once there is something to store, so that retries nested
  # through get_from_api do not each hold a connection open.
  con = db
  safe_json = con.escape(json)
  safe_cnpj = con.escape(cnpj.to_s)
  con.query("UPDATE doadores_meta SET result='#{safe_json}' WHERE cpf='#{safe_cnpj}'")
rescue StandardError => e
  # StandardError rather than Exception, so Ctrl-C and exit are not swallowed.
  log e.message, true
ensure
  con.close if con
end
# Collect the cpf of every row flagged valid that has no stored result yet,
# then scrape them on 32 threads. The ids are copied into an array and the
# listing connection is closed up front, since scrap opens its own connection.
listing = db
begin
  valid_cnpjs = listing.query('SELECT cpf from doadores_meta where valid=1 and result is NULL').map { |row| row['cpf'] }
ensure
  listing.close
end
Parallel.each(valid_cnpjs, in_threads: 32) do |cnpj|
  scrap(cnpj)
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment