Skip to content

Instantly share code, notes, and snippets.

@buccolo
Last active December 28, 2015 23:59
Show Gist options
  • Save buccolo/7583386 to your computer and use it in GitHub Desktop.
Save buccolo/7583386 to your computer and use it in GitHub Desktop.
#!/usr/bin/env ruby
require 'scraperwiki'
require 'rubygems'
require 'active_support/time'
require 'nokogiri'
# Scrapes apartment-for-sale listings from zap.com.br for a fixed list of
# Sao Paulo neighbourhoods and stores one row per listing in the ScraperWiki
# SQLite store, keyed by the listing URL.
#
# Relies on String#clean (defined elsewhere in this file) to strip unit
# labels and thousands separators before numeric conversion.
class Scraper
  # Listings whose parsed 'area' is below this value are not saved.
  SIZE_CONSTRAINT = 40
  NEIGHBOURHOODS = %w[ moema itaim vl-olimpia vl-mariana brooklin pinheiros perdizes jardins ].freeze
  # Search URL template; %s is replaced by the neighbourhood slug.
  BASE_URL = 'http://www.zap.com.br/imoveis/sao-paulo+sao-paulo+%s/apartamento-padrao/venda/?tipobusca=rapida&rangeValor=0-400000&foto=1&ord=dataatualizacao'.freeze
  # Encoding passed to Nokogiri for every fetched page.
  PAGE_ENCODING = 'ISO-8859-1'.freeze

  # Walks every neighbourhood's result pages and saves the listings that
  # have not been stored yet.
  #
  # @return [Array<String>] NEIGHBOURHOODS
  def start
    NEIGHBOURHOODS.each { |bairro| scrape_neighbourhood(bairro) }
  end

  # Tells whether a listing URL is already present in the swdata table.
  #
  # The URL is bound as a query parameter instead of being interpolated into
  # the SQL text, so quotes inside a scraped URL cannot alter the query.
  #
  # @param apartment_url [String] listing URL used as the unique key
  # @return [Boolean] true when at least one stored row has that URL; false
  #   when there is none or when the lookup itself fails
  def scraped?(apartment_url)
    rows = ScraperWiki.select('* from swdata where url = ?', [apartment_url])
    !rows.empty?
  rescue StandardError
    # empty data set, table not found
    false
  end

  private

  # Downloads +url+ and parses it as HTML.
  def fetch_page(url)
    Nokogiri::HTML(ScraperWiki.scrape(url), nil, PAGE_ENCODING)
  end

  # Follows the result pages of one neighbourhood until a page has no
  # listings or no "next page" link.
  def scrape_neighbourhood(bairro)
    doc = fetch_page(BASE_URL % bairro)
    index = 0
    until (items = doc.css('.content-minificha')).empty?
      items.each do |item|
        index += 1
        process_listing(item, bairro, index)
      end

      next_link = doc.at_css('a[class=pagNext]')
      next_page = next_link && next_link['href']
      # Without this check the last page would be fetched again (or a nil
      # URL would be requested) and the loop would never finish.
      break if next_page.nil? || next_page.empty?

      puts next_page
      doc = fetch_page(next_page)
    end
  end

  # Builds and saves the row for a single search-result item, unless it was
  # stored before or does not satisfy SIZE_CONSTRAINT.
  def process_listing(item, bairro, index)
    link = item.at_css('a.valorOferta')
    return if link.nil? # nothing to key the row on

    data = {}
    data['url'] = link['href']
    data['bairro'] = bairro
    # Last whitespace-separated token of the item's date line, stored as-is.
    data['data'] = item.css('div.itemData span').text.strip.split.last
    return if scraped?(data['url'])

    itempage = fetch_page(data['url'])
    street = itempage.at_css('span.street-address')
    data['rua'] = street.text if street
    price = itempage.at_css('.priceFull')
    data['preco'] = price.text.clean.to_f if price
    read_details(itempage, data)

    # A listing with no area entry cannot be checked against the size
    # constraint, so it is skipped just like one that is too small.
    area = data['area']
    return if area.nil? || area < SIZE_CONSTRAINT

    puts "#{index}: #{data['url']}"
    data['yield'] = (data['aluguel'].to_f / data['preco'].to_f) * 100
    lat, lon = map_coordinates(itempage)
    data['lat'] = lat.to_f
    data['lon'] = lon.to_f
    ScraperWiki.save_sqlite(['url'], data)
  end

  # Copies the numeric entries of the listing's detail list into +data+.
  # Each <li> is read as a label span (first) and a value span (last).
  def read_details(itempage, data)
    itempage.css('ul.fc-detalhes li').each do |detail|
      spans = detail.css('span')
      next if spans.empty?

      key = detail_key(spans.first.text)
      next unless key

      value = spans.last.text
      # The area value has all whitespace removed before cleaning.
      value = value.gsub(/\s+/, '') if key == 'area'
      data[key] = value.clean.to_f
    end
  end

  # Maps a detail label to the column it is stored under; nil when the label
  # is not one of the known ones. The first matching pattern wins.
  def detail_key(label)
    case label
    when /dormit.rios/ then 'dorms'
    when /vagas/ then 'vagas'
    when /valor/ then 'preco_metro'
    when /.rea.*til/ then 'area'
    when /condom.*/ then 'cond'
    when /IPTU.*/ then 'iptu'
    when /pre.* de aluguel.*/ then 'aluguel'
    end
  end

  # Extracts the two "_"-separated values from the file name found in the
  # fourth "/"-separated piece of the map image URL (".png" stripped).
  # NOTE(review): presumably latitude and longitude, in that order - confirm.
  #
  # @return [Array<String>] the two parts, or [] when there is no usable URL
  def map_coordinates(itempage)
    map_img = itempage.at_css('img.fichaMapa')
    src = map_img && map_img['src']
    segment = src && src.split('/')[3]
    return [] unless segment

    segment.split('.png')[0].to_s.split('_')
  end
end
class String
  # Returns a copy of the string with every unit label / separator token
  # listed below removed, then with leading and trailing whitespace stripped.
  # Tokens are removed one after another, in the order they are listed.
  def clean
    tokens = %w[ quartos/dts quarto/dt . R$ m2 vagas vaga]
    without_tokens = tokens.reduce(self) do |text, token|
      # A String pattern is matched literally, so "." and "R$" are not
      # treated as regular expressions.
      text.gsub(token, '')
    end
    without_tokens.strip
  end
end
# Script entry point: run the scraper over every entry in Scraper::NEIGHBOURHOODS.
Scraper.new.start
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment