Skip to content

Instantly share code, notes, and snippets.

@rafapolo
Last active August 17, 2020 17:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rafapolo/7c8d0524cc66cc036b771b025ec19d8f to your computer and use it in GitHub Desktop.
Save rafapolo/7c8d0524cc66cc036b771b025ec19d8f to your computer and use it in GitHub Desktop.
#! /bin/ruby
# author: extrapolo.com
# Scrapes the daily min/max temperatures for Nova Friburgo, RJ, Brazil from accuweather.com since 1992.
require "date"

require "selenium-webdriver"
require "byebug"
# Browser setup shared by get_data and crawl through @driver.
#
# The profile has to be created before preferences can be assigned to it; the
# original referenced `profile` without ever defining it.
# NOTE(review): the three preferences are kept exactly as written; whether the
# installed Firefox still honours them has not been verified.
profile = Selenium::WebDriver::Firefox::Profile.new
profile['browser.frames.enabled'] = false
profile['plugin.state.flash'] = 0
profile['network.http.prompt-temp-redirect'] = false

opts = Selenium::WebDriver::Firefox::Options.new(profile: profile)
# Run without a visible window. The explicit argument is used instead of
# `headless!` so the script does not depend on that helper being available.
opts.add_argument("-headless")

# Pass the options so the profile and headless flag actually take effect.
@driver = Selenium::WebDriver.for :firefox, options: opts

# NOTE(review): the Ruby bindings document this value in seconds, which makes
# 6000 a 100-minute limit — confirm whether 6 seconds (6000 ms) was intended.
@driver.manage.timeouts.page_load = 6000
# Dumps every day panel of the page currently loaded in @driver.
#
# Each element with class "monthly-daypanel" becomes one record of the form
# "<panel text>/<year>" followed by a blank line. The record is echoed to
# stdout and appended to raw.txt in the current directory.
#
# The year comes from the `year=` query parameter of the current URL; when the
# URL has no such parameter the year part is left empty.
#
# Returns the list of panel elements that were found.
def get_data
  days = @driver.find_elements(class: "monthly-daypanel")
  return days if days.empty?

  # The URL is identical for every panel, so ask the browser once instead of
  # once per day. Only the digits are kept so that extra query parameters
  # after the year cannot leak into the record.
  ano = @driver.current_url[/year=(\d+)/, 1]

  # Open the output once for the whole page rather than once per panel.
  File.open("raw.txt", "a") do |file|
    days.each do |d|
      data = "#{d.text}/#{ano}\n\n"
      puts data
      file.write(data)
    end
  end
end
# Walks the monthly pages starting at +start_url+, calling get_data on each
# page and following the element with class "is-next" until no such link is
# left. The browser is always closed afterwards, including when a page fails.
#
# start_url - first page to load; defaults to the November 1998 page the
#             script originally started from.
# delay     - seconds to pause around each page load (default 2).
def crawl(start_url = "https://www.accuweather.com/pt/br/nova-friburgo/35548/november-weather/35548?year=1998",
          delay: 2)
  @driver.navigate.to start_url
  get_data
  sleep delay

  loop do
    # find_element raises when nothing matches, so the original loop could
    # never finish normally; find_elements returns an empty list instead.
    nxt = @driver.find_elements(class: "is-next").first
    href = nxt&.attribute(:href)
    break if href.nil? || href.empty?

    puts href
    @driver.navigate.to href
    sleep delay
    get_data
  end
ensure
  # Close the browser even when the crawl aborts half-way.
  @driver&.quit
end
# Turns the raw scrape into one "year, month, day, max, min" line per calendar
# day, printed to stdout, from 1992-01-01 through today. Days with no record in
# the file are printed with "-" for both temperatures.
#
# Records are expected in the shape written by the scraper:
#   "DD/MM\n<max>°\n<min>°/YYYY"
# When the same day occurs more than once, the first occurrence wins.
#
# path - file to read (default "raw.txt").
def parse(path = "raw.txt")
  # Read as UTF-8 explicitly: the pattern contains "°", and a non-UTF-8 locale
  # would otherwise make the match fail. File.read also closes the handle.
  raw = File.read(path, encoding: "UTF-8")

  # Index the file in a single pass instead of rescanning it for every day.
  # Temperatures may have one digit or a minus sign, not only two digits.
  readings = {}
  raw.scan(%r{(\d{2})/(\d{2})\n(-?\d+)°\n(-?\d+)°/(\d{4})}) do |dia, mes, max, min, ano|
    readings[[ano, mes, dia]] ||= [max, min]
  end

  (Date.new(1992, 1, 1)..Date.today).each do |day|
    ano, mes, dia = day.to_s.split("-")
    max, min = readings.fetch([ano, mes, dia], ["-", "-"])
    puts "#{ano}, #{mes}, #{dia}, #{max}, #{min}"
  end
end
# Script entry point: only the parsing step is run; crawl is defined above but
# is not invoked here.
parse
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment