Skip to content

Instantly share code, notes, and snippets.

@masciugo
Last active August 29, 2015 14:22
Show Gist options
  • Save masciugo/f2f6048cf9a5541cd0e4 to your computer and use it in GitHub Desktop.
Save masciugo/f2f6048cf9a5541cd0e4 to your computer and use it in GitHub Desktop.
download quotidie
#!/usr/bin/env ruby
require 'date'
require 'fileutils'
require 'open-uri'

require 'byebug'
require 'mechanize'
# One newspaper ("Zeitung") the script knows how to look for.
#
# final_name - name used in progress messages and in the saved file name.
# regexp     - pattern used to recognise the newspaper's entry on the listing page.
# move       - when true, the finished file is moved to the caller-supplied
#              directory instead of staying in the current one.
# uri        - download location; starts out nil and is filled in once a
#              matching link has been found.
class Zeitung
  attr_reader :final_name, :regexp, :move
  attr_accessor :uri

  def initialize(final_name, regexp, move = false)
    @uri = nil
    @move = move
    @regexp = regexp
    @final_name = final_name
  end
end
# The newspapers to download, as [final name, listing pattern, move?] rows.
# Order matters: the list is processed top to bottom.
# NOTE(review): some patterns overlap (e.g. /corriere della sera/i also
# matches "Corriere della Sera Milano") - confirm the listing page keeps such
# entries in an order that makes the intended one match first.
zeitungs = [
  ["Milano Finanza",       /milano finanza/i,             true],
  ["Corriere della Sera",  /corriere della sera/i,        true],
  ["Corriere Economia",    /corriere economia/i,          true],
  ["Gazzetta dello Sport", /gazzetta dello sport/i,       true],
  ["Fatto Quotidiano",     /fatto quotidiano/i,           true],
  ["Repubblica",           /repubblica/i,                 true],
  ["Sole 24 Ore",          /sole 24 ore/i,                true],
  ["Stampa",               /stampa/i,                     true],
  ["Centro",               /centro/i,                     true],
  ["Foglio",               /foglio/i,                     false],
  ["Repubblica Roma",      /Rep.locale.+\-RM/i,           false],
  ["Corriere Milano",      /Corriere della Sera Milano/i, false],
  ["Giornale",             /giornale/i,                   false],
  ["Libero",               /libero/i,                     false]
].map { |name, pattern, move| Zeitung.new(name, pattern, move) }
# Downloads one newspaper and moves the finished file into place.
#
# The download URL is rebuilt over HTTPS from the host and path of
# +zeitung.uri+, with the query string set to "directDownload=true". The body
# is streamed to "<filename>.temp" in the current directory and only moved to
# its final location once the transfer has completed; the temporary file is
# removed again if anything goes wrong.
#
# zeitung     - object responding to +final_name+ and +uri+ (the uri must
#               respond to +host+ and +path+).
# filename    - name of the file to create.
# destination - directory the finished file is moved into (with or without a
#               trailing slash); '' (the default) or nil keeps it in the
#               current directory.
#
# Errors are reported on standard output rather than raised, so one failed
# download does not abort the remaining ones. An Interrupt (Ctrl-C) terminates
# the script.
def download_zeitung(zeitung, filename, destination = '')
  puts "zeitung '#{zeitung.final_name}' is being downloaded..."
  temp_path = "#{filename}.temp"
  uri = nil
  begin
    uri = URI::HTTPS.build(host: zeitung.uri.host, path: zeitung.uri.path, query: "directDownload=true")
    File.open(temp_path, "wb") do |saved_file|
      # Use the URI's own #open (provided by open-uri): Kernel#open stopped
      # accepting URIs in Ruby 3.0. Streaming avoids holding the file in memory.
      uri.open("rb") { |remote| IO.copy_stream(remote, saved_file) }
    end
    target = destination.to_s.empty? ? filename : File.join(destination, filename)
    FileUtils.mv(temp_path, target)
  rescue Interrupt
    exit
  rescue StandardError => e
    puts "problem with '#{filename}' at #{uri}: #{e.message}"
  ensure
    # No-op after a successful move; cleans up a partial download otherwise.
    FileUtils.rm_f(temp_path)
  end
end
# Logs in to the site and downloads every requested newspaper for one day.
#
# zeitungs - Array of Zeitung objects to look for.
# password - the password of the month, entered into the site's login form.
# date     - the day to download, as a string Date.parse accepts
#            (e.g. '2015-06-01'); nil means today.
# where    - directory into which newspapers flagged with +move+ are moved;
#            nil leaves every file in the current directory.
#
# Progress, missing newspapers and the elapsed time are printed to standard
# output.
def download_zeitungs(zeitungs, password, date, where)
  t = Time.now
  date = date.nil? ? Date.today : Date.parse(date)
  date_string = date.strftime("%Y-%m-%d")
  puts "Downloading zeitungs for #{date_string}"
  puts "-----------------------------------------------------"
  agent = Mechanize.new
  agent.get('http://zeitung2.tumblr.com') do |login_page|
    # Fill in the authentication form with the caller-supplied password.
    form = login_page.forms.first
    form.password = password
    # For reasons unknown, a second, pre-filled form comes back and has to be
    # submitted as well.
    form2 = agent.submit(form).forms.first
    # The page listing the available days.
    quotidie_page = agent.submit(form2)
    # The link to the requested day's newspaper list is labelled dd.mm.yyyy.
    day_label = date.strftime("%d.%m.%Y")
    day_link = quotidie_page.link_with(text: day_label)
    unless day_link
      puts "no newspaper page found for #{day_label}: CANNOT FIND A LINK "
      next
    end
    agent.get(day_link.href) do |day_page|
      # Keep only the links whose grandparent node sits at this exact path.
      all_quotidie_links = day_page.links.select do |link|
        link.attributes.parent.parent.path == "/html/body/section/section/div/article/div/section[1]/div/div"
      end
      zeitungs.each do |z|
        # The pattern is matched against the node two siblings before the link
        # (presumably the newspaper's title - confirm against the page markup).
        link = all_quotidie_links.find { |l| z.regexp.match(l.attributes.previous_sibling.previous_sibling) }
        if link
          z.uri = link.uri
          filename = "#{z.final_name} - #{date_string}.pdf"
          destination = (z.move && where) || ''
          download_zeitung(z, filename, destination)
        else
          # No link on the page for this newspaper.
          puts "zeitung '#{z.final_name}' not downloaded: CANNOT FIND A LINK "
        end
      end
    end
  end
  puts "Time elapsed: #{Time.now - t}s"
  puts
  puts
end
# Other ways to call it:
# download_zeitungs(zeitungs, 'xxxx', nil, 'Dropbox/quotidie/')
# download_zeitungs(zeitungs, 'xxxx', '2015-06-01', nil)
#
# Start the download only when this file is executed as a script, so the
# class and methods above can be loaded without triggering any network access.
download_zeitungs(zeitungs, 'xxxx', nil, nil) if __FILE__ == $PROGRAM_NAME
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment