@ahmdrefat
Created June 13, 2012 19:21
Script to scrape the front-page images of newspapers from http://kiosko.net/
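A minimal usage sketch (assuming the file is saved as scraper.rb, and that the rmagick gem, ImageMagick, and Ghostscript are available for the El Pais PDF conversion):

# gem install rmagick
# ruby scraper.rb
# downloaded front pages are written under ./pics/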
require "fileutils"
require "open-uri"
require "RMagick"
class Scraper
  def self.get_issues
    # the dates of the issues to scrape
    year = 2010
    month = 5
    start_day = 3
    end_day = 7
    # URIs of the issues
    newspapers_issues_paths = Scraper.build_kiosko_issues(year, month, start_day, end_day)
    # newspapers_issues_paths = Scraper.build_newyork_times_issues(year, month, start_day, end_day)
    # newspapers_issues_paths = Scraper.build_elpais_issues(year, month, start_day, end_day)
    Scraper.scrape newspapers_issues_paths
    puts "Scraping is done"
  end
  # the scrape method takes the URIs of the issues and scrapes them
  def self.scrape(newspapers_issues_paths)
    paths = newspapers_issues_paths
    paths.each do |path|
      begin
        open(path) do |source|
          # pass the path of the issue and the issue itself to the save method
          Scraper.save_kiosko_issues path, source
          # Scraper.save_newyork_times_issues path, source
          # Scraper.save_elpais_issues path, source
        end
      rescue => e
        puts e.message + " => " + path
      end
    end
  end
  # formats the issue dates for kiosko.net as "YYYY/MM/DD", based on the specified year, month, start day, and end day
  def self.issues_dates(year, month, start_day, end_day)
    day = start_day
    days = []
    # assume there is at least one issue
    number_of_issues = 1
    # calculate the number of issues
    number_of_issues = end_day - start_day + 1 unless end_day == 0
    number_of_issues.times do
      # zero-pad the day and month when formatting the date part of the image names
      f_day = format("%02d", day)
      f_month = format("%02d", month)
      days << "#{year}/#{f_month}/#{f_day}"
      day += 1
    end
    days
  end
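  # Illustration (derived from the method above, not part of the original script):
  #   Scraper.issues_dates(2010, 5, 3, 5) #=> ["2010/05/03", "2010/05/04", "2010/05/05"]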
  # builds the URIs of the issues based on the passed dates
  # the script can scrape back to 2008, 2009, and 2010, but most of the newspapers don't exist for those years; it also covers 2011 and 2012
  def self.build_kiosko_issues(year, month, start_day, end_day)
    FileUtils.mkdir "pics/kiosko" unless File.directory? "pics/kiosko"
    # a sample of the countries and their newspapers from http://kiosko.net/
    #   es => Spain
    #   de => Germany
    #   fr => France
    #   it => Italy
    #   uk => United Kingdom
    #   us => USA
    #   . . . . . .
    kiosko_newspapers = {
      "es" => ["elpais", "abc"],
      "de" => ["faz", "bild"],
      "fr" => ["lemonde", "lacroix"],
      "it" => ["corriere_della_sera", "ilmessaggero"],
      "uk" => ["the_times"],
      "us" => ["wsj", "newyork_times", "usa_today"]
    }
    domain = "http://img.kiosko.net/"
    issues = Scraper.issues_dates year, month, start_day, end_day
    newspapers_issues = []
    newspapers_issues_paths = []
    # format the image names as the country code, then the newspaper name, with the '.750.jpg' extension
    kiosko_newspapers.each do |country, newspapers|
      newspapers.each do |newspaper|
        newspapers_issues << "/#{country}/#{newspaper}.750.jpg"
        FileUtils.mkdir "pics/kiosko/#{newspaper}" unless File.directory? "pics/kiosko/#{newspaper}"
      end
    end
    # construct the full URI of each image
    newspapers_issues.each do |newspaper|
      issues.each do |issue|
        newspapers_issues_paths << domain + issue + newspaper
      end
    end
    newspapers_issues_paths
  end
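  # Illustration (derived from the code above): a URI built for elpais (Spain) on 2010/05/03 looks like
  #   http://img.kiosko.net/2010/05/03/es/elpais.750.jpg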
  # saves each image in its place, with a name that contains the date of the issue
  def self.save_kiosko_issues(path, source)
    newspaper_name = path.split('/').last
    # resolution of the produced image is [750x1072]
    open("pics/kiosko/#{path.split('/')[-1].split('.')[0]}/" + "#{path.split('/')[-3]}-#{path.split('/')[-4]}-#{path.split('/')[-5]}-" + newspaper_name, "wb") do |file|
      file.write(source.read)
      puts "done => #{path.split('/')[-3]}-#{path.split('/')[-4]}-#{path.split('/')[-5]}-" + newspaper_name
    end
  end
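  # Illustration (derived from the code above): the example URI from build_kiosko_issues is saved as
  #   pics/kiosko/elpais/03-05-2010-elpais.750.jpg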
  # builds the URIs of the issues based on the passed dates
  def self.build_newyork_times_issues(year, month, start_day, end_day)
    FileUtils.mkdir "pics/newyork_times" unless File.directory? "pics/newyork_times"
    domain = "http://www.nytimes.com/images/"
    newspapers_issues_paths = []
    issues = Scraper.issues_dates year, month, start_day, end_day
    issues.each do |issue|
      newspapers_issues_paths << domain + issue + "/nytfrontpage/scan.jpg"
    end
    newspapers_issues_paths
  end
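  # Illustration (derived from the code above): a New York Times URI built for 2010/05/03 looks like
  #   http://www.nytimes.com/images/2010/05/03/nytfrontpage/scan.jpg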
  # saves each image in its place, with a name that contains the date of the issue
  def self.save_newyork_times_issues(path, source)
    # resolution of the produced image is [348x640]
    open("pics/newyork_times/" + "#{path.split('/')[-3]}-#{path.split('/')[-4]}-#{path.split('/')[-5]}", "wb") do |file|
      file.write(source.read)
      puts "done => #{path.split('/')[-3]}-#{path.split('/')[-4]}-#{path.split('/')[-5]}"
    end
  end
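  # Illustration (derived from the code above): the example New York Times URI is saved, without an extension, as
  #   pics/newyork_times/03-05-2010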
  # builds the URIs of the issues based on the passed dates
  def self.build_elpais_issues(year, month, start_day, end_day)
    # scrape a PDF from http://elpais.com/ and convert it to JPG
    FileUtils.mkdir "pics/elpais" unless File.directory? "pics/elpais"
    # the first available issue is dated 2012/03/01
    issues = Scraper.issues_dates year, month, start_day, end_day
    newspapers_issues_paths = []
    issues.each do |d|
      newspapers_issues_paths << "http://srv00.epimg.net/pdf/elpais/1aPagina/" + d[0..7] + "ep-" + d[0..3] + d[5..6] + d[8..11] + ".pdf"
    end
    newspapers_issues_paths
  end
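  # Illustration (derived from the code above): an El Pais URI built for 2012/03/01 looks like
  #   http://srv00.epimg.net/pdf/elpais/1aPagina/2012/03/ep-20120301.pdf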
  # converts each downloaded PDF file to JPG and saves the image in its place, with a name that contains the date of the issue
  def self.save_elpais_issues(path, source)
    file_name = "#{path.split('/')[-3]}-#{path.split('/')[-2]}-#{path.split('/')[-1][9..10]}"
    open("pics/elpais/" + file_name, "w+b") do |file|
      file.write(source.read)
    end
    # read the PDF only after the file handle has been closed and flushed to disk
    issue_pdf = Magick::ImageList.new("pics/elpais/#{file_name}")
    # resolution of the produced image is [765x1133]
    issue_pdf.write "pics/elpais/#{file_name}.jpg"
    File.delete "pics/elpais/#{file_name}"
    puts "done => #{file_name}"
  end
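  # Illustration (derived from the code above): the example El Pais PDF is converted and saved as
  #   pics/elpais/2012-03-01.jpg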
end
FileUtils.mkdir "pics" unless File.directory? "pics"
Scraper.get_issues