Created
December 28, 2010 22:14
-
-
Save elliotcm/757806 to your computer and use it in GitHub Desktop.
Scrape the Daily Mail website for Richard Littlejohn headlines.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'rubygems' | |
require 'nokogiri' | |
require 'open-uri' | |
# Collects the page at a starting URL plus every page reachable from it by
# following each page's links, and enumerates the headline text found on them.
#
# Relies on Page for loading: each collected object must respond to +links+
# (an enumerable of URL strings), +headlines+ (a collection of objects that
# respond to +content+) and +==+ against a URL string.
class Archive
  include Enumerable

  # Loads the first page and then, recursively, every page it links to.
  #
  # @param starting_url [String] URL handed to Page.new for the first page
  def initialize(starting_url)
    first_page = Page.new(starting_url)
    @pages = [first_page]
    add_links_from(first_page)
  end

  # Yields the text of every headline on every collected page, in the order
  # the pages were collected, with the "RICHARD LITTLEJOHN: " byline removed.
  #
  # @yieldparam headline [String] headline text without the byline
  # @return [Enumerator] when no block is given
  def each
    return enum_for(:each) unless block_given?

    # to_a makes the flattening independent of whether the headline
    # collection implements to_ary.
    @pages.flat_map { |page| page.headlines.to_a }.each do |headline|
      yield headline.content.gsub('RICHARD LITTLEJOHN: ', '')
    end
  end

  private

  # Depth-first walk: for each link on +page+ that has not been collected yet,
  # load it, record it, and immediately follow its own links.
  def add_links_from(page)
    page.links.each do |url|
      next if collected?(url)

      new_page = Page.new(url)
      @pages << new_page
      add_links_from(new_page)
    end
  end

  # True when some collected page reports itself equal to +url+. The page is
  # the receiver of == on purpose, so the page decides how a URL string
  # compares with it.
  def collected?(url)
    @pages.any? { |page| page == url }
  end
end
# A single fetched HTML page. The document is downloaded and parsed when the
# object is constructed; headlines and links are then read from the parsed
# document with CSS selectors.
class Page
  # Base prepended to any URL that is not already absolute.
  PREFIX = "http://www.dailymail.co.uk/news/columnist-322/".freeze

  # @return [String] the absolute URL this page was loaded from
  attr_reader :url

  # Resolves +url+ against PREFIX when it is not an absolute http(s) URL,
  # prints a progress line, then downloads and parses the document.
  #
  # @param url [String] absolute http(s) URL, or a path relative to PREFIX
  def initialize(url)
    # \A anchors at the start of the string (^ would match at any line start)
    # and the scheme separator keeps relative names such as "httpfoo.html"
    # from being mistaken for absolute URLs.
    @url = url.match?(%r{\Ahttps?://}i) ? url : PREFIX + url
    puts "Loading contents of #{@url}"
    # URI.open is the supported open-uri entry point; Kernel#open no longer
    # fetches URLs on Ruby 3.0+. The block form closes the handle after parsing.
    @doc = URI.open(@url) { |io| Nokogiri::HTML(io) }
  end

  # Compares by URL. Another Page is equal when its URL matches; a String is
  # equal when it matches this page's URL either as given or once PREFIX is
  # prepended; anything else is compared with the URL directly.
  #
  # @param other_page [Page, String, Object]
  # @return [Boolean]
  def ==(other_page)
    case other_page
    when Page   then url == other_page.url
    when String then url == other_page || url == PREFIX + other_page
    else url == other_page
    end
  end

  # @return the anchor nodes matched by the headline selectors (memoized)
  def headlines
    @headlines ||= @doc.css('.archived-article-links a, .article h2 a')
  end

  # @return [Array] the href attribute of every pagination anchor (memoized)
  def links
    @links ||= @doc.css('.pagination a.page-number').map { |link| link['href'] }
  end
end
# Script entry point: build the archive from the columnist page and print each
# headline it yields on its own line.
start_url = "http://www.dailymail.co.uk/news/columnist-322/Richard-Littlejohn.html"
Archive.new(start_url).each { |headline| puts headline }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment