Created
January 27, 2010 00:44
-
-
Save jkells/287423 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'curb' | |
require 'nokogiri' | |
# One scraped result belonging to a push definition.
#
# A record starts out holding the push definition's pattern strings and is
# then filled in by replacing each %N% placeholder (N being a field
# definition's field_number) with text extracted from a container node.
class Scrape < ActiveRecord::Base
  belongs_to :push_definition

  # Class method to scrape a push_definition.
  #
  # Fetches push_definition.url_pattern with curb (following up to 5
  # redirects) and passes the response body to parse_full_document.
  def self.scrape(push_definition)
    curl = Curl::Easy.new
    curl.follow_location = true
    curl.max_redirects = 5
    curl.connect_timeout = 30
    curl.dns_cache_timeout = 30
    curl.timeout = 120
    curl.url = push_definition.url_pattern

    # Scrape the site
    curl.perform
    # TODO: handle any HTTP errors etc
    parse_full_document(curl.body_str, push_definition)
  end

  # Parse the result after scraping the URL. For each container matched by
  # push_definition.container_xpath we create, fill in and save a new
  # Scrape instance.
  def self.parse_full_document(body_str, push_definition)
    xml_doc = Nokogiri::HTML(body_str)
    containers = xml_doc.xpath(push_definition.container_xpath)
    containers.each do |container|
      scrape = Scrape.new
      scrape.push_definition = push_definition
      scrape.set_initial_result
      scrape.parse_container(container)
      scrape.save
    end
  end

  # Seed every result attribute with the matching pattern from the push
  # definition; parse_container later replaces the placeholders in them.
  def set_initial_result
    self.push_id = push_definition.push_id_pattern
    self.title = push_definition.title_pattern
    self.detail = push_definition.detail_pattern
    self.link = push_definition.link_pattern
    self.image_url = push_definition.image_url_pattern
  end

  # Parse a container by extracting all fields and inserting them into the
  # result fields in place of their %N% placeholders.
  #
  # A field whose XPath matches nothing inside the container is treated as
  # an empty string instead of raising on nil.
  def parse_container(container)
    # Fetch fields
    fields = push_definition.field_definitions
    fields.each do |field|
      node = container.xpath(field.xpath).first
      result = node ? node.content : ''

      # Transform fields with regexp
      # field.field_transforms.each do |transform|
      #   transform.expression
      # end

      # A plain String pattern is matched literally, so no Regexp is needed.
      placeholder = "%#{field.field_number}%"
      self.push_id = fill_placeholder(push_id, placeholder, result)
      self.title = fill_placeholder(title, placeholder, result)
      self.detail = fill_placeholder(detail, placeholder, result)
      self.link = fill_placeholder(link, placeholder, result)
      self.image_url = fill_placeholder(image_url, placeholder, result)
    end
  end

  private

  # Replace every occurrence of placeholder in text with value.
  #
  # The block form of gsub inserts value verbatim; passing it as the
  # replacement argument would interpret backslash sequences in scraped
  # content as back-references. A nil text (pattern not set) is returned
  # unchanged.
  def fill_placeholder(text, placeholder, value)
    return text if text.nil?

    text.gsub(placeholder) { value }
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment