Skip to content

Instantly share code, notes, and snippets.

@jkells
Created January 27, 2010 00:44
Show Gist options
  • Save jkells/287423 to your computer and use it in GitHub Desktop.
Save jkells/287423 to your computer and use it in GitHub Desktop.
require 'curb'
require 'nokogiri'
# One scraped result. A Scrape record is created for every container node
# matched on the page that a push definition points at.
#
# The push definition supplies template strings (its *_pattern attributes)
# that contain %N% placeholders. Each placeholder is replaced with the text
# extracted by the field definition whose field_number is N.
class Scrape < ActiveRecord::Base
  belongs_to :push_definition

  # Attributes that start out as a copy of a push definition template and
  # then have their %N% placeholders filled in by parse_container.
  TEMPLATE_ATTRIBUTES = [:push_id, :title, :detail, :link, :image_url].freeze

  # Class method to scrape a push_definition.
  #
  # Downloads push_definition.url_pattern (following up to 5 redirects) and
  # hands the response body to parse_full_document.
  #
  # push_definition - object providing url_pattern plus everything that
  #                   parse_full_document reads from it.
  #
  # Returns the value of parse_full_document.
  def self.scrape(push_definition)
    curl = Curl::Easy.new
    curl.follow_location = true
    curl.max_redirects = 5
    curl.connect_timeout = 30
    curl.dns_cache_timeout = 30
    curl.timeout = 120
    curl.url = push_definition.url_pattern

    # Scrape the site.
    curl.perform
    # TODO: handle HTTP errors. The response status is not inspected here, so
    # whatever body came back is parsed as if the request had succeeded.
    parse_full_document(curl.body_str, push_definition)
  end

  # Parse the result after scraping the URL. For each container matched by
  # push_definition.container_xpath a new Scrape instance is generated,
  # filled in and saved.
  #
  # body_str        - HTML text of the scraped page.
  # push_definition - object providing container_xpath, the *_pattern
  #                   templates and field_definitions.
  #
  # Returns the set of matched container nodes.
  def self.parse_full_document(body_str, push_definition)
    document = Nokogiri::HTML(body_str)
    containers = document.xpath(push_definition.container_xpath)
    containers.each do |container|
      scrape = Scrape.new
      scrape.push_definition = push_definition
      scrape.set_initial_result
      scrape.parse_container(container)
      # NOTE: the return value of save is ignored, so a record that fails to
      # save is dropped without any report.
      scrape.save
    end
  end

  # Seed every result attribute with the matching template from the push
  # definition. The placeholders are filled in later by parse_container.
  def set_initial_result
    self.push_id = push_definition.push_id_pattern
    self.title = push_definition.title_pattern
    self.detail = push_definition.detail_pattern
    self.link = push_definition.link_pattern
    self.image_url = push_definition.image_url_pattern
  end

  # Parse a container by extracting all fields and inserting them into the
  # result fields.
  #
  # For every field definition, the text of the first node matched by its
  # xpath (relative to container) replaces each %<field_number>% placeholder
  # in the template attributes. A field whose xpath matches nothing is
  # substituted with an empty string, and attributes whose template is nil
  # are left untouched.
  #
  # container - node (from the scraped document) to search within.
  #
  # Returns the push definition's field definitions.
  def parse_container(container)
    push_definition.field_definitions.each do |field|
      node = container.xpath(field.xpath).first
      value = node ? node.content : ''
      # TODO: apply field.field_transforms (regexp transforms, see
      # transform.expression) to value before it is substituted.

      # A plain string pattern is matched literally, so no Regexp is needed.
      placeholder = "%#{field.field_number}%"

      TEMPLATE_ATTRIBUTES.each do |attribute|
        template = send(attribute)
        next if template.nil?

        # Block form of gsub: the scraped text is inserted verbatim, so
        # backslash sequences such as "\1" found on the page are not
        # interpreted as back-references.
        send("#{attribute}=", template.gsub(placeholder) { value })
      end
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment