This Ruby script uses Hpricot to scrape the Freshly Pressed pages on WordPress.com and then stores the results as JSON on S3. The related blog post for this Gist is at http://idevrecipes.com/?p=260
#!/usr/bin/ruby -rubygems | |
require 'open-uri' | |
require 'hpricot' | |
require 'aws/s3' | |
require 'yaml' | |
require 'json' | |
# Stores +data+ under +key+ in the S3 bucket named by the local settings file.
#
# Settings are read from "amazon_s3.yml" in the same directory as this script.
# The file is expected to use symbol keys (:access_key_id, :secret_access_key,
# plus one entry per bucket alias), since they are looked up with symbols below.
#
# @param key     [String] object key to store the data under
# @param data    [String] payload to upload
# @param bucket  [Symbol] settings key whose value is the actual bucket name
# @param options [Hash]   passed through unchanged to AWS::S3::S3Object.store
# @return whatever AWS::S3::S3Object.store returns
def save_in_s3(key, data, bucket, options)
  settings_path = File.join(File.dirname(__FILE__), "amazon_s3.yml")
  # File.read opens, reads and closes the file in one step. The previous code
  # attached its read block to File.join (which ignores blocks), so the file
  # handle returned by open was handed to YAML.load and never closed.
  amazon_s3_settings = YAML.load(File.read(settings_path))

  AWS::S3::Base.establish_connection!(
    :access_key_id     => amazon_s3_settings[:access_key_id],
    :secret_access_key => amazon_s3_settings[:secret_access_key]
  )
  AWS::S3::S3Object.store(key, data, amazon_s3_settings[bucket], options)
end
# Number of Freshly Pressed pages to scrape; one JSON document is stored per page.
num_pages = 10

# The scraped thumbnail URLs carry "w=223"; we rewrite them to request 320px
# wide images and scale the vertical background offset by the same ratio.
SOURCE_IMAGE_WIDTH = 223
TARGET_IMAGE_WIDTH = 320

# Parses an inline CSS style attribute into a Hash of property name => value.
# Later declarations override earlier ones. Fragments without a colon (for
# example whitespace after a trailing ';') are skipped instead of raising.
def parse_inline_style(style)
  properties = {}
  style.to_s.split(';').each do |declaration|
    name, value = declaration.split(':', 2)
    next if value.nil?
    properties[name.strip] = value.strip
  end
  properties
end

# Builds the image-related fields (:image and, when available, :y_offset) for
# one pick element. Returns a Hash that may be empty when no usable image
# information is present, so a single odd entry does not abort the whole run.
def extract_image_fields(pick_element)
  fields = {}
  picture = pick_element.at('.picture')

  unless picture
    # No '.picture' element: fall back to the first <img> in the pick, if any.
    img = pick_element.at('img')
    fields[:image] = img['src'] if img
    return fields
  end

  style = parse_inline_style(picture['style'])
  # Accept url('...'), url("...") and unquoted url(...).
  url_match = style['background-image'].to_s.match(/url\(\s*['"]?([^'")]+)['"]?\s*\)/)
  unless url_match
    warn "Skipping image: no background-image URL in style #{picture['style'].inspect}"
    return fields
  end

  fields[:image] = url_match[1].strip.gsub("w=#{SOURCE_IMAGE_WIDTH}", "w=#{TARGET_IMAGE_WIDTH}")

  # background-position is read as "<x> <y>"; the second token is the vertical offset.
  y_position = style['background-position'].to_s.split[1]
  if y_position
    scale_increase = TARGET_IMAGE_WIDTH.to_f / SOURCE_IMAGE_WIDTH
    fields[:y_offset] = y_position.sub('px', '').to_f * scale_increase
  end

  fields
end

(1..num_pages).each do |page|
  picks = []
  # NOTE(review): this relies on open-uri's Kernel#open accepting URLs; on
  # Ruby 3.0+ that form is gone and URI.open would be needed instead.
  doc = Hpricot(open("http://wordpress.com/?load=editorpicks&fp=#{page}"))

  doc.search('.pick').each do |pick_element|
    # Sponsored posts are being skipped until I can figure out a way to get a 320px wide version
    # of the images used for Sponsored posts
    next if pick_element.inner_html.include?('Sponsored Post')

    pick = {}
    pick[:url] = pick_element.at('a')['href']
    pick[:title] = pick_element.at('.posttitle').inner_text
    subtitle = pick_element.at('small')
    pick[:subtitle] = subtitle.inner_text if subtitle
    pick.merge!(extract_image_fields(pick_element))
    picks << pick
  end

  # Every page except the last one links to its successor. The previous check
  # (page == num_pages - 1) dropped the link from the second-to-last page and
  # made the last page point at a document that is never written.
  picks << {:next_page => page + 1} unless page == num_pages

  save_in_s3("freshlypressed/#{page}.json", picks.to_json, :wordpress_bucket, {:access => 'public-read'})
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment