This Ruby script uses Hpricot to scrape the Freshly Pressed pages on WordPress.com. It then stores its results as JSON on S3. Here is the related blog post to this Gist: [link missing from this copy]
#!/usr/bin/ruby -rubygems
require 'open-uri'
require 'hpricot'
require 'aws/s3'
require 'yaml'
require 'json'
# Uploads +data+ to Amazon S3 under the object name +key+.
#
# Credentials and bucket names are read from "amazon_s3.yml", located in the
# same directory as this script. The settings are looked up with symbol keys:
# :access_key_id, :secret_access_key, and whatever symbol is passed as +bucket+.
#
# key     - object name to store the data under.
# data    - the payload to upload.
# bucket  - key into the settings file whose value is the real bucket name.
# options - passed through unchanged to the store call (e.g. :access).
#
# NOTE(review): the store call is assumed to be AWS::S3::S3Object.store from
# the aws-s3 gem, based on the (key, data, bucket, options) argument order --
# confirm against the gem version in use.
def save_in_s3(key, data, bucket, options)
  settings_path = File.join(File.dirname(__FILE__), "amazon_s3.yml")
  # load_file opens and closes the file itself, so no handle is left open.
  amazon_s3_settings = YAML.load_file(settings_path)

  AWS::S3::Base.establish_connection!(
    :access_key_id     => amazon_s3_settings[:access_key_id],
    :secret_access_key => amazon_s3_settings[:secret_access_key]
  )

  AWS::S3::S3Object.store(key, data, amazon_s3_settings[bucket], options)
end
# Main script: for each listing page, collect one hash per "pick" element and
# upload the page's list as JSON via save_in_s3.
#
# NOTE(review): this copy of the script looks damaged -- several right-hand
# sides are empty (`picks =`, `pick =`, `style_hash =`), the element lookups
# have lost their receivers, the page URL is missing from the string passed to
# `open`, and no `end` keywords are present. Restore these before running.

# Number of listing pages to process; the loop below runs from 1 to this value.
num_pages = 10
(1..num_pages).each do |page|
# `picks` collects the entries for the current page (it is appended to with <<
# and serialised with to_json below). The document for the page is parsed with
# Hpricot and each '.pick' match is visited in turn.
picks =
doc = Hpricot(open("{page}"))'.pick').each do |pick_element|
# Sponsored posts are being skipped until I can figure out a way to get a 320px wide version
# of the images used for Sponsored posts
next if pick_element.inner_html.include? 'Sponsored Post'
# `pick` holds the fields for one entry: :url, :title, optional :subtitle,
# :image and (when a '.picture' element exists) :y_offset.
pick =
pick[:url] ='a')['href']
pick[:title] ='.posttitle').inner_text
subtitle ='small')
# The subtitle is optional; only record it when the lookup found something.
pick[:subtitle] = subtitle.inner_text if subtitle
picture ='.picture')
if picture
# Turn the inline `style` attribute into an array of one-pair hashes. Each
# declaration is split on the first ':' only (limit 2), so a value that itself
# contains ':' is kept whole; both halves are stripped of whitespace.
style_array = picture['style'].split(';').collect{|x| Hash[*x.split(':',2).collect{|a|a.strip}]}
style_hash =
# Fold the one-pair hashes into a single property => value hash.
style_array.each {|x| style_hash = style_hash.merge(x)}
background_image = style_hash["background-image"]
# Pull the address out of a url('...') value (single-quoted form only).
image_url = background_image.match(/url\('([^']+)'\)/)[1]
# Swap the width parameter in the image URL from 223 to 320.
pick[:image] = image_url.gsub('w=223', 'w=320')
scale_increase = 1.43497757847534 # 320.0/223.0
# Scale the vertical component of background-position (second token, "px"
# suffix removed) by the same 320/223 ratio as the image width.
pick[:y_offset] = style_hash["background-position"].split[1].sub('px','').to_f * scale_increase
# NOTE(review): as written this overwrites the pick[:image] value assigned
# above; presumably it belonged to an `else` branch for picks without a
# '.picture' element -- confirm.
pick[:image] ='img')['src']
picks << pick
# Append a pointer to the following page.
# NOTE(review): possible off-by-one -- the loop runs 1..num_pages, so the last
# page is num_pages, yet this skips the pointer on page num_pages - 1 and emits
# one on the final page. Likely intended `page == num_pages`; confirm.
picks << {:next_page => page + 1} unless (page == num_pages - 1)
# Upload this page's entries as JSON under "freshlypressed/<page>.json", using
# the bucket named by :wordpress_bucket in the settings and a public-read ACL.
save_in_s3("freshlypressed/#{page}.json", picks.to_json, :wordpress_bucket, {:access => 'public-read'})
