Created
April 3, 2013 16:25
-
-
Save newtriks/5302781 to your computer and use it in GitHub Desktop.
Sinatra script that scrapes a given XML file for image URLs and uploads the linked images to an Amazon S3 bucket. Adjust the XPath settings to suit your particular XML nodes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
require 'rubygems'
require 'open-uri'
require 'uri'
require 'sinatra'
require 'haml'
require 'nokogiri'
require 'aws/s3'
# Application settings. Replace the placeholder values with your own bucket,
# credentials and XPath expressions before running.
set(
  :bucket         => 'bucket-name',                # destination S3 bucket
  :s3_key         => 'xxxxxx',                     # S3 access key id
  :s3_secret      => 'xxxxxx',                     # S3 secret access key
  :s3_host        => 's3-eu-west-1.amazonaws.com', # S3 endpoint host
  :scraped_domain => 'example.com',                # root used to resolve image urls
  :id_node        => '//communication/@id',        # XPath for the per-document id
  :url_node       => '//image/@url',               # XPath for the image urls
  :upload_dir     => 'tmp'                         # key prefix inside the bucket
)
# Request filter: point the aws-s3 library at the configured endpoint and
# open a connection with the configured credentials before each route runs.
before do
  # DEFAULT_HOST is a constant String, so it is overwritten in place.
  AWS::S3::DEFAULT_HOST.replace(settings.s3_host)

  credentials = {
    :access_key_id     => settings.s3_key,
    :secret_access_key => settings.s3_secret
  }
  AWS::S3::Base.establish_connection!(credentials)
end
# GET / — render the form that asks for the XML file URL.
get('/') { haml(:index) }
# POST / — scrape the submitted XML document and copy its images to S3.
#
# Reads the document URL from the `host` form field, parses the document with
# Nokogiri, and stores every image referenced by `settings.url_node` in
# `settings.bucket` under "<upload_dir>/<communication id>/images/<file name>".
# Renders the index template with a status message in @results; responds with
# 400 when the submitted value is not an http(s) URL.
post '/' do
  # Keep the submitted value so the form can redisplay it (the template reads @host).
  @host = params[:host].to_s.strip

  # Only ever fetch http(s) URLs. Passing raw form input to Kernel#open would
  # execute a shell command for values that start with "|".
  source = begin
    URI.parse(@host)
  rescue URI::InvalidURIError
    nil
  end
  # URI::HTTPS is a subclass of URI::HTTP, so both schemes pass this check.
  unless source.is_a?(URI::HTTP) && source.host
    @results = 'Please enter a valid http(s) URL'
    halt 400, haml(:index)
  end

  # Block form closes the downloaded stream once Nokogiri has parsed it.
  page = source.open { |io| Nokogiri::HTML(io) }

  # Directory name taken from the communication id in the XML. It is an empty
  # string when the node is missing (change settings.id_node if your XML differs).
  dir_name = page.at_xpath(settings.id_node).to_s

  # Upload each referenced image to the S3 bucket.
  page.xpath(settings.url_node).each do |src|
    image_url = make_absolute(src.to_s, settings.scraped_domain)
    key = "#{settings.upload_dir}/#{dir_name}/images/#{File.basename(image_url)}"
    # Read the image fully, closing the remote stream before uploading.
    image_data = URI.parse(image_url).open { |io| io.read }
    AWS::S3::S3Object.store(key, image_data, settings.bucket, :access => :public_read)
  end

  @results = 'Successfully scraped images'
  haml :index
end
# Resolves +href+ against +root+ and returns the absolute URL as a String.
#
# href - image reference, absolute or relative; anything responding to #to_s
#        (e.g. a String or an XML attribute node).
# root - base URL or bare host name. A root without a scheme, such as
#        "example.com", is treated as "http://example.com", because
#        URI#merge raises BadURIError when both sides are relative.
#
# Raises URI::InvalidURIError if either value cannot be parsed as a URI.
def make_absolute(href, root)
  base = root.to_s
  base = "http://#{base}" unless base =~ %r{\A[a-z][a-z0-9+.\-]*://}i
  URI.parse(base).merge(URI.parse(href.to_s)).to_s
end
__END__

@@ layout
-# Page shell shared by every view; the rendered view is inserted at =yield.
%html
  %head
    %title Image Scraper
  %body
    #header
      %h1 Enter a URL to an XML file
    #content
      =yield

@@ index
-# Form for the XML file URL, followed by the status message once one is set.
%form(action='/' method='POST')
  %input(type='text' name='host' value=@host)
  %input(type='submit')
- if defined?(@results)
  %text= @results
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment