Created
September 8, 2013 23:49
-
-
Save astockwell/6489665 to your computer and use it in GitHub Desktop.
Scrape the media portal of a client site (Watir was needed because the site is built with Backbone.js) to download 1000+ high-res images. 1) Scrape image file URLs from all gallery pages. 2) Run a non-trivial curl/wget loop to download the image files (not shown). 3) Reorganize the files from step 2 based on the product categories from step 1.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
# Logs in to a site with Watir, walks its /photos/ section and prints a nested
# hash of the form:
#
#   product line text => product link text => {
#     'product'   => href of the product page,
#     'downloads' => [hrefs of the '.item a' links found on that page]
#   }
#
# Configuration is read from the environment variables username, password and
# login_url.
require 'watir-webdriver'

# ENV.fetch raises KeyError up front when a setting is missing, instead of
# letting a nil reach the browser later on.
@username = ENV.fetch("username")
@password = ENV.fetch("password")
@login_url = ENV.fetch("login_url")
@product_url = "#{@login_url}/photos/"

b = Watir::Browser.new
b.goto @login_url
# NOTE(review): 2000 is handed straight to Browser#wait; if that argument is a
# timeout in seconds this is far longer than it looks -- confirm the unit.
b.wait(2000)

# Open the login form, fill in the credentials and submit.
login_link = b.element(:css => 'a[href="#login"]')
login_link.click
f_username = b.text_field :type => 'text'
f_password = b.text_field :type => 'password'
f_username.value = @username
f_password.value = @password
submit = b.element(:css => '.login')
submit.click

# The photos link only shows up once the login has gone through.
b.element(:css => 'a[href="/photos/"]').wait_until_present
b.goto @product_url

product_lines = { }
product_links = { }

# Step 1: product line name => URL of that product line's page.
b.elements(:css => '.item-box h2 a').each do |link|
  product_lines[link.text] = link.attribute_value('href')
  # NOTE(review): the guard checks '#next' but the click targets
  # '#content-next' (the product loop further down uses '#content-next' for
  # both) -- confirm which selector is intended before changing it.
  b.element(:css => '#content-next').click if b.element(:css => '#next').present?
end
p product_lines

# Step 2: for every product line, record each product's page URL.
product_lines.each do |line, url|
  b.goto url
  product_links[line] = { }
  b.elements(:css => '.item a').each do |link|
    link_text = link.text
    link_href = link.attribute_value('href')
    product_links[line][link_text] = {
      'product' => link_href,
      'downloads' => []
    }
    b.element(:css => '#content-next').click if b.element(:css => '#content-next').present?
  end
end

# Step 3: visit every product page and collect its download links.
product_links.each do |_line, product|
  product.each do |_sku, info|
    # The page URL is stored under 'product' in step 2; the previous
    # 'product_page' key never existed, so goto was called with nil.
    b.goto info['product']
    b.elements(:css => '.item a').each do |link|
      info['downloads'].push(link.attribute_value('href'))
    end
  end
end
p product_links
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
# Sorts already-downloaded image files into <product line>/<product> folders.
#
# Reads images.json from the current directory, creates one folder per
# top-level key and one sub-folder per product key, then moves every file
# named by a product's 'downloads' URLs from the current directory into that
# product's folder. Files that could not be moved are listed in error_log.txt.
require 'fileutils'
require 'json'

# Via get_product_images.rb
# File.read returns a String, which cannot be iterated as key/value pairs;
# the JSON text has to be parsed into a Hash first.
@product_images = JSON.parse(File.read("images.json"))

# Turns a product line / product name into a folder name by replacing
# backslashes, slashes and colons with dashes.
def folder_name(name)
  name.to_s.gsub(/[\\\/\:]/, '-')
end

# Maps a download URL to the name the file is expected to have on disk.
def local_filename(url)
  url
    .gsub(/(http\:\/\/www\.example\.com\/photos\/)(\d+)(\/)/, '') # trim url portion
    .gsub(/\%20/, ' ') # replace url ascii spaces with actual spaces
    .gsub(/\%C3\%B8/, '_o_') # replace special characters
end

output = ''
@product_images.each do |parent_folder, products|
  parent_dir = folder_name(parent_folder)
  # mkdir_p does not raise when the folder already exists, so the script can
  # be re-run after a partial pass.
  FileUtils.mkdir_p parent_dir

  products.each do |product_folder, product_obj|
    target_dir = parent_dir + '/' + folder_name(product_folder)
    FileUtils.mkdir_p target_dir

    product_obj['downloads'].each do |download|
      download_filename = local_filename(download)
      begin
        FileUtils.mv download_filename, target_dir
      rescue StandardError
        # Best effort: remember the failure and carry on with the next file.
        output << "Error moving filename: " + download_filename + " image name: " + download + "\n"
      end
    end
  end
end

File.open('error_log.txt', 'w') do |file|
  file.write(output)
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment