Created
April 16, 2010 09:38
-
-
Save ryantss/368216 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Leonardo
  # Walks the Leonardo XML feed files found in XML_DIR and, for every
  # <Property> element, appends one CSV row per "J"-sized image to
  # PARSED_DIR/<property id>.csv.
  #
  # Two plain-text ledgers (one entry per line) are kept in PARSED_DIR:
  # * processed_xml.txt         - XML files that were fully handled
  # * error_image_download.txt  - image URLs whose handling raised an error
  #
  # XML_DIR, PARSED_DIR, IMG_DIR, LOG_PATH, Gazo::Time, Image, Logger,
  # Nokogiri and MiniMagick are not defined here; they must be provided by
  # the surrounding application before the methods that use them are called.
  class XmlParser
    # Geometry strings handed to MiniMagick's resize, keyed by size letter.
    SIZES = { :A => '70x70!', :B => '100x100', :D => '200x200', :J => '480x480' }.freeze
    # Prefix prepended to the relative image paths found in the feed.
    IMG_PATH_PREFIX = "http://images.leonardo.com/imgs".freeze

    class << self
      # Processes every *.xml file in XML_DIR (sorted by path) that is not
      # yet listed in the processed-files ledger, writes the image CSV rows
      # for each property found, then records the file as processed.
      def process!
        # Create the output directories if they do not exist yet.
        pre_process

        seen_ids = {}
        # Read once: the loop only ever appends the file it has just handled,
        # and the glob never yields the same path twice.
        already_processed = ledger_entries(processed_list_path)

        Dir[File.join(XML_DIR, '*.xml')].sort.each_with_index do |file, idx|
          puts file
          if already_processed.include?(file)
            parser_logger.debug("#{file} have been processed earlier..., skip to next")
            next
          end

          parse(file, idx).each do |leonardo_id, images|
            # Properties without an Id attribute are grouped under nil; skip them.
            next if leonardo_id.nil?
            parser_logger.debug("Images for Property ID: #{leonardo_id}, Image URLs: #{images.size}")

            if seen_ids.key?(leonardo_id)
              parser_logger.debug("#{leonardo_id} is already exists!!!")
            else
              seen_ids[leonardo_id] = true
            end

            save_image_urls_for(leonardo_id, :with => images)
          end

          append_to_file(processed_list_path, file)
        end

        parser_logger.debug("Total properties-keys: #{seen_ids.size}")
      end

      private

      # Logger for progress messages. Created on first use so that loading
      # this file neither needs LOG_PATH nor opens a log file.
      def parser_logger
        @parser_logger ||= Logger.new(File.join(LOG_PATH, "leonardo_xml_parser_#{Gazo::Time.pretty_day}.log"))
      end

      # Logger for failures, created on first use like parser_logger.
      def error_logger
        @error_logger ||= Logger.new(File.join(LOG_PATH, "leonardo_xml_parser_error_#{Gazo::Time.pretty_day}.log"))
      end

      # Path of the ledger listing XML files that were already processed.
      def processed_list_path
        File.join(PARSED_DIR, "processed_xml.txt")
      end

      # Path of the ledger listing image URLs that failed earlier.
      def failed_list_path
        File.join(PARSED_DIR, "error_image_download.txt")
      end

      # Returns the lines of a ledger file (written by append_to_file) without
      # their line terminators, or [] when the file is missing or empty.
      def ledger_entries(path)
        return [] unless File.size?(path)
        File.readlines(path).map(&:chomp)
      end

      # Creates IMG_DIR and PARSED_DIR when they do not exist.
      def pre_process
        [IMG_DIR, PARSED_DIR].each do |dir|
          # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
          Dir.mkdir(dir) unless File.exist?(dir)
        end
      end

      # Appends a CSV row for each image of a property to
      # PARSED_DIR/<leonardo_id>.csv.
      #
      # leonardo_id - property identifier used for the CSV file name.
      # opts[:with] - list of objects responding to #url and #to_csv
      #               (defaults to an empty list).
      #
      # An image is skipped when its URL is listed in the failed-URL ledger or
      # when a non-empty file with its base name already exists in IMG_DIR.
      # Any StandardError raised while handling an image is logged and the URL
      # is added to the failed-URL ledger.
      def save_image_urls_for(leonardo_id, opts = {})
        failed_urls = ledger_entries(failed_list_path)
        csv_path = File.join(PARSED_DIR, "#{leonardo_id}.csv")

        (opts[:with] || []).each do |img|
          fname = File.join(IMG_DIR, File.basename(img.url))

          if failed_urls.include?(img.url)
            parser_logger.debug("404 Image, skip #{img.url}")
            next
          end
          next if File.size?(fname)

          begin
            # TODO: temporarily commented out for testing
            # download_image(img.url, fname)
            append_to_file(csv_path, img.to_csv)
            # Download only the largest image and resize it for the rest
            # resize_and_save(img, fname, leonardo_id)
          rescue StandardError => e
            error_logger.debug("Failed to download #{fname} - #{e}")
            append_to_file(failed_list_path, img.url)
            # Keep the in-memory copy in step with the ledger on disk.
            failed_urls << img.url
          end
        end
      end

      # Derives the A, B and D renditions from the "_J.jpg" file at +fname+,
      # writes each next to it and appends a CSV row describing it.
      #
      # img         - image record of the J rendition; it is not modified.
      # fname       - local path of the J rendition.
      # leonardo_id - property identifier used for the CSV file name.
      def resize_and_save(img, fname, leonardo_id)
        csv_path = File.join(PARSED_DIR, "#{leonardo_id}.csv")

        [:A, :B, :D].each do |size_id|
          suffix = "_#{size_id}.jpg"
          new_fname = fname.gsub('_J.jpg', suffix)
          parser_logger.debug("Resizing image to #{new_fname} to #{SIZES[size_id]}")

          # Re-read the source for every size so each resize starts from the
          # J rendition rather than from the previous, smaller result.
          image = MiniMagick::Image.from_file(fname)
          image.resize SIZES[size_id]
          image.write(new_fname)

          new_img = img.clone
          new_img.width, new_img.height = image[:width], image[:height]
          # clone is shallow: an in-place gsub! on new_img.url would also
          # rewrite img.url, so later sizes would no longer find "_J.jpg".
          # Assign a fresh string instead.
          # NOTE(review): assumes Image exposes a url= writer like width=/height=.
          new_img.url = img.url.gsub('_J.jpg', suffix)
          append_to_file(csv_path, new_img.to_csv)
        end
      end

      # Appends +content+ followed by a newline to +filename+, creating the
      # file when needed.
      def append_to_file(filename, content)
        File.open(filename, 'a') { |out| out.puts(content) }
      end

      # Fetches +image_path+ and stores the bytes in +filename+.
      # The remote content is read completely before the destination is
      # opened, so a failed fetch does not leave an empty file behind.
      def download_image(image_path, filename)
        parser_logger.debug("downloading #{filename}...")
        payload = fetch_remote(image_path)
        File.open(filename, 'wb') { |f| f.write(payload) }
      end

      # Reads a remote resource and closes the handle afterwards.
      # Kernel#open stopped accepting URLs in Ruby 3.0; URI.open (provided by
      # open-uri) is used when available, with Kernel.open as the fallback
      # for older Rubies.
      def fetch_remote(url)
        opener = defined?(URI) && URI.respond_to?(:open) ? URI : Kernel
        opener.open(url) { |remote| remote.read }
      end

      # Parses one feed file.
      #
      # file - path of the XML file.
      # idx  - zero-based position of the file in the run (used for logging).
      #
      # Returns a Hash mapping each property's Id attribute value (a String,
      # or nil when the attribute is absent) to the list of Image objects
      # built for it.
      def parse(file, idx)
        parser_logger.debug("Examining File\##{idx+1}: #{file}")
        images_hash = {}
        # Block form closes the file as soon as the document is built.
        doc = File.open(file) { |io| Nokogiri::XML(io) }

        doc.xpath('//Property').each do |property|
          # Node#[] yields the attribute value; Node#attributes[...] would
          # yield an Attr node, which never equals another property's Attr
          # and therefore breaks grouping and duplicate detection.
          id = property['Id']
          name = property.at('LeonardoName').text

          property.xpath('.//Image').each do |img|
            get_all_image_data_from(img).each do |row|
              img_code, img_name, img_caption, img_category, img_description,
                img_url, img_orig_size, img_width, img_height = row

              image = Image.new(:property_id => id, :property_name => name,
                                :url => "#{IMG_PATH_PREFIX}#{img_url}",
                                :img_code => img_code, :img_name => img_name, :category => img_category,
                                :caption => img_caption, :description => img_description,
                                :width => img_width, :height => img_height, :original_size => img_orig_size)
              (images_hash[id] ||= []) << image
            end
          end
        end

        parser_logger.debug("size: #{images_hash.size}")
        parser_logger.debug("Finished examining file: #{file}")
        images_hash
      end

      # Extracts the data of one <Image> element.
      #
      # Returns an Array with one entry per <File> whose Name ends in
      # "J.jpg"; each entry is
      # [code, name, caption, category, description, url, original_size,
      #  width, height].
      # Files with a different size letter, a different extension or no Name
      # attribute are skipped.
      def get_all_image_data_from(img)
        sizes = get_all_sizes_for(img.at('Sizes'))

        # Metadata shared by every file of this image.
        name = img.at('ImageName')['Val']
        code = img.at('LeonardoCode')['Val']
        caption = img.xpath('.//Caption//en').text
        description = img.xpath('.//Description//en').text
        category = img.at('Category')['Val']
        original = img.at('OriginalSize')
        img_orig_size = "#{original['Width']}x#{original['Height']}"

        images = []
        img.xpath('.//File').each do |img_f|
          url = img_f['Name']
          # String#[] with a regexp returns nil instead of raising when the
          # name does not end in ".jpg" (or is missing altogether).
          size_category = url.to_s[/(\w)\.jpg$/, 1]
          # next unless %w{A B D J}.include?(size_category)
          next unless size_category == "J"

          width, height = sizes[size_category]
          images << [code, name, caption, category, description, url, img_orig_size, width, height]
        end
        images
      end

      # Builds a Hash mapping each <Size> element's Sign attribute to
      # [Width, Height] (attribute values as found in the document).
      # Returns an empty Hash when +img_size+ is nil.
      def get_all_sizes_for(img_size)
        sizes = {}
        return sizes if img_size.nil?

        img_size.xpath('.//Size').each do |s|
          sizes[s['Sign']] = [s['Width'], s['Height']]
        end
        sizes
      end
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment