Skip to content

Instantly share code, notes, and snippets.

@ryantss
Created April 16, 2010 09:38
Show Gist options
  • Save ryantss/368216 to your computer and use it in GitHub Desktop.
Save ryantss/368216 to your computer and use it in GitHub Desktop.
module Leonardo
  # Walks the Leonardo property XML feeds found in XML_DIR and, for every
  # property, appends one CSV row per "J"-sized image to a per-property file.
  #
  # Bookkeeping files kept in PARSED_DIR:
  # * processed_xml.txt        - paths of the XML files already handled
  # * error_image_download.txt - URLs of images whose handling raised
  # * <property id>.csv        - image rows collected for that property
  #
  # LOG_PATH, XML_DIR, PARSED_DIR, IMG_DIR, Gazo::Time, Image, Nokogiri and
  # MiniMagick are not defined here; they must be provided by the rest of the
  # project before this file is loaded.
  class XmlParser
    # Both log file names embed the value of Gazo::Time.pretty_day.
    @@parser_logger = Logger.new(File.join(LOG_PATH, "leonardo_xml_parser_#{Gazo::Time.pretty_day}.log"))
    @@error_logger = Logger.new(File.join(LOG_PATH, "leonardo_xml_parser_error_#{Gazo::Time.pretty_day}.log"))

    # Size code => geometry string handed to MiniMagick's resize.
    SIZES = { :A => '70x70!', :B => '100x100', :D => '200x200', :J => '480x480' }.freeze
    # Prefix put in front of the relative image path found in the XML.
    IMG_PATH_PREFIX = "http://images.leonardo.com/imgs".freeze

    class << self
      # Processes every *.xml file in XML_DIR (in sorted order) that is not
      # yet mentioned in processed_xml.txt, saving the image rows of each
      # property and then recording the file as processed.
      def process!
        # Create the working directories if they do not exist yet.
        pre_process

        processed_log = File.join(PARSED_DIR, "processed_xml.txt")
        # Read the bookkeeping file once instead of once per XML file; the
        # in-memory copy is kept in sync below.
        processed = File.size?(processed_log) ? File.read(processed_log) : ""
        # Hash used as a set so the duplicate check stays O(1).
        seen_properties = {}

        Dir[File.join(XML_DIR, '*.xml')].sort.each_with_index do |file, idx|
          puts file
          if processed.include?(file)
            @@parser_logger.debug("#{file} have been processed earlier..., skip to next")
            next
          end

          parse(file, idx).each do |leonardo_id, images|
            next if leonardo_id.nil?
            @@parser_logger.debug("Images for Property ID: #{leonardo_id}, Image URLs: #{images.size}")
            if seen_properties.key?(leonardo_id)
              @@parser_logger.debug("#{leonardo_id} is already exists!!!")
            else
              seen_properties[leonardo_id] = true
            end
            save_image_urls_for(leonardo_id, :with => images)
          end

          append_to_file(processed_log, file)
          processed << file << "\n"
        end

        @@parser_logger.debug("Total properties-keys: #{seen_properties.size}")
      end

      private

      # Creates IMG_DIR and PARSED_DIR when they are missing.
      def pre_process
        # File.exist? rather than File.exists?: the latter is deprecated and
        # no longer available in Ruby 3.2+.
        [IMG_DIR, PARSED_DIR].each do |dir|
          Dir.mkdir(dir) unless File.exist?(dir)
        end
      end

      # Appends a CSV row for each image of one property.
      #
      # leonardo_id - property id, used to name the CSV file.
      # opts        - :with => list of image objects (each must respond to
      #               #url and #to_csv); defaults to an empty list.
      #
      # Images whose URL is listed in error_image_download.txt, or whose
      # local file already exists with a non-zero size, are skipped.
      def save_image_urls_for(leonardo_id, opts = {})
        image_urls_to_save = opts[:with] || []
        error_log = File.join(PARSED_DIR, "error_image_download.txt")
        csv_path = File.join(PARSED_DIR, "#{leonardo_id}.csv")
        # Read the failure list once per call instead of once per image; it
        # is updated in memory whenever a new failure is recorded.
        failed_urls = File.size?(error_log) ? File.read(error_log) : ""

        image_urls_to_save.each do |img|
          # Local file name = last path segment of the URL.
          fname = File.join(IMG_DIR, img.url[(img.url.rindex("/") + 1)..-1])
          if failed_urls.include?(img.url)
            @@parser_logger.debug("404 Image, skip #{img.url}")
            next
          end
          next if File.size?(fname)

          begin
            # TODO: download temporarily disabled for testing
            # download_image(img.url, fname)
            append_to_file(csv_path, img.to_csv)
            # Download only the largest image and resize for the rest
            # resize_and_save(img, fname, leonardo_id)
          rescue => e
            @@error_logger.debug("Failed to download #{fname} - #{e}")
            append_to_file(error_log, img.url)
            failed_urls << img.url << "\n"
          end
        end
      end

      # Produces the A, B and D renditions of an already downloaded "J"
      # image and appends a CSV row for each of them.
      #
      # img         - image object describing the "J" rendition
      # fname       - local path of the "J" file
      # leonardo_id - property id, used to name the CSV file
      def resize_and_save(img, fname, leonardo_id)
        [:A, :B, :D].each do |size_id|
          suffix = "_#{size_id}.jpg"
          new_fname = fname.gsub('_J.jpg', suffix)
          @@parser_logger.debug("Resizing image to #{new_fname} to #{SIZES[size_id]}")
          # Reloaded on every pass because resize changes the image in place.
          image = MiniMagick::Image.from_file(fname)
          image.resize SIZES[size_id]
          image.write(new_fname)

          new_img = img.clone
          new_img.width, new_img.height = image[:width], image[:height]
          # Object#clone is shallow, so an in-place gsub! on new_img.url
          # could also rewrite img.url and make the later sizes miss the
          # '_J.jpg' marker. Assign a fresh string instead.
          # NOTE(review): assumes Image exposes a url= writer like it does
          # for width/height - confirm.
          new_img.url = img.url.gsub('_J.jpg', suffix)
          append_to_file(File.join(PARSED_DIR, "#{leonardo_id}.csv"), new_img.to_csv)
        end
      end

      # Appends content, followed by a newline, to filename (created if
      # missing).
      def append_to_file(filename, content)
        File.open(filename, 'a') { |out| out.puts(content) }
      end

      # Fetches image_path and stores the bytes in filename.
      # Returns the number of bytes written.
      def download_image(image_path, filename)
        @@parser_logger.debug("downloading #{filename}...")
        # Fetch before creating the target so a failed download leaves no
        # empty file behind; the block form closes the remote handle.
        # NOTE(review): opening a URL through Kernel#open needs open-uri and
        # stopped working in Ruby 3.0 (URI.open replaces it) - confirm the
        # target Ruby version before re-enabling the download.
        data = open(image_path) { |remote| remote.read }
        File.open(filename, 'wb') { |f| f.write(data) }
      end

      # Parses one XML file.
      #
      # file - path of the XML file
      # idx  - zero-based position of the file in the run (for logging)
      #
      # Returns a Hash mapping each property's 'Id' attribute value (a
      # String, or nil when the attribute is absent) to an Array of Image
      # objects.
      def parse(file, idx)
        @@parser_logger.debug("Examining File\##{idx+1}: #{file}")
        images_hash = {}
        # Block form so the file handle is closed once the document is built.
        doc = File.open(file) { |io| Nokogiri::XML(io) }

        doc.xpath('//Property').each do |property|
          # property['Id'] yields the attribute value as a String.
          # attributes['Id'] would yield an Attr node, and nodes only compare
          # equal to themselves, which defeats duplicate detection by id.
          id = property['Id']
          name = property.at('LeonardoName').text

          property.xpath('.//Image').each do |img|
            get_all_image_data_from(img).each do |img_code, img_name, img_caption, img_category, img_description, img_url, img_orig_size, img_width, img_height|
              image = Image.new(:property_id => id, :property_name => name,
                                :url => "#{IMG_PATH_PREFIX}#{img_url}",
                                :img_code => img_code, :img_name => img_name, :category => img_category,
                                :caption => img_caption, :description => img_description,
                                :width => img_width, :height => img_height, :original_size => img_orig_size)
              (images_hash[id] ||= []) << image
            end
          end
        end

        @@parser_logger.debug("size: #{images_hash.size}")
        @@parser_logger.debug("Finished examining file: #{file}")
        images_hash
      end

      # Extracts the data of one <Image> element.
      #
      # Returns an Array with one entry per "J"-sized <File> child, each
      # entry being
      #   [code, name, caption, category, description, url,
      #    original_size, width, height]
      # where width/height come from the matching <Size> element (nil when
      # there is none).
      def get_all_image_data_from(img)
        sizes = get_all_sizes_for(img.at('Sizes'))
        # Metadata shared by every file of this image.
        name = img.at('ImageName')['Val']
        code = img.at('LeonardoCode')['Val']
        caption = img.xpath('.//Caption//en').text
        description = img.xpath('.//Description//en').text
        category = img.at('Category')['Val']
        original = img.at('OriginalSize')
        img_orig_size = "#{original['Width']}x#{original['Height']}"

        images = []
        img.xpath('.//File').each do |img_f|
          url = img_f['Name']
          # The size code is the last word character before ".jpg". Files
          # with no Name or another extension yield nil and are skipped
          # instead of raising on a failed match.
          size_category = url.to_s[/(\w)\.jpg$/, 1]
          # Only the "J" rendition is kept; A, B and D are ignored here.
          next unless size_category == "J"
          width, height = sizes[size_category]
          images << [code, name, caption, category, description, url, img_orig_size, width, height]
        end
        images
      end

      # Maps each <Size> element's 'Sign' attribute to [Width, Height]
      # (attribute values, i.e. strings).
      #
      # img_size - the <Sizes> node, or nil when the image has none.
      #
      # Returns a Hash; empty when img_size is nil.
      def get_all_sizes_for(img_size)
        sizes = {}
        return sizes if img_size.nil?
        img_size.xpath('.//Size').each do |s|
          sizes[s['Sign']] = [s['Width'], s['Height']]
        end
        sizes
      end
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment