# ryantss/xml_parser.rb

## xml_parser.rb
module Leonardo
  class XmlParser
    # Loggers used by the class-level methods below. The log file names embed
    # Gazo::Time.pretty_day, evaluated once when this class body is loaded.
    @@parser_logger = Logger.new(File.join(LOG_PATH, "leonardo_xml_parser_#{Gazo::Time.pretty_day}.log"))
    @@error_logger = Logger.new(File.join(LOG_PATH, "leonardo_xml_parser_error_#{Gazo::Time.pretty_day}.log"))

    # Resize geometry strings keyed by size code (handed to MiniMagick's
    # resize in resize_and_save). Frozen so the table cannot be altered by accident.
    SIZES = { :A => '70x70!', :B => '100x100', :D => '200x200', :J => '480x480' }.freeze

    # Base URL of the Leonardo image host (parse builds image URLs with the
    # same prefix).
    IMG_PATH_PREFIX = "http://images.leonardo.com/imgs".freeze
    class << self

      # Parses every *.xml file in XML_DIR (in sorted order) and saves the
      # image rows of each property found in it.
      #
      # Files already listed in PARSED_DIR/processed_xml.txt are skipped; a file
      # is appended to that list once all of its properties have been saved.
      def process!
        # Create the output directories if they do not exist yet.
        pre_process

        processed_list = File.join(PARSED_DIR, "processed_xml.txt")

        # Load the list once instead of re-reading it for every file, and match
        # whole lines: a substring test would treat "a.xml" as processed as
        # soon as "a.xml.xml" is listed.
        already_processed = {}
        if File.size?(processed_list)
          File.foreach(processed_list) { |line| already_processed[line.chomp] = true }
        end

        # Keyed by the ID's string form so the check works whether parse hands
        # back Strings or Nokogiri attribute nodes (nodes compare by identity,
        # so they would never be reported as duplicates).
        seen_properties = {}

        Dir[File.join(XML_DIR, '*.xml')].sort.each_with_index do |file, idx|
          puts file

          if already_processed[file]
            @@parser_logger.debug("#{file} have been processed earlier..., skip to next")
            next
          end

          parse(file, idx).each do |leonardo_id, images|
            next if leonardo_id.nil?
            @@parser_logger.debug("Images for Property ID: #{leonardo_id}, Image URLs: #{images.size}")

            property_key = leonardo_id.to_s
            if seen_properties.key?(property_key)
              @@parser_logger.debug("#{leonardo_id} is already exists!!!")
            else
              seen_properties[property_key] = true
            end

            save_image_urls_for(leonardo_id, :with => images)
          end

          append_to_file(processed_list, file)
          already_processed[file] = true
        end

        @@parser_logger.debug("Total properties-keys: #{seen_properties.size}")
      end

    private

      # Ensures the image and parsed-output directories exist.
      # Uses Dir.mkdir, so only the last path component is created; parent
      # directories must already exist.
      def pre_process
        [IMG_DIR, PARSED_DIR].each do |dir|
          # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
          Dir.mkdir(dir) unless File.exist?(dir)
        end
      end

     # Appends one CSV row per image to PARSED_DIR/<leonardo_id>.csv.
     #
     # leonardo_id - property identifier; its string form names the CSV file.
     # opts        - options hash; :with holds the image objects (each must
     #               respond to #url and #to_csv). Defaults to no images.
     #
     # Images whose URL is listed in PARSED_DIR/error_image_download.txt, or
     # whose file already exists (non-empty) in IMG_DIR, are skipped. If
     # writing a row raises, the failure is logged and the URL is added to the
     # error list.
     def save_image_urls_for(leonardo_id, opts = {})
       error_list = File.join(PARSED_DIR, "error_image_download.txt")
       csv_file   = File.join(PARSED_DIR, "#{leonardo_id}.csv")

       # Read the error list once per call (not once per image) and compare
       # whole lines, so a URL is not skipped merely because it is a substring
       # of another listed URL.
       failed_urls = {}
       if File.size?(error_list)
         File.foreach(error_list) { |line| failed_urls[line.chomp] = true }
       end

       (opts[:with] || []).each do |img|
         if failed_urls[img.url]
           @@parser_logger.debug("404 Image, skip #{img.url}")
           next
         end

         # File.basename also copes with a URL that contains no "/".
         fname = File.join(IMG_DIR, File.basename(img.url))
         next if File.size?(fname)

         begin
           #TODO temporary commented for testing
           #download_image(img.url, fname)
           append_to_file(csv_file, img.to_csv)

           # Download only largest image and resize for the rest
           # resize_and_save(img, fname, leonardo_id)
         rescue StandardError => e
           @@error_logger.debug("Failed to download #{fname} - #{e}")
           append_to_file(error_list, img.url)
           # Keep the in-memory list in sync with what was just written.
           failed_urls[img.url] = true
         end
       end
     end

      # Creates the A, B and D renditions of an already-downloaded "_J.jpg"
      # image and appends a CSV row for each of them.
      #
      # img         - image record of the J rendition (must respond to #clone,
      #               #url, #url=, #width=, #height= and #to_csv).
      # fname       - local path of the J image; each output path is fname with
      #               "_J.jpg" replaced by "_<size>.jpg".
      # leonardo_id - property identifier naming the CSV file in PARSED_DIR.
      def resize_and_save(img, fname, leonardo_id)
        csv_file = File.join(PARSED_DIR, "#{leonardo_id}.csv")

        [:A, :B, :D].each do |size_id|
          suffix = "_#{size_id}.jpg"
          new_fname = fname.gsub('_J.jpg', suffix)
          @@parser_logger.debug("Resizing image to #{new_fname} to #{SIZES[size_id]}")

          # Load the J source afresh for every size; the resized dimensions
          # are read back from this object below.
          image = MiniMagick::Image.from_file(fname)
          image.resize SIZES[size_id]
          image.write(new_fname)

          new_img = img.clone
          new_img.width, new_img.height = image[:width], image[:height]
          # clone is shallow: an in-place gsub! on new_img.url would also
          # rewrite img.url when both share one String, and the later sizes
          # would no longer find "_J.jpg". Assign a fresh string instead.
          new_img.url = img.url.gsub('_J.jpg', suffix)
          append_to_file(csv_file, new_img.to_csv)
        end
      end

      # Appends +content+ to +filename+ using IO#puts (a newline is added when
      # the content does not already end with one). The file is opened in
      # append mode, so it is created when missing.
      def append_to_file(filename, content)
        File.open(filename, 'a') { |io| io.puts(content) }
      end

      # Downloads +image_path+ (an http/https URL) and writes the bytes to
      # +filename+ in binary mode.
      #
      # Relies on open-uri having been loaded elsewhere, as the previous
      # Kernel#open based implementation did. Errors raised by the download or
      # by the file write propagate to the caller.
      def download_image(image_path, filename)
        @@parser_logger.debug("downloading #{filename}...")

        # Use URI#open from open-uri rather than Kernel#open: Kernel#open no
        # longer opens URLs on Ruby 3 and would run a command for a string
        # starting with "|". The block form also closes the remote stream.
        data = URI.parse(image_path).open { |remote| remote.read }

        # Write only after the download succeeded, so a failed request does
        # not leave an empty file behind.
        File.open(filename, 'wb') { |f| f.write(data) }
      end

      # Parses one Leonardo XML file and groups its images by property.
      #
      # file - path of the XML file.
      # idx  - zero-based position of the file in the run (used for logging only).
      #
      # Returns a Hash mapping each Property "Id" attribute value (String, or
      # nil when the attribute is missing) to an Array of Image objects.
      # Raises NoMethodError when a Property has no LeonardoName element.
      def parse(file, idx)
        @@parser_logger.debug("Examining File\##{idx+1}: #{file}")

        images_hash = {}

        # Block form closes the file handle as soon as the document is built.
        doc = File.open(file) { |xml| Nokogiri::XML(xml) }

        doc.xpath('//Property').each do |property|
          # Take the attribute VALUE, not the attribute node: node keys compare
          # by identity and keep the whole document referenced from the result.
          id = property['Id']
          name = property.at('LeonardoName').text

          property.xpath('.//Image').each do |img|
            get_all_image_data_from(img).each do |row|
              img_code, img_name, img_caption, img_category, img_description,
                img_url, img_orig_size, img_width, img_height = row

              image = Image.new(:property_id => id, :property_name => name,
                                :url => "#{IMG_PATH_PREFIX}#{img_url}",
                                :img_code => img_code, :img_name => img_name, :category => img_category,
                                :caption => img_caption, :description => img_description,
                                :width => img_width, :height => img_height, :original_size => img_orig_size
                                )

              (images_hash[id] ||= []) << image
            end
          end
        end

        @@parser_logger.debug("size: #{images_hash.size}")
        @@parser_logger.debug("Finished examining file: #{file}")

        images_hash
      end

      # Extracts the metadata rows for one <Image> node.
      #
      # img - node responding to #at and #xpath (a Nokogiri element in parse).
      #
      # Returns an Array with one row per <File> whose Name ends in "J.jpg":
      #   [code, name, caption, category, description, url, original_size, width, height]
      # width/height come from the "J" entry of the image's size table and are
      # nil when that entry is absent.
      def get_all_image_data_from(img)
        sizes = get_all_sizes_for(img.at('Sizes'))

        # Get the metadata
        name = img.at('ImageName')['Val']
        code = img.at('LeonardoCode')['Val']
        caption = img.xpath('.//Caption//en').text
        description = img.xpath('.//Description//en').text
        category = img.at('Category')['Val']
        original = img.at('OriginalSize')
        img_orig_size = "#{original['Width']}x#{original['Height']}"

        images = []
        img.xpath('.//File').each do |img_f|
          url = img_f['Name']
          # String#[] returns nil for names that do not end in ".jpg" (or a
          # missing Name), so such files are skipped instead of raising.
          size_category = url.to_s[/(\w)\.jpg$/, 1]
          # next unless %w{A B D J}.include?(size_category)
          next unless size_category == "J"

          width, height = sizes[size_category]
          images << [code, name, caption, category, description, url, img_orig_size, width, height]
        end

        images
      end

      # Builds a lookup of the sizes declared under a <Sizes> node.
      #
      # img_size - node responding to #xpath, or nil when the image has no
      #            <Sizes> element.
      #
      # Returns a Hash mapping each Size's "Sign" attribute to
      # [Width, Height] (attribute values as read from the node); empty when
      # img_size is nil.
      def get_all_sizes_for(img_size)
        return {} if img_size.nil?

        img_size.xpath('.//Size').each_with_object({}) do |size, sizes|
          sizes[size['Sign']] = [size['Width'], size['Height']]
        end
      end

    end

  end
end
	module Leonardo
	  # Parses Leonardo XML feeds and records one CSV row per image, grouped by
	  # property. This copy had every block pipe escaped as "\|" (a syntax
	  # error); the pipes are restored and the indentation rebuilt.
	  class XmlParser
	    # Log file names embed Gazo::Time.pretty_day, evaluated at load time.
	    @@parser_logger = Logger.new(File.join(LOG_PATH, "leonardo_xml_parser_#{Gazo::Time.pretty_day}.log"))
	    @@error_logger = Logger.new(File.join(LOG_PATH, "leonardo_xml_parser_error_#{Gazo::Time.pretty_day}.log"))

	    # Resize geometry strings keyed by size code.
	    SIZES = { :A => '70x70!', :B => '100x100', :D => '200x200', :J => '480x480' }.freeze

	    # Base URL prepended to the image file names found in the XML.
	    IMG_PATH_PREFIX = "http://images.leonardo.com/imgs".freeze

	    class << self

	      # Parses every *.xml file in XML_DIR (sorted) and saves the image rows
	      # of each property. Files listed in PARSED_DIR/processed_xml.txt are
	      # skipped; each processed file is appended to that list.
	      def process!
	        # Create the output directories if they do not exist yet.
	        pre_process

	        processed_list = File.join(PARSED_DIR, "processed_xml.txt")

	        # Read the list once and match whole lines (a substring test would
	        # treat "a.xml" as processed once "a.xml.xml" is listed).
	        already_processed = {}
	        if File.size?(processed_list)
	          File.foreach(processed_list) { |line| already_processed[line.chomp] = true }
	        end

	        # Keyed by the ID's string form; Nokogiri nodes compare by identity
	        # and would never be reported as duplicates.
	        seen_properties = {}

	        Dir[File.join(XML_DIR, '*.xml')].sort.each_with_index do |file, idx|
	          puts file

	          if already_processed[file]
	            @@parser_logger.debug("#{file} have been processed earlier..., skip to next")
	            next
	          end

	          parse(file, idx).each do |leonardo_id, images|
	            next if leonardo_id.nil?
	            @@parser_logger.debug("Images for Property ID: #{leonardo_id}, Image URLs: #{images.size}")

	            property_key = leonardo_id.to_s
	            if seen_properties.key?(property_key)
	              @@parser_logger.debug("#{leonardo_id} is already exists!!!")
	            else
	              seen_properties[property_key] = true
	            end

	            save_image_urls_for(leonardo_id, :with => images)
	          end

	          append_to_file(processed_list, file)
	          already_processed[file] = true
	        end

	        @@parser_logger.debug("Total properties-keys: #{seen_properties.size}")
	      end

	      private

	      # Ensures IMG_DIR and PARSED_DIR exist (last path component only).
	      def pre_process
	        [IMG_DIR, PARSED_DIR].each do |dir|
	          # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
	          Dir.mkdir(dir) unless File.exist?(dir)
	        end
	      end

	      # Appends one CSV row per image in opts[:with] to
	      # PARSED_DIR/<leonardo_id>.csv. Images listed in
	      # PARSED_DIR/error_image_download.txt, or already present (non-empty)
	      # in IMG_DIR, are skipped; a failing row is logged and its URL is added
	      # to the error list.
	      def save_image_urls_for(leonardo_id, opts = {})
	        error_list = File.join(PARSED_DIR, "error_image_download.txt")
	        csv_file   = File.join(PARSED_DIR, "#{leonardo_id}.csv")

	        # Read the error list once per call and compare whole lines.
	        failed_urls = {}
	        if File.size?(error_list)
	          File.foreach(error_list) { |line| failed_urls[line.chomp] = true }
	        end

	        (opts[:with] || []).each do |img|
	          if failed_urls[img.url]
	            @@parser_logger.debug("404 Image, skip #{img.url}")
	            next
	          end

	          # File.basename also copes with a URL that contains no "/".
	          fname = File.join(IMG_DIR, File.basename(img.url))
	          next if File.size?(fname)

	          begin
	            #TODO temporary commented for testing
	            #download_image(img.url, fname)
	            append_to_file(csv_file, img.to_csv)

	            # Download only largest image and resize for the rest
	            # resize_and_save(img, fname, leonardo_id)
	          rescue StandardError => e
	            @@error_logger.debug("Failed to download #{fname} - #{e}")
	            append_to_file(error_list, img.url)
	            failed_urls[img.url] = true
	          end
	        end
	      end

	      # Writes the A, B and D renditions of the "_J.jpg" image at +fname+ and
	      # appends a CSV row for each. +img+ must respond to #clone, #url,
	      # #url=, #width=, #height= and #to_csv.
	      def resize_and_save(img, fname, leonardo_id)
	        csv_file = File.join(PARSED_DIR, "#{leonardo_id}.csv")

	        [:A, :B, :D].each do |size_id|
	          suffix = "_#{size_id}.jpg"
	          new_fname = fname.gsub('_J.jpg', suffix)
	          @@parser_logger.debug("Resizing image to #{new_fname} to #{SIZES[size_id]}")

	          # Load the J source afresh for every size.
	          image = MiniMagick::Image.from_file(fname)
	          image.resize SIZES[size_id]
	          image.write(new_fname)

	          new_img = img.clone
	          new_img.width, new_img.height = image[:width], image[:height]
	          # clone is shallow: gsub! on new_img.url would also rewrite img.url
	          # when both share one String, so later sizes would not find
	          # "_J.jpg". Assign a fresh string instead.
	          new_img.url = img.url.gsub('_J.jpg', suffix)
	          append_to_file(csv_file, new_img.to_csv)
	        end
	      end

	      # Appends +content+ to +filename+ via IO#puts, creating the file if needed.
	      def append_to_file(filename, content)
	        File.open(filename, 'a') { |io| io.puts(content) }
	      end

	      # Downloads the URL +image_path+ into +filename+ (binary mode).
	      # Relies on open-uri being loaded elsewhere.
	      def download_image(image_path, filename)
	        @@parser_logger.debug("downloading #{filename}...")

	        # URI#open (open-uri) instead of Kernel#open: works on Ruby 3, cannot
	        # run a "|command", and the block form closes the remote stream.
	        data = URI.parse(image_path).open { |remote| remote.read }

	        # Write only after the download succeeded.
	        File.open(filename, 'wb') { |f| f.write(data) }
	      end

	      # Parses one XML file. Returns a Hash mapping each Property "Id" value
	      # (String, or nil when missing) to an Array of Image objects.
	      # +idx+ is the zero-based file position, used for logging only.
	      def parse(file, idx)
	        @@parser_logger.debug("Examining File\##{idx+1}: #{file}")

	        images_hash = {}

	        # Block form closes the file handle once the document is built.
	        doc = File.open(file) { |xml| Nokogiri::XML(xml) }

	        doc.xpath('//Property').each do |property|
	          # Attribute value rather than the attribute node: node keys compare
	          # by identity and keep the document referenced from the result.
	          id = property['Id']
	          name = property.at('LeonardoName').text

	          property.xpath('.//Image').each do |img|
	            get_all_image_data_from(img).each do |row|
	              img_code, img_name, img_caption, img_category, img_description,
	                img_url, img_orig_size, img_width, img_height = row

	              image = Image.new(:property_id => id, :property_name => name,
	                                :url => "#{IMG_PATH_PREFIX}#{img_url}",
	                                :img_code => img_code, :img_name => img_name, :category => img_category,
	                                :caption => img_caption, :description => img_description,
	                                :width => img_width, :height => img_height, :original_size => img_orig_size
	                                )

	              (images_hash[id] ||= []) << image
	            end
	          end
	        end

	        @@parser_logger.debug("size: #{images_hash.size}")
	        @@parser_logger.debug("Finished examining file: #{file}")

	        images_hash
	      end

	      # Returns one row per <File> of +img+ whose Name ends in "J.jpg":
	      #   [code, name, caption, category, description, url, original_size, width, height]
	      def get_all_image_data_from(img)
	        sizes = get_all_sizes_for(img.at('Sizes'))

	        # Get the metadata
	        name = img.at('ImageName')['Val']
	        code = img.at('LeonardoCode')['Val']
	        caption = img.xpath('.//Caption//en').text
	        description = img.xpath('.//Description//en').text
	        category = img.at('Category')['Val']
	        original = img.at('OriginalSize')
	        img_orig_size = "#{original['Width']}x#{original['Height']}"

	        images = []
	        img.xpath('.//File').each do |img_f|
	          url = img_f['Name']
	          # nil for names not ending in ".jpg", so those files are skipped
	          # instead of raising.
	          size_category = url.to_s[/(\w)\.jpg$/, 1]
	          # next unless %w{A B D J}.include?(size_category)
	          next unless size_category == "J"

	          width, height = sizes[size_category]
	          images << [code, name, caption, category, description, url, img_orig_size, width, height]
	        end

	        images
	      end

	      # Returns a Hash mapping each Size's "Sign" to [Width, Height];
	      # empty when +img_size+ is nil.
	      def get_all_sizes_for(img_size)
	        return {} if img_size.nil?

	        img_size.xpath('.//Size').each_with_object({}) do |size, sizes|
	          sizes[size['Sign']] = [size['Width'], size['Height']]
	        end
	      end

	    end

	  end
	end