Skip to content

Instantly share code, notes, and snippets.

@rymai
Created April 18, 2010 19:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rymai/33df1990c11830e94022 to your computer and use it in GitHub Desktop.
Save rymai/33df1990c11830e94022 to your computer and use it in GitHub Desktop.
# (c) Copyright 2010 Rémy Coutable. All Rights Reserved.
# RPCFN: XML Transformer (#8) : http://rubylearning.com/blog/2010/04/07/rpcfn-xml-transformer-8/
#
# You can test XML transformation with:
# - files in the cloud (default): ruby xml_transformer.rb
# - local files: ruby xml_transformer.rb local
#
# My solution works as follows:
# 1. It defines a hierarchical structure (such as {:name => [:first, :last], :birth => [:place, :date], ...}).
# 2. It parses the XML document (local file or URL) and stores the recognized information
# according to this hierarchical structure.
# 3. It populates a new normalized XML document with the collected information.
#
# My solution skips the unrecognized nodes nested between 2 recognized nodes
# (i.e. in <name><useless><first_name>Rémy</first_name></useless></name>, the <useless> level will be ignored,
# and the useful information will be stored as [{:name=>{:first=>'Rémy'}}]).
#
# For the moment it can only store information that is 2 levels deep
# (i.e. :name=>{:first} but not :name=>{:first=>{:real, :fake}}).
# With more work, though, it could handle deeper levels.
require 'nokogiri'
require 'open-uri'
module Rymai
  # Reads a "people" XML document (local file or http/https URL), recognizes
  # the nodes whose names match the regular expressions below, and rebuilds a
  # normalized document shaped as
  # <people><person><name><first>..</first>..</name>..</person>..</people>.
  class XmlTransformer
    attr_accessor :filename, :original_xml, :normalized_xml

    # Captures (as group 2) the last "[n]" index found in a node path; explore
    # uses it as the 1-based position of the person the node belongs to.
    PARENT_INDEX_REGEX = /\D*(\[\d+\])*\D*\[(\d+)\]\D*/

    # Only sources starting with an http(s) scheme are fetched remotely.
    # Anchoring matters: URI.open hands anything that is not a URL over to
    # Kernel#open.
    URL_REGEX = %r{\Ahttps?://}

    NAME_REGEX = /name/
    BIRTH_REGEX = /birth/
    ADDRESS_REGEX = /address/
    FATHER_REGEX = /father/
    MOTHER_REGEX = /mother/
    FIRST_REGEX = /_?first_?/
    LAST_REGEX = /_?(sur|last|family)_?/
    PLACE_REGEX = /_?place_?/
    DATE_REGEX = /_?date_?/
    STREET_REGEX = /_?street_?/
    NUMBER_REGEX = /_?number_?/
    POSTAL_CODE_REGEX = /_?postal_?code_?/
    CITY_REGEX = /_?city_?/
    COUNTRY_REGEX = /_?country_?/

    # Normalized node name => pattern that a source node name must match.
    REGEX_MAPPING = {
      :name => NAME_REGEX,
      :birth => BIRTH_REGEX,
      :address => ADDRESS_REGEX,
      :first => FIRST_REGEX,
      :last => LAST_REGEX,
      :place => PLACE_REGEX,
      :date => DATE_REGEX,
      :street => STREET_REGEX,
      :number => NUMBER_REGEX,
      :postal_code => POSTAL_CODE_REGEX,
      :city => CITY_REGEX,
      :country => COUNTRY_REGEX
    }.freeze

    # Normalized level 1 node name => its allowed level 2 node names.
    HIERARCHY = {
      :name => [:first, :last],
      :birth => [:place, :date],
      :address => [:street, :number, :postal_code, :city, :country]
    }.freeze

    # Loads and parses the XML source.
    #
    # +filename+ is either an http(s) URL or the path of a local file. When it
    # is neither, a warning is printed and original_xml stays nil, which makes
    # #transform return false.
    def initialize(filename)
      @filename = filename
      document = load_document
      @original_xml = document if document
      @normalized_xml = "\nNo normalized XML yet\n"
    end

    # Builds the normalized document from the recognized nodes of the source.
    #
    # Returns false when there is nothing to transform (no document could be
    # loaded, or the document has no root element), true otherwise. On success
    # normalized_xml holds the new Nokogiri document.
    def transform
      return false unless @original_xml

      root = @original_xml.root
      return false unless root

      @normalized_array = []
      explore(root.path)

      document = Nokogiri::XML::Document.new
      document.encoding = 'UTF-8'
      people_node = document.create_element('people')
      @normalized_array.each do |person_infos|
        # Slots are filled by person index, so a person without any recognized
        # information leaves a nil hole; render it as an empty <person>.
        people_node << build_person_node(document, person_infos || {})
      end
      document << people_node
      @normalized_xml = document
      true
    end

    # Returns the normalized XML as a string (or the placeholder text when
    # #transform has not succeeded yet).
    def to_s
      @normalized_xml.to_s
    end

    # Returns the normalized level 1 node name (a HIERARCHY key) for +node+.
    # The node itself is tried first, then its ancestors in the order given by
    # node.ancestors. Returns nil if nothing matches.
    #
    # For example, a node named 'first_name' yields :name, and so does an
    # unrecognized node nested inside a <name> element.
    #
    # NOTE: defined with "def self.", so this method is public even though the
    # original file declared it below "private".
    def self.get_level_1(node)
      level_1 = detect_level_1(node)
      return level_1 if level_1

      node.ancestors.each do |ancestor|
        level_1 = detect_level_1(ancestor)
        return level_1 if level_1
      end
      nil
    end

    # Given a +level_1+ node name, returns the normalized level 2 node name
    # matching the name of +node+, or nil when none matches (including when
    # +level_1+ is not a HIERARCHY key).
    def self.get_level_2(level_1, node)
      HIERARCHY.fetch(level_1, []).find do |node_name|
        REGEX_MAPPING[node_name].match(node.name)
      end
    end

    # Returns the normalized level 1 node name for +node+, looking at the name
    # of that node only. Level 1 patterns are tried first; if none matches, the
    # level 2 patterns are tried and the owning level 1 name is returned.
    # Returns nil when nothing matches.
    def self.detect_level_1(node)
      node_name = node.name
      direct = HIERARCHY.keys.find { |father_node| REGEX_MAPPING[father_node].match(node_name) }
      return direct if direct

      HIERARCHY.each do |father_node, sons_nodes|
        return father_node if sons_nodes.any? { |son| REGEX_MAPPING[son].match(node_name) }
      end
      nil
    end

    private

    # Opens the source named by @filename and parses it, closing the underlying
    # IO afterwards. Returns the parsed document, or nil (after printing a
    # warning) when the source is neither a URL nor an existing file.
    def load_document
      if @filename =~ URL_REGEX
        # Kernel#open no longer handles URLs on Ruby 3.0+; URI.open does.
        URI.open(@filename) { |io| Nokogiri::XML::Document.parse(io) }
      elsif File.file?(@filename)
        File.open(@filename) { |io| Nokogiri::XML::Document.parse(io) }
      else
        print "\n!Warning! #{filename} is not a readable file!\n"
        nil
      end
    end

    # Builds one <person> element of +document+ from +person_infos+, a hash of
    # the form {level_1_name => {level_2_name => text}}.
    def build_person_node(document, person_infos)
      person_node = document.create_element('person')
      person_infos.each do |level_1_name, level_2_infos|
        level_1_node = document.create_element(level_1_name.to_s)
        level_2_infos.each do |level_2_name, text|
          if level_2_name == :text
            # explore never stores a :text key today; the branch is kept so
            # that text placed directly under a level 1 node is still rendered.
            level_1_node << document.create_text_node(text)
          else
            level_2_node = document.create_element(level_2_name.to_s)
            level_2_node << document.create_text_node(text)
            level_1_node << level_2_node
          end
        end
        person_node << level_1_node
      end
      person_node
    end

    # Recursively walks the element nodes found below +x_path+ and records the
    # recognized ones in @normalized_array.
    def explore(x_path)
      @original_xml.xpath(x_path).each do |data|
        data.children.each do |node|
          next unless node.element?

          store_node(node)
          explore(node.path)
        end
      end
    end

    # Stores the text of the first child of +node+ under
    # @normalized_array[person][level_1][level_2] when both levels are
    # recognized and the parent path carries a person index.
    #
    # NOTE(review): this relies on the parent path containing a "[n]" index; it
    # looks like documents holding a single person would produce no index and
    # therefore store nothing - confirm against Nokogiri's Node#path.
    def store_node(node)
      parent_index = PARENT_INDEX_REGEX.match(node.parent.path)
      return unless parent_index

      level_1 = XmlTransformer.get_level_1(node)
      return unless level_1

      level_2 = XmlTransformer.get_level_2(level_1, node)
      return unless level_2

      # An empty element (e.g. <first_name/>) has no child to read text from.
      first_child = node.child
      return unless first_child

      person_infos = (@normalized_array[parent_index[2].to_i - 1] ||= {})
      (person_infos[level_1] ||= {})[level_2] = first_child.text
    end
  end
end
# Sample documents hosted remotely, addressed by their short ids.
XML_FILES_IDS_IN_THE_CLOUD = %w[UMB US3 UaY VIm].map do |id|
  "http://cl.ly/#{id}/content"
end

# The same sample documents, expected in the current working directory.
XML_LOCAL_FILES_NAMES = %w[source1.xml source2.xml source3.xml source_custom.xml]

# Passing "local" (any letter case) as the first command-line argument selects
# the local files; otherwise the remote ones are used.
SOURCE =
  if ARGV.first.to_s.downcase == "local"
    XML_LOCAL_FILES_NAMES
  else
    XML_FILES_IDS_IN_THE_CLOUD
  end

origin_label = SOURCE == XML_FILES_IDS_IN_THE_CLOUD ? 'from the cloud' : 'locally'
print "\nTransforming XML sources #{origin_label}:\n"

# Run Rymai::XmlTransformer over every source (including the custom one with
# additional fields) and print each successfully transformed document.
SOURCE.each do |source_name|
  xml_transformer = Rymai::XmlTransformer.new(source_name)
  next unless xml_transformer.transform

  print "\n#{source_name} after transformation:\n"
  print xml_transformer.to_s
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment