Last active
August 29, 2015 14:07
-
-
Save weapp/18bd85bf244fb33c54a5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "pp"

# NOTE: this top-level local is not visible inside method bodies (`def`
# opens a fresh scope), so tags_from_file does not read it and uses its own
# chunk size. Nothing else in this script reads it either.
block_size = 4_096
# Streams an XML-like file as a sequence of lexical tokens.
#
# The file is opened as ISO-8859-15 and read in chunks; only the part of the
# buffer that forms complete tokens is consumed, the rest waits for more
# input.
#
# Every yielded token is a six-element array with exactly one non-nil slot,
# one slot per alternative of the token pattern, in this order:
#
#   0. text between tags
#   1. closing tag            </name>
#   2. self-closing tag       <name ... />
#   3. CDATA section          <![CDATA[ ... ]]>
#   4. tag whose first character after "<" is neither "!" nor a letter
#   5. any other <...> construct (opening tags; also "<!...>" constructs
#      that are not CDATA)
#
# A text run is only emitted once it is known to be complete (a "<" follows
# it, or the file has ended), so a text node is never split in two just
# because it straddles a chunk boundary.
#
# Known limitations, unchanged from the token pattern itself:
# * a bare ">" outside a tag matches no alternative, so tokenizing makes no
#   further progress from that point;
# * "." does not match newlines here, so a CDATA section spanning several
#   lines is not recognised as CDATA;
# * an unterminated tag left in the buffer at end of file is dropped.
#
# @param filename [String] path of the file to read
# @param block_size [Integer] limit passed to IO#gets for each chunk
# @return [Enumerator] yields one six-element array per token
def tags_from_file(filename, block_size: 4_096)
  token = /([^<>]+)|(<\/[^<>]+>)|(<[^<>]+?\/>)|(<\!\[CDATA\[(?:.*?)\]\]>)|(<[^!a-zA-Z][^<>]+>)|(<[^<>]+>)/
  # Longest run of complete tokens anchored at the start of the buffer.
  complete_prefix = /\A(?:#{token})*/

  Enumerator.new do |enum|
    File.open(filename, "r:ISO-8859-15") do |f|
      buf = "".dup
      loop do
        block = f.gets(nil, block_size) # nil once the file is exhausted
        buf << block if block

        # Portion of the buffer made of complete tokens.
        consumed = complete_prefix.match(buf).to_s
        tokens = consumed.scan(token)

        # If the buffer ends in text and more input may follow, that text
        # could continue in the next chunk: keep it buffered for now.
        if block && consumed.length == buf.length && !tokens.empty? && tokens.last[0]
          pending_text = tokens.pop[0]
          consumed = consumed[0, consumed.length - pending_text.length]
        end

        # Keep whatever follows the consumed portion for the next round.
        buf = buf[consumed.length..-1]

        # Hand out the tokens found in this round.
        tokens.each { |t| enum.yield(t) }
        break unless block
      end
    end
  end
end
# Matches a stack of open elements against a whitespace-separated list of
# tag names, read like a descendant selector: the last name must be the
# innermost element, and every earlier name must appear, in order, somewhere
# among its ancestors (not necessarily as direct parents).
class Selector
  # @param selector [String] tag names separated by whitespace, outermost first
  def initialize(selector)
    @selector = selector
    @search = selector.split(" ")
  end

  # Tells whether the innermost element of +hierarchy+ is selected.
  #
  # @param hierarchy [Array<Hash>] open elements, outermost first; only the
  #   +:tagname+ entry of each element is read, and the array is not modified
  # @return [Boolean]
  def match?(hierarchy)
    tags = hierarchy.map { |h| h[:tagname] }
    wanted = @search.dup

    # The innermost element must be the last name of the selector.
    return false unless tags.pop == wanted.pop

    # Walk the remaining names from innermost to outermost, always taking the
    # nearest matching ancestor so the outer names keep as much room as
    # possible.
    wanted.reverse_each do |name|
      at = tags.rindex(name)
      return false unless at

      tags = tags[0...at]
    end
    true
  end
end
# Lazily extracts the elements of a file that are picked by any selector.
#
# The token stream from tags_from_file is replayed while a stack of the
# currently open elements is maintained. Whenever an element closes (or is
# self-closing) and a selector matches the stack, a snapshot of the stack is
# yielded: the path from the outermost open element down to the matched one.
#
# Each element is a Hash with
#   :tagname - the tag name, or nil when none could be read from the tag
#   :content - the raw text of the opening (or self-closing) tag
#   :childs  - only present while content is being collected: nested element
#              hashes, non-blank text runs and CDATA payloads, in document
#              order
#
# Collection starts when an opening tag makes a selector match and stops as
# soon as any matching element is closed.
#
# The yielded array is a shallow copy, so it stays intact while parsing
# continues; the element hashes inside it are shared with the parser.
# An element is yielded once per selector that matches it.
#
# @param filename [String] path handed to tags_from_file
# @param selectors [String, Array<String>] one or more selector strings
# @return [Enumerator] yields Array<Hash> paths; may be enumerated repeatedly
def extract_from_file(filename, selectors)
  # Built once, outside the enumerator block, so that re-enumerating does not
  # try to wrap already-built Selector objects again.
  matchers = Array(selectors).map { |s| Selector.new(s) }

  Enumerator.new do |enum|
    saving_childs = false
    hierarchy = []
    childs = []

    tags_from_file(filename).each do |no_tag, close_tag, inline_tag, cdata, _other_tag, tag|
      opening = tag || inline_tag

      if opening
        parent_childs = childs
        childs = []
        node = { tagname: opening[/<\/?(\w+)/, 1], content: opening }
        parent_childs << node if saving_childs
        hierarchy << node
        saving_childs ||= matchers.any? { |matcher| matcher.match?(hierarchy) }
        node[:childs] = childs if saving_childs
      end

      if saving_childs
        childs << no_tag if no_tag && !no_tag.strip.empty?
        childs << cdata[/<!\[CDATA\[(.*)\]\]>/, 1] if cdata
      end

      next unless close_tag || inline_tag

      matchers.each do |matcher|
        next unless matcher.match?(hierarchy)

        # Snapshot: the live stack is popped right below.
        enum.yield hierarchy.dup
        saving_childs = false
      end

      hierarchy.pop
      # Back to the parent's child list; there is no parent once the
      # outermost element has been closed.
      parent = hierarchy.last
      childs = parent ? parent[:childs] : []
    end
  end
end
# Demo run: pretty-print the first ten matches found in Export.xml, each one
# preceded by an empty line.
wanted = [
  "Feature", "Category", "Hierarchy Link", "FeatureLink", "Object",
  "Sku", "Pack", "CrossSell Link", "Alternative Link"
]
tags = extract_from_file("Export.xml", wanted)
tags.first(10).each do |h|
  puts
  pp h
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment