Skip to content

Instantly share code, notes, and snippets.

@weapp
Last active August 29, 2015 14:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save weapp/18bd85bf244fb33c54a5 to your computer and use it in GitHub Desktop.
Save weapp/18bd85bf244fb33c54a5 to your computer and use it in GitHub Desktop.
require "pp"
# NOTE(review): this is a top-level local variable. Ruby `def` bodies do not
# close over outer locals, so tags_from_file never reads this value, and no
# other visible top-level code reads it either.
block_size = 4_096
# Streams a markup file as a lazy sequence of tokens without loading the whole
# file into memory.
#
# The file is opened as ISO-8859-15 and read in chunks of at most +block_size+
# bytes. Each yielded token is the six-element capture array produced by
# String#scan, in which exactly one slot is non-nil:
#
#   [text, closing_tag, self_closing_tag, cdata_section, other_markup, opening_tag]
#
# where +other_markup+ is a `<...>` construct whose first character after `<`
# is neither a letter nor `!` (for example `<?xml ... ?>`).
#
# A text run that reaches the end of the current buffer is held back until the
# next chunk (or end of file) arrives, so a text node is never split in two
# just because it straddles a chunk boundary.
#
# @param filename [String] path of the file to tokenize
# @param block_size [Integer] maximum number of bytes read per chunk
# @return [Enumerator] yields one capture array per token
# @raise [ArgumentError] if +block_size+ is not positive
def tags_from_file(filename, block_size: 4_096)
  # A non-positive limit would make IO#gets return "" forever.
  raise ArgumentError, "block_size must be positive" unless block_size > 0

  token = /([^<>]+)|(<\/[^<>]+>)|(<[^<>]+?\/>)|(<\!\[CDATA\[(?:.*?)\]\]>)|(<[^!a-zA-Z][^<>]+>)|(<[^<>]+>)/
  # Longest run of complete tokens at the start of the buffer. Because a named
  # group is present, the numbered groups of +token+ do not capture here.
  prefix = /\A(?<p>(?:#{token}))*\g<p>*/

  Enumerator.new do |enum|
    File.open(filename, "r:ISO-8859-15") do |f|
      buf = ""
      eof = false
      until eof
        block = f.gets(nil, block_size)
        if block
          buf += block
        else
          # One final pass so text held back from the last chunk is flushed.
          eof = true
        end

        # Portion of the buffer that tokenizes cleanly.
        consumed = prefix.match(buf).to_s
        tokens = consumed.scan(token)

        # A trailing text token that ends exactly at the end of the buffer may
        # continue in the next chunk: keep it in the buffer instead of emitting
        # a partial text node.
        last_text = tokens.empty? ? nil : tokens.last[0]
        if !eof && last_text && consumed.length == buf.length
          tokens.pop
          consumed = consumed[0, consumed.length - last_text.length]
        end

        # Keep whatever follows the consumed portion (e.g. a tag that is still
        # incomplete) for the next round.
        buf = buf[consumed.length..-1]

        # Hand the complete tokens to the consumer.
        tokens.each { |t| enum.yield(t) }
      end
    end
  end
end
# A whitespace-separated descendant selector over tag names, e.g.
# "Hierarchy Link": the last name must be the innermost (current) element and
# every earlier name must appear, in order, somewhere among its ancestors.
class Selector
  # @param selector [String] tag names separated by spaces, outermost first
  def initialize(selector)
    @selector = selector
    @search = selector.split(" ")
  end

  # Tests the selector against the current element path.
  #
  # @param hierarchy [Array<Hash>] open elements from outermost to innermost;
  #   each entry is read through its +:tagname+ key
  # @return [Boolean] whether the path satisfies the selector
  def match?(hierarchy)
    tags = hierarchy.map { |h| h[:tagname] }
    matching?(tags, @search.dup, true)
  end

  private

  # Recursive worker; consumes +tags+ and +searched+ from the right.
  #
  # With +last+ true, the innermost tag must equal the final selector part.
  # After that, each remaining part only has to occur somewhere further out,
  # so every recursive call passes +last+ as false explicitly (relying on the
  # default of true would wrongly demand adjacency / a root-level ancestor).
  def matching?(tags, searched, last = true)
    if last
      (tags.pop == searched.pop) && matching?(tags, searched, false)
    elsif searched.empty?
      true
    else
      wanted = searched.pop
      # Rightmost occurrence leaves the most ancestors available for the
      # selector parts that still have to be found.
      i = tags.rindex(wanted)
      !i.nil? && matching?(tags[0...i], searched, false)
    end
  end
end
# Lazily extracts every element of +filename+ that matches one of +selectors+.
#
# Tokens come from tags_from_file. While the stream is consumed, a stack of
# currently open elements (+hierarchy+) is maintained; each entry is a Hash
# with +:tagname+, +:content+ (the raw opening or self-closing tag text) and,
# once a selector has matched at or above it, +:childs+ (nested text, CDATA
# payloads and child nodes).
#
# When a closing or self-closing tag ends an element whose path matches a
# selector, a snapshot of the path (outermost first, matched element last) is
# yielded once per matching selector.
#
# @param filename [String] path handed to tags_from_file
# @param selectors [String, Array<String>] selector strings for Selector.new
# @return [Enumerator] yields Array<Hash> element paths
def extract_from_file(filename, selectors)
  # Built once, outside the enumerator body, so iterating the enumerator a
  # second time does not try to wrap Selector objects in Selector again.
  matchers = Array(selectors).map { |s| Selector.new(s) }

  Enumerator.new do |enum|
    saving_childs = false
    hierarchy = []
    childs = []

    tags_from_file(filename).each do |no_tag, close_tag, inline_tag, cdata, _other_tag, tag|
      opening = tag || inline_tag
      name_match = /<\/?(\w+)/.match(close_tag || opening)
      tagname = name_match && name_match[1]

      if opening
        # Self-closing tags keep their own text as content (the plain
        # opening-tag capture is nil for them).
        node = { tagname: tagname, content: opening }
        childs << node if saving_childs
        childs = []
        hierarchy << node
        saving_childs ||= matchers.any? { |matcher| matcher.match?(hierarchy) }
        node[:childs] = childs if saving_childs
      end

      if saving_childs
        childs << no_tag if no_tag && !no_tag.strip.empty?
        childs << /<!\[CDATA\[(.*)\]\]>/.match(cdata)[1] if cdata
      end

      next unless close_tag || inline_tag

      matchers.each do |matcher|
        next unless matcher.match?(hierarchy)

        # Yield a copy: the live stack is popped right below, and consumers
        # that collect results must not all end up holding the same array.
        enum.yield hierarchy.dup
        saving_childs = false
      end

      hierarchy.pop
      # After the outermost element closes there is no parent left; fall back
      # to a fresh list instead of calling [] on nil.
      parent = hierarchy.last
      childs = (parent && parent[:childs]) || []
    end
  end
end
# Demo driver: pull the matching elements out of Export.xml and pretty-print
# the first ten element paths, each preceded by a blank line.
wanted_selectors = [
  "Feature", "Category", "Hierarchy Link", "FeatureLink", "Object",
  "Sku", "Pack", "CrossSell Link", "Alternative Link"
]
tags = extract_from_file("Export.xml", wanted_selectors)
tags.first(10).each do |path|
  puts
  pp path
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment