Skip to content

Instantly share code, notes, and snippets.

@dvdbng
Created February 6, 2012 19:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dvdbng/1754258 to your computer and use it in GitHub Desktop.
Save dvdbng/1754258 to your computer and use it in GitHub Desktop.
Prune a DOM tree, keeping its structure and the nodes for which a function is true
try:
    import xml.etree.cElementTree as etree
except ImportError:
    # cElementTree was removed in Python 3.9; ElementTree offers the same API.
    import xml.etree.ElementTree as etree
def traverseEtree(tree, traverser):
    """Walk ``tree`` depth-first and report each element to ``traverser``.

    For every element, in document order, the traverser receives
    ``start(element)``, ``data(element.text)``, then the events of each
    child, then ``end(element)`` and finally ``data(element.tail)``.

    The walk keeps its own stack instead of recursing, so deeply nested
    trees do not run into the interpreter's recursion limit.

    Args:
        tree: Root element; must be iterable over its children and expose
            ``text`` and ``tail`` attributes.
        traverser: Object providing ``start``, ``end`` and ``data`` methods.
    """
    exhausted = object()  # sentinel: the current element has no more children
    traverser.start(tree)
    traverser.data(tree.text)
    # Each entry pairs an open element with the iterator over its children.
    pending = [(tree, iter(tree))]
    while pending:
        node, children = pending[-1]
        child = next(children, exhausted)
        if child is exhausted:
            pending.pop()
            traverser.end(node)
            traverser.data(node.tail)
        else:
            traverser.start(child)
            traverser.data(child.text)
            pending.append((child, iter(child)))
class Traverser(object):
    """Event sink that rebuilds a pruned copy of the tree being walked.

    ``start``/``end`` are expected to be called for every element of a
    depth-first walk. Whenever ``fn(element)`` is true, the element and all
    of its currently open ancestors are replayed into an
    ``etree.TreeBuilder``; elements that neither match nor contain a match
    are left out. Text is ignored (see ``data``).
    """

    def __init__(self, fn):
        """Create a traverser that keeps the elements for which ``fn`` is true."""
        self.stack = []       # elements currently open in the source walk
        self.res = etree.TreeBuilder()
        self.last = None      # most recently kept element, None until a match
        self.lastStack = []   # copy of ``stack`` taken at the last match
        self.fn = fn

    def start(self, elm):
        """Record that ``elm`` was opened and keep it if ``fn`` accepts it."""
        self.stack.append(elm)
        if self.fn(elm):
            self.addCurrent(elm)

    def end(self, elm):
        """Record that the innermost open element was closed."""
        self.stack.pop()

    def getCommonAncestor(self):
        """Return the deepest parent of the current element in ``lastStack``.

        Returns ``None`` when no parent of the current element was open at
        the time of the previous match.
        """
        for elm in reversed(self.stack[:-1]):  # parents of the current element
            if elm in self.lastStack:
                return elm
        return None

    def addCurrent(self, elm):
        """Emit builder events so that ``elm`` and its ancestors are output."""
        if self.last is None:
            # First match: open the whole ancestor chain.
            for node in self.stack:
                self._open(node)
        else:
            ant = self.getCommonAncestor()
            # Close what was opened for the previous match, innermost first,
            # stopping at the ancestor shared with the current element.
            for node in reversed(self.lastStack):
                if node is ant:
                    break
                self.res.end(node.tag)
            # Open only the part of the current chain below that ancestor.
            afterCommon = False
            for node in self.stack:
                if afterCommon:
                    self._open(node)
                elif node is ant:
                    afterCommon = True
        self.last = elm
        self.lastStack = list(self.stack)

    def _open(self, node):
        """Start ``node`` in the builder with a copy of its attributes."""
        # Copy so the pruned tree never shares attribute dicts with the
        # source elements.
        self.res.start(node.tag, dict(node.attrib))

    def close(self):
        """Close every element still open in the builder and return the root.

        Returns ``None`` when no element matched, since nothing was built.
        """
        if self.last is None:
            return None
        # ``stack`` is empty once the walk has finished; the elements still
        # open in the builder are the ones recorded at the last match.
        for node in reversed(self.lastStack):
            self.res.end(node.tag)
        self.lastStack = []
        return self.res.close()

    def data(self, data):
        """Ignore text: the pruned tree carries no text or tail content."""
        pass
def pruneTree(tree, fn):
    """Return a pruned copy of ``tree`` keeping the elements ``fn`` accepts.

    A ``Traverser`` built around ``fn`` is fed every element of ``tree``
    through ``traverseEtree``; the value of its ``close()`` call is the
    result.

    Args:
        tree: Root element of the tree to prune.
        fn: Predicate called with each element; true means "keep".
    """
    pruner = Traverser(fn)
    traverseEtree(tree, pruner)
    pruned = pruner.close()
    return pruned
if __name__ == "__main__":
    # Demo: prune a small document down to its "keep" elements.
    tree = etree.fromstring("""
<a>
<b>
<c>
<d/>
<d/>
</c>
<d><keep/></d>
</b>
<keep/>
<b>
<b>
<keep>
<a></a>
</keep>
</b>
<keep/>
<c/>
</b>
</a>
""")
    # This returns a minimal tree with the same structure that contains all
    # the nodes with tag name "keep". Text nodes are not copied, so the real
    # output is a single line; it is laid out here only for readability:
    # <a>
    #   <b>
    #     <d><keep /></d>
    #   </b>
    #   <keep />
    #   <b>
    #     <b>
    #       <keep />
    #     </b>
    #     <keep />
    #   </b>
    # </a>
    pruned = pruneTree(tree, lambda e: e.tag == "keep")
    # print() with one argument works on Python 2 and 3; decoding avoids
    # printing a bytes repr on Python 3.
    print(etree.tostring(pruned).decode("utf-8"))
@dvdbng
Copy link
Author

dvdbng commented Feb 6, 2012

Manejar los nodos de texto se deja como un ejercicio para el lector.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment