Skip to content

Instantly share code, notes, and snippets.

@niklasl
Created October 17, 2010 14:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save niklasl/630906 to your computer and use it in GitHub Desktop.
Save niklasl/630906 to your computer and use it in GitHub Desktop.
Parses html files (in given dir, matching given glob) and prints out a tree representing the structure (css-selector style).
@Grab(group='org.ccil.cowan.tagsoup', module='tagsoup', version='0.9.7')
import org.ccil.cowan.tagsoup.Parser
/**
 * Parses HTML files and prints a tree representing their element structure,
 * one CSS-style selector (tag, optional #id, optional .classes) per line.
 *
 * Each node records which source files it occurs in, so the output can
 * annotate subtrees that appear in only a fraction of the pages. Ids matching
 * {@link #idSkipPattern} are "anonymized" by replacing the matched part with
 * {@link #genIdToken}, so generated ids do not produce one node per page.
 *
 * NOTE: only the first distinct top-level body selector is kept as
 * {@link #topItemPath}; pages whose body element yields a different selector
 * (other id or classes) end up in sibling trees that are not printed.
 */
class CSSTreeComputer {

    static final DEFAULT_GEN_ID_TOKEN = "GEN_ID"
    static final DEFAULT_ID_SKIP_PATTERN = /(?:[A-Z]{2,}\d.+)|(?:([-_])\w+\d+\w*.*)/

    /** The HTML files to analyze. */
    List<File> sources

    /** Regexp used to match irrelevant (generated) ids; null or empty disables anonymizing. */
    String idSkipPattern = null

    /** Literal text substituted for the part of an id matched by the skip pattern. */
    String genIdToken = DEFAULT_GEN_ID_TOKEN

    /** Root of the computed selector tree, or null if no body element was found. */
    ItemPath topItemPath

    /**
     * Stores the settings and immediately computes the selector tree.
     *
     * @param sources the HTML files to parse
     * @param idSkipPattern regexp for ids to anonymize, or null to keep all ids
     * @param genIdToken replacement text for anonymized ids
     */
    CSSTreeComputer(sources, idSkipPattern=null, genIdToken=DEFAULT_GEN_ID_TOKEN) {
        this.sources = sources
        this.idSkipPattern = idSkipPattern
        this.genIdToken = genIdToken
        this.topItemPath = computeCssTree()
    }

    /**
     * Parses every source with TagSoup and merges the body subtrees into a
     * single selector tree.
     *
     * @return the first child of the synthetic root, or null when nothing was parsed
     */
    protected ItemPath computeCssTree() {
        def slurper = new XmlSlurper(new Parser())
        def root = new ItemPath('')
        sources.each { source ->
            def html = slurper.parse(source)
            html.body.each { body ->
                // Identify the page by the File itself, not by its bare name:
                // files sharing a name in different directories must count as
                // separate pages or the frequency ratio is under-reported.
                buildItemPathTree(source, body, root)
            }
        }
        return root.children[0]
    }

    /**
     * Adds the selector for the given node beneath the parent path and
     * recurses into the node's children.
     *
     * @param source identity of the page the node comes from
     * @param node the parsed element
     * @param parent the tree node to attach the selector to
     */
    protected void buildItemPathTree(source, node, ItemPath parent) {
        def attrs = node.attributes()
        def selector = node.name()

        def rawId = attrs['id']
        if (rawId != null) {
            selector += "#${anonymizeId(rawId.toString())}"
        }

        def rawClasses = attrs['class']
        if (rawClasses != null) {
            selector += rawClasses.toString().split(/\s+/).collect { it ? ".${it}" : '' }.join("")
        }

        def current = parent.add(source, selector.toString())
        node.children().each { child ->
            buildItemPathTree(source, child, current)
        }
    }

    /**
     * Replaces the first part of the id matching the skip pattern with the
     * generated-id token, keeping capture group 1 (if the pattern has one) as
     * a prefix. Returns the id unchanged when no pattern is set or it does not match.
     */
    private String anonymizeId(String id) {
        if (!idSkipPattern) {
            return id
        }
        def skipMatcher = id =~ idSkipPattern
        if (!skipMatcher.find()) {
            return id
        }
        // A custom pattern may have no capture group; referencing $1 would then throw.
        def prefix = skipMatcher.groupCount() >= 1 ? '$1' : ''
        // Quote the token so any '$' or '\' in it is inserted literally.
        def token = java.util.regex.Matcher.quoteReplacement("${genIdToken}".toString())
        return skipMatcher.replaceFirst(prefix + token)
    }

    /**
     * Prints the tree rooted at the given path, one selector per line, indented
     * by nesting level. A node found in fewer than all sources is annotated
     * with its percentage unless its ratio equals that of its parent.
     *
     * @param out stream to print to
     * @param indent number of spaces per nesting level
     * @param itemPath subtree to print; nothing is printed when null
     * @param level current nesting level
     * @param parentRatio ratio of the parent node, used to suppress repeated notes
     */
    void output(out=System.out, indent=2, itemPath=topItemPath,
            level=0, parentRatio=0) {
        if (itemPath == null) {
            // No body element was found in any source; nothing to print.
            return
        }
        def total = sources ? sources.size() : 0
        // Guard against division by zero when there are no sources.
        def ratio = total ? itemPath.sources.size() / total : 0.0
        def note = ""
        if (ratio < 1.0 && ratio != parentRatio) {
            note = " /* in ${Math.round(ratio*100)}% */"
        }
        out.println((" " * indent * level) + itemPath.selector + note)
        itemPath.children.each {
            output(out, indent, it, level+1, ratio)
        }
    }
}
/**
 * A node in the selector tree: one selector, the sources it was seen in,
 * a link to its parent and the list of child nodes.
 */
class ItemPath {

    /** Sources in which this path has been seen. */
    Set<Object> sources = new HashSet<Object>()

    /** The selector text of this node. */
    String selector

    /** The enclosing node, or null for a root. */
    ItemPath parent

    /** Child nodes in the order they were first added. */
    List<ItemPath> children = []

    /**
     * Creates a node and, when a parent is given, appends it to that
     * parent's children.
     */
    ItemPath(selector, parent=null) {
        this.selector = selector
        if (parent) {
            parent.children.add(this)
            this.parent = parent
        }
    }

    /**
     * Returns the child with the given selector, creating it if it does not
     * exist yet, and records the source on that child.
     *
     * @param source the source the selector was seen in
     * @param selector the child's selector text
     * @return the existing or newly created child
     */
    ItemPath add(Object source, String selector) {
        ItemPath match = null
        for (ItemPath candidate in children) {
            if (candidate.selector == selector) {
                match = candidate
                break
            }
        }
        if (match == null) {
            // The constructor attaches the new node to this node's children.
            match = new ItemPath(selector, this)
        }
        match.sources.add(source)
        return match
    }
}
// Command-line entry point: BASE_DIR SOURCE_GLOB [ID_SKIP_PATTERN]
if (args.length < 2) {
    File script = new File(this.class.protectionDomain.codeSource.location.toURI())
    // Usage errors go to stderr with a non-zero status so callers can detect them.
    System.err.println "Usage: ${script.name} BASE_DIR SOURCE_GLOB [ID_SKIP_PATTERN]"
    System.exit 1
}

def base = args[0]
def fpattern = args[1]
// An optional third argument overrides the default pattern for generated ids.
def idSkipPattern = args.length > 2 ? args[2] : CSSTreeComputer.DEFAULT_ID_SKIP_PATTERN

// Collect every file below BASE_DIR whose name matches the glob.
def sources = new AntBuilder().fileScanner {
    fileset(dir: base, includes: "**/${fpattern}")
}.collect { it }

if (!sources) {
    // Without any input there is no tree to print; report it instead of failing later.
    System.err.println "No files matching '**/${fpattern}' found under '${base}'"
    System.exit 1
}

new CSSTreeComputer(sources, idSkipPattern).output()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment