Created
October 17, 2010 14:34
-
-
Save niklasl/630906 to your computer and use it in GitHub Desktop.
Parses html files (in given dir, matching given glob) and prints out a tree representing the structure (css-selector style).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
@Grab(group='org.ccil.cowan.tagsoup', module='tagsoup', version='0.9.7') | |
import org.ccil.cowan.tagsoup.Parser | |
/**
 * Parses html files and prints out a tree representing the structure
 * (css-selector style).
 *
 * Shows the frequency for how many pages a specific subtree appears in. Can
 * "anonymize" generated ID:s.
 *
 * The tree is computed eagerly in the constructor and kept in
 * {@link #topItemPath}; {@link #output} only prints it.
 */
class CSSTreeComputer {

    /** Token substituted for the part of an id matched by the skip pattern. */
    static final DEFAULT_GEN_ID_TOKEN = "GEN_ID"

    /**
     * Default pattern for ids considered generated. The optional capture
     * group 1 (a leading '-' or '_') is kept in front of the token.
     */
    static final DEFAULT_ID_SKIP_PATTERN = /(?:[A-Z]{2,}\d.+)|(?:([-_])\w+\d+\w*.*)/

    /** The html files to analyse. */
    List<File> sources

    /** Regexp used to match irrelevant (generated) id:s; null/empty disables anonymizing. */
    String idSkipPattern = null

    /** Replacement text for the matched part of a generated id. */
    String genIdToken = DEFAULT_GEN_ID_TOKEN

    /** First child of the synthetic root (normally the body element); null if nothing was parsed. */
    ItemPath topItemPath

    /**
     * Parses all sources immediately and stores the resulting tree.
     *
     * @param sources        files to parse
     * @param idSkipPattern  regexp for generated ids, or null to keep ids verbatim
     * @param genIdToken     text that replaces the matched part of an id
     */
    CSSTreeComputer(sources, idSkipPattern=null, genIdToken=DEFAULT_GEN_ID_TOKEN) {
        this.sources = sources
        this.idSkipPattern = idSkipPattern
        this.genIdToken = genIdToken
        this.topItemPath = computeCssTree()
    }

    /**
     * Parses every source with a TagSoup-backed XmlSlurper and merges the
     * body subtrees into one selector tree.
     *
     * @return the first child of the synthetic root, or null when no body
     *         element was found (e.g. no sources)
     */
    protected ItemPath computeCssTree() {
        def slurper = new XmlSlurper(new Parser())
        def root = new ItemPath('')
        sources.each { source ->
            def html = slurper.parse(source)
            html.body.each {
                // Key by the File itself, not by its simple name: files with
                // the same name in different directories are distinct pages
                // and must not collapse into one entry of ItemPath.sources.
                buildItemPathTree(source, it, root)
            }
        }
        return root.children ? root.children[0] : null
    }

    /**
     * Adds the selector for {@code node} below {@code parent}, records that
     * {@code source} contains it, and recurses into the node's children.
     *
     * @param source  key identifying the page being processed
     * @param node    element as produced by XmlSlurper
     * @param parent  tree node to attach the element's selector to
     */
    protected void buildItemPathTree(source, node, ItemPath parent) {
        def selector = node.name()
        def attrs = node.attributes()

        // An empty id attribute would yield a dangling '#', so skip it.
        def id = attrs['id']?.toString()
        if (id) {
            selector += "#${anonymizeId(id)}"
        }

        def classes = attrs['class']?.toString()
        if (classes) {
            selector += classes.split(/\s+/).findAll { it }.collect { ".${it}" }.join("")
        }

        def current = parent.add(source, selector)
        node.children().each {
            buildItemPathTree(source, it, current)
        }
    }

    /**
     * Replaces the first match of {@link #idSkipPattern} in {@code id} with
     * capture group 1 (when the pattern has one and it took part in the
     * match) followed by {@link #genIdToken}.
     *
     * Built by hand instead of {@code replaceFirst('$1' + token)} so that a
     * custom pattern without a capture group does not throw, and '$' or '\'
     * inside the token are taken literally.
     *
     * @return the anonymized id, or {@code id} unchanged when there is no
     *         pattern or no match
     */
    protected String anonymizeId(String id) {
        if (!idSkipPattern) {
            return id
        }
        def matcher = id =~ idSkipPattern
        if (!matcher.find()) {
            return id
        }
        def kept = ''
        if (matcher.groupCount() >= 1 && matcher.group(1) != null) {
            kept = matcher.group(1)
        }
        return id.substring(0, matcher.start()) + kept + genIdToken + id.substring(matcher.end())
    }

    /**
     * Prints the tree, one selector per line, indented by depth. A node that
     * does not occur in all sources, and whose ratio differs from its
     * parent's, gets a trailing "in N%" comment.
     *
     * @param out          anything with a println method
     * @param indent       spaces per nesting level
     * @param itemPath     subtree to print; nothing is printed when null
     * @param level        current nesting depth
     * @param parentRatio  ratio of the parent node, used to suppress repeated notes
     */
    void output(out=System.out, indent=2, itemPath=topItemPath,
            level=0, parentRatio=0) {
        if (itemPath == null) {
            // No tree was built (no sources, or no body element found).
            return
        }
        def note = ""
        def ratio = itemPath.sources.size() / sources.size()
        if (ratio < 1.0 && ratio != parentRatio) {
            note = " /* in ${Math.round(ratio*100)}% */"
        }
        out.println((" " * indent * level) + itemPath.selector + note)
        itemPath.children.each {
            output(out, indent, it, level+1, ratio)
        }
    }

}
/**
 * A node in the selector tree: one selector, the sources it was seen in,
 * a link to its parent and its child nodes in insertion order.
 */
class ItemPath {

    /** Keys of the sources in which this node occurs. */
    Set<Object> sources = new HashSet<Object>()

    /** Selector text for this node. */
    String selector

    /** Parent node; stays null when none was given. */
    ItemPath parent

    /** Child nodes, in the order they were first added. */
    List<ItemPath> children = []

    /**
     * Creates a node and, when a parent is given, appends it to that
     * parent's children.
     */
    ItemPath(selector, parent=null) {
        this.selector = selector
        if (!parent) {
            return
        }
        this.parent = parent
        parent.children.add(this)
    }

    /**
     * Finds the child with the given selector, creating it if absent, and
     * records the source on it.
     *
     * @return the existing or newly created child
     */
    ItemPath add(Object source, String selector) {
        ItemPath target = null
        for (ItemPath candidate in children) {
            if (candidate.selector == selector) {
                target = candidate
                break
            }
        }
        if (!target) {
            // The constructor attaches the new node to this.children.
            target = new ItemPath(selector, this)
        }
        target.sources.add(source)
        return target
    }

}
// Command-line entry point:
//   BASE_DIR         directory that is scanned recursively
//   SOURCE_GLOB      file name glob, matched in any subdirectory
//   ID_SKIP_PATTERN  optional regexp for generated ids; defaults to
//                    CSSTreeComputer.DEFAULT_ID_SKIP_PATTERN
if (args.length < 2) {
    File script = new File(this.class.protectionDomain.codeSource.location.toURI())
    // A missing argument is an error: report on stderr with a non-zero status.
    System.err.println "Usage: ${script.name} BASE_DIR SOURCE_GLOB [ID_SKIP_PATTERN]"
    System.exit 1
}

def base = args[0]
def fpattern = args[1]
def idSkipPattern = args.length>2? args[2] : CSSTreeComputer.DEFAULT_ID_SKIP_PATTERN

// Materialize the Ant file scanner into a plain list of files.
def sources = new AntBuilder().fileScanner {
    fileset(dir:base, includes: "**/${fpattern}")
}.collect { it }

if (!sources) {
    // Without any input there is no tree to print; fail with a clear message.
    System.err.println "No files matching '${fpattern}' found under ${base}"
    System.exit 1
}

new CSSTreeComputer(sources, idSkipPattern).output()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment