-
-
Save lfrancke/2037ddf88019f75c5b8e6744edc5a5d8 to your computer and use it in GitHub Desktop.
Convert Confluence HTML export into asciidoc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Converts a Confluence HTML export (directory 'html') into AsciiDoc files
// (directory 'asciidoc'), mirroring the source directory layout.
//
// Each *.html file is first cleaned with HtmlCleaner (class attributes are
// dropped, table cells are flattened to plain-text <td> elements), written
// to a temporary file, and then handed to the external `pandoc` executable.
// Non-HTML files are not copied.
@Grab('net.sourceforge.htmlcleaner:htmlcleaner:2.24')
import org.htmlcleaner.*

def src = new File('html').toPath()
def dst = new File('asciidoc').toPath()

def cleaner = new HtmlCleaner()
def props = cleaner.properties
props.translateSpecialEntities = false
def serializer = new SimpleHtmlSerializer(props)

// TagNodeVisitor.visit receives (parentNode, visitedNode); the clean-up is
// applied to the visited node so that childless elements are handled too.
def simplifyMarkup = { parentNode, htmlNode ->
    if (htmlNode instanceof TagNode) {
        htmlNode.removeAttribute('class')
        if (htmlNode.name in ['td', 'th']) {
            // Header cells become ordinary cells, and any nested markup is
            // replaced by the cell's plain text.
            htmlNode.name = 'td'
            String txt = htmlNode.text
            htmlNode.removeAllChildren()
            htmlNode.insertChild(0, new ContentNode(txt))
        }
    }
    true // always continue the traversal
} as TagNodeVisitor

// The output root must exist before any sub-directory or file is written.
dst.toFile().mkdirs()

src.toFile().eachFileRecurse { f ->
    def relative = src.relativize(f.toPath())
    def target = dst.resolve(relative)
    println(f)

    if (f.directory) {
        // mkdirs() also succeeds when intermediate directories are missing.
        target.toFile().mkdirs()
    } else if (f.name.endsWith('.html')) {
        println "Converting $relative"
        def tmpHtml = File.createTempFile('clean', '.html')
        try {
            def result = cleaner.clean(f)
            result.traverse(simplifyMarkup)
            serializer.writeToFile(result, tmpHtml.absolutePath, "utf-8")

            // List form: arguments are passed as-is, so paths containing
            // whitespace are not split into several arguments.
            def process = [
                'pandoc',
                '-f', 'html+smart+raw_html',
                '-t', 'asciidoc',
                '-s', tmpHtml.absolutePath,
                '-o', "${target}.adoc".toString()
            ].execute()

            // Drain stdout/stderr while waiting; waiting first and attaching
            // the consumers afterwards can block once the pipe buffer fills.
            process.waitForProcessOutput(System.out, System.err)
            if (process.exitValue() != 0) {
                System.err.println "pandoc failed for $relative (exit code ${process.exitValue()})"
            }
        } finally {
            tmpHtml.delete()
        }
    }
    // Any other file type is intentionally skipped.
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment