Skip to content

Instantly share code, notes, and snippets.

@bdabelow
Forked from melix/convert.groovy
Last active September 21, 2023 06:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bdabelow/67db92c7bd33687353fd8a07ede9ff5c to your computer and use it in GitHub Desktop.
Save bdabelow/67db92c7bd33687353fd8a07ede9ff5c to your computer and use it in GitHub Desktop.
Convert Confluence HTML export into asciidoc
@Grab('net.sourceforge.htmlcleaner:htmlcleaner:2.4')
import org.htmlcleaner.*
// Converts a Confluence HTML export into AsciiDoc.
//
// Walks every file under ./html, mirrors the directory layout under ./asciidoc,
// cleans each *.html file with HtmlCleaner and hands the cleaned copy to pandoc,
// which writes "<relative path>.adoc" below the destination root.
// The script exits with pandoc's exit code on the first failed conversion.
def src = new File('html').toPath()
def dst = new File('asciidoc').toPath()

// The destination root has to exist before pandoc (or mkdirs below) writes into it.
dst.toFile().mkdirs()

def cleaner = new HtmlCleaner()
def props = cleaner.properties
props.translateSpecialEntities = false
def serializer = new SimpleHtmlSerializer(props)

src.toFile().eachFileRecurse { f ->
    def relative = src.relativize(f.toPath())
    def target = dst.resolve(relative)
    if (f.isDirectory()) {
        // mkdirs (not mkdir) so a missing parent never makes this fail silently.
        target.toFile().mkdirs()
    } else if (f.name.endsWith('.html')) {
        def tmpHtml = File.createTempFile('clean', '.html')
        def cmdline = [
            'pandoc', '-f', 'html+raw_html+smart', '-t', 'asciidoc', '-s',
            tmpHtml.path, '-o', "${target}.adoc".toString()
        ]
        def stderr = new StringBuilder()
        def exitCode
        try {
            println "Converting $relative"
            def result = cleaner.clean(f)
            // NOTE(review): HtmlCleaner's TagNodeVisitor appears to pass the *parent*
            // node as the first argument, in which case this acts on the parent of
            // each visited node, not the node itself -- confirm against the API docs.
            result.traverse({ tagNode, htmlNode ->
                tagNode?.attributes?.remove 'class'
                if ('td' == tagNode?.name || 'th' == tagNode?.name) {
                    // Turn header cells into plain cells and flatten the cell
                    // content down to its text.
                    tagNode.name = 'td'
                    String txt = tagNode.text
                    tagNode.removeAllChildren()
                    tagNode.insertChild(0, new ContentNode(txt))
                }
                true // keep traversing
            } as TagNodeVisitor)
            serializer.writeToFile(result, tmpHtml.absolutePath, "utf-8")

            println "Target: ${target}.adoc"
            // List form passes each argument verbatim, so paths containing spaces
            // are not split the way String.execute() would split them.
            def proc = cmdline.execute()
            // Drain both streams while waiting; a bare waitFor() can block forever
            // once pandoc fills the pipe buffer.
            proc.waitForProcessOutput(new StringBuilder(), stderr)
            exitCode = proc.exitValue()
        } finally {
            // Remove the temp file even when cleaning or pandoc throws.
            tmpHtml.delete()
        }
        if (exitCode != 0) {
            println "\nCommand returned error: ${cmdline.join(' ')}\n"
            println stderr
            System.exit(exitCode)
        }
    }
    // Files that are neither directories nor *.html (images, attachments, ...)
    // are skipped; they are not copied to the destination.
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment