Skip to content

Instantly share code, notes, and snippets.

@cwe1ss
Created September 22, 2022 13:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cwe1ss/9427a8c2d0a298b3ef72623ffde69987 to your computer and use it in GitHub Desktop.
Save cwe1ss/9427a8c2d0a298b3ef72623ffde69987 to your computer and use it in GitHub Desktop.
Convert Confluence to Markdown (Azure DevOps)
const fs = require('fs/promises')
const path = require('path')
//const util = require('util')
const sanitize = require("sanitize-filename")
const HTMLParser = require('node-html-parser')
const TurndownService = require('turndown')
const TurndownPluginGfmService = require('@guyplusplus/turndown-plugin-gfm')
const TurndownPluginConfluenceToGfmService = require('turndown-plugin-confluence-to-gfm')
// Single shared HTML-to-Markdown converter; it is reused for every page below.
const turndownService = new TurndownService()
// Register the two plugins on the shared converter (GFM rules first, then the Confluence-specific rules).
TurndownPluginGfmService.gfm(turndownService)
TurndownPluginConfluenceToGfmService.confluenceGfm(turndownService)
///////////////////////////////////////
// Source and destination folders
// A directory which has been exported from Confluence using the HTML export.
// It is read for 'index.html', the individual page files and the 'attachments' folder.
const htmlDirectory = 'C:/temp/Confluence-export/ABC'
// The target directory
// Receives the generated '.md' files, the '.attachments' folder and the '.order' file.
const markdownDirectory = 'c:/temp/AzDevOps/docs/ABC'
; (async () => {
const newAttachmentsDirectoryName = '.attachments'
///////////////////////////////////////
// Read folder structure, page titles, etc from index.html
/**
 * Recursively walks one `<ul>` of the Confluence index page and appends a
 * descriptor for every `<li>` to `result` (parents before their children).
 *
 * Each descriptor has the shape
 * `{ htmlFileName, markdownFileName, folderName, parents }`:
 * - `htmlFileName`     href of the item's `<a>` ('' when the item has no link)
 * - `markdownFileName` encoded + sanitized page title followed by '.md'
 * - `folderName`       encoded + sanitized page title (folder for child pages)
 * - `parents`          folder names leading to the page, outermost first
 *
 * @param {object|null|undefined} ul Parsed `<ul>` node; nothing happens when falsy.
 * @param {object[]} result Array that receives the descriptors (mutated in place).
 * @param {string[]} parentFolders Folder names of the ancestors of this list.
 */
var addChildren = function(ul, result, parentFolders) {
  if (!ul) {
    return
  }
  for (const listItemNode of ul.childNodes) {
    // Skips everything that is not a list item (e.g. whitespace text nodes).
    if (listItemNode.tagName !== 'LI') continue

    const obj = {
      htmlFileName: '',
      markdownFileName: '',
      folderName: null,
      parents: parentFolders
    }
    result.push(obj)

    let addAsParent = true
    for (const listItemChildNode of listItemNode.childNodes) {
      if (listItemChildNode.tagName === 'A') {
        obj.htmlFileName = listItemChildNode.attributes['href']

        // NOTE(review): assumes innerText still contains HTML entities - confirm
        // against the node-html-parser version in use.
        const pageTitle = listItemChildNode.innerText
          // Decode the entities. '&amp;' goes last so that an escaped entity
          // such as '&amp;quot;' ends up as the literal text '&quot;'.
          .replaceAll('&#39;', '\'')
          .replaceAll('&quot;', '"')
          .replaceAll('&amp;', '&')
          // A slash would create a sub folder; it is turned into '|' which is encoded below.
          .replaceAll('/', '|')
          // https://learn.microsoft.com/en-us/azure/devops/project/wiki/wiki-file-structure?view=azure-devops#special-characters-in-wiki-page-titles
          .replaceAll(':', '%3A')
          .replaceAll('<', '%3C')
          .replaceAll('>', '%3E')
          .replaceAll('*', '%2A')
          .replaceAll('?', '%3F')
          .replaceAll('|', '%7C')
          .replaceAll('-', '%2D')
          .replaceAll('"', '%22')

        // Azure DevOps Wiki doesn't allow spaces
        const sanitizedPageTitle = sanitize(pageTitle).replaceAll(' ', '-')
        obj.markdownFileName = sanitizedPageTitle + '.md'
        obj.folderName = sanitizedPageTitle
      }
      if (listItemChildNode.tagName === 'IMG') {
        // The sole root item is the home page (with an img tag to recognize it) and we don't need a separate folder for it.
        addAsParent = false
      }
      if (listItemChildNode.tagName === 'UL') {
        // Copy the array so that sibling branches never share a parents list.
        const newParents = [...parentFolders]
        if (addAsParent) {
          newParents.push(obj.folderName)
        }
        addChildren(listItemChildNode, result, newParents)
      }
    }
  }
}
/**
 * Reads 'index.html' from the export directory and returns the descriptors of
 * all pages that have to be converted.
 *
 * The first element always describes 'index.html' itself (converted to
 * 'index.md', carrying the document title as `pageTitle`); it is followed by
 * the entries that `addChildren` collects from the first list inside
 * 'div.pageSection'.
 *
 * @returns {Promise<object[]>} Page descriptors, index page first.
 */
var getFiles = async function() {
  const indexHtml = await fs.readFile(path.join(htmlDirectory, 'index.html'), { encoding: 'utf-8' })
  const indexDocument = HTMLParser.parse(indexHtml)

  // Collect the page tree first ...
  const pages = []
  addChildren(indexDocument.querySelector('div.pageSection ul'), pages, [])

  // ... and put the entry for the index page itself in front of it.
  const indexEntry = {
    htmlFileName: 'index.html',
    markdownFileName: 'index.md',
    pageTitle: indexDocument.querySelector('head title').innerText,
    folderName: null,
    parents: []
  }
  return [indexEntry, ...pages]
}
var files = await getFiles()
// console.log(util.inspect(files, { depth: null, colors: true }))
// return
///////////////////////////////////////
// Copy "attachments"-directory to ".attachments" (Azure DevOps requires this name)
const existingAttachmentsDirectory = path.join(htmlDirectory, 'attachments')
const newAttachmentsDirectory = path.join(markdownDirectory, newAttachmentsDirectoryName)
/**
 * Copies `src` to `dest`. A directory is copied with all of its content
 * (missing target directories are created); anything else is copied as a
 * single file. Despite the name the function is asynchronous and has to be
 * awaited.
 *
 * @param {string} src Existing file or directory.
 * @param {string} dest Target path for the copy.
 * @returns {Promise<void>}
 */
var copyRecursiveSync = async function(src, dest) {
  // https://stackoverflow.com/a/22185855
  const srcStats = await fs.stat(src)
  if (!srcStats.isDirectory()) {
    await fs.copyFile(src, dest)
    return
  }

  await fs.mkdir(dest, { recursive: true })
  const entryNames = await fs.readdir(src)
  // Entries are copied one after another, in the order readdir returned them.
  for (const entryName of entryNames) {
    const childSrc = path.join(src, entryName)
    const childDest = path.join(dest, entryName)
    await copyRecursiveSync(childSrc, childDest)
  }
};
await copyRecursiveSync(existingAttachmentsDirectory, newAttachmentsDirectory)
///////////////////////////////////////
// Convert pages
// Every page collected from index.html is loaded, cleaned up, converted to
// Markdown and written to its target folder. The steps depend on their order:
// link/attachment paths are rewritten on the raw HTML text before parsing, and
// the '{{tmp:blockquote}}' markers inserted into the DOM are resolved on the
// Markdown text afterwards.

// Link targets, longest source file name first: a short name that is contained
// in a longer one (e.g. "123.html" in "Page_4123.html") would otherwise destroy
// the longer link. Entries without a file name are dropped because
// replaceAll('', x) would insert x between all characters of the page.
const linkFilesLongestFirst = files
  .filter(linkFile => linkFile.htmlFileName)
  .sort((a, b) => b.htmlFileName.length - a.htmlFileName.length)

for (const file of files) {
  // An index entry without a link has no source page that could be converted.
  if (!file.htmlFileName) continue

  const htmlFileFullPath = path.join(htmlDirectory, file.htmlFileName)
  const markdownFileDirectory = path.join(markdownDirectory, ...file.parents)
  const markdownFileFullPath = path.join(markdownFileDirectory, file.markdownFileName)
  console.log(htmlFileFullPath + " --> " + markdownFileFullPath)

  // Relative path from the folder of this page back to the root of the wiki.
  const pathToRoot = '../'.repeat(file.parents.length)

  ///////////////////////////////////////
  // Load file content
  let htmlContent = await fs.readFile(htmlFileFullPath, { encoding: 'utf-8'} )

  ///////////////////////////////////////
  // Replace links
  for (const linkFile of linkFilesLongestFirst) {
    let target = pathToRoot
    target += linkFile.parents.join('/') + (linkFile.parents.length > 0 ? '/' : '')
    target += linkFile.markdownFileName
    //console.log(' - ' + linkFile.htmlFileName + ': ' + target)
    htmlContent = htmlContent.replaceAll(linkFile.htmlFileName, target)
  }

  ///////////////////////////////////////
  // Replace attachment links (since we place files in subfolders)
  const attachmentsUrl = pathToRoot + newAttachmentsDirectoryName + '/'
  htmlContent = htmlContent.replaceAll('"attachments/', '"' + attachmentsUrl)

  const html = HTMLParser.parse(htmlContent)

  ///////////////////////////////////////
  // Remove breadcrumbs section (as we have the navigation bar anyway)
  html.querySelector('#breadcrumbs')?.set_content('')

  ///////////////////////////////////////
  // Remove footer (as that only contains info about when the HTML document was generated and a link to Atlassian)
  html.querySelector('#footer')?.set_content('')

  ///////////////////////////////////////
  // Remove page title (as that would output the page title twice)
  html.querySelector('title')?.set_content('')

  ///////////////////////////////////////
  // Remove header from content (as Azure DevOps displays a title based on the file name)
  // Guarded like the other sections: a page without this element must not abort the run.
  html.querySelector('#main-header')?.set_content('')

  ///////////////////////////////////////
  // Remove query strings from attachment includes (as this doesn't display in Azure DevOps)
  for (const htmlImage of html.querySelectorAll('img')) {
    const src = htmlImage.attributes['src']
    // Images without a src attribute or outside of the attachments folder stay untouched.
    if (!src || src.indexOf(newAttachmentsDirectoryName) < 0) continue

    const indexOfQuery = src.indexOf('?')
    if (indexOfQuery >= 0) {
      htmlImage.setAttribute('src', src.substring(0, indexOfQuery))
    }
  }

  ///////////////////////////////////////
  // Remove Page metadata (author, creation time) (since we don't need them in DevOps)
  html.querySelector('#content div.page-metadata')?.set_content('')

  ///////////////////////////////////////
  // Show "information macro"-widgets as blockquotes
  // Only a marker is inserted here; it is turned into '> ' once the Markdown text exists.
  const macroBodies = html.querySelectorAll('div.confluence-information-macro > p.title, div.confluence-information-macro-body, div.confluence-information-macro-body p, div.confluence-information-macro-body div, div.confluence-information-macro-body li, div.confluence-information-macro-body pre')
  for (const macroBody of macroBodies) {
    if (macroBody.innerText?.trim() !== '') {
      macroBody.insertAdjacentHTML('afterbegin', '{{tmp:blockquote}}')
    }
  }

  ///////////////////////////////////////
  // Replace known custom scripts
  // Links of the attachments section; an empty list when the page has no such section.
  const attachments = html.querySelector('#attachments')?.parentNode?.nextElementSibling?.querySelectorAll('a') ?? []

  // Returns the id (file name without extension of the href) of the last
  // attachment link whose text equals attachmentName, or null if there is none.
  const findLatestAttachmentId = (attachmentName) => {
    const matchingAttachments = attachments.filter(x => x.innerText == attachmentName)
    if (matchingAttachments.length === 0) return null
    const href = matchingAttachments[matchingAttachments.length - 1].attributes['href']
    return href.substring(href.lastIndexOf('/') + 1, href.lastIndexOf('.'))
  }

  const customScripts = html.querySelectorAll('script.ap-iframe-body-script')
  for (const customScript of customScripts) {
    const scriptContent = customScript.innerHTML

    if (scriptContent.indexOf('com.gliffy.integration.confluence') >= 0) {
      ///////////////////////////////////////
      // Gliffy Diagrams
      const gliffyExtractPageIdRegex = /(?:.*container=|^)(\d+)/
      const gliffyExtractAttachmentIdRegex = /(?:.*imageAttachmentId=att|^)(\d+)/
      const gliffyExtractNameRegex = /(?:.*[\|"]name=)([^|\\]*)/

      const pageIdMatch = scriptContent.match(gliffyExtractPageIdRegex)
      if (!pageIdMatch) continue

      // The attachment id is not easy to resolve. If we can't get it directly, we need to find it by name.
      let attachmentId = ''
      const attachmentIdMatch = scriptContent.match(gliffyExtractAttachmentIdRegex)
      if (attachmentIdMatch) {
        attachmentId = attachmentIdMatch[1]
      } else {
        // Find the image by attachment name and use the most recent one
        const nameMatch = scriptContent.match(gliffyExtractNameRegex)
        if (!nameMatch) continue
        attachmentId = findLatestAttachmentId(nameMatch[1] + '.png') ?? ''
      }

      const attachmentUrl = attachmentsUrl + pageIdMatch[1] + '/' + attachmentId + '.png'
      customScript.set_content('')
      customScript.insertAdjacentHTML('beforebegin', '<img src="' + attachmentUrl + '" />')
    } else if (scriptContent.indexOf('com.balsamiq.mockups.confluence') >= 0) {
      ///////////////////////////////////////
      // Balsamiq Mockups
      const balsamiqExtractPageIdRegex = /(?:.*pageid=|^)(\d+)/
      const balsamiqExtractDownloadLinkRegex = /(?:.*[\|"]downloadLink=)([^|\\]*)/
      const balsamiqExtractNameRegex = /(?:.*&name=)([^&\"]*)/

      const pageIdMatch = scriptContent.match(balsamiqExtractPageIdRegex)
      if (!pageIdMatch) continue

      // Find the image by attachment name and use the most recent one.
      // The name comes from the download link if there is one, otherwise from the "name" parameter.
      let attachmentName = null
      const downloadLinkMatch = scriptContent.match(balsamiqExtractDownloadLinkRegex)
      if (downloadLinkMatch) {
        const downloadLink = downloadLinkMatch[1]
        attachmentName = downloadLink.substring(downloadLink.lastIndexOf('/') + 1)
      } else {
        const nameMatch = scriptContent.match(balsamiqExtractNameRegex)
        if (nameMatch) {
          const name = decodeURIComponent(nameMatch[1].replaceAll('+', ' '))
          console.log('- ' + name)
          attachmentName = 'mockup_' + name + '.png'
        }
      }

      const attachmentId = attachmentName === null ? null : findLatestAttachmentId(attachmentName)
      // The script is only replaced when a matching image was found.
      if (attachmentId !== null) {
        const attachmentUrl = attachmentsUrl + pageIdMatch[1] + '/' + attachmentId + '.png'
        customScript.set_content('')
        customScript.insertAdjacentHTML('beforebegin', '<img src="' + attachmentUrl + '" />')
      }
    }
  }

  ///////////////////////////////////////
  // Fix double strong text (<strong>Some <strong>text</strong> example</strong>)
  for (const strong of html.querySelectorAll('strong strong')) {
    // Move the content in front of the inner element and empty the element itself.
    strong.insertAdjacentHTML('beforebegin', strong.innerHTML)
    strong.set_content('')
  }

  ///////////////////////////////////////
  // Convert Attachments to list
  const attachmentsContainer = html.querySelector('#attachments')?.parentNode?.nextElementSibling
  if (attachmentsContainer) {
    attachmentsContainer.set_content(
      '<ul>' +
      attachmentsContainer.innerHTML.replaceAll('<img', '<li><img').replaceAll('<br>', '</li>') +
      '</ul>')
  }

  ///////////////////////////////////////
  // Convert HTML to Markdown
  let markdown = turndownService.turndown(html.toString())

  ///////////////////////////////////////
  // Trim lines at the end (we can't trim at the start since Markdown relies on indentation at the beginning)
  // Fix Quotations
  let lines = markdown.split('\n')
  for (let i = 0; i < lines.length; i++) {
    let line = lines[i]
    if (line.indexOf('{{tmp:blockquote}}') >= 0) {
      line = '> ' + line.replaceAll('{{tmp:blockquote}}', '')
      // A marker on an otherwise empty line must not leave a lonely '>' behind.
      if (line.trim() === '>') {
        line = ''
      }
    }
    lines[i] = line.trimEnd()
  }
  markdown = lines.join('\n')

  ///////////////////////////////////////
  // Remove excess empty lines (more than one empty line between text)
  let oldMarkdown = ''
  do {
    oldMarkdown = markdown
    markdown = markdown.replaceAll('\n\n\n', '\n\n')
  } while (oldMarkdown !== markdown)

  ///////////////////////////////////////
  // Remove empty lines between quotes (as that shows up as separate blockquote-items)
  lines = markdown.split('\n')
  for (let i = 0; i < lines.length; i++) {
    const previousLine = i > 0 ? lines[i - 1] : null
    const currentLine = lines[i]
    const nextLine = i < lines.length - 1 ? lines[i + 1] : null
    if (previousLine?.startsWith('> ') && nextLine?.startsWith('> ') && currentLine.trim() === '') {
      lines[i] = '>'
    }
  }
  markdown = lines.join('\n').trim()

  ///////////////////////////////////////
  // Write markdown to disk
  await fs.mkdir(markdownFileDirectory, { recursive: true })
  await fs.writeFile(markdownFileFullPath, markdown)
}
///////////////////////////////////////
// Update .order file (we currently only support the root level)
// The '.order' file lists the root level pages (entries without parent
// folders), one page name per line, in the order in which they were collected.
const orderFilePath = path.join(markdownDirectory, '.order')
const orderFileContent = files
  .filter(file => file.parents.length === 0)
  .map(file => {
    const fileName = file.markdownFileName
    // Only strip the trailing extension. replace('.md', '') would remove the
    // first '.md' of a name like 'Intro.md-guide.md' and keep the extension.
    const pageName = fileName.endsWith('.md') ? fileName.slice(0, -'.md'.length) : fileName
    return pageName + '\n'
  })
  .join('')
await fs.writeFile(orderFilePath, orderFileContent)
console.log('finished')
})()
{
"name": "confluence-export",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1",
"start": "node index.js"
},
"author": "",
"license": "ISC",
"dependencies": {
"node-html-parser": "^6.1.0",
"sanitize-filename": "^1.6.3",
"turndown-plugin-confluence-to-gfm": "^0.5.0"
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment