Skip to content

Instantly share code, notes, and snippets.

@saibotsivad
Last active August 29, 2015 14:06
Show Gist options
  • Save saibotsivad/de16661dd01c5f23d8cf to your computer and use it in GitHub Desktop.
Save saibotsivad/de16661dd01c5f23d8cf to your computer and use it in GitHub Desktop.
Automatically converting DOC to MD

First convert DOC to DOCX using LibreOffice:

/Applications/LibreOffice.app/Contents/MacOS/soffice --invisible --convert-to docx file.doc

Then convert to Markdown using pandoc:

pandoc file.docx -f docx -t markdown -o file.md

Convert to Markdown, extracting the image files:

pandoc file.docx --extract-media=/path/to/folder -f docx -t markdown -o file.md

Automatically convert a folder of DOC and DOCX files, extracting each document's images into a folder named after it, using convert.js:

var fs = require('fs')
var path = require('path')
var mkdirp = require('mkdirp')
var exec = require('child_process').exec
var BagPipe = require('bagpipe')
// Queue that .doc conversions are pushed through; constructed with a limit of 5
// (presumably the maximum number of LibreOffice jobs running at once - see the
// bagpipe documentation).
var bag = new BagPipe(5)

// The folder to convert is the first command-line argument,
// e.g. '/Users/saibotsivad/Development/thinking/kayser-commentary'
if (!process.argv[2]) {
  console.error('Usage: node convert.js INPUT_FOLDER')
  process.exit(1)
}

// Resolve to an absolute, normalized path. This matters for two reasons:
//  - a trailing slash ('folder/') would otherwise make the output folder
//    'folder/-markdown', i.e. inside the input tree;
//  - the child processes run with a different cwd, so relative input/output
//    paths would be resolved against the wrong directory.
var input_folder = path.resolve(process.argv[2])

// Sibling folder that receives the markdown,
// e.g. '/Users/saibotsivad/Development/thinking/kayser-commentary-markdown'
var output_folder = input_folder + '-markdown'

// Parent folder for the per-document scratch directories that hold the
// intermediate .docx files.
var temp_folder = '/tmp/markdown_conversion'

// Kick off the recursive walk at the root of the input folder.
processDirectory(input_folder, '.')
/**
 * List the entries of parentPath/childPath and hand each one to checkIfFile.
 *
 * @param {string} parentPath root folder the walk started from
 * @param {string} childPath  folder to list, relative to parentPath
 */
function processDirectory(parentPath, childPath) {
  var directory = path.join(parentPath, childPath)
  fs.readdir(directory, function(readError, entries) {
    // A directory that cannot be listed is reported and skipped.
    if (readError) {
      console.log(readError)
      return
    }
    entries.forEach(function(entry) {
      checkIfFile(parentPath, childPath, entry)
    })
  })
}
/**
 * Stat one directory entry and either recurse into it (directories) or pass
 * it on to beginConversion (everything else).
 *
 * @param {string} parentPath root folder the walk started from
 * @param {string} childPath  folder containing the entry, relative to parentPath
 * @param {string} file       name of the entry inside childPath
 */
function checkIfFile(parentPath, childPath, file) {
  fs.stat(path.join(parentPath, childPath, file), function(err, stats) {
    // stats is undefined when stat fails (broken symlink, permissions, entry
    // removed mid-walk); report and skip instead of crashing on stats.isDirectory().
    if (err) {
      console.log(err)
      return
    }
    if (stats.isDirectory()) {
      processDirectory(parentPath, path.join(childPath, file))
    } else {
      beginConversion(parentPath, childPath, file)
    }
  })
}
/**
 * Build a random 32-character lowercase hex identifier laid out like a
 * version-4 UUID with the dashes removed: position 12 is always '4' and
 * position 16 is one of 8, 9, a or b.
 *
 * Uses Math.random, so the result is only suitable for naming things such as
 * scratch folders, not for anything security sensitive.
 *
 * @returns {string} the generated identifier
 */
function uuidv4() {
  var template = 'xxxxxxxxxxxx4xxxyxxxxxxxxxxxxxxx'
  return template.replace(/[xy]/g, function(placeholder) {
    var nibble = Math.random() * 16 | 0
    if (placeholder != 'x') {
      // 'y' slot: force the two high bits to 10, leaving 8..b.
      nibble = nibble & 0x3 | 0x8
    }
    return nibble.toString(16)
  })
}
/**
 * Route one file to the right conversion path based on its extension.
 *
 * .doc files are pushed onto the bag queue for a LibreOffice conversion;
 * .docx files go straight to pandoc. Any other extension is ignored.
 *
 * @param {string} parentPath root folder the walk started from
 * @param {string} childPath  folder containing the file, relative to parentPath
 * @param {string} file       file name inside childPath
 */
function beginConversion(parentPath, childPath, file) {
  if (/\.doc$/i.test(file)) {
    bag.push(convertDocToDocxThenMarkdown, parentPath, childPath, file, function(err) {
      // The callback fires on failure as well (so the queue slot is released);
      // only announce success when no error was reported.
      if (!err) {
        console.log('Success converting to DOCX: ' + file)
      }
    })
  } else if (/\.docx$/i.test(file)) {
    convertDocxToMarkdown(parentPath, childPath, file, path.join(parentPath, childPath))
  }
}

/**
 * Strip the final extension from a file name.
 *
 * @param {string} file file name, e.g. 'report.final.doc'
 * @returns {string|undefined} everything before the last dot ('report.final'),
 *   or undefined (after logging) when the name has no extension or nothing
 *   precedes the dot
 */
function regexFilename(file) {
  var regex = /(.+)\.[^.]+$/.exec(file)
  if (!regex) {
    console.log('Error regexing filename', file)
  } else {
    return regex[1]
  }
}

/**
 * Convert a .doc file to .docx with LibreOffice, writing the result into a
 * fresh scratch folder under temp_folder, then hand the .docx to
 * convertDocxToMarkdown.
 *
 * NOTE(review): the file path is interpolated into a shell command inside
 * double quotes, so names containing '"', '$' or backticks will break it.
 *
 * @param {string} parentPath root folder the walk started from
 * @param {string} childPath  folder containing the file, relative to parentPath
 * @param {string} file       name of the .doc file
 * @param {function(Error=)} cb completion callback; called with the error when
 *   LibreOffice fails and with no arguments when it succeeds. It is called
 *   before the markdown step starts.
 */
function convertDocToDocxThenMarkdown(parentPath, childPath, file, cb) {
  console.log('Starting LibreOffice conversion...', path.join(childPath, file))
  var outputPath = path.join(temp_folder, uuidv4())
  exec('/Applications/LibreOffice.app/Contents/MacOS/soffice --invisible --convert-to docx "' + path.join(parentPath, childPath, file) + '" --outdir "' + outputPath + '"', {
    cwd: input_folder
  }, function(err, stdout, stderr) {
    if (err) {
      console.log('Error converting to DOCX', err, stderr)
      // Always signal completion: without this a failed conversion would hold
      // its queue slot forever and the queue would stall once every slot is lost.
      cb(err)
      return
    }
    console.log(stderr, stdout)
    cb()
    convertDocxToMarkdown(parentPath, childPath, regexFilename(file) + '.docx', outputPath)
  })
}
/**
 * Convert one .docx file to markdown with pandoc and move any extracted media
 * folder next to the generated .md file.
 *
 * pandoc runs with inputFilePath as its working directory, so the media is
 * first extracted to inputFilePath/NAME and afterwards renamed to
 * output_folder/childPath/NAME (NAME being the file name without extension).
 *
 * NOTE(review): file names are interpolated into a shell command inside double
 * quotes, so names containing '"', '$' or backticks will break it.
 *
 * @param {string} parentPath            root folder the walk started from (not used here)
 * @param {string} childPath             folder of the document, relative to the root
 * @param {string} filenameWithExtension name of the .docx file
 * @param {string} inputFilePath         folder that actually contains the .docx file
 */
function convertDocxToMarkdown(parentPath, childPath, filenameWithExtension, inputFilePath) {
  var filenameWithoutExtension = regexFilename(filenameWithExtension)
  if (!filenameWithoutExtension) {
    // regexFilename has already logged the problem; there is no usable base
    // name to build the output paths from.
    return
  }
  var childPathWithFile = path.join(childPath, filenameWithoutExtension)
  var outputDirectory = path.join(output_folder, childPath)
  var extractedMediaPath = path.join(inputFilePath, filenameWithoutExtension)
  var finalMediaPath = path.join(outputDirectory, filenameWithoutExtension)

  mkdirp(outputDirectory, function(mkdirErr) {
    if (mkdirErr) {
      // Without the output directory pandoc cannot write its result.
      console.log('Error creating output directory', outputDirectory, mkdirErr)
      return
    }
    exec('/usr/local/bin/pandoc "' + filenameWithExtension + '" --extract-media="./' + filenameWithoutExtension + '" -f docx -t markdown -o "' + finalMediaPath + '.md"', {
      cwd: inputFilePath
    }, function(err, stdout, stderr) {
      if (err) {
        console.log('Error converting to markdown', err, stderr)
        return
      }
      console.log('Success converting to markdown: ' + childPathWithFile)
      // pandoc only creates the media folder when the document has images.
      fs.exists(extractedMediaPath, function(exists) {
        if (!exists) {
          console.log('No image directory found:', childPathWithFile)
          return
        }
        console.log('Moving image directory:', childPathWithFile)
        fs.rename(extractedMediaPath, finalMediaPath, function(renameErr) {
          if (renameErr) {
            // Include the error so the cause is visible in the log.
            console.log('Error moving image directory:', childPathWithFile, renameErr)
          } else {
            console.log('Success moving image directory:', childPathWithFile)
          }
        })
      })
    })
  })
}
#!/bin/bash
# Launcher: run convert.js with a fixed node binary, forwarding the first
# argument (the folder to convert) unchanged.
node_bin=/usr/local/bin/node
convert_script=/Users/saibotsivad/Development/thinking/kayser-commentary/convert.js
"$node_bin" "$convert_script" "$1"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment