Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Docx -> MediaWiki w/ Images

Are you having problems converting your docx to MediaWiki and preserving the images? Great, so did I. That's why this exists. Overall, the problem is a terrible, horrible one, but not one that's unsolvable (given that this exists). First, you need to be on Linux. If you want it on Windows, it would probably work with some modifications. I've only tested it on Ubuntu (14.10) so far.

First, install these programs:

  • imagemagick
  • libreoffice
  • mmv
  • nodejs v0.12.0 or later (we need execSync functionality)

Then install these node packages:

  • fs-extra
  • glob
  • minimist
  • nodemw
  • q
  • winston

One line:

npm install fs-extra glob minimist nodemw q winston

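The Converter.js file supports a couple of arguments, each passed as --name value (for example: node Converter.js --glob 'docs/*.docx' --tags Document,Wiki):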

  • glob (default: 'docs/*.docx') -- the docx files to process
  • tags (default: '') -- the MediaWiki categories for your documents. Example: Document,Wiki,Apple
  • ignore-article (default: false) -- whether or not to skip uploading the article (the actual text content). Most times you want this on.
  • ignore-images (default: false) -- whether or not to skip uploading the images in the article. For multiple subsequent runs, you probably want this off.
  • ignore-original (default: false) -- whether or not to skip uploading the original docx document to the wiki.
var fs = require('fs-extra');
var wiki = require('nodemw');
var q = require('q');
var glob = require('glob');
var execSync = require('child_process').execSync;
var logger = require('winston');
var argv = require('minimist')(process.argv.slice(2));
// MediaWiki API client used for every edit/upload below.
// NOTE: the server settings and bot credentials are placeholders; replace them
// with the values for your own wiki (and avoid committing a real password).
var client = new wiki({
  server: 'wiki.server.com',
  port: 8080,
  path: '/mediawiki',
  debug: true,
  username: 'uploadmonkey',
  password: 'lol',
  userAgent: 'UploadMonkey/1.1 (http://thissitedoesntexist.com/UploadMonkey/; my@email.com) Node/0.12'
});

// `watchMe` settles once the login attempt finishes; uploads are chained onto
// it so that nothing is sent to the wiki before the bot is authenticated.
var botLoggedIn = q.defer();
var watchMe = botLoggedIn.promise;
client.logIn(function(err, data) {
  if (err) {
    // A failed login used to be swallowed, leaving the promise pending forever
    // and the script exiting without uploading anything or saying why.
    logger.error('[login] Could not log in to the wiki: ' + (err.message || err));
    botLoggedIn.reject(err);
    return;
  }
  botLoggedIn.resolve();
});
// Command-line options (already parsed by minimist into `argv`).

// --glob: which docx files to process.
var cglob = argv.glob || 'docs/*.docx';

// --tags: comma-separated MediaWiki category names.
// minimist yields `true` for a bare `--tags` and a number for something like
// `--tags 2015`, neither of which has .split(), so only strings and numbers are
// accepted and coerced. Surrounding whitespace and empty entries ("a,,b") are
// dropped so no empty [[Category:]] link is ever generated.
var tags = (typeof argv.tags === 'string' || typeof argv.tags === 'number') ?
  String(argv.tags).split(',')
    .map(function(tag) { return tag.trim(); })
    .filter(function(tag) { return tag.length > 0; }) :
  [];

// --ignore-article / --ignore-images / --ignore-original: when truthy, the
// corresponding upload step is skipped.
var ignoreArticle = argv['ignore-article'];
var ignoreImages = argv['ignore-images'];
var ignoreOriginal = argv['ignore-original'];
/**
 * Returns the last component of a path, i.e. everything after the final
 * forward slash or backslash (whichever comes later).
 *
 * @param {string} path - file or directory path
 * @returns {string} the trailing path component; the whole input when it
 *   contains no separator
 */
var stripPath = function(path) {
  var lastSeparator = Math.max(path.lastIndexOf('/'), path.lastIndexOf('\\'));
  return path.slice(lastSeparator + 1);
};
/**
 * Removes the final extension from a file path, keeping the directory part.
 *
 * Only a dot inside the last path component counts, and a leading dot
 * (dotfile) is not treated as an extension. The earlier implementation cut
 * the string at the first dot anywhere in the path, so './docs/a.docx'
 * became '' and 'docs/my.report.docx' became 'docs/my'.
 *
 * @param {string} file - file path, e.g. 'docs/report.docx'
 * @returns {string} the path without its last extension, e.g. 'docs/report'
 */
var stripExtension = function(file) {
  var baseStart = Math.max(file.lastIndexOf('/'), file.lastIndexOf('\\')) + 1;
  var lastDot = file.lastIndexOf('.');
  return lastDot > baseStart ? file.slice(0, lastDot) : file;
};
/**
 * Builds the final wiki text for one converted document.
 *
 * Reads the unzipped docx (<name>/word/document.xml and its .rels file) and
 * the converted text (<name>.txt), then replaces each empty `[[Image:]]`
 * placeholder in the text with a `[[File:<name>_<image>.png]]` link. The n-th
 * placeholder is paired with the n-th `r:embed` reference found in
 * document.xml. Category links for every entry in `tags` are appended.
 *
 * A placeholder whose image cannot be resolved (more placeholders than
 * embeds, or an embed that does not point into media/) is left untouched and
 * a warning is logged, instead of emitting a "..._undefined.png" link.
 *
 * @param {string} fileName - path of the original .docx file
 * @returns {string} the wiki text with image links and category tags
 */
var readAndReplaceImagesInFile = function(fileName) {
  var dirName = stripExtension(fileName);
  var justFileName = stripPath(dirName);

  // read everything into memory
  var doc = fs.readFileSync(dirName + '/word/document.xml').toString();
  var rels = fs.readFileSync(dirName + '/word/_rels/document.xml.rels').toString();
  var markdown = fs.readFileSync(dirName + '.txt').toString();

  // relationship ids (rIdN) in the order they are embedded in the document
  var getRelationships = function(xml) {
    var regex = /r:embed="(rId[0-9]+)"/g;
    var ids = [];
    var match;
    while ((match = regex.exec(xml)) !== null) {
      ids.push(match[1]);
    }
    return ids;
  };

  // map of relationship id -> media file name (without extension)
  var getRelationshipTargets = function(xml) {
    var regex = /Id="([\w]+)"\sType="[\w|\d|\.|\/|:]+"\sTarget="media\/([\w]+)\.\w{1,4}"/g;
    var targets = {};
    var match;
    while ((match = regex.exec(xml)) !== null) {
      targets[match[1]] = match[2];
    }
    return targets;
  };

  var orderedRelationships = getRelationships(doc);
  var relToImg = getRelationshipTargets(rels);

  // naive positional matching: n-th placeholder <-> n-th embedded relationship
  var currentMatch = 0;
  // replace the old, deprecated [[Image:]] with the newer [[File:]] link
  var replFile = markdown.replace(/\[\[Image:]]/g, function(placeholder) {
    var imageName = relToImg[orderedRelationships[currentMatch++]];
    if (typeof imageName !== 'string') {
      logger.warn('[article] No image found for placeholder #' + currentMatch + ' in ' + fileName);
      return placeholder;
    }
    return '[[File:' + justFileName + '_' + imageName + '.png]]';
  });

  if (tags.length > 0) {
    replFile += '\n\n';
  }
  tags.forEach(function(tag) {
    replFile += '[[Category:' + tag + ']]\n';
  });
  return replFile;
};
// Upload helpers. Each one fires an asynchronous request through `client` and
// logs the outcome; failures are logged instead of being silently dropped.
var upload = {
  /**
   * Creates or updates the wiki article for a document.
   * The article title is the file name without directory and extension.
   *
   * @param {string} filename - path of the original .docx file
   * @param {string} text - wiki text of the article
   */
  file: function(filename, text) {
    var realName = stripPath(stripExtension(filename));
    logger.info('[article] Uploading ' + realName);
    client.edit(realName, text, 'summary', function(e, ret) {
      if (e) {
        logger.error('[article] Failed to upload ' + realName + ': ' + (e.message || e));
        return;
      }
      logger.info('[article] Successfully uploaded ' + realName);
    });
  },

  /**
   * Uploads every PNG found in the unzipped document's word/media directory.
   *
   * @param {string} fileName - path of the original .docx file
   */
  images: function(fileName) {
    var dirName = stripExtension(fileName);
    glob(dirName + '/word/media/*.png', {}, function(globErr, files) {
      if (globErr) {
        // without this guard `files` may be unusable and forEach would throw
        logger.error('[image] Could not list images for ' + fileName + ': ' + (globErr.message || globErr));
        return;
      }
      files.forEach(function(filename) {
        var justFileName = stripPath(filename);
        var buffer = fs.readFileSync(filename);
        logger.info('[image] Uploading ' + justFileName);
        client.upload(justFileName, buffer, 'summary', function(e, data) {
          if (e) {
            logger.error('[image] Failed to upload ' + justFileName + ': ' + (e.message || e));
            return;
          }
          logger.info('[image] Successfully uploaded ' + justFileName);
        });
      });
    });
  },

  /**
   * Uploads the original .docx file itself, named after its base name.
   *
   * @param {string} fileName - path of the original .docx file
   */
  original: function(fileName) {
    var buffer = fs.readFileSync(fileName);
    logger.info('[original] Uploading ' + fileName);
    client.upload(stripPath(fileName), buffer, 'summary', function(e, data) {
      if (e) {
        logger.error('[original] Failed to upload ' + fileName + ': ' + (e.message || e));
        return;
      }
      logger.info('[original] Successfully uploaded ' + fileName);
    });
  }
};
// Wraps a value in single quotes so it can be interpolated into a shell
// command line verbatim (file names containing spaces, $, `, " or ' included).
var shellQuote = function(value) {
  return "'" + String(value).replace(/'/g, "'\\''") + "'";
};

// Synchronous preparation steps, each taking the path of the .docx file.
// They shell out to unzip/zip, soffice (LibreOffice), mogrify (ImageMagick)
// and mmv, and work inside the directory "<file without extension>/".
var sys = {
  // Back up the docx, unzip it, delete every numbering.* part (it breaks the
  // conversion) and zip the remaining content back over the original file.
  precheck: function(file) {
    var dirName = stripExtension(file);
    logger.info('backing up '+file);
    fs.copySync(file, file+'.backup');
    logger.info('unzipping '+file);
    execSync('unzip -o ' + shellQuote(file) + ' -d ' + shellQuote(dirName));
    var numberingFiles = glob.sync(dirName+'/**/numbering.*');
    numberingFiles.forEach(function(numberingFile) {
      logger.info('removing '+numberingFile);
      fs.unlinkSync(numberingFile);
    });
    logger.info('removing '+file);
    fs.unlinkSync(file);
    logger.info('rezipping '+file);
    // the unzipped directory sits next to the docx, hence the "../" target
    execSync('cd ' + shellQuote(dirName) + ' && zip -r ' + shellQuote('../' + stripPath(file)) + ' .');
  },

  // Convert docx to txt:MediaWiki using LibreOffice. The .txt is written next
  // to the docx (the later read step looks for it there), not into a fixed
  // "docs" directory, so globs outside docs/ work too.
  convert: function(file) {
    var outDir = file.slice(0, file.length - stripPath(file).length) || '.';
    logger.info('converting '+file);
    execSync('soffice --headless --convert-to txt:MediaWiki ' + shellQuote(file) + ' --outdir ' + shellQuote(outDir));
  },

  // Convert all jp* images of the document to PNG.
  fixImages: function(file) {
    var dirName = stripExtension(file);
    var pattern = dirName+'/word/media/*.jp*';
    logger.info('making all jpgs into pngs for '+file);
    // Skip when nothing matches: mogrify is expected to fail on an unmatched
    // pattern, which would make execSync throw for image-less documents.
    if (glob.sync(pattern).length === 0) {
      return;
    }
    // the pattern is quoted on purpose; mogrify expands it itself
    execSync('mogrify -format png ' + shellQuote(pattern));
  },

  // Prefix all PNGs with the document name so uploads from different
  // documents do not collide.
  renameImages: function(file) {
    var dirName = stripExtension(file);
    var fileName = stripPath(dirName);
    var pattern = dirName+'/word/media/*.png';
    logger.info('renaming all pngs (collision prevention) for '+file);
    // Skip when there is nothing to rename: mmv is expected to report
    // "no match" as an error, which would make execSync throw.
    if (glob.sync(pattern).length === 0) {
      return;
    }
    execSync('mmv ' + shellQuote(pattern) + ' ' + shellQuote(dirName+'/word/media/'+fileName+'_#1.png'));
  },

  // Remove all jp* files from the document's media directory (they have been
  // converted to PNG by fixImages).
  removeJpegs: function(file) {
    var dirName = stripExtension(file);
    logger.info('removing all jp*gs for '+file);
    var jpgFiles = glob.sync(dirName+'/word/media/*.jp*');
    jpgFiles.forEach(function(fileName) {
      logger.info('removing '+fileName);
      fs.unlinkSync(fileName);
    });
  }
};
/**
 * Runs every preparation step from `sys` on one document, in the order the
 * steps depend on each other: precheck, convert, fixImages, renameImages,
 * removeJpegs.
 *
 * @param {string} file - path of the .docx file to prepare
 */
var doSystemWorkOn = function(file) {
  var steps = ['precheck', 'convert', 'fixImages', 'renameImages', 'removeJpegs'];
  steps.forEach(function(step) {
    sys[step](file);
  });
};
// Entry point: prepare every matching document synchronously, then queue its
// uploads to run once the bot login (`watchMe`) has succeeded.
var files = glob.sync(cglob);
if (files.length === 0) {
  // otherwise the script exits without any output and looks like a no-op
  logger.warn('no files matched "' + cglob + '"; nothing to convert');
}
files.forEach(function(file) {
  var fileText;
  try {
    doSystemWorkOn(file);
    fileText = readAndReplaceImagesInFile(file);
  } catch (err) {
    // An exception here used to abort the whole run before any of the
    // already-queued uploads could start; skip just this document instead.
    logger.error('skipping ' + file + ': ' + (err.message || err));
    return;
  }
  watchMe.then(function() {
    if(!ignoreArticle) {
      upload.file(file, fileText);
    }
    if(!ignoreImages) {
      upload.images(file);
    }
    if(!ignoreOriginal) {
      upload.original(file);
    }
  }).then(null, function(err) {
    // reached when the login promise is rejected or an upload helper throws
    logger.error('could not upload ' + file + ': ' + (err && err.message || err));
  });
});
@amartinr1977
Copy link

amartinr1977 commented Nov 4, 2015

Hello!! Can you write an example with a ODT archive??
¿node Converter.js glob archivo.odt?
A lot of thanks in advance!!

Loading

@mikeltxu7
Copy link

mikeltxu7 commented Apr 12, 2016

Hello!
Could you explain how to use the Converter.js?
I am running the Convert.js with 'node Converter.js' and there is no error, but how can I convert my .docx in to a mediawiki?
My MediaWiki is on UbuntuServer 14.04.4.
Thank you and sorry for my bad English.

Loading

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment