seiyria/Converter.js

## README.md

      
    Raw
  

              README.md
            
          
    Are you having problems converting your docx to MediaWiki and preserving the images? Great, so did I. That's why this exists. Overall, the problem is a terrible, horrible one, but not an unsolvable one (given that this exists). To start, you need to be on Linux. If you want it on Windows, it would probably work with some modifications. I've only tested it on Ubuntu (14.10) so far.
First, install these programs:

imagemagick
libreoffice
mmv
zip and unzip (Converter.js shells out to both to unpack and repack each docx)
nodejs v0.12.0 or later (we need execSync functionality)

Then install these node packages:

fs-extra
glob
minimist
nodemw
q
winston

One line:
npm install fs-extra glob minimist nodemw q winston

The Converter.js file supports a couple arguments:

glob (default: 'docs/*.docx') -- the docx files to process
tags (default: '') -- the MediaWiki categories for your documents. Example: Document,Wiki,Apple
ignore-article (default: false) -- whether or not to skip uploading the article (the actual text content). Most times you want this on.
ignore-images (default: false) -- whether or not to skip uploading the images in the article. For multiple subsequent runs, you probably want this off.
ignore-original (default: false) -- whether or not to skip uploading the original docx document to the wiki.


## Converter.js
var fs = require('fs-extra');
var wiki = require('nodemw');
var q = require('q');
var glob = require('glob');
var execSync = require('child_process').execSync;
var logger = require('winston');
var argv = require('minimist')(process.argv.slice(2));

// Connection settings for the target wiki.
// NOTE(review): server, credentials and user agent look like placeholders --
// confirm and replace them before running against a real wiki.
var client = new wiki({
  server: 'wiki.server.com',
  port: 8080,
  path: '/mediawiki',
  debug: true,
  username: 'uploadmonkey',
  password: 'lol',
  userAgent: 'UploadMonkey/1.1 (http://thissitedoesntexist.com/UploadMonkey/; my@email.com) Node/0.12'
});

// every upload is chained off this promise, so nothing is sent before login
var botLoggedIn = q.defer();
var watchMe = botLoggedIn.promise;

client.logIn(function(err) {
  if(err) {
    // report the failure and settle the promise instead of leaving it
    // pending forever, which made a failed login look like a silent hang
    logger.error('[login] Failed to log in: ' + err);
    botLoggedIn.reject(err);
    return;
  }
  botLoggedIn.resolve();
});

// command line options (see the README for their meaning)
var cglob = argv.glob || 'docs/*.docx';
// minimist yields `true` for a bare --tags and a number for e.g. --tags 5;
// neither has .split, so ignore the former and stringify the latter
var tags = (argv.tags && argv.tags !== true) ? String(argv.tags).split(',') : [];
var ignoreArticle = argv['ignore-article'];
var ignoreImages = argv['ignore-images'];
var ignoreOriginal = argv['ignore-original'];

// Return the last segment of a path; both '\' and '/' count as separators.
var stripPath = function(path) {
  var lastSeparator = Math.max(path.lastIndexOf('\\'), path.lastIndexOf('/'));
  return path.slice(lastSeparator + 1);
};

// Remove the final extension from a path, leaving any directories intact.
// Only a dot inside the last path segment counts, so './docs/a.docx' becomes
// './docs/a' and 'docs/a.b.docx' becomes 'docs/a.b'. A path without an
// extension (or whose last segment merely starts with a dot) is returned as is.
var stripExtension = function(file) {
  var lastSeparator = Math.max(file.lastIndexOf('/'), file.lastIndexOf('\\'));
  var lastDot = file.lastIndexOf('.');

  // a dot in a directory name, or leading the base name, is not an extension
  if(lastDot <= lastSeparator + 1) return file;

  return file.slice(0, lastDot);
};

// Build the wiki text for one converted docx.
//
// Reads <name>.txt (the converted article) plus word/document.xml and
// word/_rels/document.xml.rels from the unzipped <name>/ directory, rewrites
// each empty [[Image:]] placeholder to [[File:<name>_<image>.png]] and appends
// one [[Category:...]] line per configured tag.
//
// fileName: path of the original docx.
// Returns the finished wiki text as a string. Throws whatever fs.readFileSync
// throws when one of the three files is missing.
var readAndReplaceImagesInFile = function(fileName) {

  var dirName = stripExtension(fileName);
  var justFileName = stripPath(dirName);

  // read everything into memory as text
  var doc  = fs.readFileSync(dirName+'/word/document.xml', 'utf8');
  var rels = fs.readFileSync(dirName+'/word/_rels/document.xml.rels', 'utf8');
  var markdown = fs.readFileSync(dirName+'.txt', 'utf8');

  // rip out the ordering of relationships from the docx
  var getRelationships = function(file) {
    var regex = /r:embed="(rId[0-9]+)"/g;
    var match = null;
    var matches = [];

    while( (match=regex.exec(file)) !== null) {
      matches.push(match[1]);
    }

    return matches;
  };

  // rip out relationship / image pairs (relationship id -> image base name);
  // only matches entries whose attributes appear as Id, Type, Target
  var getRelationshipTargets = function(file) {
    var regex = /Id="([\w]+)"\sType="[\w|\d|\.|\/|:]+"\sTarget="media\/([\w]+)\.\w{1,4}"/g;
    var match = null;
    var matchHash = {};

    while( (match=regex.exec(file)) !== null) {
      matchHash[match[1]] = match[2];
    }
    return matchHash;
  };

  // get all possible relationships and their matching image
  var orderedRelationships = getRelationships(String(doc));
  var relToImg = getRelationshipTargets(String(rels));

  // naive matching: the n-th placeholder gets the n-th embedded image
  var currentMatch = 0;

  // replace each old, deprecated [[Image:]] with the newer [[File:]] link
  var replFile = String(markdown).replace(/\[\[Image:]]/g, function(placeholder) {
    var image = relToImg[orderedRelationships[currentMatch++]];

    // more placeholders than known images: keep the placeholder rather than
    // emitting a link to a non-existent "<name>_undefined.png"
    if(image === undefined) {
      logger.warn('[article] No image for placeholder #' + currentMatch + ' in ' + justFileName);
      return placeholder;
    }

    return "[[File:"+justFileName+"_"+image+".png]]";
  });

  if(tags.length > 0) {
    replFile += '\n\n';
  }

  tags.forEach(function(tag) {
    replFile += '[[Category:'+tag+']]\n';
  });

  return replFile;
};

// Wiki upload helpers. All of them are fire-and-forget: they start the request
// and report the outcome through the logger, failures included.
var upload = {

  // create/overwrite the article named after the docx (path and extension
  // stripped) with the given wiki text
  file: function(filename, text) {
    var realName = stripPath(stripExtension(filename));

    logger.info('[article] Uploading ' + realName);
    client.edit(realName, text, 'summary', function(e, ret) {
      if(e) {
        logger.error('[article] Failed to upload ' + realName + ': ' + e);
        return;
      }
      logger.info('[article] Successfully uploaded ' + realName);
    });
  },

  // upload every png found in the unzipped docx's word/media directory
  images: function(fileName) {
    var dirName = stripExtension(fileName);

    glob(dirName+'/word/media/*.png', {}, function(e, files) {

      // on a glob error there is no usable file list to iterate
      if(e) {
        logger.error('[image] Could not list images for ' + fileName + ': ' + e);
        return;
      }

      files.forEach(function(filename) {
        var justFileName = stripPath(filename);
        var buffer = fs.readFileSync(filename);

        logger.info('[image] Uploading ' + justFileName);
        client.upload(justFileName, buffer, 'summary', function(e, data) {
          if(e) {
            logger.error('[image] Failed to upload ' + justFileName + ': ' + e);
            return;
          }
          logger.info('[image] Successfully uploaded ' + justFileName);
        });
      });

    });
  },

  // upload the docx itself under its bare file name
  original: function(fileName) {
    var buffer = fs.readFileSync(fileName);

    logger.info('[original] Uploading ' + fileName);
    client.upload(stripPath(fileName), buffer, 'summary', function(e, data) {
      if(e) {
        logger.error('[original] Failed to upload ' + fileName + ': ' + e);
        return;
      }
      logger.info('[original] Successfully uploaded ' + fileName);
    });
  }
};

// Shell-level preparation steps for a single docx. Every step is synchronous
// and works in the directory stripExtension derives from the docx path.
// NOTE(review): paths are interpolated into shell command strings, so a file
// name containing a double quote would break (or inject into) these commands.
var sys = {

  // back up the docx, unzip it next to itself, delete every numbering.* part
  // (the original author found it breaks the conversion), then rebuild the
  // docx from the unzipped tree
  precheck: function(file) {
    var dirName = stripExtension(file);

    logger.info('backing up '+file);
    fs.copySync(file, file+'.backup');

    logger.info('unzipping '+file);
    execSync('unzip -o "' +file+ '" -d "' +dirName+ '"');

    // declared locally; this used to leak as an implicit global
    var numberingFiles = glob.sync(dirName+'/**/numbering.*');
    numberingFiles.forEach(function(numberingFile) {
      logger.info('removing '+numberingFile);
      fs.unlinkSync(numberingFile);
    });

    logger.info('removing '+file);
    fs.unlinkSync(file);

    logger.info('rezipping '+file);
    execSync('cd "'+dirName+'" && zip -r "../' +stripPath(file)+ '" .');
  },

  // convert docx to txt:MediaWiki using libreoffice
  convert: function(file) {
    // write the .txt beside the docx, which is where it is read back from;
    // a hard-coded 'docs' only worked for the default glob
    var outDir = file.slice(0, file.length - stripPath(file).length) || '.';

    logger.info('converting '+file);
    execSync('soffice --headless --convert-to txt:MediaWiki "'+file+ '" --outdir "'+outDir+'"');
  },

  // convert all jp* to pngs
  fixImages: function(file) {
    var dirName = stripExtension(file);
    var pattern = dirName+'/word/media/*.jp*';

    // nothing to convert: skip the call, because execSync throws when the
    // command exits non-zero and a docx without jpegs is not an error
    if(glob.sync(pattern).length === 0) return;

    logger.info('making all jpgs into pngs for '+file);
    execSync('mogrify -format png "'+pattern+'"');
  },

  // rename all pngs so when uploaded there is no collision
  renameImages: function(file) {
    var dirName = stripExtension(file);
    var fileName = stripPath(dirName);
    var pattern = dirName+'/word/media/*.png';

    // same reasoning as fixImages: no pngs means nothing to rename
    if(glob.sync(pattern).length === 0) return;

    logger.info('renaming all pngs (collision prevention) for '+file);
    execSync('mmv "'+pattern+'" "'+dirName+'/word/media/'+fileName+'_#1.png"');
  },

  // remove all jp* files from the unzipped media directory
  removeJpegs: function(file) {
    var dirName = stripExtension(file);
    logger.info('removing all jp*gs for '+file);

    // declared locally; this used to leak as an implicit global
    var jpgFiles = glob.sync(dirName+'/word/media/*.jp*');
    jpgFiles.forEach(function(fileName) {
      logger.info('removing '+fileName);
      fs.unlinkSync(fileName);
    });
  }

};

// prepare one docx for upload by running every sys step on it, in order
var doSystemWorkOn = function(file) {
    var steps = ['precheck', 'convert', 'fixImages', 'renameImages', 'removeJpegs'];

    steps.forEach(function(step) {
        sys[step](file);
    });
};

// entry point: convert every docx matched by the glob, then upload the results
var files = glob.sync(cglob);

files.forEach(function(file) {
  // the conversion is synchronous and runs right away; only uploads wait for login
  doSystemWorkOn(file);
  var fileText = readAndReplaceImagesInFile(file);

  watchMe.then(function() {

    if(!ignoreArticle) {
      upload.file(file, fileText);
    }

    if(!ignoreImages) {
      upload.images(file);
    }

    if(!ignoreOriginal) {
      upload.original(file);
    }

  }).then(null, function(err) {
    // without a rejection handler, anything thrown while starting the uploads
    // (e.g. a failed file read) would disappear into the promise chain
    logger.error('[upload] Uploads for ' + file + ' did not complete: ' + err);
  });

});
	var fs = require('fs-extra');
	var wiki = require('nodemw');
	var q = require('q');
	var glob = require('glob');
	var execSync = require('child_process').execSync;
	var logger = require('winston');
	var argv = require('minimist')(process.argv.slice(2));

	// Connection settings for the target wiki.
	// NOTE(review): server, credentials and user agent look like placeholders --
	// confirm and replace them before running against a real wiki.
	var client = new wiki({
		server: 'wiki.server.com',
		port: 8080,
		path: '/mediawiki',
		debug: true,
		username: 'uploadmonkey',
		password: 'lol',
		userAgent: 'UploadMonkey/1.1 (http://thissitedoesntexist.com/UploadMonkey/; my@email.com) Node/0.12'
	});

	// every upload is chained off this promise, so nothing is sent before login
	var botLoggedIn = q.defer();
	var watchMe = botLoggedIn.promise;

	client.logIn(function(err) {
		if(err) {
			// report the failure and settle the promise instead of leaving it
			// pending forever, which made a failed login look like a silent hang
			logger.error('[login] Failed to log in: ' + err);
			botLoggedIn.reject(err);
			return;
		}
		botLoggedIn.resolve();
	});

	// command line options; the operator here was a markdown-escaped `\|\|`,
	// which is not valid JavaScript
	var cglob = argv.glob || 'docs/*.docx';
	// minimist yields `true` for a bare --tags and a number for e.g. --tags 5;
	// neither has .split, so ignore the former and stringify the latter
	var tags = (argv.tags && argv.tags !== true) ? String(argv.tags).split(',') : [];
	var ignoreArticle = argv['ignore-article'];
	var ignoreImages = argv['ignore-images'];
	var ignoreOriginal = argv['ignore-original'];

	// Return the last segment of a path; both '\' and '/' count as separators.
	var stripPath = function(path) {
		var cut = Math.max(path.lastIndexOf('\\'), path.lastIndexOf('/')) + 1;
		return path.slice(cut);
	};

	// Remove the final extension from a path, leaving any directories intact.
	// Only a dot inside the last path segment counts, so './docs/a.docx' becomes
	// './docs/a' and 'docs/a.b.docx' becomes 'docs/a.b'. A path without an
	// extension (or whose last segment merely starts with a dot) is returned as is.
	var stripExtension = function(file) {
		var lastSeparator = Math.max(file.lastIndexOf('/'), file.lastIndexOf('\\'));
		var lastDot = file.lastIndexOf('.');

		// a dot in a directory name, or leading the base name, is not an extension
		if(lastDot <= lastSeparator + 1) return file;

		return file.slice(0, lastDot);
	};

	// Build the wiki text for one converted docx.
	//
	// Reads <name>.txt (the converted article) plus word/document.xml and
	// word/_rels/document.xml.rels from the unzipped <name>/ directory, rewrites
	// each empty [[Image:]] placeholder to [[File:<name>_<image>.png]] and
	// appends one [[Category:...]] line per configured tag.
	//
	// fileName: path of the original docx.
	// Returns the finished wiki text as a string. Throws whatever
	// fs.readFileSync throws when one of the three files is missing.
	var readAndReplaceImagesInFile = function(fileName) {

		var dirName = stripExtension(fileName);
		var justFileName = stripPath(dirName);

		// read everything into memory as text
		var doc = fs.readFileSync(dirName+'/word/document.xml', 'utf8');
		var rels = fs.readFileSync(dirName+'/word/_rels/document.xml.rels', 'utf8');
		var markdown = fs.readFileSync(dirName+'.txt', 'utf8');

		// rip out the ordering of relationships from the docx
		var getRelationships = function(file) {
			var regex = /r:embed="(rId[0-9]+)"/g;
			var match = null;
			var matches = [];

			while( (match=regex.exec(file)) !== null) {
				matches.push(match[1]);
			}

			return matches;
		};

		// rip out relationship / image pairs (relationship id -> image base name);
		// only matches entries whose attributes appear as Id, Type, Target
		var getRelationshipTargets = function(file) {
			var regex = /Id="([\w]+)"\sType="[\w|\d|\.|\/|:]+"\sTarget="media\/([\w]+)\.\w{1,4}"/g;
			var match = null;
			var matchHash = {};

			while( (match=regex.exec(file)) !== null) {
				matchHash[match[1]] = match[2];
			}
			return matchHash;
		};

		// get all possible relationships and their matching image
		var orderedRelationships = getRelationships(String(doc));
		var relToImg = getRelationshipTargets(String(rels));

		// naive matching: the n-th placeholder gets the n-th embedded image
		var currentMatch = 0;

		// replace each old, deprecated [[Image:]] with the newer [[File:]] link
		var replFile = String(markdown).replace(/\[\[Image:]]/g, function(placeholder) {
			var image = relToImg[orderedRelationships[currentMatch++]];

			// more placeholders than known images: keep the placeholder rather
			// than emitting a link to a non-existent "<name>_undefined.png"
			if(image === undefined) {
				logger.warn('[article] No image for placeholder #' + currentMatch + ' in ' + justFileName);
				return placeholder;
			}

			return "[[File:"+justFileName+"_"+image+".png]]";
		});

		if(tags.length > 0) {
			replFile += '\n\n';
		}

		tags.forEach(function(tag) {
			replFile += '[[Category:'+tag+']]\n';
		});

		return replFile;
	};

	// Wiki upload helpers. All of them are fire-and-forget: they start the
	// request and report the outcome through the logger, failures included.
	var upload = {

		// create/overwrite the article named after the docx (path and extension
		// stripped) with the given wiki text
		file: function(filename, text) {
			var realName = stripPath(stripExtension(filename));

			logger.info('[article] Uploading ' + realName);
			client.edit(realName, text, 'summary', function(e, ret) {
				if(e) {
					logger.error('[article] Failed to upload ' + realName + ': ' + e);
					return;
				}
				logger.info('[article] Successfully uploaded ' + realName);
			});
		},

		// upload every png found in the unzipped docx's word/media directory
		images: function(fileName) {
			var dirName = stripExtension(fileName);

			glob(dirName+'/word/media/*.png', {}, function(e, files) {

				// on a glob error there is no usable file list to iterate
				if(e) {
					logger.error('[image] Could not list images for ' + fileName + ': ' + e);
					return;
				}

				files.forEach(function(filename) {
					var justFileName = stripPath(filename);
					var buffer = fs.readFileSync(filename);

					logger.info('[image] Uploading ' + justFileName);
					client.upload(justFileName, buffer, 'summary', function(e, data) {
						if(e) {
							logger.error('[image] Failed to upload ' + justFileName + ': ' + e);
							return;
						}
						logger.info('[image] Successfully uploaded ' + justFileName);
					});
				});

			});
		},

		// upload the docx itself under its bare file name
		original: function(fileName) {
			var buffer = fs.readFileSync(fileName);

			logger.info('[original] Uploading ' + fileName);
			client.upload(stripPath(fileName), buffer, 'summary', function(e, data) {
				if(e) {
					logger.error('[original] Failed to upload ' + fileName + ': ' + e);
					return;
				}
				logger.info('[original] Successfully uploaded ' + fileName);
			});
		}
	};

	// Shell-level preparation steps for a single docx. Every step is synchronous
	// and works in the directory stripExtension derives from the docx path.
	// NOTE(review): paths are interpolated into shell command strings, so a file
	// name containing a double quote would break (or inject into) these commands.
	var sys = {

		// back up the docx, unzip it next to itself, delete every numbering.* part
		// (the original author found it breaks the conversion), then rebuild the
		// docx from the unzipped tree
		precheck: function(file) {
			var dirName = stripExtension(file);

			logger.info('backing up '+file);
			fs.copySync(file, file+'.backup');

			logger.info('unzipping '+file);
			execSync('unzip -o "' +file+ '" -d "' +dirName+ '"');

			// wildcards restored (the pattern had lost its asterisks) and the
			// variable declared locally instead of leaking as an implicit global
			var numberingFiles = glob.sync(dirName+'/**/numbering.*');
			numberingFiles.forEach(function(numberingFile) {
				logger.info('removing '+numberingFile);
				fs.unlinkSync(numberingFile);
			});

			logger.info('removing '+file);
			fs.unlinkSync(file);

			logger.info('rezipping '+file);
			execSync('cd "'+dirName+'" && zip -r "../' +stripPath(file)+ '" .');
		},

		// convert docx to txt:MediaWiki using libreoffice
		convert: function(file) {
			// write the .txt beside the docx, which is where it is read back from;
			// a hard-coded 'docs' only worked for the default glob
			var outDir = file.slice(0, file.length - stripPath(file).length) || '.';

			logger.info('converting '+file);
			execSync('soffice --headless --convert-to txt:MediaWiki "'+file+ '" --outdir "'+outDir+'"');
		},

		// convert all jp* to pngs
		fixImages: function(file) {
			var dirName = stripExtension(file);
			// wildcards restored: '.jp' matched nothing useful
			var pattern = dirName+'/word/media/*.jp*';

			// nothing to convert: skip the call, because execSync throws when the
			// command exits non-zero and a docx without jpegs is not an error
			if(glob.sync(pattern).length === 0) return;

			logger.info('making all jpgs into pngs for '+file);
			execSync('mogrify -format png "'+pattern+'"');
		},

		// rename all pngs so when uploaded there is no collision
		renameImages: function(file) {
			var dirName = stripExtension(file);
			var fileName = stripPath(dirName);
			var pattern = dirName+'/word/media/*.png';

			// same reasoning as fixImages: no pngs means nothing to rename
			if(glob.sync(pattern).length === 0) return;

			logger.info('renaming all pngs (collision prevention) for '+file);
			execSync('mmv "'+pattern+'" "'+dirName+'/word/media/'+fileName+'_#1.png"');
		},

		// remove all jp* files from the unzipped media directory
		removeJpegs: function(file) {
			var dirName = stripExtension(file);
			logger.info('removing all jp*gs for '+file);

			// wildcards restored and the variable declared locally
			var jpgFiles = glob.sync(dirName+'/word/media/*.jp*');
			jpgFiles.forEach(function(fileName) {
				logger.info('removing '+fileName);
				fs.unlinkSync(fileName);
			});
		}

	};

	// prepare one docx for upload by running every sys step on it, in order
	var doSystemWorkOn = function(file) {
		var steps = ['precheck', 'convert', 'fixImages', 'renameImages', 'removeJpegs'];

		for (var i = 0; i < steps.length; i++) {
			sys[steps[i]](file);
		}
	};

	// entry point: convert every docx matched by the glob, then upload the results
	var files = glob.sync(cglob);

	files.forEach(function(file) {
		// the conversion is synchronous and runs right away; only uploads wait for login
		doSystemWorkOn(file);
		var fileText = readAndReplaceImagesInFile(file);

		watchMe.then(function() {

			if(!ignoreArticle) {
				upload.file(file, fileText);
			}

			if(!ignoreImages) {
				upload.images(file);
			}

			if(!ignoreOriginal) {
				upload.original(file);
			}

		}).then(null, function(err) {
			// without a rejection handler, anything thrown while starting the
			// uploads (e.g. a failed file read) would disappear into the promise chain
			logger.error('[upload] Uploads for ' + file + ' did not complete: ' + err);
		});

	});