hdon/word2vec-extract-vocab.js

## word2vec-extract-vocab.js
/*
 * Some examples of word2vec binary format:
 * https://github.com/dav/word2vec/blob/master/src/word2vec.c#L560-L566
 * https://github.com/jasonphillips/word2vec-stream/blob/master/index.js
 *
 * stream-promise-inverter:
 * https://github.com/hdon/stream-promise-inverter
 *
 * compare to JUST word2vec format decode without stream-promise-inverter:
 * https://github.com/jasonphillips/word2vec-stream/blob/8c76cf2df6a4dca14be9eb86ea784ccacfa7952d/index.js
 *
 * TODO are we sure utf8 is used by word2vec binary format?
 */
const fs = require('fs');
const zlib = require('zlib');
const stream = require('stream');
const util = require('util');
const Inverter = require('./stream-promise-inverter')
const _ = require('lodash');

/*
 * Usage: node word2vec-extract-vocab.js <in.bin.gz> <out> <vocab.txt>
 *
 * Streams a gzipped word2vec binary file and writes the raw vector bytes of
 * every word listed in the vocabulary file (one word per line) to the output
 * file, in the order the words are encountered in the input.
 */
if (process.argv.length !== 5) {
  console.error('please specify paths for: your gzipped word2vec binary, your desired output file, and a file containing your vocabulary of interest');
  process.exit(1);
}

const inPath = process.argv[2];
const outPath = process.argv[3];
const vocabPath = process.argv[4];

/*
 * Words of interest. A Set is used instead of a plain object so that lookups
 * cannot be satisfied by inherited Object.prototype keys ("constructor",
 * "toString", ...). Blank lines (e.g. from a trailing newline) are dropped so
 * they do not inflate the count that drives the early exit below.
 */
const ourVocab = new Set(
  fs.readFileSync(vocabPath, 'utf8')
    .split(/\r?\n/)
    .filter(line => line.length > 0)
);
const ourVocabCount = ourVocab.size;

const BYTES_PER_DIMENSION = 4;
const SP = ' '.charCodeAt(0);
const LF = '\n'.charCodeAt(0);

/*
 * NOTE: the output file is opened only by createWriteStream() at the end of
 * the pipeline; opening it a second time with fs.openSync() would leak a file
 * descriptor that nothing ever closes.
 */
const inStream = fs
  .createReadStream(inPath)
  .pipe(zlib.createGunzip())
  .pipe(new Inverter(async inverter => {
    /* Header: "<vocabulary size> <dimensionality>\n" in ASCII.
     * Number() tolerates the trailing delimiter kept by readUntil(). */
    const vocabSize = Number(await inverter.readUntil(SP));
    const numDimensions = Number(await inverter.readUntil(LF));
    const bytesPerVector = BYTES_PER_DIMENSION * numDimensions;
    console.log('bytes per vector =', bytesPerVector);
    /* Read all vectors until we exhaust either our vocab or the file's vocab */
    let ourVocabCursor = 0;
    for (let i = 0; i < vocabSize && ourVocabCursor < ourVocabCount; i++) {
      /*
       * Read in word: drop the trailing space delimiter, then any leading
       * newline. word2vec.c emits "\n" after each binary vector, so without
       * this every word after the first would start with "\n" and never match.
       */
      const word = (await inverter.readUntil(SP))
        .toString('utf8')
        .slice(0, -1)
        .replace(/^\n+/, '');
      /* Read in vector (always consumed, to stay aligned with the stream) */
      const vector = await inverter.read(bytesPerVector);
      /* If the word is one of our words, pass its raw vector downstream */
      if (ourVocab.has(word)) {
        console.log('found', word);
        inverter.write(vector);
        ourVocabCursor++;
      }
    }
    console.log('wrote', ourVocabCursor, 'records to', outPath);
  }))
  .pipe(fs.createWriteStream(outPath));
	/*
	* Some examples of word2vec binary format:
	* https://github.com/dav/word2vec/blob/master/src/word2vec.c#L560-L566
	* https://github.com/jasonphillips/word2vec-stream/blob/master/index.js
	*
	* stream-promise-inverter:
	* https://github.com/hdon/stream-promise-inverter
	*
	* compare to JUST word2vec format decode without stream-promise-inverter:
	* https://github.com/jasonphillips/word2vec-stream/blob/8c76cf2df6a4dca14be9eb86ea784ccacfa7952d/index.js
	*
	* TODO are we sure utf8 is used by word2vec binary format?
	*/
	const fs = require('fs');
	const zlib = require('zlib');
	const stream = require('stream');
	const util = require('util');
	const Inverter = require('./stream-promise-inverter')
	const _ = require('lodash');

	/*
	 * Usage: node word2vec-extract-vocab.js <in.bin.gz> <out> <vocab.txt>
	 *
	 * Streams a gzipped word2vec binary file and writes the raw vector bytes of
	 * every word listed in the vocabulary file (one word per line) to the output
	 * file, in the order the words are encountered in the input.
	 */
	if (process.argv.length !== 5) {
		console.error('please specify paths for: your gzipped word2vec binary, your desired output file, and a file containing your vocabulary of interest');
		process.exit(1);
	}

	const inPath = process.argv[2];
	const outPath = process.argv[3];
	const vocabPath = process.argv[4];

	/*
	 * Words of interest. A Set is used instead of a plain object so that lookups
	 * cannot be satisfied by inherited Object.prototype keys ("constructor",
	 * "toString", ...). Blank lines (e.g. from a trailing newline) are dropped so
	 * they do not inflate the count that drives the early exit below.
	 */
	const ourVocab = new Set(
		fs.readFileSync(vocabPath, 'utf8')
			.split(/\r?\n/)
			.filter(line => line.length > 0)
	);
	const ourVocabCount = ourVocab.size;

	const BYTES_PER_DIMENSION = 4;
	const SP = ' '.charCodeAt(0);
	const LF = '\n'.charCodeAt(0);

	/*
	 * NOTE: the output file is opened only by createWriteStream() at the end of
	 * the pipeline; opening it a second time with fs.openSync() would leak a file
	 * descriptor that nothing ever closes.
	 */
	const inStream = fs
		.createReadStream(inPath)
		.pipe(zlib.createGunzip())
		.pipe(new Inverter(async inverter => {
			/* Header: "<vocabulary size> <dimensionality>\n" in ASCII.
			 * Number() tolerates the trailing delimiter kept by readUntil(). */
			const vocabSize = Number(await inverter.readUntil(SP));
			const numDimensions = Number(await inverter.readUntil(LF));
			const bytesPerVector = BYTES_PER_DIMENSION * numDimensions;
			console.log('bytes per vector =', bytesPerVector);
			/* Read all vectors until we exhaust either our vocab or the file's vocab */
			let ourVocabCursor = 0;
			for (let i = 0; i < vocabSize && ourVocabCursor < ourVocabCount; i++) {
				/*
				 * Read in word: drop the trailing space delimiter, then any leading
				 * newline. word2vec.c emits "\n" after each binary vector, so without
				 * this every word after the first would start with "\n" and never match.
				 */
				const word = (await inverter.readUntil(SP))
					.toString('utf8')
					.slice(0, -1)
					.replace(/^\n+/, '');
				/* Read in vector (always consumed, to stay aligned with the stream) */
				const vector = await inverter.read(bytesPerVector);
				/* If the word is one of our words, pass its raw vector downstream */
				if (ourVocab.has(word)) {
					console.log('found', word);
					inverter.write(vector);
					ourVocabCursor++;
				}
			}
			console.log('wrote', ourVocabCursor, 'records to', outPath);
		}))
		.pipe(fs.createWriteStream(outPath));