Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@hdon
Last active February 11, 2018 03:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hdon/ec41b461bc89492069edc62528b1cb72 to your computer and use it in GitHub Desktop.
Save hdon/ec41b461bc89492069edc62528b1cb72 to your computer and use it in GitHub Desktop.
better word2vec stream-parser using stream-promise-inverter
/*
* Some examples of word2vec binary format:
* https://github.com/dav/word2vec/blob/master/src/word2vec.c#L560-L566
* https://github.com/jasonphillips/word2vec-stream/blob/master/index.js
*
* stream-promise-inverter:
* https://github.com/hdon/stream-promise-inverter
*
* compare to JUST word2vec format decode without stream-promise-inverter:
* https://github.com/jasonphillips/word2vec-stream/blob/8c76cf2df6a4dca14be9eb86ea784ccacfa7952d/index.js
*
* TODO: confirm that the word2vec binary format actually encodes words as UTF-8.
*/
const fs = require('fs');
const zlib = require('zlib');
const stream = require('stream');
const util = require('util');
const Inverter = require('./stream-promise-inverter')
const _ = require('lodash');
/*
 * Command-line handling.
 *
 * Expects exactly three arguments after the script name:
 *   1. path of the gzipped word2vec binary to read
 *   2. path of the output file to write
 *   3. path of a text file listing the vocabulary of interest, one word per line
 */
if (process.argv.length !== 5) {
  /* Usage errors belong on stderr so they never mix with regular output */
  console.error('please specify paths for: your gzipped word2vec binary, your desired output file, and a file containing your vocabulary of interest');
  process.exit(1);
}
const inPath = process.argv[2];
const outPath = process.argv[3];
const vocabPath = process.argv[4];

/*
 * Build the lookup table of words we care about.
 *
 * The table has no prototype, so a membership test such as `word in ourVocab`
 * cannot be fooled by inherited names like "constructor" or "toString".
 * Blank lines (including the one produced by a trailing newline) are skipped
 * so that ourVocabCount is the true number of distinct words; a trailing
 * carriage return is dropped so CRLF vocabulary files work too.
 */
const ourVocab = Object.create(null);
for (const line of fs.readFileSync(vocabPath).toString('utf8').split('\n')) {
  const vocabWord = line.endsWith('\r') ? line.slice(0, -1) : line;
  if (vocabWord !== '') {
    ourVocab[vocabWord] = true;
  }
}
const ourVocabCount = Object.keys(ourVocab).length;

/*
 * NOTE: the output file is deliberately not opened here. The write stream at
 * the end of the pipeline opens (and truncates) outPath itself; opening a
 * second descriptor here would only leak it.
 */

/* Each vector component is read as 4 bytes */
const BYTES_PER_DIMENSION = 4;
/* Byte values of the delimiters used by the word2vec binary format */
const SP = ' '.charCodeAt(0);
const LF = '\n'.charCodeAt(0);
/*
 * Decode pipeline: gzipped word2vec binary -> gunzip -> Inverter -> output file.
 *
 * The Inverter callback walks the decompressed stream sequentially. For every
 * word that appears in ourVocab, its raw vector bytes are passed to
 * inverter.write(), whose output is piped into outPath. Only the vector bytes
 * are written, not the words.
 *
 * NOTE(review): the Inverter API is defined in ./stream-promise-inverter, not
 * here. The code below assumes readUntil(byte) resolves with the bytes up to
 * AND including the delimiter (hence the slice(0, -1) on each word), and that
 * read(n) resolves with exactly n bytes -- confirm against that module.
 */
const inStream = fs
  .createReadStream(inPath)
  .pipe(zlib.createGunzip())
  .pipe(new Inverter(async inverter => {
    /* Header is text: "<vocabulary size> <dimensions>\n".
     * Number() ignores the trailing delimiter because it is whitespace. */
    const vocabSize = Number((await inverter.readUntil(SP)).toString('utf8'));
    const numDimensions = Number((await inverter.readUntil(LF)).toString('utf8'));
    const bytesPerVector = BYTES_PER_DIMENSION * numDimensions;
    console.log('bytes per vector =', bytesPerVector);

    /* Read records until we exhaust either our vocab or the file's vocab */
    let ourVocabCursor = 0;
    for (let i = 0; i < vocabSize && ourVocabCursor < ourVocabCount; i++) {
      /*
       * Read in the word: drop the trailing space delimiter, then drop any
       * leading newline. The reference word2vec writer emits "\n" after every
       * vector, so each word after the first would otherwise start with "\n"
       * and never match the vocabulary.
       */
      const word = (await inverter.readUntil(SP))
        .toString('utf8')
        .slice(0, -1)
        .replace(/^\n+/, '');

      /* The vector must always be consumed, even for words we skip */
      const vector = await inverter.read(bytesPerVector);

      /* Own-property check: inherited names must not count as vocabulary */
      if (Object.prototype.hasOwnProperty.call(ourVocab, word)) {
        console.log('found', word);
        /* Write out the word2vec data */
        inverter.write(vector);
        ourVocabCursor++;
      }
    }
    console.log('wrote', ourVocabCursor, 'records to', outPath);
  }))
  .pipe(fs.createWriteStream(outPath));
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment