Last active
February 11, 2018 03:16
-
-
Save hdon/ec41b461bc89492069edc62528b1cb72 to your computer and use it in GitHub Desktop.
better word2vec stream-parser using stream-promise-inverter
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Some examples of word2vec binary format:
 *   https://github.com/dav/word2vec/blob/master/src/word2vec.c#L560-L566
 *   https://github.com/jasonphillips/word2vec-stream/blob/master/index.js
 *
 * stream-promise-inverter:
 *   https://github.com/hdon/stream-promise-inverter
 *
 * Compare to JUST the word2vec format decode, without stream-promise-inverter:
 *   https://github.com/jasonphillips/word2vec-stream/blob/8c76cf2df6a4dca14be9eb86ea784ccacfa7952d/index.js
 *
 * TODO: are we sure utf8 is used by the word2vec binary format?
 */
const fs = require('fs');
const zlib = require('zlib');
const stream = require('stream');
const util = require('util');
const Inverter = require('./stream-promise-inverter');
const _ = require('lodash');

/*
 * Filters a gzipped word2vec binary down to the vectors of a chosen vocabulary.
 *
 * Arguments (all required):
 *   1. path to the gzipped word2vec binary
 *   2. path of the output file; receives the raw vector bytes of every
 *      matching word, in the order they occur in the input
 *   3. path to a text file listing the words of interest, one per line
 */
if (process.argv.length !== 5) {
  console.error('please specify paths for: your gzipped word2vec binary, your desired output file, and a file containing your vocabulary of interest');
  process.exit(1);
}

const inPath = process.argv[2];
const outPath = process.argv[3];
const vocabPath = process.argv[4];

/*
 * A Set (rather than a plain object probed with `in`) so that words such as
 * "constructor" or "toString" are not matched through Object.prototype.
 * Blank lines are dropped: the empty entry produced by a trailing newline can
 * never match a word, and counting it would keep the early exit in the read
 * loop from ever triggering.
 */
const ourVocab = new Set(
  _.compact(fs.readFileSync(vocabPath, 'utf8').split(/\r?\n/))
);
const ourVocabCount = ourVocab.size;

const BYTES_PER_DIMENSION = 4;
const SP = ' '.charCodeAt(0);
const LF = '\n'.charCodeAt(0);

/*
 * Builds an 'error' listener for one pipeline stage. `.pipe()` does not
 * forward errors between streams, so every stage needs its own listener to
 * report the failure and mark the run as failed.
 */
function reportStreamError(stage) {
  return (err) => {
    console.error('error while', stage + ':', err.message);
    process.exitCode = 1;
  };
}

/*
 * Parser body driven by the Inverter: reads the text header, then walks the
 * (word, vector) records and forwards the vector bytes of every word that is
 * in our vocabulary.
 */
async function extractVectors(inverter) {
  /* Header is "<vocabulary size> <dimensions>\n" */
  const vocabSize = Number(await inverter.readUntil(SP));
  const numDimensions = Number(await inverter.readUntil(LF));
  if (
    !Number.isInteger(vocabSize) || vocabSize < 0 ||
    !Number.isInteger(numDimensions) || numDimensions <= 0
  ) {
    console.error('malformed word2vec header: vocabulary size =', vocabSize, ', dimensions =', numDimensions);
    process.exitCode = 1;
    return;
  }

  const bytesPerVector = BYTES_PER_DIMENSION * numDimensions;
  console.log('bytes per vector =', bytesPerVector);

  /* Read all vectors until we exhaust either our vocab or the file's vocab */
  let ourVocabCursor = 0;
  for (let i = 0; i < vocabSize && ourVocabCursor < ourVocabCount; i++) {
    /*
     * slice(0, -1) drops the trailing delimiter (the original code relies on
     * readUntil() including it). word2vec.c additionally writes '\n' after
     * every vector, so a word may arrive with a leading newline; strip it so
     * words after the first one can still match.
     */
    const word = (await inverter.readUntil(SP))
      .toString('utf8')
      .slice(0, -1)
      .replace(/^\n+/, '');

    /* The vector always has to be consumed to stay aligned with the records */
    const vector = await inverter.read(bytesPerVector);

    /* If the word is one of our words, grab it */
    if (ourVocab.has(word)) {
      console.log('found', word);
      /* Write out the word2vec data */
      inverter.write(vector);
      ourVocabCursor++;
    }
  }
  console.log('wrote', ourVocabCursor, 'records to', outPath);
}

/*
 * The write stream below is the only handle on the output file; opening the
 * path a second time with fs.openSync() would just leak a descriptor.
 */
fs.createReadStream(inPath)
  .on('error', reportStreamError('reading ' + inPath))
  .pipe(zlib.createGunzip())
  .on('error', reportStreamError('decompressing ' + inPath))
  .pipe(new Inverter(extractVectors))
  .on('error', reportStreamError('parsing word2vec data'))
  .pipe(fs.createWriteStream(outPath))
  .on('error', reportStreamError('writing ' + outPath));
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment