Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
zozo
var fs = require('fs');
// Bound with `var` so the module does not create an implicit global (a bare
// assignment to an undeclared name throws under strict mode).
var Int64 = require('node-int64');
var varint = require('varint');
var lazy = require("lazy");
// http://braindrivendevelopment.com/2013/10/31/reading-a-file-in-node-js/
// http://lucene.apache.org/core/3_0_3/fileformats.pdf
// Load each index file fully into memory; every read below is an absolute
// byte offset into one of these buffers.
var bufferSegmentGen = fs.readFileSync('./segments.gen');
var bufferSegments = fs.readFileSync('./segments_4o4');
var bufferFields = fs.readFileSync('./_9ea.fnm');
var bufferTermsDico = fs.readFileSync('./_9ea.tis');
// GEN
// segments.gen: an Int32 header followed by two Int64 values, both logged as
// the generation.
console.log(bufferSegmentGen.readInt32BE(0));
console.log("Generation"+new Int64(bufferSegmentGen, 4));
console.log("Generation"+new Int64(bufferSegmentGen, 12));
// SEGMENT
/* segments_N layout:
Format Int32, Version Int64, NameCounter Int32, SegCount Int32,
<SegName String, SegSize Int32, DelGen Int64, DocStoreOffset Int32,
[DocStoreSegment, DocStoreIsCompoundFile], HasSingleNormFile Int8,
NumField Int32, NormGenNumField, IsCompoundFile, DeletionCount, HasProx,
Diagnostics>SegCount, CommitUserData, Checksum
*/
// http://www.i-programmer.info/programming/javascript/2550-javascript-bit-manipulation.html
// http://www.cs.umd.edu/class/sum2003/cmsc311/Notes/Data/endian.html
console.log("Format "+bufferSegments.readInt32BE(0));
// Version is the Int64 that follows Format in the segments file itself, not
// in segments.gen.
console.log("Version "+new Int64(bufferSegments, 4));
console.log("NameCounter "+bufferSegments.readInt32BE(12));
console.log("SegCount "+bufferSegments.readInt32BE(16));
// SegName is a String: a VInt length followed by that many bytes. The VInt
// may span more than one byte, so the name starts after however many bytes
// the decoder consumed rather than at a fixed offset.
var segNameSize = varint.decode(bufferSegments, 20);
var segNameStart = 20 + varint.decode.bytes;
var nextOffset = segNameStart + segNameSize;
console.log("SegName size "+segNameSize);
console.log("SegName "+bufferSegments.slice(segNameStart, nextOffset));
console.log("SegSize (number documents) "+bufferSegments.readInt32BE(nextOffset));
// DelGen is an Int64 (8 bytes), so DocStoreOffset sits 12 bytes after SegSize
// starts: 4 bytes of SegSize plus 8 bytes of DelGen.
console.log("DelGen "+new Int64(bufferSegments, nextOffset+4));
console.log("DocStoreOffset "+bufferSegments.readInt32BE(nextOffset+12));
/**
 * Packs its arguments into an integer bit mask: argument number k (counting
 * from zero) contributes `argument << k`, and all contributions are OR-ed
 * together. Only the first 32 arguments are considered.
 *
 * @returns {number} The combined mask as a signed 32-bit integer.
 */
function createMask () {
  var flagCount = Math.min(arguments.length, 32);
  var mask = 0;
  for (var bit = 0; bit < flagCount; bit++) {
    mask |= arguments[bit] << bit;
  }
  return mask;
}
// FIELDS
// FieldInfos (.fnm) -->
// FNMVersion VInt, FieldsCount VInt, <FieldName String, FieldBits Byte> FieldsCount
// FNMVersion (added in 2.9) is always -2.
/* FieldBits --> Byte
Each constant below is the mask for one flag inside the FieldBits byte.
• The low-order bit is one for indexed fields, and zero for non-indexed fields.
*/
var INDEXED_FIELDS = 0x01;
/*
• The second lowest-order bit is one for fields that have term vectors stored, and zero for
fields without term vectors.
*/
var TERM_VECTORS_STORED = 0x02;
/*
• If the third lowest-order bit is set (0x04), term positions are stored with the term vectors.
*/
var TERM_POSITIONS_STORED = 0x04;
/*
• If the fourth lowest-order bit is set (0x08), term offsets are stored with the term vectors.
*/
var TERM_OFFSETS_STORED = 0x08;
/*
• If the fifth lowest-order bit is set (0x10), norms are omitted for the indexed field.
*/
var NO_NORMS = 0x10;
/*
• If the sixth lowest-order bit is set (0x20), payloads are stored for the indexed field
*/
var PAYLOADS_STORED = 0x20;
// The file starts with FNMVersion, so that VInt has to be consumed before
// FieldsCount can be read. The decoder yields an unsigned value; `| 0` folds
// it back to a signed 32-bit integer so a negative version prints as such.
var fnmVersion = varint.decode(bufferFields, 0) | 0;
nextOffset = varint.decode.bytes;
console.log("FNMVersion "+fnmVersion);
console.log("FieldsCount "+varint.decode(bufferFields, nextOffset));
// Leave nextOffset on the first byte after FieldsCount, i.e. at the first
// <FieldName, FieldBits> entry.
nextOffset += varint.decode.bytes;
// GET FIELDS
/*while (nextOffset < bufferFields.length) {
// string
var fieldNameSize = varint.decode(bufferFields, nextOffset);
nextOffset += varint.decode.bytes;
//console.log("FieldName size "+fieldNameSize);
console.log("FieldName "+bufferFields.slice(nextOffset, nextOffset+fieldNameSize));
nextOffset += fieldNameSize;
// fieldbits
var fieldBit = bufferFields[nextOffset];
console.log(fieldBit);
var fieldBitInfo = {
INDEXED_FIELDS: (fieldBit & INDEXED_FIELDS) === INDEXED_FIELDS,
TERM_VECTORS_STORED: (fieldBit & TERM_VECTORS_STORED) === TERM_VECTORS_STORED,
TERM_POSITIONS_STORED: (fieldBit & TERM_POSITIONS_STORED) === TERM_POSITIONS_STORED,
TERM_OFFSETS_STORED: (fieldBit & TERM_OFFSETS_STORED) === TERM_OFFSETS_STORED,
NO_NORMS: (fieldBit & NO_NORMS) === NO_NORMS,
PAYLOADS_STORED: (fieldBit & PAYLOADS_STORED) === PAYLOADS_STORED
};
console.log(fieldBitInfo);
nextOffset += 1;
}*/
// GET TERM DICTIONARY
/* TermInfoFile (.tis) --> TIVersion, TermCount, IndexInterval, SkipInterval,
MaxSkipLevels, TermInfos
TIVersion --> UInt32
TermCount --> UInt64
IndexInterval --> UInt32
SkipInterval --> UInt32
MaxSkipLevels --> UInt32
TermInfos --> <TermInfo> TermCount
TermInfo --> <Term, DocFreq, FreqDelta, ProxDelta, SkipDelta>
Term --> <PrefixLength, Suffix, FieldNum VInt>
Suffix --> String
PrefixLength, DocFreq, FreqDelta, ProxDelta, SkipDelta
--> VInt
*/
// Fixed-size header: 4 + 8 + 4 + 4 + 4 = 24 bytes.
console.log("TIVersion "+bufferTermsDico.readInt32BE(0));
var termCount64 = new Int64(bufferTermsDico, 4);
console.log("TermCount"+termCount64);
console.log("IndexInterval"+bufferTermsDico.readInt32BE(12));
var skipInterval = bufferTermsDico.readInt32BE(16);
console.log("SkipInterval"+skipInterval);
console.log("MaxSkipLevels"+bufferTermsDico.readInt32BE(20));
// Same numeric conversion that the string concatenation above applies.
var termCount = Number(termCount64);
// TermInfos begin immediately after the header.
nextOffset = 24;
var i = 0;
var jsonFields = [];
var PrefixLength = 0;
// The .tis file is binary and its records are not newline-delimited, so the
// entries are decoded back to back from the in-memory buffer. Doing this
// synchronously also means jsonFields is complete before any later statement
// reads it. The length check stops the walk on a truncated file.
while (i < termCount && nextOffset < bufferTermsDico.length) {
  var field = {};
  // Term --> <PrefixLength, Suffix, FieldNum>
  PrefixLength = varint.decode(bufferTermsDico, nextOffset);
  nextOffset += varint.decode.bytes;
  // Suffix is a String: VInt length, then that many bytes.
  var suffixSize = varint.decode(bufferTermsDico, nextOffset);
  nextOffset += varint.decode.bytes;
  var Suffix = bufferTermsDico.slice(nextOffset, nextOffset+suffixSize);
  nextOffset += suffixSize;
  var FieldNum = varint.decode(bufferTermsDico, nextOffset);
  nextOffset += varint.decode.bytes;
  var DocFreq = varint.decode(bufferTermsDico, nextOffset);
  nextOffset += varint.decode.bytes;
  var FreqDelta = varint.decode(bufferTermsDico, nextOffset);
  nextOffset += varint.decode.bytes;
  var ProxDelta = varint.decode(bufferTermsDico, nextOffset);
  nextOffset += varint.decode.bytes;
  // SkipDelta is only present in the file when DocFreq >= SkipInterval;
  // reading it unconditionally would consume the first byte of the next
  // entry and desynchronise everything after it. null marks "not stored".
  var SkipDelta = null;
  if (DocFreq >= skipInterval) {
    SkipDelta = varint.decode(bufferTermsDico, nextOffset);
    nextOffset += varint.decode.bytes;
  }
  field.PrefixLength = PrefixLength;
  // Only the suffix is recorded; the part shared with the previous term
  // (PrefixLength) is not re-attached here.
  field.Suffix = Suffix.toString();
  field.FieldNum = FieldNum;
  field.DocFreq = DocFreq;
  field.FreqDelta = FreqDelta;
  field.ProxDelta = ProxDelta;
  field.SkipDelta = SkipDelta;
  jsonFields.push(field);
  i++;
}
/*while (nextOffset < bufferTermsDico.length) {
var field = {};
varint = require('varint');
// TermInfos --> <TermInfo> TermCount
// TermInfo --> <Term, DocFreq, FreqDelta, ProxDelta, SkipDelta>
// Term --> <PrefixLength, Suffix, FieldNum>
PrefixLength = varint.decode(bufferTermsDico, nextOffset);
nextOffset += varint.decode.bytes;
var suffixSize = varint.decode(bufferTermsDico, nextOffset);
nextOffset += varint.decode.bytes;
var Suffix = bufferTermsDico.slice(nextOffset, nextOffset+suffixSize);
nextOffset += suffixSize;
var FieldNum = varint.decode(bufferTermsDico, nextOffset);
nextOffset += varint.decode.bytes;
var DocFreq = varint.decode(bufferTermsDico, nextOffset);
nextOffset += varint.decode.bytes;
var FreqDelta = varint.decode(bufferTermsDico, nextOffset);
nextOffset += varint.decode.bytes;
var ProxDelta = varint.decode(bufferTermsDico, nextOffset);
nextOffset += varint.decode.bytes;
var SkipDelta = varint.decode(bufferTermsDico, nextOffset);
nextOffset += varint.decode.bytes;
field.PrefixLength = PrefixLength;
field.Suffix = Suffix.toString();
field.Suffix = field.Suffix.substring(PrefixLength, field.Suffix.length);
field.FieldNum = FieldNum;
field.DocFreq = DocFreq;
field.FreqDelta = FreqDelta;
field.ProxDelta = ProxDelta;
field.SkipDelta = SkipDelta;
//jsonFields.push(field);
//i++;
if (i ===5)
break;
}*/
/*jsonFields = jsonFields.sort(function(a,b) {
return b.FieldNum - a.FieldNum }
);
for (var i=0;i<jsonFields.length;i++) {
if (i>0) {
jsonFields[i].Suffix = jsonFields[i].Suffix.substring(jsonFields[i-1].PrefixLength, jsonFields[i].Suffix.length);
}
}*/
// Debug marker showing that execution reached this point.
console.log("coucou");
// Report how many entries jsonFields holds right now, then serialise exactly
// those entries (tab-indented) into fields.json.
console.log(jsonFields.length);
var serializedFields = JSON.stringify(jsonFields, null, '\t');
fs.writeFileSync('fields.json', serializedFields);
//console.log("FieldName "+bufferSegments.slice(21, nextOffset));
// console.log(data);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.