Created
February 13, 2015 16:19
-
-
Save darul75/6b418da7281ca43c5786 to your computer and use it in GitHub Desktop.
zozo
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Dependencies: node-int64 (reads 64-bit big-endian ints from a Buffer),
// varint (Lucene VInt decoding), lazy (line-oriented stream processing).
var fs = require('fs');
// BUG FIX: `var` was missing here, so Int64 leaked as an implicit global
// (the previous statement already ended with a semicolon).
var Int64 = require('node-int64');
var varint = require('varint');
var lazy = require("lazy");

// http://braindrivendevelopment.com/2013/10/31/reading-a-file-in-node-js/
// http://lucene.apache.org/core/3_0_3/fileformats.pdf

// Raw bytes of the Lucene 3.x index files this script inspects.
var bufferSegmentGen = fs.readFileSync('./segments.gen'); // index generation marker
var bufferSegments = fs.readFileSync('./segments_4o4');   // segments_N commit point
var bufferFields = fs.readFileSync('./_9ea.fnm');         // field infos
var bufferTermsDico = fs.readFileSync('./_9ea.tis');      // term dictionary
/*for (ii = 0; ii < bufferFields.length; ii++) {
  console.log(bufferFields.readUInt8(ii));
}*/
//console.log(bufferSegments.readUInt32BE(0));

// ---- segments.gen: Format Int32, then the generation Int64 written twice ----
console.log(bufferSegmentGen.readInt32BE(0));
console.log("Generation"+new Int64(bufferSegmentGen, 4));
console.log("Generation"+new Int64(bufferSegmentGen, 12));

// ---- SEGMENT (segments_N) layout, Lucene 3.x ----
/*Format Int32, Version Int64, NameCounter Int32, SegCount Int32, <SegName String
, SegSize Int32, DelGen Int64,
DocStoreOffset Int32, [DocStoreSegment, DocStoreIsCompoundFile], HasSingleNormFile Int8
,
NumField Int32, NormGenNumField, IsCompoundFile, DeletionCount, HasProx,
Diagnostics>SegCount, CommitUserData, Checksum
*/
// http://www.i-programmer.info/programming/javascript/2550-javascript-bit-manipulation.html
// http://www.cs.umd.edu/class/sum2003/cmsc311/Notes/Data/endian.html
console.log("Format "+bufferSegments.readInt32BE(0));
// BUG FIX: Version lives at offset 4 of the segments file itself; the original
// read it from bufferSegmentGen (copy-paste slip from the Generation log above).
console.log("Version "+new Int64(bufferSegments, 4));
console.log("NameCounter "+bufferSegments.readInt32BE(12));
console.log("SegCount "+bufferSegments.readInt32BE(16));
// SegName is a VInt length followed by that many bytes.
var segNameSize = varint.decode(bufferSegments, 20);
// assumes the length VInt occupied exactly 1 byte (name < 128 chars) — TODO confirm
var nextOffset = 21 + segNameSize;
console.log("SegName size "+segNameSize); // reuse the decoded value instead of decoding twice
console.log("SegName "+bufferSegments.slice(21, nextOffset));
console.log("SegSize (number documents) "+bufferSegments.readInt32BE(nextOffset));
// NOTE(review): DelGen is an Int64 in the spec; only its high 32 bits are read here.
console.log("DelGen "+bufferSegments.readInt32BE(nextOffset+4));
console.log("DocStoreOffset "+bufferSegments.readInt32BE(nextOffset+8));
// NOTE(review): exploratory read — buffer (segments.gen) and offset look unintended; verify.
console.log(new Int64(bufferSegmentGen, nextOffset+5));
/**
 * Pack up to 32 boolean-ish arguments into a bitmask: argument i is shifted
 * left by i and OR'd into the result, so truthy argument i sets bit i.
 * Arguments beyond the 32nd are ignored (a JS bitmask is 32 bits wide).
 * @returns {number} the resulting 32-bit mask
 */
function createMask () {
  var flags = Array.prototype.slice.call(arguments, 0, 32);
  var mask = 0;
  for (var i = 0; i < flags.length; i++) {
    mask |= flags[i] << i;
  }
  return mask;
}
// FIELDS
// FieldInfos (.fnm) -->
// FNMVersion VInt,FieldsCount VInt , <FieldName String, FieldBits Byte> FieldsCount
// TODO FNMVersion (added in 2.9) is always -2.

// FieldBits flag values — one status byte per field in the .fnm file.
/*FieldBits --> Byte
• The low-order bit is one for indexed fields, and zero for non-indexed fields.
*/
var INDEXED_FIELDS = 0x01;
/*
• The second lowest-order bit is one for fields that have term vectors stored, and zero for
fields without term vectors.
*/
var TERM_VECTORS_STORED = 0x02;
/*
• If the third lowest-order bit is set (0x04), term positions are stored with the term vectors.
*/
var TERM_POSITIONS_STORED = 0x04;
/*
• If the fourth lowest-order bit is set (0x08), term offsets are stored with the term vectors.
*/
var TERM_OFFSETS_STORED = 0x08;
/*
• If the fifth lowest-order bit is set (0x10), norms are omitted for the indexed field.
*/
var NO_NORMS = 0x10;
/*
• If the sixth lowest-order bit is set (0x20), payloads are stored for the indexed field
*/
var PAYLOADS_STORED = 0x20;
console.log("FieldsCount "+varint.decode(bufferFields, 0));
// varint.decode.bytes records how many bytes the last decode consumed.
nextOffset = varint.decode.bytes;
// GET FIELDS
/*
NOTE(review): if this loop is revived, the flag tests below are wrong:
`fieldBit & INDEXED_FIELDS === INDEXED_FIELDS` parses as
`fieldBit & (INDEXED_FIELDS === INDEXED_FIELDS)` because === binds tighter
than &. Use `(fieldBit & INDEXED_FIELDS) === INDEXED_FIELDS` instead.

while (nextOffset < bufferFields.length) {
  // string
  var fieldNameSize = varint.decode(bufferFields, nextOffset);
  nextOffset += varint.decode.bytes;
  //console.log("FieldName size "+fieldNameSize);
  console.log("FieldName "+bufferFields.slice(nextOffset, nextOffset+fieldNameSize));
  nextOffset += fieldNameSize;
  // fieldbits
  var fieldBit = bufferFields[nextOffset];
  console.log(fieldBit);
  var fielsBitInfo = {
    INDEXED_FIELDS: fieldBit & INDEXED_FIELDS === INDEXED_FIELDS,
    TERM_VECTORS_STORED: fieldBit & TERM_VECTORS_STORED === TERM_VECTORS_STORED,
    TERM_POSITIONS_STORED: fieldBit & TERM_POSITIONS_STORED === TERM_POSITIONS_STORED,
    TERM_OFFSETS_STORED: fieldBit & TERM_OFFSETS_STORED === TERM_OFFSETS_STORED,
    NO_NORMS: fieldBit & NO_NORMS === NO_NORMS,
    PAYLOADS_STORED: fieldBit & PAYLOADS_STORED === PAYLOADS_STORED
  };
  console.log(fielsBitInfo);
  nextOffset += 1;
}*/
// GET TERM DICTIONARY
/*TermInfoFile (.tis)--> TIVersion UInt32, TermCount UInt64, IndexInterval UInt32, SkipInterval UInt32,
MaxSkipLevels , TermInfos*/
/*TIVersion --> UInt32
TermCount --> UInt64
IndexInterval --> UInt32
SkipInterval --> UInt32
MaxSkipLevels --> UInt32
TermInfos --> <TermInfo> TermCount
TermInfo --> <Term, DocFreq, FreqDelta, ProxDelta, SkipDelta>
Term --> <PrefixLength, Suffix, FieldNum VInt>
Suffix --> String
PrefixLength, DocFreq, FreqDelta, ProxDelta, SkipDelta
--> VInt
*/
// Fixed-size header: 4 (TIVersion) + 8 (TermCount) + 4 + 4 + 4 = 24 bytes.
console.log("TIVersion "+bufferTermsDico.readInt32BE(0));
console.log("TermCount"+new Int64(bufferTermsDico, 4));
console.log("IndexInterval"+bufferTermsDico.readInt32BE(12));
console.log("SkipInterval"+bufferTermsDico.readInt32BE(16));
console.log("MaxSkipLevels"+bufferTermsDico.readInt32BE(20));
nextOffset = 24;     // first TermInfo starts right after the header
var i = 0;           // TermInfo counter (debug)
var jsonFields = []; // decoded TermInfo records, serialized to fields.json later
var PrefixLength = 0;
// Stream the .tis file and decode one TermInfo per "line".
// NOTE(review): .tis is a binary file; splitting it on newline bytes (0x0A)
// and skipping the first chunk almost certainly lands the decoder at arbitrary
// positions in the varint stream — the commented-out offset-based loop further
// down is the sound approach. Verify before trusting this output.
new lazy(fs.createReadStream('./_9ea.tis')).lines.skip(1).forEach(function(line){
  var field = {};
  nextOffset = 0; // restart decoding at the beginning of each chunk
  // TermInfos --> <TermInfo> TermCount
  // TermInfo --> <Term, DocFreq, FreqDelta, ProxDelta, SkipDelta>
  // Term --> <PrefixLength, Suffix, FieldNum>
  PrefixLength = varint.decode(line, nextOffset);
  nextOffset += varint.decode.bytes; // bytes consumed by the last decode
  var suffixSize = varint.decode(line, nextOffset);
  nextOffset += varint.decode.bytes;
  var Suffix = line.slice(nextOffset, nextOffset+suffixSize);
  nextOffset += suffixSize;
  var FieldNum = varint.decode(line, nextOffset);
  nextOffset += varint.decode.bytes;
  var DocFreq = varint.decode(line, nextOffset);
  nextOffset += varint.decode.bytes;
  var FreqDelta = varint.decode(line, nextOffset);
  nextOffset += varint.decode.bytes;
  var ProxDelta = varint.decode(line, nextOffset);
  nextOffset += varint.decode.bytes;
  var SkipDelta = varint.decode(line, nextOffset);
  nextOffset += varint.decode.bytes;
  field.PrefixLength = PrefixLength;
  field.Suffix = Suffix.toString();
  //field.Suffix = field.Suffix.substring(PrefixLength, field.Suffix.length);
  field.FieldNum = FieldNum;
  field.DocFreq = DocFreq;
  field.FreqDelta = FreqDelta;
  field.ProxDelta = ProxDelta;
  field.SkipDelta = SkipDelta;
  jsonFields.push(field);
  //console.log(field);
  i++;
  /*if (i ===5)
  break;*/
}
// NOTE(review): 'pipe' fires when the stream starts, not when it ends, so this
// may write before (or while) jsonFields is being filled; appendFile is also
// called without a callback — confirm against the lazy/fs API in use.
).on('pipe', function () {
  fs.appendFile('fields.json', JSON.stringify(jsonFields, null, '\t'));
});
// Alternative (disabled): decode TermInfos directly from the in-memory buffer
// with a running byte offset — this respects the varint stream, unlike the
// line-based version above.
/*while (nextOffset < bufferTermsDico.length) {
  var field = {};
  varint = require('varint');
  // TermInfos --> <TermInfo> TermCount
  // TermInfo --> <Term, DocFreq, FreqDelta, ProxDelta, SkipDelta>
  // Term --> <PrefixLength, Suffix, FieldNum>
  PrefixLength = varint.decode(bufferTermsDico, nextOffset);
  nextOffset += varint.decode.bytes;
  var suffixSize = varint.decode(bufferTermsDico, nextOffset);
  nextOffset += varint.decode.bytes;
  var Suffix = bufferTermsDico.slice(nextOffset, nextOffset+suffixSize);
  nextOffset += suffixSize;
  var FieldNum = varint.decode(bufferTermsDico, nextOffset);
  nextOffset += varint.decode.bytes;
  var DocFreq = varint.decode(bufferTermsDico, nextOffset);
  nextOffset += varint.decode.bytes;
  var FreqDelta = varint.decode(bufferTermsDico, nextOffset);
  nextOffset += varint.decode.bytes;
  var ProxDelta = varint.decode(bufferTermsDico, nextOffset);
  nextOffset += varint.decode.bytes;
  var SkipDelta = varint.decode(bufferTermsDico, nextOffset);
  nextOffset += varint.decode.bytes;
  field.PrefixLength = PrefixLength;
  field.Suffix = Suffix.toString();
  field.Suffix = field.Suffix.substring(PrefixLength, field.Suffix.length);
  field.FieldNum = FieldNum;
  field.DocFreq = DocFreq;
  field.FreqDelta = FreqDelta;
  field.ProxDelta = ProxDelta;
  field.SkipDelta = SkipDelta;
  //jsonFields.push(field);
  //i++;
  if (i ===5)
  break;
}*/
/*jsonFields = jsonFields.sort(function(a,b) {
  return b.FieldNum - a.FieldNum }
);
for (var i=0;i<jsonFields.length;i++) {
  if (i>0) {
    jsonFields[i].Suffix = jsonFields[i].Suffix.substring(jsonFields[i-1].PrefixLength, jsonFields[i].Suffix.length);
  }
}*/
// NOTE(review): the lazy stream set up earlier is asynchronous, so at this
// point its callbacks have not run yet — jsonFields is still empty and the
// file written below will contain []. The write belongs in a stream-end
// handler. TODO confirm and restructure.
console.log("coucou");
console.log(jsonFields.length);
fs.writeFileSync('fields.json', JSON.stringify(jsonFields, null, '\t'));
//console.log("FieldName "+bufferSegments.slice(21, nextOffset));
// console.log(data);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment