Created September 17, 2014 02:46
-
-
Save maliabadi/5916fb8ab3ffdbc6ef58 to your computer and use it in GitHub Desktop.
solr_segment_file_parser.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import bitstring | |
from sys import argv | |
class SegmentsFile(object):
    """Sequential reader for a Solr/Lucene segments file.

    The file is consumed through a ``bitstring.ConstBitStream``; every
    ``read_*`` helper advances the stream position.  Call :meth:`parse` to
    obtain one dictionary per segment entry; header fields, the commit user
    data and the checksum are stored as attributes on the instance.
    """

    def __init__(self, path):
        """Open the file at *path* as a read-only bit stream."""
        self.stream = bitstring.ConstBitStream(filename=path)

    def read_8(self):
        """Read the next 8 bits as an unsigned integer."""
        return self.stream.read('uint:8')

    def read_32(self):
        """Read the next 32 bits as an unsigned integer."""
        return self.stream.read('uint:32')

    def read_64(self):
        """Read the next 64 bits as an unsigned integer."""
        return self.stream.read('uint:64')

    def read_vint(self):
        """Read a variable-length unsigned integer (Lucene ``VInt``).

        Each byte contributes its low seven bits, least-significant group
        first; a set high bit means another byte follows.

        Returns:
            The decoded non-negative integer.
        """
        value = 0
        shift = 0
        while True:
            byte = self.stream.read('uint:8')
            value |= (byte & 0x7F) << shift
            # A clear continuation bit marks the final byte of the value.
            if not byte & 0x80:
                return value
            shift += 7

    def read_string(self):
        """Read a length-prefixed string.

        The prefix is a VInt byte count, followed by that many bytes of
        UTF-8 text.  All bytes are always consumed, so the stream stays
        aligned even when the text contains non-ASCII or whitespace
        characters; undecodable bytes become U+FFFD.

        Returns:
            The decoded text (empty string for a zero length).
        """
        length = self.read_vint()
        if length == 0:
            return ''
        raw = self.stream.read('bytes:%d' % length)
        return raw.decode('utf-8', 'replace')

    def read_string_map(self):
        """Read a count followed by that many key/value string pairs.

        Returns:
            A dict mapping each key to its value; later duplicates win.
        """
        # NOTE(review): the pair count is read as a VInt, as in the original
        # code.  Some Lucene versions appear to store it as a 32-bit integer
        # instead -- confirm against the index version being parsed.
        pair_count = self.read_vint()
        mapping = {}
        for _ in range(pair_count):
            key = self.read_string()
            mapping[key] = self.read_string()
        return mapping

    def parse_header(self):
        """Read the fixed header from the start of the file, at most once.

        Sets ``format``, ``version``, ``name_counter`` and ``seg_count`` and
        leaves the stream positioned just after the header.  Subsequent
        calls are no-ops.
        """
        if getattr(self, 'header_parsed', False):
            return
        self.stream.pos = 0
        self.format = self.read_32()
        self.version = self.read_64()
        self.name_counter = self.read_32()
        self.seg_count = self.read_32()
        self.header_parsed = True

    def parse(self):
        """Parse the whole file.

        Reads the header, then ``seg_count`` segment entries, then the
        commit user data map and the trailing checksum (stored as
        ``commit_user_data`` and ``checksum``).

        Returns:
            A list with one dict per segment entry, in file order.
        """
        self.parse_header()
        entries = [self.parse_entry() for _ in range(self.seg_count)]
        self.commit_user_data = self.read_string_map()
        self.checksum = self.read_64()
        return entries

    def parse_entry(self):
        """Read one segment entry from the current stream position.

        Returns:
            A dict keyed by field name; values are ints, strings, or (for
            ``diagnostics``) a dict of strings.
        """
        # Fields are listed in on-disk order; each reader consumes exactly
        # the bytes belonging to its field.
        # NOTE(review): every field is read unconditionally and integers are
        # unsigned; if the format makes the doc-store fields optional (e.g.
        # when the offset is -1) this needs revisiting -- confirm.
        layout = (
            ('seg_version', self.read_string),
            ('seg_name', self.read_string),
            ('seg_size', self.read_32),
            ('del_gen', self.read_64),
            ('doc_store_offset', self.read_32),
            ('doc_store_segment', self.read_string),
            ('doc_store_is_compound_file', self.read_8),
            ('has_single_norm_file', self.read_8),
            ('num_field', self.read_32),
            ('is_compound_file', self.read_8),
            ('deletion_count', self.read_32),
            ('has_prox', self.read_8),
            ('diagnostics', self.read_string_map),
            ('has_vectors', self.read_8),
        )
        data = {}
        for field, reader in layout:
            data[field] = reader()
        return data
if __name__ == "__main__":
    # Command-line entry point: parse the segments file named by the first
    # argument and print the list of per-segment entries.
    if len(argv) < 2:
        # Exit with a usage hint instead of an IndexError traceback.
        raise SystemExit("usage: %s <segments-file>" % argv[0])
    segment_file = SegmentsFile(argv[1])
    print(segment_file.parse())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment