Skip to content

Instantly share code, notes, and snippets.

@maliabadi
Created September 17, 2014 02:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save maliabadi/5916fb8ab3ffdbc6ef58 to your computer and use it in GitHub Desktop.
Save maliabadi/5916fb8ab3ffdbc6ef58 to your computer and use it in GitHub Desktop.
solr_segment_file_parser.py
import bitstring
from sys import argv
class SegmentsFile(object):
    """Sequential reader for a Solr segments file.

    The file is consumed front to back through a ``bitstring`` stream:
    a fixed header, ``seg_count`` segment entries, a string map of commit
    user data and a trailing 64-bit checksum.

    NOTE(review): the field sequence is hard-coded and all fixed-width
    integers are read as unsigned; presumably this mirrors one specific
    version of the on-disk format -- verify against the index version
    actually being read.
    """

    def __init__(self, path):
        """Open the file at *path* as a read-only bit stream."""
        self.stream = bitstring.ConstBitStream(filename=path)

    # -- fixed-width primitives -------------------------------------------

    def read_8(self):
        """Read the next 8 bits as an unsigned integer."""
        return self.stream.read('uint:8')

    def read_32(self):
        """Read the next 32 bits as an unsigned integer."""
        return self.stream.read('uint:32')

    def read_64(self):
        """Read the next 64 bits as an unsigned integer."""
        return self.stream.read('uint:64')

    # -- variable-width primitives ----------------------------------------

    def read_vint(self):
        """Read a variable-length unsigned integer.

        Each byte carries seven payload bits, least-significant group
        first; a set high bit means another byte follows.  Values below
        128 therefore occupy exactly one byte.
        """
        value = 0
        shift = 0
        while True:
            byte = self.stream.read('uint:8')
            value |= (byte & 0x7F) << shift
            if not byte & 0x80:
                # High bit clear: this was the final byte of the integer.
                return value
            shift += 7

    def read_string(self):
        """Read a length-prefixed string.

        The prefix is a variable-length integer; that many bytes are then
        consumed and decoded as UTF-8 (undecodable bytes become U+FFFD).
        All bytes are always consumed so the stream stays aligned with the
        next field.

        NOTE(review): assumes the prefix counts bytes rather than
        characters -- confirm for the format version in use.
        NOTE(review): code points <= 32 are dropped from the result, which
        also removes ordinary spaces; kept for backward compatibility.
        """
        length = self.read_vint()
        # Read unsigned bytes: a signed read would turn every non-ASCII
        # UTF-8 byte negative.
        raw = bytearray(self.stream.read('uint:8') for _ in range(length))
        text = raw.decode('utf-8', 'replace')
        return ''.join(ch for ch in text if ord(ch) > 32)

    def read_string_map(self):
        """Read a count-prefixed sequence of key/value string pairs.

        Returns a dict; a repeated key keeps the last value read.
        """
        response = {}
        for _ in range(self.read_vint()):
            # Explicit statements (not a dict comprehension) so the key is
            # always read from the stream before its value.
            key = self.read_string()
            response[key] = self.read_string()
        return response

    # -- file structure ---------------------------------------------------

    def parse_header(self):
        """Read the file header once, starting from the beginning.

        Sets ``format``, ``version``, ``name_counter`` and ``seg_count``.
        Later calls return immediately without touching the stream.
        """
        if getattr(self, 'header_parsed', False):
            return
        self.stream.pos = 0
        self.format = self.read_32()
        self.version = self.read_64()
        self.name_counter = self.read_32()
        self.seg_count = self.read_32()
        self.header_parsed = True

    def parse_entry(self):
        """Read one segment entry at the current position.

        Returns a dict of the entry's fields, read in the order listed.
        """
        data = {}
        data['seg_version'] = self.read_string()
        data['seg_name'] = self.read_string()
        data['seg_size'] = self.read_32()
        data['del_gen'] = self.read_64()
        data['doc_store_offset'] = self.read_32()
        data['doc_store_segment'] = self.read_string()
        data['doc_store_is_compound_file'] = self.read_8()
        data['has_single_norm_file'] = self.read_8()
        data['num_field'] = self.read_32()
        data['is_compound_file'] = self.read_8()
        data['deletion_count'] = self.read_32()
        data['has_prox'] = self.read_8()
        data['diagnostics'] = self.read_string_map()
        data['has_vectors'] = self.read_8()
        return data

    def parse(self):
        """Parse the whole file and return the list of segment entries.

        Also stores the trailing ``commit_user_data`` map and ``checksum``
        on the instance.
        """
        self.parse_header()
        entries = [self.parse_entry() for _ in range(self.seg_count)]
        self.commit_user_data = self.read_string_map()
        self.checksum = self.read_64()
        return entries
if __name__ == "__main__":
    # Command-line entry point: parse the file named by the first argument
    # and print the list of segment entries.
    if len(argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        raise SystemExit("usage: %s <segments-file>" % argv[0])
    segment_file = SegmentsFile(argv[1])
    print(segment_file.parse())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment