Skip to content

Instantly share code, notes, and snippets.

@jbaiter
Last active July 12, 2018 05:30
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jbaiter/00e57d10aed45432931d3fd3c0465a12 to your computer and use it in GitHub Desktop.
Save jbaiter/00e57d10aed45432931d3fd3c0465a12 to your computer and use it in GitHub Desktop.
Print size distribution of your Solr index
#!/usr/bin/env python3
# coding=utf8
"""
CFE file layout:
Header:
CodecHeader:
Magic [4byte uint32]
Codecname [?byte String]
Version [4byte uint32]
ObjectId [16byte]
SuffixLength [1byte]
SuffixBytes [`SuffixLength` bytes]
FileCount [?byte VInt]
<repeated>:
FileName [?byte String]
DataOffset [8byte uint64]
DataLength [8byte uint64]
"""
import os
import struct
import sys
from collections import defaultdict, namedtuple
from itertools import chain
# Human-readable labels for the Lucene index file extensions this script
# knows about.  Keys are extensions without the leading dot; values are the
# labels printed in the final size report.
TYPE_MAPPING = {
    'si': 'Segment Info',
    'fnm': 'Fields Info',
    'fdx': 'Fields Index',
    'fdt': 'Field Data',
    'tim': 'Term Dictionary',
    'tip': 'Term Index',
    'doc': 'Frequencies',
    'pos': 'Positions',
    'pay': 'Payloads',
    'nvd': 'Norms (nvd)',
    'nvm': 'Norms (nvm)',
    'dvd': 'Per-Document Values (dvd)',
    'dvm': 'Per-Document Values (dvm)',
    'tvx': 'Term Vector Index',
    'tvd': 'Term Vector Documents',
    'tvf': 'Term Vector Fields',
    'liv': 'Live Documents',
    'dii': 'Point values (dii)',
    'dim': 'Point values (dim)'
}

# Parsed .cfe codec header: codec name, codec version, 16-byte object id,
# and the optional suffix bytes (None when the suffix length is zero).
Header = namedtuple('Header', ['codec_name', 'codec_version', 'object_id',
                               'suffix'])

# One entry of the .cfe file table: the contained file's name plus its
# offset and length inside the companion .cfs data file.
File = namedtuple('File', ['name', 'offset', 'length'])
def parse_vint(fp):
    """Read a variable-length integer (Lucene VInt) from binary stream *fp*.

    A VInt stores 7 payload bits per byte, least-significant group first;
    the high bit of each byte signals that another byte follows.

    Returns the decoded non-negative int.

    Raises EOFError if the stream ends mid-value.  (The original raised a
    confusing ``TypeError: ord() expected a character`` from ``ord(b'')``
    when reading a truncated file.)
    """
    chunk = fp.read(1)
    if not chunk:
        raise EOFError("truncated VInt")
    b = chunk[0]
    value = b & 0x7f
    shift = 7
    while b & 0x80:
        chunk = fp.read(1)
        if not chunk:
            raise EOFError("truncated VInt")
        b = chunk[0]
        value |= (b & 0x7f) << shift
        shift += 7
    return value
def parse_string(fp):
    """Read a VInt-length-prefixed UTF-8 string from binary stream *fp*."""
    nbytes = parse_vint(fp)
    raw = fp.read(nbytes)
    return raw.decode('utf8')
def parse_header(fp):
    """Parse the Lucene codec header at the current position of *fp*.

    Layout (see module docstring): magic, codec name, version, 16-byte
    object id, suffix length byte, suffix bytes.

    Returns a ``Header`` namedtuple; ``suffix`` is None when its length
    byte is zero.

    Raises ValueError when the magic number does not match.  (The original
    used ``assert``, which is silently stripped under ``python -O``.)
    """
    # Lucene's CodecUtil.CODEC_MAGIC; 0x3FD76C17 == 1071082519.
    CODEC_MAGIC = 0x3FD76C17
    magic = struct.unpack('>I', fp.read(4))[0]
    if magic != CODEC_MAGIC:
        raise ValueError('bad codec magic: 0x{:08x}'.format(magic))
    name = parse_string(fp)
    version = struct.unpack('>I', fp.read(4))[0]
    object_id = fp.read(16)
    suffix_len = fp.read(1)[0]
    suffix = fp.read(suffix_len) if suffix_len > 0 else None
    return Header(codec_name=name, codec_version=version,
                  object_id=object_id, suffix=suffix)
def parse_files(fp):
    """Yield ``File`` entries from the table that follows the .cfe header.

    *fp* must be a real, readable binary file object positioned just past
    the header (``fp.fileno()`` is used to obtain the physical size).

    Stops early if the physical file ends before the advertised FileCount
    is reached, guarding against truncated compound-entry files.
    """
    num_files = parse_vint(fp)
    # The physical size is loop-invariant: stat once, not per iteration.
    file_size = os.fstat(fp.fileno()).st_size
    for _ in range(num_files):
        if fp.tell() == file_size:
            break  # truncated table: fewer entries than promised
        name = parse_string(fp)
        # DataOffset and DataLength are two big-endian uint64s.
        offset, length = struct.unpack('>QQ', fp.read(16))
        yield File(name, offset, length)
def parse_cfe(fname):
    """Parse the Lucene compound-file-entries file at path *fname*.

    Returns ``(header, files)`` where ``header`` is a ``Header`` and
    ``files`` a list of ``File`` entries.  On any I/O or parse error the
    problem is logged to stderr and ``(None, [])`` is returned, so one
    corrupt segment does not abort the whole index scan.
    """
    try:
        with open(fname, 'rb') as fp:
            header = parse_header(fp)
            files = list(parse_files(fp))
    # Was a bare ``except:``, which also swallowed KeyboardInterrupt and
    # SystemExit, making the script hard to interrupt.
    except Exception:
        print("Could not read {}".format(fname), file=sys.stderr)
        return None, []
    return header, files
def analyze_index(data_dir):
    """Print a per-file-type size distribution for a Solr core's index.

    *data_dir* is the Solr data directory; the first subdirectory whose
    name starts with ``index`` is scanned.  Sizes are collected from two
    places: entries packed inside compound files (described by .cfe
    files) and plain files lying directly in the index directory.

    Raises StopIteration if no ``index*`` subdirectory exists.
    """
    # The first 'index*' subdirectory of the data dir is the live index.
    index_name = next(fname for fname in os.listdir(data_dir)
                      if fname.startswith('index') and
                      os.path.isdir(os.path.join(data_dir, fname)))
    index_dir = os.path.join(data_dir, index_name)
    entries = os.listdir(index_dir)  # list once, reuse below

    # Files packed inside .cfs compound files, as described by the .cfe
    # tables.  (The original also checked ``cfename is not None``, which
    # os.listdir can never produce — dropped.)
    cfe_infos = chain.from_iterable(
        parse_cfe(os.path.join(index_dir, name))[1]
        for name in entries if name.endswith('.cfe'))

    # Plain files; the compound containers themselves and lock files are
    # excluded so nothing is counted twice.
    single_sizes = {name: os.stat(os.path.join(index_dir, name)).st_size
                    for name in entries
                    if os.path.splitext(name)[1] not in ('.cfe', '.cfs', '.lock')}

    stats = defaultdict(int)
    for entry in cfe_infos:
        # Names inside a compound file may be bare extensions ('.doc').
        if entry.name.startswith('.'):
            ftype = entry.name[1:]
        else:
            ftype = os.path.splitext(entry.name)[1][1:]
        if ftype:
            stats[ftype] += entry.length
    for name, size in single_sizes.items():
        ftype = os.path.splitext(name)[1][1:]
        if ftype:
            stats[ftype] += size

    total_size = sum(stats.values())
    gib = 1024. ** 3
    for ftype, size in sorted(stats.items(), key=lambda item: item[1]):
        # Guard against an all-zero index instead of dividing by zero.
        percent = (float(size) / total_size) * 100 if total_size else 0.0
        # Fall back to the raw extension so an unknown file type does not
        # crash the report (was ``TYPE_MAPPING[ftype]`` -> KeyError).
        print("{:<25} {:>6.1f}GiB {:>5.1f}%".format(
            TYPE_MAPPING.get(ftype, ftype), size / gib, percent))
    print("=" * 42)
    print("{:<25} {:>6.1f}GiB".format("Total", total_size / gib))
if __name__ == '__main__':
    # Exactly one argument (the Solr data directory) is required.
    if len(sys.argv) == 2:
        analyze_index(sys.argv[1])
    else:
        print("Please run with the Solr data directory as the first argument.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment