Skip to content

Instantly share code, notes, and snippets.

@jbaiter
Last active July 12, 2018 05:30
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jbaiter/00e57d10aed45432931d3fd3c0465a12 to your computer and use it in GitHub Desktop.
Save jbaiter/00e57d10aed45432931d3fd3c0465a12 to your computer and use it in GitHub Desktop.
Print size distribution of your Solr index
#!/usr/bin/env python3
# coding=utf8
"""
CFE file layout:
Header:
CodecHeader:
Magic [4byte uint32]
Codecname [?byte String]
Version [4byte uint32]
ObjectId [16byte]
SuffixLength [1byte]
SuffixBytes [`SuffixLength` bytes]
FileCount [?byte VInt]
<repeated>:
FileName [?byte String]
DataOffset [8byte uint64]
DataLength [8byte uint64]
"""
import os
import struct
import sys
from collections import defaultdict, namedtuple
from itertools import chain
# Human-readable labels for the Lucene index file extensions this script
# knows about.  Keys are extensions without the leading dot; values are the
# labels printed in the final size report.
TYPE_MAPPING = {
    'si': 'Segment Info',
    'fnm': 'Fields Info',
    'fdx': 'Fields Index',
    'fdt': 'Field Data',
    'tim': 'Term Dictionary',
    'tip': 'Term Index',
    'doc': 'Frequencies',
    'pos': 'Positions',
    'pay': 'Payloads',
    'nvd': 'Norms (nvd)',
    'nvm': 'Norms (nvm)',
    'dvd': 'Per-Document Values (dvd)',
    'dvm': 'Per-Document Values (dvm)',
    'tvx': 'Term Vector Index',
    'tvd': 'Term Vector Documents',
    'tvf': 'Term Vector Fields',
    'liv': 'Live Documents',
    'dii': 'Point values (dii)',
    'dim': 'Point values (dim)'
}

# Parsed .cfe codec header: codec name, codec version, 16-byte object id,
# and the optional suffix bytes (None when the suffix length is zero).
Header = namedtuple('Header', ['codec_name', 'codec_version', 'object_id',
                               'suffix'])

# One entry of the .cfe file table: the contained file's name plus its
# offset and length inside the companion .cfs data file.
File = namedtuple('File', ['name', 'offset', 'length'])
def parse_vint(fp):
    """Read a variable-length integer (Lucene VInt) from binary stream *fp*.

    A VInt stores 7 payload bits per byte, least-significant group first;
    the high bit of each byte signals that another byte follows.

    Returns the decoded non-negative int.

    Raises EOFError if the stream ends mid-value.  (The original raised a
    confusing ``TypeError: ord() expected a character`` from ``ord(b'')``
    when reading a truncated file.)
    """
    chunk = fp.read(1)
    if not chunk:
        raise EOFError("truncated VInt")
    b = chunk[0]
    value = b & 0x7f
    shift = 7
    while b & 0x80:
        chunk = fp.read(1)
        if not chunk:
            raise EOFError("truncated VInt")
        b = chunk[0]
        value |= (b & 0x7f) << shift
        shift += 7
    return value
def parse_string(fp):
    """Read a VInt-length-prefixed UTF-8 string from binary stream *fp*."""
    nbytes = parse_vint(fp)
    raw = fp.read(nbytes)
    return raw.decode('utf8')
def parse_header(fp):
    """Parse the Lucene codec header at the current position of *fp*.

    Layout (see module docstring): magic, codec name, version, 16-byte
    object id, suffix length byte, suffix bytes.

    Returns a ``Header`` namedtuple; ``suffix`` is None when its length
    byte is zero.

    Raises ValueError when the magic number does not match.  (The original
    used ``assert``, which is silently stripped under ``python -O``.)
    """
    # Lucene's CodecUtil.CODEC_MAGIC; 0x3FD76C17 == 1071082519.
    CODEC_MAGIC = 0x3FD76C17
    magic = struct.unpack('>I', fp.read(4))[0]
    if magic != CODEC_MAGIC:
        raise ValueError('bad codec magic: 0x{:08x}'.format(magic))
    name = parse_string(fp)
    version = struct.unpack('>I', fp.read(4))[0]
    object_id = fp.read(16)
    suffix_len = fp.read(1)[0]
    suffix = fp.read(suffix_len) if suffix_len > 0 else None
    return Header(codec_name=name, codec_version=version,
                  object_id=object_id, suffix=suffix)
def parse_files(fp):
    """Yield ``File`` entries from the table that follows the .cfe header.

    *fp* must be a real, readable binary file object positioned just past
    the header (``fp.fileno()`` is used to obtain the physical size).

    Stops early if the physical file ends before the advertised FileCount
    is reached, guarding against truncated compound-entry files.
    """
    num_files = parse_vint(fp)
    # The physical size is loop-invariant: stat once, not per iteration.
    file_size = os.fstat(fp.fileno()).st_size
    for _ in range(num_files):
        if fp.tell() == file_size:
            break  # truncated table: fewer entries than promised
        name = parse_string(fp)
        # DataOffset and DataLength are two big-endian uint64s.
        offset, length = struct.unpack('>QQ', fp.read(16))
        yield File(name, offset, length)
def parse_cfe(fname):
    """Parse the Lucene compound-file-entries file at path *fname*.

    Returns ``(header, files)`` where ``header`` is a ``Header`` and
    ``files`` a list of ``File`` entries.  On any I/O or parse error the
    problem is logged to stderr and ``(None, [])`` is returned, so one
    corrupt segment does not abort the whole index scan.
    """
    try:
        with open(fname, 'rb') as fp:
            header = parse_header(fp)
            files = list(parse_files(fp))
    # Was a bare ``except:``, which also swallowed KeyboardInterrupt and
    # SystemExit, making the script hard to interrupt.
    except Exception:
        print("Could not read {}".format(fname), file=sys.stderr)
        return None, []
    return header, files
def analyze_index(data_dir):
    """Print a per-file-type size distribution for a Solr core's index.

    *data_dir* is the Solr data directory; the first subdirectory whose
    name starts with ``index`` is scanned.  Sizes are collected from two
    places: entries packed inside compound files (described by .cfe
    files) and plain files lying directly in the index directory.

    Raises StopIteration if no ``index*`` subdirectory exists.
    """
    # The first 'index*' subdirectory of the data dir is the live index.
    index_name = next(fname for fname in os.listdir(data_dir)
                      if fname.startswith('index') and
                      os.path.isdir(os.path.join(data_dir, fname)))
    index_dir = os.path.join(data_dir, index_name)
    entries = os.listdir(index_dir)  # list once, reuse below

    # Files packed inside .cfs compound files, as described by the .cfe
    # tables.  (The original also checked ``cfename is not None``, which
    # os.listdir can never produce — dropped.)
    cfe_infos = chain.from_iterable(
        parse_cfe(os.path.join(index_dir, name))[1]
        for name in entries if name.endswith('.cfe'))

    # Plain files; the compound containers themselves and lock files are
    # excluded so nothing is counted twice.
    single_sizes = {name: os.stat(os.path.join(index_dir, name)).st_size
                    for name in entries
                    if os.path.splitext(name)[1] not in ('.cfe', '.cfs', '.lock')}

    stats = defaultdict(int)
    for entry in cfe_infos:
        # Names inside a compound file may be bare extensions ('.doc').
        if entry.name.startswith('.'):
            ftype = entry.name[1:]
        else:
            ftype = os.path.splitext(entry.name)[1][1:]
        if ftype:
            stats[ftype] += entry.length
    for name, size in single_sizes.items():
        ftype = os.path.splitext(name)[1][1:]
        if ftype:
            stats[ftype] += size

    total_size = sum(stats.values())
    gib = 1024. ** 3
    for ftype, size in sorted(stats.items(), key=lambda item: item[1]):
        # Guard against an all-zero index instead of dividing by zero.
        percent = (float(size) / total_size) * 100 if total_size else 0.0
        # Fall back to the raw extension so an unknown file type does not
        # crash the report (was ``TYPE_MAPPING[ftype]`` -> KeyError).
        print("{:<25} {:>6.1f}GiB {:>5.1f}%".format(
            TYPE_MAPPING.get(ftype, ftype), size / gib, percent))
    print("=" * 42)
    print("{:<25} {:>6.1f}GiB".format("Total", total_size / gib))
if __name__ == '__main__':
    # Exactly one argument (the Solr data directory) is required.
    if len(sys.argv) == 2:
        analyze_index(sys.argv[1])
    else:
        print("Please run with the Solr data directory as the first argument.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment