Skip to content

Instantly share code, notes, and snippets.

@bramp
Last active December 15, 2015 01:49
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bramp/5183117 to your computer and use it in GitHub Desktop.
Save bramp/5183117 to your computer and use it in GitHub Desktop.
# Simple script that reads a MongoDB datafile (by Andrew Brampton). It finds all documents starting with _id, reads each record, and prints out its uncompressed and compressed sizes.
# Simple script that reads a mongodb datafile
# Finds all documents starting with _id
# reads the record, and prints out compressed and uncompressed
# results.
#
import mmap
import os
import struct
import zlib
import sys
# BSON element header for an ObjectId field named "_id": the type byte 0x07
# followed by the field name.  In a document whose first field is such an
# _id, this marker sits immediately after the int32 length prefix.
_ID_MARKER = b'\x07_id'

# BSON stores the document length as a little-endian int32, regardless of
# the byte order of the machine running this script.
_LENGTH = struct.Struct('<i')

# Smallest document that can start with an ObjectId _id field:
# length (4) + type (1) + "_id\0" (4) + ObjectId (12) + terminator (1).
_MIN_DOCUMENT_LEN = 22


def scan_records(path):
    """Yield one tuple per document found in the data file at *path*.

    The file is searched for the ``_id`` marker; the four bytes before each
    match are decoded as the document length, and the document is
    compressed with zlib to measure how well it compresses.

    Each yielded tuple is ``(offset, gap, record_len, compressed_len)``:

    * ``offset`` -- file offset of the marker (4 bytes past the document
      start).
    * ``gap`` -- distance from the previously reported marker (from 0 for
      the first one).
    * ``record_len`` -- document length as stored in the file.
    * ``compressed_len`` -- size of the zlib-compressed document.

    Matches whose length prefix is missing, implausibly small, or runs past
    the end of the file are treated as false positives and skipped.
    """
    with open(path, 'rb') as f:
        if os.fstat(f.fileno()).st_size == 0:
            # mmap refuses to map an empty file; there is nothing to scan.
            return
        s = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
        try:
            size = len(s)
            previous = 0
            search_from = 0
            while True:
                marker = s.find(_ID_MARKER, search_from)
                if marker == -1:
                    break

                start = marker - _LENGTH.size
                record_len = -1
                if start >= 0:
                    record_len = _LENGTH.unpack(s[start:marker])[0]

                if record_len < _MIN_DOCUMENT_LEN or start + record_len > size:
                    # Not a real document header.  Step past this match so
                    # the search always makes progress (a zero or negative
                    # length would otherwise find the same match forever).
                    search_from = marker + 1
                    continue

                # The stored length counts the length prefix itself, so the
                # document spans [start, start + record_len).
                record = s[start:start + record_len]
                compressed_len = len(zlib.compress(record))

                yield marker, marker - previous, record_len, compressed_len

                previous = marker
                # Resume after this document so fields nested inside it are
                # not mistaken for new documents.
                search_from = start + record_len
        finally:
            s.close()


def main(argv=None):
    """Print per-document statistics and totals for the file in argv[1].

    One line is printed per document: marker offset, gap since the previous
    marker, document length and compressed length.  The final line holds the
    total document length and the total compressed length.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit('usage: <script> <mongodb-datafile>')

    total_record_len = 0
    total_compressed_len = 0
    for offset, gap, record_len, compressed_len in scan_records(argv[1]):
        print('%d %d %d %d' % (offset, gap, record_len, compressed_len))
        total_record_len += record_len
        total_compressed_len += compressed_len
    print('%d %d' % (total_record_len, total_compressed_len))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment