Skip to content

Instantly share code, notes, and snippets.

@chekunkov
Last active April 20, 2022 12:55
Show Gist options
  • Save chekunkov/1ebcb461c4afd4d98cd4bf3893ce2059 to your computer and use it in GitHub Desktop.
Stream gzip file from s3
import zlib
import boto
def decompress(key):
    """Lazily inflate a gzip-compressed byte stream.

    Iterates over *key* (any iterable of compressed byte chunks, such as a
    boto S3 key) and yields decompressed byte chunks as they become
    available, so the whole object never has to fit in memory.
    """
    # wbits = 16 + MAX_WBITS makes zlib expect a gzip header and trailer.
    inflater = zlib.decompressobj(16 + zlib.MAX_WBITS)
    for compressed_chunk in key:
        yield inflater.decompress(compressed_chunk)
    # Emit whatever the decompressor still holds in its internal buffer.
    yield inflater.flush()
def iterlines(decompressed_stream):
    """Re-chunk an iterable of byte strings into newline-terminated lines.

    Accumulates incoming chunks in a buffer and yields one line (including
    its trailing b'\\n') at a time.  A final fragment with no terminating
    newline is yielded as-is.
    """
    pending = b''
    for piece in decompressed_stream:
        pending += piece
        # Drain every complete line currently sitting in the buffer.
        while True:
            newline_at = pending.find(b'\n')
            if newline_at < 0:
                # No full line left; go fetch more data from the stream.
                break
            yield pending[:newline_at + 1]
            pending = pending[newline_at + 1:]
    # The stream may end without a final newline — still a line.
    if pending:
        yield pending
def main():
    """Stream a gzipped S3 object and print it line by line.

    Connects to S3 via boto, fetches the object lazily, decompresses the
    stream on the fly, and prints each decoded line without ever loading
    the whole file into memory.
    """
    bucket = boto.connect_s3().get_bucket("some.bucket")
    key = bucket.get_key("some/key.gz")
    dstream = decompress(key)
    for line in iterlines(dstream):
        # Function-call form: works on Python 3 as well as Python 2
        # (the original `print line` statement is a SyntaxError on 3.x).
        print(line)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment