Skip to content

Instantly share code, notes, and snippets.

@chekunkov
Last active April 20, 2022 12:55
Show Gist options
  • Save chekunkov/1ebcb461c4afd4d98cd4bf3893ce2059 to your computer and use it in GitHub Desktop.
Stream gzip file from s3
import zlib
import boto
def decompress(key):
    """Lazily inflate a gzip-compressed byte stream.

    Iterates over *key* (any iterable of compressed byte chunks, such as a
    boto S3 key) and yields decompressed byte chunks as they become
    available, so the whole object never has to fit in memory.
    """
    # wbits = 16 + MAX_WBITS makes zlib expect a gzip header and trailer.
    inflater = zlib.decompressobj(16 + zlib.MAX_WBITS)
    for compressed_chunk in key:
        yield inflater.decompress(compressed_chunk)
    # Emit whatever the decompressor still holds in its internal buffer.
    yield inflater.flush()
def iterlines(decompressed_stream):
    """Re-chunk an iterable of byte strings into newline-terminated lines.

    Accumulates incoming chunks in a buffer and yields one line (including
    its trailing b'\\n') at a time.  A final fragment with no terminating
    newline is yielded as-is.
    """
    pending = b''
    for piece in decompressed_stream:
        pending += piece
        # Drain every complete line currently sitting in the buffer.
        while True:
            newline_at = pending.find(b'\n')
            if newline_at < 0:
                # No full line left; go fetch more data from the stream.
                break
            yield pending[:newline_at + 1]
            pending = pending[newline_at + 1:]
    # The stream may end without a final newline — still a line.
    if pending:
        yield pending
def main():
    """Stream a gzipped S3 object and print it line by line.

    Connects to S3 via boto, fetches the object lazily, decompresses the
    stream on the fly, and prints each decoded line without ever loading
    the whole file into memory.
    """
    bucket = boto.connect_s3().get_bucket("some.bucket")
    key = bucket.get_key("some/key.gz")
    dstream = decompress(key)
    for line in iterlines(dstream):
        # Function-call form: works on Python 3 as well as Python 2
        # (the original `print line` statement is a SyntaxError on 3.x).
        print(line)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment