Skip to content

Instantly share code, notes, and snippets.

@aymanfarhat
Created September 10, 2022 08:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aymanfarhat/c41873f6842dd060b8d4d92b1106c152 to your computer and use it in GitHub Desktop.
Save aymanfarhat/c41873f6842dd060b8d4d92b1106c152 to your computer and use it in GitHub Desktop.
#Copyright 2022 Google LLC.
#SPDX-License-Identifier: Apache-2.0
import os
import psutil
from google.cloud.storage import Client
def get_mem_mb() -> float:
    """Return the resident set size (RSS) of the current process in MiB.

    Returns:
        The RSS reported by ``psutil`` for this process, converted from
        bytes to mebibytes (bytes / 1024 / 1024).
    """
    # psutil.Process() with no pid refers to the calling process.
    return psutil.Process().memory_info().rss / (1024 * 1024)
class ChunkParser:
    """Write-only, file-like sink that inspects chunks instead of storing them.

    An instance can be handed to any API that streams ``bytes`` through a
    ``write(chunk)`` call. Each chunk is examined and then dropped, so the
    sink itself does not accumulate the streamed data.

    Attributes are plain counters updated by ``write``:
        chunk_count: number of ``write`` calls received so far.
        line_breaks_count: total number of ``b'\\n'`` bytes seen so far.
    """

    def __init__(self, fileobj):
        """Create a parser wrapping ``fileobj``.

        Args:
            fileobj: The underlying file object. It is stored but not
                written to by the current implementation (see ``write``).
        """
        self._fileobj = fileobj
        self.chunk_count = 0
        self.line_breaks_count = 0

    def write(self, chunk: bytes) -> int:
        """Consume one chunk and update the counters.

        Args:
            chunk: A bytes-like block of streamed data.

        Returns:
            The number of bytes consumed (always ``len(chunk)``), matching
            the return contract of file-like ``write`` methods.
        """
        # Forwarding to the wrapped file is deliberately disabled; re-enable
        # the next line to also persist the streamed data:
        # self._fileobj.write(chunk)
        self.chunk_count += 1
        self.line_breaks_count += chunk.count(b'\n')
        return len(chunk)
# Report RSS before and after the download so the two readings bracket it.
print(f'Allocated memory on start {get_mem_mb()}')

client = Client()
bucket = client.get_bucket('your-bucket')
blob = bucket.blob('inputs/data/compressed.csv')

# The third positional argument of built-in open() is `buffering`, not an
# os.open() flag, so the former `os.O_NONBLOCK` argument only set a buffer
# size (and raised AttributeError on platforms without that constant).
# It is dropped here; the file is still created, as before.
with open('virtua_file', 'wb') as blob_file:
    parser = ChunkParser(blob_file)
    # download_to_file() pushes the blob through parser.write() chunk by chunk.
    blob.download_to_file(parser)

print(f'Total chunks {parser.chunk_count}')
print(f'Total line breaks {parser.line_breaks_count}')
print(f'Allocated memory on end {get_mem_mb()}')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment