Skip to content

Instantly share code, notes, and snippets.

@tseaver
Created June 11, 2021 19:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tseaver/f5a2f91d7685a175ef89ab1d63cd681f to your computer and use it in GitHub Desktop.
Save tseaver/f5a2f91d7685a175ef89ab1d63cd681f to your computer and use it in GitHub Desktop.
import csv
import datetime
import io
import random
import string
from google.api_core import exceptions
from google.cloud.storage import Client
# Bucket and object name this script creates (if missing) and reads back.
BUCKET_NAME = "gcp-storage-repro-462"
BLOB_NAME = "repro.csv"

# Alphabet for the random CSV column: printable characters minus whitespace.
# Built from a set, so the order of the list is not meaningful.
CHARS = list(set(string.printable).difference(string.whitespace))

# Default number of data rows written to the generated CSV.
ROW_COUNT = 100_000

MEGABYTE = 1 << 20
# Passed as ``chunk_size`` when the blob is opened for reading in ``main``.
CHUNK_SIZE: int = 4 * MEGABYTE
def _generate_csv_rows(count=ROW_COUNT):
    """Yield the header row, then ``count`` data rows for the sample CSV.

    The first item yielded is the header tuple
    ``("Date", "Counter", "Random String")``.  Every following item is a
    tuple ``(iso_date, counter, random_string)`` where:

    * ``iso_date`` starts at today's date and moves back one day per row,
    * ``counter`` runs from ``count`` down to 1,
    * ``random_string`` is 100 characters drawn at random from ``CHARS``.

    Args:
        count: Number of data rows to produce.  Zero or a negative value
            yields only the header row.

    Yields:
        The header tuple, followed by one tuple per data row.
    """
    current_date = datetime.date.today()
    yield ("Date", "Counter", "Random String")
    while count > 0:
        rand_string = "".join(random.choices(CHARS, k=100))
        yield (current_date.isoformat(), count, rand_string)
        count -= 1
        # This used to read ``current_date -+ timedelta(...)``: a bare
        # expression whose result was thrown away, so every row repeated
        # today's date.  NOTE(review): ``-=`` is assumed to be the intent
        # (``+`` being a shifted ``=``) -- confirm the direction.
        current_date -= datetime.timedelta(days=1)
def _write_csv(count=ROW_COUNT):
    """Render the generated rows as a single CSV-formatted string.

    Args:
        count: Number of data rows to request from ``_generate_csv_rows``.

    Returns:
        The CSV text: the header row followed by the data rows.
    """
    rows = _generate_csv_rows(count)
    text_out = io.StringIO()
    sink = csv.writer(text_out)
    # The generator always yields the header first; write it, then let the
    # writer drain whatever rows remain.
    sink.writerow(next(rows))
    sink.writerows(rows)
    return text_out.getvalue()
def _ensure_bucket(client):
    """Return the ``BUCKET_NAME`` bucket, creating it when it does not exist.

    Args:
        client: Storage client used to obtain the bucket handle.

    Returns:
        The bucket handle obtained from ``client.bucket(BUCKET_NAME)``.
    """
    handle = client.bucket(BUCKET_NAME)
    if handle.exists():
        return handle
    # Missing bucket: create it with the same public-read ACL the blob uses.
    handle.create(predefined_acl="public-read")
    return handle
def _ensure_blob(bucket):
    """Return the ``BLOB_NAME`` blob, uploading a fresh CSV if it is missing.

    Args:
        bucket: Bucket handle in which the blob should live.

    Returns:
        The blob handle obtained from ``bucket.blob(BLOB_NAME)``.
    """
    target = bucket.blob(BLOB_NAME)
    try:
        # ``reload`` raises ``NotFound`` when the object is absent.
        target.reload()
    except exceptions.NotFound:
        target.upload_from_string(
            _write_csv(),
            content_type="text/csv",
            predefined_acl="public-read",
        )
    return target
def main():
    """Check that reading the blob in small pieces matches a full download.

    Ensures the bucket and blob exist, downloads the blob in one call, then
    reads it again through ``blob.open`` using fixed-size ``read`` calls and
    asserts that the reassembled bytes equal the full download.  Progress
    figures are printed along the way.
    """
    storage_client = Client()
    blob = _ensure_blob(_ensure_bucket(storage_client))

    csv_bytes = blob.download_as_bytes()
    print(f"Length of whole CSV: {len(csv_bytes)}")

    # 4913 bytes per read; it does not divide CHUNK_SIZE evenly --
    # presumably chosen so reads straddle the stream's chunk boundaries.
    read_size = 17 * 17 * 17
    reassembled = io.BytesIO()
    reads = 0
    with blob.open("rb", chunk_size=CHUNK_SIZE) as stream:
        while True:
            piece = stream.read(read_size)
            if not piece:
                # An empty read marks the end of the stream.
                break
            reads += 1
            reassembled.write(piece)
    print(f"Chunk count: {reads}")

    chunked_bytes = reassembled.getvalue()
    print(f"Length of chunked CSV: {len(chunked_bytes)}")
    assert chunked_bytes == csv_bytes
    print("OK")
# Run the check only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment