tseaver/repro-python-storage-462.py

## repro-python-storage-462.py
import csv
import datetime
import io
import random
import string

from google.api_core import exceptions
from google.cloud.storage import Client

# Bucket and object used by the reproduction.
BUCKET_NAME = "gcp-storage-repro-462"
BLOB_NAME = "repro.csv"

# Alphabet for the random column: every printable character that is
# not whitespace.
CHARS = list(set(string.printable).difference(string.whitespace))

# Number of data rows generated by default.
ROW_COUNT = 100_000

# Size, in bytes, handed to ``blob.open`` as ``chunk_size``.
MEGABYTE = 1024 ** 2
CHUNK_SIZE: int = 4 * MEGABYTE


def _generate_csv_rows(count=ROW_COUNT):
    """Yield a header tuple followed by ``count`` data rows.

    Each data row is ``(iso_date, counter, random_string)``:

    * ``iso_date`` starts at today's date and moves back one day per row;
    * ``counter`` runs from ``count`` down to 1;
    * ``random_string`` is 100 characters drawn from ``CHARS``.

    Args:
        count: Number of data rows to produce.  Only the header is
            yielded when it is zero or negative.

    Yields:
        The header tuple first, then one tuple per data row.

    Raises:
        OverflowError: If the date is stepped back past
            ``datetime.date.min``; that needs a far larger ``count``
            than the default.
    """
    one_day = datetime.timedelta(days=1)
    current_date = datetime.date.today()

    yield ("Date", "Counter", "Random String")
    while count > 0:
        rand_string = "".join(random.choices(CHARS, k=100))
        yield (current_date.isoformat(), count, rand_string)
        count -= 1
        # This used to read ``current_date -+ timedelta(...)``, which only
        # evaluates a subtraction and throws the result away, so every row
        # carried the same date.  Rebind the name so the date really moves.
        current_date -= one_day

def _write_csv(count=ROW_COUNT):
    """Return the generated rows rendered as a single CSV string.

    Args:
        count: Number of data rows, forwarded to ``_generate_csv_rows``.

    Returns:
        The CSV text: the header line followed by the data lines.
    """
    text_buffer = io.StringIO()
    # The generator's first item is the header row, so one ``writerows``
    # call emits the header and the data in the right order.
    csv.writer(text_buffer).writerows(_generate_csv_rows(count))
    return text_buffer.getvalue()

def _ensure_bucket(client):
    """Return the ``BUCKET_NAME`` bucket, creating it when it is missing.

    Args:
        client: Storage client used to build the bucket handle.

    Returns:
        The bucket handle obtained from ``client.bucket``.
    """
    bucket = client.bucket(BUCKET_NAME)

    if bucket.exists():
        return bucket

    # Not there yet: create it with the "public-read" predefined ACL.
    bucket.create(predefined_acl="public-read")
    return bucket

def _ensure_blob(bucket):
    """Return the ``BLOB_NAME`` blob, uploading fresh CSV data if absent.

    Args:
        bucket: Bucket handle in which the blob is looked up.

    Returns:
        The blob handle obtained from ``bucket.blob``.
    """
    blob = bucket.blob(BLOB_NAME)

    try:
        blob.reload()
    except exceptions.NotFound:
        # ``reload`` reported the object as missing: generate the CSV
        # and upload it.
        blob.upload_from_string(
            _write_csv(),
            content_type="text/csv",
            predefined_acl="public-read",
        )

    return blob

def main():
    """Compare a whole-object download with an odd-sized chunked read.

    Downloads the blob in one call, then re-reads it through
    ``blob.open`` in reads of 17 * 17 * 17 bytes, and asserts that both
    byte strings are equal.  Lengths and the number of non-empty reads
    are printed along the way.
    """
    client = Client()
    blob = _ensure_blob(_ensure_bucket(client))

    csv_bytes = blob.download_as_bytes()
    print(f"Length of whole CSV:   {len(csv_bytes)}")

    weird_chunk_size = 17 ** 3
    output_buffer = io.BytesIO()
    chunk_count = 0

    with blob.open("rb", chunk_size=CHUNK_SIZE) as stream:
        while True:
            weird_chunk = stream.read(weird_chunk_size)
            if not weird_chunk:
                # An empty read marks the end of the stream.
                break
            chunk_count += 1
            output_buffer.write(weird_chunk)

    chunked_bytes = output_buffer.getvalue()
    print(f"Chunk count:           {chunk_count}")
    print(f"Length of chunked CSV: {len(chunked_bytes)}")
    assert chunked_bytes == csv_bytes
    print("OK")


if __name__ == "__main__":
    main()
	import csv
	import datetime
	import io
	import random
	import string

	from google.api_core import exceptions
	from google.cloud.storage import Client

	# Bucket and object used by the reproduction.
	BUCKET_NAME = "gcp-storage-repro-462"
	BLOB_NAME = "repro.csv"

	# Alphabet for the random column: every printable character that is
	# not whitespace.
	CHARS = list(set(string.printable).difference(string.whitespace))

	# Number of data rows generated by default.
	ROW_COUNT = 100_000

	# Size, in bytes, handed to ``blob.open`` as ``chunk_size``.
	MEGABYTE = 1024 ** 2
	CHUNK_SIZE: int = 4 * MEGABYTE


	def _generate_csv_rows(count=ROW_COUNT):
	    """Yield a header tuple followed by ``count`` data rows.

	    Each data row is ``(iso_date, counter, random_string)``:

	    * ``iso_date`` starts at today's date and moves back one day per row;
	    * ``counter`` runs from ``count`` down to 1;
	    * ``random_string`` is 100 characters drawn from ``CHARS``.

	    Args:
	        count: Number of data rows to produce.  Only the header is
	            yielded when it is zero or negative.

	    Yields:
	        The header tuple first, then one tuple per data row.

	    Raises:
	        OverflowError: If the date is stepped back past
	            ``datetime.date.min``; that needs a far larger ``count``
	            than the default.
	    """
	    one_day = datetime.timedelta(days=1)
	    current_date = datetime.date.today()

	    yield ("Date", "Counter", "Random String")
	    while count > 0:
	        rand_string = "".join(random.choices(CHARS, k=100))
	        yield (current_date.isoformat(), count, rand_string)
	        count -= 1
	        # This used to read ``current_date -+ timedelta(...)``, which only
	        # evaluates a subtraction and throws the result away, so every row
	        # carried the same date.  Rebind the name so the date really moves.
	        current_date -= one_day

	def _write_csv(count=ROW_COUNT):
	    """Return the generated rows rendered as a single CSV string.

	    Args:
	        count: Number of data rows, forwarded to ``_generate_csv_rows``.

	    Returns:
	        The CSV text: the header line followed by the data lines.
	    """
	    iterator = _generate_csv_rows(count)
	    # The generator's first item is the header row.
	    field_names = next(iterator)
	    buf = io.StringIO()
	    writer = csv.writer(buf)
	    writer.writerow(field_names)

	    for row in iterator:
	        writer.writerow(row)

	    return buf.getvalue()

	def _ensure_bucket(client):
	    """Return the ``BUCKET_NAME`` bucket, creating it when it is missing.

	    Args:
	        client: Storage client used to build the bucket handle.

	    Returns:
	        The bucket handle obtained from ``client.bucket``.
	    """
	    bucket = client.bucket(BUCKET_NAME)

	    if not bucket.exists():
	        # Not there yet: create it with the "public-read" predefined ACL.
	        bucket.create(predefined_acl="public-read")

	    return bucket

	def _ensure_blob(bucket):
	    """Return the ``BLOB_NAME`` blob, uploading fresh CSV data if absent.

	    Args:
	        bucket: Bucket handle in which the blob is looked up.

	    Returns:
	        The blob handle obtained from ``bucket.blob``.
	    """
	    blob = bucket.blob(BLOB_NAME)

	    try:
	        blob.reload()
	    except exceptions.NotFound:
	        # ``reload`` reported the object as missing: generate the CSV
	        # and upload it.
	        csv_text = _write_csv()
	        blob.upload_from_string(
	            csv_text, content_type="text/csv", predefined_acl="public-read",
	        )

	    return blob

	def main():
	    """Compare a whole-object download with an odd-sized chunked read.

	    Downloads the blob in one call, then re-reads it through
	    ``blob.open`` in reads of 17 * 17 * 17 bytes, and asserts that both
	    byte strings are equal.  Lengths and the number of non-empty reads
	    are printed along the way.
	    """
	    client = Client()
	    bucket = _ensure_bucket(client)
	    blob = _ensure_blob(bucket)

	    csv_bytes = blob.download_as_bytes()
	    print(f"Length of whole CSV: {len(csv_bytes)}")

	    output_buffer = io.BytesIO()
	    weird_chunk_size = 17 * 17 * 17
	    chunk_count = 0

	    with blob.open("rb", chunk_size=CHUNK_SIZE) as stream:

	        weird_chunk = stream.read(weird_chunk_size)

	        # An empty read marks the end of the stream.
	        while weird_chunk:
	            chunk_count += 1
	            output_buffer.write(weird_chunk)
	            weird_chunk = stream.read(weird_chunk_size)

	    print(f"Chunk count: {chunk_count}")
	    chunked_bytes = output_buffer.getvalue()
	    print(f"Length of chunked CSV: {len(chunked_bytes)}")
	    assert chunked_bytes == csv_bytes
	    print("OK")

	if __name__ == "__main__":
	    main()