Skip to content

Instantly share code, notes, and snippets.

@cemoody
Created August 31, 2023 14:52
Show Gist options
  • Save cemoody/dace7fd58e139417b00b7660427ed09e to your computer and use it in GitHub Desktop.
Save cemoody/dace7fd58e139417b00b7660427ed09e to your computer and use it in GitHub Desktop.
from google.cloud.bigquery_storage import BigQueryReadClient
from google.cloud.bigquery_storage import types
from google.cloud import bigquery_storage
from tqdm import tqdm
import pandas
import os
import dill
# GCP project to bill and to host the read session; it must be a project
# where you hold the BigQuery read-session permission.
project_id = "automatic-asset-359722"
def read_table():
    """Read the ``num_asins`` column of the extract table into a DataFrame.

    Builds a BigQuery Storage read session (Arrow format, up to 4 streams)
    for ``moody.partial_product_extract_v1_asins`` in the module-level
    ``project_id``, round-trips the session request through ``dill`` on
    disk, then downloads every page of every stream and concatenates them.

    Returns:
        pandas.DataFrame: rows from all streams of the session, with a fresh
        0..n-1 index. If the session yields no pages, an empty DataFrame
        with the selected column is returned instead of raising.

    Side effects:
        Writes ``requested_session.pkl`` to the current working directory and
        prints the head and shape of the resulting DataFrame.
    """
    bqstorageclient = bigquery_storage.BigQueryReadClient()

    dataset_id = "moody"
    table_id = "partial_product_extract_v1_asins"
    selected_fields = ["num_asins"]

    table = f"projects/{project_id}/datasets/{dataset_id}/tables/{table_id}"
    parent = f"projects/{project_id}"

    read_options = types.ReadSession.TableReadOptions(
        selected_fields=selected_fields
    )
    requested_session = types.ReadSession(
        table=table,
        data_format=types.DataFormat.ARROW,
        read_options=read_options,
    )

    # Round-trip the request through dill to check that it survives
    # serialization. The file is one we just wrote ourselves, so loading it
    # is not an untrusted-pickle concern.
    with open("requested_session.pkl", "wb") as f:
        dill.dump(requested_session, f)
    with open("requested_session.pkl", "rb") as f:
        requested_session = dill.load(f)

    read_session = bqstorageclient.create_read_session(
        parent=parent,
        read_session=requested_session,
        max_stream_count=4,
    )

    # Accumulate pages across ALL streams; resetting the list per stream
    # would silently drop the rows served by every stream but one.
    frames = []
    for stream in read_session.streams:
        reader = bqstorageclient.read_rows(stream.name)
        frames.extend(
            page.to_dataframe() for page in tqdm(reader.rows().pages)
        )

    if frames:
        # Each page carries its own 0-based index; renumber so the combined
        # frame does not contain duplicate index labels.
        dataframe = pandas.concat(frames, ignore_index=True)
    else:
        # No streams or no pages (presumably an empty table): concat([])
        # would raise, so hand back an empty frame with the expected column.
        dataframe = pandas.DataFrame(columns=selected_fields)

    print(dataframe.head())
    print(dataframe.shape)
    return dataframe
# Kick off the read as soon as this file is run (or imported).
read_table()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment