Skip to content

Instantly share code, notes, and snippets.

@Shinichi-Nakagawa
Last active November 8, 2022 08:11
Show Gist options
  • Save Shinichi-Nakagawa/01b1e1583cb8ba0c2e24abe2b77cdfac to your computer and use it in GitHub Desktop.
Save Shinichi-Nakagawa/01b1e1583cb8ba0c2e24abe2b77cdfac to your computer and use it in GitHub Desktop.
PyCon JP 2022資料用スニペット
"""
5. BigQuery読込
"""
import logging
from typing import Optional

from pyspark.sql import DataFrame as SparkDataFrame
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException
# Session setup: identical to the other snippets.
spark: SparkSession = (
    SparkSession.builder
    .appName('your app')
    .config('spark.jars', 'gs://spark-lib/bigquery/spark-bigquery-with-dependencies_2.12-0.25.2.jar')
    .config('spark.sql.debug.maxToStringFields', 2000)
    .getOrCreate()
)
# GCS_BUCKET is not defined in this snippet; it must be provided elsewhere.
spark.conf.set('temporaryGcsBucket', GCS_BUCKET)
# NOTE: reading from a view must be enabled here, otherwise the read
# fails with an error.
spark.conf.set("viewsEnabled", "true")
def read_bq() -> Optional[SparkDataFrame]:
    """Read the dashboard data from a BigQuery view into a Spark DataFrame.

    Uses the module-level ``spark`` session with the ``bigquery`` data
    source, reading the table ``your_project.view_baseball`` in project
    ``your project``.

    Returns:
        The loaded DataFrame, or ``None`` when the load raises
        ``AnalysisException``. The failure is logged with its traceback
        instead of being dropped silently; callers must handle ``None``.
    """
    try:
        return (
            spark.read.format('bigquery')
            .option('project', 'your project')
            .option('table', 'your_project.view_baseball')
            .load()
        )
    except AnalysisException:
        # Keep the original best-effort contract (return None), but leave a
        # trace so a failed read is diagnosable.
        logging.getLogger(__name__).exception(
            'Failed to read BigQuery table %s', 'your_project.view_baseball'
        )
        return None
# Load the BigQuery view at module level. read_bq() returns None when the
# read raises AnalysisException, so sdf may be None despite the annotation.
sdf: SparkDataFrame = read_bq()
"""
6. GCS保存
"""
from pyspark.sql import DataFrame as SparkDataFrame
def save_json(sdf: SparkDataFrame) -> None:
    """Write a Spark DataFrame to the GCS bucket as a JSON dataset.

    The output goes to ``gs://your-gcs-bucket/filepath/hoge`` using the
    ``overwrite`` save mode.
    """
    json_writer = sdf.write.format("json").mode("overwrite")
    json_writer.option("path", "gs://your-gcs-bucket/filepath/hoge").save()
# Persist the DataFrame read from BigQuery to GCS as JSON.
# NOTE(review): sdf is None if read_bq() hit AnalysisException; save_json
# would then fail on sdf.write. Consider guarding this call.
save_json(sdf)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment