Skip to content

Instantly share code, notes, and snippets.

@zhouyuan
Created April 18, 2024 00:43
Show Gist options
  • Save zhouyuan/a1cf4b43b94c0aa2e32f5e80d409a0b0 to your computer and use it in GitHub Desktop.
// Spark-shell ETL benchmark: left-outer-joins TPC-DS store_sales with
// customer_demographics and drives the result through a write sink.
// Run inside spark-shell — relies on the session-provided `spark` and `sc`.
sc.setLogLevel("WARN")

// Select the TPC-DS 100 parquet database and use zstd for shuffle/IO compression.
spark.sql("use parquet_t_tpcds_100;")
spark.sql("set spark.io.compression.codec=zstd")

// The benchmark query: keep every store_sales row, attach demographics
// only when cd_demo_sk matches and falls at or below 1920800.
val joined = spark.sql(" select * from store_sales left outer join customer_demographics on ss_cdemo_sk = cd_demo_sk and cd_demo_sk <= 1920800 ;")

// "noop" sink executes the full plan but discards the rows, so this times the
// query + write path without producing files. NOTE(review): the compression
// option and save path are ignored by noop — they matter only for the parquet
// variants kept below.
joined
  .write
  .option("parquet.compression", "zstd")
  .mode("overwrite")
  .format("noop")
  .save("ETL/newparquet_zstd")

// ---- alternative runs kept for reference (uncomment one at a time) --------
// Materialize to parquet (flat, then partitioned by sold date):
//spark.sql(" select * from store_sales left outer join customer_demographics on ss_cdemo_sk = cd_demo_sk and cd_demo_sk <= 1920800 ;").write.option("parquet.compression","zstd").mode("overwrite").format("parquet").save("ETL/newparquet_zstd")
//spark.sql(" select * from store_sales left outer join customer_demographics on ss_cdemo_sk = cd_demo_sk and cd_demo_sk <= 1920800 ;").write.option("parquet.compression","zstd").mode("overwrite").format("parquet").partitionBy("ss_sold_date_sk").save("ETL/newparquet_zstd")
// Projection / expression variants of the same join:
//spark.sql(" select cast (null as string) AS spam_domain_label, * from store_sales left outer join customer_demographics on ss_cdemo_sk = cd_demo_sk and cd_demo_sk <= 1920800 ;").show
//spark.sql(" select COALESCE(ss_sold_time_sk) from store_sales left outer join customer_demographics on ss_cdemo_sk = cd_demo_sk and cd_demo_sk <= 1920800 ;").show
// INSERT OVERWRITE DIRECTORY variants (orc / textfile):
//spark.sql("insert overwrite local directory '/mnt/nvme1/tmp/etl' USING orc select * from store_sales left outer join customer_demographics on ss_cdemo_sk = cd_demo_sk and cd_demo_sk <= 1920800 ;").show
//spark.sql("insert overwrite local directory '/mnt/nvme1/tmp/etl' STORED AS TEXTFILE select * from customer_demographics;").show
// Small sanity-check table and an inner-join sample:
//spark.sql("CREATE TABLE students (name VARCHAR(64), address VARCHAR(64)) USING PARQUET ;").show
//spark.sql("INSERT INTO students VALUES ('Amy Smith', '123 Park Ave, San Jose');").show
//spark.sql(" select * from store_sales inner join customer_demographics on ss_cdemo_sk = cd_demo_sk and cd_demo_sk between 1920800 and 1930800 ;").show
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment