// spark-shell snippet by @zhouyuan, created April 18, 2024 00:42
sc.setLogLevel("WARN") // keep shell output to warnings and errors
spark.sql("use parquet_t_tpcds_100;") // TPC-DS database in parquet, presumably scale factor 100
spark.sql("set spark.sql.autoBroadcastJoinThreshold=-1;") // disable broadcast joins so every join below goes through a shuffle
//spark.sql("set spark.io.compression.codec=zstd")
//spark.sql(" select * from store_sales left outer join customer_demographics on ss_cdemo_sk = cd_demo_sk and cd_demo_sk <= 1920800 ;").write.option("parquet.compression","zstd").mode("overwrite").format("parquet").save("ETL/newparquet_zstd")
//spark.sql(" select * from store_sales left outer join customer_demographics on ss_cdemo_sk = cd_demo_sk and cd_demo_sk <= 1920800 ;").write.option("parquet.compression","zstd").mode("overwrite").format("parquet").partitionBy("ss_sold_date_sk").save("ETL/newparquet_zstd")
//spark.sql(" select * from store_sales left outer join customer_demographics on ss_cdemo_sk = cd_demo_sk and cd_demo_sk <= 1920800 ;").write.option("parquet.compression","zstd").mode("overwrite").format("noop").save("ETL/newparquet_zstd")
//spark.sql(" select cast (null as string) AS spam_domain_label, * from store_sales left outer join customer_demographics on ss_cdemo_sk = cd_demo_sk and cd_demo_sk <= 1920800 ;").show
//spark.sql(" select COALESCE(ss_sold_time_sk) from store_sales left outer join customer_demographics on ss_cdemo_sk = cd_demo_sk and cd_demo_sk <= 1920800 ;").show
//spark.sql("insert overwrite local directory '/mnt/nvme1/tmp/etl' USING orc select * from store_sales left outer join customer_demographics on ss_cdemo_sk = cd_demo_sk and cd_demo_sk <= 1920800 ;").show
//spark.sql("insert overwrite local directory '/mnt/nvme1/tmp/etl' STORED AS TEXTFILE select * from customer_demographics;").show
//spark.sql("CREATE TABLE students (name VARCHAR(64), address VARCHAR(64)) USING PARQUET ;").show
//spark.sql("INSERT INTO students VALUES ('Amy Smith', '123 Park Ave, San Jose');").show
//spark.sql(" select * from store_sales inner join customer_demographics on ss_cdemo_sk = cd_demo_sk and cd_demo_sk between 1920800 and 1930800 ;").show
//spark.sql("select ss_item_sk, ss_ticket_number, count(*) rowcount from store_sales where ss_sold_date_sk between 2450816 and 2451500 group by ss_item_sk , ss_ticket_number;").write.option("parquet.compression","zstd").mode("overwrite").format("noop").save("ETL/newparquet_zstd")
// spill test suite: full-table sort, count-distinct/avg aggregates, and store_sales x store_returns joins
//spark.sql("select * from store_sales order by ss_cdemo_sk;").write.option("parquet.compression","zstd").mode("overwrite").format("noop").save("ETL/newparquet_zstd")
spark.sql("select count(distinct ss_item_sk) from store_sales ;").write.option("parquet.compression","zstd").mode("overwrite").format("noop").save("ETL/newparquet_zstd")
//spark.sql("select avg(ss_item_sk) from store_sales ;").write.option("parquet.compression","zstd").mode("overwrite").format("noop").save("ETL/newparquet_zstd")
//spark.sql(" select count(*) from store_sales a ,store_returns b where a.ss_item_sk = b.sr_item_sk and a.ss_ticket_number = b.sr_ticket_number and ss_sold_date_sk between 2450816 and 2451500 and sr_returned_date_sk between 2450816 and 2451500 ;").write.option("parquet.compression","zstd").mode("overwrite").format("parquet").save("ETL/newparquet_zstd")
//spark.sql(" select count(*) from store_sales a ,store_returns b where a.ss_item_sk = b.sr_item_sk and a.ss_ticket_number = b.sr_ticket_number ;").write.option("parquet.compression","zstd").mode("overwrite").format("parquet").save("ETL/newparquet_zstd")
//spark.sql(" select distinct ss_item_sk from store_sales;").write.option("parquet.compression","zstd").mode("overwrite").format("parquet").save("ETL/newparquet_zstd")