Created April 18, 2024 at 00:42
-
-
Save zhouyuan/851b3dd0d1e53a0a13a87e16019b85df to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Spark-shell scratch script for TPC-DS (100 GB, Parquet) ETL / aggregation benchmarks.
// Intended to be pasted into (or loaded by) an interactive spark-shell session, so it
// relies on the shell-provided `spark` (SparkSession) and `sc` (SparkContext) bindings.
// NOTE(review): the original paste carried trailing " | |" table artifacts on every
// line, which made the script invalid Scala; they are removed here.

// Quiet the shell: only warnings and errors from Spark.
sc.setLogLevel("WARN")

// All queries below run against the 100-scale-factor TPC-DS Parquet database.
spark.sql("use parquet_t_tpcds_100;")

// Disable broadcast joins entirely (-1) so every join exercises the
// shuffle/sort-merge path — the code path under benchmark.
spark.sql("set spark.sql.autoBroadcastJoinThreshold=-1;")

//spark.sql("set spark.io.compression.codec=zstd")

// --- Retained experiment variants (commented out; uncomment one at a time) ---
// Outer-join ETL writes: parquet+zstd output, partitioned output, and a "noop"
// sink that exercises the full pipeline but discards the result.
//spark.sql(" select * from store_sales left outer join customer_demographics on ss_cdemo_sk = cd_demo_sk and cd_demo_sk <= 1920800 ;").write.option("parquet.compression","zstd").mode("overwrite").format("parquet").save("ETL/newparquet_zstd")
//spark.sql(" select * from store_sales left outer join customer_demographics on ss_cdemo_sk = cd_demo_sk and cd_demo_sk <= 1920800 ;").write.option("parquet.compression","zstd").mode("overwrite").format("parquet").partitionBy("ss_sold_date_sk").save("ETL/newparquet_zstd")
//spark.sql(" select * from store_sales left outer join customer_demographics on ss_cdemo_sk = cd_demo_sk and cd_demo_sk <= 1920800 ;").write.option("parquet.compression","zstd").mode("overwrite").format("noop").save("ETL/newparquet_zstd")
//spark.sql(" select cast (null as string) AS spam_domain_label, * from store_sales left outer join customer_demographics on ss_cdemo_sk = cd_demo_sk and cd_demo_sk <= 1920800 ;").show
//spark.sql(" select COALESCE(ss_sold_time_sk) from store_sales left outer join customer_demographics on ss_cdemo_sk = cd_demo_sk and cd_demo_sk <= 1920800 ;").show
//spark.sql("insert overwrite local directory '/mnt/nvme1/tmp/etl' USING orc select * from store_sales left outer join customer_demographics on ss_cdemo_sk = cd_demo_sk and cd_demo_sk <= 1920800 ;").show
//spark.sql("insert overwrite local directory '/mnt/nvme1/tmp/etl' STORED AS TEXTFILE select * from customer_demographics;").show
//spark.sql("CREATE TABLE students (name VARCHAR(64), address VARCHAR(64)) USING PARQUET ;").show
//spark.sql("INSERT INTO students VALUES ('Amy Smith', '123 Park Ave, San Jose');").show
//spark.sql(" select * from store_sales inner join customer_demographics on ss_cdemo_sk = cd_demo_sk and cd_demo_sk between 1920800 and 1930800 ;").show
//spark.sql("select ss_item_sk, ss_ticket_number, count(*) rowcount from store_sales where ss_sold_date_sk between 2450816 and 2451500 group by ss_item_sk , ss_ticket_number;").write.option("parquet.compression","zstd").mode("overwrite").format("noop").save("ETL/newparquet_zstd")

// spill test suite
//spark.sql("select * from store_sales order by ss_cdemo_sk;").write.option("parquet.compression","zstd").mode("overwrite").format("noop").save("ETL/newparquet_zstd")

// Active benchmark: distinct-count aggregation over the fact table.
// The "noop" format runs the whole job (scan + shuffle + aggregate) but writes
// nothing, so the measurement excludes output I/O. The parquet.compression
// option is inert with a noop sink; kept to match the other variants.
spark.sql("select count(distinct ss_item_sk) from store_sales ;").write.option("parquet.compression","zstd").mode("overwrite").format("noop").save("ETL/newparquet_zstd")

//spark.sql("select avg(ss_item_sk) from store_sales ;").write.option("parquet.compression","zstd").mode("overwrite").format("noop").save("ETL/newparquet_zstd")
//spark.sql(" select count(*) from store_sales a ,store_returns b where a.ss_item_sk = b.sr_item_sk and a.ss_ticket_number = b.sr_ticket_number and ss_sold_date_sk between 2450816 and 2451500 and sr_returned_date_sk between 2450816 and 2451500 ;").write.option("parquet.compression","zstd").mode("overwrite").format("parquet").save("ETL/newparquet_zstd")
//spark.sql(" select count(*) from store_sales a ,store_returns b where a.ss_item_sk = b.sr_item_sk and a.ss_ticket_number = b.sr_ticket_number ;").write.option("parquet.compression","zstd").mode("overwrite").format("parquet").save("ETL/newparquet_zstd")
//spark.sql(" select distinct ss_item_sk from store_sales;").write.option("parquet.compression","zstd").mode("overwrite").format("parquet").save("ETL/newparquet_zstd")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.