sc.setLogLevel("WARN")
spark.sql("use parquet_t_tpcds_100;")
spark.sql("set spark.io.compression.codec=zstd")
//spark.sql(" select * from store_sales left outer join customer_demographics on ss_cdemo_sk = cd_demo_sk and cd_demo_sk <= 1920800 ;").write.option("parquet.compression","zstd").mode("overwrite").format("parquet").save("ETL/newparquet_zstd")
//spark.sql(" select * from store_sales left outer join customer_demographics on ss_cdemo_sk = cd_demo_sk and cd_demo_sk <= 1920800 ;").write.option("parquet.compression","zstd").mode("overwrite").format("parquet").partitionBy("ss_sold_date_sk").save("ETL/newparquet_zstd")
spark.sql(" select * from store_sales left outer join customer_demographics on ss_cdemo_sk = cd_demo_sk and cd_demo_sk <= 1920800 ;").write.option("parquet.compression","zstd").mode("overwrite").format("noop").save("ETL/newparquet_zstd")
//spark.sql(" select cast (null as string) AS spam_domain_label, * from store_sales left outer join customer_demographics on ss_cdemo_sk = cd_demo_sk and c
sc.setLogLevel("WARN")
spark.sql("use parquet_t_tpcds_100;")
spark.sql("set spark.sql.autoBroadcastJoinThreshold=-1;")
//spark.sql("set spark.io.compression.codec=zstd")
//spark.sql(" select * from store_sales left outer join customer_demographics on ss_cdemo_sk = cd_demo_sk and cd_demo_sk <= 1920800 ;").write.option("parquet.compression","zstd").mode("overwrite").format("parquet").save("ETL/newparquet_zstd")
//spark.sql(" select * from store_sales left outer join customer_demographics on ss_cdemo_sk = cd_demo_sk and cd_demo_sk <= 1920800 ;").write.option("parquet.compression","zstd").mode("overwrite").format("parquet").partitionBy("ss_sold_date_sk").save("ETL/newparquet_zstd")
//spark.sql(" select * from store_sales left outer join customer_demographics on ss_cdemo_sk = cd_demo_sk and cd_demo_sk <= 1920800 ;").write.option("parquet.compression","zstd").mode("overwrite").format("noop").save("ETL/newparquet_zstd")
//spark.sql(" select cast (null as string) AS spam_domain_label, * from store_sales left outer
diff --git a/Dockerfile.cpu b/Dockerfile.cpu
index 4251fdd..d9344a0 100644
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@@ -6,6 +6,8 @@ RUN apt-get update -y \
&& apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+ENV https_proxy="http://proxy-shz.intel.com:911"
+ENV http_proxy="http://proxy-shz.intel.com:911"
#include <string>
#include <unordered_set>
#include <algorithm>
#include <cctype>
class UDFNormalizeString {
public:
static const std::string DEFAULT_VALUE;
static const std::unordered_set<std::string> DEFAULT_NULL_VALUES;
@Description(
name = "norm_str",
value = "_FUNC_(input, [defaultValue], [dirtyValues ...]) trims input and " +
"normalize null, empty or dirtyValues to defVal. \n",
extended = "preset defaultValue is 'N-A' and preset dirtyValues are {'null', 'unknown', 'unknow', 'N-A'},\n" +
"the third NULL argument will clear the preset dirtyValues list."
)
public class UDFNormalizeString extends GenericUDF {
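A minimal usage sketch for the UDF above (illustrative only; it assumes the compiled jar is on the classpath, Hive support is enabled, and the class is registered by the unqualified name shown here):
spark.sql("CREATE TEMPORARY FUNCTION norm_str AS 'UDFNormalizeString'")
spark.sql("SELECT norm_str(' unknown ')").show()                                        // -> 'N-A': trimmed value matches a preset dirty value
spark.sql("SELECT norm_str(c_email_address, 'missing') FROM customer LIMIT 5").show()  // custom defaultValue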
diff -urN /mnt/nvme1/git/spark/sql/core/src/test/java/org/apache/spark/sql/api/java/UDF23Test.java sql/core/src/test/java/org/apache/spark/sql/api/java/UDF23Test.java
--- /mnt/nvme1/git/spark/sql/core/src/test/java/org/apache/spark/sql/api/java/UDF23Test.java 1970-01-01 08:00:00.000000000 +0800
+++ sql/core/src/test/java/org/apache/spark/sql/api/java/UDF23Test.java 2023-09-18 15:00:42.998708419 +0800
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
diff --git a/gluten-core/src/main/scala/io/glutenproject/extension/StrategyOverrides.scala b/gluten-core/src/main/scala/io/glutenproject/extension/StrategyOverrides.scala
index 2898f8a7..afbc8d7f 100644
--- a/gluten-core/src/main/scala/io/glutenproject/extension/StrategyOverrides.scala
+++ b/gluten-core/src/main/scala/io/glutenproject/extension/StrategyOverrides.scala
@@ -85,7 +85,10 @@ case class JoinSelectionOverrides(session: SparkSession)
planLater(left),
planLater(right)))
}
-
+ logInfo("===DEBUG===")
import org.apache.spark.sql.execution.debug._
import scala.io.Source
import java.io.File
import java.util.Arrays
def time[R](block: => R): R = {
val t0 = System.nanoTime()
val result = block // call-by-name
val t1 = System.nanoTime()
println("Elapsed time: " + (t1 - t0)/1000000 + "ms")
#include <algorithm>
#include <boost/container/map.hpp>
#include <chrono>
#include <iostream>
#include <map>
#include <random>
#include <unordered_map>
#include <vector>
#include "parallel_hashmap/btree.h"
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ApproxCountDistinctForIntervalsQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ApproxCountDistinctForIntervalsQuerySuite.scala
index 171e93c1bf..53662c6560 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/ApproxCountDistinctForIntervalsQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ApproxCountDistinctForIntervalsQuerySuite.scala
@@ -17,6 +17,8 @@
package org.apache.spark.sql
+import java.time.{Duration, Period}
+