# clone the repo
git clone https://github.com/apache/incubator-gluten && cd incubator-gluten
docker run -it -v $PWD:/opt/gluten -v /root/.ccache:/root/.ccache apache/gluten:vcpkg-centos-7
# execute inside docker
cd /opt/gluten
bash dev/ci-velox-buildstatic-centos-7.sh
# package for spark-3.5
mvn clean install -Pbackends-velox -Pspark-3.5 -DskipTests
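Once the jars are built, Gluten is enabled purely through Spark conf. A minimal sketch of wiring it into a session, assuming the gluten-velox bundle jar from the -Pbackends-velox -Pspark-3.5 build above is on the classpath (config keys follow Gluten's docs for that generation; double-check against the release you built):

import org.apache.spark.sql.SparkSession

// assumption: the gluten-velox-bundle jar is already on the driver/executor classpath
val spark = SparkSession.builder()
  .master("local[4]")
  .config("spark.plugins", "io.glutenproject.GlutenPlugin")
  .config("spark.memory.offHeap.enabled", "true") // Velox runs on off-heap memory
  .config("spark.memory.offHeap.size", "4g")
  .config("spark.shuffle.manager",
    "org.apache.spark.shuffle.sort.ColumnarShuffleManager")
  .getOrCreate()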
#!/bin/bash
set -x
export https_proxy="http://child-prc.intel.com:913"
export http_proxy="http://child-prc.intel.com:913"
yesterday=$(env TZ=Asia/Shanghai date -d '-1 day' '+%Y_%m_%d')
today=$(env TZ=Asia/Shanghai date '+%Y_%m_%d')
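For reference, the same yesterday/today stamps computed on the JVM side; just a sketch using java.time from the standard library:

import java.time.{LocalDate, ZoneId}
import java.time.format.DateTimeFormatter

val fmt = DateTimeFormatter.ofPattern("yyyy_MM_dd")
val shanghai = ZoneId.of("Asia/Shanghai")
val today = LocalDate.now(shanghai).format(fmt)                  // e.g. 2023_09_18
val yesterday = LocalDate.now(shanghai).minusDays(1).format(fmt) // e.g. 2023_09_17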
sc.setLogLevel("WARN")
spark.sql("use parquet_t_tpcds_100;")
// zstd for Spark's internal blocks (shuffle spills, broadcast)
spark.sql("set spark.io.compression.codec=zstd")
// parquet output variants, flat and partitioned by ss_sold_date_sk:
//spark.sql(" select * from store_sales left outer join customer_demographics on ss_cdemo_sk = cd_demo_sk and cd_demo_sk <= 1920800 ;").write.option("parquet.compression","zstd").mode("overwrite").format("parquet").save("ETL/newparquet_zstd")
//spark.sql(" select * from store_sales left outer join customer_demographics on ss_cdemo_sk = cd_demo_sk and cd_demo_sk <= 1920800 ;").write.option("parquet.compression","zstd").mode("overwrite").format("parquet").partitionBy("ss_sold_date_sk").save("ETL/newparquet_zstd")
// the "noop" sink discards all rows, so this times the scan + join without any file I/O (the parquet option and path are ignored)
spark.sql(" select * from store_sales left outer join customer_demographics on ss_cdemo_sk = cd_demo_sk and cd_demo_sk <= 1920800 ;").write.option("parquet.compression","zstd").mode("overwrite").format("noop").save("ETL/newparquet_zstd")
//spark.sql(" select cast (null as string) AS spam_domain_label, * from store_sales left outer join customer_demographics on ss_cdemo_sk = cd_demo_sk and c
// same join again, with auto-broadcast disabled so the planner falls back to a shuffle-based join
sc.setLogLevel("WARN")
spark.sql("use parquet_t_tpcds_100;")
// -1 disables broadcast joins regardless of table size
spark.sql("set spark.sql.autoBroadcastJoinThreshold=-1;")
//spark.sql("set spark.io.compression.codec=zstd")
//spark.sql(" select * from store_sales left outer join customer_demographics on ss_cdemo_sk = cd_demo_sk and cd_demo_sk <= 1920800 ;").write.option("parquet.compression","zstd").mode("overwrite").format("parquet").save("ETL/newparquet_zstd")
//spark.sql(" select * from store_sales left outer join customer_demographics on ss_cdemo_sk = cd_demo_sk and cd_demo_sk <= 1920800 ;").write.option("parquet.compression","zstd").mode("overwrite").format("parquet").partitionBy("ss_sold_date_sk").save("ETL/newparquet_zstd")
//spark.sql(" select * from store_sales left outer join customer_demographics on ss_cdemo_sk = cd_demo_sk and cd_demo_sk <= 1920800 ;").write.option("parquet.compression","zstd").mode("overwrite").format("noop").save("ETL/newparquet_zstd")
//spark.sql(" select cast (null as string) AS spam_domain_label, * from store_sales left outer
diff --git a/Dockerfile.cpu b/Dockerfile.cpu
index 4251fdd..d9344a0 100644
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@@ -6,6 +6,8 @@ RUN apt-get update -y \
&& apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+ENV https_proxy="http://proxy-shz.intel.com:911"
+ENV http_proxy="http://proxy-shz.intel.com:911"
#include <string>
#include <unordered_set>
#include <algorithm>
#include <cctype>
class UDFNormalizeString {
public:
static const std::string DEFAULT_VALUE;
static const std::unordered_set<std::string> DEFAULT_NULL_VALUES;
@Description(
    name = "norm_str",
    value = "_FUNC_(input, [defaultValue], [dirtyValues ...]) trims input and " +
        "normalizes null, empty or dirty values to defaultValue.\n",
    extended = "The preset defaultValue is 'N-A' and the preset dirtyValues are {'null', 'unknown', 'unknow', 'N-A'};\n" +
        "passing NULL as the third argument clears the preset dirtyValues list."
)
public class UDFNormalizeString extends GenericUDF {
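The semantics described by that annotation, sketched in Scala for illustration (presets taken from the extended doc above; the real UDF's trimming and case rules may differ):

val presetDirty = Set("null", "unknown", "unknow", "N-A")

def normStr(input: String,
            defaultValue: String = "N-A",
            dirtyValues: Set[String] = presetDirty): String = {
  // null, empty and dirty inputs all collapse to the default value
  val trimmed = Option(input).map(_.trim).getOrElse("")
  if (trimmed.isEmpty || dirtyValues.contains(trimmed)) defaultValue else trimmed
}

normStr("  unknown ") // "N-A"
normStr(" foo ")      // "foo"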
diff -urN /mnt/nvme1/git/spark/sql/core/src/test/java/org/apache/spark/sql/api/java/UDF23Test.java sql/core/src/test/java/org/apache/spark/sql/api/java/UDF23Test.java
--- /mnt/nvme1/git/spark/sql/core/src/test/java/org/apache/spark/sql/api/java/UDF23Test.java 1970-01-01 08:00:00.000000000 +0800
+++ sql/core/src/test/java/org/apache/spark/sql/api/java/UDF23Test.java 2023-09-18 15:00:42.998708419 +0800
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
diff --git a/gluten-core/src/main/scala/io/glutenproject/extension/StrategyOverrides.scala b/gluten-core/src/main/scala/io/glutenproject/extension/StrategyOverrides.scala
index 2898f8a7..afbc8d7f 100644
--- a/gluten-core/src/main/scala/io/glutenproject/extension/StrategyOverrides.scala
+++ b/gluten-core/src/main/scala/io/glutenproject/extension/StrategyOverrides.scala
@@ -85,7 +85,10 @@ case class JoinSelectionOverrides(session: SparkSession)
planLater(left),
planLater(right)))
}
-
+ logInfo("===DEBUG===")
import org.apache.spark.sql.execution.debug._
import scala.io.Source
import java.io.File
import java.util.Arrays
// simple wall-clock timer for spark-shell experiments
def time[R](block: => R): R = {
  val t0 = System.nanoTime()
  val result = block // call-by-name
  val t1 = System.nanoTime()
  println("Elapsed time: " + (t1 - t0) / 1000000 + "ms")
  result // return the block's value so the timer is transparent
}