Spark Adaptive Query Execution - Performance Optimization using PySpark
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
# COMMAND ----------
# Create a Spark session and explicitly disable Adaptive Query Execution (AQE)
# so the skewed join below runs with the default static shuffle plan.
spark_dis_AQE = SparkSession.builder.appName("Data_skewness_disabled_AQE").getOrCreate()
spark_dis_AQE.conf.set("spark.sql.adaptive.enabled", False)
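
# Optional sanity check (added sketch, not part of the original gist): confirm that
# the AQE flag is really off for this session before building the skewed DataFrames.
print(spark_dis_AQE.conf.get("spark.sql.adaptive.enabled"))  # prints 'false' when disabled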
# COMMAND ----------
# df1: heavily skewed DataFrame -- 1000 rows with id = 'x',
# plus 4 rows each with id = 'y' and id = 'z'.
df1 = spark_dis_AQE.range(1000) \
    .withColumn("id", lit("x"))
extravalues = spark_dis_AQE.range(4) \
    .withColumn("id", lit("y"))
moreextravalues = spark_dis_AQE.range(4) \
    .withColumn("id", lit("z"))
df1 = df1.union(extravalues).union(moreextravalues)

# df2: same skewed shape -- 1000 rows with id = 'x',
# plus a single row each with id = 'y' and id = 'z'.
df2 = spark_dis_AQE.range(1000) \
    .withColumn("id", lit("x"))
extravalues1 = spark_dis_AQE.range(1) \
    .withColumn("id", lit("y"))
moreextravalues1 = spark_dis_AQE.range(1) \
    .withColumn("id", lit("z"))
df2 = df2.union(extravalues1).union(moreextravalues1)
# COMMAND ----------
# Join on the skewed key and check how many shuffle partitions the result uses.
# With AQE disabled this stays at the static spark.sql.shuffle.partitions value.
f_df1 = df1.join(df2, 'id').select("id").distinct()
f_df1.show(5)
f_df1.rdd.getNumPartitions()
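# COMMAND ----------
# Comparison sketch (an addition, not part of the original gist): re-enable AQE on the
# same session and rerun the identical skewed join. With spark.sql.adaptive.enabled,
# spark.sql.adaptive.coalescePartitions.enabled and spark.sql.adaptive.skewJoin.enabled
# turned on, AQE can coalesce post-shuffle partitions and split skewed ones at runtime,
# so getNumPartitions() is typically much smaller than the static value observed above.
# Exact numbers depend on data size and cluster defaults.
spark_dis_AQE.conf.set("spark.sql.adaptive.enabled", True)
spark_dis_AQE.conf.set("spark.sql.adaptive.coalescePartitions.enabled", True)
spark_dis_AQE.conf.set("spark.sql.adaptive.skewJoin.enabled", True)

f_df2 = df1.join(df2, 'id').select("id").distinct()
f_df2.show(5)
f_df2.rdd.getNumPartitions()  # compare with the AQE-disabled partition count above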