aialenti/file.scala

## file.scala
// Disable automatic broadcast joins (threshold = -1). dimension_table2 is
// very small, so the author's configuration would otherwise broadcast it.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

// Cache the dimension table and immediately run an action on it; the
// author's stated goal is a simpler DAG for the join below.
dimension_table2.cache()
dimension_table2.count()

// Spread the fact table over 400 partitions before joining.
fact_table = fact_table.repartition(400)

// Left join: fact_table.dimension_2_key against dimension_table2.id, with
// the dimension side also repartitioned to 400 partitions. The helper vals
// are scoped to this block so no extra top-level names are introduced.
fact_table = {
  val dimensionSide = dimension_table2.repartition(400)
  val keysMatch = fact_table.col("dimension_2_key") === dimension_table2.col("id")
  fact_table.join(dimensionSide, keysMatch, "left")
}

// Run an action on the joined result.
fact_table.count()
	// Turn off automatic broadcast joins by setting the threshold to -1:
	// dimension_table2 is very small and the author's configuration would
	// otherwise broadcast it.
	spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

	// Cache the dimension table, then count it right away; per the author,
	// the caching is there to simplify the DAG.
	dimension_table2.cache()
	dimension_table2.count()

	// Repartition the fact table into 400 partitions ahead of the join.
	fact_table = fact_table.repartition(400)

	// Left join on dimension_2_key === id against the dimension table, which
	// is repartitioned to 400 partitions as well. Intermediate values live
	// inside the block expression so they do not leak into the outer scope.
	fact_table = {
		val repartitionedDimension = dimension_table2.repartition(400)
		val joinCondition = fact_table.col("dimension_2_key") === dimension_table2.col("id")
		fact_table.join(repartitionedDimension, joinCondition, "left")
	}

	// Run an action on the joined result.
	fact_table.count()