Skip to content

Instantly share code, notes, and snippets.

@relud
relud / test_repartition.scala
Last active January 23, 2018 22:55 — forked from mreid-moz/test_repartition.scala
Test repartitioning behaviour when writing parquet data.
import java.util.UUID.randomUUID
import scala.sys.process._
import java.util.zip.CRC32
import com.mozilla.telemetry.utils.getOrCreateSparkSession
// Obtain the shared SparkSession for this test run; "test" is the application name.
// NOTE(review): getOrCreateSparkSession is project-local (com.mozilla.telemetry.utils,
// imported above) — presumably returns an existing session or builds one; confirm there.
val spark = getOrCreateSparkSession("test")
// Suppress INFO-level Spark logging so test output stays readable.
spark.sparkContext.setLogLevel("WARN")
// Bring this session's implicit conversions (e.g. Dataset encoders, $"col" syntax)
// into scope for the code that follows.
import spark.implicits._
def getPartitionId(clientId: String, sampleId: Int, filesPerPartition: Int) = {
-- Query generated by: sql/clients_daily_scalar_aggregates.sql.py
CREATE TEMP FUNCTION
udf_aggregate_map_sum(maps ANY TYPE) AS (STRUCT(ARRAY(
SELECT
AS STRUCT key,
SUM(value) AS value
FROM
UNNEST(maps),
UNNEST(key_value)
GROUP BY