// backfill_mediawiki_revision_score.scala
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
// We need HiveExtensions to reorder the converted schema to match the new Hive table schema.
// It is also used to drop columns that aren't in the new Hive table schema.
import org.wikimedia.analytics.refinery.spark.sql.HiveExtensions._
// Get the desired field schemas from the new table
val mediawiki_revision_score_2 = spark.table("event.mediawiki_revision_score")
val scoreMapFieldSchema = mediawiki_revision_score_2.schema("scores").dataType
val errorMapFieldSchema = mediawiki_revision_score_2.schema("errors").dataType
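// Quick sanity check (a sketch, not part of the original backfill): print the
// target map types so we can see the shape the UDFs below must produce.
// From the conversion code, scores goes from an array of score structs to a
// map of model_name -> score struct, with probability itself becoming a map.
println(scoreMapFieldSchema.simpleString)
println(errorMapFieldSchema.simpleString)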
// Function to convert the scores array to a scores map
val scoreArrayRowStructToMap = (scoresArray: Seq[Row]) => {
    // Return an Option to avoid NullPointerExceptions if values are null
    if (scoresArray == null || scoresArray.isEmpty) {
        None
    }
    else {
        // Convert the array of score structs to an array of scores with probability maps
        val scoresWithMapProbability = scoresArray.map(scoreWithStructProbability => {
            val model_name = scoreWithStructProbability.getString(0)
            val model_version = scoreWithStructProbability.getString(1)
            val prediction = scoreWithStructProbability.getSeq[String](2)
            val probabilityMap = scoreWithStructProbability.getSeq[Row](3).map(p => p.getString(0) -> p.getDouble(1)).toMap
            Row(model_name, model_version, prediction, probabilityMap)
        })
        // Convert the array of score structs (now with probability maps) to
        // a map of model_name -> score struct
        Some(scoresWithMapProbability.map(r => r.getString(0) -> r).toMap)
    }
}
// Make a UDF, using the target scores map type as its return type
val scoreArrayRowStructToMapUdf = udf(scoreArrayRowStructToMap, scoreMapFieldSchema)
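// Spot check (a sketch, not in the original gist): apply the UDF to a few rows
// of the old array-based table and eyeball the resulting map values. This
// assumes the old table otto.mediawiki_revision_score_1, which is also loaded
// further below for the actual backfill.
spark.table("otto.mediawiki_revision_score_1")
    .select(scoreArrayRowStructToMapUdf(col("scores")).as("scores"))
    .show(3, false)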
// Function to convert the errors array to an errors map
val errorArrayRowStructToMap = (errorsArray: Seq[Row]) => {
    if (errorsArray == null || errorsArray.isEmpty) {
        None
    }
    else {
        Some(errorsArray.map(errorStruct => errorStruct.getString(0) -> errorStruct).toMap)
    }
}
val errorArrayRowStructToMapUdf = udf(errorArrayRowStructToMap, errorMapFieldSchema)
def convertRevisionScore1to2(revScore1Df: DataFrame) = {
    // Use the UDFs to add the new map columns, drop the old array ones,
    // and rename the map columns to the original names.
    revScore1Df
        .withColumn("scores_map", scoreArrayRowStructToMapUdf(col("scores")))
        .drop("scores")
        .withColumnRenamed("scores_map", "scores")
        .withColumn("errors_map", errorArrayRowStructToMapUdf(col("errors")))
        .drop("errors")
        .withColumnRenamed("errors_map", "errors")
        // Now all of the fields should be the same; we just need the field order to match.
        // Good thing we have HiveExtensions' convertToSchema!
        // Note: this also drops 2 unused columns: meta.schema_uri and meta.topic
        .convertToSchema(mediawiki_revision_score_2.schema)
}
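// Sanity check (a sketch, not in the original gist): after convertToSchema the
// field names, types, and order should line up with the new table. Eyeball the
// two tree strings; exact StructType equality can differ on nullability flags.
println(convertRevisionScore1to2(spark.table("otto.mediawiki_revision_score_1")).schema.treeString)
println(mediawiki_revision_score_2.schema.treeString)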
def convertAndWriteRevisionScore1to2(revScore1Df: DataFrame, outputBasePath: String) = {
    // I can't seem to insert this DataFrame directly into event.mediawiki_revision_score.
    // I get ParquetEncodingException: empty fields are illegal, the field should be ommited completely instead.
    // I've tried modifying the UDF functions above to return Options for any map type inside (like probability),
    // but it doesn't help. Without the Options, I get NullPointerExceptions.
    // So! We write this to its own NEW location, and will switch to Hive directly
    // to insert into event.mediawiki_revision_score.
    convertRevisionScore1to2(revScore1Df)
        .write
        .partitionBy("datacenter", "year", "month", "day", "hour")
        .mode("append")
        .parquet(outputBasePath)
}
val months = Seq(
    ("2018", "12"),
    ("2019", "1"),
    ("2019", "2"),
    ("2019", "3"),
    ("2019", "4"),
    ("2019", "5"),
    ("2019", "6"),
    ("2019", "7"),
    ("2019", "8"),
    ("2019", "9")
)
val mediawiki_revision_score_1 = spark.table("otto.mediawiki_revision_score_1")
months.foreach({ case (year, month) => {
    println(s"------ BEGIN Transforming ${year} ${month} day < 15")
    convertAndWriteRevisionScore1to2(
        mediawiki_revision_score_1.where(s"year=${year} and month=${month} and day < 15"),
        "/user/otto/mediawiki_revision_score_1_backfill/backfill0"
    )
    println(s"------ DONE Transforming ${year} ${month} day < 15\n\n\n")

    println(s"------ BEGIN Transforming ${year} ${month} day >= 15")
    convertAndWriteRevisionScore1to2(
        mediawiki_revision_score_1.where(s"year=${year} and month=${month} and day >= 15"),
        "/user/otto/mediawiki_revision_score_1_backfill/backfill0"
    )
    println(s"------ DONE Transforming ${year} ${month} day >= 15\n\n\n")
}})
// --- TODO ---
// Move data dirs out of /user/otto/mediawiki_revision_score_1_backfill/backfill0 into event/mediawiki_revision_score
// MSCK REPAIR TABLE event.mediawiki_revision_score
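// A sketch of the TODO above, not part of the original gist. The target
// directory below is a HYPOTHETICAL placeholder for wherever the
// event.mediawiki_revision_score data actually lives in HDFS; check the
// table's LOCATION first (e.g. DESCRIBE FORMATTED event.mediawiki_revision_score).
import org.apache.hadoop.fs.{FileSystem, Path}
val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
val backfillBase = new Path("/user/otto/mediawiki_revision_score_1_backfill/backfill0")
val tableBase = new Path("/path/to/event/mediawiki_revision_score") // HYPOTHETICAL: use the real table LOCATION
// Move each top-level partition dir (datacenter=...) under the table location.
// Note: rename() fails if the destination already exists, so this only works
// cleanly for partition dirs not already present in the table location.
fs.listStatus(backfillBase).foreach { status =>
    fs.rename(status.getPath, new Path(tableBase, status.getPath.getName))
}
// Register the newly added partitions with the Hive metastore.
spark.sql("MSCK REPAIR TABLE event.mediawiki_revision_score")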