Spark: create Hive external table with partitions (from a partitioned Parquet file in HDFS)
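The snippet below assumes the Parquet data was written with Spark's partitionBy writer, so each partition sits in its own field=value subdirectory. A minimal sketch of producing such a layout (the DataFrame df, the column name, and the path are assumptions for illustration, not part of the gist):

// Hypothetical: write a DataFrame partitioned by one column, yielding
// subdirectories like partitioningField=someValue under the target path.
df.write.partitionBy("partitioningField").parquet("hdfs://cluster/partitionedFile/")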
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.sql.SparkSession

// List the HDFS paths directly under `filePath`, keeping only those that sort
// lexicographically after `excludeFilesFrom` (handy for skipping partitions
// that were already added). Assumes `spark` is the spark-shell SparkSession.
def listHdpFiles(filePath: String, excludeFilesFrom: String = ""): Array[String] = {
  FileSystem
    .get(spark.sparkContext.hadoopConfiguration)
    .listStatus(new Path(filePath))
    .map(fileStatus => fileStatus.getPath.toString)
    .filter(path => path > excludeFilesFrom)
}
// Register an external Hive table over the Parquet directory, then add one
// Hive partition per matching `field=value` subdirectory.
def createHiveTable(filePath: String, db: String, table: String, filtering: String => Boolean, partitioningField: String): Unit = {
  spark.sqlContext.createExternalTable(s"${db}.${table}", filePath)
  listHdpFiles(filePath)
    .filter(filtering)
    .foreach { partition =>
      spark.sql(s"ALTER TABLE ${db}.${table} ADD PARTITION (${partitioningField}='${partition.split("=").last}')")
    }
}
// Example: register db.table over the partitioned Parquet directory, adding
// every partition directory whose path mentions "partitioningField".
createHiveTable("hdfs://cluster/partitionedFile/", "db", "table", (x: String) => x.contains("partitioningField"), "partitioningField")
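To confirm the partitions were registered, Spark SQL's SHOW PARTITIONS can be run against the new table; a sketch, assuming the call above succeeded:

// Should list one row per partition added above.
spark.sql("SHOW PARTITIONS db.table").show(truncate = false)

On Hive-backed catalogs, MSCK REPAIR TABLE db.table is an alternative that discovers all partition directories in one statement instead of issuing one ALTER TABLE per directory.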