Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Generate the Parquet `_common_metadata` summary file separately, as a standalone spark-submit job
import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.parquet.hadoop.{ParquetFileReader, ParquetFileWriter}
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.JavaConverters.{collectionAsScalaIterableConverter, mapAsScalaMapConverter}
/**
 * Generates the `_common_metadata` summary file for Parquet files stored in
 * S3 or HDFS.
 *
 * Usage:
 *   spark-submit --master yarn-client --driver-memory 32g --executor-memory 2g \
 *     --executor-cores 1 --num-executors 1 \
 *     --class ParquetCommonMetaDataGenerator yourJarName.jar \
 *     hdfs://pathToYourParquetFiles
 *
 * Reference: http://stackoverflow.com/questions/37327626/generate-metadata-for-parquet-files
 *
 * Created by anish on 10/05/17.
 *
 * @param conf Hadoop configuration used to resolve the target FileSystem
 *             and to read/write the Parquet footers.
 */
class ParquetCommonMetaDataGenerator(conf: Configuration) {

  /**
   * Reads the key/value metadata of every Parquet footer found under `path`.
   * Returned as a Scala collection of Scala maps for convenient inspection.
   */
  def getFileMetadata(path: String) = {
    val footers = getFooters(path)
    footers.asScala.map { footer =>
      footer.getParquetMetadata.getFileMetaData.getKeyValueMetaData.asScala
    }
  }

  /**
   * Collects the Parquet footers for all files under `path`, resolving the
   * FileSystem (HDFS, S3, ...) from the path's URI scheme.
   */
  def getFooters(path: String) = {
    val fs = FileSystem.get(new URI(path), conf)
    val rootStatus = fs.getFileStatus(new Path(path))
    ParquetFileReader.readAllFootersInParallel(conf, rootStatus)
  }

  /**
   * Writes the Parquet summary metadata file(s) at `path` from the footers
   * of the files found there.
   */
  def createMetadata(path: String) = {
    ParquetFileWriter.writeMetadataFile(conf, new Path(path), getFooters(path))
  }
}
object ParquetCommonMetaDataGenerator {
  /**
   * Entry point. Expects exactly one argument: the HDFS/S3 URI of the
   * directory containing the Parquet files.
   */
  def main(args: Array[String]): Unit = {
    // Guard `args.nonEmpty` first: the original read `args(0)` unconditionally,
    // which fails with ArrayIndexOutOfBoundsException when no argument is given.
    // Also pass a plain String message — the original passed
    // `throw new IllegalArgumentException(...)` as the by-name message, so on
    // failure that bare exception was thrown instead of require's own
    // "requirement failed: ..." IllegalArgumentException.
    require(args.nonEmpty && args(0).nonEmpty, "Need path to create Metadata")
    val sparkConf = new SparkConf()
      .setAppName("ParquetCommonMetaDataGenerator")
    val sc = new SparkContext(sparkConf)
    try {
      new ParquetCommonMetaDataGenerator(sc.hadoopConfiguration).createMetadata(args(0))
    } finally {
      // Release the cluster resources even if metadata generation fails.
      sc.stop()
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.