Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
Generate Parquet Common MetaData separately using a Spark Submit
import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.parquet.hadoop.{ParquetFileReader, ParquetFileWriter}
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.JavaConverters.{collectionAsScalaIterableConverter, mapAsScalaMapConverter}
/**
 * Generate the _common_metadata file for Parquet files stored in S3 or HDFS.
 *
 * Usage:
 * spark-submit --master yarn-client --driver-memory 32g --executor-memory 2g --executor-cores 1 --num-executors 1 \
 *   --class ParquetCommonMetaDataGenerator yourJarName.jar \
 *   hdfs://pathToYourParquetFiles
 *
 * Reference:
 * Created by anish on 10/05/17.
 */
class ParquetCommonMetaDataGenerator(conf: Configuration) {

  /**
   * Read the footers of all Parquet part-files under `path` in parallel.
   *
   * @param path URI of the directory containing the Parquet part-files (hdfs:// or s3://)
   * @return the footers of every Parquet file found under `path`
   */
  def getFooters(path: String) = {
    val uri: URI = new URI(path)
    val fs = FileSystem.get(uri, conf)
    // readAllFootersInParallel takes the status of the parent directory and
    // discovers the part-files itself. The original scrape assigned this to a
    // `val` as the last statement, which would have returned Unit — the
    // footers must be the method's result.
    ParquetFileReader.readAllFootersInParallel(conf, fs.getFileStatus(new Path(path)))
  }

  // NOTE(review): the body of this method was truncated in the available
  // source; delegating to getFooters preserves the visible signature —
  // confirm against the original gist.
  def getFileMetadata(path: String) = getFooters(path)

  /**
   * Write the _common_metadata / _metadata summary files next to the
   * Parquet part-files located at `path`.
   *
   * @param path URI of the directory containing the Parquet part-files
   */
  def createMetadata(path: String): Unit = {
    val footers = getFooters(path)
    ParquetFileWriter.writeMetadataFile(conf, new Path(path), footers)
  }
}
object ParquetCommonMetaDataGenerator {

  /**
   * Entry point. Expects a single argument: the path to the directory
   * holding the Parquet part-files.
   *
   * @throws IllegalArgumentException if no non-empty path argument is given
   */
  def main(args: Array[String]): Unit = {
    // Original code passed `throw new IllegalArgumentException(...)` as the
    // by-name *message* argument of require — require already throws
    // IllegalArgumentException itself, so a plain message string is correct.
    // Also guard args.nonEmpty first: bare args(0) on an empty array would
    // throw ArrayIndexOutOfBoundsException instead.
    require(args.nonEmpty && args(0).nonEmpty, "Need path to create Metadata")
    val sparkConf = new SparkConf()
    val sc = new SparkContext(sparkConf)
    try {
      val conf = sc.hadoopConfiguration
      new ParquetCommonMetaDataGenerator(conf).createMetadata(args(0))
    } finally {
      // Always release the SparkContext, even if metadata generation fails.
      sc.stop()
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.