Skip to content

Instantly share code, notes, and snippets.

@sakhtar-git
Last active March 29, 2023 06:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sakhtar-git/aebeaa68c00f0d83895a2a3091606665 to your computer and use it in GitHub Desktop.
import org.apache.hadoop.fs.Path
import org.apache.parquet.column.statistics.Statistics
import org.apache.parquet.example.data.Group
import org.apache.parquet.hadoop.ParquetFileReader
import org.apache.parquet.hadoop.util.HadoopInputFile
/**
 * Validates per-column statistics of a Parquet file.
 *
 * For every column declared in the file schema, locates the matching
 * column-chunk metadata, fails fast when statistics are absent or empty,
 * and prints min / max / null-count, followed by the total row count.
 */
object ParquetFileValidation {

  def main(args: Array[String]): Unit = {
    // Required for .asScala below — the original snippet omitted this
    // import entirely and would not compile.
    import scala.jdk.CollectionConverters._

    // Generalized: the path may be passed as the first CLI argument;
    // the original hard-coded value remains the default.
    val parquetFilePath = args.headOption.getOrElse("s3://my-bucket/my-file.parquet")

    val parquetReader =
      ParquetFileReader.open(HadoopInputFile.fromPath(new Path(parquetFilePath)))
    try {
      // Hoisted: the footer was previously re-fetched three times
      // (once per use, including inside the per-column loop).
      val footer = parquetReader.getFooter
      val blocks = footer.getBlocks.asScala

      val numRows = blocks.map(_.getRowCount).sum
      val schema  = footer.getFileMetaData.getSchema

      schema.getColumns.asScala.foreach { column =>
        val columnChunkMetaData = blocks
          .flatMap(_.getColumns.asScala)
          .find(_.getPath.toDotString == column.getPath.toDotString)
          .getOrElse(throw new RuntimeException(s"Column ${column.getPath} not found"))

        val statistics = columnChunkMetaData.getStatistics
        // Merged the two duplicated checks that threw the identical message.
        if (statistics == null || statistics.isEmpty) {
          throw new RuntimeException(s"No statistics found for column ${column.getPath}")
        }

        val min = statistics.getMin
        val max = statistics.getMax
        val nullCount = statistics.getNumNulls
        // perform additional validations as needed
        println(s"Column ${column.getPath}: min=$min, max=$max, nullCount=$nullCount")
      }

      println(s"File $parquetFilePath has $numRows rows")
    } finally {
      // Always release the underlying input stream, even on failure.
      parquetReader.close()
    }
  }
}
@sakhtar-git
Copy link
Author

import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

/**
 * Verifies that a DataFrame built from a Seq of (String, Int) tuples
 * has the expected schema.
 */
class MyDataFrameTest extends AnyFunSuite with Matchers {

  test("test schema of myDataFrame") {
    // .toDF is an extension method supplied by spark.implicits._, which
    // requires a SparkSession — the original snippet never created one
    // and would not compile.
    val spark = org.apache.spark.sql.SparkSession.builder()
      .master("local[1]")
      .appName("MyDataFrameTest")
      .getOrCreate()
    import spark.implicits._

    val myDataFrame = Seq(
      ("John", 25),
      ("Jane", 30),
      ("Bob", 40)
    ).toDF("name", "age")

    val expectedSchema = StructType(Seq(
      StructField("name", StringType, nullable = true),
      // Fixed: toDF infers nullable = false for primitive Int columns,
      // so asserting nullable = true here would always fail.
      StructField("age", IntegerType, nullable = false)
    ))

    myDataFrame.schema shouldEqual expectedSchema
  }

}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment