Skip to content

Instantly share code, notes, and snippets.

@sakhtar-git
Last active March 29, 2023 06:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sakhtar-git/aebeaa68c00f0d83895a2a3091606665 to your computer and use it in GitHub Desktop.
import org.apache.hadoop.fs.Path
import org.apache.parquet.column.statistics.Statistics
import org.apache.parquet.example.data.Group
import org.apache.parquet.hadoop.ParquetFileReader
import org.apache.parquet.hadoop.util.HadoopInputFile
/**
 * Validates per-column statistics of a Parquet file.
 *
 * For every column declared in the file schema, locates the matching
 * column-chunk metadata, fails fast when statistics are absent or empty,
 * and prints min / max / null-count, followed by the total row count.
 */
object ParquetFileValidation {

  def main(args: Array[String]): Unit = {
    // Required for .asScala below — the original snippet omitted this
    // import entirely and would not compile.
    import scala.jdk.CollectionConverters._

    // Generalized: the path may be passed as the first CLI argument;
    // the original hard-coded value remains the default.
    val parquetFilePath = args.headOption.getOrElse("s3://my-bucket/my-file.parquet")

    val parquetReader =
      ParquetFileReader.open(HadoopInputFile.fromPath(new Path(parquetFilePath)))
    try {
      // Hoisted: the footer was previously re-fetched three times
      // (once per use, including inside the per-column loop).
      val footer = parquetReader.getFooter
      val blocks = footer.getBlocks.asScala

      val numRows = blocks.map(_.getRowCount).sum
      val schema  = footer.getFileMetaData.getSchema

      schema.getColumns.asScala.foreach { column =>
        val columnChunkMetaData = blocks
          .flatMap(_.getColumns.asScala)
          .find(_.getPath.toDotString == column.getPath.toDotString)
          .getOrElse(throw new RuntimeException(s"Column ${column.getPath} not found"))

        val statistics = columnChunkMetaData.getStatistics
        // Merged the two duplicated checks that threw the identical message.
        if (statistics == null || statistics.isEmpty) {
          throw new RuntimeException(s"No statistics found for column ${column.getPath}")
        }

        val min = statistics.getMin
        val max = statistics.getMax
        val nullCount = statistics.getNumNulls
        // perform additional validations as needed
        println(s"Column ${column.getPath}: min=$min, max=$max, nullCount=$nullCount")
      }

      println(s"File $parquetFilePath has $numRows rows")
    } finally {
      // Always release the underlying input stream, even on failure.
      parquetReader.close()
    }
  }
}
@sakhtar-git
Copy link
Author

import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

/**
 * Verifies that a DataFrame built from a Seq of (String, Int) tuples
 * has the expected schema.
 */
class MyDataFrameTest extends AnyFunSuite with Matchers {

  test("test schema of myDataFrame") {
    // .toDF is an extension method supplied by spark.implicits._, which
    // requires a SparkSession — the original snippet never created one
    // and would not compile.
    val spark = org.apache.spark.sql.SparkSession.builder()
      .master("local[1]")
      .appName("MyDataFrameTest")
      .getOrCreate()
    import spark.implicits._

    val myDataFrame = Seq(
      ("John", 25),
      ("Jane", 30),
      ("Bob", 40)
    ).toDF("name", "age")

    val expectedSchema = StructType(Seq(
      StructField("name", StringType, nullable = true),
      // Fixed: toDF infers nullable = false for primitive Int columns,
      // so asserting nullable = true here would always fail.
      StructField("age", IntegerType, nullable = false)
    ))

    myDataFrame.schema shouldEqual expectedSchema
  }

}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment