Skip to content

Instantly share code, notes, and snippets.

@erich-truveta
Created December 9, 2022 01:00
Show Gist options
  • Save erich-truveta/f30d77441186a8c30c5f22f9c44bf59f to your computer and use it in GitHub Desktop.
java.io.IOException: can not read class org.apache.parquet.format.PageHeader: Unrecognized type 0
"22/12/08 22:10:28 INFO FileScanRDD: Reading File path: abfss://REDACTED/part-00375-74c76941-8b11-4873-86d6-80444390017d-c000.snappy.parquet, range: 0-87836085, partition values: [empty row]"
"22/12/08 22:10:28 ERROR Executor: Exception in task 365.0 in stage 2865.0 (TID 117118)"
"org.apache.spark.sql.execution.QueryExecutionException: Encountered error while reading file abfss://REDACTED/part-00364-ff7615b7-4a38-4aa7-aa39-3c4ee58483e7-c000.snappy.parquet. Details: "
" at org.apache.spark.sql.errors.QueryExecutionErrors$.cannotReadFilesError(QueryExecutionErrors.scala:731)"
" at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:283)"
" at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:116)"
" at org.apache.spark.sql.execution.FileSourceScanExec$$anon$1.hasNext(DataSourceScanExec.scala:554)"
" at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown Source)"
" at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)"
" at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)"
" at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)"
" at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)"
" at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:140)"
" at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)"
" at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)"
" at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)"
" at org.apache.spark.scheduler.Task.run(Task.scala:136)"
" at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)"
" at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)"
" at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)"
" at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)"
" at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)"
" at java.base/java.lang.Thread.run(Thread.java:829)"
"Caused by: java.io.IOException: can not read class org.apache.parquet.format.PageHeader: Unrecognized type 0"
" at org.apache.parquet.format.Util.read(Util.java:365)"
" at org.apache.parquet.format.Util.readPageHeader(Util.java:132)"
" at org.apache.parquet.hadoop.ParquetFileReader$Chunk.readPageHeader(ParquetFileReader.java:1382)"
" at org.apache.parquet.hadoop.ParquetFileReader$Chunk.readAllPages(ParquetFileReader.java:1429)"
" at org.apache.parquet.hadoop.ParquetFileReader$Chunk.readAllPages(ParquetFileReader.java:1402)"
" at org.apache.parquet.hadoop.ParquetFileReader.readChunkPages(ParquetFileReader.java:1023)"
" at org.apache.parquet.hadoop.ParquetFileReader.readNextRowGroup(ParquetFileReader.java:928)"
" at org.apache.parquet.hadoop.ParquetFileReader.readNextFilteredRowGroup(ParquetFileReader.java:972)"
" at org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase$ParquetRowGroupReaderImpl.readNextRowGroup(SpecificParquetRecordReaderBase.java:266)"
" at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.checkEndOfRowGroup(VectorizedParquetRecordReader.java:388)"
" at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextBatch(VectorizedParquetRecordReader.java:309)"
" at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextKeyValue(VectorizedParquetRecordReader.java:212)"
" at org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:39)"
" at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:116)"
" at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:274)"
" ... 18 more"
"Caused by: shaded.parquet.org.apache.thrift.protocol.TProtocolException: Unrecognized type 0"
" at shaded.parquet.org.apache.thrift.protocol.TProtocolUtil.skip(TProtocolUtil.java:144)"
" at shaded.parquet.org.apache.thrift.protocol.TProtocolUtil.skip(TProtocolUtil.java:130)"
" at shaded.parquet.org.apache.thrift.protocol.TProtocolUtil.skip(TProtocolUtil.java:60)"
" at org.apache.parquet.format.PageHeader$PageHeaderStandardScheme.read(PageHeader.java:1078)"
" at org.apache.parquet.format.PageHeader$PageHeaderStandardScheme.read(PageHeader.java:1019)"
" at org.apache.parquet.format.PageHeader.read(PageHeader.java:896)"
" at org.apache.parquet.format.Util.read(Util.java:362)"
" ... 32 more"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment