How to deserialize a Hadoop result sequence file outside Hadoop (or the output of a Spark saveAsObjectFile outside Spark) without having the class declaration
// resolvers += "dportabella-3rd-party-mvn-repo-releases" at "https://github.com/dportabella/3rd-party-mvn-repo/raw/master/releases/"
// libraryDependencies += "org.apache.hadoop" % "hadoop-common" % "2.7.3"
// libraryDependencies += "com.github.dportabella.3rd-party-mvn-repo" % "jdeserialize" % "1.0.0",
import java.io._
import org.apache.hadoop.conf._
import org.apache.hadoop.fs._
import org.apache.hadoop.io._
import org.unsynchronized.jdeserialize
// Plain JDK deserialization: works, but requires the Person class declaration
// on the classpath.
def deserializeWithClassDeclaration(data: Array[Byte]): Unit = {
  val values = new ObjectInputStream(new ByteArrayInputStream(data)).readObject().asInstanceOf[Array[Person]] // put here your class or _
  values.foreach(println)
}
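// Note: Spark's saveAsObjectFile groups records into small batches (10 per
// batch in the Spark sources I've seen) and JDK-serializes each batch as an
// array, which is why the cast above targets Array[Person] rather than Person.
// If the element type is unknown ahead of time, a sketch of a generic variant
// (it still needs the classes on the classpath, since readObject must resolve them):
def deserializeAnyArray(data: Array[Byte]): Unit = {
  val values = new ObjectInputStream(new ByteArrayInputStream(data)).readObject().asInstanceOf[Array[_]]
  values.foreach(println)
}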
// jdeserialize parses the Java serialization stream grammar directly, so it
// should not need the class declaration at all.
def deserializeWithoutClassDeclaration(data: Array[Byte]): Unit = {
  val fis = new ByteArrayInputStream(data)
  val jd = new jdeserialize(null)
  // jd.debugEnabled = true
  jd.run(fis, true)
  jd.dump(null)
}
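// Before blaming jdeserialize, it can help to verify that the payload really
// is a Java serialization stream: it must start with the magic 0xACED followed
// by stream version 0x0005 (see java.io.ObjectStreamConstants). A small sanity
// check (hypothetical helper, not part of the original gist):
def looksLikeJavaSerialization(data: Array[Byte]): Boolean =
  data.length >= 4 &&
    ((data(0) & 0xff) << 8 | (data(1) & 0xff)) == 0xACED &&
    ((data(2) & 0xff) << 8 | (data(3) & 0xff)) == 0x0005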
val f = "//tmp/persons.rdd/part-00001"
val reader = new SequenceFile.Reader(new Configuration(), SequenceFile.Reader.file(new Path(f)))
try {
val key = NullWritable.get
val value = new BytesWritable
while (reader.next(key, value)) {
deserializeWithClassDeclaration(value.getBytes) // this works
deserializeWithoutClassDeclaration(value.getBytes) // this fails
}
} finally reader.close()
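// If jdeserialize keeps failing in-process, another way to debug is to dump
// each record's trimmed payload to its own file and run the jdeserialize
// command-line tool on it directly (hypothetical helper):
def dumpRecord(data: Array[Byte], n: Int): Unit = {
  val out = new FileOutputStream(s"/tmp/record-$n.bin")
  try out.write(data) finally out.close()
}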
/*
Unfortunately, jdeserialize fails to deserialize the object; all it reports is this (my /tmp/persons.rdd/part-00000 file contains a list of Person instances):
read: [array 0x7e0001 classdesc [cd 0x7e0000: name [LPerson; uid 8257594952091008868]: [arraycoll sz 2 Person _h0x7e0007 = r_0x7e0002; , Person _h0x7e00e4 = r_0x7e0002; ]
I tried several forks of jdeserialize:
https://github.com/unsynchronized/jdeserialize
https://gist.github.com/dportabella/3dbd22333012682210b6d6ee2e50118d
Any ideas?

Example program to create a test input file (Test.scala):
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

case class Person(name: String, age: Integer)

object Test extends App {
  val sparkConf = new SparkConf().setAppName("test").setMaster("local[1]")
  val sc = new SparkContext(sparkConf)
  val personList = List(Person("John", 30), Person("Maria", 20))
  val rdd: RDD[Person] = sc.parallelize(personList)
  rdd.saveAsObjectFile("/tmp/persons.rdd")
  sc.stop()
}
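For comparison, once /tmp/persons.rdd exists, it can be read back inside Spark
itself (where the Person class declaration is available); a minimal sketch,
reusing the SparkContext from above:

  val restored: RDD[Person] = sc.objectFile[Person]("/tmp/persons.rdd")
  restored.collect().foreach(println)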
*/