How to deserialize a Hadoop result sequence file outside Hadoop (or the output of a Spark saveAsObjectFile outside Spark) without having the class declaration
// resolvers += "dportabella-3rd-party-mvn-repo-releases" at "https://github.com/dportabella/3rd-party-mvn-repo/raw/master/releases/"
// libraryDependencies += "org.apache.hadoop" % "hadoop-common" % "2.7.3"
// libraryDependencies += "com.github.dportabella.3rd-party-mvn-repo" % "jdeserialize" % "1.0.0",
import java.io._
import org.apache.hadoop.conf._
import org.apache.hadoop.fs._
import org.apache.hadoop.io._
import org.unsynchronized.jdeserialize
// Plain JDK deserialization: works, but requires the Person class declaration
// on the classpath.
def deserializeWithClassDeclaration(data: Array[Byte]): Unit = {
  val values = new ObjectInputStream(new ByteArrayInputStream(data)).readObject().asInstanceOf[Array[Person]] // put here your class or _
  values.foreach(println)
}
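// Note: Spark's saveAsObjectFile groups records into small batches (10 per
// batch in the Spark sources I've seen) and JDK-serializes each batch as an
// array, which is why the cast above targets Array[Person] rather than Person.
// If the element type is unknown ahead of time, a sketch of a generic variant
// (it still needs the classes on the classpath, since readObject must resolve them):
def deserializeAnyArray(data: Array[Byte]): Unit = {
  val values = new ObjectInputStream(new ByteArrayInputStream(data)).readObject().asInstanceOf[Array[_]]
  values.foreach(println)
}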
// jdeserialize parses the Java serialization stream grammar directly, so it
// should not need the class declaration at all.
def deserializeWithoutClassDeclaration(data: Array[Byte]): Unit = {
  val fis = new ByteArrayInputStream(data)
  val jd = new jdeserialize(null)
  // jd.debugEnabled = true
  jd.run(fis, true)
  jd.dump(null)
}
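// Before blaming jdeserialize, it can help to verify that the payload really
// is a Java serialization stream: it must start with the magic 0xACED followed
// by stream version 0x0005 (see java.io.ObjectStreamConstants). A small sanity
// check (hypothetical helper, not part of the original gist):
def looksLikeJavaSerialization(data: Array[Byte]): Boolean =
  data.length >= 4 &&
    ((data(0) & 0xff) << 8 | (data(1) & 0xff)) == 0xACED &&
    ((data(2) & 0xff) << 8 | (data(3) & 0xff)) == 0x0005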
val f = "//tmp/persons.rdd/part-00001"
val reader = new SequenceFile.Reader(new Configuration(), SequenceFile.Reader.file(new Path(f)))
try {
val key = NullWritable.get
val value = new BytesWritable
while (reader.next(key, value)) {
deserializeWithClassDeclaration(value.getBytes) // this works
deserializeWithoutClassDeclaration(value.getBytes) // this fails
}
} finally reader.close()
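// If jdeserialize keeps failing in-process, another way to debug is to dump
// each record's trimmed payload to its own file and run the jdeserialize
// command-line tool on it directly (hypothetical helper):
def dumpRecord(data: Array[Byte], n: Int): Unit = {
  val out = new FileOutputStream(s"/tmp/record-$n.bin")
  try out.write(data) finally out.close()
}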
/*
Unfortunately, jdeserialize fails to deserialize the object; all it reports is this (my /tmp/persons.rdd/part-00000 file contains a list of Person instances):
read: [array 0x7e0001 classdesc [cd 0x7e0000: name [LPerson; uid 8257594952091008868]: [arraycoll sz 2 Person _h0x7e0007 = r_0x7e0002; , Person _h0x7e00e4 = r_0x7e0002; ]
I tried several forks of jdeserialize:
https://github.com/unsynchronized/jdeserialize
https://gist.github.com/dportabella/3dbd22333012682210b6d6ee2e50118d
Any ideas?

Example program to create a test input file (Test.scala):
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

case class Person(name: String, age: Integer)

object Test extends App {
  val sparkConf = new SparkConf().setAppName("test").setMaster("local[1]")
  val sc = new SparkContext(sparkConf)
  val personList = List(Person("John", 30), Person("Maria", 20))
  val rdd: RDD[Person] = sc.parallelize(personList)
  rdd.saveAsObjectFile("/tmp/persons.rdd")
  sc.stop()
}
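For comparison, once /tmp/persons.rdd exists, it can be read back inside Spark
itself (where the Person class declaration is available); a minimal sketch,
reusing the SparkContext from above:

  val restored: RDD[Person] = sc.objectFile[Person]("/tmp/persons.rdd")
  restored.collect().foreach(println)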
*/