@sambos
Created May 11, 2018 17:06
Read Avro

Read and display Avro records.

Using the avro-tools utility: avro-tools tojson filename.avro | head -n 10
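To inspect only the writer schema instead of the records (assuming the same avro-tools jar is on the PATH): avro-tools getschema filename.avro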

Using Scala (with the Databricks spark-avro package):

package org.sample.utils

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import com.databricks.spark.avro._

object ReadAvro extends App {

  // Path (or glob) to the Avro files, e.g. "C://dev//majeed//avro1//*.avro"
  val path = "avro-file"

  // Decode a binary Avro field back into a readable String
  def b2s(a: Array[Byte]): String = new String(a, "UTF-8")

  val sc: SparkContext = new SparkContext(
    new SparkConf().setMaster("local[1]").setAppName("read-avro"))
  val sqlContext = new SQLContext(sc)

  // Compression settings applied when writing Avro (deflate here; snappy also works)
  //sqlContext.setConf("spark.sql.avro.compression.codec", "snappy")
  //sqlContext.setConf("spark.sql.avro.snappy.level", "5")
  sqlContext.setConf("spark.sql.avro.compression.codec", "deflate")
  sqlContext.setConf("spark.sql.avro.deflate.level", "9")

  // Load the Avro files into a DataFrame and show the inferred schema
  val df = sqlContext.read.avro(path)
  df.printSchema()

  // Convert the binary "body" column to strings and print a small sample
  val lines = df.select("body").rdd.map(row => b2s(row.getAs[Array[Byte]]("body")))
  lines.take(2).foreach(println)

  sc.stop()
}
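A note on the compression settings: they only take effect when writing Avro, not when reading it. The same spark-avro import also adds an avro method on DataFrameWriter, so a minimal sketch of writing the DataFrame back out (placed before sc.stop() in the program above; "avro-out" is just a placeholder path) looks like this:

  // Write the DataFrame back as Avro; the deflate codec/level configured
  // on the SQLContext above are applied to the output files.
  df.write.avro("avro-out")

Besides "deflate", spark-avro also accepts "snappy" and "uncompressed" for spark.sql.avro.compression.codec.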