Skip to content

Instantly share code, notes, and snippets.

@rjurney
Created May 28, 2014 00:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rjurney/966acf3071224d4cf768 to your computer and use it in GitHub Desktop.
Save rjurney/966acf3071224d4cf768 to your computer and use it in GitHub Desktop.
avro.scala
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.avro.generic.GenericRecord
import org.apache.avro.mapred.AvroKey
import org.apache.avro.mapreduce.AvroKeyInputFormat
import org.apache.hadoop.io.NullWritable
import org.apache.commons.lang.StringEscapeUtils.escapeCsv
import org.apache.avro.file.DataFileStream
// Read the text data at this HDFS location as an RDD with one element per line.
val file = sc.textFile("hdfs://hivecluster2/securityx/beaconing_activity.txt/2014/05/12/14/hour")
// Break every line into its tab-separated fields.
// The limit of -1 matters: with the default limit (0) String.split discards
// trailing empty strings, so a row whose last columns are blank would yield a
// shorter array than its neighbours and break positional field access.
val arys = file.map(_.split("\t", -1))
// Load every file matching the glob below through Avro's new-API (mapreduce)
// input format. Each element is a pair whose key is an AvroKey wrapping a
// GenericRecord and whose value is a NullWritable.
val avroRdd = {
  // Kept local to this expression so no extra name leaks into the shell session.
  val webProxyGlob = "hdfs://hivecluster2/securityx/web_proxy_mef/2014/05/27/19/*"
  sc.newAPIHadoopFile(
    webProxyGlob,
    classOf[AvroKeyInputFormat[GenericRecord]], // input format class
    classOf[AvroKey[GenericRecord]],            // key class
    classOf[NullWritable]                       // value class
  )
}
import org.apache.hadoop.fs.Path
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.conf.Configuration
import java.net.URI
import java.io.BufferedInputStream
// Open a single Avro part file directly through the Hadoop FileSystem API,
// bypassing Spark, and expose it as a buffered byte stream.
val input: String =
  "hdfs://hivecluster2/securityx/web_proxy_mef/2014/05/27/19/part-m-00019.avro"
val inPath: Path = new Path(input)
// Path.getFileSystem resolves the FileSystem from the path's own URI, using a
// freshly constructed Hadoop Configuration.
val fs: FileSystem = inPath.getFileSystem(new Configuration())
// NOTE(review): nothing visible here closes this stream — confirm that whatever
// consumes inStream also closes it.
val inStream: BufferedInputStream = new BufferedInputStream(fs.open(inPath))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment