Skip to content

Instantly share code, notes, and snippets.

@jlewi
Last active August 29, 2015 14:01
Show Gist options
  • Save jlewi/59b0dec90b639d5d568e to your computer and use it in GitHub Desktop.
Save jlewi/59b0dec90b639d5d568e to your computer and use it in GitHub Desktop.
Reading a FastQ file in Hadoop
import contrail.AvroHelper
import contrail.spark.SerializableGraphNodeData
// Load the Avro part files under the contrail.stages.CompressAndCorrect output
// path and index the records by node id so a single node can be fetched
// interactively. `sc` is not defined here; presumably it is the SparkContext
// supplied by the Spark shell.
val contigFile = contrail.AvroHelper.readAvro(sc, "hdfs://hadoop-nn//tmp/contrail.stages.CompressAndCorrect/part-*.avro")
// Wrap each record's datum in SerializableGraphNodeData before keying/caching.
val datums = contigFile.map(r => new SerializableGraphNodeData(r._1.datum))
// Key by the String form of node_id so lookups can use plain String literals.
val keyedById = datums.map(r => (r.node_id.toString, r))
keyedById.cache()
// The key must be delimited by ASCII double quotes; typographic quotes (“ ”)
// are not valid Scala string delimiters and make this line fail to compile.
keyedById.lookup("4fw3YAOX8-lvVjIWgNPGYR3gc5CvsxI")
import org.apache.spark.rdd.NewHadoopRDD
import org.apache.hadoop.conf.Configuration
import org.apache.avro.mapreduce.AvroKeyInputFormat
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.fs.Path
import org.apache.avro.generic.GenericRecord
import contrail.graph.GraphNodeData
import org.apache.avro.mapred.AvroKey
import org.apache.hadoop.io.NullWritable
// Read the GraphNodeData Avro part files directly through the new Hadoop
// MapReduce API (AvroKeyInputFormat) and index the records by node id.
// `sc` is not defined here; presumably it is the Spark shell's SparkContext.
val job = new Job
FileInputFormat.addInputPath(job, new Path("hdfs://hadoop-nn//tmp/contrail.stages.CompressAndCorrect/part-*.avro"))
job.setInputFormatClass(classOf[AvroKeyInputFormat[GraphNodeData]])
// Keys are AvroKey[GraphNodeData]; values are NullWritable placeholders.
val nodes = new NewHadoopRDD(sc, classOf[AvroKeyInputFormat[GraphNodeData]], classOf[AvroKey[GraphNodeData]], classOf[NullWritable], job.getConfiguration)
// Key by node_id.toString so the String literal passed to lookup() compares
// equal to the stored key. Assumes node_id is an Avro CharSequence (e.g. Utf8),
// which never equals a java.lang.String -- TODO confirm against the schema.
// The conversion is harmless if node_id is already a String.
val nodesById = nodes.map(r => (r._1.datum.node_id.toString, r._1))
// NOTE(review): Hadoop record readers may reuse the same key object for every
// record, and AvroKey is presumably not java.io.Serializable; caching and
// collecting the raw AvroKey may therefore misbehave. Consider copying each
// datum into a serializable wrapper before cache() -- verify.
nodesById.cache()
nodesById.lookup("4fw3YAOX8-lvVjIWgNPGYR3gc5CvsxI")
import org.apache.spark.rdd.NewHadoopRDD
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.LongWritable
import contrail.io.FastQWritable
import contrail.io.mapreduce.FastQInputFormatNew
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
// Read a FastQ file through the new Hadoop MapReduce API and index the reads
// by their id. `sc` is not defined here; presumably it is the Spark shell's
// SparkContext.
val job = new Job()
val fastqPath = new Path("hdfs://hadoop-nn/tmp/speciesA_300i_40x.2.fastq")
FileInputFormat.addInputPath(job, fastqPath)
job.setInputFormatClass(classOf[FastQInputFormatNew])
// Records are (LongWritable, FastQWritable) pairs produced by FastQInputFormatNew.
val reads = new NewHadoopRDD(
  sc,
  classOf[FastQInputFormatNew],
  classOf[LongWritable],
  classOf[FastQWritable],
  job.getConfiguration)
// Drop the LongWritable key and re-key each read by the value of getId.
val keyed = reads.map { case (_, read) => (read.getId, read) }
keyed.cache()
// Fetch every read stored under this id.
keyed.lookup("assemblathon_40x300i_b50/2")
// Count the records in the input RDD.
reads.count()
@rjurney
Copy link

rjurney commented Jun 4, 2014

I would love to see the code for AvroHelper

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment