Skip to content

Instantly share code, notes, and snippets.

@nikhilRP
Last active August 29, 2015 14:16
Show Gist options
  • Save nikhilRP/baa9a98e257c5258056e to your computer and use it in GitHub Desktop.
Save nikhilRP/baa9a98e257c5258056e to your computer and use it in GitHub Desktop.
Finding depth of a variant
import org.bdgenomics.formats.avro.{ AlignmentRecord, Contig }
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.adam.rdd.BroadcastRegionJoin
import org.bdgenomics.adam.projections.Projection
import org.bdgenomics.adam.projections.AlignmentRecordField._
import org.bdgenomics.adam.models.{ SequenceDictionary, ReferenceRegion }
import org.bdgenomics.adam.rdd.BroadcastRegionJoin
import org.apache.spark.rdd.RDD
// Loading adam files array in s3
var adamFiles = Array(
"s3n://encoded-hdfs/ENCFF000WQG.adam",
"s3n://encoded-hdfs/ENCFF000WQP.adam",
"s3n://encoded-hdfs/ENCFF000WQS.adam",
"s3n://encoded-hdfs/ENCFF000WQY.adam",
"s3n://encoded-hdfs/ENCFF000WQQ.adam",
"s3n://encoded-hdfs/ENCFF000WRA.adam",
"s3n://encoded-hdfs/ENCFF000WRG.adam",
"s3n://encoded-hdfs/ENCFF000WRH.adam",
"s3n://encoded-hdfs/ENCFF000WRV.adam",
"s3n://encoded-hdfs/ENCFF000WRW.adam",
"s3n://encoded-hdfs/ENCFF000WRX.adam",
"s3n://encoded-hdfs/ENCFF000WRY.adam",
"s3n://encoded-hdfs/ENCFF000WSB.adam",
"s3n://encoded-hdfs/ENCFF000WSH.adam",
"s3n://encoded-hdfs/ENCFF000WSN.adam",
"s3n://encoded-hdfs/ENCFF000WSO.adam",
"s3n://encoded-hdfs/ENCFF000WSZ.adam")
def run(file: String) = {
val proj = Projection(contig, start, end, cigar, readMapped)
val adamRDD: RDD[AlignmentRecord] = sc.loadAlignments(file, projection=Some(proj))
val recordsRdd = adamRDD.filter(_.getReadMapped).keyBy(ReferenceRegion(_).get)
val reg = new ReferenceRegion("chr1", 160000, 160001)
val joinedRDD: RDD[(ReferenceRegion, AlignmentRecord)] = BroadcastRegionJoin.cartesianFilter(reg, recordsRdd)
val depths: RDD[(ReferenceRegion, Int)] = joinedRDD.map { case (region, record) => (region, 1) }.reduceByKey(_ + _).sortByKey()
println("location\tdepth")
depths.collect().foreach {
case (region, count) =>
println("%20s\t%15s\t% 5d".format(
"%s:%d".format(region.referenceName, region.start),
count
)
)
}
}
adamFiles.foreach(run)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment