Skip to content

Instantly share code, notes, and snippets.

@dnaerys
Created October 8, 2022 07:47
Show Gist options
  • Save dnaerys/9830d693af03d138561ad394180eb4e7 to your computer and use it in GitHub Desktop.
Save dnaerys/9830d693af03d138561ad394180eb4e7 to your computer and use it in GitHub Desktop.
import org.apache.spark.sql
import org.rogach.scallop.ScallopConf
/**
 * Command-line configuration for the [[Panels]] benchmark, declared with Scallop.
 *
 * @param arguments raw command-line arguments, passed straight to `ScallopConf`
 */
class ArgsPanels(arguments: Seq[String]) extends ScallopConf(arguments) {

  /** Mandatory string option; its help text is "path to delta lake". */
  val deltapath = opt[String](
    descr = "path to delta lake",
    required = true
  )

  // Called last, once every option above has been declared.
  verify()
}
/**
 * Baseline Spark benchmark: repeatedly counts the rows of a Delta table that fall
 * into the regions of a fixed gene panel and prints the count and wall-clock time
 * of every run.
 */
object Panels {
  // GRCh37
  // Mitochondrial liver disease gene panel; (c) Genomics England
  // 11 genes: BCS1L,DGUOK,MPV17,POLG,TRMU,TWNK,TFAM,ACAD9,POLG2,RRM2B,SCO1
  //
  // The three arrays below are parallel: index i of each one describes the same region
  // (contig name, region start, region end).
  val mito11Chr = scala.Array("2", "2", "2", "15", "22", "10", "10", "3", "17", "8", "17")
  val mito11Start = scala.Array(219523487, 74153953, 27532360, 89859534, 46726772, 102747124, 60144782, 128598439, 62473902, 103216730, 10583654)
  val mito11End = scala.Array(219528166, 74186088, 27548547, 89878092, 46753237, 102754158, 60158981, 128634910, 62493154, 103251346, 10601692)

  /** How many times the benchmark query is executed by [[main]]. */
  private val BenchmarkRuns = 32

  /**
   * Entry point: loads the Delta table given by the `deltapath` option, marks it for
   * in-memory caching and runs [[baselineBenchmark]] `BenchmarkRuns` times.
   *
   * @param arguments raw command-line arguments, parsed by [[ArgsPanels]]
   */
  def main(arguments: Array[String]): Unit = {
    val args = new ArgsPanels(arguments)
    val delta_silver_path = args.deltapath()

    // Spark init
    import org.apache.spark.sql.SparkSession
    val spark = SparkSession
      .builder()
      .appName(getClass.getSimpleName)
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .getOrCreate()

    // Stop the session on every exit path so executors and cached blocks are released
    // even when loading or a benchmark run throws.
    // NOTE(review): if this job runs inside an environment that owns a shared session
    // (getOrCreate may return an existing one), confirm that stopping it here is wanted.
    try {
      // data load
      val df = spark
        .read
        .format("delta")
        .load(delta_silver_path)
        .persist(org.apache.spark.storage.StorageLevel.MEMORY_ONLY)

      // First action on the persisted DataFrame, executed before any timed run.
      // NOTE(review): show() presumably evaluates only part of the data, so the first
      // timed runs may still include load/caching cost - confirm if that matters.
      df.show()

      (1 to BenchmarkRuns).foreach(_ => baselineBenchmark(df))
    } finally {
      spark.stop()
    }
  }

  /**
   * Runs the panel query once: counts the rows of `df` matching the Mito11 panel filter
   * and prints the count together with the elapsed time in milliseconds.
   *
   * @param df DataFrame with the columns `contigName`, `start` and `end`
   */
  def baselineBenchmark(df: sql.DataFrame): Unit = {
    // Presence in mitochondrial liver disease gene panel
    val startMito11 = System.nanoTime
    val mito11vars = df
      .filter(mito11Filter(df))
      .count()
    val elapsedMito11 = (System.nanoTime - startMito11) / 1000000 // nano to milli
    println(s"\n\t In Mito11 panel: $mito11vars variants, $elapsedMito11 ms")
  }

  /**
   * Builds the row filter for the Mito11 panel from the three parallel panel arrays.
   *
   * For every region three clauses are emitted (all comparisons inclusive):
   * row lies within the region, row spans the region start, row spans the region end.
   * The clauses are OR-ed left to right in region order, which yields the same
   * predicate as writing all of them out by hand.
   *
   * @param df DataFrame whose `contigName`, `start` and `end` columns are referenced
   * @return a boolean column that is true for rows touching any panel region
   */
  private def mito11Filter(df: sql.DataFrame): sql.Column = {
    // Indexing below walks the arrays in lockstep; fail loudly if they ever diverge.
    require(
      mito11Chr.nonEmpty &&
        mito11Chr.length == mito11Start.length &&
        mito11Chr.length == mito11End.length,
      "mito11Chr, mito11Start and mito11End must be non-empty and of equal length"
    )

    val clauses = mito11Chr.indices.flatMap { i =>
      val regionStart = mito11Start(i)
      val regionEnd = mito11End(i)
      Seq(
        // row entirely inside the region
        df("contigName") === mito11Chr(i) && df("start") >= regionStart && df("end") <= regionEnd,
        // row spans the region start
        df("contigName") === mito11Chr(i) && df("start") <= regionStart && df("end") >= regionStart,
        // row spans the region end
        df("contigName") === mito11Chr(i) && df("start") <= regionEnd && df("end") >= regionEnd
      )
    }

    clauses.reduce(_ || _)
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment