-
-
Save dnaerys/9830d693af03d138561ad394180eb4e7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.apache.spark.sql | |
import org.rogach.scallop.ScallopConf | |
/**
 * Command-line arguments of the panel benchmark, parsed with Scallop.
 *
 * @param arguments raw command-line arguments, as received by `main`
 */
class ArgsPanels(arguments: Seq[String]) extends ScallopConf(arguments) {

  /** Mandatory option holding the location of the Delta Lake table to load. */
  val deltapath: org.rogach.scallop.ScallopOption[String] =
    opt[String](descr = "path to delta lake", required = true)

  // Finalise the option definitions and validate the supplied arguments.
  verify()
}
/**
 * Benchmark driver: loads a Delta Lake table, caches it, and repeatedly times a
 * query counting the rows that touch any region of the Mito11 gene panel.
 */
object Panels {

  // GRCh37
  // Mitochondrial liver disease gene panel; (c) Genomics England
  // 11 genes: S1L,DGUOK,MPV17,POLG,TRMU,TWNK,TFAM,ACAD9,POLG2,RRM2B,SCO1
  // NOTE(review): "S1L" is presumably a truncation of BCS1L -- confirm against the panel.
  //
  // The three arrays below are parallel: index i of each one describes the same region
  // (contig name, region start, region end).
  val mito11Chr: Array[String] =
    Array("2", "2", "2", "15", "22", "10", "10", "3", "17", "8", "17")
  val mito11Start: Array[Int] =
    Array(219523487, 74153953, 27532360, 89859534, 46726772, 102747124, 60144782, 128598439,
      62473902, 103216730, 10583654)
  val mito11End: Array[Int] =
    Array(219528166, 74186088, 27548547, 89878092, 46753237, 102754158, 60158981, 128634910,
      62493154, 103251346, 10601692)

  /**
   * Entry point: parses the arguments, loads the Delta table found at `--deltapath`,
   * persists it in memory, and runs [[baselineBenchmark]] 32 times.
   *
   * @param arguments raw command-line arguments
   */
  def main(arguments: Array[String]): Unit = {
    import org.apache.spark.sql.SparkSession
    import org.apache.spark.storage.StorageLevel

    val deltaPath = new ArgsPanels(arguments).deltapath()

    // Spark init
    val session = SparkSession.builder()
      .appName(getClass.getSimpleName)
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .getOrCreate()

    // data load
    val variants = session.read
      .format("delta")
      .load(deltaPath)
      .persist(StorageLevel.MEMORY_ONLY)
    variants.show()

    // Repeat the same query so that several timings are printed.
    (0 until 32).foreach(_ => baselineBenchmark(variants))
  }

  /**
   * Counts the rows of `df` that touch the Mito11 panel and prints the count together
   * with the elapsed wall-clock time in milliseconds.
   *
   * @param df data frame exposing the columns `contigName`, `start` and `end`
   */
  def baselineBenchmark(df: sql.DataFrame): Unit = {
    // Presence in mitochondrial liver disease gene panel
    val t0 = System.nanoTime
    val hits = df.filter(mito11Predicate(df)).count()
    val millis = (System.nanoTime - t0) / 1000000 // nano to milli
    println(s"\n\t In Mito11 panel: $hits variants, $millis ms")
  }

  /**
   * Builds the panel filter: for every region, in array order, three clauses OR-ed
   * together left to right.
   *
   * @param df data frame whose `contigName`, `start` and `end` columns are referenced
   * @return a boolean column that is true when a row matches at least one clause
   */
  private def mito11Predicate(df: sql.DataFrame): sql.Column = {
    val contig = df("contigName")
    val rowStart = df("start")
    val rowEnd = df("end")

    val clauses = mito11Chr.indices.flatMap { i =>
      val onContig = contig === mito11Chr(i)
      val regionStart = mito11Start(i)
      val regionEnd = mito11End(i)
      Seq(
        // row lies entirely inside the region
        onContig && rowStart >= regionStart && rowEnd <= regionEnd,
        // row spans the region start
        onContig && rowStart <= regionStart && rowEnd >= regionStart,
        // row spans the region end
        onContig && rowStart <= regionEnd && rowEnd >= regionEnd
      )
    }

    // Left fold mirrors the left-associative `a || b || c ...` chain.
    clauses.reduceLeft(_ || _)
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment