Skip to content

Instantly share code, notes, and snippets.

@dnaerys
Created October 8, 2022 07:42
Show Gist options
  • Save dnaerys/27cc29be0f11d98c3b936dad808dd47e to your computer and use it in GitHub Desktop.
Save dnaerys/27cc29be0f11d98c3b936dad808dd47e to your computer and use it in GitHub Desktop.
gene panels
import org.rogach.scallop.ScallopConf
/** Command-line options for the VCF-to-Delta ETL job (`etl2delta`), parsed with Scallop. */
class ArgsETL2Delta(arguments: Seq[String]) extends ScallopConf(arguments) {
  /** Mandatory: where the input VCF files are read from. */
  val path = opt[String](descr = "path to VCF files to process", required = true)

  /** Mandatory: where the resulting Delta dataset is written. */
  val path2save = opt[String](descr = "path to output delta dataset", required = true)

  /** Optional flag, off unless given: switch to the htsjdk VCF parser. */
  val htsjdk = opt[Boolean](descr = "use htsjdk VCF parser", default = Some(false))

  // Must come after all option declarations.
  verify()
}
/** ETL job: reads VCF files through Glow, applies the `split_multiallelics` transformer and
  * saves a subset of the columns as a Delta dataset.
  */
object etl2delta {
  /** Job entry point.
    *
    * @param arguments raw command-line arguments, parsed by [[ArgsETL2Delta]]
    */
  def main(arguments: Array[String]): Unit = {
    val args = new ArgsETL2Delta(arguments)
    val delta_silver_path = args.path2save()

    // Spark init
    import org.apache.spark.sql.SparkSession
    val spark = SparkSession
      .builder()
      // The runtime class of a Scala object is named "etl2delta$"; drop the trailing '$'
      // so the application shows up under a clean name.
      .appName(getClass.getSimpleName.stripSuffix("$"))
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      // The fast reader is enabled unless the htsjdk parser was requested on the command line.
      .config("io.projectglow.vcf.fastReaderEnabled", !args.htsjdk())
      .getOrCreate()
    import spark.implicits._

    // Glow init
    import io.projectglow.Glow
    val sparkGlow = Glow.register(spark, true) // see https://github.com/projectglow/glow/issues/362

    // etl to delta
    val df = sparkGlow
      .read
      .format("vcf")
      .option("includeSampleIds", "false")
      .option("flattenInfoFields", "false")
      .option("validationStringency", "lenient") // warnings on malformed rows
      .load(args.path())

    val dfBiallelic = Glow.transform("split_multiallelics", df)

    // Only these columns are written; any existing data at the target path is overwritten.
    dfBiallelic
      .select(
        $"contigName",
        $"start",
        $"end",
        $"referenceAllele",
        $"alternateAlleles",
        $"splitFromMultiAllelic",
        $"genotypes.calls")
      .write
      .mode("overwrite")
      .format("delta")
      .save(delta_silver_path)
  }
}
import org.apache.spark.sql
import org.rogach.scallop.ScallopConf
/** Command-line options for the gene-panel benchmark (`Panels`), parsed with Scallop. */
class ArgsPanels(arguments: Seq[String]) extends ScallopConf(arguments) {
  /** Mandatory: location of the Delta dataset to benchmark against. */
  val deltapath = opt[String](descr = "path to delta lake", required = true)

  // Must come after all option declarations.
  verify()
}
object Panels {
// GRCh37
// Mitochondrial liver disease gene panel; (c) Genomics England
// 11 genes: BCS1L,DGUOK,MPV17,POLG,TRMU,TWNK,TFAM,ACAD9,POLG2,RRM2B,SCO1
// The three arrays below are parallel: element i of each gives the contig name, start and end
// of one region (naiveBenchmarks reads them with a shared index).
// Presumably the regions are listed in the same order as the gene names above — verify if reordering.
val mito11Chr = scala.Array("2", "2", "2", "15", "22", "10", "10", "3", "17", "8", "17")
val mito11Start = scala.Array(219523487, 74153953, 27532360, 89859534, 46726772, 102747124, 60144782, 128598439, 62473902, 103216730, 10583654)
val mito11End = scala.Array(219528166, 74186088, 27548547, 89878092, 46753237, 102754158, 60158981, 128634910, 62493154, 103251346, 10601692)
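// Adult solid tumours cancer susceptibility panel; (c) Genomics England
// NOTE(review): the last four values of adult104End (34633917, 55095264, 120405528, 38544527) are identical to the last four values of adult104Start, so those four regions collapse to a single position and are under-counted by the benchmark; this looks like a copy-paste slip — verify the end coordinates against the GRCh37 gene annotation.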
// 104 genes: APC,ATM,BAP1,BMPR1A,BRCA1,BRCA2,BRIP1,CBL,CDC73,CDH1,CDK4,CDKN1B,CDKN2A,DDB2,DICER1,EPCAM,ERCC2,ERCC3,ERCC4,ERCC5,FANCA,FANCB,FANCC,FANCD2,FANCE,FANCF,FANCG,FANCI,FANCL,FH,FLCN,HRAS,KIT,KRAS,MAX,MEN1,MET,MLH1,MSH2,MSH6,MUTYH,NF1,NF2,NRAS,NTHL1,PALB2,PMS2,POLD1,POLE,POLH,PTCH1,PTEN,PTPN11,RAD51C,RAD51D,RAF1,RB1,RET,RTEL1,SDHA,SDHAF2,SDHB,SDHC,SDHD,SHOC2,SMAD4,SMARCA4,SMARCB1,SOS1,STK11,SUFU,TERC,TERT,TMEM127,TP53,TSC1,TSC2,VHL,WRAP53,WT1,XPA,XPC,ACD,AIP,BRAF,CHEK2,CTC1,DKC1,ERCC1,EXT1,EXT2,LZTR1,MAP2K1,MAP2K2,PARN,PPP1CB,RIT1,SLX4,SOS2,TINF2,NOP10,PDGFRA,RABL3,SPRED1
val adult104Chr = scala.Array("5", "11", "3", "10", "17", "13", "17", "11", "1", "16", "12", "12", "9", "11", "14", "2", "19", "2", "16", "13", "16", "X", "9", "3", "6", "11", "9", "15", "2", "1", "17", "11", "4", "12", "14", "11", "7", "3", "2", "2", "1", "17", "22", "1", "16", "16", "7", "19", "12", "6", "9", "10", "12", "17", "17", "3", "13", "10", "20", "5", "11", "1", "1", "11", "10", "18", "19", "22", "2", "19", "10", "3", "5", "2", "17", "9", "16", "3", "17", "11", "9", "3", "16", "11", "7", "22", "17", "X", "19", "8", "11", "22", "15", "19", "16", "2", "1", "16", "14", "14", "15", "4", "3", "15")
val adult104Start = scala.Array(112043195, 108093211, 52435029, 88516407, 41196312, 32889611, 59758627, 119076752, 193091147, 68771128, 58141510, 12867992, 21967751, 47236493, 95552565, 47572297, 45853095, 128014866, 14014014, 103497194, 89803957, 14861529, 97861336, 10068098, 35420138, 22644079, 35073832, 89787180, 58386378, 241660903, 17115526, 532242, 55524085, 25357723, 65472892, 64570982, 116312444, 37034823, 47630108, 47922669, 45794835, 29421945, 29999545, 115247090, 2089816, 23614488, 6012870, 50887461, 133200348, 43543887, 98205262, 89622870, 112856155, 56769934, 33426811, 12625100, 48877887, 43572475, 62289163, 218356, 61197514, 17345217, 161284047, 111957497, 112679301, 48494410, 11071598, 24129150, 39208537, 1189406, 104263744, 169482308, 1253262, 96914254, 7565097, 135766735, 2097466, 10182692, 7589389, 32409321, 100437191, 14186647, 67691415, 67250512, 140419127, 29083731, 8130191, 153991031, 45910591, 118806729, 44117099, 21333751, 66679155, 4090319, 14529558, 28974506, 155867599, 3631182, 50583847, 24708849, 34633917, 55095264, 120405528, 38544527)
val adult104End = scala.Array(112181936, 108239829, 52444366, 88692595, 41277500, 32973805, 59940882, 119178859, 193223031, 68869451, 58149796, 12875305, 21995300, 47260767, 95624347, 47614740, 45874176, 128051752, 14046202, 103528345, 89883065, 14891191, 98079991, 10143614, 35434880, 22647387, 35080013, 89860492, 58468507, 241683061, 17140502, 537287, 55606881, 25403870, 65569413, 64578766, 116438440, 37107380, 47789450, 48037240, 45806142, 29709134, 30094587, 115259515, 2097867, 23652631, 6048756, 50921273, 133263951, 43586701, 98279339, 89731687, 112947717, 56811703, 33448541, 12705725, 49056122, 43625799, 62328416, 256815, 61215001, 17380665, 161332984, 111990353, 112773425, 48611415, 11176071, 24176703, 39351486, 1228428, 104393292, 169482848, 1295184, 96931732, 7590856, 135820020, 2138716, 10193904, 7606820, 32457176, 100459639, 14220283, 67694713, 67258574, 140624564, 29138410, 8151362, 154005964, 45982086, 119124092, 44266979, 21353327, 66784650, 4124126, 14726585, 29025806, 155881195, 3661599, 50698276, 24711880, 34633917, 55095264, 120405528, 38544527)
// Possible mitochondrial disorder, nuclear genes panel; (c) Genomics England
// 374 genes: AARS2,ABAT,ABCB7,ACAD9,ACO2,AFG3L2,AGK,AIFM1,ANO10,APOPT1,APTX,ATAD3A,ATP5D,ATPAF2,BCS1L,BOLA3,BTD,C12orf65,C19orf70,C1QBP,CA5A,CARS2,CHCHD10,CLPB,CLPP,COA6,COA7,COQ2,COQ4,COQ6,COQ7,COQ8A,COQ8B,COQ9,COX10,COX14,COX15,COX20,COX6A1,COX6B1,COX7B,CYC1,DARS2,DGUOK,DLAT,DLD,DNA2,DNAJC19,DNM1L,DNM2,EARS2,ECHS1,ELAC2,ETFDH,ETHE1,FARS2,FASTKD2,FBXL4,FDX2,FDXR,FH,FLAD1,FOXRED1,GARS,GDAP1,GFER,GFM1,GFM2,GLRX5,GTPBP3,HARS2,HCCS,HIBCH,HLCS,HSD17B10,HSPD1,HTRA2,IARS2,IBA57,ISCA1,ISCA2,ISCU,KARS,LARS2,LIAS,LIPT1,LIPT2,LONP1,LRPPRC,LYRM7,MARS2,MDH2,MECR,MFF,MFN2,MGME1,MICU1,MIPEP,MPC1,MPV17,MRPL3,MRPL44,MRPS2,MRPS22,MRPS34,MSTO1,MTFMT,MTO1,MTPAP,NADK2,NARS2,NAXE,NDUFA1,NDUFA10,NDUFA11,NDUFA2,NDUFA4,NDUFA6,NDUFA9,NDUFAF1,NDUFAF2,NDUFAF3,NDUFAF4,NDUFAF5,NDUFAF6,NDUFAF8,NDUFB11,NDUFB3,NDUFB8,NDUFS1,NDUFS2,NDUFS3,NDUFS4,NDUFS6,NDUFS7,NDUFS8,NDUFV1,NDUFV2,NFU1,NUBPL,OPA1,OPA3,PARS2,PC,PDHA1,PDHB,PDHX,PDP1,PDSS1,PDSS2,PET100,PMPCA,PMPCB,PNPLA8,PNPT1,POLG,POLG2,PPA2,PUS1,QRSL1,RARS2,RMND1,RNASEH1,RRM2B,RTN4IP1,SACS,SARS2,SCO1,SCO2,SDHA,SDHAF1,SDHD,SERAC1,SFXN4,SLC19A2,SLC19A3,SLC25A1,SLC25A12,SLC25A19,SLC25A26,SLC25A3,SLC25A32,SLC25A38,SLC25A4,SLC25A42,SLC25A46,SPG7,SUCLA2,SUCLG1,SURF1,TACO1,TAZ,TIMM50,TIMM8A,TK2,TMEM126B,TMEM70,TOP3A,TPK1,TRIT1,TRMT10C,TRMT5,TRMU,TRNT1,TSFM,TTC19,TUFM,TWNK,TYMP,UQCC2,UQCRB,VARS2,WARS2,YARS2,ATP5A1,ATP5B,ATP5C1,ATP5E,ATP5F1,ATP5G1,ATP5G2,ATP5G3,ATP5H,ATP5I,ATP5J,ATP5J2,ATP5L,ATP5L2,ATP5O,ATPAF1,CEP89,COA1,COA3,COA4,COA5,COQ5,COX11,COX16,COX17,COX18,COX19,COX4I1,COX4I2,COX5A,COX5B,COX6A2,COX6B2,COX6C,COX7A1,COX7C,COX8A,ERAL1,GATB,GATC,IDH3A,IDH3B,LYRM4,MRM2,MRPL12,MRPS14,MRPS16,MRPS23,MRPS7,NDUFA12,NDUFA13,NDUFA3,NDUFA5,NDUFA7,NDUFA8,NDUFAB1,NDUFAF7,NDUFB1,NDUFB10,NDUFB2,NDUFB4,NDUFB5,NDUFB6,NDUFB7,NDUFB9,NDUFC1,NDUFC2,NDUFS5,NDUFV3,NFS1,NSUN3,OXA1L,PET117,POLRMT,PTCD3,SDHAF2,SDHAF3,SDHAF4,SDHB,SDHC,SLC25A21,TARS2,TFAM,TIMM22,TIMMDC1,TMEM65,TXN2,UQCC1,UQCC3,UQCR10,UQCR11,UQCRC1,UQCRC2,UQCRFS1,UQCRH,UQCRQ,YME1L1,ACADM,ACADS,ACADSB,ACADVL,ACAT1
// (gene list continued) C19orf12,CHKB,CISD2,COASY,CPT1A,CPT2,CYCS,D2HGDH,DARS,DCC,DHTKD1,ECSIT,ETFA,ETFB,FXN,G6PC,GATM,GLUD1,HADH,HADHA,HADHB,HMGCL,HMGCS2,HSPA9,HTT,IER3IP1,L2HGDH,NNT,OXCT1,PANK2,PDK3,PDP2,PDPR,PITRM1,PNPLA4,PPOX,PYCR1,QARS,ROBO3,SAMHD1,SLC22A5,SLC25A13,SLC25A20,SLC25A22,SLC25A40,SLC52A2,SLC52A3,SRRT,SSBP1,STAT2,SUCLG2,TANGO2,TIMM44,TMEM126A,TRAP1,VPS13C,WFS1,XPNPEP3
val mito374Chr = scala.Array[String]("6", "16", "X", "3", "22", "18", "7", "X", "3", "14", "9", "1", "19", "17", "2", "2", "3", "12", "19", "17", "16", "13", "22", "11", "19", "1", "1", "4", "9", "14", "16", "1", "19", "16", "17", "12", "10", "1", "12", "19", "X", "8", "1", "2", "11", "7", "10", "3", "12", "19", "16", "10", "17", "4", "19", "6", "2", "6", "19", "17", "1", "1", "11", "7", "8", "16", "3", "5", "14", "19", "5", "X", "2", "21", "X", "2", "2", "1", "1", "9", "14", "12", "16", "3", "4", "2", "11", "19", "2", "5", "2", "7", "1", "2", "1", "20", "10", "13", "6", "2", "3", "2", "9", "3", "16", "1", "15", "6", "10", "5", "11", "1", "X", "2", "19", "5", "7", "22", "12", "15", "5", "3", "6", "20", "8", "17", "X", "2", "10", "2", "1", "11", "5", "5", "19", "11", "11", "18", "2", "14", "3", "19", "1", "11", "X", "3", "11", "8", "10", "6", "19", "9", "7", "7", "2", "15", "17", "4", "12", "6", "6", "6", "2", "8", "6", "13", "19", "17", "22", "5", "19", "11", "6", "10", "1", "2", "22", "2", "17", "3", "12", "8", "3", "4", "19", "5", "16", "13", "2", "9", "17", "X", "19", "X", "16", "11", "8", "17", "7", "1", "3", "14", "22", "3", "12", "17", "16", "10", "22", "6", "8", "6", "1", "12", "18", "12", "10", "20", "1", "17", "12", "2", "17", "4", "21", "7", "11", "22", "21", "1", "19", "7", "17", "11", "2", "12", "17", "14", "3", "4", "7", "16", "20", "15", "2", "16", "19", "8", "19", "5", "11", "17", "4", "12", "15", "20", "6", "7", "17", "1", "10", "17", "17", "12", "19", "19", "7", "19", "9", "16", "2", "14", "16", "7", "3", "3", "9", "19", "8", "4", "11", "1", "21", "20", "3", "14", "20", "19", "2", "11", "7", "6", "1", "1", "14", "1", "10", "17", "3", "8", "22", "20", "11", "22", "19", "3", "16", "19", "1", "5", "10", "1", "12", "10", "17", "11", "19", "22", "4", "17", "11", "1", "7", "2", "2", "18", "10", "19", "15", "19", "9", "17", "15", "10", "4", "2", "2", "1", "1", "5", "4", "18", "14", "5", "5", "20", "X", "16", "16", "10", "X", "1", "17", "3", "11", "20", 
"5", "7", "3", "11", "7", "8", "20", "7", "7", "12", "3", "22", "19", "11", "16", "15", "4", "22")
val mito374Start = scala.Array(44267391, 8768422, 74273115, 128598439, 41865129, 12328943, 141250989, 129263337, 43396351, 104029299, 32972604, 1447531, 1241749, 17880723, 219523487, 74362525, 15642848, 123717463, 5678432, 5336097, 87921625, 111293759, 24108021, 72003469, 6361463, 234509202, 53152508, 84182689, 131084815, 74416629, 19078921, 227085237, 41197434, 57481337, 13972813, 50505762, 101471601, 244998624, 120875893, 36139125, 77154935, 145149930, 173793641, 74153953, 111895538, 107531415, 70173821, 180701497, 32832134, 10828755, 23533335, 135175984, 12895708, 159593277, 44010871, 5261277, 207630081, 99316420, 10416103, 72858619, 241660903, 154955814, 126138950, 30634297, 75233365, 2034208, 158362067, 74017029, 95999840, 17445729, 140071011, 11129421, 191054461, 38123189, 53458206, 198351305, 74756504, 220267444, 228353516, 88879461, 74960423, 108956358, 75661622, 45429998, 39460620, 99771418, 74202757, 5691845, 44113647, 130506503, 198570087, 75677369, 29519385, 228189867, 12040238, 17949556, 74127098, 24304328, 166778407, 27532360, 131181056, 224822121, 138391830, 138724648, 1821891, 155579979, 65294845, 74171301, 30598730, 36192694, 78147007, 156561554, 119005450, 240831867, 5891287, 140018325, 10971578, 42481529, 4758261, 41679551, 60240956, 49057892, 97337189, 13765596, 95907995, 79213039, 47001615, 201936156, 102267203, 206979541, 161166894, 47586888, 52856463, 1801514, 1383526, 67798084, 67374323, 9102628, 69622882, 31959162, 193310933, 46030685, 55222571, 66615704, 19362011, 58413357, 34937376, 94870035, 26986588, 107473761, 7694623, 139305110, 102937869, 108110866, 55861400, 89859534, 62473902, 106290234, 132413745, 107077453, 88224096, 151725989, 3592383, 103216730, 107018903, 23902965, 39405906, 10583654, 50961997, 218356, 36486090, 111957497, 158530536, 120900279, 169433147, 228549926, 19163095, 172640880, 73269073, 66119285, 98987369, 104410863, 39424839, 186064395, 19174808, 110073837, 89557325, 48510622, 84650647, 136218610, 61678231, 
153639854, 39971052, 100600649, 66541906, 85339629, 74884672, 18174742, 144149034, 40306723, 101280706, 61438169, 46726772, 3168600, 58176372, 15902694, 28853732, 102747124, 50964181, 33662070, 97238148, 30876019, 119573839, 32880424, 43664110, 57031959, 7830092, 57600522, 111991486, 46970127, 54026510, 176040986, 73034958, 666225, 27088815, 99046098, 118271869, 43035809, 35275757, 47098409, 33369902, 43648055, 40947165, 73583712, 99215773, 120941077, 53029263, 70791798, 119373360, 73921797, 938415, 85832239, 30225691, 75212132, 98262503, 31439052, 55860674, 100885428, 36641824, 85913721, 63742079, 27181956, 152591656, 120884241, 78423840, 2639041, 5102827, 2273866, 79670387, 174979925, 75006510, 55916842, 73257755, 95290831, 19626545, 54606036, 123177051, 8373490, 124894745, 23592323, 37458774, 92582466, 2009509, 140390577, 120315156, 179322478, 32552997, 14676890, 125551344, 140188034, 77779350, 39491990, 44299754, 34255977, 93781760, 23235731, 18118517, 617223, 86333305, 61197514, 96745902, 71276620, 17345217, 161284047, 37147636, 150459887, 60144782, 900357, 119217379, 125324231, 36863083, 33890369, 62437745, 30163358, 1597171, 48636435, 21963981, 29698173, 46769303, 132202252, 27399383, 76190036, 121163538, 124768495, 7120444, 107992243, 30191721, 51017378, 103790135, 40713485, 68522088, 53662101, 25159710, 242673994, 136664247, 49866542, 12110971, 11616731, 76507696, 51848423, 71650175, 41052814, 45653322, 88810243, 108910870, 26413504, 26466038, 24128375, 120290619, 137890571, 3076408, 44681413, 50704281, 43602794, 41730167, 3869486, 24483338, 66912492, 70147529, 3179920, 7866288, 161136200, 79890260, 49133365, 124735282, 35518632, 131705444, 95749532, 48894369, 790475, 87462883, 145577795, 740724, 100472733, 141438121, 56735381, 67410884, 20004537, 7991603, 85359011, 3701640, 62144588, 6271576, 41253081)
val mito374End = scala.Array(44281063, 8878432, 74376567, 128634910, 41924993, 12377313, 141355044, 129299861, 43733086, 104073860, 33025166, 1470067, 1244824, 17942523, 219528166, 74375121, 15687329, 123742506, 5680907, 5352150, 87970135, 111365950, 24110630, 72145692, 6368919, 234519795, 53164038, 84206067, 131096351, 74430373, 19091417, 227175246, 41224112, 57495187, 14111994, 50514240, 101491857, 245008359, 120878545, 36149763, 77162870, 145152428, 173827684, 74186088, 111935114, 107572175, 70231879, 180707562, 32898486, 10944164, 23569052, 135187193, 12921504, 159630775, 44031396, 5771813, 207657233, 99395849, 10426691, 72869156, 241683061, 154965587, 126148026, 30673649, 75401107, 2037750, 158410364, 74063196, 96011061, 17453544, 140078889, 11141198, 191208919, 38362536, 53461320, 198381461, 74760472, 220321380, 228369958, 88897676, 74963809, 108963160, 75682541, 45590913, 39479273, 99779620, 74204778, 5720583, 44223144, 130541119, 198573113, 75696826, 29557454, 228222550, 12073571, 17971765, 74385899, 24463558, 166796486, 27548547, 131221827, 224832431, 138396519, 139076065, 1823156, 155718153, 65321977, 74218959, 30663377, 36242381, 78285919, 156564091, 119010625, 240964819, 5904017, 140027370, 10979883, 42486959, 4798454, 41694717, 60448853, 49060928, 97345757, 13799067, 96128683, 79215081, 47004903, 201950473, 102289757, 207024327, 161184185, 47606114, 52979168, 1816719, 1395583, 67804111, 67380006, 9134343, 69664760, 32330430, 193415612, 46105470, 55230187, 66725847, 19379823, 58419584, 35042138, 94938294, 27035727, 107780768, 7696842, 139318213, 102969958, 108210110, 55921045, 89878092, 62493154, 106395238, 132428406, 107116292, 88299721, 151773259, 3606206, 103251346, 107077373, 24007841, 39440495, 10601692, 50964868, 256815, 36487220, 111990353, 158589312, 120925179, 169455241, 228582728, 19166343, 172864766, 73285591, 66438540, 98995946, 104427417, 39438842, 186071536, 19223697, 110100857, 89624176, 48612125, 84687169, 136223552, 61685725, 153650065, 
39984422, 100604184, 66586447, 85347580, 74895018, 18218321, 144533488, 40349183, 101285290, 61448076, 46753237, 3192563, 58201854, 15948329, 28857729, 102754158, 50968485, 33679504, 97247862, 30894236, 119683294, 32908836, 43684300, 57039798, 7849778, 57607437, 112005395, 46973233, 54071192, 176049335, 73043080, 668127, 27107984, 99063954, 118302211, 43036607, 35288284, 47139539, 33462897, 43769316, 40950722, 73588033, 99224978, 120972237, 53046146, 70826448, 119396301, 73935472, 1015235, 85840650, 30232809, 75230509, 98264846, 31439967, 55866182, 100906290, 36643771, 85916779, 63744015, 27188085, 152682175, 120899389, 78464291, 2644865, 5261172, 2281840, 79674556, 174992561, 75012451, 55927417, 73262454, 95397546, 19644285, 54612564, 123198309, 8386280, 124922098, 23607677, 37480546, 92588261, 2011976, 140422590, 120321347, 179345435, 32573160, 14682874, 125580751, 140223705, 77791265, 39500308, 44333414, 34287281, 93847389, 23241007, 18123813, 633597, 86369280, 61215001, 96811075, 71299272, 17380665, 161332984, 37642071, 150480078, 60158981, 906911, 119243937, 125384933, 36878077, 33999944, 62441159, 30166402, 1605480, 48648409, 21994981, 29704448, 46782448, 132203723, 27444195, 76253260, 121177811, 124817827, 7128592, 108018503, 30206364, 51039884, 103810399, 40718295, 68611878, 53679869, 25164980, 242708231, 136743670, 51057784, 12165224, 11639989, 76603813, 51869672, 71715094, 41065386, 45694525, 88854623, 108956331, 26467594, 26513336, 24165110, 120311528, 137911133, 3245676, 44702745, 50779266, 43707507, 41870621, 3907605, 24557954, 66929657, 70195203, 3215003, 7895780, 161147803, 79900288, 49142553, 124751366, 35580246, 131731306, 95951459, 48936426, 798316, 87505672, 145584932, 749131, 100486285, 141487722, 56753939, 67705038, 20053449, 8008805, 85367591, 3767598, 62352672, 6304992, 41363838)
/** Benchmark entry point: loads the Delta dataset named by the `deltapath` option, marks it
  * for in-memory persistence and runs [[naiveBenchmarks]] a fixed number of times.
  *
  * @param arguments raw command-line arguments, parsed by [[ArgsPanels]]
  */
def main(arguments: Array[String]): Unit = {
  val args = new ArgsPanels(arguments)
  val delta_silver_path = args.deltapath()

  // Spark init
  import org.apache.spark.sql.SparkSession
  val spark = SparkSession
    .builder()
    // The runtime class of a Scala object is named "Panels$"; drop the trailing '$'
    // so the application shows up under a clean name.
    .appName(getClass.getSimpleName.stripSuffix("$"))
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .getOrCreate()

  // data load
  val df = spark
    .read
    .format("delta")
    .load(delta_silver_path)
    .persist(org.apache.spark.storage.StorageLevel.MEMORY_ONLY)

  // Number of times the whole benchmark suite is repeated.
  val benchmarkRuns = 32
  (1 to benchmarkRuns).foreach(_ => naiveBenchmarks(df))
}
/** Runs the three gene-panel benchmarks once against `df`.
  *
  * For each panel it prints the summed per-region variant count and the elapsed wall-clock
  * time in milliseconds.
  *
  * @param df variant table; the columns `contigName`, `start` and `end` are read
  */
def naiveBenchmarks(df: sql.DataFrame): Unit = {
  // Presence in mitochondrial liver disease gene panel
  timePanel(df, "Mito 11", mito11Chr, mito11Start, mito11End)
  // Presence in adult solid tumours cancer susceptibility panel
  timePanel(df, "Adult 104", adult104Chr, adult104Start, adult104End)
  // Presence in possible mitochondrial disorder, nuclear genes panel
  timePanel(df, "Mito 374", mito374Chr, mito374Start, mito374End)
}

/** Times one panel and prints its result line. */
private def timePanel(
    df: sql.DataFrame,
    label: String,
    chrs: Array[String],
    starts: Array[Int],
    ends: Array[Int]): Unit = {
  val startedAt = System.nanoTime
  val variants = countPanelVariants(df, chrs, starts, ends)
  val elapsedMs = (System.nanoTime - startedAt) / 1000000 // nano to milli
  println(s"\n\t In $label: $variants variants, $elapsedMs ms")
}

/** Sums the per-region row counts of a panel, running one count per region in array order.
  *
  * Counts are added up region by region, so a row matching several regions is counted
  * once for each of them.
  */
private def countPanelVariants(
    df: sql.DataFrame,
    chrs: Array[String],
    starts: Array[Int],
    ends: Array[Int]): Long =
  chrs.indices.map { i =>
    df.filter(regionPredicate(df, chrs(i), starts(i), ends(i))).count()
  }.sum

/** Builds the row filter for one region: same contig, and the row either lies inside the
  * region, spans the region start, or spans the region end.
  */
private def regionPredicate(
    df: sql.DataFrame,
    chr: String,
    regionStart: Int,
    regionEnd: Int): sql.Column = {
  val rowStart = df("start")
  val rowEnd = df("end")
  // The contig test is shared by all three alternatives, so it is factored out of the OR.
  df("contigName") === chr && (
    (rowStart >= regionStart && rowEnd <= regionEnd) ||
      (rowStart <= regionStart && rowEnd >= regionStart) ||
      (rowStart <= regionEnd && rowEnd >= regionEnd))
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment