-
-
Save dnaerys/27cc29be0f11d98c3b936dad808dd47e to your computer and use it in GitHub Desktop.
gene panels
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.rogach.scallop.ScallopConf | |
/** Command-line options of the VCF-to-Delta ETL job.
  *
  * Three options are declared: `path` and `path2save` are mandatory strings,
  * `htsjdk` is a boolean flag that defaults to `false`. `verify()` is invoked
  * in the constructor body, after all options have been declared.
  *
  * @param arguments raw command-line arguments handed to Scallop
  */
class ArgsETL2Delta(arguments: Seq[String]) extends ScallopConf(arguments) {
  // Where the input VCF files are read from.
  val path = opt[String](descr = "path to VCF files to process", required = true)

  // Where the resulting Delta dataset is written.
  val path2save = opt[String](descr = "path to output delta dataset", required = true)

  // Parser switch; off unless given on the command line.
  val htsjdk = opt[Boolean](descr = "use htsjdk VCF parser", default = Some(false))

  verify()
}
/** ETL entry point: loads VCF files through Glow's "vcf" data source, applies
  * the "split_multiallelics" transformer and stores a fixed selection of
  * columns as a Delta dataset, overwriting whatever is at the target path.
  */
object etl2delta {

  /** Runs the ETL job.
    *
    * @param arguments raw command-line arguments, parsed by `ArgsETL2Delta`
    */
  def main(arguments: Array[String]): Unit = {
    import org.apache.spark.sql.SparkSession
    import io.projectglow.Glow

    val cli = new ArgsETL2Delta(arguments)
    val outputPath = cli.path2save()

    // Spark init
    val sessionBuilder = SparkSession.builder()
      .appName(getClass.getSimpleName)
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      // the fast reader setting is the inverse of the --htsjdk flag
      .config("io.projectglow.vcf.fastReaderEnabled", !cli.htsjdk())
    val spark = sessionBuilder.getOrCreate()
    import spark.implicits._

    // Glow init
    val sparkGlow = Glow.register(spark, true) // see https://github.com/projectglow/glow/issues/362

    // etl to delta
    val vcfReader = sparkGlow.read
      .format("vcf")
      .option("includeSampleIds", "false")
      .option("flattenInfoFields", "false")
      .option("validationStringency", "lenient") // warnings on malformed rows
    val variants = vcfReader.load(cli.path())
    val biallelic = Glow.transform("split_multiallelics", variants)

    // Only these columns are persisted to the Delta dataset.
    val keptColumns = Seq(
      $"contigName",
      $"start",
      $"end",
      $"referenceAllele",
      $"alternateAlleles",
      $"splitFromMultiAllelic",
      $"genotypes.calls")

    biallelic
      .select(keptColumns: _*)
      .write
      .format("delta")
      .mode("overwrite")
      .save(outputPath)
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.apache.spark.sql | |
import org.rogach.scallop.ScallopConf | |
/** Command-line options of the gene-panel benchmark.
  *
  * A single mandatory string option, `deltapath`, is declared; `verify()` is
  * invoked in the constructor body once the option exists.
  *
  * @param arguments raw command-line arguments handed to Scallop
  */
class ArgsPanels(arguments: Seq[String]) extends ScallopConf(arguments) {
  // Location of the Delta dataset to benchmark against.
  val deltapath = opt[String](descr = "path to delta lake", required = true)

  verify()
}
object Panels { | |
// GRCh37 | |
// Mitochondrial liver disease gene panel; (c) Genomics England | |
// 11 genes: BCS1L,DGUOK,MPV17,POLG,TRMU,TWNK,TFAM,ACAD9,POLG2,RRM2B,SCO1
val mito11Chr = scala.Array("2", "2", "2", "15", "22", "10", "10", "3", "17", "8", "17") | |
val mito11Start = scala.Array(219523487, 74153953, 27532360, 89859534, 46726772, 102747124, 60144782, 128598439, 62473902, 103216730, 10583654) | |
val mito11End = scala.Array(219528166, 74186088, 27548547, 89878092, 46753237, 102754158, 60158981, 128634910, 62493154, 103251346, 10601692) | |
// Adult solid tumours cancer susceptibility panel; (c) Genomics England | |
// 104 genes: APC,ATM,BAP1,BMPR1A,BRCA1,BRCA2,BRIP1,CBL,CDC73,CDH1,CDK4,CDKN1B,CDKN2A,DDB2,DICER1,EPCAM,ERCC2,ERCC3,ERCC4,ERCC5,FANCA,FANCB,FANCC,FANCD2,FANCE,FANCF,FANCG,FANCI,FANCL,FH,FLCN,HRAS,KIT,KRAS,MAX,MEN1,MET,MLH1,MSH2,MSH6,MUTYH,NF1,NF2,NRAS,NTHL1,PALB2,PMS2,POLD1,POLE,POLH,PTCH1,PTEN,PTPN11,RAD51C,RAD51D,RAF1,RB1,RET,RTEL1,SDHA,SDHAF2,SDHB,SDHC,SDHD,SHOC2,SMAD4,SMARCA4,SMARCB1,SOS1,STK11,SUFU,TERC,TERT,TMEM127,TP53,TSC1,TSC2,VHL,WRAP53,WT1,XPA,XPC,ACD,AIP,BRAF,CHEK2,CTC1,DKC1,ERCC1,EXT1,EXT2,LZTR1,MAP2K1,MAP2K2,PARN,PPP1CB,RIT1,SLX4,SOS2,TINF2,NOP10,PDGFRA,RABL3,SPRED1 | |
val adult104Chr = scala.Array("5", "11", "3", "10", "17", "13", "17", "11", "1", "16", "12", "12", "9", "11", "14", "2", "19", "2", "16", "13", "16", "X", "9", "3", "6", "11", "9", "15", "2", "1", "17", "11", "4", "12", "14", "11", "7", "3", "2", "2", "1", "17", "22", "1", "16", "16", "7", "19", "12", "6", "9", "10", "12", "17", "17", "3", "13", "10", "20", "5", "11", "1", "1", "11", "10", "18", "19", "22", "2", "19", "10", "3", "5", "2", "17", "9", "16", "3", "17", "11", "9", "3", "16", "11", "7", "22", "17", "X", "19", "8", "11", "22", "15", "19", "16", "2", "1", "16", "14", "14", "15", "4", "3", "15") | |
val adult104Start = scala.Array(112043195, 108093211, 52435029, 88516407, 41196312, 32889611, 59758627, 119076752, 193091147, 68771128, 58141510, 12867992, 21967751, 47236493, 95552565, 47572297, 45853095, 128014866, 14014014, 103497194, 89803957, 14861529, 97861336, 10068098, 35420138, 22644079, 35073832, 89787180, 58386378, 241660903, 17115526, 532242, 55524085, 25357723, 65472892, 64570982, 116312444, 37034823, 47630108, 47922669, 45794835, 29421945, 29999545, 115247090, 2089816, 23614488, 6012870, 50887461, 133200348, 43543887, 98205262, 89622870, 112856155, 56769934, 33426811, 12625100, 48877887, 43572475, 62289163, 218356, 61197514, 17345217, 161284047, 111957497, 112679301, 48494410, 11071598, 24129150, 39208537, 1189406, 104263744, 169482308, 1253262, 96914254, 7565097, 135766735, 2097466, 10182692, 7589389, 32409321, 100437191, 14186647, 67691415, 67250512, 140419127, 29083731, 8130191, 153991031, 45910591, 118806729, 44117099, 21333751, 66679155, 4090319, 14529558, 28974506, 155867599, 3631182, 50583847, 24708849, 34633917, 55095264, 120405528, 38544527) | |
val adult104End = scala.Array(112181936, 108239829, 52444366, 88692595, 41277500, 32973805, 59940882, 119178859, 193223031, 68869451, 58149796, 12875305, 21995300, 47260767, 95624347, 47614740, 45874176, 128051752, 14046202, 103528345, 89883065, 14891191, 98079991, 10143614, 35434880, 22647387, 35080013, 89860492, 58468507, 241683061, 17140502, 537287, 55606881, 25403870, 65569413, 64578766, 116438440, 37107380, 47789450, 48037240, 45806142, 29709134, 30094587, 115259515, 2097867, 23652631, 6048756, 50921273, 133263951, 43586701, 98279339, 89731687, 112947717, 56811703, 33448541, 12705725, 49056122, 43625799, 62328416, 256815, 61215001, 17380665, 161332984, 111990353, 112773425, 48611415, 11176071, 24176703, 39351486, 1228428, 104393292, 169482848, 1295184, 96931732, 7590856, 135820020, 2138716, 10193904, 7606820, 32457176, 100459639, 14220283, 67694713, 67258574, 140624564, 29138410, 8151362, 154005964, 45982086, 119124092, 44266979, 21353327, 66784650, 4124126, 14726585, 29025806, 155881195, 3661599, 50698276, 24711880, 34633917, 55095264, 120405528, 38544527) | |
// Possible mitochondrial disorder, nuclear genes panel; (c) Genomics England | |
// 374 genes: AARS2,ABAT,ABCB7,ACAD9,ACO2,AFG3L2,AGK,AIFM1,ANO10,APOPT1,APTX,ATAD3A,ATP5D,ATPAF2,BCS1L,BOLA3,BTD,C12orf65,C19orf70,C1QBP,CA5A,CARS2,CHCHD10,CLPB,CLPP,COA6,COA7,COQ2,COQ4,COQ6,COQ7,COQ8A,COQ8B,COQ9,COX10,COX14,COX15,COX20,COX6A1,COX6B1,COX7B,CYC1,DARS2,DGUOK,DLAT,DLD,DNA2,DNAJC19,DNM1L,DNM2,EARS2,ECHS1,ELAC2,ETFDH,ETHE1,FARS2,FASTKD2,FBXL4,FDX2,FDXR,FH,FLAD1,FOXRED1,GARS,GDAP1,GFER,GFM1,GFM2,GLRX5,GTPBP3,HARS2,HCCS,HIBCH,HLCS,HSD17B10,HSPD1,HTRA2,IARS2,IBA57,ISCA1,ISCA2,ISCU,KARS,LARS2,LIAS,LIPT1,LIPT2,LONP1,LRPPRC,LYRM7,MARS2,MDH2,MECR,MFF,MFN2,MGME1,MICU1,MIPEP,MPC1,MPV17,MRPL3,MRPL44,MRPS2,MRPS22,MRPS34,MSTO1,MTFMT,MTO1,MTPAP,NADK2,NARS2,NAXE,NDUFA1,NDUFA10,NDUFA11,NDUFA2,NDUFA4,NDUFA6,NDUFA9,NDUFAF1,NDUFAF2,NDUFAF3,NDUFAF4,NDUFAF5,NDUFAF6,NDUFAF8,NDUFB11,NDUFB3,NDUFB8,NDUFS1,NDUFS2,NDUFS3,NDUFS4,NDUFS6,NDUFS7,NDUFS8,NDUFV1,NDUFV2,NFU1,NUBPL,OPA1,OPA3,PARS2,PC,PDHA1,PDHB,PDHX,PDP1,PDSS1,PDSS2,PET100,PMPCA,PMPCB,PNPLA8,PNPT1,POLG,POLG2,PPA2,PUS1,QRSL1,RARS2,RMND1,RNASEH1,RRM2B,RTN4IP1,SACS,SARS2,SCO1,SCO2,SDHA,SDHAF1,SDHD,SERAC1,SFXN4,SLC19A2,SLC19A3,SLC25A1,SLC25A12,SLC25A19,SLC25A26,SLC25A3,SLC25A32,SLC25A38,SLC25A4,SLC25A42,SLC25A46,SPG7,SUCLA2,SUCLG1,SURF1,TACO1,TAZ,TIMM50,TIMM8A,TK2,TMEM126B,TMEM70,TOP3A,TPK1,TRIT1,TRMT10C,TRMT5,TRMU,TRNT1,TSFM,TTC19,TUFM,TWNK,TYMP,UQCC2,UQCRB,VARS2,WARS2,YARS2,ATP5A1,ATP5B,ATP5C1,ATP5E,ATP5F1,ATP5G1,ATP5G2,ATP5G3,ATP5H,ATP5I,ATP5J,ATP5J2,ATP5L,ATP5L2,ATP5O,ATPAF1,CEP89,COA1,COA3,COA4,COA5,COQ5,COX11,COX16,COX17,COX18,COX19,COX4I1,COX4I2,COX5A,COX5B,COX6A2,COX6B2,COX6C,COX7A1,COX7C,COX8A,ERAL1,GATB,GATC,IDH3A,IDH3B,LYRM4,MRM2,MRPL12,MRPS14,MRPS16,MRPS23,MRPS7,NDUFA12,NDUFA13,NDUFA3,NDUFA5,NDUFA7,NDUFA8,NDUFAB1,NDUFAF7,NDUFB1,NDUFB10,NDUFB2,NDUFB4,NDUFB5,NDUFB6,NDUFB7,NDUFB9,NDUFC1,NDUFC2,NDUFS5,NDUFV3,NFS1,NSUN3,OXA1L,PET117,POLRMT,PTCD3,SDHAF2,SDHAF3,SDHAF4,SDHB,SDHC,SLC25A21,TARS2,TFAM,TIMM22,TIMMDC1,TMEM65,TXN2,UQCC1,UQCC3,UQCR10,UQCR11,UQCRC1,UQCRC2,UQCRFS1,UQCRH,UQCRQ,YME1L1,ACADM,ACADS,ACADSB,ACADVL,ACAT1
// (continued) C19orf12,CHKB,CISD2,COASY,CPT1A,CPT2,CYCS,D2HGDH,DARS,DCC,DHTKD1,ECSIT,ETFA,ETFB,FXN,G6PC,GATM,GLUD1,HADH,HADHA,HADHB,HMGCL,HMGCS2,HSPA9,HTT,IER3IP1,L2HGDH,NNT,OXCT1,PANK2,PDK3,PDP2,PDPR,PITRM1,PNPLA4,PPOX,PYCR1,QARS,ROBO3,SAMHD1,SLC22A5,SLC25A13,SLC25A20,SLC25A22,SLC25A40,SLC52A2,SLC52A3,SRRT,SSBP1,STAT2,SUCLG2,TANGO2,TIMM44,TMEM126A,TRAP1,VPS13C,WFS1,XPNPEP3
val mito374Chr = scala.Array[String]("6", "16", "X", "3", "22", "18", "7", "X", "3", "14", "9", "1", "19", "17", "2", "2", "3", "12", "19", "17", "16", "13", "22", "11", "19", "1", "1", "4", "9", "14", "16", "1", "19", "16", "17", "12", "10", "1", "12", "19", "X", "8", "1", "2", "11", "7", "10", "3", "12", "19", "16", "10", "17", "4", "19", "6", "2", "6", "19", "17", "1", "1", "11", "7", "8", "16", "3", "5", "14", "19", "5", "X", "2", "21", "X", "2", "2", "1", "1", "9", "14", "12", "16", "3", "4", "2", "11", "19", "2", "5", "2", "7", "1", "2", "1", "20", "10", "13", "6", "2", "3", "2", "9", "3", "16", "1", "15", "6", "10", "5", "11", "1", "X", "2", "19", "5", "7", "22", "12", "15", "5", "3", "6", "20", "8", "17", "X", "2", "10", "2", "1", "11", "5", "5", "19", "11", "11", "18", "2", "14", "3", "19", "1", "11", "X", "3", "11", "8", "10", "6", "19", "9", "7", "7", "2", "15", "17", "4", "12", "6", "6", "6", "2", "8", "6", "13", "19", "17", "22", "5", "19", "11", "6", "10", "1", "2", "22", "2", "17", "3", "12", "8", "3", "4", "19", "5", "16", "13", "2", "9", "17", "X", "19", "X", "16", "11", "8", "17", "7", "1", "3", "14", "22", "3", "12", "17", "16", "10", "22", "6", "8", "6", "1", "12", "18", "12", "10", "20", "1", "17", "12", "2", "17", "4", "21", "7", "11", "22", "21", "1", "19", "7", "17", "11", "2", "12", "17", "14", "3", "4", "7", "16", "20", "15", "2", "16", "19", "8", "19", "5", "11", "17", "4", "12", "15", "20", "6", "7", "17", "1", "10", "17", "17", "12", "19", "19", "7", "19", "9", "16", "2", "14", "16", "7", "3", "3", "9", "19", "8", "4", "11", "1", "21", "20", "3", "14", "20", "19", "2", "11", "7", "6", "1", "1", "14", "1", "10", "17", "3", "8", "22", "20", "11", "22", "19", "3", "16", "19", "1", "5", "10", "1", "12", "10", "17", "11", "19", "22", "4", "17", "11", "1", "7", "2", "2", "18", "10", "19", "15", "19", "9", "17", "15", "10", "4", "2", "2", "1", "1", "5", "4", "18", "14", "5", "5", "20", "X", "16", "16", "10", "X", "1", "17", "3", "11", "20", 
"5", "7", "3", "11", "7", "8", "20", "7", "7", "12", "3", "22", "19", "11", "16", "15", "4", "22") | |
val mito374Start = scala.Array(44267391, 8768422, 74273115, 128598439, 41865129, 12328943, 141250989, 129263337, 43396351, 104029299, 32972604, 1447531, 1241749, 17880723, 219523487, 74362525, 15642848, 123717463, 5678432, 5336097, 87921625, 111293759, 24108021, 72003469, 6361463, 234509202, 53152508, 84182689, 131084815, 74416629, 19078921, 227085237, 41197434, 57481337, 13972813, 50505762, 101471601, 244998624, 120875893, 36139125, 77154935, 145149930, 173793641, 74153953, 111895538, 107531415, 70173821, 180701497, 32832134, 10828755, 23533335, 135175984, 12895708, 159593277, 44010871, 5261277, 207630081, 99316420, 10416103, 72858619, 241660903, 154955814, 126138950, 30634297, 75233365, 2034208, 158362067, 74017029, 95999840, 17445729, 140071011, 11129421, 191054461, 38123189, 53458206, 198351305, 74756504, 220267444, 228353516, 88879461, 74960423, 108956358, 75661622, 45429998, 39460620, 99771418, 74202757, 5691845, 44113647, 130506503, 198570087, 75677369, 29519385, 228189867, 12040238, 17949556, 74127098, 24304328, 166778407, 27532360, 131181056, 224822121, 138391830, 138724648, 1821891, 155579979, 65294845, 74171301, 30598730, 36192694, 78147007, 156561554, 119005450, 240831867, 5891287, 140018325, 10971578, 42481529, 4758261, 41679551, 60240956, 49057892, 97337189, 13765596, 95907995, 79213039, 47001615, 201936156, 102267203, 206979541, 161166894, 47586888, 52856463, 1801514, 1383526, 67798084, 67374323, 9102628, 69622882, 31959162, 193310933, 46030685, 55222571, 66615704, 19362011, 58413357, 34937376, 94870035, 26986588, 107473761, 7694623, 139305110, 102937869, 108110866, 55861400, 89859534, 62473902, 106290234, 132413745, 107077453, 88224096, 151725989, 3592383, 103216730, 107018903, 23902965, 39405906, 10583654, 50961997, 218356, 36486090, 111957497, 158530536, 120900279, 169433147, 228549926, 19163095, 172640880, 73269073, 66119285, 98987369, 104410863, 39424839, 186064395, 19174808, 110073837, 89557325, 48510622, 84650647, 136218610, 61678231, 
153639854, 39971052, 100600649, 66541906, 85339629, 74884672, 18174742, 144149034, 40306723, 101280706, 61438169, 46726772, 3168600, 58176372, 15902694, 28853732, 102747124, 50964181, 33662070, 97238148, 30876019, 119573839, 32880424, 43664110, 57031959, 7830092, 57600522, 111991486, 46970127, 54026510, 176040986, 73034958, 666225, 27088815, 99046098, 118271869, 43035809, 35275757, 47098409, 33369902, 43648055, 40947165, 73583712, 99215773, 120941077, 53029263, 70791798, 119373360, 73921797, 938415, 85832239, 30225691, 75212132, 98262503, 31439052, 55860674, 100885428, 36641824, 85913721, 63742079, 27181956, 152591656, 120884241, 78423840, 2639041, 5102827, 2273866, 79670387, 174979925, 75006510, 55916842, 73257755, 95290831, 19626545, 54606036, 123177051, 8373490, 124894745, 23592323, 37458774, 92582466, 2009509, 140390577, 120315156, 179322478, 32552997, 14676890, 125551344, 140188034, 77779350, 39491990, 44299754, 34255977, 93781760, 23235731, 18118517, 617223, 86333305, 61197514, 96745902, 71276620, 17345217, 161284047, 37147636, 150459887, 60144782, 900357, 119217379, 125324231, 36863083, 33890369, 62437745, 30163358, 1597171, 48636435, 21963981, 29698173, 46769303, 132202252, 27399383, 76190036, 121163538, 124768495, 7120444, 107992243, 30191721, 51017378, 103790135, 40713485, 68522088, 53662101, 25159710, 242673994, 136664247, 49866542, 12110971, 11616731, 76507696, 51848423, 71650175, 41052814, 45653322, 88810243, 108910870, 26413504, 26466038, 24128375, 120290619, 137890571, 3076408, 44681413, 50704281, 43602794, 41730167, 3869486, 24483338, 66912492, 70147529, 3179920, 7866288, 161136200, 79890260, 49133365, 124735282, 35518632, 131705444, 95749532, 48894369, 790475, 87462883, 145577795, 740724, 100472733, 141438121, 56735381, 67410884, 20004537, 7991603, 85359011, 3701640, 62144588, 6271576, 41253081) | |
val mito374End = scala.Array(44281063, 8878432, 74376567, 128634910, 41924993, 12377313, 141355044, 129299861, 43733086, 104073860, 33025166, 1470067, 1244824, 17942523, 219528166, 74375121, 15687329, 123742506, 5680907, 5352150, 87970135, 111365950, 24110630, 72145692, 6368919, 234519795, 53164038, 84206067, 131096351, 74430373, 19091417, 227175246, 41224112, 57495187, 14111994, 50514240, 101491857, 245008359, 120878545, 36149763, 77162870, 145152428, 173827684, 74186088, 111935114, 107572175, 70231879, 180707562, 32898486, 10944164, 23569052, 135187193, 12921504, 159630775, 44031396, 5771813, 207657233, 99395849, 10426691, 72869156, 241683061, 154965587, 126148026, 30673649, 75401107, 2037750, 158410364, 74063196, 96011061, 17453544, 140078889, 11141198, 191208919, 38362536, 53461320, 198381461, 74760472, 220321380, 228369958, 88897676, 74963809, 108963160, 75682541, 45590913, 39479273, 99779620, 74204778, 5720583, 44223144, 130541119, 198573113, 75696826, 29557454, 228222550, 12073571, 17971765, 74385899, 24463558, 166796486, 27548547, 131221827, 224832431, 138396519, 139076065, 1823156, 155718153, 65321977, 74218959, 30663377, 36242381, 78285919, 156564091, 119010625, 240964819, 5904017, 140027370, 10979883, 42486959, 4798454, 41694717, 60448853, 49060928, 97345757, 13799067, 96128683, 79215081, 47004903, 201950473, 102289757, 207024327, 161184185, 47606114, 52979168, 1816719, 1395583, 67804111, 67380006, 9134343, 69664760, 32330430, 193415612, 46105470, 55230187, 66725847, 19379823, 58419584, 35042138, 94938294, 27035727, 107780768, 7696842, 139318213, 102969958, 108210110, 55921045, 89878092, 62493154, 106395238, 132428406, 107116292, 88299721, 151773259, 3606206, 103251346, 107077373, 24007841, 39440495, 10601692, 50964868, 256815, 36487220, 111990353, 158589312, 120925179, 169455241, 228582728, 19166343, 172864766, 73285591, 66438540, 98995946, 104427417, 39438842, 186071536, 19223697, 110100857, 89624176, 48612125, 84687169, 136223552, 61685725, 153650065, 
39984422, 100604184, 66586447, 85347580, 74895018, 18218321, 144533488, 40349183, 101285290, 61448076, 46753237, 3192563, 58201854, 15948329, 28857729, 102754158, 50968485, 33679504, 97247862, 30894236, 119683294, 32908836, 43684300, 57039798, 7849778, 57607437, 112005395, 46973233, 54071192, 176049335, 73043080, 668127, 27107984, 99063954, 118302211, 43036607, 35288284, 47139539, 33462897, 43769316, 40950722, 73588033, 99224978, 120972237, 53046146, 70826448, 119396301, 73935472, 1015235, 85840650, 30232809, 75230509, 98264846, 31439967, 55866182, 100906290, 36643771, 85916779, 63744015, 27188085, 152682175, 120899389, 78464291, 2644865, 5261172, 2281840, 79674556, 174992561, 75012451, 55927417, 73262454, 95397546, 19644285, 54612564, 123198309, 8386280, 124922098, 23607677, 37480546, 92588261, 2011976, 140422590, 120321347, 179345435, 32573160, 14682874, 125580751, 140223705, 77791265, 39500308, 44333414, 34287281, 93847389, 23241007, 18123813, 633597, 86369280, 61215001, 96811075, 71299272, 17380665, 161332984, 37642071, 150480078, 60158981, 906911, 119243937, 125384933, 36878077, 33999944, 62441159, 30166402, 1605480, 48648409, 21994981, 29704448, 46782448, 132203723, 27444195, 76253260, 121177811, 124817827, 7128592, 108018503, 30206364, 51039884, 103810399, 40718295, 68611878, 53679869, 25164980, 242708231, 136743670, 51057784, 12165224, 11639989, 76603813, 51869672, 71715094, 41065386, 45694525, 88854623, 108956331, 26467594, 26513336, 24165110, 120311528, 137911133, 3245676, 44702745, 50779266, 43707507, 41870621, 3907605, 24557954, 66929657, 70195203, 3215003, 7895780, 161147803, 79900288, 49142553, 124751366, 35580246, 131731306, 95951459, 48936426, 798316, 87505672, 145584932, 749131, 100486285, 141487722, 56753939, 67705038, 20053449, 8008805, 85367591, 3767598, 62352672, 6304992, 41363838) | |
/** Benchmark entry point: loads the Delta dataset named on the command line,
  * marks it for in-memory persistence and runs `naiveBenchmarks` 32 times
  * against it.
  *
  * @param arguments raw command-line arguments, parsed by `ArgsPanels`
  */
def main(arguments: Array[String]): Unit = {
  import org.apache.spark.sql.SparkSession

  val cli = new ArgsPanels(arguments)
  val deltaLocation = cli.deltapath()

  // Spark init
  val sessionBuilder = SparkSession.builder()
    .appName(getClass.getSimpleName)
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  val spark = sessionBuilder.getOrCreate()

  // data load
  val loaded = spark.read.format("delta").load(deltaLocation)
  val variants = loaded.persist(org.apache.spark.storage.StorageLevel.MEMORY_ONLY)

  // 32 identical passes over the same persisted DataFrame
  (1 to 32).foreach(_ => naiveBenchmarks(variants))
}
/** Runs the three gene-panel presence benchmarks against `df` and prints one
  * result line per panel (total matching rows and elapsed milliseconds).
  *
  * Each panel is evaluated gene by gene, one `count()` per gene, so a row that
  * matches several genes of a panel is counted once per matching gene.
  *
  * NOTE(review): in the adult-104 tables the last four entries of
  * `adult104End` are identical to the corresponding `adult104Start` values,
  * so those genes are queried as zero-width intervals — confirm the intended
  * end coordinates.
  *
  * @param df variants to query; the columns `contigName`, `start` and `end`
  *           are read
  */
def naiveBenchmarks(df: sql.DataFrame): Unit = {
  // Presence in mitochondrial liver disease gene panel
  reportPanel(df, "Mito 11", mito11Chr, mito11Start, mito11End)
  // Presence in adult solid tumours cancer susceptibility panel
  reportPanel(df, "Adult 104", adult104Chr, adult104Start, adult104End)
  // Presence in possible mitochondrial disorder, nuclear genes panel
  reportPanel(df, "Mito 374", mito374Chr, mito374Start, mito374End)
}

/** Times the per-gene counts of one panel and prints the panel's result line.
  *
  * The three arrays are read in parallel, index by index, over the indices of
  * `chrs`; the timer covers all per-gene counts of the panel.
  *
  * @param df     variants to query
  * @param label  panel name used in the printed line
  * @param chrs   contig name of each gene
  * @param starts start coordinate of each gene
  * @param ends   end coordinate of each gene
  */
private def reportPanel(
    df: sql.DataFrame,
    label: String,
    chrs: Array[String],
    starts: Array[Int],
    ends: Array[Int]): Unit = {
  val startedAt = System.nanoTime
  val variants: Long =
    chrs.indices.map(i => countGeneVariants(df, chrs(i), starts(i), ends(i))).sum
  val elapsedMs = (System.nanoTime - startedAt) / 1000000 // nano to milli
  println(s"\n\t In $label: $variants variants, $elapsedMs ms")
}

/** Counts the rows of `df` on contig `geneChr` that lie inside
  * `[geneStart, geneEnd]`, span `geneStart`, or span `geneEnd`.
  *
  * The contig test is shared by all three cases, so it is stated once and
  * combined with the disjunction of the positional cases.
  */
private def countGeneVariants(
    df: sql.DataFrame,
    geneChr: String,
    geneStart: Int,
    geneEnd: Int): Long = {
  val onContig = df("contigName") === geneChr
  val insideGene = df("start") >= geneStart && df("end") <= geneEnd
  val spansGeneStart = df("start") <= geneStart && df("end") >= geneStart
  val spansGeneEnd = df("start") <= geneEnd && df("end") >= geneEnd
  df.filter(onContig && (insideGene || spansGeneStart || spansGeneEnd)).count()
}
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment