Skip to content

Instantly share code, notes, and snippets.

@tbl3rd
Last active July 24, 2018 12:36
Show Gist options
  • Save tbl3rd/90a5c77d4f384f1cd1bff645ff1b6d9b to your computer and use it in GitHub Desktop.
Save tbl3rd/90a5c77d4f384f1cd1bff645ff1b6d9b to your computer and use it in GitHub Desktop.
Parse header metadata from a Variant Call Format file into a Clojure map. (See https://github.com/tbl3rd/vcf)
{"p95Red" "8992",
"Extension(A)" "Extension(A)|Extension|25349|405",
"zcallVersion" "1.0.0.0",
"manifestFile" "Broad_GWAS_supplemental_15061359_A1.bpm",
"chipWellBarcode" "200557070005_R06C01",
"Biotin(High)" "Biotin(High)|Staining|618|4165",
"analysisVersionNumber" "1",
"picardVersion" "07b46e26eb638116226b10df9f3f653b82b8ea95",
"String(MM)" "String(MM)|Stringency|896|237",
"arrayType" "Broad_GWAS_supplemental_15061359_A1",
"extendedManifestFile" "Broad_GWAS_supplemental_15061359_A1.1.2.extended.csv",
"p95Green" "4329",
"fileformat" "VCFv4.2",
"NSB(Bgnd)Red" "NSB(Bgnd)Red|Non-SpecificBinding|360|153",
"Hyb(Low)" "Hyb(Low)|Hybridization|2031|1856",
"Restore" "Restore|Restoration|228|305",
"fingerprintGender" "Unknown",
"fileDate" "Wed May 23 21:21:12 UTC 2018",
"DNP(High)" "DNP(High)|Staining|12573|355",
"contig" {"9" {"ID" "9", "length" "141213431", "assembly" "GRCh37"},
"GL000213.1" {"ID" "GL000213.1",
"length" "164239",
"assembly" "GRCh37"},
"GL000207.1" {"ID" "GL000207.1",
"length" "4262",
"assembly" "GRCh37"},
"3" {"ID" "3", "length" "198022430", "assembly" "GRCh37"},
"GL000248.1" {"ID" "GL000248.1",
"length" "39786",
"assembly" "GRCh37"},
"GL000205.1" {"ID" "GL000205.1",
"length" "174588",
"assembly" "GRCh37"},
"GL000237.1" {"ID" "GL000237.1",
"length" "45867",
"assembly" "GRCh37"},
"GL000196.1" {"ID" "GL000196.1",
"length" "38914",
"assembly" "GRCh37"},
"GL000212.1" {"ID" "GL000212.1",
"length" "186858",
"assembly" "GRCh37"},
"22" {"ID" "22", "length" "51304566", "assembly" "GRCh37"},
"GL000241.1" {"ID" "GL000241.1",
"length" "42152",
"assembly" "GRCh37"},
"4" {"ID" "4", "length" "191154276", "assembly" "GRCh37"},
"8" {"ID" "8", "length" "146364022", "assembly" "GRCh37"},
"14" {"ID" "14", "length" "107349540", "assembly" "GRCh37"},
"GL000239.1" {"ID" "GL000239.1",
"length" "33824",
"assembly" "GRCh37"},
"GL000202.1" {"ID" "GL000202.1",
"length" "40103",
"assembly" "GRCh37"},
"GL000234.1" {"ID" "GL000234.1",
"length" "40531",
"assembly" "GRCh37"},
"21" {"ID" "21", "length" "48129895", "assembly" "GRCh37"},
"GL000246.1" {"ID" "GL000246.1",
"length" "38154",
"assembly" "GRCh37"},
"GL000235.1" {"ID" "GL000235.1",
"length" "34474",
"assembly" "GRCh37"},
"GL000220.1" {"ID" "GL000220.1",
"length" "161802",
"assembly" "GRCh37"},
"NC_007605" {"ID" "NC_007605",
"length" "171823",
"assembly" "NC_007605.1"},
"20" {"ID" "20", "length" "63025520", "assembly" "GRCh37"},
"GL000240.1" {"ID" "GL000240.1",
"length" "41933",
"assembly" "GRCh37"},
"GL000208.1" {"ID" "GL000208.1",
"length" "92689",
"assembly" "GRCh37"},
"GL000230.1" {"ID" "GL000230.1",
"length" "43691",
"assembly" "GRCh37"},
"GL000238.1" {"ID" "GL000238.1",
"length" "39939",
"assembly" "GRCh37"},
"GL000199.1" {"ID" "GL000199.1",
"length" "169874",
"assembly" "GRCh37"},
"19" {"ID" "19", "length" "59128983", "assembly" "GRCh37"},
"17" {"ID" "17", "length" "81195210", "assembly" "GRCh37"},
"Y" {"ID" "Y", "length" "59373566", "assembly" "GRCh37"},
"GL000221.1" {"ID" "GL000221.1",
"length" "155397",
"assembly" "GRCh37"},
"GL000224.1" {"ID" "GL000224.1",
"length" "179693",
"assembly" "GRCh37"},
"GL000215.1" {"ID" "GL000215.1",
"length" "172545",
"assembly" "GRCh37"},
"15" {"ID" "15", "length" "102531392", "assembly" "GRCh37"},
"7" {"ID" "7", "length" "159138663", "assembly" "GRCh37"},
"GL000217.1" {"ID" "GL000217.1",
"length" "172149",
"assembly" "GRCh37"},
"GL000236.1" {"ID" "GL000236.1",
"length" "41934",
"assembly" "GRCh37"},
"5" {"ID" "5", "length" "180915260", "assembly" "GRCh37"},
"18" {"ID" "18", "length" "78077248", "assembly" "GRCh37"},
"12" {"ID" "12", "length" "133851895", "assembly" "GRCh37"},
"GL000242.1" {"ID" "GL000242.1",
"length" "43523",
"assembly" "GRCh37"},
"13" {"ID" "13", "length" "115169878", "assembly" "GRCh37"},
"GL000219.1" {"ID" "GL000219.1",
"length" "179198",
"assembly" "GRCh37"},
"GL000243.1" {"ID" "GL000243.1",
"length" "43341",
"assembly" "GRCh37"},
"GL000195.1" {"ID" "GL000195.1",
"length" "182896",
"assembly" "GRCh37"},
"GL000232.1" {"ID" "GL000232.1",
"length" "40652",
"assembly" "GRCh37"},
"6" {"ID" "6", "length" "171115067", "assembly" "GRCh37"},
"GL000247.1" {"ID" "GL000247.1",
"length" "36422",
"assembly" "GRCh37"},
"GL000211.1" {"ID" "GL000211.1",
"length" "166566",
"assembly" "GRCh37"},
"GL000231.1" {"ID" "GL000231.1",
"length" "27386",
"assembly" "GRCh37"},
"GL000233.1" {"ID" "GL000233.1",
"length" "45941",
"assembly" "GRCh37"},
"GL000216.1" {"ID" "GL000216.1",
"length" "172294",
"assembly" "GRCh37"},
"GL000223.1" {"ID" "GL000223.1",
"length" "180455",
"assembly" "GRCh37"},
"GL000222.1" {"ID" "GL000222.1",
"length" "186861",
"assembly" "GRCh37"},
"GL000227.1" {"ID" "GL000227.1",
"length" "128374",
"assembly" "GRCh37"},
"GL000193.1" {"ID" "GL000193.1",
"length" "189789",
"assembly" "GRCh37"},
"1" {"ID" "1", "length" "249250621", "assembly" "GRCh37"},
"GL000244.1" {"ID" "GL000244.1",
"length" "39929",
"assembly" "GRCh37"},
"GL000229.1" {"ID" "GL000229.1",
"length" "19913",
"assembly" "GRCh37"},
"GL000210.1" {"ID" "GL000210.1",
"length" "27682",
"assembly" "GRCh37"},
"GL000206.1" {"ID" "GL000206.1",
"length" "41001",
"assembly" "GRCh37"},
"GL000194.1" {"ID" "GL000194.1",
"length" "191469",
"assembly" "GRCh37"},
"X" {"ID" "X", "length" "155270560", "assembly" "GRCh37"},
"GL000204.1" {"ID" "GL000204.1",
"length" "81310",
"assembly" "GRCh37"},
"11" {"ID" "11", "length" "135006516", "assembly" "GRCh37"},
"GL000226.1" {"ID" "GL000226.1",
"length" "15008",
"assembly" "GRCh37"},
"GL000198.1" {"ID" "GL000198.1",
"length" "90085",
"assembly" "GRCh37"},
"GL000200.1" {"ID" "GL000200.1",
"length" "187035",
"assembly" "GRCh37"},
"GL000228.1" {"ID" "GL000228.1",
"length" "129120",
"assembly" "GRCh37"},
"GL000192.1" {"ID" "GL000192.1",
"length" "547496",
"assembly" "GRCh37"},
"GL000225.1" {"ID" "GL000225.1",
"length" "211173",
"assembly" "GRCh37"},
"GL000201.1" {"ID" "GL000201.1",
"length" "36148",
"assembly" "GRCh37"},
"2" {"ID" "2", "length" "243199373", "assembly" "GRCh37"},
"GL000245.1" {"ID" "GL000245.1",
"length" "36651",
"assembly" "GRCh37"},
"GL000191.1" {"ID" "GL000191.1",
"length" "106433",
"assembly" "GRCh37"},
"16" {"ID" "16", "length" "90354753", "assembly" "GRCh37"},
"GL000209.1" {"ID" "GL000209.1",
"length" "159169",
"assembly" "GRCh37"},
"10" {"ID" "10", "length" "135534747", "assembly" "GRCh37"},
"GL000214.1" {"ID" "GL000214.1",
"length" "137718",
"assembly" "GRCh37"},
"GL000218.1" {"ID" "GL000218.1",
"length" "161147",
"assembly" "GRCh37"},
"GL000203.1" {"ID" "GL000203.1",
"length" "37498",
"assembly" "GRCh37"},
"GL000249.1" {"ID" "GL000249.1",
"length" "38502",
"assembly" "GRCh37"},
"MT" {"ID" "MT", "length" "16569", "assembly" "GRCh37"},
"GL000197.1" {"ID" "GL000197.1",
"length" "37175",
"assembly" "GRCh37"}},
"NSB(Bgnd)Purple" "NSB(Bgnd)Purple|Non-SpecificBinding|371|161",
"autocallVersion" "2.0.0.137",
"extendedIlluminaManifestVersion" "1.2",
"NSB(Bgnd)Green" "NSB(Bgnd)Green|Non-SpecificBinding|277|149",
"NSB(Bgnd)Blue" "NSB(Bgnd)Blue|Non-SpecificBinding|371|153",
"Extension(G)" "Extension(G)|Extension|2205|8950",
"content" "Broad_GWAS_supplemental_15061359_A1.1.2.extended.csv",
"NP(G)" "NP(G)|Non-Polymorphic|563|4047",
"autocallDate" "05/23/2018 21:15",
"Extension(T)" "Extension(T)|Extension|27882|316",
"scannerName" "N370",
"NP(C)" "NP(C)|Non-Polymorphic|568|4775",
"reference" "/cromwell_root/broad-references/hg19/v0/Homo_sapiens_assembly19.fasta",
"NP(T)" "NP(T)|Non-Polymorphic|7892|204",
"source" "BPM file",
"DNP(Bgnd)" "DNP(Bgnd)|Staining|333|266",
"genomeBuild" "HG19",
"clusterFile" "Broad_GWAS_supplemental_15061359_A1.egt",
"expectedGender" "Female",
"FILTER" {"DUPE" {"ID" "DUPE",
"Description" "Duplicate assays position."},
"FAIL_REF" {"ID" "FAIL_REF",
"Description" "Assay failed to map to reference."},
"TRIALLELIC" {"ID" "TRIALLELIC",
"Description" "Tri-allelic assay."},
"ZCALL_DIFF" {"ID" "ZCALL_DIFF",
"Description" "ZCALL_DIFF"}},
"String(PM)" "String(PM)|Stringency|13564|257",
"autocallGender" "F",
"zcallThresholds" "thresholds.7.txt",
"Extension(C)" "Extension(C)|Extension|1442|8646",
"FORMAT" {"NORMX" {"ID" "NORMX",
"Number" "1",
"Type" "Float",
"Description" "Normalized X intensity"},
"LRR" {"ID" "LRR",
"Number" "1",
"Type" "Float",
"Description" "Log R Ratio"},
"Y" {"ID" "Y",
"Number" "1",
"Type" "Integer",
"Description" "Raw Y intensity"},
"R" {"ID" "R",
"Number" "1",
"Type" "Float",
"Description" "Normalized R value"},
"IGC" {"ID" "IGC",
"Number" "1",
"Type" "Float",
"Description" "Illumina GenCall Confidence Score"},
"NORMY" {"ID" "NORMY",
"Number" "1",
"Type" "Float",
"Description" "Normalized Y intensity"},
"THETA" {"ID" "THETA",
"Number" "1",
"Type" "Float",
"Description" "Normalized Theta value"},
"X" {"ID" "X",
"Number" "1",
"Type" "Integer",
"Description" "Raw X intensity"},
"GTA" {"ID" "GTA",
"Number" "1",
"Type" "String",
"Description" "Illumina Autocall Genotype"},
"GT" {"ID" "GT",
"Number" "1",
"Type" "String",
"Description" "Genotype"},
"BAF" {"ID" "BAF",
"Number" "1",
"Type" "Float",
"Description" "B Allele Frequency"},
"GTZ" {"ID" "GTZ",
"Number" "1",
"Type" "String",
"Description" "zCall Genotype"}},
"Hyb(Medium)" "Hyb(Medium)|Hybridization|517|5068",
"INFO" {"N_AA" {"ID" "N_AA",
"Number" "1",
"Type" "Integer",
"Description" "Number of AA calls in training set"},
"devR_AB" {"ID" "devR_AB",
"Number" "1",
"Type" "Float",
"Description" "Standard deviation of normalized R for AB cluster"},
"meanX_AA" {"ID" "meanX_AA",
"Number" "1",
"Type" "Float",
"Description" "Mean of normalized X for AA cluster"},
"devY_BB" {"ID" "devY_BB",
"Number" "1",
"Type" "Float",
"Description" "Standard deviation of normalized Y for BB cluster"},
"SOURCE" {"ID" "SOURCE",
"Number" "1",
"Type" "String",
"Description" "Probe source"},
"meanY_AA" {"ID" "meanY_AA",
"Number" "1",
"Type" "Float",
"Description" "Mean of normalized Y for AA cluster"},
"N_AB" {"ID" "N_AB",
"Number" "1",
"Type" "Integer",
"Description" "Number of AB calls in training set"},
"ILLUMINA_STRAND" {"ID" "ILLUMINA_STRAND",
"Number" "1",
"Type" "String",
"Description" "Probe strand"},
"ILLUMINA_CHR" {"ID" "ILLUMINA_CHR",
"Number" "1",
"Type" "String",
"Description" "Chromosome in Illumina manifest"},
"devY_AA" {"ID" "devY_AA",
"Number" "1",
"Type" "Float",
"Description" "Standard deviation of normalized Y for AA cluster"},
"devR_AA" {"ID" "devR_AA",
"Number" "1",
"Type" "Float",
"Description" "Standard deviation of normalized R for AA cluster"},
"ILLUMINA_BUILD" {"ID" "ILLUMINA_BUILD",
"Number" "1",
"Type" "String",
"Description" "Genome Build in Illumina manifest"},
"PROBE_B" {"ID" "PROBE_B",
"Number" "1",
"Type" "String",
"Description" "Probe base pair sequence; not missing for strand-ambiguous SNPs"},
"meanTHETA_BB" {"ID" "meanTHETA_BB",
"Number" "1",
"Type" "Float",
"Description" "Mean of normalized THETA for BB cluster"},
"devTHETA_AB" {"ID" "devTHETA_AB",
"Number" "1",
"Type" "Float",
"Description" "Standard deviation of normalized THETA for AB cluster"},
"AC" {"ID" "AC",
"Number" "A",
"Type" "Integer",
"Description" "Allele count in genotypes, for each ALT allele, in the same order as listed"},
"devY_AB" {"ID" "devY_AB",
"Number" "1",
"Type" "Float",
"Description" "Standard deviation of normalized Y for AB cluster"},
"devR_BB" {"ID" "devR_BB",
"Number" "1",
"Type" "Float",
"Description" "Standard deviation of normalized R for BB cluster"},
"meanR_AB" {"ID" "meanR_AB",
"Number" "1",
"Type" "Float",
"Description" "Mean of normalized R for AB cluster"},
"GC_SCORE" {"ID" "GC_SCORE",
"Number" "1",
"Type" "Float",
"Description" "Gentrain Score"},
"AN" {"ID" "AN",
"Number" "1",
"Type" "Integer",
"Description" "Total number of alleles in called genotypes"},
"PROBE_A" {"ID" "PROBE_A",
"Number" "1",
"Type" "String",
"Description" "Probe base pair sequence"},
"AF" {"ID" "AF",
"Number" "A",
"Type" "Float",
"Description" "Allele Frequency, for each ALT allele, in the same order as listed"},
"meanR_AA" {"ID" "meanR_AA",
"Number" "1",
"Type" "Float",
"Description" "Mean of normalized R for AA cluster"},
"devTHETA_AA" {"ID" "devTHETA_AA",
"Number" "1",
"Type" "Float",
"Description" "Standard deviation of normalized THETA for AA cluster"},
"devTHETA_BB" {"ID" "devTHETA_BB",
"Number" "1",
"Type" "Float",
"Description" "Standard deviation of normalized THETA for BB cluster"},
"devX_BB" {"ID" "devX_BB",
"Number" "1",
"Type" "Float",
"Description" "Standard deviation of normalized X for BB cluster"},
"meanTHETA_AA" {"ID" "meanTHETA_AA",
"Number" "1",
"Type" "Float",
"Description" "Mean of normalized THETA for AA cluster"},
"meanY_BB" {"ID" "meanY_BB",
"Number" "1",
"Type" "Float",
"Description" "Mean of normalized Y for BB cluster"},
"meanY_AB" {"ID" "meanY_AB",
"Number" "1",
"Type" "Float",
"Description" "Mean of normalized Y for AB cluster"},
"meanTHETA_AB" {"ID" "meanTHETA_AB",
"Number" "1",
"Type" "Float",
"Description" "Mean of normalized THETA for AB cluster"},
"ALLELE_A" {"ID" "ALLELE_A",
"Number" "1",
"Type" "String",
"Description" "A allele"},
"devX_AB" {"ID" "devX_AB",
"Number" "1",
"Type" "Float",
"Description" "Standard deviation of normalized X for AB cluster"},
"meanR_BB" {"ID" "meanR_BB",
"Number" "1",
"Type" "Float",
"Description" "Mean of normalized R for BB cluster"},
"devX_AA" {"ID" "devX_AA",
"Number" "1",
"Type" "Float",
"Description" "Standard deviation of normalized X for AA cluster"},
"ILLUMINA_POS" {"ID" "ILLUMINA_POS",
"Number" "1",
"Type" "Integer",
"Description" "Position in Illumina manifest"},
"ALLELE_B" {"ID" "ALLELE_B",
"Number" "1",
"Type" "String",
"Description" "B allele"},
"refSNP" {"ID" "refSNP",
"Number" "1",
"Type" "String",
"Description" "dbSNP rs ID"},
"zthresh_X" {"ID" "zthresh_X",
"Number" "1",
"Type" "Float",
"Description" "zCall X threshold"},
"meanX_AB" {"ID" "meanX_AB",
"Number" "1",
"Type" "Float",
"Description" "Mean of normalized X for AB cluster"},
"meanX_BB" {"ID" "meanX_BB",
"Number" "1",
"Type" "Float",
"Description" "Mean of normalized X for BB cluster"},
"N_BB" {"ID" "N_BB",
"Number" "1",
"Type" "Integer",
"Description" "Number of BB calls in training set"},
"zthresh_Y" {"ID" "zthresh_Y",
"Number" "1",
"Type" "Float",
"Description" "zCall Y threshold"},
"BEADSET_ID" {"ID" "BEADSET_ID",
"Number" "1",
"Type" "Integer",
"Description" "Bead set ID for normalization"}},
"Hyb(High)" "Hyb(High)|Hybridization|2227|8113",
"sampleAlias" "NA12878",
"Biotin(Bgnd)" "Biotin(Bgnd)|Staining|456|292",
"imagingDate" "3/2/2017 4:24:38 PM",
"NP(A)" "NP(A)|Non-Polymorphic|7725|255",
"TargetRemoval" "TargetRemoval|TargetRemoval|1156|208"}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment