Last active
August 22, 2017 19:27
-
-
Save Anishka0107/0e2f485eda87b9ec246b40ee61d2cde6 to your computer and use it in GitHub Desktop.
Variant Call Format (VCF) formal grammar
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Grammar attributes: declares the lexer tokens referenced by the rules of this grammar. | |
// Each token is given as a 'regexp:' pattern: | |
//   digit           a single decimal digit (\d) | |
//   alpha           a single alphabetic character (\p{Alpha}) | |
//   punct           a single punctuation character (\p{Punct}) | |
//   CS              the tab character; the header and record rules use it between columns | |
//   newline         a line feed (\n) | |
//   newlinewindows  a carriage return followed by a line feed (\r\n) | |
{ | |
tokens=[ | |
digit='regexp:\d' | |
alpha='regexp:\p{Alpha}' | |
punct='regexp:\p{Punct}' | |
CS='regexp:\t' | |
newline='regexp:\n' | |
newlinewindows='regexp:\r\n' | |
] | |
} | |
// Top-level rule: delegates to main_vcf | |
syntax ::= main_vcf | |
// Character classes built on the lexer tokens | |
alnum ::= digit | alpha | |
space ::= " " | |
// Printable characters: alphanumerics, punctuation and the plain space | |
print ::= alnum | punct | space | |
// Any character: the printable ones plus the control characters 0x01-0x1F. | |
// Codes 14 to 31 are spelled in hexadecimal (0x0E-0x1F); writing them as "\x014".."\x031" put decimal | |
// values into a hexadecimal escape, which does not denote those control characters. | |
// NOTE(review): confirm that the parser generator expands "\x", "\a" and "\v" escapes inside string literals. | |
any ::= print | "\x01" | "\x02" | "\x03" | "\x04" | "\x05" | "\x06" | "\a" | "\b" | "\t" | "\n" | "\v" | "\f" | "\r" | "\x0E" | "\x0F" | "\x10" | "\x11" | "\x12" | "\x13" | "\x14" | "\x15" | "\x16" | "\x17" | "\x18" | "\x19" | "\x1A" | "\x1B" | "\x1C" | "\x1D" | "\x1E" | "\x1F" | |
// Marker for a missing value | |
MISSING ::= "." | |
// Line break, in either of the two token forms | |
NL ::= newline | newlinewindows | |
// Integer and floating-point numbers | |
// Numbers: a decimal with optional fraction and exponent, a signed "Inf", or "NaN" | |
any_number ::= ("-" | "+")? digit+ ("." digit+)? (("e" | "E") ("+" | "-")? digit+)? | ("-" | "+")? "Inf" | "NaN" | |
// Optionally signed integer | |
int_number ::= ("-" | "+")? digit+ | |
// Unsigned integer | |
nat_number ::= digit+ | |
// Nucleotide bases, accepted in upper and lower case | |
bases ::= ("A" | "C" | "G" | "T" | "N" | "a" | "c" | "t" | "g" | "n")+ | |
//Identifiers may contain some symbols, but not only those | |
// TODO Could it accept more symbols? Comma won't be, for sure | |
// Generic identifier (used for IDs and additional keys in meta entries) | |
// One or more alphanumerics, ".", "_" or "-"; the leading negative predicate rejects input that starts with punctuation | |
identifier ::= !((punct)+) (alnum | "." | "_" | "-" )+ | |
// ALT ID (can be used from meta entries or in the ALT column): | |
// alphanumerics plus any punctuation other than "," "<" ">", and it may not start with punctuation | |
alt_id ::= !((punct)+) ( alnum | (!("," | "<" | ">") punct) )+ | |
// A contig must be a sequence name allowed by the SAM format ( regex [!-)+-<>-~][!-~]* ) excluding the characters <>[]:* | |
// A chromosome must be a string with no white-spaces or colons, and may be surrounded by < > symbols (for contigs) | |
// Characters allowed in a contig name: alphanumerics, or punctuation other than : < > [ ] * = , | |
meta_contig_char ::= alnum | !(":" | "<" | ">" | "[" | "]" | "*" | "=" | ",") punct | |
// Plain chromosome name: one or more contig characters, the first of which is not "#" | |
chrom_basic ::= !"#" meta_contig_char meta_contig_char* | |
// Chromosome name wrapped in angle brackets | |
chrom_contig ::= "<" chrom_basic ">" | |
// Either form is a valid chromosome | |
chromosome ::= chrom_basic | chrom_contig | |
// URL grammar inspired by http://stackoverflow.com/questions/8784903/failed-to-convert-url-parser-regular-expression-to-ragel | |
// Scheme: a letter followed by one or more characters other than ":" "/" "?" "#" and line breaks | |
scheme ::= alpha (!NL !(":" | "/" | "?" | "#") any)+ | |
// Authority: a name starting with a letter, or two to four dot-separated groups of one to three digits | |
authority ::= alpha (!NL !("/" | "?" | "#") any)* | digit digit? digit? "." digit digit? digit? ("." digit digit? digit?)? ("." digit digit? digit?)? | |
// Path, query and fragment each run up to their delimiter and never cross a line break | |
path ::= (!NL !("?" | "#") any)* | |
query ::= (!NL !"#" any)* | |
fragment ::= (!NL any)* | |
// Full URL: scheme://authority path, with optional ?query and #fragment | |
url ::= scheme "://" authority path ("?" query)? ("#" fragment)? | |
// File format line | |
// Only this exact version string is accepted | |
fileformat_name ::= "VCFv4.3" | |
// The line is the literal "##fileformat=" followed by the version string | |
fileformat ::= "##fileformat=" fileformat_name | |
// Meta-data | |
// Key of a free-form meta entry: one or more printable characters up to the "=". | |
// The reserved keys have dedicated alternatives in meta_entry and are excluded here. The predicate | |
// includes the trailing "=" so that only an exact reserved key is rejected, not every key that | |
// merely starts with one (e.g. "INFORMATION" or "contigs"). | |
meta_typeid ::= !(("ALT" | "FILTER" | "FORMAT" | "INFO" | "assembly" | "contig" | "META" | "SAMPLE" | "PEDIGREE" | "pedigreeDB") "=") (!"=" print)+ | |
// Key of a field inside a structured meta entry: alphanumerics, ".", "_" or "-", not starting with punctuation | |
meta_key ::= !((punct)+) (alnum | "." | "_" | "-" )+ | |
// A value can be expressed in multiple ways: | |
// - The line is a single key-value pair (meta_value): the value cannot start with an angle bracket, quote or linebreak, but it is free text thereafter | |
// - One of multiple key-value pairs, non-quoted (meta_field_value): the value cannot contain a comma -end of field-, or a closing angle bracket -end of entry-, or quote | |
// - One of multiple key-value pairs, quoted (meta_field_desc): the value can contain any character except for non-escaped quotes | |
// Whole-line value: the first character is not "<", a double quote or a line break; the rest is any printable text | |
meta_value ::= !("<" | '"' | NL) print print* | |
// Unquoted field value: printable characters other than "," ">" and the double quote | |
meta_field_value ::= (!("," | ">" | '"') print)+ | |
// Quoted field text: backslash-escaped characters, or anything printable that is not a double quote | |
meta_field_desc ::= ("\\" print | !'"' print)* | |
// A field value is either quoted text or an unquoted value | |
meta_values ::= '"' meta_field_desc '"' | meta_field_value | |
meta_field ::= meta_key "=" meta_values | |
// Number field: digits, one of the letters A, R, G, or the missing marker | |
meta_field_num ::= digit+ | "A" | "R" | "G" | MISSING | |
// Type field: one or more letters | |
meta_field_type ::= alpha+ | |
// Bodies of the structured meta entries (the text between "<" and ">" in meta_entry). | |
// ALT: an alt_id, a quoted Description, then any number of extra key="quoted text" fields | |
meta_alt ::= "ID=" alt_id ',Description="' meta_field_desc '"' ("," identifier '="' meta_field_desc '"' )* | |
// assembly: a URL | |
meta_assembly ::= url | |
// contig: a plain chromosome name, then any number of extra key=value fields (quoted or unquoted) | |
meta_contig ::= "ID=" chrom_basic ("," identifier "=" meta_values)* | |
// FILTER: an identifier, a quoted Description, then any number of extra key="quoted text" fields | |
meta_filter ::= "ID=" identifier ',Description="' meta_field_desc '"' ("," identifier '="' meta_field_desc '"')* | |
// FORMAT and INFO share the same shape: ID, Number, Type, quoted Description, then extra key="quoted text" fields | |
meta_format ::= "ID=" identifier ",Number=" meta_field_num ",Type=" meta_field_type ',Description="' meta_field_desc '"' ("," identifier '="' meta_field_desc '"' )* | |
meta_info ::= "ID=" identifier ",Number=" meta_field_num ",Type=" meta_field_type ',Description="' meta_field_desc '"' ("," identifier '="' meta_field_desc '"' )* | |
// 3 different ways to define a PEDIGREE tag now: | |
// ID=TumourSample,Original=GermlineID | |
// ID=ChildID,Father=FatherID,Mother=MotherID | |
// ID=SampleID,Name_1=Ancestor_1,...,Name_N=Ancestor_N | |
// Ancestor key of the form Name_<digits> | |
meta_pedigree_name ::= "Name_" digit+ | |
// An ID followed by exactly one of: an Original, a Father/Mother pair, or one or more Name_N ancestors | |
meta_pedigree ::= "ID=" identifier ( ",Original=" identifier | ",Father=" identifier ",Mother=" identifier | ("," meta_pedigree_name "=" identifier)+ ) | |
// pedigreeDB: a URL | |
meta_pedigreeDB ::= url | |
// META values don't seem to have any particular restrictions so just applying those critical for parsing | |
// One META value: printable characters other than "," "<" ">" and the space | |
meta_meta_value ::= (!("," | "<" | ">" | space) print)+ | |
// Comma-separated values; a single space may follow each comma | |
meta_meta_values ::= meta_meta_value ("," space? meta_meta_value)* | |
// META body: ID, then the fixed Number and Type fields, then the bracketed value list | |
meta_meta ::= "ID=" identifier ",Number=." ",Type=String" ",Values=[" meta_meta_values "]" | |
// SAMPLE body: an ID followed by any number of extra key=value fields | |
meta_sample ::= "ID=" identifier ("," identifier "=" meta_values)* | |
// A meta-information line: "##" followed by one of the reserved entry kinds or a free-form entry. | |
// Every structured kind is wrapped in "<" ... ">"; assembly takes a bare URL. | |
// The META alternative opens with "META=<" so that its closing ">" is balanced like the other structured entries. | |
// A free-form entry (meta_typeid "=" ...) may carry a field list in angle brackets, quoted text, | |
// quoted text inside angle brackets, or plain text. | |
meta_entry ::= "##" ( | |
("ALT=<" meta_alt ">" ) | | |
("FILTER=<" meta_filter ">" ) | | |
("FORMAT=<" meta_format ">" ) | | |
("INFO=<" meta_info ">" ) | | |
("assembly=" meta_assembly ) | | |
("contig=<" meta_contig ">" ) | | |
("META=<" meta_meta ">" ) | | |
("SAMPLE=<" meta_sample ">" ) | | |
("PEDIGREE=<" meta_pedigree ">" ) | | |
("pedigreeDB=<" meta_pedigreeDB ">" ) | | |
(meta_typeid "=" | |
( | |
("<" meta_field ("," meta_field)* ">") | | |
('"' meta_field_desc '"') | | |
('<"' meta_field_desc '">') | | |
meta_value | |
) | |
) | |
) | |
// Header between meta and records | |
// A sample name is any run of printable characters containing no tab or line break | |
sample_name ::= (!(NL | CS) print)+ | |
// Eight mandatory column names, optionally followed by FORMAT and at least one sample column | |
header ::= "#CHROM" CS "POS" CS "ID" CS "REF" CS "ALT" CS "QUAL" CS "FILTER" CS "INFO" (CS "FORMAT" (CS sample_name)+)? | |
// Records | |
// A position is an unsigned integer | |
position ::= nat_number | |
// CHROM column: a plain chromosome name, optionally wrapped in angle brackets | |
record_chrom ::= chrom_basic | "<" chrom_basic ">" | |
record_position ::= position | |
// ID must be a (list of) string with no white-spaces or semi-colons | |
record_id_value ::= (!(space | ";") print)+ | |
record_id_empty ::= MISSING | |
// One or more ";"-separated values, or the missing marker. | |
// NOTE(review): "." is presumably matched by punct, in which case record_id_value accepts it before | |
// record_id_empty is tried - confirm whether that ordering is intended. | |
record_id ::= record_id_value (";" record_id_value)* | record_id_empty | |
// REF column: a sequence of bases | |
record_ref ::= bases | |
// A basic ALT can be something like AGT (biallelic), A,AT (multiallelic) | |
record_alt_snv ::= bases | |
// Indel alternates can be represented by standardized prefixes or an asterisk | |
record_alt_indel ::= "<DEL>" | "<INS>" | "<DUP>" | "<INV>" | "<CNV>" | "<DUP:TANDEM>" | "<DEL:ME:" (alnum)+ ">" | "<INS:ME:" (alnum)+ ">" | "*" | |
// Structural variants follow forms like: | |
// ]1:1234]ATG or ]<contig_1>:1234]ATG : paired breakends | |
// .AGT, AGT.: single breakends | |
// The first four alternatives are the paired breakends (brackets before or after the bases); the last two are single breakends | |
record_alt_sv ::= "]" chromosome ":" position "]" bases | | |
"[" chromosome ":" position "[" bases | | |
bases "]" chromosome ":" position "]" | | |
bases "[" chromosome ":" position "[" | | |
"." bases | | |
bases "." | |
// Other alternates can be any identifier surrounded by < > symbols | |
// The negative predicate excludes the bracketed forms already listed in record_alt_indel | |
// TODO IUPAC codes | |
record_alt_other ::= !("<DEL>" | "<INS>" | "<DUP>" | "<INV>" | "<CNV>" | "<DUP:TANDEM>" | "<DEL:ME:" (alnum)+ ">" | "<INS:ME:" (alnum)+ ">") ("<" alt_id ">") | |
// No variant, must be the only ALT allele | |
record_alt_empty ::= MISSING | |
// Main alternate allele rule: the alternatives are listed in the order snv, indel, sv, other | |
record_alt_data ::= ( record_alt_snv | | |
record_alt_indel | | |
record_alt_sv | | |
record_alt_other ) | |
// ALT column: a comma-separated list of alternates, or the missing marker on its own | |
record_alt ::= (record_alt_data ("," record_alt_data)*) | record_alt_empty | |
// QUAL column: a number or the missing marker | |
record_qual ::= any_number | MISSING | |
// Filter name: alphanumerics and punctuation other than ";", not starting with punctuation | |
record_fil_value ::= !(punct+) (alnum | !";" punct)+ | |
record_fil_empty ::= MISSING | |
// FILTER column: ";"-separated filter names, or the missing marker | |
record_filter ::= record_fil_value (";" record_fil_value)* | record_fil_empty | |
// INFO key: a letter or "_" followed by letters, digits, "_" or "."; "1000G" is accepted as a special case | |
info_key ::= (alpha | "_") (alpha | digit | "_" | ".")* | "1000G" | |
// NOTE(review): info_value excludes only ";", so a "," appears to be matched as part of a value | |
// (assuming punct covers ","); confirm whether values were meant to stop at "," so that info_value_list can split them. | |
info_value ::= (!";" print)+ | |
info_value_list ::= info_value ("," info_value)* | |
// An entry is key=values, or a bare key | |
info_entry ::= info_key "=" info_value_list | info_key | |
// INFO column: ";"-separated entries, or the missing marker | |
record_info ::= info_entry (";" info_entry)* | MISSING | |
// Accepting non-alphanumeric characters is an addition in v4.3, but widely used in already existing files | |
format_value ::= (alpha | "_") (alnum | "_" | "." | "%")* | |
// FORMAT column: ":"-separated format keys | |
record_format ::= format_value (":" format_value)* | |
// Within a sample, a genotype (when present) has to come first | |
sample_allele ::= digit+ | MISSING | |
// Genotype: alleles separated by "/" or "|" | |
sample_gt ::= sample_allele (("/" | "|") sample_allele)* | |
// Any other sample field: alphanumerics and punctuation other than ":", or the missing marker | |
sample_value ::= (alnum | !":" punct)+ | MISSING | |
sample_values ::= sample_value (":" sample_value)* | |
// A sample is a genotype optionally followed by more fields, or just a list of fields | |
record_sample ::= sample_gt (":" sample_values)? | sample_values | |
// One data line: eight tab-separated mandatory columns, optionally followed by FORMAT and one or more samples | |
record ::= record_chrom CS record_position CS record_id CS record_ref CS record_alt CS record_qual CS record_filter CS record_info (CS record_format (CS record_sample)+)? | |
// File sections; the file format line, each meta entry and the header are each terminated by a line break | |
fileformat_section ::= fileformat NL | |
meta_section ::= (meta_entry NL)* | |
header_section ::= header NL | |
// Records are separated (not terminated) by line breaks; the body may be empty | |
body_section ::= (record (NL record)*)? | |
// Whole document: file format line, any number of meta entries, the header, optional records, trailing line breaks | |
main_vcf ::= fileformat_section meta_section header_section body_section NL* |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment