Skip to content

Instantly share code, notes, and snippets.

@Anishka0107
Last active August 22, 2017 19:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Anishka0107/0e2f485eda87b9ec246b40ee61d2cde6 to your computer and use it in GitHub Desktop.
Save Anishka0107/0e2f485eda87b9ec246b40ee61d2cde6 to your computer and use it in GitHub Desktop.
Variant Call Format (VCF) formal grammar
// Lexer tokens referenced by the grammar rules.
// CS is the tab used as column separator; both Unix (\n) and Windows (\r\n)
// line terminators are declared.
{
  tokens=[
    digit='regexp:\d'
    alpha='regexp:\p{Alpha}'
    punct='regexp:\p{Punct}'
    CS='regexp:\t'
    newline='regexp:\n'
    newlinewindows='regexp:\r\n'
  ]
}
// Grammar entry point: a whole VCF document
syntax ::= main_vcf
// Basic character classes
alnum ::= digit | alpha
space ::= " "
// Printable characters: letters, digits, punctuation and the blank
print ::= alnum | punct | space
// Printable characters plus the control characters with codes 1 to 31.
// Codes 7-13 use their mnemonic escapes; codes 14-31 are written in
// hexadecimal ("\x0E" .. "\x1F"), since decimal digits after "\x"
// (such as "\x014") do not denote those codes.
any ::= print
      | "\x01" | "\x02" | "\x03" | "\x04" | "\x05" | "\x06"
      | "\a" | "\b" | "\t" | "\n" | "\v" | "\f" | "\r"
      | "\x0E" | "\x0F" | "\x10" | "\x11" | "\x12" | "\x13"
      | "\x14" | "\x15" | "\x16" | "\x17" | "\x18" | "\x19"
      | "\x1A" | "\x1B" | "\x1C" | "\x1D" | "\x1E" | "\x1F"
// Placeholder used wherever a value is missing
MISSING ::= "."
// Line terminator, Unix or Windows style
NL ::= newline | newlinewindows
// Numeric literals
// any_number: optionally signed decimal with optional fraction and exponent,
//             or an optionally signed "Inf", or "NaN"
any_number ::= ("-" | "+")? (digit)+ ("." (digit)+)? (("e"|"E") ("+"|"-")? (digit)+)?
             | ("-" | "+")? "Inf"
             | "NaN"
// int_number: optionally signed run of digits
int_number ::= ("-" | "+")? (digit)+
// nat_number: unsigned run of digits
nat_number ::= (digit)+
// A non-empty run of nucleotide codes; upper and lower case are both accepted
bases ::= ( "A" | "C" | "G" | "T" | "N"
          | "a" | "c" | "t" | "g" | "n" )+
// Identifiers are built from alphanumerics, ".", "_" and "-"; the intent is
// that they may contain those symbols but not consist of symbols only.
// NOTE(review): as written, the !((punct)+) lookahead already fails when the
// first token is punct, which looks stricter than "not only symbols" - confirm.
// TODO Could it accept more symbols? Comma won't be, for sure
identifier ::= !((punct)+) ( alnum | "." | "_" | "-" )+
// ALT ID (can be used from meta entries or in the ALT column):
// alphanumerics plus any punctuation other than ",", "<" and ">"
alt_id     ::= !((punct)+) ( alnum | (!("," | "<" | ">") punct) )+
// A contig must be a sequence name allowed by the SAM format ( regex [!-)+-<>-~][!-~]* ) excluding the characters <>[]:*
// (this rule additionally leaves out "=" and ",")
meta_contig_char ::= alnum
                   | ( !(":" | "<" | ">" | "[" | "]" | "*" | "=" | "," ) punct )
// A chromosome must be a string with no white-spaces or colons, and may be surrounded by < > symbols (for contigs)
// A plain name cannot start with "#"
chrom_basic      ::= (!"#" meta_contig_char) (meta_contig_char)*
chrom_contig     ::= "<" chrom_basic ">"
chromosome       ::= chrom_basic | chrom_contig
// URL rules, inspired by http://stackoverflow.com/questions/8784903/failed-to-convert-url-parser-regular-expression-to-ragel
// No component may span a line break. The authority is either a name that
// starts with a letter, or two to four dot-separated groups of 1-3 digits.
scheme    ::= ( alpha (!NL !(":" | "/" | "?" | "#") any)+ )
authority ::= ( alpha (!NL !("/" | "?" | "#") any)* )
            | ( (digit digit? digit?) ("." digit digit? digit?) ("." digit digit? digit?)? ("." digit digit? digit?)? )
path      ::= (!NL !("?" | "#") any)*
query     ::= (!NL !"#" any)*
fragment  ::= (!NL any)*
url       ::= ( scheme "://" authority path ("?" query)? ("#" fragment)? )
// File format line: the literal "##fileformat=" followed by the version tag
fileformat_name ::= "VCFv4.3"
fileformat      ::= "##fileformat=" fileformat_name
// Meta-data
// Key of a free-form meta entry: printable characters up to "=", not
// beginning with one of the reserved entry names
meta_typeid ::= !( "ALT" | "FILTER" | "FORMAT" | "INFO" | "assembly" | "contig" | "META" | "SAMPLE" | "PEDIGREE" | "pedigreeDB" )
                (!"=" print)+
// Key inside a structured entry; same shape as an identifier
meta_key    ::= !((punct)+) ( alnum | "." | "_" | "-" )+
// A value can take three forms:
// - meta_value: the whole line is one key-value pair; the value cannot start
//   with an angle bracket, quote or linebreak, but is free text thereafter
// - meta_field_value: one of several non-quoted key-value pairs; the value
//   cannot contain a comma (end of field), a closing angle bracket (end of
//   entry) or a quote
// - meta_field_desc: one of several quoted key-value pairs; the value can
//   contain any character except non-escaped quotes
meta_value       ::= (!("<" | '"' | NL) print) (print)*
meta_field_value ::= (!("," | ">" | '"') print)+
meta_field_desc  ::= ( ("\\" print) | (!'"' print) )*
meta_values      ::= ( '"' meta_field_desc '"' ) | meta_field_value
meta_field       ::= meta_key "=" meta_values
// Number attribute: a count, one of the codes A / R / G, or the missing value
meta_field_num   ::= (digit)+ | "A" | "R" | "G" | MISSING
// Type attribute: a run of letters
meta_field_type  ::= (alpha)+
// Bodies of the reserved structured entries (the text between "<" and ">").
// Each starts with ID=; extra fields after Description must be quoted,
// except for contig, whose extra fields may be quoted or plain.
meta_alt      ::= "ID=" alt_id
                  ',Description="' meta_field_desc '"'
                  ( "," identifier '="' meta_field_desc '"' )*
meta_assembly ::= url
meta_contig   ::= "ID=" chrom_basic
                  ( "," identifier "=" meta_values )*
meta_filter   ::= "ID=" identifier
                  ',Description="' meta_field_desc '"'
                  ( "," identifier '="' meta_field_desc '"' )*
meta_format   ::= "ID=" identifier
                  ",Number=" meta_field_num
                  ",Type=" meta_field_type
                  ',Description="' meta_field_desc '"'
                  ( "," identifier '="' meta_field_desc '"' )*
meta_info     ::= "ID=" identifier
                  ",Number=" meta_field_num
                  ",Type=" meta_field_type
                  ',Description="' meta_field_desc '"'
                  ( "," identifier '="' meta_field_desc '"' )*
// A PEDIGREE tag can be written in three ways:
//   ID=TumourSample,Original=GermlineID
//   ID=ChildID,Father=FatherID,Mother=MotherID
//   ID=SampleID,Name_1=Ancestor_1,...,Name_N=Ancestor_N
meta_pedigree_name ::= ( "Name_" (digit)+ )
meta_pedigree ::= "ID=" identifier
                  (
                      ( ",Original=" identifier )
                    | ( ",Father=" identifier ",Mother=" identifier )
                    | ( "," meta_pedigree_name "=" identifier )+
                  )
// The pedigree database is referenced by URL
meta_pedigreeDB ::= url
// META values don't seem to have any particular restrictions so just applying those critical for parsing
meta_meta_value ::= (!("," | "<" | ">" | space) print)+
// Comma-separated values; one blank is tolerated after each comma
// NOTE(review): meta_meta_value does not exclude "]", so a greedy matcher may
// swallow the closing bracket of Values=[...] - confirm against the parser used.
meta_meta_values ::= meta_meta_value ("," (space)? meta_meta_value)*
// Body of a META entry. Number and Type are fixed to "." and "String" and are
// accepted in either order, because the VCF v4.3 specification examples write
// Type before Number (##META=<ID=Assay,Type=String,Number=.,Values=[...]>).
meta_meta ::= "ID=" identifier
              (
                  ( ",Number=." ",Type=String" )
                | ( ",Type=String" ",Number=." )
              )
              ",Values=[" meta_meta_values "]"
// Multiple ways to describe a sample now
meta_sample ::= "ID=" identifier ("," identifier "=" meta_values)*
// A meta-information line: "##" followed by one of the reserved entries, or by
// a free-form key (meta_typeid) whose value is a list of fields in angle
// brackets, a quoted description, a quoted description in angle brackets, or
// plain text.
// Every angle-bracketed reserved entry, META included, opens with "<" and
// closes with ">".
meta_entry ::= "##" (
    ("ALT=<" meta_alt ">" ) |
    ("FILTER=<" meta_filter ">" ) |
    ("FORMAT=<" meta_format ">" ) |
    ("INFO=<" meta_info ">" ) |
    ("assembly=" meta_assembly ) |
    ("contig=<" meta_contig ">" ) |
    ("META=<" meta_meta ">" ) |
    ("SAMPLE=<" meta_sample ">" ) |
    ("PEDIGREE=<" meta_pedigree ">" ) |
    ("pedigreeDB=<" meta_pedigreeDB ">" ) |
    (meta_typeid "="
        (
            ("<" meta_field ("," meta_field)* ">") |
            ('"' meta_field_desc '"') |
            ('<"' meta_field_desc '">') |
            meta_value
        )
    )
)
// Header line separating the meta section from the records: eight fixed
// column names, optionally followed by FORMAT and one or more sample names,
// all separated by CS
sample_name ::= (!(NL | CS) print)+
header ::= ( "#CHROM" CS "POS" CS "ID" CS "REF" CS "ALT" CS "QUAL" CS "FILTER" CS "INFO" )
           ( CS "FORMAT" (CS sample_name)+ )?
// Records: CHROM, POS, ID and REF columns
position        ::= nat_number
record_chrom    ::= chrom_basic
                  | "<" chrom_basic ">"
record_position ::= position
// ID is the missing value, or a ";"-separated list of strings that contain
// neither blanks nor semi-colons
record_id_value ::= (!(space | ";") print)+
record_id_empty ::= MISSING
record_id       ::= record_id_value (";" record_id_value)*
                  | record_id_empty
record_ref      ::= bases
// ALT column
// A basic ALT is a run of bases, e.g. AGT (biallelic) or A,AT (multiallelic)
record_alt_snv ::= bases
// Indel alternates are standardized bracketed tags or an asterisk
record_alt_indel ::= "<DEL>" | "<INS>" | "<DUP>" | "<INV>" | "<CNV>" | "<DUP:TANDEM>"
                   | "<DEL:ME:" (alnum)+ ">"
                   | "<INS:ME:" (alnum)+ ">"
                   | "*"
// Structural variants follow forms like:
//   ]1:1234]ATG or ]<contig_1>:1234]ATG : paired breakends
//   .AGT, AGT. : single breakends
record_alt_sv ::= "]" chromosome ":" position "]" bases
                | "[" chromosome ":" position "[" bases
                | bases "]" chromosome ":" position "]"
                | bases "[" chromosome ":" position "["
                | "." bases
                | bases "."
// Any other alternate is an ALT ID surrounded by < > symbols that is not one
// of the standardized indel tags
// TODO IUPAC codes
record_alt_other ::= !( "<DEL>" | "<INS>" | "<DUP>" | "<INV>" | "<CNV>" | "<DUP:TANDEM>" | "<DEL:ME:" (alnum)+ ">" | "<INS:ME:" (alnum)+ ">" )
                     ( "<" alt_id ">" )
// No variant, must be the only ALT allele
record_alt_empty ::= MISSING
// Main alternate allele rule: a comma-separated list of alleles, or the
// missing value on its own
record_alt_data ::= ( record_alt_snv | record_alt_indel | record_alt_sv | record_alt_other )
record_alt ::= ( record_alt_data ("," record_alt_data)* )
             | record_alt_empty
// QUAL column: a number or the missing value
record_qual      ::= any_number | MISSING
// FILTER column: the missing value, or a ";"-separated list of filter names
// made of alphanumerics and any punctuation other than ";"
record_fil_value ::= !((punct)+) ( alnum | (!";" punct) )+
record_fil_empty ::= MISSING
record_filter    ::= ( record_fil_value (";" record_fil_value)* )
                   | record_fil_empty
// INFO column: the missing value, or ";"-separated entries, each one a bare
// key (flag) or key=value-list. "1000G" is accepted as a key although it
// starts with a digit.
// NOTE(review): info_value does not exclude ",", so the "," separator in
// info_value_list can also be matched as part of a value - confirm intent.
info_key         ::= ( (alpha | "_") (alpha | digit | "_" | ".")* )
                   | "1000G"
info_value       ::= (!";" print)+
info_value_list  ::= info_value ("," info_value)*
info_entry       ::= ( info_key "=" info_value_list )
                   | info_key
record_info      ::= ( info_entry (";" info_entry)* )
                   | MISSING
// FORMAT column: ":"-separated keys
// Accepting non-alphanumeric characters is an addition in v4.3, but widely used in already existing files
format_value  ::= ( (alpha | "_") (alnum | "_" | "." | "%")* )
record_format ::= format_value (":" format_value)*
// Sample columns: if a genotype is present it must be the first field.
// A genotype is a list of allele indexes (or missing values) joined by "/" or "|"
sample_allele ::= (digit)+ | MISSING
sample_gt     ::= ( sample_allele (("/" | "|") sample_allele)* )
// Every other field is the missing value or a run of alphanumerics and
// punctuation other than ":"
sample_value  ::= ( alnum | (!":" punct) )+ | MISSING
sample_values ::= sample_value (":" sample_value)*
record_sample ::= (
                      sample_gt (":" sample_values)?
                    | sample_values
                  )
// One data line: the eight mandatory columns separated by CS, optionally
// followed by the FORMAT column and at least one sample column
record ::= (
    record_chrom
    CS record_position
    CS record_id
    CS record_ref
    CS record_alt
    CS record_qual
    CS record_filter
    CS record_info
    ( CS record_format (CS record_sample)+ )?
)
// Document sections
fileformat_section ::= ( fileformat NL )
meta_section       ::= ( meta_entry NL )*
header_section     ::= header NL
// Records are separated by line breaks, not terminated by them
body_section       ::= ( record (NL record)* )?
// Whole document: file format line, zero or more meta entries, the header
// line, optional records, then any number of trailing line breaks
main_vcf ::= fileformat_section
             meta_section
             header_section
             body_section
             (NL)*
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment