Skip to content

Instantly share code, notes, and snippets.

@magaliruffier
Last active August 29, 2015 14:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save magaliruffier/dc82aa402afee95d7177 to your computer and use it in GitHub Desktop.
Save magaliruffier/dc82aa402afee95d7177 to your computer and use it in GitHub Desktop.
{
{
"input": "1 230845794 230845794 A G . . .",
"colocated_variants": [
{
"aa_maf": 0.173173,
"ea_maf": 0.425814,
"amr_maf": 0.36,
"strand": 1,
"id": "rs699",
"asn_maf": 0.16,
"allele_string": "A/G",
"minor_allele_freq": 0.3384,
"afr_maf": 0.13,
"clin_sig": [
"other"
],
"eur_maf": 0.41,
"end": 230845794,
"minor_allele": "A",
"start": 230845794,
"pubmed": [
18513389,
19131662,
19263529,
19330901,
19559392,
18603647,
23021345,
20570668,
18069999,
18248681,
18279468,
18637188,
18653189,
18698212,
18953568,
19108684,
19932491,
20029521,
20061926,
20185782,
20486282,
20592051,
21261619,
21304999,
21306748,
21438754,
21988197,
22531885,
22569109,
22858200,
23132613,
23251296,
23333443,
23354977,
23681449,
23716723
]
},
{
"strand": 1,
"id": "CM920010",
"allele_string": "HGMD_MUTATION",
"end": 230845794,
"start": 230845794
},
{
"strand": 1,
"id": "COSM425562",
"allele_string": "A/G",
"end": 230845794,
"start": 230845794
}
],
"end": 230845794,
"seq_region_name": "1",
"transcript_consequences": [
{
"hgvsp": "ENSP00000355627.4:p.Met268Thr",
"variant_allele": "G",
"polyphen_score": 0,
"cdna_end": 1018,
"codons": "aTg/aCg",
"hgvsc": "ENST00000366667.4:c.803N>C",
"protein_end": 268,
"amino_acids": "M/T",
"strand": -1,
"gene_symbol": "AGT",
"transcript_id": "ENST00000366667",
"cdna_start": 1018,
"gene_id": "ENSG00000135744",
"cds_start": 803,
"canonical": 1,
"polyphen_prediction": "benign",
"sift_prediction": "tolerated",
"protein_start": 268,
"biotype": "protein_coding",
"gene_symbol_source": "HGNC",
"sift_score": 1,
"cds_end": 803,
"consequence_terms": [
"missense_variant"
]
},
{
"variant_allele": "G",
"distance": 650,
"strand": -1,
"gene_symbol": "RP11-99J16__A.2",
"transcript_id": "ENST00000412344",
"gene_id": "ENSG00000244137",
"biotype": "antisense",
"gene_symbol_source": "Clone_based_vega_gene",
"consequence_terms": [
"downstream_gene_variant"
]
}
],
"strand": 1,
"regulatory_feature_consequences": [
{
"variant_allele": "G",
"consequence_terms": [
"regulatory_region_variant"
],
"regulatory_feature_id": "ENSR00001529861"
}
],
"id": "230845794",
"most_severe_consequence": "missense_variant",
"allele_string": "A/G",
"start": 230845794
},
{
"input": "11 128496 128496 T G . . .",
"colocated_variants": [
{
"strand": 1,
"id": "rs111606699",
"allele_string": "T/G",
"end": 128496,
"start": 128496
}
],
"end": 128496,
"seq_region_name": "11",
"motif_feature_consequences": [
{
"variant_allele": "G",
"motif_feature_id": "MA0162.1",
"motif_name": "Jaspar_Matrix_Egr1:MA0162.1",
"high_inf_pos": "N",
"consequence_terms": [
"TF_binding_site_variant"
],
"motif_pos": 5,
"motif_score_change": -0.027,
"strand": 1
}
],
"transcript_consequences": [
{
"variant_allele": "G",
"distance": 2712,
"strand": -1,
"gene_symbol": "RP11-304M2.6",
"transcript_id": "ENST00000529266",
"gene_id": "ENSG00000254468",
"canonical": 1,
"biotype": "lincRNA",
"gene_symbol_source": "Clone_based_vega_gene",
"consequence_terms": [
"upstream_gene_variant"
]
},
{
"variant_allele": "G",
"distance": 783,
"strand": 1,
"gene_symbol": "RP11-304M2.3",
"transcript_id": "ENST00000527297",
"gene_id": "ENSG00000255229",
"canonical": 1,
"biotype": "antisense",
"gene_symbol_source": "Clone_based_vega_gene",
"consequence_terms": [
"upstream_gene_variant"
]
},
{
"variant_allele": "G",
"hgvsc": "ENST00000526704.2:n.2160-848N>C",
"strand": -1,
"gene_symbol": "LINC01001",
"transcript_id": "ENST00000526704",
"gene_id": "ENSG00000230724",
"biotype": "lincRNA",
"gene_symbol_source": "HGNC",
"consequence_terms": [
"intron_variant",
"nc_transcript_variant"
]
},
{
"variant_allele": "G",
"distance": 2373,
"strand": 1,
"gene_symbol": "CICP23",
"transcript_id": "ENST00000605013",
"gene_id": "ENSG00000270921",
"canonical": 1,
"biotype": "processed_pseudogene",
"gene_symbol_source": "HGNC",
"consequence_terms": [
"downstream_gene_variant"
]
},
{
"variant_allele": "G",
"hgvsc": "ENST00000540375.1:n.1760-480N>C",
"strand": -1,
"gene_symbol": "LINC01001",
"transcript_id": "ENST00000540375",
"gene_id": "ENSG00000230724",
"canonical": 1,
"biotype": "lincRNA",
"gene_symbol_source": "HGNC",
"consequence_terms": [
"intron_variant",
"nc_transcript_variant"
]
}
],
"strand": 1,
"regulatory_feature_consequences": [
{
"variant_allele": "G",
"consequence_terms": [
"regulatory_region_variant"
],
"regulatory_feature_id": "ENSR00000556729"
}
],
"id": "128496",
"most_severe_consequence": "intron_variant",
"allele_string": "T/G",
"start": 128496
},
{
"input": "21 9462266 9462266 G T . . .",
"colocated_variants": [
{
"strand": 1,
"id": "rs71252742",
"allele_string": "G/T",
"end": 9462266,
"start": 9462266
},
{
"strand": 1,
"id": "rs373173311",
"allele_string": "G/T",
"end": 9462266,
"start": 9462266
}
],
"end": 9462266,
"seq_region_name": "21",
"strand": 1,
"id": "9462266",
"most_severe_consequence": "intergenic_variant",
"intergenic_consequences": [
{
"variant_allele": "T",
"consequence_terms": [
"intergenic_variant"
]
}
],
"allele_string": "G/T",
"start": 9462266
},
{
"input": "1 160283 sv1 . <DUP> . . SVTYPE=DUP;END=471362",
"end": 471362,
"seq_region_name": "1",
"variant_class": "duplication",
"strand": 1,
"transcript_consequences": [
{
"variant_allele": "duplication",
"percentage_overlap": 100,
"bp_overlap": 895,
"strand": 1,
"gene_symbol": "RP4-669L17.10",
"transcript_id": "ENST00000432964",
"gene_id": "ENSG00000237094",
"biotype": "lincRNA",
"gene_symbol_source": "Clone_based_vega_gene",
"consequence_terms": [
"transcript_amplification"
]
},
{
"variant_allele": "duplication",
"percentage_overlap": 100,
"bp_overlap": 91,
"strand": -1,
"gene_symbol": "AL732372.1",
"transcript_id": "ENST00000540477",
"gene_id": "ENSG00000256186",
"canonical": 1,
"biotype": "pseudogene",
"gene_symbol_source": "Clone_based_ensembl_gene",
"consequence_terms": [
"transcript_amplification"
]
},
{
"variant_allele": "duplication",
"percentage_overlap": 100,
"bp_overlap": 6870,
"strand": -1,
"gene_symbol": "AP006222.2",
"transcript_id": "ENST00000442116",
"gene_id": "ENSG00000228463",
"biotype": "lincRNA",
"gene_symbol_source": "Clone_based_vega_gene",
"consequence_terms": [
"transcript_amplification"
]
},
{
"variant_allele": "duplication",
"percentage_overlap": 100,
"bp_overlap": 180,
"strand": 1,
"gene_symbol": "RP4-669L17.4",
"transcript_id": "ENST00000445840",
"gene_id": "ENSG00000224813",
"canonical": 1,
"biotype": "transcribed_unprocessed_pseudogene",
"gene_symbol_source": "Clone_based_vega_gene",
"consequence_terms": [
"transcript_amplification"
]
},
{
"variant_allele": "duplication",
"percentage_overlap": 100,
"bp_overlap": 3698,
"strand": 1,
"gene_symbol": "RP4-669L17.10",
"transcript_id": "ENST00000425496",
"gene_id": "ENSG00000237094",
"canonical": 1,
"biotype": "lincRNA",
"gene_symbol_source": "Clone_based_vega_gene",
"consequence_terms": [
"transcript_amplification"
]
},
{
"variant_allele": "duplication",
"percentage_overlap": 100,
"bp_overlap": 31142,
"strand": -1,
"gene_symbol": "AP006222.2",
"transcript_id": "ENST00000424587",
"gene_id": "ENSG00000228463",
"canonical": 1,
"biotype": "lincRNA",
"gene_symbol_source": "Clone_based_vega_gene",
"consequence_terms": [
"transcript_amplification"
]
},
{
"variant_allele": "duplication",
"percentage_overlap": 100,
"bp_overlap": 363,
"strand": -1,
"gene_symbol": "WBP1LP7",
"transcript_id": "ENST00000437905",
"gene_id": "ENSG00000269732",
"canonical": 1,
"biotype": "processed_pseudogene",
"gene_symbol_source": "HGNC",
"consequence_terms": [
"transcript_amplification"
]
},
{
"variant_allele": "duplication",
"percentage_overlap": 100,
"bp_overlap": 732,
"strand": 1,
"gene_symbol": "RP4-669L17.10",
"transcript_id": "ENST00000431321",
"gene_id": "ENSG00000237094",
"biotype": "lincRNA",
"gene_symbol_source": "Clone_based_vega_gene",
"consequence_terms": [
"transcript_amplification"
]
},
{
"variant_allele": "duplication",
"percentage_overlap": 100,
"bp_overlap": 1323,
"strand": 1,
"gene_symbol": "RP4-669L17.10",
"transcript_id": "ENST00000599771",
"gene_id": "ENSG00000237094",
"biotype": "lincRNA",
"gene_symbol_source": "Clone_based_vega_gene",
"consequence_terms": [
"transcript_amplification"
]
},
{
"variant_allele": "duplication",
"percentage_overlap": 100,
"bp_overlap": 4300,
"strand": 1,
"gene_symbol": "RP4-669L17.10",
"transcript_id": "ENST00000423728",
"gene_id": "ENSG00000237094",
"biotype": "lincRNA",
"gene_symbol_source": "Clone_based_vega_gene",
"consequence_terms": [
"transcript_amplification"
]
},
{
"variant_allele": "duplication",
"percentage_overlap": 100,
"bp_overlap": 2879,
"strand": -1,
"gene_symbol": "RP11-34P13.13",
"transcript_id": "ENST00000491962",
"gene_id": "ENSG00000241860",
"biotype": "lincRNA",
"gene_symbol_source": "Clone_based_vega_gene",
"consequence_terms": [
"transcript_amplification"
]
},
{
"variant_allele": "duplication",
"percentage_overlap": 100,
"bp_overlap": 1080,
"strand": 1,
"gene_symbol": "RP11-34P13.9",
"transcript_id": "ENST00000496488",
"gene_id": "ENSG00000241599",
"canonical": 1,
"biotype": "lincRNA",
"gene_symbol_source": "Clone_based_vega_gene",
"consequence_terms": [
"transcript_amplification"
]
},
{
"variant_allele": "duplication",
"percentage_overlap": 100,
"bp_overlap": 6848,
"strand": -1,
"gene_symbol": "RP4-669L17.2",
"transcript_id": "ENST00000450983",
"gene_id": "ENSG00000236601",
"canonical": 1,
"biotype": "lincRNA",
"gene_symbol_source": "Clone_based_vega_gene",
"consequence_terms": [
"transcript_amplification"
]
},
{
"variant_allele": "duplication",
"percentage_overlap": 100,
"bp_overlap": 385,
"strand": 1,
"gene_symbol": "RP4-669L17.1",
"transcript_id": "ENST00000458203",
"gene_id": "ENSG00000236679",
"canonical": 1,
"biotype": "processed_pseudogene",
"gene_symbol_source": "Clone_based_vega_gene",
"consequence_terms": [
"transcript_amplification"
]
},
{
"variant_allele": "duplication",
"percentage_overlap": 100,
"bp_overlap": 14358,
"strand": 1,
"gene_symbol": "RP4-669L17.10",
"transcript_id": "ENST00000440163",
"gene_id": "ENSG00000237094",
"biotype": "lincRNA",
"gene_symbol_source": "Clone_based_vega_gene",
"consequence_terms": [
"transcript_amplification"
]
},
{
"variant_allele": "duplication",
"percentage_overlap": 100,
"bp_overlap": 10645,
"strand": 1,
"gene_symbol": "RP4-669L17.10",
"transcript_id": "ENST00000601814",
"gene_id": "ENSG00000237094",
"biotype": "lincRNA",
"gene_symbol_source": "Clone_based_vega_gene",
"consequence_terms": [
"transcript_amplification"
]
},
{
"variant_allele": "duplication",
"percentage_overlap": 100,
"bp_overlap": 1763,
"strand": 1,
"gene_symbol": "RP4-669L17.10",
"transcript_id": "ENST00000601486",
"gene_id": "ENSG00000237094",
"biotype": "lincRNA",
"gene_symbol_source": "Clone_based_vega_gene",
"consequence_terms": [
"transcript_amplification"
]
},
{
"variant_allele": "duplication",
"percentage_overlap": 100,
"bp_overlap": 1854,
"strand": -1,
"gene_symbol": "AP006222.2",
"transcript_id": "ENST00000335577",
"gene_id": "ENSG00000228463",
"biotype": "lincRNA",
"gene_symbol_source": "Clone_based_vega_gene",
"consequence_terms": [
"transcript_amplification"
]
},
{
"variant_allele": "duplication",
"percentage_overlap": 100,
"bp_overlap": 2624,
"strand": 1,
"gene_symbol": "RP4-669L17.10",
"transcript_id": "ENST00000608420",
"gene_id": "ENSG00000237094",
"biotype": "lincRNA",
"gene_symbol_source": "Clone_based_vega_gene",
"consequence_terms": [
"transcript_amplification"
]
},
{
"variant_allele": "duplication",
"percentage_overlap": 100,
"bp_overlap": 2299,
"strand": 1,
"gene_symbol": "RP5-857K21.15",
"transcript_id": "ENST00000441866",
"gene_id": "ENSG00000236743",
"canonical": 1,
"biotype": "lincRNA",
"gene_symbol_source": "Clone_based_vega_gene",
"consequence_terms": [
"transcript_amplification"
]
},
{
"variant_allele": "duplication",
"percentage_overlap": 100,
"bp_overlap": 10664,
"strand": -1,
"gene_symbol": "AP006222.2",
"transcript_id": "ENST00000448958",
"gene_id": "ENSG00000228463",
"biotype": "lincRNA",
"gene_symbol_source": "Clone_based_vega_gene",
"consequence_terms": [
"transcript_amplification"
]
},
{
"variant_allele": "duplication",
"percentage_overlap": 100,
"bp_overlap": 4488,
"strand": 1,
"gene_symbol": "RP4-669L17.10",
"transcript_id": "ENST00000431812",
"gene_id": "ENSG00000237094",
"biotype": "lincRNA",
"gene_symbol_source": "Clone_based_vega_gene",
"consequence_terms": [
"transcript_amplification"
]
},
{
"variant_allele": "duplication",
"percentage_overlap": 100,
"bp_overlap": 112027,
"strand": 1,
"gene_symbol": "RP4-669L17.10",
"transcript_id": "ENST00000455207",
"gene_id": "ENSG00000237094",
"biotype": "lincRNA",
"gene_symbol_source": "Clone_based_vega_gene",
"consequence_terms": [
"transcript_amplification"
]
},
{
"variant_allele": "duplication",
"percentage_overlap": 100,
"bp_overlap": 995,
"strand": 1,
"gene_symbol": "OR4F29",
"transcript_id": "ENST00000426406",
"gene_id": "ENSG00000235249",
"canonical": 1,
"biotype": "protein_coding",
"gene_symbol_source": "HGNC",
"consequence_terms": [
"transcript_amplification"
]
},
{
"variant_allele": "duplication",
"percentage_overlap": 100,
"bp_overlap": 6639,
"strand": -1,
"gene_symbol": "RP4-669L17.2",
"transcript_id": "ENST00000412666",
"gene_id": "ENSG00000236601",
"biotype": "lincRNA",
"gene_symbol_source": "Clone_based_vega_gene",
"consequence_terms": [
"transcript_amplification"
]
},
{
"variant_allele": "duplication",
"distance": 2397,
"strand": -1,
"gene_symbol": "RNU6-1100P",
"transcript_id": "ENST00000410691",
"gene_id": "ENSG00000222623",
"canonical": 1,
"biotype": "snRNA",
"gene_symbol_source": "HGNC",
"consequence_terms": [
"upstream_gene_variant"
]
},
{
"variant_allele": "duplication",
"percentage_overlap": 100,
"bp_overlap": 20729,
"strand": 1,
"gene_symbol": "RP4-669L17.10",
"transcript_id": "ENST00000455464",
"gene_id": "ENSG00000237094",
"biotype": "lincRNA",
"gene_symbol_source": "Clone_based_vega_gene",
"consequence_terms": [
"transcript_amplification"
]
},
{
"variant_allele": "duplication",
"percentage_overlap": 100,
"bp_overlap": 3056,
"strand": 1,
"gene_symbol": "RP4-669L17.10",
"transcript_id": "ENST00000453935",
"gene_id": "ENSG00000237094",
"biotype": "lincRNA",
"gene_symbol_source": "Clone_based_vega_gene",
"consequence_terms": [
"transcript_amplification"
]
},
{
"variant_allele": "duplication",
"percentage_overlap": 100,
"bp_overlap": 363,
"strand": -1,
"gene_symbol": "AP006222.1",
"transcript_id": "ENST00000424429",
"gene_id": "ENSG00000241670",
"biotype": "processed_pseudogene",
"gene_symbol_source": "Clone_based_vega_gene",
"consequence_terms": [
"transcript_amplification"
]
},
{
"variant_allele": "duplication",
"percentage_overlap": 100,
"bp_overlap": 7154,
"strand": 1,
"gene_symbol": "RP4-669L17.10",
"transcript_id": "ENST00000440038",
"gene_id": "ENSG00000237094",
"biotype": "lincRNA",
"gene_symbol_source": "Clone_based_vega_gene",
"consequence_terms": [
"transcript_amplification"
]
},
{
"variant_allele": "duplication",
"percentage_overlap": 49.42,
"bp_overlap": 13579,
"strand": -1,
"gene_symbol": "RP11-34P13.13",
"transcript_id": "ENST00000466557",
"gene_id": "ENSG00000241860",
"biotype": "lincRNA",
"gene_symbol_source": "Clone_based_vega_gene",
"consequence_terms": [
"non_coding_exon_variant",
"intron_variant",
"nc_transcript_variant"
]
},
{
"variant_allele": "duplication",
"percentage_overlap": 100,
"bp_overlap": 2806,
"strand": -1,
"gene_symbol": "CICP7",
"transcript_id": "ENST00000432723",
"gene_id": "ENSG00000233653",
"canonical": 1,
"biotype": "processed_pseudogene",
"gene_symbol_source": "HGNC",
"consequence_terms": [
"transcript_amplification"
]
},
{
"variant_allele": "duplication",
"percentage_overlap": 100,
"bp_overlap": 2284,
"strand": 1,
"gene_symbol": "RP4-669L17.10",
"transcript_id": "ENST00000419160",
"gene_id": "ENSG00000237094",
"biotype": "lincRNA",
"gene_symbol_source": "Clone_based_vega_gene",
"consequence_terms": [
"transcript_amplification"
]
},
{
"variant_allele": "duplication",
"percentage_overlap": 100,
"bp_overlap": 2017,
"strand": 1,
"gene_symbol": "RP4-669L17.8",
"transcript_id": "ENST00000514436",
"gene_id": "ENSG00000250575",
"canonical": 1,
"biotype": "unprocessed_pseudogene",
"gene_symbol_source": "Clone_based_vega_gene",
"consequence_terms": [
"transcript_amplification"
]
},
{
"variant_allele": "duplication",
"percentage_overlap": 100,
"bp_overlap": 457,
"strand": -1,
"gene_symbol": "AP006222.1",
"transcript_id": "ENST00000450734",
"gene_id": "ENSG00000241670",
"canonical": 1,
"biotype": "processed_pseudogene",
"gene_symbol_source": "Clone_based_vega_gene",
"consequence_terms": [
"transcript_amplification"
]
}
],
"regulatory_feature_consequences": [
{
"variant_allele": "duplication",
"percentage_overlap": 87.67,
"consequence_terms": [
"regulatory_region_variant"
],
"bp_overlap": 576,
"regulatory_feature_id": "ENSR00000278273"
}
],
"id": "sv1",
"most_severe_consequence": "transcript_amplification",
"start": 471362
}
}
@heuermh
Copy link

heuermh commented Jul 3, 2014

Thanks. A few suggestions/comments/questions:

"canonical": "YES" could be "canonical": 1

numbers are inconsistently formatted, e.g. "strand": "1", and "strand": 1

how does "location": "21:9462266" differ from the combination of ("seq_region_name": "21", "start": "9462266", "end": "9462266", "strand": 1)?

@willmclaren
Copy link

Agreed, it is neater for canonical to be 1 and not "YES"; I will update this.

The number formatting is interesting; because we are in Perl and Perl is agnostic about scalar types, it might be a difficult change (the JSON is produced from Perl hashes by Catalyst). Magali may know more.

The location field is, as you say, just a combination of the other fields. I guess some might find it useful even if it is technically superfluous.

@magaliruffier
Copy link
Author

We were keeping the location field because it had been there before, but it seems to lead to more confusion, so it is gone now.
The numbers have been converted to proper scalars rather than strings.
I have updated the example output and am happy to take any further input.

@grsr
Copy link

grsr commented Jul 4, 2014

If the aim is to stick to native JSON types where possible (which Will has just done for numerical data), I think it would be preferable for canonical to be a boolean value, i.e.:

"canonical": true

@heuermh
Copy link

heuermh commented Jul 6, 2014

Great, thanks!

Would it be possible to include the reference allele as a separate field? It is usually present in the allele string (e.g. "allele_string": "A/G"), but not always (e.g. "allele_string": "HGMD_MUTATION", although that is an odd one).

@andrewyatz
Copy link

Numbers which are strings are a problem with Perl and any text-based transfer protocol. Perl doesn't care whether a value is a number until we want to evaluate it as one, and only then does it do the checking/conversion. As @magaliruffier says, it's a simple fix: just apply either +0 or *1 to any number.

As for the Boolean situation: yes, true is the correct way to represent this information. Our JSON library does handle this via Types::Serialiser and its true and false values, though this does mean rewriting the output for a number of endpoints. I'm in favour of the change, but its impact will have to be evaluated. If it's too large, we should schedule the change for the next major version bump.

@magaliruffier
Copy link
Author

The reference allele should correspond to the first allele in your input (if searching by region), as well as the first allele in the allele_string field.
In cases where the allele_string does not contain that information, we don't always know what the reference allele is.
HGMD_MUTATION tells us there is a known mutation at the location, but we don't know what reference sequence they used, which might be different from our reference (RefSeq gene vs GRCh37 assembly, for example).
We will look into adding the reference_allele field, but not in this implementation.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment