Skip to content

Instantly share code, notes, and snippets.

@linhai86
Created February 25, 2019 00:34
Show Gist options
  • Save linhai86/4d40deda3fe75b1e971b7083aa371d68 to your computer and use it in GitHub Desktop.
Save linhai86/4d40deda3fe75b1e971b7083aa371d68 to your computer and use it in GitHub Desktop.
Given mutation, extract flanking protein sequence.
import requests
class Mut2Seq:
"""Given mutation, extract flanking protein sequence using Ensembl API.
"""
def __init__(self, server='https://rest.ensembl.org',
species='homo_sapiens'):
self.server = server
self.species = species
def mut2seq(self, chrom, start, end, allele, flank=5):
results = []
hgvsp = self._get_hgvsp(chrom, start, end, allele)
for k, v in hgvsp.items():
results.append(self._get_protein_seq(v, flank))
return results
def _get_hgvsp(self, chrom, start, end, allele):
ext = '/vep/{species}/region/{chrom}:{start}:{end}/{allele}?canonical=1&hgvs=1&distance=0&protein=1'.format(
species=self.species,
chrom=chrom,
start=start,
end=end,
allele=allele
)
headers = { "Content-Type" : "application/json"}
response = requests.get(self.server + ext, headers=headers)
if not response.ok:
response.raise_for_status()
response_json = response.json()
assert len(response_json) == 1
response_json = response_json.pop()
hgvsp = {}
if 'transcript_consequences' in response_json:
for transcript in response_json['transcript_consequences']:
if 'hgvsp' in transcript:
hgvsp[transcript['hgvsp']] = {
'protein_id': transcript['protein_id'],
'protein_start': transcript['protein_start'],
'protein_end': transcript['protein_end'],
'amino_acids': transcript['amino_acids'].split('/'),
}
return hgvsp
def _get_protein_seq(self, hgvsp, flank):
ext = '/sequence/id/{protein_id}?'.format(protein_id=hgvsp['protein_id'])
headers = { "Content-Type" : "text/plain"}
response = requests.get(self.server + ext, headers=headers)
if not response.ok:
response.raise_for_status()
seq = response.text
assert seq[hgvsp['protein_start']-1:hgvsp['protein_end']] == hgvsp['amino_acids'][0]
extract_start = max(hgvsp['protein_start'] - 1 - flank, 0)
extract_end = min(hgvsp['protein_end'] + flank, len(seq))
hgvsp['ref_seq'] = seq[extract_start:extract_end]
if len(hgvsp['amino_acids']) == 2:
hgvsp['alt_seq'] = seq[extract_start:hgvsp['protein_start']-1] + hgvsp['amino_acids'][1] + seq[hgvsp['protein_end']:extract_end]
return hgvsp
m2s = EnsMut2Seq()
x = m2s.mut2seq(1, 6526802, 6526802, 'C')
# x:
# [{'protein_id': 'ENSP00000366934',
# 'protein_start': 618,
# 'protein_end': 618,
# 'amino_acids': ['K', 'R'],
# 'ref_seq': 'GIDMEKRLYHI',
# 'alt_seq': 'GIDMERRLYHI'}]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment