Skip to content

Instantly share code, notes, and snippets.

@eweitz
Created April 25, 2017 15:33
Show Gist options
  • Save eweitz/20ee1d7035e103f7533ed34f81bfe35d to your computer and use it in GitHub Desktop.
Save eweitz/20ee1d7035e103f7533ed34f81bfe35d to your computer and use it in GitHub Desktop.
NCBI EUtils demo: get a chromosome's RefSeq accession given its name and assembly
'''
This simple script shows how to use NCBI E-Utilies to get a chromosome's
RefSeq accession given the chromosome's name and its genome assembly.
Example:
$ python3 chr_rs_acc_via_eutils.py
RefSeq accession for chromosome 6 on genome assembly GRCh38 (GCF_000001405.26):
NC_000006.12
'''
from urllib.request import urlopen
import json
chr_name = '6'
asm_name = 'GRCh38'
debug = False
def load_json(response):
return json.loads(response.read().decode('utf-8'))
# The E-Utilies In Depth: Parameters, Syntax and More:
# https://www.ncbi.nlm.nih.gov/books/NBK25499/
eutils = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
esearch = eutils + 'esearch.fcgi?retmode=json'
esummary = eutils + 'esummary.fcgi?retmode=json'
elink = eutils + 'elink.fcgi?retmode=json'
asm_search = esearch + '&db=assembly&term=%22' + asm_name + '%22[name]'
if debug:
print('Fetching ' + asm_search)
with urlopen(asm_search) as f:
data = load_json(f)
# Get NCBI Assembly database's internal identifier (uid) for this assembly
asm_uid = data['esearchresult']['idlist'][0]
asm_summary = esummary + '&db=assembly&id=' + asm_uid
if debug:
print('Fetching ' + asm_summary)
with urlopen(asm_summary) as f:
data = load_json(f)
# Get RefSeq UID for this assembly
rs_uid = data['result'][asm_uid]['rsuid']
asm_acc = data['result'][asm_uid]['assemblyaccession']
# Get a list of IDs for the chromosomes in this genome.
# This information does not seem to be available from well-known NCBI databases
# like Assembly or Nucleotide, so we use GenColl, a lesser-known NCBI database.
qs = '&db=nuccore&linkname=gencoll_nuccore_chr&from_uid=' + rs_uid
nuccore_link = elink + qs
if debug:
print('Fetching ' + nuccore_link)
with urlopen(nuccore_link) as f:
data = load_json(f)
links = ','.join(str(x) for x in data['linksets'][0]['linksetdbs'][0]['links'])
nt_summary = esummary + '&db=nucleotide&id=' + links
if debug:
print('Fetching ' + nt_summary)
with urlopen(nt_summary) as f:
data = load_json(f)
results = data['result']
for key in results:
if key == 'uids':
continue
result = results[key]
try:
cn_index = result['subtype'].split('|').index('chromosome')
except ValueError as e:
cn_index = 1
try:
this_chr_name = result['subname'].split('|')[cn_index]
except IndexError:
continue
if this_chr_name != None and this_chr_name[:3] == 'chr':
# Convert 'chr12' to '12', e.g. for banana (GCF_000313855.2)
this_chr_name = this_chr_name[3:]
if this_chr_name == chr_name:
print(
'RefSeq accession for chromosome ' + chr_name + ' on ' +
'genome assembly ' + asm_name + ' (' + asm_acc + '):'
)
print(result['accessionversion'])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment