eweitz/chr_rs_acc_via_eutils.py

## chr_rs_acc_via_eutils.py
'''
This simple script shows how to use NCBI E-Utilies to get a chromosome's
RefSeq accession given the chromosome's name and its genome assembly.

Example:
$ python3 chr_rs_acc_via_eutils.py
RefSeq accession for chromosome 6 on genome assembly GRCh38 (GCF_000001405.26):
NC_000006.12
'''

from urllib.request import urlopen
import json

chr_name = '6'
asm_name = 'GRCh38'

debug = False

def load_json(response):
    return json.loads(response.read().decode('utf-8'))

# The E-Utilies In Depth: Parameters, Syntax and More:
# https://www.ncbi.nlm.nih.gov/books/NBK25499/
eutils = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
esearch = eutils + 'esearch.fcgi?retmode=json'
esummary = eutils + 'esummary.fcgi?retmode=json'
elink = eutils + 'elink.fcgi?retmode=json'

asm_search = esearch +  '&db=assembly&term=%22' + asm_name + '%22[name]'

if debug:
    print('Fetching ' + asm_search)
with urlopen(asm_search) as f:
    data = load_json(f)

# Get NCBI Assembly database's internal identifier (uid) for this assembly
asm_uid = data['esearchresult']['idlist'][0]
asm_summary = esummary + '&db=assembly&id=' + asm_uid

if debug:
    print('Fetching ' + asm_summary)
with urlopen(asm_summary) as f:
    data = load_json(f)

# Get RefSeq UID for this assembly
rs_uid = data['result'][asm_uid]['rsuid']
asm_acc = data['result'][asm_uid]['assemblyaccession']

# Get a list of IDs for the chromosomes in this genome.
# This information does not seem to be available from well-known NCBI databases
# like Assembly or Nucleotide, so we use GenColl, a lesser-known NCBI database.
qs = '&db=nuccore&linkname=gencoll_nuccore_chr&from_uid=' + rs_uid
nuccore_link = elink + qs

if debug:
    print('Fetching ' + nuccore_link)
with urlopen(nuccore_link) as f:
    data = load_json(f)

links = ','.join(str(x) for x in data['linksets'][0]['linksetdbs'][0]['links'])
nt_summary = esummary + '&db=nucleotide&id=' + links

if debug:
    print('Fetching ' + nt_summary)
with urlopen(nt_summary) as f:
    data = load_json(f)

results = data['result']

for key in results:
    if key == 'uids':
        continue
    result = results[key]

    try:
        cn_index = result['subtype'].split('|').index('chromosome')
    except ValueError as e:
        cn_index = 1

    try:
        this_chr_name = result['subname'].split('|')[cn_index]
    except IndexError:
        continue

    if this_chr_name != None and this_chr_name[:3] == 'chr':
      # Convert 'chr12' to '12', e.g. for banana (GCF_000313855.2)
      this_chr_name = this_chr_name[3:]

    if this_chr_name == chr_name:
        print(
            'RefSeq accession for chromosome ' + chr_name + ' on ' +
            'genome assembly ' + asm_name + ' (' + asm_acc + '):'
        )
        print(result['accessionversion'])
	'''
	This simple script shows how to use NCBI E-Utilies to get a chromosome's
	RefSeq accession given the chromosome's name and its genome assembly.

	Example:
	$ python3 chr_rs_acc_via_eutils.py
	RefSeq accession for chromosome 6 on genome assembly GRCh38 (GCF_000001405.26):
	NC_000006.12
	'''

	from urllib.request import urlopen
	import json

	chr_name = '6'
	asm_name = 'GRCh38'

	debug = False

	def load_json(response):
	return json.loads(response.read().decode('utf-8'))

	# The E-Utilies In Depth: Parameters, Syntax and More:
	# https://www.ncbi.nlm.nih.gov/books/NBK25499/
	eutils = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
	esearch = eutils + 'esearch.fcgi?retmode=json'
	esummary = eutils + 'esummary.fcgi?retmode=json'
	elink = eutils + 'elink.fcgi?retmode=json'

	asm_search = esearch + '&db=assembly&term=%22' + asm_name + '%22[name]'

	if debug:
	print('Fetching ' + asm_search)
	with urlopen(asm_search) as f:
	data = load_json(f)

	# Get NCBI Assembly database's internal identifier (uid) for this assembly
	asm_uid = data['esearchresult']['idlist'][0]
	asm_summary = esummary + '&db=assembly&id=' + asm_uid

	if debug:
	print('Fetching ' + asm_summary)
	with urlopen(asm_summary) as f:
	data = load_json(f)

	# Get RefSeq UID for this assembly
	rs_uid = data['result'][asm_uid]['rsuid']
	asm_acc = data['result'][asm_uid]['assemblyaccession']

	# Get a list of IDs for the chromosomes in this genome.
	# This information does not seem to be available from well-known NCBI databases
	# like Assembly or Nucleotide, so we use GenColl, a lesser-known NCBI database.
	qs = '&db=nuccore&linkname=gencoll_nuccore_chr&from_uid=' + rs_uid
	nuccore_link = elink + qs

	if debug:
	print('Fetching ' + nuccore_link)
	with urlopen(nuccore_link) as f:
	data = load_json(f)

	links = ','.join(str(x) for x in data['linksets'][0]['linksetdbs'][0]['links'])
	nt_summary = esummary + '&db=nucleotide&id=' + links

	if debug:
	print('Fetching ' + nt_summary)
	with urlopen(nt_summary) as f:
	data = load_json(f)

	results = data['result']

	for key in results:
	if key == 'uids':
	continue
	result = results[key]

	try:
	cn_index = result['subtype'].split('\|').index('chromosome')
	except ValueError as e:
	cn_index = 1

	try:
	this_chr_name = result['subname'].split('\|')[cn_index]
	except IndexError:
	continue

	if this_chr_name != None and this_chr_name[:3] == 'chr':
	# Convert 'chr12' to '12', e.g. for banana (GCF_000313855.2)
	this_chr_name = this_chr_name[3:]

	if this_chr_name == chr_name:
	print(
	'RefSeq accession for chromosome ' + chr_name + ' on ' +
	'genome assembly ' + asm_name + ' (' + asm_acc + '):'
	)
	print(result['accessionversion'])