Skip to content

Instantly share code, notes, and snippets.

@aabiddanda
Last active December 14, 2016 17:02
Show Gist options
  • Save aabiddanda/8d8e2a29eee242ad0f75a1b32f165ac4 to your computer and use it in GitHub Desktop.
Save aabiddanda/8d8e2a29eee242ad0f75a1b32f165ac4 to your computer and use it in GitHub Desktop.
#!/usr/local/bin/python3
'''
Outputting audio representation of the
geography of a genetic variant (from 1000 Genomes)
'''
import argparse as arg
import subprocess as sp
import requests as req
'''
Super-population definitions in 1000 Genomes
'''
AFR = ['YRI', 'LWK', 'GWD', 'MSL', 'ESN', 'ASW', 'ACB']
EUR = ['CEU', 'TSI', 'FIN', 'GBR', 'IBS']
SAS = ['GIH', 'PJL', 'BEB', 'STU', 'ITU']
EAS = ['CHB', 'JPT', 'CHS', 'CDX', 'KHV']
AMR = ['MXL', 'PUR', 'CLM', 'PEL']
superpops = ['Africa', 'Europe', 'South Asia', 'East Asia', 'Americas']
'''
Extracts a variant via the ggv api
@arg rsID : rsID of variant in 1000 genomes
@return : json dictionary holding variant fields
'''
def get_variant(rsID=None):
if rsID is None:
snp = req.get("http://popgen.uchicago.edu/ggv_api/freq_table?data=%221000genomes_table%22&random_snp=True")
else:
snp = req.get("http://popgen.uchicago.edu/ggv_api/freq_table?data=%221000genomes_table%22&rsID=%s", rsID)
return(snp.json())
'''
Function to collect variant frequency in various populations
@arg snp_json - json object from ggv api call
@return - length 5 vector with super population frequencies
'''
def collect_freqs_superpops(snp_json):
pop_dict = dict()
pop = [snp_json[i]['pop'] for i in range(len(snp_json))]
total_cnt = [(snp_json[i]['xobs'], snp_json[i]['nobs']) for i in range(len(snp_json))]
for i in range(len(pop)):
pop_dict[pop[i]] = total_cnt[i]
# Assign superpopulation frequencies
AFR_freq = sum([float(pop_dict[i][0]) for i in AFR]) / sum([float(pop_dict[i][1]) for i in AFR])
EUR_freq = sum([float(pop_dict[i][0]) for i in EUR]) / sum([float(pop_dict[i][1]) for i in EUR])
EAS_freq = sum([float(pop_dict[i][0]) for i in EAS]) / sum([float(pop_dict[i][1]) for i in EAS])
SAS_freq = sum([float(pop_dict[i][0]) for i in SAS]) / sum([float(pop_dict[i][1]) for i in SAS])
AMR_freq = sum([float(pop_dict[i][0]) for i in AMR]) / sum([float(pop_dict[i][1]) for i in AMR])
return([AFR_freq, EUR_freq, SAS_freq, EAS_freq, AMR_freq])
'''
Categorization of one frequency (fancy binning)
@arg frq - frequency of variant (0 <= f <= 1)
@arg rare - cutoff for calling something rare (def : 0.01)
@arg low - cutoff for calling something low frequency (def : 0.05)
@return - string categorizing variant frequency
'''
def categorize_freq(frq, rare = 0.01, low = 0.05):
if frq == 0:
return(None)
elif frq > low:
return('common')
elif (frq < low) and (frq > rare):
return('low-frequency')
else:
return('rare')
'''
Heuristic Categorization of Variant Frequencies
@arg superpop_freqs - length 5 vector of super pop frequencies
@return - string representation of variant categorization
'''
def categorize_variant(superpop_freqs, rare = 0.01, low = 0.05):
# s = ''
if all([(frq > low) for frq in superpop_freqs]):
s = 'globally common'
return(s)
elif all([ frq < low and frq > rare for frq in superpop_freqs]):
s = 'globally low-frequency'
return(s)
elif all([frq < rare and frq > 0.0 for frq in superpop_freqs]):
s = 'globally rare'
return(s)
else:
s = ''
for i in range(len(superpops)):
cat = categorize_freq(superpop_freqs[i], rare, low)
if (cat is not None):
s += cat + ' in ' + superpops[i] + ','
s = s[:-1]
return(s)
'''
Passing geodist to vocalization program
@arg geodist string - description of geographic distribution of variant.
@return - passes categorization to 'say' program
'''
def vocalize_geodist(geodist_str):
sp.call(["say", geodist_str])
if __name__ =='__main__':
# Parse all arguments given
parser = arg.ArgumentParser()
parser.add_argument('-r', '--rsID', type=str, required=False, help="rsID")
args = parser.parse_args()
#1. Get Json Object
if args.rsID is not None:
print('rsID : %s' % args.rsID)
snp_json = get_variant(args.rsID)
if len(snp_json) < 2:
print('Incorrect formatting of rsID')
exit(1)
print('Chrom:Pos : %s' % snp_json[1]['chrom_pos'])
#2. Get Super Population Frequencies
superpop_freqs = collect_freqs_superpops(snp_json)
print(superpop_freqs)
#3. Get Variant Categorization
geodist_string = categorize_variant(superpop_freqs)
print(geodist_string)
#4. Read off Geodist
vocalize_geodist(geodist_string)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment