Created
August 9, 2018 21:41
-
-
Save carolinecunningham/bcd5895c67b85226d94cda80fe5d8869 to your computer and use it in GitHub Desktop.
API for motifs and signatures
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import json | |
import csv | |
import glob | |
# MUTAGENE_URL = "https://www.ncbi.nlm.nih.gov/research/mutagene" | |
#MUTAGENE_URL = "https://dev.ncbi.nlm.nih.gov/research/mutagene" | |
MUTAGENE_URL_M = "http://mwebdev2/research/mutagene" | |
# MUTAGENE_URL = "http://localhost:5000" | |
MUTAGENE_URL = "https://www.ncbi.nlm.nih.gov/research/mutagene" | |
path = '/Users/cunninghamck/PycharmProjects/motifs/*.txt' | |
def get_motifs(fname, assembly=37): | |
""" | |
Identify mutational motifs in a VCF or MAF sample | |
""" | |
url = MUTAGENE_URL_M + '/pub/api/identify/motifs' | |
files = {'file': open(fname, 'rb')} | |
r = requests.post(url, files=files, data={'assembly': assembly}) | |
if r.status_code == 200: | |
return r.json()['motifs'] | |
def print_motifs(motifs): | |
""" | |
Printing the results of motif identification | |
""" | |
if motifs is None: | |
#print("Empty") | |
return | |
#for m in motifs: | |
#print(m['motif'], m['mutations']) | |
def return_motifs(motifs): | |
""" | |
Printing the results of motif identification | |
""" | |
listed = {"\t":"\t"} | |
if motifs is None: | |
listed['APOBEC'] = 0 | |
listed['AGEING'] = 0 | |
listed['Tobacco'] = 0 | |
listed['AID'] = 0 | |
listed['UV Light'] = 0 | |
listed['Aristolochic Acid (AA)'] = 0 | |
return listed | |
for m in motifs: | |
listed[m['name']] = int(m['mutations']) | |
return listed | |
def get_profile(fname, assembly=37): | |
""" | |
Calling MutaGene REST API to convert a VCF file into a mutational profile (96 context-dependent mutational probabilities) | |
and profile_counts (counts of mutations for each of the 96 context-dependent mutations) | |
It is important to specify genome assembly correctly. Curently 19, 37 and 38 will work | |
""" | |
url = MUTAGENE_URL + '/pub/api/identify/profile' | |
files = {'file': open(fname, 'rb')} | |
r = requests.post(url, files=files, data={'assembly': assembly}) | |
# print("STATUS", r.status_code) | |
if r.status_code == 200: | |
return r.json()['result_counts'] | |
def get_decomposition(profile_counts, signatures='COSMIC30'): | |
""" | |
Decomposition of mutational profiles into a combination of signatures. | |
It is highly recommended to use profile_counts instead of profile in order to use Maximum Likelihood method | |
*signatures* should be one of COSMIC30 MUTAGENE5 MUTAGENE10 | |
*others_threshold* is used for not reporting signatures with exposure less or equal than the threshold and reporting the sum of their exposures as "Other signatures". | |
Set *others_threshold* to 0 if not needed. The MutaGene website uses others_threshold = 0.05 by default. | |
""" | |
url = MUTAGENE_URL + '/pub/api/identify/decomposition' | |
r = requests.post(url, data={'profile_counts': json.dumps(profile_counts), 'signatures': signatures, 'others_threshold': 0.0}) | |
# print("STATUS", r.status_code) | |
if r.status_code == 200: | |
return r.json()['decomposition'] | |
def print_profile_counts(profile_counts): | |
""" | |
Printing context-dependent mutational profile | |
""" | |
for mutation, value in profile.items(): | |
print("{}\t{:.0f}".format(mutation, value)) | |
print() | |
def print_decomposition(decomposition): | |
""" | |
Printing the results of decomposition | |
""" | |
for component in decomposition: | |
print("{}\t{:.2f}\t{:.0f}".format(component['name'], component['score'], component['mutations'])) | |
print() | |
def return_decomposition(decomposition): | |
""" | |
Printing the results of decomposition | |
""" | |
listed = {} | |
for component in decomposition: | |
name = str(component['name']) | |
mut = component['mutations'] | |
listed[name] = mut | |
return listed | |
if __name__ == '__main__': | |
for filename in glob.glob(path): | |
vcf_files = [filename, ] | |
with open("further_analysis.csv", "a") as csvfile: # create csv file | |
for file_name in vcf_files: | |
motif_name = get_motifs(file_name, assembly=37) | |
value_list = return_motifs(motif_name) | |
mut = return_motifs(motif_name) | |
if len(mut) == 1: | |
continue | |
profile = get_profile(file_name, assembly=37) | |
if profile is not None: | |
for signature_type in ('MUTAGENE5', 'MUTAGENE10'): | |
decomposition = get_decomposition(profile, signature_type) | |
if signature_type in 'MUTAGENE5': | |
result_a = return_decomposition(decomposition) | |
if signature_type in 'MUTAGENE10': | |
result_b = return_decomposition(decomposition) | |
filewriter = csv.writer(csvfile, delimiter=",", quotechar="|", quoting=csv.QUOTE_MINIMAL) | |
filewriter.writerow(["File name", "\t", "T[C>K]W", "[C>T]G", "Y[C>T]", "N[T>S]W", "[C>R]G", "C[T>A]G]", "\t", "A.1 Tobacco", 'A.2 AID/APOBEC', "A.3 Unknown Etiology ", | |
"A.4 UV Light", "A.5 Deanimation of Methyl-Cytosine", | |
"B.1 Deanimation of Methyl-Cytosine", 'B.2 Unknown Etiology', | |
"B.3 Aristolochic Acid", "B.4 Tobacco or Aflotoxin", "B.5 AID/APOBEC", | |
"B.6 Unknown Etiology", "B.7 Unknown Etiology", "B.8 DNA Polymerase Epsilon", | |
"B.9 UV Radiation", "B.10 Defective DNA Mismatch Repair"]) | |
filewriter.writerow([filename[43:], mut["\t"], mut['APOBEC'], mut['AGEING'], mut['UV Light'], mut['AID'], | |
mut['Tobacco'], mut['Aristolochic Acid (AA)'], mut["\t"], | |
result_a["MUTAGENE A.1"], result_a["MUTAGENE A.2"], result_a["MUTAGENE A.3"], | |
result_a["MUTAGENE A.4"], result_a["MUTAGENE A.5"], result_b["MUTAGENE B.1"], | |
result_b["MUTAGENE B.2"], result_b["MUTAGENE B.3"], result_b["MUTAGENE B.4"], | |
result_b["MUTAGENE B.5"], result_b["MUTAGENE B.6"], result_b["MUTAGENE B.7"], | |
result_b["MUTAGENE B.8"], result_b["MUTAGENE B.9"], result_b["MUTAGENE B.10"]]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment