Created
May 30, 2019 10:32
-
-
Save Nanguage/1e1de98c4339c9d020e244ea4a41ea93 to your computer and use it in GitHub Desktop.
Fetch the sequences of all genes in a KEGG pathway via the KEGG RESTful API
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import re | |
import os | |
from concurrent.futures import ThreadPoolExecutor | |
import pandas as pd | |
from tqdm import tqdm | |
def fetch_page(base_url, timeout=30):
    """Build a fetcher that downloads pages from a URL template.

    Args:
        base_url: URL template containing one ``{}`` placeholder that is
            filled with the id passed to the returned function.
        timeout: Seconds to wait for the server before ``requests`` raises
            a timeout error. Pass ``None`` to wait indefinitely (the
            behaviour before this parameter existed).

    Returns:
        A function ``fetch(id_)`` that performs an HTTP GET on the formatted
        URL and returns the response body as text.
    """
    def fetch(id_):
        url = base_url.format(id_)
        # Without a timeout a stalled connection would block the calling
        # (worker) thread forever.
        response = requests.get(url, timeout=timeout)
        return response.text

    return fetch
def fetch_pathway_page(pathway, timeout=30):
    """Download the KEGG entry for a pathway and return it as text.

    Args:
        pathway: Pathway id that is substituted into the KEGG ``get`` URL.
        timeout: Seconds to wait for the server before ``requests`` raises
            a timeout error. Pass ``None`` to wait indefinitely (the
            behaviour before this parameter existed).

    Returns:
        The response body as a string.
    """
    base_url = "http://rest.kegg.jp/get/{}/"
    url = base_url.format(pathway)
    # A bounded wait keeps a stalled connection from hanging the script.
    response = requests.get(url, timeout=timeout)
    return response.text
def extract_genes(text):
    """Collect the gene ids listed in the ``GENE`` section of an entry.

    The section starts at the first line beginning with ``GENE`` (whose
    second whitespace-separated token is the first id) and continues over
    the following lines that begin with a space (whose first token is the
    id). It ends at the first line that does not begin with a space.

    Args:
        text: Full text of the entry, lines separated by ``\\n``.

    Returns:
        A list of id strings in file order; empty when there is no ``GENE``
        section.
    """
    entries = []
    in_gene_section = False
    for line in text.split("\n"):
        if in_gene_section:
            if not line.startswith(" "):
                # First non-indented line closes the section.
                break
            fields = line.split()
            # Whitespace-only continuation lines carry no id; skip them
            # instead of failing on an empty split.
            if fields:
                entries.append(fields[0])
        elif line.startswith('GENE'):
            in_gene_section = True
            fields = line.split()
            # Guard against a bare "GENE" header with no id on the same line.
            if len(fields) > 1:
                entries.append(fields[1])
    return entries
def extract_gene_name(p_text):
    """Return the names found on the first ``NAME`` line of an entry.

    Every whitespace-separated token after the ``NAME`` keyword has its
    leading/trailing commas removed, and the tokens are joined with ",".
    Returns ``None`` when no line starts with ``NAME``.
    """
    for row in p_text.split("\n"):
        if not row.startswith("NAME"):
            continue
        tokens = row.split()[1:]
        return ",".join(token.strip(",") for token in tokens)
def extract_seq(p_text):
    """Return the text with its first line dropped and the rest concatenated.

    The first line is presumably a FASTA-style header; all remaining lines
    are joined with the newlines removed.
    """
    _header, _sep, body = p_text.partition("\n")
    return body.replace("\n", "")
def get_pathway_table(pathway, workers=20):
    """Build a table of gene ids, names and sequences for one pathway.

    Relies on the module-level callables ``fetch_gene_page`` and
    ``fetch_gene_ntseq_page``; in this file they are created in the
    ``__main__`` block, so they must be defined before this is called.

    Args:
        pathway: Pathway id passed to ``fetch_pathway_page``.
        workers: Maximum number of threads used to fetch gene pages.

    Returns:
        A ``pandas.DataFrame`` with the columns ``kegg_id``, ``gene_names``
        and ``sequence`` (one row per gene, possibly zero rows), or ``None``
        when the pathway page came back empty.
    """
    pathway_text = fetch_pathway_page(pathway)
    if not pathway_text:
        return None
    genes = extract_genes(pathway_text)

    def fetch_record(gene):
        # Both requests for a gene run inside the worker thread so the
        # sequence download is parallelised as well, not only the gene page.
        gene_names = extract_gene_name(fetch_gene_page(gene))
        sequence = extract_seq(fetch_gene_ntseq_page(gene))
        return [str(gene), gene_names, sequence]

    # The context manager shuts the pool down even if a fetch raises.
    with ThreadPoolExecutor(max_workers=workers) as exe:
        rows = list(tqdm(exe.map(fetch_record, genes), total=len(genes)))

    # Passing the columns to the constructor also works for an empty gene
    # list, where assigning ``table.columns`` afterwards would raise.
    return pd.DataFrame(rows, columns=['kegg_id', 'gene_names', 'sequence'])
def save_table(pathway, df):
    """Write ``df`` as CSV to ``<pathway>.csv``."""
    csv_path = pathway + ".csv"
    df.to_csv(csv_path)
if __name__ == "__main__":
    org_id = "ssc"  # organism name

    # These fetchers are bound as module-level names on purpose:
    # get_pathway_table looks them up as globals.
    fetch_pathway_page = fetch_page("http://rest.kegg.jp/get/{}/")
    fetch_gene_page = fetch_page("http://rest.kegg.jp/get/{}:{}/".format(org_id, "{}"))
    fetch_gene_ntseq_page = fetch_page("http://rest.kegg.jp/get/{}:{}/ntseq".format(org_id, "{}"))

    pathway_numbers = [  # pathway ids
        '04010', '04630', '04064', '04668', '04151', '04150', '04152'
    ]
    # Full ids are the organism code followed by the pathway number.
    pathways = [str(org_id) + number for number in pathway_numbers]

    for pathway in pathways:
        # A pathway whose CSV already exists is skipped, so reruns only
        # fetch what is missing.
        if not os.path.exists(pathway + ".csv"):
            print(pathway)
            df = get_pathway_table(pathway)
            if df is None:
                print("none")
            else:
                save_table(pathway, df)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment