Skip to content

Instantly share code, notes, and snippets.

@Nanguage
Created May 30, 2019 10:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Nanguage/1e1de98c4339c9d020e244ea4a41ea93 to your computer and use it in GitHub Desktop.
Save Nanguage/1e1de98c4339c9d020e244ea4a41ea93 to your computer and use it in GitHub Desktop.
Fetch the sequences of all genes in a KEGG pathway via the KEGG RESTful API
import requests
import re
import os
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
from tqdm import tqdm
def fetch_page(base_url, timeout=30):
    """Build a fetcher function bound to a URL template.

    Args:
        base_url: URL template containing one ``{}`` placeholder; the
            identifier passed to the returned function is substituted
            into it with ``str.format``.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a stalled connection would block the
            caller (or a worker thread) indefinitely.

    Returns:
        A function ``fetch(id_)`` that performs a GET request on
        ``base_url.format(id_)`` and returns the response body as text.
        The HTTP status code is not inspected; the body is returned
        as-is whatever the status.
    """
    def fetch(id_):
        url = base_url.format(id_)
        response = requests.get(url, timeout=timeout)
        return response.text

    return fetch
def fetch_pathway_page(pathway):
    """Download the KEGG entry for *pathway* and return it as text.

    The entry is requested from ``http://rest.kegg.jp/get/<pathway>/``.
    The HTTP status code is not checked; the response body is returned
    unchanged.
    """
    response = requests.get("http://rest.kegg.jp/get/{}/".format(pathway))
    return response.text
def extract_genes(text):
    """Collect the gene identifiers listed in the GENE section of *text*.

    The section starts at the first line beginning with ``GENE``; the
    identifier is the second whitespace-separated field of that line.
    Every following line that begins with a space is a continuation whose
    first field is another identifier. The first line that does not begin
    with a space ends the section.

    Args:
        text: Full text of a pathway entry.

    Returns:
        List of identifiers (strings) in order of appearance; empty when
        there is no GENE section.
    """
    entries = []
    in_gene_section = False
    for line in text.split("\n"):
        if in_gene_section:
            if not line.startswith(" "):
                # Next section header (or blank line): the GENE list is over.
                break
            fields = line.split()
            # A whitespace-only continuation line carries no identifier.
            if fields:
                entries.append(fields[0])
        elif line.startswith("GENE"):
            in_gene_section = True
            fields = line.split()
            # Guard against a header line with no gene on it.
            if len(fields) > 1:
                entries.append(fields[1])
    return entries
def extract_gene_name(p_text):
    """Return the names on the first ``NAME`` line of *p_text*.

    Everything after the first whitespace-separated field of that line is
    treated as a list of names: each token has leading/trailing commas
    stripped and the tokens are joined with ``","``. Returns ``None``
    when no line starts with ``NAME``.
    """
    name_line = next(
        (row for row in p_text.split("\n") if row.startswith("NAME")),
        None,
    )
    if name_line is None:
        return None
    tokens = name_line.split()[1:]
    return ",".join(token.strip(",") for token in tokens)
def extract_seq(p_text):
    """Return *p_text* without its first line, joined into one string.

    The first line is dropped (presumably a FASTA-style header - confirm
    against the ntseq endpoint output) and the newlines between the
    remaining lines are removed.
    """
    _header, _sep, body = p_text.partition("\n")
    return body.replace("\n", "")
def get_pathway_table(pathway, workers=20):
    """Build a table of every gene in *pathway* with its names and sequence.

    Relies on the module-level callables ``fetch_pathway_page``,
    ``fetch_gene_page`` and ``fetch_gene_ntseq_page``; in this file the
    latter two are only assigned inside the ``__main__`` block.

    Args:
        pathway: Pathway identifier passed to ``fetch_pathway_page``.
        workers: Maximum number of threads used to download gene pages.

    Returns:
        A ``pandas.DataFrame`` with columns ``kegg_id``, ``gene_names`` and
        ``sequence`` (one row per gene, possibly zero rows), or ``None``
        when the pathway page comes back empty.
    """
    pathway_text = fetch_pathway_page(pathway)
    if not pathway_text:
        return None

    genes = extract_genes(pathway_text)

    def fetch_gene_record(gene):
        # Both downloads for one gene run in the same worker thread so the
        # nucleotide-sequence request is parallelised along with the gene page.
        return fetch_gene_page(gene), fetch_gene_ntseq_page(gene)

    rows = []
    # The context manager shuts the pool down even if a download raises.
    with ThreadPoolExecutor(max_workers=workers) as pool:
        records = pool.map(fetch_gene_record, genes)
        for gene, (gene_page, ntseq_page) in tqdm(zip(genes, records), total=len(genes)):
            rows.append([
                str(gene),
                extract_gene_name(gene_page),
                extract_seq(ntseq_page),
            ])

    # Passing columns to the constructor keeps the three headers even when
    # there are no rows; assigning .columns afterwards raises ValueError then.
    return pd.DataFrame(rows, columns=['kegg_id', 'gene_names', 'sequence'])
def save_table(pathway, df):
    """Write *df* as CSV to ``<pathway>.csv`` in the current directory."""
    csv_path = pathway + ".csv"
    df.to_csv(csv_path)
if __name__ == "__main__":
    org_id = "ssc"  # organism name (prefix of the pathway and gene ids below)

    # Fetchers bound to the KEGG REST endpoints. get_pathway_table() looks
    # these three names up as module globals, so they must keep these names.
    fetch_pathway_page = fetch_page("http://rest.kegg.jp/get/{}/")
    fetch_gene_page = fetch_page("http://rest.kegg.jp/get/{}:{}/".format(org_id, "{}"))
    fetch_gene_ntseq_page = fetch_page("http://rest.kegg.jp/get/{}:{}/ntseq".format(org_id, "{}"))

    # Pathway ids, each prefixed with the organism name.
    pathways = [
        str(org_id) + number
        for number in ('04010', '04630', '04064', '04668', '04151', '04150', '04152')
    ]

    for pathway in pathways:
        # Skip pathways whose CSV already exists so a rerun resumes the job.
        if not os.path.exists(pathway + ".csv"):
            print(pathway)
            df = get_pathway_table(pathway)
            if df is None:
                print("none")
            else:
                save_table(pathway, df)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment