Skip to content

Instantly share code, notes, and snippets.

Last active Nov 12, 2021
What would you like to do?
Get FASTA Gene Sequences for all Genes in a KEGG Pathway
#!/usr/bin/env python
Kamil Slowikowski
Updated 2021-03-22
python ko04068
This will write 702 sequences to a fasta file called "K00922.fa"
This script depends on a few packages:
pip install progress bs4
from import Bar
import urllib.request, urllib.error, urllib.parse
from bs4 import BeautifulSoup as bs
import sys
def main():
args = sys.argv
if len(args) != 2:
print('Usage: ./ PATHWAY')
pathway_id = str(args[1])
# pathway_id = 'map04068'
orthology_ids = get_orthology_ids(pathway_id)
print('Found {} orthology ids for pathway "{}"'\
.format(len(orthology_ids), pathway_id))
if not orthology_ids:
for orthology_id in orthology_ids:
gene_ids = get_gene_ids(orthology_id)
print('Writing {} FASTA gene sequences to "{}.fa"'\
.format(len(gene_ids), orthology_id))
with open(orthology_id + '.fa', 'w') as out:
bar = SlowBar('Downloading', max=len(gene_ids))
for gene_id in gene_ids:
fasta = get_fasta(gene_id)
class SlowBar(Bar):
suffix = '%(index)s/%(max)s %(remaining_minutes)d minutes remaining'
def remaining_minutes(self):
return self.eta // 60
def get_ids(url):
response = urllib.request.urlopen(url)
html =
b = bs(html, features="lxml")
links = b.find_all('a')
texts = []
for link in links:
href = link.get('href')
if href and 'www_bget' in href:
return texts
def get_orthology_ids(pathway_id):
URL = ''
FUN = '/dbget-bin/get_linkdb?-t+orthology+path:'
return get_ids(URL + FUN + pathway_id)
def get_gene_ids(orthology_id):
URL = ''
FUN = '/dbget-bin/get_linkdb?-t+genes+ko:'
return get_ids(URL + FUN + orthology_id)
def get_fasta(gene_id):
URL = ''
FUN = '/dbget-bin/www_bget?-f+-n+n+'
response = urllib.request.urlopen(URL + FUN + gene_id)
html = bs(, features="lxml")
return html.pre.text
if __name__ == '__main__':
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment