Skip to content

Instantly share code, notes, and snippets.

@slowkow
Last active September 22, 2022 14:15
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save slowkow/a2327b868f4e927ac4fb to your computer and use it in GitHub Desktop.
Save slowkow/a2327b868f4e927ac4fb to your computer and use it in GitHub Desktop.
Get FASTA Gene Sequences for all Genes in a KEGG Pathway
#!/usr/bin/env python
"""
pathway_genes.py
Kamil Slowikowski
kslowikowski@mgh.harvard.edu
Updated 2021-03-22
Usage
-----
python pathway_genes.py ko04068
This will write 702 sequences to a fasta file called "K00922.fa"
Installation
------------
This script depends on a few packages:
pip install progress bs4
"""
from progress.bar import Bar
import urllib.request, urllib.error, urllib.parse
from bs4 import BeautifulSoup as bs
import sys
def main():
args = sys.argv
if len(args) != 2:
print('Usage: ./pathway_genes.py PATHWAY')
sys.exit(1)
pathway_id = str(args[1])
# pathway_id = 'map04068'
orthology_ids = get_orthology_ids(pathway_id)
print('Found {} orthology ids for pathway "{}"'\
.format(len(orthology_ids), pathway_id))
if not orthology_ids:
sys.exit(1)
for orthology_id in orthology_ids:
gene_ids = get_gene_ids(orthology_id)
print('Writing {} FASTA gene sequences to "{}.fa"'\
.format(len(gene_ids), orthology_id))
with open(orthology_id + '.fa', 'w') as out:
bar = SlowBar('Downloading', max=len(gene_ids))
for gene_id in gene_ids:
fasta = get_fasta(gene_id)
out.write(fasta)
bar.next()
bar.finish()
class SlowBar(Bar):
suffix = '%(index)s/%(max)s %(remaining_minutes)d minutes remaining'
@property
def remaining_minutes(self):
return self.eta // 60
def get_ids(url):
response = urllib.request.urlopen(url)
html = response.read()
b = bs(html, features="lxml")
links = b.find_all('a')
texts = []
for link in links:
href = link.get('href')
if href and 'www_bget' in href:
texts.append(link.text)
return texts
def get_orthology_ids(pathway_id):
URL = 'http://www.genome.jp'
FUN = '/dbget-bin/get_linkdb?-t+orthology+path:'
return get_ids(URL + FUN + pathway_id)
def get_gene_ids(orthology_id):
URL = 'http://www.genome.jp'
FUN = '/dbget-bin/get_linkdb?-t+genes+ko:'
return get_ids(URL + FUN + orthology_id)
def get_fasta(gene_id):
URL = 'http://www.genome.jp'
FUN = '/dbget-bin/www_bget?-f+-n+n+'
response = urllib.request.urlopen(URL + FUN + gene_id)
html = bs(response.read(), features="lxml")
return html.pre.text
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment