slowkow/pathway_genes.py

## pathway_genes.py
#!/usr/bin/env python
"""
pathway_genes.py
Kamil Slowikowski
kslowikowski@mgh.harvard.edu

Updated 2021-03-22


Usage
-----

    python pathway_genes.py ko04068

This will write 702 sequences to a fasta file called "K00922.fa"


Installation
------------

This script depends on a few packages:

    pip install progress bs4

"""


from progress.bar import Bar
import urllib.request, urllib.error, urllib.parse
from bs4 import BeautifulSoup as bs
import sys


def main():
    args = sys.argv
    if len(args) != 2:
        print('Usage: ./pathway_genes.py PATHWAY')
        sys.exit(1)

    pathway_id = str(args[1])
    # pathway_id = 'map04068'

    orthology_ids = get_orthology_ids(pathway_id)

    print('Found {} orthology ids for pathway "{}"'\
            .format(len(orthology_ids), pathway_id))

    if not orthology_ids:
        sys.exit(1)

    for orthology_id in orthology_ids:
        gene_ids = get_gene_ids(orthology_id)

        print('Writing {} FASTA gene sequences to "{}.fa"'\
                .format(len(gene_ids), orthology_id))

        with open(orthology_id + '.fa', 'w') as out:
            bar = SlowBar('Downloading', max=len(gene_ids))
            for gene_id in gene_ids:
                fasta = get_fasta(gene_id)
                out.write(fasta)
                bar.next()
            bar.finish()


class SlowBar(Bar):
    suffix = '%(index)s/%(max)s %(remaining_minutes)d minutes remaining'
    @property
    def remaining_minutes(self):
        return self.eta // 60


def get_ids(url):
    response = urllib.request.urlopen(url)
    html = response.read()
    b = bs(html, features="lxml")
    links = b.find_all('a')
    texts = []
    for link in links:
        href = link.get('href')
        if href and 'www_bget' in href:
            texts.append(link.text)
    return texts


def get_orthology_ids(pathway_id):
    URL = 'http://www.genome.jp'
    FUN = '/dbget-bin/get_linkdb?-t+orthology+path:'
    return get_ids(URL + FUN + pathway_id)


def get_gene_ids(orthology_id):
    URL = 'http://www.genome.jp'
    FUN = '/dbget-bin/get_linkdb?-t+genes+ko:'
    return get_ids(URL + FUN + orthology_id)


def get_fasta(gene_id):
    URL = 'http://www.genome.jp'
    FUN = '/dbget-bin/www_bget?-f+-n+n+'
    response = urllib.request.urlopen(URL + FUN + gene_id)
    html = bs(response.read(), features="lxml")
    return html.pre.text


if __name__ == '__main__':
    main()
	#!/usr/bin/env python
	"""
	pathway_genes.py
	Kamil Slowikowski
	kslowikowski@mgh.harvard.edu

	Updated 2021-03-22


	Usage
	-----

	python pathway_genes.py ko04068

	This will write 702 sequences to a fasta file called "K00922.fa"


	Installation
	------------

	This script depends on a few packages:

	pip install progress bs4

	"""


	from progress.bar import Bar
	import urllib.request, urllib.error, urllib.parse
	from bs4 import BeautifulSoup as bs
	import sys


	def main():
	args = sys.argv
	if len(args) != 2:
	print('Usage: ./pathway_genes.py PATHWAY')
	sys.exit(1)

	pathway_id = str(args[1])
	# pathway_id = 'map04068'

	orthology_ids = get_orthology_ids(pathway_id)

	print('Found {} orthology ids for pathway "{}"'\
	.format(len(orthology_ids), pathway_id))

	if not orthology_ids:
	sys.exit(1)

	for orthology_id in orthology_ids:
	gene_ids = get_gene_ids(orthology_id)

	print('Writing {} FASTA gene sequences to "{}.fa"'\
	.format(len(gene_ids), orthology_id))

	with open(orthology_id + '.fa', 'w') as out:
	bar = SlowBar('Downloading', max=len(gene_ids))
	for gene_id in gene_ids:
	fasta = get_fasta(gene_id)
	out.write(fasta)
	bar.next()
	bar.finish()


	class SlowBar(Bar):
	suffix = '%(index)s/%(max)s %(remaining_minutes)d minutes remaining'
	@property
	def remaining_minutes(self):
	return self.eta // 60


	def get_ids(url):
	response = urllib.request.urlopen(url)
	html = response.read()
	b = bs(html, features="lxml")
	links = b.find_all('a')
	texts = []
	for link in links:
	href = link.get('href')
	if href and 'www_bget' in href:
	texts.append(link.text)
	return texts


	def get_orthology_ids(pathway_id):
	URL = 'http://www.genome.jp'
	FUN = '/dbget-bin/get_linkdb?-t+orthology+path:'
	return get_ids(URL + FUN + pathway_id)


	def get_gene_ids(orthology_id):
	URL = 'http://www.genome.jp'
	FUN = '/dbget-bin/get_linkdb?-t+genes+ko:'
	return get_ids(URL + FUN + orthology_id)


	def get_fasta(gene_id):
	URL = 'http://www.genome.jp'
	FUN = '/dbget-bin/www_bget?-f+-n+n+'
	response = urllib.request.urlopen(URL + FUN + gene_id)
	html = bs(response.read(), features="lxml")
	return html.pre.text


	if __name__ == '__main__':
	main()