Skip to content

Instantly share code, notes, and snippets.

@sminot
Last active December 2, 2020 12:24
Show Gist options
  • Save sminot/e5b13dfbada7883430cfac8d8696222d to your computer and use it in GitHub Desktop.
Save sminot/e5b13dfbada7883430cfac8d8696222d to your computer and use it in GitHub Desktop.
Fetch gene sequences from KEGG
#!/usr/bin/env python3
import requests
from random import shuffle
# Argument parsing code courtesy of @wasade
import click
# Function to fetch the amino acid sequence for a single gene
def _fetch_gene(s, gene_name):
r = s.get(f"http://rest.kegg.jp/get/{gene_name}/aaseq")
assert r.status_code == 200, f"Could not retrieve gene sequence for {gene_name}"
return r.text
@click.command()
@click.option('--ko', type=str, required=True,
help='Input the gene ID ("KXXXXX") as the only argument to the '
'function')
def fetch(ko):
assert ko.startswith("K"), "Must specify KO to search."
s = requests.Session()
# Fetch the list of genes linked to this accession
r = s.get(f"http://rest.kegg.jp/link/genes/{ko}")
# Make sure that we were able to fetch the list of linked genes
assert r.status_code == 200, f"Could not retrieve genes linked to {ko}"
# Store sequences in a dict
seqs = {}
# Parse the list of gene names from the data which was returned
gene_name_list = [
l.rsplit("\t", 1)[1]
for l in r.text.splitlines() if "\t" in l
]
click.echo(f"Preparing to download {len(gene_name_list):,} sequences linked to {ko}")
# Randomly subset to 100 sequences. Change this as you like.
if len(gene_name_list) > 100:
click.echo("Subsetting to a random selection of 100")
shuffle(gene_name_list)
gene_name_list = gene_name_list[:100]
# Fetch each of the individual genes
for gene_name in gene_name_list:
seqs[gene_name] = _fetch_gene(s, gene_name)
if len(seqs) % 10 == 0:
click.echo(f"Fetched {len(seqs):,} gene sequences")
# Write out the results -- change the output file path as you like
with open(f"{ko}.fasta", "w") as handle:
handle.write("\n".join(seqs.values()))
# Reporting
click.echo(f"Wrote out {len(seqs):,} sequences to {ko}.fasta")
if __name__ == '__main__':
fetch()
@sminot
Copy link
Author

sminot commented Nov 30, 2020

Thanks for the enhancement, @wasade!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment