Skip to content

Instantly share code, notes, and snippets.

@sminot
Last active December 2, 2020 12:24
Show Gist options
  • Save sminot/e5b13dfbada7883430cfac8d8696222d to your computer and use it in GitHub Desktop.
Save sminot/e5b13dfbada7883430cfac8d8696222d to your computer and use it in GitHub Desktop.
Fetch gene sequences from KEGG
#!/usr/bin/env python3
import requests
from random import shuffle
# Argument parsing code courtesy of @wasade
import click
# Function to fetch the amino acid sequence for a single gene
def _fetch_gene(s, gene_name):
r = s.get(f"http://rest.kegg.jp/get/{gene_name}/aaseq")
assert r.status_code == 200, f"Could not retrieve gene sequence for {gene_name}"
return r.text
@click.command()
@click.option('--ko', type=str, required=True,
help='Input the gene ID ("KXXXXX") as the only argument to the '
'function')
def fetch(ko):
assert ko.startswith("K"), "Must specify KO to search."
s = requests.Session()
# Fetch the list of genes linked to this accession
r = s.get(f"http://rest.kegg.jp/link/genes/{ko}")
# Make sure that we were able to fetch the list of linked genes
assert r.status_code == 200, f"Could not retrieve genes linked to {ko}"
# Store sequences in a dict
seqs = {}
# Parse the list of gene names from the data which was returned
gene_name_list = [
l.rsplit("\t", 1)[1]
for l in r.text.splitlines() if "\t" in l
]
click.echo(f"Preparing to download {len(gene_name_list):,} sequences linked to {ko}")
# Randomly subset to 100 sequences. Change this as you like.
if len(gene_name_list) > 100:
click.echo("Subsetting to a random selection of 100")
shuffle(gene_name_list)
gene_name_list = gene_name_list[:100]
# Fetch each of the individual genes
for gene_name in gene_name_list:
seqs[gene_name] = _fetch_gene(s, gene_name)
if len(seqs) % 10 == 0:
click.echo(f"Fetched {len(seqs):,} gene sequences")
# Write out the results -- change the output file path as you like
with open(f"{ko}.fasta", "w") as handle:
handle.write("\n".join(seqs.values()))
# Reporting
click.echo(f"Wrote out {len(seqs):,} sequences to {ko}.fasta")
if __name__ == '__main__':
fetch()
@wasade
Copy link

wasade commented Nov 30, 2020

Nice! A minor revision using click and request.Session is below. It looks like you can't issue PRs against gists unfortunately...

#!/usr/bin/env python3

import requests
from random import shuffle
import click


# Function to fetch the amino acid sequence for a single gene
def _fetch_gene(s, gene_name):
    r = s.get(f"http://rest.kegg.jp/get/{gene_name}/aaseq")
    assert r.status_code == 200, ("Could not retrieve gene sequence for "
                                  f"{gene_name}")
    return r.text


@click.command()
@click.option('--ko', type=str, required=True,
              help='Input the gene ID ("KXXXXX") as the only argument to the '
                   'function')
def fetch(ko):
    assert ko.startswith("K"), "Must specify KO to search."

    s = requests.Session()

    # Fetch the list of genes linked to this accession
    r = s.get(f"http://rest.kegg.jp/link/genes/{ko}")

    # Make sure that we were able to fetch the list of linked genes
    assert r.status_code == 200, f"Could not retrieve genes linked to {ko}"

    # Store sequences in a dict
    seqs = {}

    # Parse the list of gene names from the data which was returned
    gene_name_list = [l.rsplit("\t", 1)[1]
                      for l in r.text.splitlines() if "\t" in l]
    click.echo(f"Preparing to download {len(gene_name_list):,} sequences "
               f"linked to {ko}")

    # Randomly subset to 100 sequences. Change this as you like.
    if len(gene_name_list) > 100:
        click.echo("Subsetting to a random selection of 100")
        shuffle(gene_name_list)
        gene_name_list = gene_name_list[:100]

    # Fetch each of the individual genes
    for gene_name in gene_name_list:
        seqs[gene_name] = _fetch_gene(s, gene_name)

    if len(seqs) % 10 == 0:
        click.echo(f"Fetched {len(seqs):,} gene sequences")

    # Write out the results -- change the output file path as you like
    with open(f"{ko}.fasta", "w") as handle:
        handle.write("\n".join(seqs.values()))

    # Reporting
    click.echo(f"Wrote out {len(seqs):,} sequences to {ko}.fasta")


if __name__ == '__main__':
    fetch()

@sminot
Copy link
Author

sminot commented Nov 30, 2020

Thanks for the enhancement, @wasade!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment