Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Metagenome basic profiler
import pandas as pd
from subprocess import Popen, PIPE
def len_fasta(filename):
p1 = Popen(['cat', filename], stdout=PIPE, stderr=PIPE)
p2 = Popen(['grep', '>'], stdin=p1.stdout, stdout=PIPE, stderr=PIPE)
p3 = Popen(['wc', '-l'], stdin=p2.stdout, stdout=PIPE, stderr=PIPE)
result, err = p3.communicate()
if p3.returncode != 0:
raise IOError(err)
return int(result.strip().split()[0])
def show_report(fasta_filename):
print("Number of reads: {}".format(len_fasta(fasta_filename)))
blout_filename = fasta_filename.replace('fasta', 'blout')
data = pd.read_table(blout_filename,
names='qseqid sseqid pident length mismatch gapopen qstart ' \
'qend sstart send evalue bitscore sgi staxids'.split(),
)
print("Number of annotations: {}".format(len(data)))
only_first_taxon = lambda s: s.split(';')[0]
data['staxids2'] = data['staxids'].apply(only_first_taxon)
subdata = data[['qseqid', 'staxids2', 'sgi']]
taxon_counts = subdata.groupby('staxids2').count().sort('qseqid',
ascending=False)
print("Number of taxons: {}".format(len(taxon_counts)))
print("Top 10 taxons")
for tax_id, taxon in taxon_counts.iloc[:10].iterrows():
print(" - id {} has {} reads".format(tax_id, taxon['qseqid']))
gene_counts = subdata.groupby('sgi').count().sort('qseqid',
ascending=False)
print("Number of genes: {}".format(len(gene_counts)))
print("Top 10 genes")
for gene_id, gene in gene_counts.iloc[:10].iterrows():
print(" - id {} has {} reads".format(gene_id, gene['qseqid']))
if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("fasta", help="input FASTA file")
args = parser.parse_args()
show_report(args.fasta)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment