Skip to content

Instantly share code, notes, and snippets.

@radaniba
Created November 29, 2012 16:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save radaniba/4170273 to your computer and use it in GitHub Desktop.
Save radaniba/4170273 to your computer and use it in GitHub Desktop.
See code comments for more information
#5. Add accession numbers and sequences to the tree -- now we're using PhyloXML's extra features.
from Bio.Phylo import PhyloXML
# Make a lookup table for sequences
lookup = dict((rec.id, str(rec.seq)) for rec in best_seqs)
for clade in egfr_tree.get_terminals():
key = clade.name
accession = PhyloXML.Accession(key, 'NCBI')
mol_seq = PhyloXML.MolSeq(lookup[key], is_aligned=True)
sequence = PhyloXML.Sequence(type='aa', accession=accession, mol_seq=mol_seq)
clade.sequences.append(sequence)
# Save the annotated phyloXML file
Phylo.write(tree, 'egfr-family-annotated.xml', 'phyloxml')
#2. Extract the best hits from the BLAST result.
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
def get_seqrecs(alignments, threshold):
for aln in alignments:
for hsp in aln.hsps:
if hsp.expect < threshold:
yield SeqRecord(Seq(hsp.sbjct), id=aln.accession)
break
best_seqs = get_seqrecs(blast_record.alignments, 1e-50)
SeqIO.write(best_seqs, open('egfr-family.fasta', 'w+'), 'fasta')
#To help with annotating to your tree later, pick a lookup key here (e.g. accession number) and build a dictionary mapping that key to any additional data you can glean from the BLAST output, such as taxonomy and GI numbers. In this example, we're only keeping the original sequence and accession number.
#4. Load the gene tree with Phylo, and take a quick look at the topology.
from Bio import Phylo
egfr_tree = Phylo.read("egfr-family.dnd", "newick")
Phylo.draw_ascii(egfr_tree)
#3. Re-align the sequences using ClustalW. The program creates an alignment file in Clustal format, "egfr-family.aln", and a gene tree in Newick format, "egfr-family.dnd".
import sys, subprocess
from Bio import AlignIO
from Bio.Align.Applications import ClustalwCommandline
cline = ClustalwCommandline("clustalw", infile="egfr-family.fasta")
child = subprocess.call(str(cline), shell=(sys.platform!="win32"))
#1. Search for homologs of a protein sequence using BLAST.
from Bio.Blast import NBCIStandalone, NCBIXML
query_fname = 'human-egfr.fasta'
result_handle, error_handle = NCBIStandalone.blastall('/usr/bin/blastall', 'blastp',
'/db/fasta/nr', query_fname)
blast_record = NCBIXML.read(result_handle) # This takes some time to run
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment