Created
November 29, 2012 16:47
-
-
Save radaniba/4170273 to your computer and use it in GitHub Desktop.
See code comments for more information
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#5. Add accession numbers and sequences to the tree -- now we're using PhyloXML's extra features. | |
from Bio.Phylo import PhyloXML | |
# Make a lookup table for sequences | |
lookup = dict((rec.id, str(rec.seq)) for rec in best_seqs) | |
for clade in egfr_tree.get_terminals(): | |
key = clade.name | |
accession = PhyloXML.Accession(key, 'NCBI') | |
mol_seq = PhyloXML.MolSeq(lookup[key], is_aligned=True) | |
sequence = PhyloXML.Sequence(type='aa', accession=accession, mol_seq=mol_seq) | |
clade.sequences.append(sequence) | |
# Save the annotated phyloXML file | |
Phylo.write(tree, 'egfr-family-annotated.xml', 'phyloxml') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#2. Extract the best hits from the BLAST result. | |
from Bio import SeqIO | |
from Bio.Seq import Seq | |
from Bio.SeqRecord import SeqRecord | |
def get_seqrecs(alignments, threshold): | |
for aln in alignments: | |
for hsp in aln.hsps: | |
if hsp.expect < threshold: | |
yield SeqRecord(Seq(hsp.sbjct), id=aln.accession) | |
break | |
best_seqs = get_seqrecs(blast_record.alignments, 1e-50) | |
SeqIO.write(best_seqs, open('egfr-family.fasta', 'w+'), 'fasta') | |
#To help with annotating to your tree later, pick a lookup key here (e.g. accession number) and build a dictionary mapping that key to any additional data you can glean from the BLAST output, such as taxonomy and GI numbers. In this example, we're only keeping the original sequence and accession number. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#4. Load the gene tree with Phylo, and take a quick look at the topology. | |
from Bio import Phylo | |
egfr_tree = Phylo.read("egfr-family.dnd", "newick") | |
Phylo.draw_ascii(egfr_tree) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#3. Re-align the sequences using ClustalW. The program creates an alignment file in Clustal format, "egfr-family.aln", and a gene tree in Newick format, "egfr-family.dnd". | |
import sys, subprocess | |
from Bio import AlignIO | |
from Bio.Align.Applications import ClustalwCommandline | |
cline = ClustalwCommandline("clustalw", infile="egfr-family.fasta") | |
child = subprocess.call(str(cline), shell=(sys.platform!="win32")) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#1. Search for homologs of a protein sequence using BLAST. | |
from Bio.Blast import NBCIStandalone, NCBIXML | |
query_fname = 'human-egfr.fasta' | |
result_handle, error_handle = NCBIStandalone.blastall('/usr/bin/blastall', 'blastp', | |
'/db/fasta/nr', query_fname) | |
blast_record = NCBIXML.read(result_handle) # This takes some time to run |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment