radaniba/add-accession-number.pl

## add-accession-number.pl
#5. Add accession numbers and sequences to the tree -- now we're using PhyloXML's extra features.

from Bio.Phylo import PhyloXML

# Make a lookup table for sequences
lookup = dict((rec.id, str(rec.seq)) for rec in best_seqs)

for clade in egfr_tree.get_terminals():
    key = clade.name
    accession = PhyloXML.Accession(key, 'NCBI')
    mol_seq = PhyloXML.MolSeq(lookup[key], is_aligned=True)
    sequence = PhyloXML.Sequence(type='aa', accession=accession, mol_seq=mol_seq)
    clade.sequences.append(sequence)

# Save the annotated phyloXML file
Phylo.write(tree, 'egfr-family-annotated.xml', 'phyloxml')

## extract-best-hit.pl
#2. Extract the best hits from the BLAST result.

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

def get_seqrecs(alignments, threshold):
    for aln in alignments:
        for hsp in aln.hsps:
            if hsp.expect < threshold:
                yield SeqRecord(Seq(hsp.sbjct), id=aln.accession)
                break

best_seqs = get_seqrecs(blast_record.alignments, 1e-50)
SeqIO.write(best_seqs, open('egfr-family.fasta', 'w+'), 'fasta')

#To help with annotating to your tree later, pick a lookup key here (e.g. accession number) and build a dictionary mapping that key to any additional data you can glean from the BLAST output, such as taxonomy and GI numbers. In this example, we're only keeping the original sequence and accession number.

## load-gene-tree-with-phylo.pl
#4. Load the gene tree with Phylo, and take a quick look at the topology.

from Bio import Phylo

egfr_tree = Phylo.read("egfr-family.dnd", "newick")
Phylo.draw_ascii(egfr_tree)

## realign-with-clustalw.pl
#3. Re-align the sequences using ClustalW. The program creates an alignment file in Clustal format, "egfr-family.aln", and a gene tree in Newick format, "egfr-family.dnd".

import sys, subprocess
from Bio import AlignIO
from Bio.Align.Applications import ClustalwCommandline

cline = ClustalwCommandline("clustalw", infile="egfr-family.fasta")
child = subprocess.call(str(cline), shell=(sys.platform!="win32"))

## search-homolog-protein.pl
#1. Search for homologs of a protein sequence using BLAST.

from Bio.Blast import NBCIStandalone, NCBIXML

query_fname = 'human-egfr.fasta'
result_handle, error_handle = NCBIStandalone.blastall('/usr/bin/blastall', 'blastp',
                                                      '/db/fasta/nr', query_fname)
blast_record = NCBIXML.read(result_handle)  # This takes some time to run
	#5. Add accession numbers and sequences to the tree -- now we're using PhyloXML's extra features.

	from Bio.Phylo import PhyloXML

	# Make a lookup table for sequences
	lookup = dict((rec.id, str(rec.seq)) for rec in best_seqs)

	for clade in egfr_tree.get_terminals():
	key = clade.name
	accession = PhyloXML.Accession(key, 'NCBI')
	mol_seq = PhyloXML.MolSeq(lookup[key], is_aligned=True)
	sequence = PhyloXML.Sequence(type='aa', accession=accession, mol_seq=mol_seq)
	clade.sequences.append(sequence)

	# Save the annotated phyloXML file
	Phylo.write(tree, 'egfr-family-annotated.xml', 'phyloxml')
	#2. Extract the best hits from the BLAST result.

	from Bio import SeqIO
	from Bio.Seq import Seq
	from Bio.SeqRecord import SeqRecord

	def get_seqrecs(alignments, threshold):
	for aln in alignments:
	for hsp in aln.hsps:
	if hsp.expect < threshold:
	yield SeqRecord(Seq(hsp.sbjct), id=aln.accession)
	break

	best_seqs = get_seqrecs(blast_record.alignments, 1e-50)
	SeqIO.write(best_seqs, open('egfr-family.fasta', 'w+'), 'fasta')

	#To help with annotating to your tree later, pick a lookup key here (e.g. accession number) and build a dictionary mapping that key to any additional data you can glean from the BLAST output, such as taxonomy and GI numbers. In this example, we're only keeping the original sequence and accession number.
	#4. Load the gene tree with Phylo, and take a quick look at the topology.

	from Bio import Phylo

	egfr_tree = Phylo.read("egfr-family.dnd", "newick")
	Phylo.draw_ascii(egfr_tree)
	#3. Re-align the sequences using ClustalW. The program creates an alignment file in Clustal format, "egfr-family.aln", and a gene tree in Newick format, "egfr-family.dnd".

	import sys, subprocess
	from Bio import AlignIO
	from Bio.Align.Applications import ClustalwCommandline

	cline = ClustalwCommandline("clustalw", infile="egfr-family.fasta")
	child = subprocess.call(str(cline), shell=(sys.platform!="win32"))
	#1. Search for homologs of a protein sequence using BLAST.

	from Bio.Blast import NBCIStandalone, NCBIXML

	query_fname = 'human-egfr.fasta'
	result_handle, error_handle = NCBIStandalone.blastall('/usr/bin/blastall', 'blastp',
	'/db/fasta/nr', query_fname)
	blast_record = NCBIXML.read(result_handle) # This takes some time to run