Skip to content

Instantly share code, notes, and snippets.

@nikaspran
Created November 25, 2012 21:43
Show Gist options
  • Save nikaspran/4145533 to your computer and use it in GitHub Desktop.
Save nikaspran/4145533 to your computer and use it in GitHub Desktop.
import Bio.Blast.NCBIWWW
import Bio.Blast.NCBIXML
import Bio.Blast.Record
import Bio.Align.Applications
import Bio.SeqIO
from StringIO import StringIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio.Alphabet import generic_protein
from Bio.Align.Applications import MafftCommandline
from sys import maxint as MAX_INT
SEQUENCE_FILE = "py_results.faa"
SEQUENCE_FILE_FORMAT = "fasta"
SEQUENCE_WINDOW = 15
SEQUENCE_NO = "4502027"
ENTREZ_QUERY = '"serum albumin"[Protein] AND mammals[Organism]'
MIN_POSITIVES = 80.0
MAFFT_CMD = "mafft.bat"
REFERENCE_SEQUENCE_ID = "ALBU_HUMAN"
def filter_by_min_positives(blast_results, min_positives_percent):
correct_indices = []
for index, sequence in enumerate(blast_results.alignments):
if sequence.hsps[0].positives * 100.0 / sequence.length >= min_positives_percent:
correct_indices.append(index)
filtered_results = Bio.Blast.Record.Blast()
filtered_results.multiple_alignment = blast_results.multiple_alignment
for index in correct_indices:
filtered_results.descriptions.append(blast_results.descriptions[index])
filtered_results.alignments.append(blast_results.alignments[index])
return filtered_results
def count_differences(seq1, seq2):
return len([i for i, (left, right) in enumerate(zip(seq1, seq2)) if left != right])
def calculate_difference_extreme_indices(aligned_records, reference_record, sequence_window):
min_start, max_start = 0, 0
min_diff, max_diff = MAX_INT, 0
for start_index in range(len(reference_record) - sequence_window):
end_index = start_index + sequence_window
human_subseq = reference_record[start_index:end_index].seq
differences = 0
for record in aligned_records: # no need to skip human record for there will be no differences
record_subseq = record[start_index:end_index].seq
differences += count_differences(human_subseq, record_subseq)
if differences < min_diff:
min_start = start_index
min_diff = differences
if differences > max_diff:
max_start = start_index
max_diff = differences
return min_start, max_start
def run_and_parse_blast(sequence, entrez_query):
blast_result_xml = Bio.Blast.NCBIWWW.qblast(program="blastp", database="swissprot", sequence=sequence,
entrez_query=entrez_query, hitlist_size=500, expect=100.0)
return Bio.Blast.NCBIXML.read(blast_result_xml)
def write_to_fasta(blast_results, filename):
sequence_records = []
for index, alignment in enumerate(blast_results.alignments):
sequence = alignment.hsps[0]
sequence_records.append(SeqRecord(Seq(sequence.sbjct, generic_protein), id=alignment.title))
Bio.SeqIO.write(sequence_records, filename, SEQUENCE_FILE_FORMAT)
def align_using_mafft(input):
mafft_cline = MafftCommandline(cmd=MAFFT_CMD, input=input)
stdout, stderr = mafft_cline()
return list(Bio.SeqIO.parse(StringIO(stdout), SEQUENCE_FILE_FORMAT))
def find_first_record_by_id(sequences, id):
return filter(lambda sequence: id in sequence.id, sequences)[0]
### Main script:
blast_results = run_and_parse_blast(SEQUENCE_NO, ENTREZ_QUERY)
filtered_results = filter_by_min_positives(blast_results, MIN_POSITIVES);
write_to_fasta(filtered_results, SEQUENCE_FILE)
aligned_records = align_using_mafft(SEQUENCE_FILE)
reference_record = find_first_record_by_id(aligned_records, REFERENCE_SEQUENCE_ID)
min_start, max_start = calculate_difference_extreme_indices(aligned_records, reference_record, SEQUENCE_WINDOW)
print "Least different start:", min_start + 1, ", seq:", reference_record[min_start:min_start + SEQUENCE_WINDOW].seq
print "Most different start:", max_start + 1, ", seq:", reference_record[max_start:max_start + SEQUENCE_WINDOW].seq
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment