Last active
February 9, 2017 17:48
-
-
Save fomightez/abd953068a675f416ed3 to your computer and use it in GitHub Desktop.
helping Damian Menning extracting 100% conserved sequences from sequence records in fasta format. VERSION0.2 currently the most developed. The MOST UP-TO-DATE code version and documentation now at https://github.com/fomightez/sequencework/tree/master/Extract_Conserved_from_Aligned .
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
# extract_conserved_from_alignments.py by Damian Menning and Wayne Decatur | |
# ver 0.1 | |
# | |
#******************************************************************************* | |
# USES Python 2.7 | |
# PURPOSE: Takes a file of aligned sequences in FASTA format and extracts the | |
# residues that are 100% conserved in all sequences. As it scans the sequences, | |
# it writes to a file the residue if it is the same or a blank line if it is not. | |
# | |
# | |
# | |
# Dependencies: | |
# Biopython | |
# | |
# v.0.1. Started | |
# | |
# To do: | |
# | |
# | |
# TO RUN: | |
# Example, | |
# Enter on the command line of your terminal, the line | |
#----------------------------------- | |
# python extract_conserved_from_alignments.py | |
#----------------------------------- | |
# | |
# | |
#******************************************************************************* | |
################################## | |
# USER ADJUSTABLE VALUES # | |
file_name = "Gomphonemataceae 18S cut.fas" | |
################################## | |
# | |
# | |
#******************************************************************************* | |
#**********************END USER ADJUSTABLE VARIABLES**************************** | |
#******************************************************************************* | |
#******************************************************************************* | |
###DO NOT EDIT BELOW HERE - ENTER ANY VALUES ABOVE### | |
import os | |
import sys | |
from Bio import SeqIO | |
###---------------------------HELPER FUNCTIONS---------------------------------### | |
def generate_output_file_name(file_name): | |
''' | |
Takes a file name as an argument and returns string for the name of the | |
output file. The generated name is based on the original file | |
name. | |
Specific example | |
================ | |
Calling function with | |
("Gomphonemataceae 18S cut.fas") | |
returns | |
"Gomphonemataceae 18S cut_conserved.fas" | |
''' | |
main_part_of_name, file_extension = os.path.splitext( | |
file_name) #from http://stackoverflow.com/questions/541390/extracting-extension-from-filename-in-python | |
if '.' in file_name: #I don't know if this is needed with the os.path.splitext method but I had it before so left it | |
return main_part_of_name + "_conserved" + file_extension | |
else: | |
return file_name + "_conserved.fas" | |
def make_dict_of_alignments(fasta_seq_file): | |
''' | |
takes a file of aligned sequence records in fasta format and reads each one | |
placing itin a dictionary for easy access later using the id as a key. | |
returns the dictionary it produced. | |
also returns a string of the first sequence identified to use as the measure | |
of length and for establishing a default consensus sequence for comparison. | |
''' | |
# initialize needed variables | |
aligned_seq_dict = {} | |
example_sequence = "NotDefined" | |
for seq_record in SeqIO.parse(fasta_seq_file, "fasta"): | |
# extract the needed info and make dictionary | |
name, sequence = str(seq_record.id), str(seq_record.seq) | |
aligned_seq_dict[name] = sequence | |
# grab first aligned sequence encountered as representative | |
if example_sequence == "NotDefined": | |
example_sequence = str(seq_record.seq) | |
return aligned_seq_dict, example_sequence | |
###--------------------------END OF HELPER FUNCTIONS---------------------------### | |
###-------------------------Main body of script------------------------------### | |
###----------------------GET FILE AND PREPARE TO PARSE-----------------------### | |
# generate output file name based on input file name | |
output_file_name = generate_output_file_name(file_name) | |
# inititalize output file handler | |
output_file_handler = open(output_file_name, "w") | |
#give user some stats and feeback | |
sys.stderr.write("Reading in your file...") | |
# Need dictionary of the aligned sequences for accessing and will use | |
# a representatice sequence from there to extract details such as length and | |
# default consensus. | |
alignments_dict, representative_aligned_seq = make_dict_of_alignments(file_name) | |
# initialize a string for storing the consensus. This is optional in addition to | |
# the outout file being made | |
consensus_string = "" | |
# Iterate over the representative sequence string using each character for | |
# comparison. Idx holds the current index position and will make calling each | |
# position in each sequence easy. | |
# NOTE that the `else` is indented correctly and belongs to the `for` loop. It | |
# will only be performed if the for loop completes without a `break` as | |
# described at https://docs.python.org/2/tutorial/controlflow.html | |
for idx, extracted_residue in enumerate(representative_aligned_seq): | |
for each_key in alignments_dict: | |
if alignments_dict[each_key][idx] != extracted_residue: | |
# A blank line is written to the line output file | |
output_file_handler.write("\n") | |
# consensus_string gets a blank character added to it | |
consensus_string += " " | |
break | |
else: | |
#if matches in all sequences, write the residue | |
output_file_handler.write(extracted_residue) | |
consensus_string += extracted_residue | |
# close the file handling stream | |
output_file_handler.close() | |
#give user some stats and feeback | |
sys.stderr.write("\nConcluded. \n") | |
sys.stderr.write("\nA file named '"+ | |
output_file_name+"' was saved with the results "+ | |
"\nin same directory as the input file.\n\n") | |
sys.stderr.write("\nConsensus sequence = \n"+ | |
consensus_string +".\n\n") | |
#******************************************************************************* | |
###-***********************END MAIN PORTION OF SCRIPT***********************-### | |
#******************************************************************************* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
# extract_conserved_from_alignments.py by Damian Menning and Wayne Decatur | |
# ver 0.2 | |
# | |
#******************************************************************************* | |
# USES Python 2.7 | |
# PURPOSE: Takes a file of aligned sequences in FASTA format and extracts the | |
# residues that are 100% conserved in all sequences. As it scans the sequences, | |
# it writes to a file the residue if it is the same or a blank line if it is not. | |
# | |
# | |
# | |
# Dependencies: | |
# Biopython | |
# | |
# v.0.1. Basics all in place. | |
# v.0.2. Added a setting for a minimum requirement for the length of the | |
# consensus in order for a block of consensus residues to get printed to | |
# the output file. | |
# | |
# To do: | |
# | |
# | |
# TO RUN: | |
# Example, | |
# Enter on the command line of your terminal, the line | |
#----------------------------------- | |
# python extract_conserved_from_alignments.py | |
#----------------------------------- | |
# | |
# | |
#******************************************************************************* | |
################################## | |
# USER ADJUSTABLE VALUES # | |
file_name = "Gomphonemataceae 18S cut.fas" | |
minimum_length_of_consensus_block = 1 # won't add sequence ragment to the output | |
# file unless matches in a series at least this length. Default is 1 so that | |
# each block of matching sequences, no matter how short, are saved to output. | |
################################## | |
# | |
# | |
#******************************************************************************* | |
#**********************END USER ADJUSTABLE VARIABLES**************************** | |
#******************************************************************************* | |
#******************************************************************************* | |
###DO NOT EDIT BELOW HERE - ENTER ANY VALUES ABOVE### | |
import os | |
import sys | |
from Bio import SeqIO | |
###---------------------------HELPER FUNCTIONS---------------------------------### | |
def generate_output_file_name(file_name): | |
''' | |
Takes a file name as an argument and returns string for the name of the | |
output file. The generated name is based on the original file | |
name. | |
Specific example | |
================ | |
Calling function with | |
("Gomphonemataceae 18S cut.fas") | |
returns | |
"Gomphonemataceae 18S cut_conserved.fas" | |
''' | |
main_part_of_name, file_extension = os.path.splitext( | |
file_name) #from http://stackoverflow.com/questions/541390/extracting-extension-from-filename-in-python | |
if '.' in file_name: #I don't know if this is needed with the os.path.splitext method but I had it before so left it | |
if minimum_length_of_consensus_block != 1: | |
return main_part_of_name + "_conserved_" + str( | |
minimum_length_of_consensus_block) +"bps" + file_extension | |
else: | |
return main_part_of_name + "_conserved" + file_extension | |
else: | |
if minimum_length_of_consensus_block != 1: | |
return file_name + "_conserved_" + str( | |
minimum_length_of_consensus_block) +"bps.fas" | |
else: | |
return file_name + "_conserved.fas" | |
def make_dict_of_alignments(fasta_seq_file): | |
''' | |
takes a file of aligned sequence records in fasta format and reads each one | |
placing itin a dictionary for easy access later using the id as a key. | |
returns the dictionary it produced. | |
also returns a string of the first sequence identified to use as the measure | |
of length and for establishing a default consensus sequence for comparison. | |
''' | |
# initialize needed variables | |
aligned_seq_dict = {} | |
example_sequence = "NotDefined" | |
for seq_record in SeqIO.parse(fasta_seq_file, "fasta"): | |
# extract the needed info and make dictionary | |
name, sequence = str(seq_record.id), str(seq_record.seq) | |
aligned_seq_dict[name] = sequence | |
# grab first aligned sequence encountered as representative | |
if example_sequence == "NotDefined": | |
example_sequence = str(seq_record.seq) | |
return aligned_seq_dict, example_sequence | |
###--------------------------END OF HELPER FUNCTIONS---------------------------### | |
###-------------------------Main body of script------------------------------### | |
###----------------------GET FILE AND PREPARE TO PARSE-----------------------### | |
# generate output file name based on input file name | |
output_file_name = generate_output_file_name(file_name) | |
# inititalize output file handler | |
output_file_handler = open(output_file_name, "w") | |
#give user some stats and feeback | |
sys.stderr.write("Reading in your file...") | |
# Need dictionary of the aligned sequences for accessing and will use | |
# a representatice sequence from there to extract details such as length and | |
# default consensus. | |
alignments_dict, representative_aligned_seq = make_dict_of_alignments(file_name) | |
#initialize a string that will be the current sequence block | |
current_sequence_for_possible_output = "" | |
# initialize a string for storing the consensus. This is optional in addition to | |
# the outout file being made | |
consensus_string = "" | |
# Iterate over the representative sequence string using each character for | |
# comparison. Idx holds the current index position and will make calling each | |
# position in each sequence easy. | |
# NOTE that the `else` is indented correctly and belongs to the `for` loop. It | |
# will only be performed if the for loop completes without a `break` as | |
# described at https://docs.python.org/2/tutorial/controlflow.html | |
for idx, extracted_residue in enumerate(representative_aligned_seq): | |
for each_key in alignments_dict: | |
if alignments_dict[each_key][idx] != extracted_residue: | |
# Now that chain of matches has ended, evaluate if chain series long | |
# enough to save to output file | |
if (len(current_sequence_for_possible_output) >= minimum_length_of_consensus_block): | |
#write conserved series to output file | |
output_file_handler.write(current_sequence_for_possible_output) | |
# A line break is written to the line output file | |
output_file_handler.write("\n") | |
#reset the string collection for blocks to save to the output file | |
current_sequence_for_possible_output = "" | |
# consensus_string gets a blank character added to it | |
consensus_string += " " | |
break | |
else: | |
# if matches in all sequences, save to a growing sequence that may possibly | |
# be written to output if length of sequence chunck meets minimum length | |
# when the chain of matches ends. | |
current_sequence_for_possible_output += extracted_residue | |
# consensus string gets this residue added to it | |
consensus_string += extracted_residue | |
# close the file handling stream | |
output_file_handler.close() | |
#give user some stats and feeback | |
sys.stderr.write("\nConcluded. \n") | |
sys.stderr.write("\nA file named '"+ | |
output_file_name+"' was saved with the results "+ | |
"\nin same directory as the input file.\n\n") | |
sys.stderr.write("\nConsensus sequence = \n"+ | |
consensus_string +".\n\n") | |
#******************************************************************************* | |
###-***********************END MAIN PORTION OF SCRIPT***********************-### | |
#******************************************************************************* |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment