Last active
August 29, 2015 13:58
-
-
Save git2samus/10204954 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
>Rosalind_8980 | |
CACCCCCTAAGTTAGACAGCTACAAGTCACTATGATTTTTTAGCTGCCTAGCACTATTGT | |
TTGAAACTCAGCGTCACACACAGCCAACATCTACCGGACGACCAGTTTTAATTCGTAAAG | |
ACGTGCGGGCTACTGTGATCTAATGTAACCGCACTCTCGCGGAACGCCTTTTGTCTCGGG | |
TCGGCCTATGTCCTTAGGAATGTCCGGTGAATTACAACTAAGAACGGCCTTGGTATCGCG | |
GGTACACCTGTCTATTTTGCATGGTGTGTGTTAACGGTGGCTGCGAGGGGACGAAATCAA | |
TGACCAAGGCAGGAAGTAGGTTAGCGGGCTAGTCGCACACTTTGTTGTCGGTCAAGTGAA | |
TACCCTCAGGACCTGTGCGGTTACGGGTCACGGGAGATCCGCGAAGCTGCCAATTGTATT | |
GACTTTGGCCACGGTATAGGTGTTCAGGTATCCTTGTCATCAACGATACCCTCAGGTCCT | |
TTGAGGCCTCTACTGCCTAGGTGTTAGGCGCATTTTCGCTGAGTCCTAAAACAGTATTAC | |
CAAGGCTACTTAACGTATGGAAATCATCCCCCATTTCCCGTTTTTGTCGAGAGATACGCT | |
CTTTAGGTTATAAGGATAAATAATGAGAACTCATCGAATACCGCGTAAACAATAGCGTGC | |
AGGTCGTGATATAGGACACGATCAGCAAATGCGAGAGTCGAGTCAAGGACCTAGGCTGAT | |
TTGTACCGGAAGTTTCTCTCTCTTCAAAATAGCCTTACTGCCGCGAACCGAGGTGACGTT | |
AGTAGCATCTGCATTGGTTTTGGACGTCTGCTATTGCTATGTCGCCACCCATAGTGTAAA | |
ATACTAGTGGCGAAGTGCTCAGTGAATGCGATTGAGCCAATGACGAACCAGACTAACTTG | |
AACCGTGTGCTCCAATTGGTAAGATATGCTAC | |
>Rosalind_1203 | |
ACCACTTTGTTTCAGTAACCGTTCCTGATGACTCTACTCCCACCACAACGGACACAAGCG | |
GAGTAGGTGGCGGAGTGGTGACGTTTGTTCCAGAGACGGTACCATTAGTCCCCGCGACAA | |
AAATTCCCCTACAAATCCGGTTAGGTGATCTGCACCGGCTTTGAGGCAACCATACCCCTA | |
TAGCCCAAACGTTGGATTGGAGATTGGATTGCCCTCTAGTCCGGACAACCACAGGAGACG | |
GGATCTTATGACAATACGAGCACGAACAGTTCGTTCATGCTTTCCCATTCGAGCGCCCAA | |
CTAATCAACACTTCTATCTGCGACCTTAAATATGACTCGCGAATCAACGAGGGACGCCAT | |
TTAACGCAGAATTTATTTGTTTCGCGACCCGCGTTTTGTACGAGACAGATGCACCGATAT | |
ACGGTTTATCTTTACGCTAGGCCAATTCACCATAGTTCGTCCGCCGTATCGGGCGATTAG | |
TGCGACTCGTATCTGAACATAAATATACGCCACTCGAATATTACATACCCTCTTAGGATG | |
GAGACCTGTTTGTGCCAAGAAGATGTTCGCATTGGAGGTCGTACGTGGACTGGTATGGCT | |
GTGCATCCTATGCCCGACCAACGAGTCACGGTCCCCGAGAGATGGACTACCACATAAAAT | |
TAAGTAGGCAGACAATGGGCCAAGAGAACATGATAATCACGACGTGTCCAGGTAGCGTAG | |
CCCTTTTGAACTTCTCCGGATCGACCCCCAGCGCTGCGGTAGCGGTTATTATTAGAGCTT | |
TAAGTTTCCCTGCCGCCCGGTTAAAAGTCTGTGTGGAGCGGAAGCATAGAATTGCATAGT | |
TTCGGCGAAGGCTTCCCACACACCGTCCGTCAATAAAAACACGATTGACGGTATTGTATC | |
GTCATGCAGCCGCATGCCCTAAAAGTCCGGCC | |
>Rosalind_6922 | |
CTCTGGGTCAACGCAGGAGGGGTGGGAAGTGCCACACGGTCCGATAGACGCTCATTAATA | |
AATCTACGCATCCGCCTCGGACTCTCGTGACATCCTTTTAAACCGCCAGCAACGGTGACC | |
ATGAAAAATCGGTAAGTTTTTCTGGCTAGGGCCCTAAAAAATCCATTTCTCTGTTGAGTG | |
GTTCTTAATGCATCTCTAGCGCACGTGCGAGCGTGCATAGATACGTTCGACGTGTAGACT | |
AACTTCACACCTTTTGAAACGCATAAGTAGCCCAACATAATCCACATGCGCTACGTAATG | |
ATTCTCGGTGATTGCACGGTCATCCCCAAAACAGTACAGGCCGGCTTCTCTATTGGCCGT | |
ATTGAATCTTTGTTATGTCGGTGGATAATAGTTCATTGGAGCACGAGGGCTAGACCCGGT | |
TAGGTAGGTTTTCTCGTCATTTGTTACATAGATAACAACCGCTAAGCGAGGGGGCACTAG | |
ACCTTCTACCGCCGTCATACGTGGCCGGGATACCGAAGTAACTAATCTCTGTTACATACT | |
CTGTTGCAGTCTATGTGCATTCCGCGTTTTACAACGAGTTCGGGGCGAGCACGAACAGCC | |
AAAAGTTTTCCGCACCGCTATTAATTTTGTGGGGAATATCCGTAGGACGTGGGCTGCTAC | |
TCAGGTGAACGACGCGACGCACCAACAACATCACGATGTGCCATTTTCGGTCATTGGATT | |
ACGTCCCCCGCAGCATTAGGCTGTGCACCTAGCCGATAGTTCATGCTATGTCACGAGGTT | |
ACCGCAGGGGCCACAGTCACTTCTGCGTATGAGACACAGAGTATAGTAAAGCTATAGAAA | |
TAAGTCGAGACAGGAGAACCGCGTGTCGTCTCCCCACGGCGGCCTCCCCCGGCGTTAAGC | |
GCTCTTATGATGTGTCGTCGCCATCACCAGCT | |
>Rosalind_7678 | |
GGTCTCCCGAGATGGTTTGTACTTGACTGAAACAGGACGGCATGCCATATTTCGGAAAAT | |
AATACATCGGTCTTCTCAACGCGTCAACCTTGGCCTGGAGATGTGACCCCATAGTTGAGT | |
TATCGGGAAGATTAGTTCCAGCGAACACCCAGGGGCCTAAAAGAGGGGAGCGGGATCAGA | |
CCACTGAATGGTGGCTATCGCTGCCATGCTCCCATCAAAGGTTGTAAGCATAAATTTATC | |
CGATGCGGTGTATATTTGTCGATTTCCTCGAACTTCGTGAACGATTTTTTTAATTAGAAC | |
CAATGCTACGCCAACCTAGATTTTCGTTAGATCCAGTGCATAAGCTACAACGGTGCTCCC | |
AGCGTAGTATTCATCATACCGGGATCTGTAAAAGACCGGCGGTTCTTCGTATGTGCTTTC | |
TAAGAAGGTGAGGCTTAAACTTTTCCCTTATTTGCCCGCGAACGACACCGCAGTGTGAAG | |
CCAGCAGTCAGGGTCTCGACCTGTTGACGAGCCACGTAGTGATTTATACCAGTGGTGCTC | |
TCGGTTGTTGTGACTAGAACGCTGTAAACGACGGCCCTTTAGGGGGATCACGGGCACGCT | |
TGCGTAGGGACACTATATGGAAAGGTCACGTAGACGTCTCGCCGCCGAGCTTACAATATC | |
TATGGACTCGATTTCTGAGGGGGTTTCACATCCGGGTTTTGTGCCGAGCTCTTCTCATCA | |
TTCTATCCTTACAACCCAACCATACCATAACCTCGGTTGTAAGAACTTTATAAACACGGT | |
AGCCCCAGCTTTTACTGGCCAACGTCTCACTCTCTGGCACGTAGTGTGTACTAGGCCGGA | |
TCCGCCATTGGATGTCTAAAGGAGTGTCTCCTAAGCCCTGTGTGACTTAGTCGAACTGGT | |
CGAAGGTAATTATACAAGGAGCATCTGATTCC | |
>Rosalind_4630 | |
CGCGGAGGTCAGAGGGGTTGGTGCGTTCCCGGGTGGGCATAGGGCGCGCACCACGGTTAG | |
GGGAACATGGCAGGAATACTGTTATCTAATGACGCAATCGTGCGTGCCTGGCTTACACGT | |
CTTCGTTGGCATGGCCATAAACCTCCACGTACGTCCTAGAGTCTTTAACTTAAGTGCTAA | |
CGACCTACCAAAGTCCAATAAGACCAGCAGGAATATGGGACTATAGTATACAACTGATCG | |
GACACTTGAGCATATTAGGGAAATAAGATCAGGGCGGAGGGACCTAGCAAGCCTGGCTGA | |
ACCCCAAGTGGCTTGCAAACCACTCGCCCATCCACGGGCACAATCACGTCAAGGAACTCG | |
GGGGTGTCTTCGTGGCGTATGGTTGCGTCAGAGTACAAAATGCTAGCCCTTAGGCTACTA | |
AATACTTACCACCCTACTTGGCCTACATGACGTGACCGACCTATCAGTGAAGTAGTAGAA | |
TTTACCCGAATCACACCTGGCAAATGAAAGGTTGGAACTCATCGGCGTGGTTCCGCCCCC | |
AAGGGGTTATTTCTGTAAGCACCGAAAGACTCAAGTTATAAGTAGGAACTGCGTAGACAC | |
AGGCGCAAGAGGATAACGTACACCAAACATCCCATGTTCTACGTTAGGATAATTTGGGTC | |
TCAACGCCCCGGAGATCTGTCATCTGTAATGCAATGTATGATGTTTAGACGTTCAGCCGA | |
GATGGGTTTGGCATCTAAACGTAACTGGTAAGTACTACAAATGGCTGCGATAGCTTACTC | |
TCGACCCACTCTCAGAATCCCCGAGATCCGGACATTTCTTTTCTTGGTCACTTATATTGT | |
GCCACAAGAAAGCTTTCACGAGTAAGAAATGTTAAAAGGTGTATACTCGTCGGGGCCGGA | |
AAATCATGATGGGACGTAGATCCACATTTGTC | |
>Rosalind_6816 | |
AACCAGGGTCAGGTACCGGGCGTAGGGGTGAAATCGGCTTCTGGAGATCACCTCGTATTT | |
AGCATGCAGCTTATTGGGCGCGCTGCTGTGTACCGCGCCGCGGGGATATTGTAATATAAG | |
GCATTGCAGGTAGCGGTCGGGGCGATGCACTCAACGATCCCCCACCATTCAGATGAGCGG | |
ATGTTAAGTTGTCATCCGTATCACGCATGGGGCACCTCGTGGAACGACTGTCGACACTTT | |
TTTTATAGGTAGGCCATGATCGGTCGTAGCTCCGACATAGGCGCCAATCTCTCTATTCAC | |
CTATTCGACCCGCGCCAAATAAGTCGGGGTGCAAGTGAGCCCTGAAGACTGGGGTGCTCC | |
GATCCTGCCTGTGCAACTCCCTCGCATTTTGCGCCGCTCACGATGATGCAGTCTACCCTG | |
GTGTAGTCTACTCCCGGGCATGCTTCCTCGCCCCCTATACGGACCCGTAGCGGTGGCACG | |
GGGCGGAATCAAGCAGTGTTAGTGACACACAGTGGCGCCCATGCATCAGTCCTCGGTGAC | |
AGCGATGGTGTCGCTGCGAGCCTTCCATCATCAACTCTACGATATAGCGGGATAAACAAA | |
CTTTTGGGTTACGCGCTATATTCAGACTCAGCGCAGGATCGGGATGATCGATAAGATTAG | |
CGCGGGCTTACGTCGCTTTCATGCGGGGTACTCCGCAGTCTATGCGAATGCATTCCACAT | |
GAACGGTGTTTTTTCAAATGAGTCCTCATTGCCACGCTGCGCACAGATGCCGTCATAGCG | |
AATACCAGCTTGCAGCTTACTCCGGGTGTTATCGGCTCAACGGGCGGTTTCGACGGTCTC | |
CCGAAATGTCATTCCATGGCTTAGCTCTAAATTCACGGTTGCATTTTGCATCCTATAGTG | |
CGTTGACAACATGAACCATTTCACCGCTTAAA | |
>Rosalind_6912 | |
TGCGTCCAGCCCTGATCAGTGGTCCCGTTATTTAGTTGCGCGGACGTCTACGTCAGCGAC | |
ACGATTGCAAGAATCTGTATCACTCCGGTAAGCTTCAACTCCACGCTTCGAGTATGCTCG | |
TCTACTGGCAGGGTGAGATATAATAATCAGTAAAGATTGTCCTCACACGACTGTATCATA | |
CAACTAGCATGATAGGTGATGCCACGGCGGCTCTGATGTCCCAGTGGAACCACTTATAAA | |
CGCAAGAGGCGGGACGCGCCGATGTCTTAGACCTATGGGCTCCTCTAACGTAAAAAGTGA | |
TGGGATAGTGGCTTCGTCAATGATCATTACGGATCCGTACATCCGCGCAAGGCTCTTCCG | |
GATAGTGCCCTGGGAGGAGTCTGTTAGTTGGAAATTTGCTCGCCATGTTAGTGGGGTCAC | |
GCTTACAGCTACGGTGGGAACCGTTTGATGATCAATCTACGAGATAGCCTTCGCTGTCCA | |
TTGGTGTTCATTTGTACAACGGGCGTTGGGCGCGTAGCAAAAACCAGACTTAGAGCCTAA | |
TTCATGAGGCCATTATAATGACTACAGGCGTTCTTTAACCGGTCTGATTACGACAGCTAG | |
TGTAGGACACGCAAATTTCCTCATAAAGGAGGGACGTGAACCAAGCTGAGGCCGTGCAAA | |
TCCCACTGGGGGACTCAGGCTACGGACAGGGGCATATACTCGTGGCTGTCATAGAAACGG | |
GGAAGTTCGTTTGCATCGCAAGAGCTCACGCTACCAGTTCGGTGGGGAGGTGACCGGTTA | |
AGATGGTAGACGGGGGAACGCACTAAGCCGGCCGTGTAGGGCTACCGATTTAACTACCTC | |
AGATTAATTAGTTCGTCGGTTCTAGTTCGGTTTCCAAGGTTTTAGCGGAGTGACCAGACA | |
TAGTCCGGGCACGCATCTTGCGGTTTAAACAC | |
>Rosalind_7593 | |
TATCAATGGCCGTGTGGATATCAGACTGTCTCCCGGCCCCCACCGACGCATTCGCGTCGC | |
CCCGTTATGCTACAGTAATGACAGCGATGATCCCCTGAAATCTTGACACAGTGTTGAGCC | |
GAAACTACGTCTATGCGCGTGTCGCCCTTACACCTACACCTAGAGGCTTCCTTCGTGATA | |
CGCGCATTTGCGCCGCGATCTACCTCTCATCTGGTGCCATTTTCGTCGCCTTCTGCTGAA | |
GAGAGCTTGCCGGACCGAAGGTTTTGAACCAAGGAGCAGAAGGCTTGCGCAGAGACTAGT | |
GTACTTTGTACGGGCAAGGGACTTCGATGTAAGGACATTGACATTCATAGAATTCATGGT | |
CCAACGCAATGTGATCTAGAGGCCAGCATGGAACGTGCAACACGTGCACAGTGTCTTAGA | |
GCAATCCGATTAGTTAGACCGTAGCGATTCGATGGGCCGCGCGGTTCACGGCTATGGCAC | |
ATTCTATACTAGAAGTAAGTCCTGTTAATCAAGAGGATCGATATTGGGCACAAAAATTAA | |
CACAGTGAGACAAGGGACCAAGGGTTGCGTAGAATTTACAACGTAGTCGCGTAGTATCAC | |
CCTATCCGTGCAATTCTGGCGCGGCTCGTTTCCGACCAGTGGCTAGCACTATTACAGCCG | |
ACGCGCCGGAGTTGTCTAGCGTCCTGCCAGGCATAATTGAGGGTGTAACCCAATTTACGA | |
ATTGCGCAAATGTACGGCCACTATTAGCTAGCTAATCTGATGTCGGCGCCCACACGCCAT | |
AAGACGCTACACACGAGCAACGCCCAAAGAGCATAGCCGCTCCATCTGCATAGCATCGCC | |
GGTACAACATTATCGAACACGCGATATCCCCCATTAATCTGCGTAGGCATTACTCCACTA | |
TCAAGAGCGTCTCCAGGTTGACTTGTCGTATT | |
>Rosalind_1598 | |
ACGAACATGGAGGTACCATAGCCGGCGAGCATTATCACCAAACAGTCGGCCCGCAGGGGG | |
GGGTATGAAGCATCCGAGTTAAGGCGCCTGGGCGTTACACTGTCACTCCATTGGTGTCTG | |
TCCTAGTGCCCCCAGTATCCGGGTAGTTGGAAGATCACGGCAGGGGAGCTACTGATCCTT | |
GGAACCTTGCTCATACAACATCGTCGACGAATATAGAAAGGGCAGACGGGGCTTGCAATG | |
CTGTTATTGCGTAGTTTGATAACATACAGCATTTCTGGAATGGCACCATACGAAACTCGC | |
AATTCTACTACAGCAGGGACGGACCAACAATCTAGCCACAGAGAAATAGCATTTTTACTG | |
ACCCCCTTCCTCAAAATTTTGTGACGGACAACGCTACCAGGAGCGAAACCCCTTGGTCCA | |
AGATGATCGGCTTGCCTAAGTTTTTAAGGCTAAGGATAGATTCGAGCTCATGGCAAGTCA | |
GGCATAAAACTCCATCTGACCGGCTTAAAATAAGAGTGTAATCACGTGCAATTGCCGGGT | |
TGGCTCTAAACATCCGCTGCTGTTGTCCCAGGGTATATGGTCCTCTCCGAACTCGAGTGA | |
ACCAGTAACCAAGAGCACCATGTCGCACTGAAACTAGGCCGCTTGGCGGCTAACCTCGAC | |
CGCCCTTATCCACATCGTCCTGGGCTACGTTTATCTGGCAAAATTGCTAATAAGCCTCGC | |
CATAGGCGTTCATCAAGTCAGCAAGTACCTCACGGGAATATGAGTTTTAACCTGCCTGAG | |
GCTAGTTCCTAACTATGCTGTCTTCCACAAGGAACAGTCCTAAGCCCATGTGCAGAAAGG | |
AGGACCTTGCACCGGTTCGGTTGGCCACTAAGAACAGTCAAAAGCCACCTCTCATGACTC | |
TACCAAGTATCCGGATAGCGTGCGCGACTGGG | |
>Rosalind_6195 | |
ATGCATTCGCGTTGTTTCTGCCGAGAATATACCTCCATTAAGATGCCGACTAGTAGGTAG | |
CGAAAGAGCCCAGCCGACGGGGGAATCCTGTTGAAATGTTAGTATCTAACTAATATTGGA | |
GAACAAGGTGCCTACTCGATGTCTTAAGTTGTTCAGTGTTTGGGTACATACAAGACAACC | |
AAGATATCTCTAGGTGTGGACCCAACGAACATTCACTTTGAAACATGAGGGGTCCTACCA | |
ACTCTCCTTCCTATACCTGACTTGTGCCGAGTCTCGATCGGCCTCATGGGCCTACTTCGT | |
AATCCAGCAAGACCTTGCAAGGAGCCGTTACAAGTGCGCACACGATCGAACGTAGACTTT | |
TGTACCTATGAGTAGCCACTGGCTACGGTCCTGCTGATCGGGCTCGACGGACGGAATGCT | |
GGGTCAACTATTCTTATGGGGGTGATACATGCTCTCGGTCAGCGACAACGTAAGGCCCAT | |
TACTCGGATGAAGGCTCTCCAAACGGAAGGATCTACTGGTGCCAAAGCCTGTAACTCGTC | |
TCGGTAGTGCCGTGTTGGACCGAGTTTTACTGGCCCGGACCACAAGTCAAAAGGAGTGTT | |
GTTATCAGCATACCGTGAGGGAGGCGCCGATCCCCCGGATTACGAGTGACCGTACTTAGC | |
TGTTGATGTATCAAATGGACAGGACCCTGCTGCCTGACTATTGCGGACGGCCCTCCCTAA | |
AATCAGACCTCCAGCATATCTTTGATCCGCTCCATGGCCGCCCGTCATGAGGAGGGTGGA | |
ACGAGTGATTTGTTCCTTATAAGATTCGTTATCGGAATGTGTCACAACTGGGTCCTGGGC | |
TAGTGGCAGCCGACCAGAGTTCGAGTGCGCTCACTAGTCTGTTAGTGGCGTGGGGAACAC | |
GAGCCGATTTGATGGAAGCACGAGACGATAGG |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
>Rosalind_1 | |
ATCCAGCT | |
>Rosalind_2 | |
GGGCAACT | |
>Rosalind_3 | |
ATGGATCT | |
>Rosalind_4 | |
AAGCAACC | |
>Rosalind_5 | |
TTGGAACT | |
>Rosalind_6 | |
ATGCCATT | |
>Rosalind_7 | |
ATGGCACT |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import sys | |
from collections import OrderedDict | |
# this is a "generator function" | |
# it'll create an iterable when invoked | |
def fastaseq_gen(filename):
    """ iterator yielding entire sequences from fasta-formatted input file

    Each record starts at a header line (a line beginning with '>') and runs
    until the next header or the end of the file.  The header text itself is
    discarded; the wrapped sequence lines of the record are stripped of
    surrounding whitespace and concatenated into one string.

    Blank lines are ignored wherever they appear.  A header that is not
    followed by any sequence line yields an empty string.  An empty file
    yields nothing.

    Raises ValueError if non-blank data appears before the first header
    (previously this crashed with an AttributeError on None).
    """
    # open the file with a context manager so it closes automatically
    with open(filename) as f:
        # None means "no header seen yet"; afterwards it is the list of
        # sequence lines collected for the record currently being read
        parts = None

        # iterating the file object reads it line by line until EOF
        for line in f:
            if line.startswith('>'):
                # a new header closes the previous record (if there is one)
                if parts is not None:
                    yield ''.join(parts)
                parts = []
                continue

            chunk = line.strip()
            if not chunk:
                # blank lines carry no sequence data
                continue

            if parts is None:
                raise ValueError(
                    'sequence data found before the first FASTA header '
                    'in {!r}'.format(filename)
                )
            parts.append(chunk)

        # the last record has no following header to flush it
        if parts is not None:
            yield ''.join(parts)
def get_profile_matrix(fastaseqs):
    """ convert fasta sequences into profile matrix

    fastaseqs is any iterable of strings made of the symbols A, C, G and T.
    The result is an OrderedDict keyed 'A', 'C', 'G', 'T' (in that order)
    whose values are lists of per-position counts, i.e. result['G'][3] is the
    number of sequences that have a 'G' at index 3.

    Empty sequences contribute nothing and are skipped.  If no non-empty
    sequence is supplied every row is an empty list.

    Raises ValueError if the non-empty sequences are not all the same
    length, and KeyError if a sequence contains a symbol other than
    A, C, G or T.
    """
    # an OrderedDict keeps the rows in A, C, G, T order for the output;
    # rows start out empty so an input without sequences is still usable
    profile_matrix = OrderedDict((symbol, []) for symbol in 'ACGT')

    # length of the first non-empty sequence, unknown until we see one
    expected_length = None

    for fastaseq in fastaseqs:
        if not fastaseq:
            continue

        if expected_length is None:
            # first real sequence: size every row, one fresh list per symbol
            # so that the rows never share state
            expected_length = len(fastaseq)
            for symbol in profile_matrix:
                profile_matrix[symbol] = [0] * expected_length
        elif len(fastaseq) != expected_length:
            # a profile is only defined for sequences of equal length
            raise ValueError(
                'all sequences must have the same length: expected {}, '
                'got {}'.format(expected_length, len(fastaseq))
            )

        # use enumerate to index the position of the chars in the string
        for index, char in enumerate(fastaseq):
            profile_matrix[char][index] += 1

    return profile_matrix
def get_concensus_string(profile_matrix):
    """ given a profile matrix return one of the consensus strings

    profile_matrix maps each symbol to a list of per-position counts, all
    rows having the same length.  For every position the symbol with the
    highest count is chosen; when several symbols tie, the one that comes
    last in the matrix's key order wins.

    Returns an empty string when the matrix has no rows, when the rows are
    empty, or when the rows are still None (a matrix that was never filled).
    """
    symbols = list(profile_matrix)
    rows = [profile_matrix[symbol] for symbol in symbols]

    # nothing was counted, so there is no consensus to report
    if not rows or any(row is None for row in rows):
        return ''

    result = []
    # zip(*rows) walks the matrix column by column, i.e. position by position
    for column in zip(*rows):
        # rank by (count, key position) so ties go to the later key
        best = max(range(len(column)), key=lambda i: (column[i], i))
        result.append(symbols[best])

    # return all the chars as a single string
    return ''.join(result)
# this check prevents the code from running when the file is imported as module | |
if __name__ == '__main__':
    # the input path is the first command-line argument; without it
    # sys.argv only holds the script name
    if len(sys.argv) < 2:
        # report the problem on stderr and exit with a failure status
        print('Missing filename', file=sys.stderr)
        sys.exit(1)
    filename = sys.argv[1]

    # stream the sequences straight from the file into the profile matrix
    profile_matrix = get_profile_matrix(fastaseq_gen(filename))

    # the consensus goes on the first line of the output
    print(get_concensus_string(profile_matrix))

    # followed by one "symbol: counts" line per row of the matrix
    for key in profile_matrix:
        counts = ' '.join(map(str, profile_matrix[key]))
        print('{}: {}'.format(key, counts))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment