Created
May 26, 2023 10:59
-
-
Save andrija-zikovic/81538555c0863a402d3a4f8a2b5a7d47 to your computer and use it in GitHub Desktop.
CS50/dna
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
AATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATGGTTAAAGCCAAGTGGAAGTTGACGAGCTACGGCACAGGTACCCTATACATACGGTAAATGAGTCGGAGGTTGTGGGTTTAAAGTAAGTCCCCGCTCAACATTCAGCAGACCCTCGAAGTGGGCCCTAAAATCGTGTTGCTAACGCTCCGGACCTGACCCCGAGCTTGGCTCCTAATTGTGTACTCTCTCCAACCAAGCAGCGTACCAACGCGGCAACCAGAGCGAAGCTGTACACGTCGATCATCGTTACGCCTCTACTCGATAGTCGTAGAAACTTGTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTGCGGTTGGTAGCTCTAACTGTCATCGTATTCGCGAATACCTCAGATATAAGCTCCAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAAGTGAATGCACGAGAGTGTTATAGCAGATATCCCCGCTGATCCGGCTGCCGAGGAGGTGGGCATGTGACGTTATGCACTACACAGCTACTACCAAGGTCTTCTGCGGGAAAGGATAGACAAACCGGCAACTCCGCGAGGTCGCGGACTTAGTATTGCGACGGCGTCCTAATCGGCTGGATTTGCGGTTTGTTGGCGTTAGTCCAAAGGTGCCGCTAATGTGGCCATATTTACGATCCACCCTATAGGGCTCCAGGTCGTTTTAAGTCGAGTCGTGTCTAGGGGCCATTCCTGGCCTTGAACGAAAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCGGCCTTAATGCTCAGATTCATATGCTGTGAGGCCGAGGGTGGCGTCATATCTTCGATGATGTTGAACATACGGTCCGGTATTTCGACTTGCCACCTGGTACTGCTTTAAAAGATGATACCATCAACAAAAGGGCACGGCGTGCCTCATGCAGGACGGGACGTTGCCTGCCTACAGCGCTCTACGTAGCAATGTCCGTCTTTCTTCATACACGTATGCTCCTAAAGAAATTGTAGTCTAACAGCTTCCAAACTGTAATCGCCGTTAGGTTCGTCTAAAGTAAAAATGATTGCAAGACGCAATCGAAGGAGCCATCGTTTCGAGGTGACTTCTAATATAACTACCTATGGTCATAGCATGCCCCAACATTGAACGAGGTAAGATCACGGGATCGACTGTCCTGGCGAGGGCCTTACGTTAGTCGTGTAATGCTCCGCGCGTCCCAAATATATGAAAGGCACGACACTCCCCACAATTTAACCCTCCCGCCAAATAAGTACCTAGCGGAGATAAGAATCTGGTCGGTCAGAAAAGGGTCTATGTCCTACAGAGTAGGGCGAAGTCCGCATACCGCAACAGTGCGGTGGCAAACGCTTTAATGACCAGGATCGTGCTAGGCAGTGGAATTTCATGTGGATTGGCCCGCGAATGGACAGGGAGCTATGTCTGAACTCTGTTGACGCTGAACTGTATCCGGAT
CGTCATGTGAATCGTAGCTATGGGAGTGGTGGTACTGTAAGTCAGGGCTACTTACTGCGGGGTATCTATCTATCTATCTATCTATCCTCACAGTTCATGATTATACGGATGTAATTTGCCGCTGGCTCACGATACGGCTATACAGCGTTGGCTCCTAACGTTGCCACCTACAGTCTGCACTTGGGCACTCGGTATGGTATAAAATATATGACGGCAGACGTTGCGATAAGTAAAAGATCGAACAATCTCGCAGCAAATCTTAAAGCGCATCTAACATCGGGCGTGCGAATGGACCGTTCCGAGGGACACTAGTCGAGCCCCTCTTACAGCTCACAGGTAAATCGATTATCGTACGTAAGTCAAGTCGGCACTGCTTTACGGCAGGTAGTAATGGCTGCGTGCTGCGCAGACCTTCTGCCCCTCAGTTAGTCACGGCCACTAGCCCGGGAAAATATAGTTCGGACAGAAAAATCAGTACCCAGCACCCAACTAAAACAAGTTCTATTCCGAGACGCCTGCGGAGAGCCTCACTCGTTATAACTATGTACGGCGGATGGGGGTAGGGTATAAAGGGCATGCGTCTACACCGATTTCCTGGTTAATGATAATCTAGTTCTTAAAGCACTACTAGGCGCTGCGAATAGGGGTATTGGGCAATAGGCCCTGAATTAACCTTGTTTAGGGTTAGCCTATGCAGCGACCGTAGTACAATAATATCTATAAACGGGTACTCTCCAGACGTATTCATTAACTTCTCAATGAGGAACTATCTACAAAATCAATGAGTGATAACAGCGCATATGAAAAGTATGCAGTTGTTTCAAGCTGTTAACGGCCATTTCCACGAACGTGTTCACAGAGTAGAAGAAACGTAAAGCGTTACTCATCTCCGATACGGTGCGTGCGATGGGGCGTATTGCTTGTAATGTCGAGGGACGGGCATTGAAAAGAGTGCCACAGCATATCGGAGCAATTCACTAGTGAGCGTACCTTGATAAAGCAAAAGGATTACCTATTTTGCACACGTGTGCTAACCCCCAAGACCTGTTGAAACCGCCGAGCATCCGCCAATTTCTAGCACAACATTTCCATCTGCAACTAGCCGTAGAGCACTCAGGAATTTGATCTTAACATGATCGTGGAGGCAAGAAAAAAGGATGCAACAGCACCTTAGAGCACGAGATCATTCCTGGTTAATATTATGCTGTACGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGATCGCCATCATTAAGTACTTATATCTGCATAGAACATTAAGCGAGACGTTTGTGAGATATTCCCCTCTGGGCCCTTAGCTTCGCAGTTCCTCAGCGCCCTAAGATAAACGGGTGTAGCAGAAGAATCGGCGTGCTTTTTACAAGTCCTGCCGGCGATTCAGCATCAATTATAAACGGCCCCTAATAGAAATAGGGCGGCAGGAGTCAATTGGTATCGTTTTGGAGCCATTCACCGCCAAGGGTCAGATAACCCGGCATTCACTGCTGTATTCCCGGATTAACGGATCTCGGATCCAATGGCCCTCTGTGCCGATCTAATACTGCACGCTTAGTGGGCGGGATCAGATGAATGGCACCTCAGCCCCCCGAATTCAGTTGCTGGCCAGACGAGGGCGGGGACTGTTTGGAATTATTTGCTCAGTCCTTTTATCATCCCGATGCTATGACTCAATCCTCTAGATCCTTGGATGTCTCAGGAAATCTCACACATCATAGTCAACAAGAAACGAGACAAACTCGACTTGAGACTTCATCGCCTACAGTGTTTTATTGTAACGGGCACCTCTATATGTCGTCTTGATGGCATCAACAGCGCATGGTGATACATCGCTAGCGGCATTAGGCTTGATTGGTGCTTGCCGGGCGGGAG
GCCATTTGGAGAGAGGCAGACTATCGTGGCATGCCGTAGCGCTTTGCATGCAGGTGGCGCGACCGTAAGGAGTGCAAGATGTAGATTGTCACGCTAAAGTTTATCACGTGATACTAGCTGACGTGTCCATAAGGCACGCAACAGCCTGCTCTAGGTTACTGTAGGGCTTGGCGATAGCATAGATAGGCCTGAGGGAGTTCTGGCGTAATAGTTGTTAGATAAAGCTGCCCAAATCCAACAGCTGGATTTCATGTGTGTTTGATAGCGCAATGCACTCATACTCAGTCCTTGCCAGCATGCTGTCACACGATGTACATCGTTAGCCCTAAGAGCCCCGTCGAGTAGCTAGTAAGCCTCATGAATGATACTCGGGGCCTCCCGACATAGACGCAGCTTGAGTGTCGGACGAGTATAAGCCATCCCAATGATTTGCCACTTAGAGAGTAGCGCCGTTTGGGATTGAGTCGAAGAGCGTGGCCTTAGACCACATATGATTTGCTTGCGCCTCCGTATCGCTTGCATTTGAGATGGAGCCTCATTTCTCTACCATCGCCGACTAGCAAGTTACCGATGGACAAGCCTAGCTTGTGTACTTTGAGAGTGGCTTCGTCACCAAAGGGTAGCCATAACCTCAATGGCTGTGATCTCTTACCCCCGGGGTCGGGCGAGATCTGGGCGAGAAGACTGCACGAGCCCTAGAAACTGCAAGTGGCACGGCTTCTTGTCCCATAGGCTATTGAGGGCATTGTTGAGTCGAAGTTTCTCCTAAAAATGTGAACATAGTTTCCCGCTCAGAGATACTCGCTTAAAACTCATACCATGGATGGCTGGAATGGACAAGCGGTATTCGTGCTGTGTAGGGATCCGCGTTGGTCTATTAACCACTGAGCGGATGCGGATTAAAGGGACAGACGATTACACGCCACGGAAGTCCTCGTCTGTGACGGGTCCCTCGCGTCTCCCCCAGAGGACCTTCATTCCCCGGTGGAGCGTCCATACGGTCTAGCTTGTACGCTTCGGGGTCGGGTATCGGACTGACCTATACGACAGACATATCCTAGAGAGGCCTAGATGGACCGGGAGCACGCGAGGGCAAACTCCCTCGCTATCCCACTTCGATTTCCCGGGGAGGGCGGCGTTTTAACACGTAAGGCACGTCTATTAGATGAGCTTATATATATGCGAACTTTGATCCAATTGGCACAGAACGTCAATTAAGAAAAATAATACGGAGATAGTGCCGCAATTGTCCATTTATACGCACCCTCTTTCTAGTATCTAACGTTCTTGGTACGCGGTCCACTAGACCCGACTCATAGCGTTATAATTTCCTGGTATCTATTAAATCGTCGGCCGTCTTTTCCACTAGTAACCTGCTCTTAGGCCGCAGGCACGGGCGTACGATACCCCCCGTACGGTGTAACATCAGTGCGAAGTAAATACGGGGCCAGCGTGTAGACGATAGTCATGTTAGCTGGAAGGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATATTCTGAGTATGCCGATCCAGGTTTGGCAGCAACGGAAAATATCTTCTACTTGGGCCCCTATAACGAAATGTCTGCCTAACCACCTTTTTTCTGGACCCTCAACATGCCAGTTAACCCCGCGCGGGAAAAGCGTCTGGCGCGGGCGTCGGGATATACTGACCAGTAGAGCACTGATTAAAGTATTTGTGGTTAAAAATTCACAACGTATTCCATGCGGGACACCGACACGCACGTCAGTTGCTCGCAGGTGATGGTAGAGGGGTGGATCGACCGAGGTCGGGTTGGTGGGTAAAGGTTAGCCTGCACCACGCGAATGTGCTCCATTCAATTTTGGGGGTGCGATTCTCCGTTGCGGGATCCAAGAGGAGTTAAGATGGCCTTGTCCAGTTGAAACTTGGCTGTGGCATGGGCGACAAGATAAAAGGGTTATTACTGATCTAGTCTAGTCTA
GTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGCACTGAGGTCTAGTACGTACGATGAGTGAGCATCGTTATTGGAAAAAGTCATGAACCGG |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name,AGATC,TTTTTTCT,AATG,TCTAG,GATA,TATC,GAAA,TCTG | |
Albus,15,49,38,5,14,44,14,12 | |
Cedric,31,21,41,28,30,9,36,44 | |
Draco,9,13,8,26,15,25,41,39 | |
Fred,37,40,10,6,5,10,28,8 | |
Ginny,37,47,10,23,5,48,28,23 | |
Hagrid,25,38,45,49,39,18,42,30 | |
Harry,46,49,48,29,15,5,28,40 | |
Hermione,43,31,18,25,26,47,31,36 | |
James,46,41,38,29,15,5,48,22 | |
Kingsley,7,11,18,33,39,31,23,14 | |
Lavender,22,33,43,12,26,18,47,41 | |
Lily,42,47,48,18,35,46,48,50 | |
Lucius,9,13,33,26,45,11,36,39 | |
Luna,18,23,35,13,11,19,14,24 | |
Minerva,17,49,18,7,6,18,17,30 | |
Neville,14,44,28,27,19,7,25,20 | |
Petunia,29,29,40,31,45,20,40,35 | |
Remus,6,18,5,42,39,28,44,22 | |
Ron,37,47,13,25,17,6,13,35 | |
Severus,29,27,32,41,6,27,8,34 | |
Sirius,31,11,28,26,35,19,33,6 | |
Vernon,26,45,34,50,44,30,32,28 | |
Zacharias,29,50,18,23,38,24,22,9 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name,AGATC,AATG,TATC | |
Alice,2,8,3 | |
Bob,4,1,5 | |
Charlie,3,2,5 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import sys | |
def main():
    """Identify whose STR profile matches a DNA sequence.

    Expects exactly two command-line arguments: the path of a CSV database
    and the path of a text file holding a DNA sequence.  The first CSV
    column is treated as the person's identifier; every remaining column
    header is an STR, and each cell is that person's longest-run count.

    Prints the ``name`` field of the first profile whose counts all equal
    the counts measured in the sequence and exits with status 0; otherwise
    prints "No match" and exits with status 1.

    Raises:
        SystemExit: always -- with the usage message when the argument
            count is wrong, otherwise with status 0 (match) or 1 (no match).
    """
    # Check for command-line usage
    if len(sys.argv) != 3:
        sys.exit("Usage: python dna.py databases.csv sequences.txt")

    # Read database file into a variable.  newline="" is what the csv
    # module documents for files handed to a reader.
    with open(sys.argv[1], "r", newline="") as database_file:
        reader = csv.DictReader(database_file)
        # fieldnames is None for an empty file; treat that as "no STRs".
        strs = (reader.fieldnames or [])[1:]
        profiles = list(reader)

    # Read DNA sequence file into a variable (only the first line is used)
    with open(sys.argv[2], "r") as sequence_file:
        sequence = sequence_file.readline().strip()

    # Find longest match of each STR in DNA sequence
    seq_str_count = {
        str_name: longest_match(sequence, str_name) for str_name in strs
    }

    # Check database for matching profiles
    for profile in profiles:
        if all(
            int(profile[str_name]) == seq_str_count[str_name]
            for str_name in strs
        ):
            print(profile['name'])
            sys.exit(0)

    print("No match")
    sys.exit(1)
def longest_match(sequence, subsequence):
    """Return the length of the longest run of subsequence in sequence.

    A "run" is a number of back-to-back, non-overlapping copies of
    ``subsequence``; the result is the largest such number found starting
    at any position of ``sequence``.

    Args:
        sequence: the sliceable sequence (e.g. a string) to search in.
        subsequence: the pattern whose consecutive repeats are counted.

    Returns:
        The maximum count of consecutive repeats, or 0 when ``subsequence``
        is empty or never occurs.
    """
    subsequence_length = len(subsequence)

    # An empty pattern matches every empty slice, which would make the
    # inner loop below spin forever; define its longest run as 0.
    if subsequence_length == 0:
        return 0

    longest_run = 0

    # Only start positions that leave room for one full copy can match;
    # the range is empty when sequence is shorter than subsequence.
    for i in range(len(sequence) - subsequence_length + 1):
        # Count consecutive copies of subsequence starting at position i
        count = 0
        start = i

        # Keep sliding the window forward by one subsequence length for
        # as long as the window still equals subsequence
        while sequence[start:start + subsequence_length] == subsequence:
            count += 1
            start += subsequence_length

        # Update most consecutive matches found
        longest_run = max(longest_run, count)

    # After checking for runs at each character in sequence, return longest run found
    return longest_run
# Run the command-line program only when executed as a script, so that
# importing this module (e.g. to reuse longest_match) has no side effects.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment