Skip to content

Instantly share code, notes, and snippets.

@andrija-zikovic
Created May 26, 2023 10:59
Show Gist options
  • Save andrija-zikovic/81538555c0863a402d3a4f8a2b5a7d47 to your computer and use it in GitHub Desktop.
Save andrija-zikovic/81538555c0863a402d3a4f8a2b5a7d47 to your computer and use it in GitHub Desktop.
CS50/dna
AATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATGGTTAAAGCCAAGTGGAAGTTGACGAGCTACGGCACAGGTACCCTATACATACGGTAAATGAGTCGGAGGTTGTGGGTTTAAAGTAAGTCCCCGCTCAACATTCAGCAGACCCTCGAAGTGGGCCCTAAAATCGTGTTGCTAACGCTCCGGACCTGACCCCGAGCTTGGCTCCTAATTGTGTACTCTCTCCAACCAAGCAGCGTACCAACGCGGCAACCAGAGCGAAGCTGTACACGTCGATCATCGTTACGCCTCTACTCGATAGTCGTAGAAACTTGTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTGCGGTTGGTAGCTCTAACTGTCATCGTATTCGCGAATACCTCAGATATAAGCTCCAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAAGTGAATGCACGAGAGTGTTATAGCAGATATCCCCGCTGATCCGGCTGCCGAGGAGGTGGGCATGTGACGTTATGCACTACACAGCTACTACCAAGGTCTTCTGCGGGAAAGGATAGACAAACCGGCAACTCCGCGAGGTCGCGGACTTAGTATTGCGACGGCGTCCTAATCGGCTGGATTTGCGGTTTGTTGGCGTTAGTCCAAAGGTGCCGCTAATGTGGCCATATTTACGATCCACCCTATAGGGCTCCAGGTCGTTTTAAGTCGAGTCGTGTCTAGGGGCCATTCCTGGCCTTGAACGAAAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCGGCCTTAATGCTCAGATTCATATGCTGTGAGGCCGAGGGTGGCGTCATATCTTCGATGATGTTGAACATACGGTCCGGTATTTCGACTTGCCACCTGGTACTGCTTTAAAAGATGATACCATCAACAAAAGGGCACGGCGTGCCTCATGCAGGACGGGACGTTGCCTGCCTACAGCGCTCTACGTAGCAATGTCCGTCTTTCTTCATACACGTATGCTCCTAAAGAAATTGTAGTCTAACAGCTTCCAAACTGTAATCGCCGTTAGGTTCGTCTAAAGTAAAAATGATTGCAAGACGCAATCGAAGGAGCCATCGTTTCGAGGTGACTTCTAATATAACTACCTATGGTCATAGCATGCCCCAACATTGAACGAGGTAAGATCACGGGATCGACTGTCCTGGCGAGGGCCTTACGTTAGTCGTGTAATGCTCCGCGCGTCCCAAATATATGAAAGGCACGACACTCCCCACAATTTAACCCTCCCGCCAAATAAGTACCTAGCGGAGATAAGAATCTGGTCGGTCAGAAAAGGGTCTATGTCCTACAGAGTAGGGCGAAGTCCGCATACCGCAACAGTGCGGTGGCAAACGCTTTAATGACCAGGATCGTGCTAGGCAGTGGAATTTCATGTGGATTGGCCCGCGAATGGACAGGGAGCTATGTCTGAACTCTGTTGACGCTGAACTGTATCCGGAT
CGTCATGTGAATCGTAGCTATGGGAGTGGTGGTACTGTAAGTCAGGGCTACTTACTGCGGGGTATCTATCTATCTATCTATCTATCCTCACAGTTCATGATTATACGGATGTAATTTGCCGCTGGCTCACGATACGGCTATACAGCGTTGGCTCCTAACGTTGCCACCTACAGTCTGCACTTGGGCACTCGGTATGGTATAAAATATATGACGGCAGACGTTGCGATAAGTAAAAGATCGAACAATCTCGCAGCAAATCTTAAAGCGCATCTAACATCGGGCGTGCGAATGGACCGTTCCGAGGGACACTAGTCGAGCCCCTCTTACAGCTCACAGGTAAATCGATTATCGTACGTAAGTCAAGTCGGCACTGCTTTACGGCAGGTAGTAATGGCTGCGTGCTGCGCAGACCTTCTGCCCCTCAGTTAGTCACGGCCACTAGCCCGGGAAAATATAGTTCGGACAGAAAAATCAGTACCCAGCACCCAACTAAAACAAGTTCTATTCCGAGACGCCTGCGGAGAGCCTCACTCGTTATAACTATGTACGGCGGATGGGGGTAGGGTATAAAGGGCATGCGTCTACACCGATTTCCTGGTTAATGATAATCTAGTTCTTAAAGCACTACTAGGCGCTGCGAATAGGGGTATTGGGCAATAGGCCCTGAATTAACCTTGTTTAGGGTTAGCCTATGCAGCGACCGTAGTACAATAATATCTATAAACGGGTACTCTCCAGACGTATTCATTAACTTCTCAATGAGGAACTATCTACAAAATCAATGAGTGATAACAGCGCATATGAAAAGTATGCAGTTGTTTCAAGCTGTTAACGGCCATTTCCACGAACGTGTTCACAGAGTAGAAGAAACGTAAAGCGTTACTCATCTCCGATACGGTGCGTGCGATGGGGCGTATTGCTTGTAATGTCGAGGGACGGGCATTGAAAAGAGTGCCACAGCATATCGGAGCAATTCACTAGTGAGCGTACCTTGATAAAGCAAAAGGATTACCTATTTTGCACACGTGTGCTAACCCCCAAGACCTGTTGAAACCGCCGAGCATCCGCCAATTTCTAGCACAACATTTCCATCTGCAACTAGCCGTAGAGCACTCAGGAATTTGATCTTAACATGATCGTGGAGGCAAGAAAAAAGGATGCAACAGCACCTTAGAGCACGAGATCATTCCTGGTTAATATTATGCTGTACGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGTCTGATCGCCATCATTAAGTACTTATATCTGCATAGAACATTAAGCGAGACGTTTGTGAGATATTCCCCTCTGGGCCCTTAGCTTCGCAGTTCCTCAGCGCCCTAAGATAAACGGGTGTAGCAGAAGAATCGGCGTGCTTTTTACAAGTCCTGCCGGCGATTCAGCATCAATTATAAACGGCCCCTAATAGAAATAGGGCGGCAGGAGTCAATTGGTATCGTTTTGGAGCCATTCACCGCCAAGGGTCAGATAACCCGGCATTCACTGCTGTATTCCCGGATTAACGGATCTCGGATCCAATGGCCCTCTGTGCCGATCTAATACTGCACGCTTAGTGGGCGGGATCAGATGAATGGCACCTCAGCCCCCCGAATTCAGTTGCTGGCCAGACGAGGGCGGGGACTGTTTGGAATTATTTGCTCAGTCCTTTTATCATCCCGATGCTATGACTCAATCCTCTAGATCCTTGGATGTCTCAGGAAATCTCACACATCATAGTCAACAAGAAACGAGACAAACTCGACTTGAGACTTCATCGCCTACAGTGTTTTATTGTAACGGGCACCTCTATATGTCGTCTTGATGGCATCAACAGCGCATGGTGATACATCGCTAGCGGCATTAGGCTTGATTGGTGCTTGCCGGGCGGGAG
GCCATTTGGAGAGAGGCAGACTATCGTGGCATGCCGTAGCGCTTTGCATGCAGGTGGCGCGACCGTAAGGAGTGCAAGATGTAGATTGTCACGCTAAAGTTTATCACGTGATACTAGCTGACGTGTCCATAAGGCACGCAACAGCCTGCTCTAGGTTACTGTAGGGCTTGGCGATAGCATAGATAGGCCTGAGGGAGTTCTGGCGTAATAGTTGTTAGATAAAGCTGCCCAAATCCAACAGCTGGATTTCATGTGTGTTTGATAGCGCAATGCACTCATACTCAGTCCTTGCCAGCATGCTGTCACACGATGTACATCGTTAGCCCTAAGAGCCCCGTCGAGTAGCTAGTAAGCCTCATGAATGATACTCGGGGCCTCCCGACATAGACGCAGCTTGAGTGTCGGACGAGTATAAGCCATCCCAATGATTTGCCACTTAGAGAGTAGCGCCGTTTGGGATTGAGTCGAAGAGCGTGGCCTTAGACCACATATGATTTGCTTGCGCCTCCGTATCGCTTGCATTTGAGATGGAGCCTCATTTCTCTACCATCGCCGACTAGCAAGTTACCGATGGACAAGCCTAGCTTGTGTACTTTGAGAGTGGCTTCGTCACCAAAGGGTAGCCATAACCTCAATGGCTGTGATCTCTTACCCCCGGGGTCGGGCGAGATCTGGGCGAGAAGACTGCACGAGCCCTAGAAACTGCAAGTGGCACGGCTTCTTGTCCCATAGGCTATTGAGGGCATTGTTGAGTCGAAGTTTCTCCTAAAAATGTGAACATAGTTTCCCGCTCAGAGATACTCGCTTAAAACTCATACCATGGATGGCTGGAATGGACAAGCGGTATTCGTGCTGTGTAGGGATCCGCGTTGGTCTATTAACCACTGAGCGGATGCGGATTAAAGGGACAGACGATTACACGCCACGGAAGTCCTCGTCTGTGACGGGTCCCTCGCGTCTCCCCCAGAGGACCTTCATTCCCCGGTGGAGCGTCCATACGGTCTAGCTTGTACGCTTCGGGGTCGGGTATCGGACTGACCTATACGACAGACATATCCTAGAGAGGCCTAGATGGACCGGGAGCACGCGAGGGCAAACTCCCTCGCTATCCCACTTCGATTTCCCGGGGAGGGCGGCGTTTTAACACGTAAGGCACGTCTATTAGATGAGCTTATATATATGCGAACTTTGATCCAATTGGCACAGAACGTCAATTAAGAAAAATAATACGGAGATAGTGCCGCAATTGTCCATTTATACGCACCCTCTTTCTAGTATCTAACGTTCTTGGTACGCGGTCCACTAGACCCGACTCATAGCGTTATAATTTCCTGGTATCTATTAAATCGTCGGCCGTCTTTTCCACTAGTAACCTGCTCTTAGGCCGCAGGCACGGGCGTACGATACCCCCCGTACGGTGTAACATCAGTGCGAAGTAAATACGGGGCCAGCGTGTAGACGATAGTCATGTTAGCTGGAAGGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATATTCTGAGTATGCCGATCCAGGTTTGGCAGCAACGGAAAATATCTTCTACTTGGGCCCCTATAACGAAATGTCTGCCTAACCACCTTTTTTCTGGACCCTCAACATGCCAGTTAACCCCGCGCGGGAAAAGCGTCTGGCGCGGGCGTCGGGATATACTGACCAGTAGAGCACTGATTAAAGTATTTGTGGTTAAAAATTCACAACGTATTCCATGCGGGACACCGACACGCACGTCAGTTGCTCGCAGGTGATGGTAGAGGGGTGGATCGACCGAGGTCGGGTTGGTGGGTAAAGGTTAGCCTGCACCACGCGAATGTGCTCCATTCAATTTTGGGGGTGCGATTCTCCGTTGCGGGATCCAAGAGGAGTTAAGATGGCCTTGTCCAGTTGAAACTTGGCTGTGGCATGGGCGACAAGATAAAAGGGTTATTACTGATCTAGTCTAGTCTA
GTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGCACTGAGGTCTAGTACGTACGATGAGTGAGCATCGTTATTGGAAAAAGTCATGAACCGG
name,AGATC,TTTTTTCT,AATG,TCTAG,GATA,TATC,GAAA,TCTG
Albus,15,49,38,5,14,44,14,12
Cedric,31,21,41,28,30,9,36,44
Draco,9,13,8,26,15,25,41,39
Fred,37,40,10,6,5,10,28,8
Ginny,37,47,10,23,5,48,28,23
Hagrid,25,38,45,49,39,18,42,30
Harry,46,49,48,29,15,5,28,40
Hermione,43,31,18,25,26,47,31,36
James,46,41,38,29,15,5,48,22
Kingsley,7,11,18,33,39,31,23,14
Lavender,22,33,43,12,26,18,47,41
Lily,42,47,48,18,35,46,48,50
Lucius,9,13,33,26,45,11,36,39
Luna,18,23,35,13,11,19,14,24
Minerva,17,49,18,7,6,18,17,30
Neville,14,44,28,27,19,7,25,20
Petunia,29,29,40,31,45,20,40,35
Remus,6,18,5,42,39,28,44,22
Ron,37,47,13,25,17,6,13,35
Severus,29,27,32,41,6,27,8,34
Sirius,31,11,28,26,35,19,33,6
Vernon,26,45,34,50,44,30,32,28
Zacharias,29,50,18,23,38,24,22,9
name,AGATC,AATG,TATC
Alice,2,8,3
Bob,4,1,5
Charlie,3,2,5
import csv
import sys
def main():
    """Print the name of the person whose STR profile matches a DNA sequence.

    Reads two command-line arguments:

    1. the path to a CSV database whose first column is ``name`` and whose
       remaining columns are STR (short tandem repeat) patterns, each cell
       holding that person's repeat count for the pattern;
    2. the path to a text file whose first line is the DNA sequence.

    For every STR in the database header the longest consecutive run in the
    sequence is computed with ``longest_match``. The first profile whose
    counts all equal those runs has its name printed and the process exits
    with status 0. If no profile matches, "No match" is printed and the
    process exits with status 1.

    Raises:
        SystemExit: Always. Carries the usage message when the argument
            count is wrong, otherwise the status code 0 or 1.
    """
    if len(sys.argv) != 3:
        sys.exit("Usage: python dna.py databases.csv sequences.txt")

    database_path, sequence_path = sys.argv[1], sys.argv[2]

    # newline="" lets the csv module do its own line-ending handling.
    with open(database_path, "r", newline="") as database_file:
        reader = csv.DictReader(database_file)
        # fieldnames is None for an empty file; treat that as "no STRs".
        # The first column is the person's name, the rest are STR patterns.
        strs = (reader.fieldnames or [])[1:]
        profiles = list(reader)

    # Only the first line of the sequence file is used.
    with open(sequence_path, "r") as sequence_file:
        sequence = sequence_file.readline()

    # Longest consecutive run of each STR within the sequence.
    seq_str_count = {str_: longest_match(sequence, str_) for str_ in strs}

    for profile in profiles:
        # A profile matches only if every STR count agrees with the sequence.
        if all(int(profile[str_]) == seq_str_count[str_] for str_ in strs):
            print(profile['name'])
            sys.exit(0)

    print("No match")
    sys.exit(1)
def longest_match(sequence, subsequence):
    """Return the length of the longest run of subsequence in sequence.

    A run is a series of back-to-back, non-overlapping copies of
    ``subsequence``; its length is the number of copies.

    Args:
        sequence: The text to search.
        subsequence: The pattern whose consecutive repeats are counted.

    Returns:
        The largest number of consecutive repeats of ``subsequence`` found
        starting at any position of ``sequence``. Returns 0 when the pattern
        never occurs, when it is longer than ``sequence``, or when it is
        empty.
    """
    subsequence_length = len(subsequence)

    # An empty pattern "matches" at every position without ever advancing,
    # which would make the inner loop below spin forever.
    if subsequence_length == 0:
        return 0

    longest_run = 0

    # Offsets past this bound cannot hold even one full copy of the pattern.
    last_start = len(sequence) - subsequence_length

    for i in range(last_start + 1):
        # Count how many copies of the pattern sit back to back from i.
        count = 0
        while True:
            start = i + count * subsequence_length
            end = start + subsequence_length
            if sequence[start:end] != subsequence:
                break
            count += 1

        longest_run = max(longest_run, count)

    return longest_run
# Unguarded module-level call: main() runs whenever this file is executed or imported.
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment