Skip to content

Instantly share code, notes, and snippets.

@dawnandrew100
Created July 31, 2024 19:26
Show Gist options
  • Save dawnandrew100/ab97bbc100c757c9e7146d50641889ba to your computer and use it in GitHub Desktop.
Save dawnandrew100/ab97bbc100c757c9e7146d50641889ba to your computer and use it in GitHub Desktop.
Counts how many times each group of characters of a specified size appears in a sequence
from collections import defaultdict
# Sample sequence that the script's final print call passes to character_count.
dna: str = "AAACGATGCTAGCATCGGGGCTAGCTACGATTCATCAGCATACGT"
def character_count(count: int, seq: str, order: bool = True) -> dict[str, int]|str:
seqlen = len(seq)
charactermatches: dict[str, int] = defaultdict(int)
if count > seqlen:
return "Count is greater than sequence length! Cannot compute count."
if count == seqlen:
charactermatches[seq] += 1
charactermatches = dict(charactermatches)
return charactermatches
for start, _ in enumerate(seq):
end = start+count
if end > len(seq):
break
key: str = seq[start:end]
#combines matches that contain
#the same letters in different order
if not order:
key = "".join(sorted(key))
charactermatches[key] += 1
charactermatches = dict(charactermatches)
return charactermatches
# Demo: print the tally of every overlapping two-character group in the sample.
print(character_count(count=2, seq=dna))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment