Skip to content

Instantly share code, notes, and snippets.

@JAS-Norway
Last active October 28, 2024 14:33
Show Gist options
  • Save JAS-Norway/5abb1b7826ffb20141f1cbf76da50913 to your computer and use it in GitHub Desktop.
Save JAS-Norway/5abb1b7826ffb20141f1cbf76da50913 to your computer and use it in GitHub Desktop.
Please give me all the feedback you can think of.
from time import time
# Averages around 0.9 seconds.
# Updated program where I tried to use the tips reddit gave me.
def count_overlap(path1: str, path2: str) -> int:
    """
    Count the DNA strings that occur in both input files.

    Each file holds a number of DNA strings, and a given string is
    guaranteed to appear at most once per file. Both files are loaded with
    ``read``, turned into sets with ``list_to_set``, and the size of the
    intersection of those sets is returned. Lines are compared exactly as
    ``read`` returns them.
    """
    first_strings: set[str] = list_to_set(read(path1))
    second_strings: set[str] = list_to_set(read(path2))
    shared: set[str] = first_strings.intersection(second_strings)
    return len(shared)
def read(path: str) -> list[str]:
    """
    Read a UTF-8 text file and return its non-blank lines, stripped.

    Leading and trailing whitespace, including the line terminator, is
    removed from every line. Without this, the final line of a file that
    does not end in a newline (``"TATT"``) would never compare equal to the
    same string read from the middle of another file (``"TATT\\n"``).
    Blank lines are dropped so they are not counted as a DNA string.

    Any exception raised by ``open`` (for example ``FileNotFoundError``)
    propagates to the caller.
    """
    with open(path, "r", encoding="utf-8") as reader:
        stripped_lines = (line.strip() for line in reader)
        return [line for line in stripped_lines if line]
def list_to_set(lst: list[str]) -> set[str]:
    """
    Return a new set containing every distinct string in ``lst``.

    The built-in ``set`` constructor does the same work as adding the items
    one by one in a loop, so it is used directly. The input list is not
    modified.
    """
    return set(lst)
def main() -> None:
    """
    Run ``test_count_overlap_sample`` and print how long it took, in seconds.
    """
    # perf_counter() is the clock intended for measuring short durations;
    # time() follows the system wall clock, which can be adjusted while the
    # program runs and would then distort the measurement.
    from time import perf_counter

    start = perf_counter()
    test_count_overlap_sample()
    elapsed = perf_counter() - start
    print(elapsed)  # Averages around 0.9 seconds.
def test_count_overlap_sample():
    """
    Check ``count_overlap`` on the small sample files and the large id files.

    Prints a progress message before the checks and ``OK`` once both
    assertions have passed.
    """
    print('Tests count_overlap... ', end='')
    sample_overlap = count_overlap('sample1.txt', 'sample2.txt')
    assert sample_overlap == 2
    # Efficiency check: with the wrong solution this call takes a long time.
    large_overlap = count_overlap('id1.txt', 'id2.txt')
    assert large_overlap == 100001
    print('OK')
# Run the timed test only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
from time import time
# Averages around 1.2 seconds.
def count_overlap(path1: str, path2: str) -> int:
    """
    Count the DNA strings that occur in both input files.

    Each file holds a number of DNA strings, and a given string is
    guaranteed to appear at most once per file. The lines of the first file
    are loaded into a dict (via ``list_to_dict``) so each membership check
    is a hash lookup; every line of the second file found among those keys
    adds one to the result.
    """
    known_strings: dict[str, int] = list_to_dict(read(path1))
    candidates: list[str] = read(path2)
    return sum(1 for candidate in candidates if candidate in known_strings)
def read(path: str) -> list[str]:
    """
    Read a UTF-8 text file and return its non-blank lines, stripped.

    Leading and trailing whitespace, including the line terminator, is
    removed from every line. Without this, the final line of a file that
    does not end in a newline (``"TATT"``) would never compare equal to the
    same string read from the middle of another file (``"TATT\\n"``).
    Blank lines are dropped so they are not counted as a DNA string.

    Any exception raised by ``open`` (for example ``FileNotFoundError``)
    propagates to the caller.
    """
    with open(path, "r", encoding="utf-8") as reader:
        stripped_lines = (line.strip() for line in reader)
        return [line for line in stripped_lines if line]
def list_to_dict(lst: list[str]) -> dict[str, int]:
    """
    Map every string in ``lst`` to its index in the list.

    If a string occurs more than once, the index of its last occurrence is
    kept. A dict comprehension builds the mapping directly instead of
    creating a temporary one-item dict for each element.
    """
    return {value: index for index, value in enumerate(lst)}
def main() -> None:
    """
    Run ``test_count_overlap_sample`` and print how long it took, in seconds.
    """
    # perf_counter() is the clock intended for measuring short durations;
    # time() follows the system wall clock, which can be adjusted while the
    # program runs and would then distort the measurement.
    from time import perf_counter

    start = perf_counter()
    test_count_overlap_sample()
    elapsed = perf_counter() - start
    print(elapsed)  # Averages around 1.2 seconds.
def test_count_overlap_sample():
    """
    Check ``count_overlap`` on the small sample files and the large id files.

    Prints a progress message before the checks and ``OK`` once both
    assertions have passed.
    """
    print('Tests count_overlap... ', end='')
    sample_overlap = count_overlap('sample1.txt', 'sample2.txt')
    assert sample_overlap == 2
    # Efficiency check: with the wrong solution this call takes a long time.
    large_overlap = count_overlap('id1.txt', 'id2.txt')
    assert large_overlap == 100001
    print('OK')
# Run the timed test only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
from time import time
# Averages around 1.75 seconds.
def count_overlap(path1: str, path2: str) -> int:
    """
    Count the DNA strings that occur in both input files.

    Each file holds a number of DNA strings, and a given string is
    guaranteed to appear at most once per file. ``read`` returns each file
    as a dict keyed by DNA string; the number of keys the two dicts have in
    common is returned as an int.
    """
    first_map: dict[str, int] = read(path1)
    second_map: dict[str, int] = read(path2)
    # Dict key views support set operations, so the shared keys can be taken
    # directly without merging the two dicts.
    shared_keys = first_map.keys() & second_map.keys()
    return len(shared_keys)
def read(path: str) -> dict[str, int]:
    """
    Load a UTF-8 text file and return its lines as a map.

    The lines are passed to ``list_to_dict``, which produces the returned
    dict.
    """
    with open(path, "r", encoding="utf-8") as reader:
        lines: list[str] = list(reader)
    return list_to_dict(lines)
def list_to_dict(lst: list[str]) -> dict[str, int]:
    """
    Map every string in ``lst``, stripped of surrounding whitespace, to its index.

    Stripping removes the line terminator, so a final line without a
    trailing newline gets the same key as the same string elsewhere. If two
    entries are equal after stripping, the index of the last one is kept.
    A dict comprehension builds the mapping directly instead of creating a
    temporary one-item dict for each element.
    """
    return {value.strip(): index for index, value in enumerate(lst)}
def main() -> None:
    """
    Run ``test_count_overlap_sample`` and print how long it took, in seconds.
    """
    # perf_counter() is the clock intended for measuring short durations;
    # time() follows the system wall clock, which can be adjusted while the
    # program runs and would then distort the measurement.
    from time import perf_counter

    start = perf_counter()
    test_count_overlap_sample()
    elapsed = perf_counter() - start
    print(elapsed)  # Averages around 1.75 seconds.
def test_count_overlap_sample():
    """
    Check ``count_overlap`` on the small sample files and the large id files.

    Prints a progress message before the checks and ``OK`` once both
    assertions have passed.
    """
    print('Tests count_overlap... ', end='')
    sample_overlap = count_overlap('sample1.txt', 'sample2.txt')
    assert sample_overlap == 2
    # Efficiency check: with the wrong solution this call takes a long time.
    large_overlap = count_overlap('id1.txt', 'id2.txt')
    assert large_overlap == 100001
    print('OK')
# Run the timed test only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
TATT
GAGA
GAGG
AGGA
CATT
GAGA
TATT
@JAS-Norway
Copy link
Author

JAS-Norway commented Oct 27, 2024

These are my 2 solutions to the same problem.
Please give me all the feedback you can think of, like:

  • Bad naming
  • Bad functions
  • Hard to understand
  • Missing important coding philosophies and principles

I don't mind harsh feedback! I want to learn, and to learn you must be humble and able to listen to people better than you. Thank you so much if you decide to help! <3

If you guys want to run the scripts for yourself, you will need the id1.txt and id2.txt files. They are kind of big, so I didn't want to attach them here.
They can be found here in my repo if interested!
https://github.com/JAS-Norway/reddit-feeback-genetics.py-

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment