JAS-Norway/better_genetics.py

## better_genetics.py
from time import time
# Averages around 0.9 seconds.
# Updated version of the program that applies the tips I received on Reddit.

def count_overlap(path1: str, path2: str) -> int:
    """
    Count the DNA strings that occur in both files.

    Both files are loaded with read() and turned into sets with
    list_to_set(); the answer is the size of the intersection of those
    two sets. The files are expected to hold no duplicate strings
    within themselves.
    """
    lines_a, lines_b = read(path1), read(path2)
    return len(list_to_set(lines_a).intersection(list_to_set(lines_b)))


def read(path: str) -> list[str]:
    """
    Return the lines of the UTF-8 text file at ``path``, without newlines.

    The line terminator is removed from every line so that the last line
    of a file that lacks a trailing newline compares equal to the same
    string found elsewhere (readlines() would give "TATT" in one place
    and "TATT\\n" in another).
    """
    with open(path, "r", encoding="utf-8") as reader:
        return [line.rstrip("\n") for line in reader]


def list_to_set(lst: list[str]) -> set[str]:
    """Return a new set containing every distinct string in ``lst``."""
    # The built-in constructor does the whole conversion in one call.
    return set(lst)


def main():
    """Run the overlap self-test and print the elapsed time in seconds."""
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time() can jump if the system clock is adjusted.
    from time import perf_counter

    start = perf_counter()
    test_count_overlap_sample()
    end = perf_counter()
    print(end - start)  # Averages around 0.9 seconds.


def test_count_overlap_sample():
    """
    Check count_overlap() against the sample files and the id files.

    Prints a progress label first and 'OK' once both assertions hold.
    """
    print('Tests count_overlap... ', end='')
    sample_overlap = count_overlap('sample1.txt', 'sample2.txt')
    assert sample_overlap == 2
    # Efficiency check: a slow solution takes a long time on these inputs.
    large_overlap = count_overlap('id1.txt', 'id2.txt')
    assert large_overlap == 100001
    print('OK')


# Run the timed self-test only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

## genetics.py
from time import time
# Averages around 1.2 seconds.

def count_overlap(path1: str, path2: str) -> int:
    """
    Count the DNA strings that occur in both files.

    The lines of the first file are indexed with list_to_dict(); every
    line of the second file that is a key of that mapping adds one to
    the result. The files are expected to hold no duplicate strings
    within themselves.
    """
    known = list_to_dict(read(path1))
    return sum(1 for candidate in read(path2) if candidate in known)


def read(path: str) -> list[str]:
    """
    Return the lines of the UTF-8 text file at ``path``, without newlines.

    The line terminator is removed from every line so that the last line
    of a file that lacks a trailing newline compares equal to the same
    string found elsewhere (readlines() would give "TATT" in one place
    and "TATT\\n" in another).
    """
    with open(path, "r", encoding="utf-8") as reader:
        return [line.rstrip("\n") for line in reader]


def list_to_dict(lst: list[str]) -> dict[str, int]:
    """
    Map each string in ``lst`` to its index in the list.

    If a string occurs more than once, the index of its last occurrence
    is kept.
    """
    # A comprehension avoids building a one-item dict per element just to
    # pass it to update().
    return {value: index for index, value in enumerate(lst)}


def main():
    """Run the overlap self-test and print the elapsed time in seconds."""
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time() can jump if the system clock is adjusted.
    from time import perf_counter

    start = perf_counter()
    test_count_overlap_sample()
    end = perf_counter()
    print(end - start)  # Averages around 1.2 seconds.


def test_count_overlap_sample():
    """
    Check count_overlap() against the sample files and the id files.

    Prints a progress label first and 'OK' once both assertions hold.
    """
    print('Tests count_overlap... ', end='')
    sample_overlap = count_overlap('sample1.txt', 'sample2.txt')
    assert sample_overlap == 2

    # Efficiency check: a slow solution takes a long time on these inputs.
    large_overlap = count_overlap('id1.txt', 'id2.txt')
    assert large_overlap == 100001
    print('OK')


# Run the timed self-test only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

## genetics2.py
from time import time
# Averages around 1.75 seconds.

def count_overlap(path1: str, path2: str) -> int:
    """
    Count the DNA strings that occur in both files.

    Each file is loaded with read() into a mapping keyed by DNA string;
    the result is the number of keys the two mappings have in common.
    The files are expected to hold no duplicate strings within
    themselves.
    """
    first = read(path1)
    second = read(path2)
    # |A| + |B| - |A union B| equals |A intersect B|, so intersect directly.
    return len(first.keys() & second.keys())


def read(path: str) -> dict[str, int]:
    """
    Load the UTF-8 text file at ``path`` and return its lines as a map.

    The raw lines are handed to list_to_dict(), which builds the mapping.
    """
    with open(path, encoding="utf-8") as handle:
        raw_lines = list(handle)
    return list_to_dict(raw_lines)


def list_to_dict(lst: list[str]) -> dict[str, int]:
    """
    Map each whitespace-stripped string in ``lst`` to its index in the list.

    Surrounding whitespace (including the trailing newline) is removed
    from each string before it is used as a key. If two entries strip to
    the same key, the index of the last one is kept.
    """
    # A comprehension avoids building a one-item dict per element just to
    # pass it to update().
    return {value.strip(): index for index, value in enumerate(lst)}


def main():
    """Run the overlap self-test and print the elapsed time in seconds."""
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time() can jump if the system clock is adjusted.
    from time import perf_counter

    start = perf_counter()
    test_count_overlap_sample()
    end = perf_counter()
    print(end - start)  # Averages around 1.75 seconds.


def test_count_overlap_sample():
    """
    Check count_overlap() against the sample files and the id files.

    Prints a progress label first and 'OK' once both assertions hold.
    """
    print('Tests count_overlap... ', end='')
    sample_overlap = count_overlap('sample1.txt', 'sample2.txt')
    assert sample_overlap == 2

    # Efficiency check: a slow solution takes a long time on these inputs.
    large_overlap = count_overlap('id1.txt', 'id2.txt')
    assert large_overlap == 100001
    print('OK')


# Run the timed self-test only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

## sample1.txt
TATT
GAGA
GAGG

## sample2.txt
AGGA
CATT
GAGA
TATT
	from time import time
	# Averages around 0.9 seconds.
	# Updated program where I tried to use the tips reddit gave me.

	def count_overlap(path1: str, path2: str) -> int:
	    """
	    Count the DNA strings that occur in both files.

	    Both files are loaded with read() and turned into sets with
	    list_to_set(); the answer is the size of the intersection of those
	    two sets. The files are expected to hold no duplicate strings
	    within themselves.
	    """
	    content1: list[str] = read(path1)
	    content2: list[str] = read(path2)
	    content1_set: set[str] = list_to_set(content1)
	    content2_set: set[str] = list_to_set(content2)
	    return len(content1_set & content2_set)


	def read(path: str) -> list[str]:
	    """
	    Return the lines of the UTF-8 text file at ``path``, without newlines.

	    The line terminator is removed from every line so that the last line
	    of a file that lacks a trailing newline compares equal to the same
	    string found elsewhere.
	    """
	    with open(path, "r", encoding="utf-8") as reader:
	        return [line.rstrip("\n") for line in reader]


	def list_to_set(lst: list[str]) -> set[str]:
	    """Return a new set containing every distinct string in ``lst``."""
	    # The built-in constructor does the whole conversion in one call.
	    return set(lst)


	def main():
	    """Run the overlap self-test and print the elapsed time in seconds."""
	    # perf_counter() is a monotonic, high-resolution clock intended for
	    # measuring intervals; time() can jump if the system clock is adjusted.
	    from time import perf_counter
	    start = perf_counter()
	    test_count_overlap_sample()
	    end = perf_counter()
	    print(end - start)  # Averages around 0.9 seconds.


	def test_count_overlap_sample():
	    """
	    Check count_overlap() against the sample files and the id files.

	    Prints a progress label first and 'OK' once both assertions hold.
	    """
	    print('Tests count_overlap... ', end='')
	    assert 2 == count_overlap('sample1.txt', 'sample2.txt')
	    # Tests efficiency (the test will take a long time with the wrong solution):
	    assert 100001 == count_overlap('id1.txt', 'id2.txt')
	    print('OK')


	# Run the timed self-test only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()
	from time import time
	# Averages around 1.2 seconds.

	def count_overlap(path1: str, path2: str) -> int:
	    """
	    Count the DNA strings that occur in both files.

	    The lines of the first file are indexed with list_to_dict(); every
	    line of the second file that is a key of that mapping adds one to
	    the result. The files are expected to hold no duplicate strings
	    within themselves.
	    """
	    dna_count: int = 0
	    content1: list[str] = read(path1)
	    content1_map: dict[str, int] = list_to_dict(content1)
	    content2: list[str] = read(path2)
	    for line in content2:
	        if line in content1_map:
	            dna_count += 1
	    return dna_count


	def read(path: str) -> list[str]:
	    """
	    Return the lines of the UTF-8 text file at ``path``, without newlines.

	    The line terminator is removed from every line so that the last line
	    of a file that lacks a trailing newline compares equal to the same
	    string found elsewhere.
	    """
	    with open(path, "r", encoding="utf-8") as reader:
	        return [line.rstrip("\n") for line in reader]


	def list_to_dict(lst: list[str]) -> dict[str, int]:
	    """
	    Map each string in ``lst`` to its index in the list.

	    If a string occurs more than once, the index of its last occurrence
	    is kept.
	    """
	    # A comprehension avoids building a one-item dict per element just to
	    # pass it to update().
	    return {value: index for index, value in enumerate(lst)}


	def main():
	    """Run the overlap self-test and print the elapsed time in seconds."""
	    # perf_counter() is a monotonic, high-resolution clock intended for
	    # measuring intervals; time() can jump if the system clock is adjusted.
	    from time import perf_counter
	    start = perf_counter()
	    test_count_overlap_sample()
	    end = perf_counter()
	    print(end - start)  # Averages around 1.2 seconds.


	def test_count_overlap_sample():
	    """
	    Check count_overlap() against the sample files and the id files.

	    Prints a progress label first and 'OK' once both assertions hold.
	    """
	    print('Tests count_overlap... ', end='')
	    assert 2 == count_overlap('sample1.txt', 'sample2.txt')

	    # Tests efficiency (the test will take a long time with the wrong solution):
	    assert 100001 == count_overlap('id1.txt', 'id2.txt')
	    print('OK')


	# Run the timed self-test only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()
	from time import time
	# Averages around 1.75 seconds.

	def count_overlap(path1: str, path2: str) -> int:
	    """
	    Count the DNA strings that occur in both files.

	    Each file is loaded with read() into a mapping keyed by DNA string.
	    Merging the second mapping into the first collapses shared keys, so
	    the overlap is the combined size minus the size of the merged map.
	    The files are expected to hold no duplicate strings within
	    themselves.
	    """
	    content1: dict[str, int] = read(path1)
	    content2: dict[str, int] = read(path2)
	    total_len: int = len(content1) + len(content2)
	    content1.update(content2)
	    dna_count = total_len - len(content1)
	    return dna_count


	def read(path: str) -> dict[str, int]:
	    """
	    Load the UTF-8 text file at ``path`` and return its lines as a map.

	    The raw lines are handed to list_to_dict(), which builds the mapping.
	    """
	    with open(path, "r", encoding="utf-8") as reader:
	        content: list[str] = reader.readlines()
	    return list_to_dict(content)


	def list_to_dict(lst: list[str]) -> dict[str, int]:
	    """
	    Map each whitespace-stripped string in ``lst`` to its index in the list.

	    Surrounding whitespace (including the trailing newline) is removed
	    from each string before it is used as a key. If two entries strip to
	    the same key, the index of the last one is kept.
	    """
	    # A comprehension avoids building a one-item dict per element just to
	    # pass it to update().
	    return {value.strip(): index for index, value in enumerate(lst)}


	def main():
	    """Run the overlap self-test and print the elapsed time in seconds."""
	    # perf_counter() is a monotonic, high-resolution clock intended for
	    # measuring intervals; time() can jump if the system clock is adjusted.
	    from time import perf_counter
	    start = perf_counter()
	    test_count_overlap_sample()
	    end = perf_counter()
	    print(end - start)  # Averages around 1.75 seconds.


	def test_count_overlap_sample():
	    """
	    Check count_overlap() against the sample files and the id files.

	    Prints a progress label first and 'OK' once both assertions hold.
	    """
	    print('Tests count_overlap... ', end='')
	    assert 2 == count_overlap('sample1.txt', 'sample2.txt')

	    # Tests efficiency (the test will take a long time with the wrong solution):
	    assert 100001 == count_overlap('id1.txt', 'id2.txt')
	    print('OK')


	# Run the timed self-test only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()