Created
January 15, 2024 15:32
-
-
Save MrHedmad/2fbfa1688f6d086138cc5bfa9f39dc67 to your computer and use it in GitHub Desktop.
Small script to quickly generate synthetic reads from a genome
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from tqdm import tqdm | |
from random import randint, random, choice | |
from pathlib import Path | |
from sys import stdout | |
def rand_bool(chance: float) -> bool: | |
return random() < chance | |
def read_fasta(file: Path) -> str: | |
with file.open("r") as stream: | |
lines = stream.readlines() | |
lines = [line.strip() for line in lines if not line.startswith(">")] | |
return "".join(lines) | |
def sample_genome(genome: str, n: int, loop: bool = True, read_len: int = 100, mutation_rate: float = 0.064): | |
reads = [] | |
genome_len = len(genome) | |
genome = genome + genome[0:read_len+1] | |
# C G G G T T T T C G | |
# 0 1 2 3 4 5 6 7 8 9 10 | |
# if read_len = 5 and start_pos = 8 | |
# 1 2 3 | |
# 0 1 | |
# So start_pos:min(genome_len, start_pos + read_len - 1) | |
# and 0:(read_len - 1 - (genome_len - start_pos + 1)) | |
for i in tqdm(range(n)): | |
hit = False | |
while not hit: | |
start_pos = randint(0, genome_len - 1) | |
if not loop and start_pos >= genome_len - read_len - 1: | |
pass | |
else: | |
hit = True | |
end = start_pos + read_len | |
read = genome[start_pos:end] | |
for i in range(read_len): | |
if rand_bool(mutation_rate): | |
read = list(read) | |
read[i] = choice(["A", "C", "T", "G"]) | |
read = "".join(read) | |
reads.append(read) | |
out = [] | |
for i, read in enumerate(reads): | |
out.append(f"> read_{i}") | |
out.append(read) | |
return "\n".join(out) | |
def main(args): | |
genome = read_fasta(args.genome) | |
sampled = sample_genome(genome, n = args.n, read_len=args.read_length, mutation_rate=args.mutation_rate, loop=args.loop) | |
print(sampled) | |
stdout.write(sampled) | |
if __name__ == "__main__": | |
import argparse | |
parser = argparse.ArgumentParser() | |
parser.add_argument("genome", type=Path, help="Genome to generate reads from") | |
parser.add_argument("n", type=int, help="Number of reads to sample") | |
# Mutation rate from https://www.nature.com/articles/s41598-018-29325-6 | |
parser.add_argument("--mutation-rate", type=float, help="Chance that a base is mutated, between 1 and 0", default=0.064) | |
parser.add_argument("--read-length", type=int, help="Length of each generated read", default=100) | |
parser.add_argument("--loop", action = "store_true", help="Loop to the end of the genome back to the start") | |
args = parser.parse_args() | |
main(args) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment