Skip to content

Instantly share code, notes, and snippets.

@MrHedmad
Created January 15, 2024 15:32
Show Gist options
  • Save MrHedmad/2fbfa1688f6d086138cc5bfa9f39dc67 to your computer and use it in GitHub Desktop.
Save MrHedmad/2fbfa1688f6d086138cc5bfa9f39dc67 to your computer and use it in GitHub Desktop.
Small script to quickly generate synthetic reads from a genome
from tqdm import tqdm
from random import randint, random, choice
from pathlib import Path
from sys import stdout
def rand_bool(chance: float) -> bool:
return random() < chance
def read_fasta(file: Path) -> str:
with file.open("r") as stream:
lines = stream.readlines()
lines = [line.strip() for line in lines if not line.startswith(">")]
return "".join(lines)
def sample_genome(genome: str, n: int, loop: bool = True, read_len: int = 100, mutation_rate: float = 0.064):
reads = []
genome_len = len(genome)
genome = genome + genome[0:read_len+1]
# C G G G T T T T C G
# 0 1 2 3 4 5 6 7 8 9 10
# if read_len = 5 and start_pos = 8
# 1 2 3
# 0 1
# So start_pos:min(genome_len, start_pos + read_len - 1)
# and 0:(read_len - 1 - (genome_len - start_pos + 1))
for i in tqdm(range(n)):
hit = False
while not hit:
start_pos = randint(0, genome_len - 1)
if not loop and start_pos >= genome_len - read_len - 1:
pass
else:
hit = True
end = start_pos + read_len
read = genome[start_pos:end]
for i in range(read_len):
if rand_bool(mutation_rate):
read = list(read)
read[i] = choice(["A", "C", "T", "G"])
read = "".join(read)
reads.append(read)
out = []
for i, read in enumerate(reads):
out.append(f"> read_{i}")
out.append(read)
return "\n".join(out)
def main(args):
genome = read_fasta(args.genome)
sampled = sample_genome(genome, n = args.n, read_len=args.read_length, mutation_rate=args.mutation_rate, loop=args.loop)
print(sampled)
stdout.write(sampled)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("genome", type=Path, help="Genome to generate reads from")
parser.add_argument("n", type=int, help="Number of reads to sample")
# Mutation rate from https://www.nature.com/articles/s41598-018-29325-6
parser.add_argument("--mutation-rate", type=float, help="Chance that a base is mutated, between 1 and 0", default=0.064)
parser.add_argument("--read-length", type=int, help="Length of each generated read", default=100)
parser.add_argument("--loop", action = "store_true", help="Loop to the end of the genome back to the start")
args = parser.parse_args()
main(args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment