Skip to content

Instantly share code, notes, and snippets.

@audy
Last active July 31, 2019 23:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save audy/aaa91c7bc3fddf2c7e60667b693c5717 to your computer and use it in GitHub Desktop.
Save audy/aaa91c7bc3fddf2c7e60667b693c5717 to your computer and use it in GitHub Desktop.
Argh! Downsample ye treasures, matey!
#!/usr/bin/env python3
import argparse
import random
import sys
import xopen
def iter_fastq(handle):
handle = iter(handle) # wow
while True:
try:
h1 = next(handle).strip()
seq = next(handle).strip()
h2 = next(handle).strip()
qual = next(handle).strip()
assert h1.startswith("@")
assert h2.startswith("+")
yield (h1, seq, h2, qual)
except StopIteration:
break
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--r1-in", required=True)
parser.add_argument("--r2-in", required=True)
parser.add_argument("--r1-out", required=True)
parser.add_argument("--r2-out", required=True)
parser.add_argument("--output", default="/dev/stdout")
parser.add_argument(
"--target-read-count",
required=True,
type=int,
help="number of reads to downsample to. If the total number of reads is less, script will exit 0 without producing all reads",
)
return parser.parse_args()
def main():
args = parse_args()
total_pairs = 0
# assuming that R1 and R2 have the same number of reads :)
with xopen.xopen(args.r1_in, "rt") as handle:
records = iter_fastq(handle)
for record in records:
total_pairs += 1
total_pairs_written = 0
print("total input pairs: {}".format(total_pairs))
# 1 / p
p_inv = total_pairs // args.target_read_count
with xopen.xopen(args.r1_in, "rt") as r1_handle, xopen.xopen(
args.r2_in, "rt"
) as r2_handle, xopen.xopen(args.r1_out, "wt") as r1_out, xopen.xopen(
args.r2_out, "wt"
) as r2_out:
r1_records, r2_records = iter_fastq(r1_handle), iter_fastq(r2_handle)
for n, (r1, r2) in enumerate(zip(r1_records, r2_records)):
if random.randint(0, p_inv) == 0:
r1_out.write("{}\n{}\n{}\n{}\n".format(*r1))
r2_out.write("{}\n{}\n{}\n{}\n".format(*r2))
total_pairs_written += 1
print("target read count: {}".format(args.target_read_count), file=sys.stderr)
print("total pairs: {}".format(total_pairs), file=sys.stderr)
print("downsampled pair count: {}".format(total_pairs_written), file=sys.stderr)
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment