@jamestwebber
Created June 28, 2019 21:08
Filters fastq files for a given list of 20bp guide sequences and outputs a count file
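
A typical invocation passes the guide fasta with -b, one or more fastq[.gz] files with -f, and the output path with -o. The filenames below are placeholders, not part of the gist:

./filter_fastq.py -b guides.fa -f sample1.fastq.gz sample2.fastq.gz -o guide_counts.csv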
#!/usr/bin/env python3

import argparse
import gzip
import itertools
import os
import multiprocessing

from collections import Counter, defaultdict


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        prog='filter_fastq',
        description=('Script to filter a fastq.gz file for 20bp guides'
                     ' and output a count file')
    )

    parser.add_argument('-b', '--guides', required=True,
                        help='fasta file of guide sequences')
    parser.add_argument('-f', '--fastq', required=True, nargs='+',
                        help='fastq[.gz] file(s) of results to quantify')
    parser.add_argument('-o', '--output', required=True,
                        help='file to write guide counts')

    args = parser.parse_args()

    print('Reading guides from {}...'.format(args.guides))

    # read in the guide sequences, assuming a structure of
    # >[gene1].[sg_label]
    # [sequence]
    # >[gene2].[sg_label]
    # etc
    with open(args.guides) as f:
        lines = [line.strip() for line in f]

    labels = [line[1:] for line in lines[::2]]
    sequences = lines[1::2]
    sequence_set = set(sequences)

    print('{} sequences for {} labels\n'.format(len(sequence_set), len(labels)))

    # some sequences map to multiple genes
    seq2lbl = defaultdict(set)
    for seq, lbl in zip(sequences, labels):
        seq2lbl[seq].add(lbl)

    # function to read and count a fastq file
    def read_fastq(fq_fn):
        print('Counting reads from {}...'.format(fq_fn))

        if fq_fn.endswith('.gz'):
            with gzip.open(fq_fn, 'rb') as f:
                seqs = [line[:20].decode() for line in itertools.islice(f, 1, None, 4)]
        else:
            with open(fq_fn) as f:
                seqs = [line[:20] for line in itertools.islice(f, 1, None, 4)]

        # count up the sequences
        seq_counts = Counter(seq for seq in seqs if seq in sequence_set)

        # map to guide labels
        lbl_c = {lbl: seq_counts[seq] for seq in seq_counts for lbl in seq2lbl[seq]}
        total_c = sum(lbl_c.values())

        # summary statistics
        res = [('Total reads', len(seqs)),
               ('Guide reads', total_c),
               ('% Guide reads', '{:.1f}%'.format(100 * total_c / len(seqs)))]

        return lbl_c, total_c, res

    # use multiprocessing to read in multiple fastq files in parallel
    pool = multiprocessing.Pool(processes=min(multiprocessing.cpu_count(),
                                              len(args.fastq)))
    try:
        lbl_counts, total_counts, results = map(
            lambda s: dict(zip(args.fastq, s)),
            zip(*pool.map(read_fastq, args.fastq))
        )
    finally:
        pool.close()
        pool.join()

    # print summary stats to the console
    for fastq_fn in args.fastq:
        print('\nResults for {}:\n\t{}'.format(
            fastq_fn, '\n\t'.join('{:16}:{:>12}'.format(n, v)
                                  for n, v in results[fastq_fn])
        ))

    header = ['sgRNA', 'gene']
    header.extend(os.path.basename(fn) for fn in args.fastq)
    header.extend('normalized {}'.format(os.path.basename(fn)) for fn in args.fastq)

    # write total results to file
    with open(args.output, 'w') as OUT:
        print(','.join(header), file=OUT)

        for lbl in sorted(labels):
            c = ','.join('{:d}'.format(lbl_counts[fastq_fn].get(lbl, 0))
                         for fastq_fn in args.fastq)
            n = ','.join('{:f}'.format(lbl_counts[fastq_fn].get(lbl, 0)
                                       / total_counts[fastq_fn])
                         for fastq_fn in args.fastq)

            print(','.join((lbl, lbl.split('.')[0], c, n)), file=OUT)
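
The output CSV has one row per guide label: the sgRNA label, the gene (the part of the label before the first '.'), a raw count column per fastq file, and a normalized column per fastq file (count divided by that file's total guide reads). With a single hypothetical input sample1.fastq.gz and a hypothetical guide GENE1.sg1, a row would look like the following (numbers are made up purely to show the layout):

sgRNA,gene,sample1.fastq.gz,normalized sample1.fastq.gz
GENE1.sg1,GENE1,1523,0.012345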