Skip to content

Instantly share code, notes, and snippets.

@standage
Created July 31, 2018 20:14
Show Gist options
  • Save standage/d977864dcac74ee663ad677f1d3f9575 to your computer and use it in GitHub Desktop.
Save standage/d977864dcac74ee663ad677f1d3f9575 to your computer and use it in GitHub Desktop.
Simple kevlar utility for reservoir sampling of partitioned reads. The sampling function might be of general interest.
#!/usr/bin/env python
import argparse
import kevlar
import random
import sys
def resv_samp(objstream, n=100, stopafter=None, filterfunc=None):
    """Draw a uniform random sample of up to `n` objects from a stream.

    Implements single-pass reservoir sampling: the stream is consumed once
    and at most `n` objects are held in memory at any time.

    Args:
        objstream: Any iterable of objects to sample from.
        n: Maximum size of the sample (reservoir). With `n=0` the stream is
            still consumed but an empty list is returned.
        stopafter: If truthy, stop once this many objects have been read
            from the stream, whether or not they passed the filter. `None`
            or `0` means the whole stream is read.
        filterfunc: Optional callable; objects for which it returns a truthy
            value are discarded and never considered for the sample.

    Returns:
        A list of at most `n` objects. If fewer than `n` objects pass the
        filter, all of them are returned in stream order.
    """
    sample = []
    kept = 0  # objects that passed the filter so far
    for seen, obj in enumerate(objstream, 1):
        if not (filterfunc and filterfunc(obj)):
            kept += 1
            if len(sample) < n:
                # Reservoir not yet full: take the object unconditionally.
                sample.append(obj)
            else:
                # The k-th accepted object must enter the reservoir with
                # probability n/k, so draw from the count of *accepted*
                # objects rather than from the raw stream position.
                slot = random.randrange(kept)
                if slot < n:
                    sample[slot] = obj
        # Checked on every iteration so that filtered objects and the
        # initial fill phase also honour the limit.
        if stopafter and seen >= stopafter:
            break
    return sample
# Command line: path to the partitioned reads, then path for the sampled output.
parser = argparse.ArgumentParser()
for positional in ('partreads', 'outfile'):
    parser.add_argument(positional)
args = parser.parse_args()


def is_oversized(partition):
    """Return True for partitions holding more than 1000 items (discarded)."""
    return len(partition) > 1000


# Build the reader chain. NOTE(review): presumably `parse_partitioned_reads`
# groups the parsed reads into one iterable per partition -- confirm against
# the kevlar documentation.
instream = kevlar.open(args.partreads, 'r')
reader = kevlar.parse_augmented_fastx(instream)
preader = kevlar.parse_partitioned_reads(reader)
outstream = kevlar.open(args.outfile, 'w')

# Sample partitions (default reservoir size), then write every read of every
# sampled partition to the output.
sampled = resv_samp(preader, filterfunc=is_oversized)
for read in (r for partition in sampled for r in partition):
    kevlar.print_augmented_fastx(read, outstream)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment