Last active
August 29, 2015 14:04
-
-
Save marcelcaraciolo/51e641816b3cccf06e02 to your computer and use it in GitHub Desktop.
example command
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def can_pipe(command, fastq_file): | |
''' | |
bwa-mem handles longer (> 70bp) reads with improved piping. | |
Randomly samples 5000 reads from the first two million. | |
Default to no piping if more than 75% of the sampled reads are small. | |
''' | |
min_size = 70 | |
thresh = 0.75 | |
head_count = 8000000 | |
tocheck = 5000 | |
cat_cmd = 'cat {fastq_file}' | |
cmd = (cat_cmd + " | head -n {head_count} | " | |
"{command} sample -s42 - {tocheck} | " | |
"awk '{{if(NR%4==2) print length($1)}}' | sort | uniq -c") | |
count_out = subprocess.check_output(cmd.format(**locals()), shell=True, | |
executable="/bin/bash", stderr=open("/dev/null", "w")) | |
if not count_out.strip(): | |
raise IOError("Failed to check fastq file sizes with: %s" % cmd.format(**locals())) | |
shorter = 0 | |
for count, size in (l.strip().split() for l in count_out.strip().split("\n")): | |
if int(size) < min_size: | |
shorter += int(count) | |
return (float(shorter) / float(tocheck)) <= thresh |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment