Skip to content

Instantly share code, notes, and snippets.

@RamRS RamRS/biostars_16336.pl Secret
Created Sep 11, 2018

Embed
What would you like to do?
biostars_16336
use threads;
use Thread::Queue;
# use YourUtils qw/logmsg getNumCPUs/;
my $seqs=qualityTrimFastq("raw.fastq",{numcpus=>4,min_avg_quality=>20});
for(@$seqs){
print $_; # => an entire 8-line entry ($id\$seq\n+\n$qual\n, for each of the two reads in the pair)
}
# reads a shuffled paired end fastq and trims it
# and cleans it
# Author: Lee Katz <lkatz@cdc.gov>
sub qualityTrimFastq($;$){
my($fastq,$settings)=@_;
$$settings{numcpus}||=LKUtils::getNumCPUs();
# trimming
$$settings{min_quality}||=30;
$$settings{bases_to_trim}||=10; # max number of bases that will be trimmed on either side
# cleaning
$$settings{min_avg_quality}||=20;
$$settings{min_avg_quality_window}||=70;
$$settings{min_length}||=62; # twice hash length sounds good
my (@t,$t);
my $Q=Thread::Queue->new;
for(0..$$settings{numcpus}-1){
$t[$t++]=threads->create(sub{
logmsg "Launching thread TID".threads->tid;
my(@entryOut);
while(defined(my $entry=$Q->dequeue)){
my($id1,$seq1,undef,$qual1,$id2,$seq2,undef,$qual2)=split(/\s*\n\s*/,$entry);
my %read1=(id=>$id1,seq=>$seq1,qual=>$qual1,length=>length($seq1));
my %read2=(id=>$id2,seq=>$seq2,qual=>$qual2,length=>length($seq2));
# quality trim: read 1, 3' read
my($numToTrim3,$numToTrim5,@qual);
$numToTrim3=$numToTrim5=0;
@qual=map(ord($_)-33,split(//,$read1{qual}));
for(my $i=0;$i<$$settings{bases_to_trim};$i++){
$numToTrim3++ if($qual[$i]<$$settings{min_quality});
last if($qual[$i]>=$$settings{min_quality});
}
$read1{seq}=substr($read1{seq},$numToTrim3,$read1{length}-$numToTrim3-$numToTrim5);
$read1{qual}=substr($read1{qual},$numToTrim3,$read1{length}-$numToTrim3-$numToTrim5);
# quality trim: read 1, 5' read
$numToTrim3=$numToTrim5=0;
@qual=map(ord($_)-33,split(//,$read2{qual}));
for(my $i=0;$i<$$settings{bases_to_trim};$i++){
$numToTrim3++ if($qual[$i]<$$settings{min_quality});
last if($qual[$i]>=$$settings{min_quality});
}
for(my $i=$read2{length}-$$settings{bases_to_trim};$i<@qual;$i++){
$numToTrim5++ if($qual[$i]<$$settings{min_quality});
last if($qual[$i]>=$$settings{min_quality});
}
$read2{seq}=substr($read2{seq},$numToTrim3,$read2{length}-$numToTrim3-$numToTrim5);
$read2{qual}=substr($read2{qual},$numToTrim3,$read2{length}-$numToTrim3-$numToTrim5);
#cleaning stage
# TODO keep singletons
next if(length($read1{seq})<$$settings{min_length});
next if(length($read2{seq})<$$settings{min_length});
# avg quality
my $qual_is_good=1;
QUALITY_CHECK:
for my $qualStr ($read1{qual},$read2{qual}){
my $length=length($qualStr)-$$settings{min_avg_quality_window};;
my @qual=map(ord($_)-33,split(//,$qualStr));
for(my $i=0;$i<$length;$i++){
my @subQual=@qual[$i..($$settings{min_avg_quality_window}+$i-1)];
my $avgQual=sum(@subQual)/$$settings{min_avg_quality_window};
if($avgQual<$$settings{min_avg_quality}){
#print "$i: ".join(".",@subQual)."\n$avgQual <=> $$settings{min_avg_quality}\n";die;
$qual_is_good=0;
last QUALITY_CHECK;
}
}
}
next if(!$qual_is_good);
my $entryOut="$read1{id}\n$read1{seq}\n+\n$read1{qual}\n$read2{id}\n$read2{seq}\n+\n$read2{qual}\n";
push(@entryOut,$entry);
}
return \@entryOut;
});
}
my $entryCount=0;
open(FQ,'<',$fastq) or die "Could not open $fastq for reading: $!";
while(my $entry=<FQ>){
$entry.=<FQ> for(1..7); # 8 lines total for paired end entry
$Q->enqueue($entry);
$entryCount++;
logmsg "Finished loading $entryCount pairs" if($entryCount%100000==0);
#if($entryCount>99999){
# logmsg "DEBUG: only $entryCount loaded";
# last;
#}
}
close FQ;
$Q->enqueue(undef) for(1..$$settings{numcpus});
# status update
while(my $p=$Q->pending){
logmsg "$p entries left to process";
sleep 10;
}
my @entry;
for(@t){
my $tmp=$_->join;
push(@entry,@$tmp);
}
my $numCleaned=$entryCount-@entry;
logmsg "$numCleaned entries out of $entryCount removed due to low quality";
return \@entry;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.