Last active
December 17, 2015 06:39
-
-
Save leabenedicte/5566826 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl -w | |
use strict; | |
use Getopt::Long; | |
use Data::Dumper; | |
# Lea Benedicte Skov Hansen, Apr May 2013 | |
# Variables that should eventually be defined by user!! | |
my $input; | |
my $output; | |
my $gzip; | |
my $starttrim; | |
my $endtrim; | |
my $phred = ''; | |
my $min_qual; | |
my $N_max = 9999; | |
my $min_mean_qual = 0; | |
my $min_length = 0; | |
GetOptions("i=s" => \$input, # Input filename | |
"o=s" => \$output, # Output filename | |
"z!" => \$gzip, # Gzip output data | |
"s=i" => \$starttrim, # Removes X nucleotiden from the 5' end | |
"e=i" => \$endtrim, # Removed X nucleotides from the 3' end | |
"p=s" => \$phred, # Phred score +33 or +64 (phred33 or phred64) | |
"q=i" => \$min_qual, # Removing bases from the 3' end with scores lower than user input | |
"n=i" => \$N_max, # Max number of Ns allowed in strand | |
"m=i" => \$min_mean_qual, # Minimum mean quality calculated over the whole read | |
"l=i" => \$min_length # Minimum length after trimming | |
); | |
# How to force an input and output | |
# Opens the input file and gunzip if the filename ends with .gz | |
my $fh_in; | |
my $fh_out; | |
if ($input =~ /\.gz$/) { | |
open($fh_in, "gunzip -c $input |") or die "Could not read file '$input': $!\n"; | |
} else { | |
open($fh_in, '<', $input) or die "Could not read file '$input': $!\n"; | |
} | |
# Opens the output file and gzip the data if user switched the -z option | |
if ($gzip) { | |
$output = "$output.gz" if ($output !~ m/\.gz$/); | |
open($fh_out, "| gzip -c > $output") or die "Problems with gzip or file '$output': $!\n"; | |
} else { | |
open($fh_out, '>', $output) or die "Could not open the file '$output': $!\n"; | |
} | |
# Setting the phred score encoding if quality trim is selected by user | |
my $phred_start; | |
if ($min_qual or $min_mean_qual) { | |
if ($phred eq 'phred33') { | |
$phred_start = 33; | |
} elsif ($phred eq 'phred64') { | |
$phred_start = 64; | |
} else { | |
die "Phred score encoding must be given if quality trim is selected (-p phred33 or phred64)\n"; | |
} | |
} | |
my $discarded_reads = 0; | |
my $total_reads = 0; | |
while (my $entry = get_entry($fh_in)) { | |
&starttrim($entry, $starttrim) if ($starttrim); | |
&endtrim($entry, $endtrim) if ($endtrim); | |
&qualtrim($entry, $min_qual, $phred_start) if ($min_qual); | |
my $Ns = &count_n($entry) if ($N_max); | |
my $mean = &mean_qual($entry, $phred_start) if ($min_mean_qual); | |
if ($Ns <= $N_max and $mean >= $min_mean_qual and length($entry->{SEQ}) >= $min_length) { | |
&print_fastq($entry, $fh_out); | |
} else { | |
$discarded_reads++; | |
} | |
$total_reads++; | |
} | |
print "Total number of reads: $total_reads\n"; | |
print "Discarded reads: $discarded_reads\n"; | |
close($fh_in); | |
close($fh_out); | |
# ___SUBROUTINES___ # | |
sub get_entry { | |
# This subroutine creates a entry hash of the for lines that represents a fastq entry | |
my ($fh) = @_; | |
my (%entry); | |
$entry{SEQ_NAME} = <$fh>; # Sequence name | |
$entry{SEQ} = <$fh>; # Sequence | |
my $note = <$fh>; # Omits the comment line | |
$entry{QUAL} = <$fh>; # Quality score | |
return unless $entry{SEQ}; # Returns nothing when end of file is reached | |
# Quality check | |
die "ERROR: This file is not in fastq format\n" if substr($entry{SEQ_NAME},0,1) ne '@'; | |
die "ERROR: This file is not in fastq format\n" if substr($note,0,1) ne '+'; | |
die "ERROR: The file seems to be truncated\n" if !defined $entry{QUAL}; | |
die "ERROR: The lengths of the sequence and the quality scores are not equal\n" if (length($entry{SEQ}) != length($entry{QUAL})); | |
chomp %entry; | |
$entry{SEQ_NAME} = substr $entry{SEQ_NAME}, 1; | |
return \%entry; #wantarray ? %entry : \%entry; | |
} | |
sub starttrim { | |
# This subroutines removes X nucleotides from the 5' end and removes the quality scores | |
my ($h_ref, $length) = @_; | |
${$h_ref}{SEQ} = substr(${$h_ref}{SEQ},$length); | |
${$h_ref}{QUAL} = substr(${$h_ref}{QUAL}, $length); | |
} | |
sub endtrim { | |
# This subroutine removed X nucleotides from the 3' end and removies the quality scores | |
my ($h_ref, $length) = @_; | |
${$h_ref}{SEQ} = substr(${$h_ref}{SEQ}, 0, length(${$h_ref}{SEQ})-$length); | |
${$h_ref}{QUAL} = substr(${$h_ref}{QUAL}, 0, length(${$h_ref}{QUAL})-$length); | |
} | |
sub qualtrim { | |
# This subroutine will remove bases from the 3' end until the min quality score is reached | |
my ($h_ref, $min, $phred_start) = @_; | |
my @tmp = split(//,${$h_ref}{QUAL}); | |
my $i = $#tmp; | |
while($i >= 0 and ((ord $tmp[$i]) - $phred_start) < $min) { | |
$i--; | |
} | |
${$h_ref}{SEQ} = substr(${$h_ref}{SEQ},0,$i+1); #!!!!!!!! Måske en f løkke somspringer substr over hvis der ikke skal klippen noget af | |
${$h_ref}{QUAL} = substr(${$h_ref}{QUAL},0,$i+1); # Tæl hvor mange reads blev trimmet Kan man undlade =?? | |
} | |
sub count_n { | |
# This subroutine counts number of Ns in a DNA strand | |
my ($h_ref) = @_; | |
my @tmp = split(//,${$h_ref}{SEQ}); | |
my $Ns = 0; | |
foreach my $char (@tmp) { | |
$Ns++ if (uc($char) eq 'N'); | |
} | |
return $Ns; | |
} | |
sub mean_qual { | |
# This subroutine calculates the mean quality over a ASCII coded quality score | |
my ($h_ref, $phred_start) = @_; | |
my $sum = 0; | |
for (my $i = 0; $i < length($h_ref->{QUAL}); $i++) { | |
$sum += ord(substr($h_ref->{QUAL}, $i, 1)) - $phred_start; | |
} | |
my $mean = $sum/length($h_ref->{QUAL}); | |
return $mean; | |
} | |
sub print_fastq { | |
# This subroutine will print a fastq hash to a file | |
my ($h_ref,$fh_out) = @_; | |
print $fh_out "@",${$h_ref}{SEQ_NAME},"\n"; | |
print $fh_out ${$h_ref}{SEQ}, "\n"; | |
print $fh_out "+\n"; | |
print $fh_out ${$h_ref}{QUAL}, "\n"; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
$& = $MATCH
$` = $PREMATCH
$' = $POSTMATCH