Skip to content

Instantly share code, notes, and snippets.

@maasha
Forked from leabenedicte/gist:5566826
Created May 13, 2013 12:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save maasha/5567819 to your computer and use it in GitHub Desktop.
Save maasha/5567819 to your computer and use it in GitHub Desktop.
#!/usr/bin/perl -w
use strict;
use Getopt::Long;
use Data::Dumper;
# Lea Benedicte Skov Hansen, Apr May 2013
# Variables that should eventually be defined by user!!
my $input;
my $output;
my $gzip;
my $starttrim;
my $endtrim;
my $phred = '';
my $min_qual;
my $N_max = 9999;
my $min_mean_qual = 0;
my $min_length = 0;
GetOptions("i=s" => \$input, # Input filename
"o=s" => \$output, # Output filename
"z!" => \$gzip, # Gzip output data
"s=i" => \$starttrim, # Removes X nucleotiden from the 5' end
"e=i" => \$endtrim, # Removed X nucleotides from the 3' end
"p=s" => \$phred, # Phred score +33 or +64 (phred33 or phred64)
"q=i" => \$min_qual, # Removing bases from the 3' end with scores lower than user input
"n=i" => \$N_max, # Max number of Ns allowed in strand
"m=i" => \$min_mean_qual, # Minimum mean quality calculated over the whole read
"l=i" => \$min_length # Minimum length after trimming
);
# How to force an input and output
# Opens the input file and gunzip if the filename ends with .gz
my $fh_in;
my $fh_out;
if ($input =~ /\.gz$/) {
open($fh_in, "gunzip -c $input |") or die "Could not read file '$input': $!\n";
} else {
open($fh_in, '<', $input) or die "Could not read file '$input': $!\n";
}
# Opens the output file and gzip the data if user switched the -z option
if ($gzip) {
$output = "$output.gz" if ($output !~ m/\.gz$/);
open($fh_out, "| gzip -c > $output") or die "Problems with gzip or file '$output': $!\n";
} else {
open($fh_out, '>', $output) or die "Could not open the file '$output': $!\n";
}
# Setting the phred score encoding if quality trim is selected by user
my $phred_start;
if ($min_qual or $min_mean_qual) {
if ($phred eq 'phred33') {
$phred_start = 33;
} elsif ($phred eq 'phred64') {
$phred_start = 64;
} else {
die "Phred score encoding must be given if quality trim is selected (-p phred33 or phred64)\n";
}
}
my $discarded_reads = 0;
my $total_reads = 0;
while (my $entry = get_entry($fh_in)) {
&starttrim($entry, $starttrim) if ($starttrim);
&endtrim($entry, $endtrim) if ($endtrim);
&qualtrim($entry, $min_qual, $phred_start) if ($min_qual);
my $Ns = &count_n($entry) if ($N_max);
my $mean = &mean_qual($entry, $phred_start) if ($min_mean_qual);
if ($Ns <= $N_max and $mean >= $min_mean_qual and length($entry->{SEQ}) >= $min_length) {
&print_fastq($entry, $fh_out);
} else {
$discarded_reads++;
}
$total_reads++;
}
print "Total number of reads: $total_reads\n";
print "Discarded reads: $discarded_reads\n";
close($fh_in);
close($fh_out);
# ___SUBROUTINES___ #
sub get_entry {
# This subroutine creates a entry hash of the for lines that represents a fastq entry
my ($fh) = @_;
my (%entry);
$entry{SEQ_NAME} = <$fh>; # Sequence name
$entry{SEQ} = <$fh>; # Sequence
my $note = <$fh>; # Omits the comment line
$entry{QUAL} = <$fh>; # Quality score
return unless $entry{SEQ}; # Returns nothing when end of file is reached
# Quality check
die "ERROR: This file is not in fastq format\n" if substr($entry{SEQ_NAME},0,1) ne '@';
die "ERROR: This file is not in fastq format\n" if substr($note,0,1) ne '+';
die "ERROR: The file seems to be truncated\n" if !defined $entry{QUAL};
die "ERROR: The lengths of the sequence and the quality scores are not equal\n" if (length($entry{SEQ}) != length($entry{QUAL}));
chomp %entry;
$entry{SEQ_NAME} = substr $entry{SEQ_NAME}, 1;
return \%entry; #wantarray ? %entry : \%entry;
}
sub starttrim {
# This subroutines removes X nucleotides from the 5' end and removes the quality scores
my ($h_ref, $length) = @_;
${$h_ref}{SEQ} = substr(${$h_ref}{SEQ},$length);
${$h_ref}{QUAL} = substr(${$h_ref}{QUAL}, $length);
}
sub endtrim {
# This subroutine removed X nucleotides from the 3' end and removies the quality scores
my ($h_ref, $length) = @_;
${$h_ref}{SEQ} = substr(${$h_ref}{SEQ}, 0, length(${$h_ref}{SEQ})-$length);
${$h_ref}{QUAL} = substr(${$h_ref}{QUAL}, 0, length(${$h_ref}{QUAL})-$length);
}
sub qualtrim {
# This subroutine will remove bases from the 3' end until the min quality score is reached
my ($h_ref, $min, $phred_start) = @_;
my @tmp = split(//,${$h_ref}{QUAL});
my $i = $#tmp;
while($i >= 0 and ((ord $tmp[$i]) - $phred_start) < $min) {
$i--;
}
${$h_ref}{SEQ} = substr(${$h_ref}{SEQ},0,$i+1); #!!!!!!!! Måske en f løkke somspringer substr over hvis der ikke skal klippen noget af
${$h_ref}{QUAL} = substr(${$h_ref}{QUAL},0,$i+1); # Tæl hvor mange reads blev trimmet Kan man undlade =??
}
sub count_n {
# This subroutine counts number of Ns in a DNA strand
my ($h_ref) = @_;
my @tmp = split(//,${$h_ref}{SEQ});
my $Ns = 0;
foreach my $char (@tmp) {
$Ns++ if (uc($char) eq 'N');
}
return $Ns;
}
sub mean_qual {
# This subroutine calculates the mean quality over a ASCII coded quality score
my ($h_ref, $phred_start) = @_;
my $sum = 0;
for (my $i = 0; $i < length($h_ref->{QUAL}); $i++) {
$sum += ord(substr($h_ref->{QUAL}, $i, 1)) - $phred_start;
}
my $mean = $sum/length($h_ref->{QUAL});
return $mean;
}
sub print_fastq {
# This subroutine will print a fastq hash to a file
my ($h_ref,$fh_out) = @_;
print $fh_out "@",${$h_ref}{SEQ_NAME},"\n";
print $fh_out ${$h_ref}{SEQ}, "\n";
print $fh_out "+\n";
print $fh_out ${$h_ref}{QUAL}, "\n";
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment