leabenedicte/gist:5566826

## gistfile1.txt
#!/usr/bin/perl -w

use strict;
use Getopt::Long;
use Data::Dumper;

# Lea Benedicte Skov Hansen, Apr May 2013

# Variables that should eventually be defined by user!!

my $input;
my $output;
my $gzip;
my $starttrim;
my $endtrim;
my $phred = '';
my $min_qual;
my $N_max = 9999;
my $min_mean_qual = 0;
my $min_length = 0;
GetOptions("i=s" => \$input,         # Input filename
           "o=s" => \$output,        # Output filename
           "z!" => \$gzip,           # Gzip output data
           "s=i" => \$starttrim,     # Removes X nucleotiden from the 5' end
           "e=i" => \$endtrim,       # Removed X nucleotides from the 3' end
           "p=s" => \$phred,         # Phred score +33 or +64 (phred33 or phred64)
           "q=i" => \$min_qual,      # Removing bases from the 3' end with scores lower than user input
           "n=i" => \$N_max,         # Max number of Ns allowed in strand
           "m=i" => \$min_mean_qual, # Minimum mean quality calculated over the whole read
           "l=i" => \$min_length     # Minimum length after trimming
          );

# How to force an input and output


# Opens the input file and gunzip if the filename ends with .gz
my $fh_in;
my $fh_out;
if ($input =~ /\.gz$/) {
   open($fh_in, "gunzip -c $input |") or die "Could not read file '$input': $!\n";
} else {
   open($fh_in, '<', $input) or die "Could not read file '$input': $!\n";
}

# Opens the output file and gzip the data if user switched the -z option
if ($gzip) {
   $output = "$output.gz" if ($output !~ m/\.gz$/);
   open($fh_out, "| gzip -c > $output") or die "Problems with gzip or file '$output': $!\n";
} else {
   open($fh_out, '>', $output) or die "Could not open the file '$output': $!\n";
}

# Setting the phred score encoding if quality trim is selected by user
my $phred_start;
if ($min_qual or $min_mean_qual) {
   if ($phred eq 'phred33') {
      $phred_start = 33;
   } elsif ($phred eq 'phred64') {
      $phred_start = 64;
   } else {
      die "Phred score encoding must be given if quality trim is selected (-p phred33 or phred64)\n";
   }
}

my $discarded_reads = 0;
my $total_reads = 0;
while (my $entry = get_entry($fh_in)) {
   &starttrim($entry, $starttrim) if ($starttrim);
   &endtrim($entry, $endtrim) if ($endtrim);
   &qualtrim($entry, $min_qual, $phred_start) if ($min_qual);
   my $Ns = &count_n($entry) if ($N_max);
   my $mean = &mean_qual($entry, $phred_start) if ($min_mean_qual);
   if ($Ns <= $N_max and $mean >= $min_mean_qual and length($entry->{SEQ}) >= $min_length) {
      &print_fastq($entry, $fh_out);
   } else {
      $discarded_reads++;
   }
   $total_reads++;
}

print "Total number of reads: $total_reads\n";
print "Discarded reads: $discarded_reads\n";

close($fh_in);
close($fh_out);

# ___SUBROUTINES___ #

sub get_entry {

   # This subroutine creates a entry hash of the for lines that represents a fastq entry

   my ($fh) = @_;
   my (%entry);

   $entry{SEQ_NAME} = <$fh>; # Sequence name
   $entry{SEQ}      = <$fh>; # Sequence
   my $note         = <$fh>; # Omits the comment line
   $entry{QUAL}     = <$fh>; # Quality score

   return unless $entry{SEQ}; # Returns nothing when end of file is reached

   # Quality check
   die "ERROR: This file is not in fastq format\n" if substr($entry{SEQ_NAME},0,1) ne '@';
   die "ERROR: This file is not in fastq format\n" if substr($note,0,1) ne '+';
   die "ERROR: The file seems to be truncated\n" if !defined $entry{QUAL};
   die "ERROR: The lengths of the sequence and the quality scores are not equal\n" if (length($entry{SEQ}) != length($entry{QUAL}));

   chomp %entry;
   $entry{SEQ_NAME} = substr $entry{SEQ_NAME}, 1;
   return \%entry;                    #wantarray ? %entry : \%entry;
}

sub starttrim {

   # This subroutines removes X nucleotides from the 5' end and removes the quality scores

   my ($h_ref, $length) = @_;
   ${$h_ref}{SEQ} = substr(${$h_ref}{SEQ},$length);
   ${$h_ref}{QUAL} = substr(${$h_ref}{QUAL}, $length);
}

sub endtrim {

   # This subroutine removed X nucleotides from the 3' end and removies the quality scores

   my ($h_ref, $length) = @_;
   ${$h_ref}{SEQ} = substr(${$h_ref}{SEQ}, 0, length(${$h_ref}{SEQ})-$length);
   ${$h_ref}{QUAL} = substr(${$h_ref}{QUAL}, 0, length(${$h_ref}{QUAL})-$length);
}

sub qualtrim {

   # This subroutine will remove bases from the 3' end until the min quality score is reached

   my ($h_ref, $min, $phred_start) = @_;
   my @tmp = split(//,${$h_ref}{QUAL});
   my $i = $#tmp;
   while($i >= 0 and ((ord $tmp[$i]) - $phred_start) < $min) {
      $i--;
   }
   ${$h_ref}{SEQ} = substr(${$h_ref}{SEQ},0,$i+1);         #!!!!!!!! Måske en f løkke somspringer substr over hvis der ikke skal klippen noget af
   ${$h_ref}{QUAL} = substr(${$h_ref}{QUAL},0,$i+1);       # Tæl hvor mange reads blev trimmet Kan man undlade =??
}

sub count_n {

   # This subroutine counts number of Ns in a DNA strand

   my ($h_ref) = @_;
   my @tmp = split(//,${$h_ref}{SEQ});
   my $Ns = 0;
   foreach my $char (@tmp) {
      $Ns++ if (uc($char) eq 'N');
   }
   return $Ns;
}

sub mean_qual {

   # This subroutine calculates the mean quality over a ASCII coded quality score

   my ($h_ref, $phred_start) = @_;
   my $sum = 0;
   for (my $i = 0; $i < length($h_ref->{QUAL}); $i++) {
      $sum += ord(substr($h_ref->{QUAL}, $i, 1)) - $phred_start;
   }
   my $mean = $sum/length($h_ref->{QUAL});
   return $mean;
}

sub print_fastq {

   # This subroutine will print a fastq hash to a file

   my ($h_ref,$fh_out) = @_;
   print $fh_out "@",${$h_ref}{SEQ_NAME},"\n";
   print $fh_out ${$h_ref}{SEQ}, "\n";
   print $fh_out "+\n";
   print $fh_out ${$h_ref}{QUAL}, "\n";
}
	#!/usr/bin/perl -w

	use strict;
	use Getopt::Long;
	use Data::Dumper;

	# Lea Benedicte Skov Hansen, Apr May 2013

	# Variables that should eventually be defined by user!!

	my $input;
	my $output;
	my $gzip;
	my $starttrim;
	my $endtrim;
	my $phred = '';
	my $min_qual;
	my $N_max = 9999;
	my $min_mean_qual = 0;
	my $min_length = 0;
	GetOptions("i=s" => \$input, # Input filename
	"o=s" => \$output, # Output filename
	"z!" => \$gzip, # Gzip output data
	"s=i" => \$starttrim, # Removes X nucleotiden from the 5' end
	"e=i" => \$endtrim, # Removed X nucleotides from the 3' end
	"p=s" => \$phred, # Phred score +33 or +64 (phred33 or phred64)
	"q=i" => \$min_qual, # Removing bases from the 3' end with scores lower than user input
	"n=i" => \$N_max, # Max number of Ns allowed in strand
	"m=i" => \$min_mean_qual, # Minimum mean quality calculated over the whole read
	"l=i" => \$min_length # Minimum length after trimming
	);

	# How to force an input and output


	# Opens the input file and gunzip if the filename ends with .gz
	my $fh_in;
	my $fh_out;
	if ($input =~ /\.gz$/) {
	open($fh_in, "gunzip -c $input \|") or die "Could not read file '$input': $!\n";
	} else {
	open($fh_in, '<', $input) or die "Could not read file '$input': $!\n";
	}

	# Opens the output file and gzip the data if user switched the -z option
	if ($gzip) {
	$output = "$output.gz" if ($output !~ m/\.gz$/);
	open($fh_out, "\| gzip -c > $output") or die "Problems with gzip or file '$output': $!\n";
	} else {
	open($fh_out, '>', $output) or die "Could not open the file '$output': $!\n";
	}

	# Setting the phred score encoding if quality trim is selected by user
	my $phred_start;
	if ($min_qual or $min_mean_qual) {
	if ($phred eq 'phred33') {
	$phred_start = 33;
	} elsif ($phred eq 'phred64') {
	$phred_start = 64;
	} else {
	die "Phred score encoding must be given if quality trim is selected (-p phred33 or phred64)\n";
	}
	}

	my $discarded_reads = 0;
	my $total_reads = 0;
	while (my $entry = get_entry($fh_in)) {
	&starttrim($entry, $starttrim) if ($starttrim);
	&endtrim($entry, $endtrim) if ($endtrim);
	&qualtrim($entry, $min_qual, $phred_start) if ($min_qual);
	my $Ns = &count_n($entry) if ($N_max);
	my $mean = &mean_qual($entry, $phred_start) if ($min_mean_qual);
	if ($Ns <= $N_max and $mean >= $min_mean_qual and length($entry->{SEQ}) >= $min_length) {
	&print_fastq($entry, $fh_out);
	} else {
	$discarded_reads++;
	}
	$total_reads++;
	}

	print "Total number of reads: $total_reads\n";
	print "Discarded reads: $discarded_reads\n";

	close($fh_in);
	close($fh_out);

	# ___SUBROUTINES___ #

	sub get_entry {

	# This subroutine creates a entry hash of the for lines that represents a fastq entry

	my ($fh) = @_;
	my (%entry);

	$entry{SEQ_NAME} = <$fh>; # Sequence name
	$entry{SEQ} = <$fh>; # Sequence
	my $note = <$fh>; # Omits the comment line
	$entry{QUAL} = <$fh>; # Quality score

	return unless $entry{SEQ}; # Returns nothing when end of file is reached

	# Quality check
	die "ERROR: This file is not in fastq format\n" if substr($entry{SEQ_NAME},0,1) ne '@';
	die "ERROR: This file is not in fastq format\n" if substr($note,0,1) ne '+';
	die "ERROR: The file seems to be truncated\n" if !defined $entry{QUAL};
	die "ERROR: The lengths of the sequence and the quality scores are not equal\n" if (length($entry{SEQ}) != length($entry{QUAL}));

	chomp %entry;
	$entry{SEQ_NAME} = substr $entry{SEQ_NAME}, 1;
	return \%entry; #wantarray ? %entry : \%entry;
	}

	sub starttrim {

	# This subroutines removes X nucleotides from the 5' end and removes the quality scores

	my ($h_ref, $length) = @_;
	${$h_ref}{SEQ} = substr(${$h_ref}{SEQ},$length);
	${$h_ref}{QUAL} = substr(${$h_ref}{QUAL}, $length);
	}

	sub endtrim {

	# This subroutine removed X nucleotides from the 3' end and removies the quality scores

	my ($h_ref, $length) = @_;
	${$h_ref}{SEQ} = substr(${$h_ref}{SEQ}, 0, length(${$h_ref}{SEQ})-$length);
	${$h_ref}{QUAL} = substr(${$h_ref}{QUAL}, 0, length(${$h_ref}{QUAL})-$length);
	}

	sub qualtrim {

	# This subroutine will remove bases from the 3' end until the min quality score is reached

	my ($h_ref, $min, $phred_start) = @_;
	my @tmp = split(//,${$h_ref}{QUAL});
	my $i = $#tmp;
	while($i >= 0 and ((ord $tmp[$i]) - $phred_start) < $min) {
	$i--;
	}
	${$h_ref}{SEQ} = substr(${$h_ref}{SEQ},0,$i+1); #!!!!!!!! Måske en f løkke somspringer substr over hvis der ikke skal klippen noget af
	${$h_ref}{QUAL} = substr(${$h_ref}{QUAL},0,$i+1); # Tæl hvor mange reads blev trimmet Kan man undlade =??
	}

	sub count_n {

	# This subroutine counts number of Ns in a DNA strand

	my ($h_ref) = @_;
	my @tmp = split(//,${$h_ref}{SEQ});
	my $Ns = 0;
	foreach my $char (@tmp) {
	$Ns++ if (uc($char) eq 'N');
	}
	return $Ns;
	}

	sub mean_qual {

	# This subroutine calculates the mean quality over a ASCII coded quality score

	my ($h_ref, $phred_start) = @_;
	my $sum = 0;
	for (my $i = 0; $i < length($h_ref->{QUAL}); $i++) {
	$sum += ord(substr($h_ref->{QUAL}, $i, 1)) - $phred_start;
	}
	my $mean = $sum/length($h_ref->{QUAL});
	return $mean;
	}

	sub print_fastq {

	# This subroutine will print a fastq hash to a file

	my ($h_ref,$fh_out) = @_;
	print $fh_out "@",${$h_ref}{SEQ_NAME},"\n";
	print $fh_out ${$h_ref}{SEQ}, "\n";
	print $fh_out "+\n";
	print $fh_out ${$h_ref}{QUAL}, "\n";
	}