Skip to content

Instantly share code, notes, and snippets.

@dansmith01
Created December 10, 2012 20:47
Show Gist options
  • Save dansmith01/4253267 to your computer and use it in GitHub Desktop.
Save dansmith01/4253267 to your computer and use it in GitHub Desktop.
Demultiplexes and quality-checks Illumina sequencing reads. (Fastq in, Fasta out)
#!/usr/bin/perl
#
# ---------------------------------------- #
# Daniel Smith - October 1st, 2012 #
# Argonne National Laboratory #
# Creative Commons License BY-SA 3.0 #
# ---------------------------------------- #
#
use strict;
use warnings;
use Getopt::Std;
use Cwd 'abs_path';
my $USAGE = <<EOF;
Usage: $0 -m mappingfile.txt -i reads.fastq.gz -b barcodes.fastq.gz -o seqs.fna
or: $0 -m mappingfile.txt -i '*R1*.fastq' -b '*R2*.fastq' -o seqs.fna -q 10
or: $0 -m mappingfile.txt -i seqs1.fq,seqs2.fq -b bc1.fq,bc2.fq -o seqs.fna
This script demultiplexes and quality-checks sequencing reads.
Required parameters:
-m Mapping file in the format of sampleID<tab>barcodeSequence
-i Sequence reads, in fastq format (can be a gzipped file) glob paths & csv lists OK
-b Barcode reads, in fastq format (can be a gzipped file) glob paths & csv lists OK
-o Output file
Optional parameters:
-q Minimum Phred quality score for entire sequence (default: 20)
EOF
our ($opt_m, $opt_i, $opt_b, $opt_o, $opt_q);
getopt('mibonq');
die ("Error: No mapping file specied (-m).\n\n$USAGE") unless (defined($opt_m));
die ("Error: No sequence file specied (-i).\n\n$USAGE") unless (defined($opt_i));
die ("Error: No barcode file specied (-b).\n\n$USAGE") unless (defined($opt_b));
die ("Error: No output file specied (-o).\n\n$USAGE") unless (defined($opt_o));
my @seq_files = glob(join(' ', split(',', $opt_i)));
my @bc_files = glob(join(' ', split(',', $opt_b)));
die "Error: '$opt_i' does not match any files.\n\n$USAGE" unless (scalar(@seq_files));
die "Error: '$opt_b' does not match any files.\n\n$USAGE" unless (scalar(@bc_files));
if (scalar(@seq_files) != scalar(@bc_files)) {
print STDERR "Error: Number of sequence files does not match number of barcode files.\n\n";
die ($USAGE);
}
foreach ($opt_m, @seq_files, @bc_files) {
next if (-r $_);
print STDERR "Error: Unable to open file '$_' for reading.\n\n";
die ($USAGE);
}
my $MIN_Q = $opt_q || 20;
if ($MIN_Q !~ m/^[1-9]\d*$/) {
print STDERR "Error: '$MIN_Q' is not a valid number\n\n";
die ($USAGE);
}
my %barcode2id = ();
my %id2barcode = ();
my $lists = {};
open (MAPPINGFILE, $opt_m) or die ("Can't open file '$opt_m': $!\n");
while (my $line = <MAPPINGFILE>) {
chomp $line;
next if (substr($line, 0, 1) eq "#");
my @f = split("\t", $line);
next unless (scalar(@f) >= 2);
if ($f[0] !~ /^[0-9A-Za-z\.]+$/) {
die ("Error: First column in mapping file must only contain alphanumeric or '.' characters.\n\n$USAGE");
}
if ($f[1] !~ /^[ATCG]+$/) {
die ("Error: Second column in mapping file must be barcode sequence.\n\n$USAGE");
}
$f[1] = reverse($f[1]);
$f[1] =~ tr/ATCG/TAGC/;
die ("Error: Duplicate Barcode: '$f[1]'\n") if (exists($barcode2id{$f[1]}));
die ("Error: Duplicate ID: '$f[0]'\n") if (exists($id2barcode{$f[0]}));
$barcode2id{$f[1]} = $f[0];
$id2barcode{$f[0]} = $f[1];
$lists->{$f[0]} = { 'count' => 0 };
}
close MAPPINGFILE;
my $seqs_input = 0;
my $seqs_bc_ok = 0;
my $seqs_Ns_ok = 0;
my $seqs_ql_ok = 0;
my $seqs_out = 0;
open (OUT, ">$opt_o") or die("Can't open output file '$opt_o': $!\n");
foreach my $i (0..$#seq_files) {
if (substr($seq_files[$i], -3) eq ".gz") { open(READS, "gunzip -c '$seq_files[$i]'|") or die("$seq_files[$i] error: $!\n"); }
else { open(READS, $seq_files[$i]) or die("$seq_files[$i] error: $!\n"); }
if (substr($bc_files[$i], -3) eq ".gz") { open(BARCODES, "gunzip -c '$bc_files[$i]'|") or die("$bc_files[$i] error: $!\n"); }
else { open(BARCODES, $bc_files[$i]) or die("$bc_files[$i] error: $!\n"); }
my $firstline = 1;
while (!eof(READS)) {
$seqs_input++;
my $bc_header = <BARCODES>;
my $barcode = <BARCODES>;
<BARCODES>;
<BARCODES>;
my $seq_header = <READS>;
my $seq = <READS>;
<READS>;
my $qscores = <READS>;
if ($firstline) {
$bc_header = substr($bc_header, 0, index($bc_header, ' '));
$seq_header = substr($seq_header, 0, index($seq_header, ' '));
if ($bc_header ne $seq_header) {
print STDERR "Error: Barcode header does not match sequence header.\n";
print STDERR " Barcode File: ". $bc_files[$i] ."\n";
print STDERR " Sequence File: ". $seq_files[$i] ."\n";
print STDERR " Barcode Header: $bc_header\n";
print STDERR " Sequence Header: $seq_header\n\n";
die ($USAGE);
}
} else {
$firstline = 0;
}
chomp($barcode);
chomp($seq);
chomp($qscores);
my $sid = $barcode2id{$barcode} || next;
$seqs_bc_ok++;
# Skip if the sequence contains any N characters
next if (index($seq, "N") > -1);
$seqs_Ns_ok++;
# Skip if any base is below a Phred score of 20
next if (grep($_ - 33 < $MIN_Q, map(ord($_), split('', $qscores))));
$seqs_ql_ok++;
$seqs_out++;
$lists->{$sid}->{'count'}++;
my $id = sprintf("%s_%05d", $sid, $lists->{$sid}->{'count'});
print OUT ">", $id, "\n", $seq, "\n";
}
close BARCODES;
close READS;
}
close OUT;
open (INFO, ">$opt_o.info.txt") or die("Can't open output file '$opt_o.info': $!\n");
print INFO "Input Sequences: $seqs_input\n";
print INFO "Barcode Matched: $seqs_bc_ok\n";
print INFO "No Ns in Sequence: $seqs_Ns_ok\n";
print INFO "Entire Sequence >= Phred Q$MIN_Q: $seqs_ql_ok\n\n";
print INFO "# of Samples in Mapping File: ", scalar(keys %barcode2id), "\n";
print INFO "Mapping File: ", abs_path($opt_m), "\n";
print INFO "Sequence File(s): ", join(", ", map(abs_path($_), @seq_files)), "\n";
print INFO "Barcode File(s): ", join(", ", map(abs_path($_), @bc_files)), "\n";
print INFO "Output File: ", abs_path($opt_o), "\n\n";
print INFO "Total number of sequences passing all filters for each sample:\n";
my @ordered = sort { $lists->{$b}->{'count'} <=> $lists->{$a}->{'count'} } keys %{$lists};
foreach my $sid (@ordered) {
print INFO "$sid\t", $lists->{$sid}->{'count'}, "\n";
}
close INFO;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment