Created
December 10, 2012 20:47
-
-
Save dansmith01/4253267 to your computer and use it in GitHub Desktop.
Demultiplexes and quality-checks Illumina sequencing reads. (Fastq in, Fasta out)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
# | |
# ---------------------------------------- # | |
# Daniel Smith - October 1st, 2012 # | |
# Argonne National Laboratory # | |
# Creative Commons License BY-SA 3.0 # | |
# ---------------------------------------- # | |
# | |
use strict; | |
use warnings; | |
use Getopt::Std; | |
use Cwd 'abs_path'; | |
my $USAGE = <<EOF; | |
Usage: $0 -m mappingfile.txt -i reads.fastq.gz -b barcodes.fastq.gz -o seqs.fna | |
or: $0 -m mappingfile.txt -i '*R1*.fastq' -b '*R2*.fastq' -o seqs.fna -q 10 | |
or: $0 -m mappingfile.txt -i seqs1.fq,seqs2.fq -b bc1.fq,bc2.fq -o seqs.fna | |
This script demultiplexes and quality-checks sequencing reads. | |
Required parameters: | |
-m Mapping file in the format of sampleID<tab>barcodeSequence | |
-i Sequence reads, in fastq format (can be a gzipped file) glob paths & csv lists OK | |
-b Barcode reads, in fastq format (can be a gzipped file) glob paths & csv lists OK | |
-o Output file | |
Optional parameters: | |
-q Minimum Phred quality score for entire sequence (default: 20) | |
EOF | |
our ($opt_m, $opt_i, $opt_b, $opt_o, $opt_q); | |
getopt('mibonq'); | |
die ("Error: No mapping file specied (-m).\n\n$USAGE") unless (defined($opt_m)); | |
die ("Error: No sequence file specied (-i).\n\n$USAGE") unless (defined($opt_i)); | |
die ("Error: No barcode file specied (-b).\n\n$USAGE") unless (defined($opt_b)); | |
die ("Error: No output file specied (-o).\n\n$USAGE") unless (defined($opt_o)); | |
my @seq_files = glob(join(' ', split(',', $opt_i))); | |
my @bc_files = glob(join(' ', split(',', $opt_b))); | |
die "Error: '$opt_i' does not match any files.\n\n$USAGE" unless (scalar(@seq_files)); | |
die "Error: '$opt_b' does not match any files.\n\n$USAGE" unless (scalar(@bc_files)); | |
if (scalar(@seq_files) != scalar(@bc_files)) { | |
print STDERR "Error: Number of sequence files does not match number of barcode files.\n\n"; | |
die ($USAGE); | |
} | |
foreach ($opt_m, @seq_files, @bc_files) { | |
next if (-r $_); | |
print STDERR "Error: Unable to open file '$_' for reading.\n\n"; | |
die ($USAGE); | |
} | |
my $MIN_Q = $opt_q || 20; | |
if ($MIN_Q !~ m/^[1-9]\d*$/) { | |
print STDERR "Error: '$MIN_Q' is not a valid number\n\n"; | |
die ($USAGE); | |
} | |
my %barcode2id = (); | |
my %id2barcode = (); | |
my $lists = {}; | |
open (MAPPINGFILE, $opt_m) or die ("Can't open file '$opt_m': $!\n"); | |
while (my $line = <MAPPINGFILE>) { | |
chomp $line; | |
next if (substr($line, 0, 1) eq "#"); | |
my @f = split("\t", $line); | |
next unless (scalar(@f) >= 2); | |
if ($f[0] !~ /^[0-9A-Za-z\.]+$/) { | |
die ("Error: First column in mapping file must only contain alphanumeric or '.' characters.\n\n$USAGE"); | |
} | |
if ($f[1] !~ /^[ATCG]+$/) { | |
die ("Error: Second column in mapping file must be barcode sequence.\n\n$USAGE"); | |
} | |
$f[1] = reverse($f[1]); | |
$f[1] =~ tr/ATCG/TAGC/; | |
die ("Error: Duplicate Barcode: '$f[1]'\n") if (exists($barcode2id{$f[1]})); | |
die ("Error: Duplicate ID: '$f[0]'\n") if (exists($id2barcode{$f[0]})); | |
$barcode2id{$f[1]} = $f[0]; | |
$id2barcode{$f[0]} = $f[1]; | |
$lists->{$f[0]} = { 'count' => 0 }; | |
} | |
close MAPPINGFILE; | |
my $seqs_input = 0; | |
my $seqs_bc_ok = 0; | |
my $seqs_Ns_ok = 0; | |
my $seqs_ql_ok = 0; | |
my $seqs_out = 0; | |
open (OUT, ">$opt_o") or die("Can't open output file '$opt_o': $!\n"); | |
foreach my $i (0..$#seq_files) { | |
if (substr($seq_files[$i], -3) eq ".gz") { open(READS, "gunzip -c '$seq_files[$i]'|") or die("$seq_files[$i] error: $!\n"); } | |
else { open(READS, $seq_files[$i]) or die("$seq_files[$i] error: $!\n"); } | |
if (substr($bc_files[$i], -3) eq ".gz") { open(BARCODES, "gunzip -c '$bc_files[$i]'|") or die("$bc_files[$i] error: $!\n"); } | |
else { open(BARCODES, $bc_files[$i]) or die("$bc_files[$i] error: $!\n"); } | |
my $firstline = 1; | |
while (!eof(READS)) { | |
$seqs_input++; | |
my $bc_header = <BARCODES>; | |
my $barcode = <BARCODES>; | |
<BARCODES>; | |
<BARCODES>; | |
my $seq_header = <READS>; | |
my $seq = <READS>; | |
<READS>; | |
my $qscores = <READS>; | |
if ($firstline) { | |
$bc_header = substr($bc_header, 0, index($bc_header, ' ')); | |
$seq_header = substr($seq_header, 0, index($seq_header, ' ')); | |
if ($bc_header ne $seq_header) { | |
print STDERR "Error: Barcode header does not match sequence header.\n"; | |
print STDERR " Barcode File: ". $bc_files[$i] ."\n"; | |
print STDERR " Sequence File: ". $seq_files[$i] ."\n"; | |
print STDERR " Barcode Header: $bc_header\n"; | |
print STDERR " Sequence Header: $seq_header\n\n"; | |
die ($USAGE); | |
} | |
} else { | |
$firstline = 0; | |
} | |
chomp($barcode); | |
chomp($seq); | |
chomp($qscores); | |
my $sid = $barcode2id{$barcode} || next; | |
$seqs_bc_ok++; | |
# Skip if the sequence contains any N characters | |
next if (index($seq, "N") > -1); | |
$seqs_Ns_ok++; | |
# Skip if any base is below a Phred score of 20 | |
next if (grep($_ - 33 < $MIN_Q, map(ord($_), split('', $qscores)))); | |
$seqs_ql_ok++; | |
$seqs_out++; | |
$lists->{$sid}->{'count'}++; | |
my $id = sprintf("%s_%05d", $sid, $lists->{$sid}->{'count'}); | |
print OUT ">", $id, "\n", $seq, "\n"; | |
} | |
close BARCODES; | |
close READS; | |
} | |
close OUT; | |
open (INFO, ">$opt_o.info.txt") or die("Can't open output file '$opt_o.info': $!\n"); | |
print INFO "Input Sequences: $seqs_input\n"; | |
print INFO "Barcode Matched: $seqs_bc_ok\n"; | |
print INFO "No Ns in Sequence: $seqs_Ns_ok\n"; | |
print INFO "Entire Sequence >= Phred Q$MIN_Q: $seqs_ql_ok\n\n"; | |
print INFO "# of Samples in Mapping File: ", scalar(keys %barcode2id), "\n"; | |
print INFO "Mapping File: ", abs_path($opt_m), "\n"; | |
print INFO "Sequence File(s): ", join(", ", map(abs_path($_), @seq_files)), "\n"; | |
print INFO "Barcode File(s): ", join(", ", map(abs_path($_), @bc_files)), "\n"; | |
print INFO "Output File: ", abs_path($opt_o), "\n\n"; | |
print INFO "Total number of sequences passing all filters for each sample:\n"; | |
my @ordered = sort { $lists->{$b}->{'count'} <=> $lists->{$a}->{'count'} } keys %{$lists}; | |
foreach my $sid (@ordered) { | |
print INFO "$sid\t", $lists->{$sid}->{'count'}, "\n"; | |
} | |
close INFO; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment