Skip to content

Instantly share code, notes, and snippets.

@nylander
Created June 15, 2023 09:16
Show Gist options
  • Save nylander/287d1f47c669a350c2e7b97a3da58df5 to your computer and use it in GitHub Desktop.
Save nylander/287d1f47c669a350c2e7b97a3da58df5 to your computer and use it in GitHub Desktop.
Create a sample sheet from NGI (SciLifeLab) sequencing delivery
#!/usr/bin/env perl
=pod

=encoding utf8

=head1 NAME

create_sample_sheet.pl - Create sample sheet from NGI delivery

=head1 SYNOPSIS

    $ create_sample_sheet.pl 00-Reports/S.OmeName_22_01_sample_info.txt

=head1 DESCRIPTION

Change directory to the main delivery directory (e.g. "cd P27213"). Assuming
there is a column with user-defined IDs in the file
"00-Reports/*_sample_info.txt" (tab-separated columns), the script reads
columns 1 and 2 of that file (NGI ID and user ID), and then locates the
corresponding .fastq.gz files below the current directory.

Prints tab-separated entries to STDOUT.

Note: currently assumes paired-end (PE) libraries.

=head1 OPTIONS

=over 4

=item B<-a,--atlas>

Output table for the ATLAS workflow (see L<https://github.com/metagenome-atlas/atlas>).

=item B<-e,--eager>

Output table for the nf-core/eager workflow (see L<https://nf-co.re/eager/2.4.7/usage#tsv-input-method>).

=item B<-h,--help>

Display help.

=item B<-v,--version>

Display version.

=back

=head1 AUTHOR

Johan Nylander <johan.nylander@nrm.se>

=head1 VERSION

0.2

=head1 COPYRIGHT AND LICENSE

Copyright 2023 Johan Nylander.

Distributed under the MIT License.

=cut
use strict;
use warnings;
use File::Find;
use File::Basename;
use Cwd;
use Data::Dumper;
use Getopt::Long;
Getopt::Long::Configure("no_ignore_case", "no_auto_abbrev");
# Script version, printed by -v/--version.
my $version = '0.2';

# Constant column values written into every row of the nf-core/eager table.
# For their meaning, see https://nf-co.re/eager/2.4.7/usage#tsv-input-method
my $eager_colour_chemistry = '2';
my $eager_seqtype          = 'PE';
my $eager_organism         = 'NA';
my $eager_strandedness     = 'double';
my $eager_udg_treatment    = 'full';
my $eager_bam              = 'NA';

# Output format switches, set by GetOptions (-a/--atlas, -e/--eager).
my ($atlas, $eager) = ('', '');

# Per-sample records keyed by NGI ID; each record holds the keys
# 'uid', 'ngiid', and (once located) 'R1' and 'R2'.
my %HoH;

# User IDs and NGI IDs in the order they appear in the sample info file.
my @uid_order;
my @ngiid_order;

# Every .fastq.gz path collected by find_files().
my @file_paths;

# Root of the file search: the directory the script is started from.
my $cwd = getcwd();
# Parse the command line.
#   -a/--atlas, -e/--eager  select the output table format
#   -v/--version            print the version and exit
#   -h                      print a one-line usage and exit
#   --help                  show the full POD through perldoc
# After option parsing, exactly one positional argument is consumed: the path
# to the sample info file.
my $usage = "Usage: $0 [OPTIONS][--help] sample_info_file\n";
GetOptions(
    'a|atlas'   => \$atlas,
    'e|eager'   => \$eager,
    'v|version' => sub { print "$version\n"; exit(0); },
    'h'         => sub { print $usage; exit(0); },
    'help'      => sub {
        # exec only returns when it fails (e.g. perldoc is not installed).
        # Report that and exit non-zero instead of silently exiting with 0.
        exec("perldoc", $0) or do {
            print STDERR "Error: could not run perldoc: $!\n", $usage;
            exit(1);
        };
    },
) or die("$0 Error in command line arguments\n$usage");

# No arguments on an interactive terminal: show the usage and stop.
if (@ARGV == 0 && -t STDIN && -t STDERR) {
    print $usage;
    exit(1);
}

# Test for definedness/length rather than truth so that a file literally
# named "0" is accepted.
my $sample_info_file = shift @ARGV;
die "Error: need sample file as argument\n"
    unless defined $sample_info_file && length $sample_info_file;
# File::Find "wanted" callback: records the full path of every regular file
# whose name ends in ".fastq.gz" in @file_paths.
# File::Find supplies the current file name in $_ and its full path in
# $File::Find::name.
sub find_files {
    my $is_fastq = -f $_ && $_ =~ /^.*\.fastq\.gz\z/s;
    return unless $is_fastq;
    push @file_paths, $File::Find::name;
}
# Read the tab-separated sample info file. Column 1 is the NGI ID and column 2
# the user-defined ID; any further columns are ignored. The header line
# (starting with "NGI ID") is skipped. Each sample is recorded in %HoH under
# its NGI ID, and the input order is kept in @ngiid_order and @uid_order.
open(my $SAMPLE, "<", $sample_info_file)
    or die "Error: could not open sample file '$sample_info_file': $!\n";
while (my $line = <$SAMPLE>) {
    chomp $line;
    $line =~ s/\r\z//;                  # tolerate CRLF line endings
    next if $line =~ /^NGI ID/;         # header line
    next unless $line =~ /\S/;          # blank line

    my ($ngiid, $userid) = split /\t/, $line;

    # A line without both IDs would otherwise store undef values, which later
    # break the ID-to-file matching and the output.
    unless (defined $ngiid && length $ngiid && defined $userid && length $userid) {
        print STDERR "Warning: skipping malformed line $. in $sample_info_file\n";
        next;
    }

    push @uid_order,   $userid;
    push @ngiid_order, $ngiid;
    $HoH{$ngiid}{'uid'}   = $userid;
    $HoH{$ngiid}{'ngiid'} = $ngiid;
}
close($SAMPLE);
# Collect all .fastq.gz files below the start directory and attach the R1/R2
# file of each sample to its record in %HoH.
find(\&find_files, $cwd);

# Sort so the result does not depend on directory traversal order: if a sample
# has several matching files (e.g. several lanes), the last path in sort order
# wins for both R1 and R2.
my @sorted_paths = sort @file_paths;

for my $id (@ngiid_order) {
    # Match the ID literally (\Q..\E) and only as a whole token, so that e.g.
    # "P27213_1" does not also match the files of "P27213_116".
    my $id_re = qr/(?<![A-Za-z0-9])\Q$id\E(?![A-Za-z0-9])/;

    for my $file_path (@sorted_paths) {
        next unless $file_path =~ $id_re;
        if ($file_path =~ /R1_001\.fastq\.gz/) {
            $HoH{$id}{'R1'} = $file_path;
        }
        elsif ($file_path =~ /R2_001\.fastq\.gz/) {
            $HoH{$id}{'R2'} = $file_path;
        }
    }

    # Warn once per missing read file, after all files have been examined,
    # rather than once for every file that belongs to a different sample.
    for my $read (qw(R1 R2)) {
        next if exists $HoH{$id}{$read};
        print STDERR "Warning: could not find $read fastq.gz file for ID $id\n";
    }
}
# Write the sample sheet to STDOUT in the requested format. Only samples for
# which both the R1 and the R2 file were located are written; this applies to
# every format, so the atlas table no longer calls fileparse() on a missing
# path.
my @complete_ids =
    grep { exists $HoH{$_}{'R1'} && exists $HoH{$_}{'R2'} } @ngiid_order;

if ($eager) {
    # See https://nf-co.re/eager/2.4.7/usage#tsv-input-method
    print STDOUT "Sample_Name\tLibrary_ID\tLane\tColour_Chemistry\tSeqType\tOrganism\tStrandedness\tUDG_Treatment\tR1\tR2\tBAM\n";
    for my $id (@complete_ids) {
        my $sample = $HoH{$id};

        # The lane is the digits after "_L" at the end of the R1 file name
        # once the "_R1_001.fastq.gz" suffix is removed; empty if absent.
        my ($stem) = fileparse($sample->{'R1'}, '_R1_001.fastq.gz');
        my $lane = $stem =~ /.+_L(\d+)$/ ? $1 : '';

        print STDOUT join("\t",
            $sample->{'uid'},
            $sample->{'ngiid'},
            $lane,
            $eager_colour_chemistry,
            $eager_seqtype,
            $eager_organism,
            $eager_strandedness,
            $eager_udg_treatment,
            $sample->{'R1'},
            $sample->{'R2'},
            $eager_bam,
        ), "\n";
    }
}
elsif ($atlas) {
    print STDOUT "Sample\tLib\tFile\tRead\tPath\n";
    for my $id (@complete_ids) {
        # One row per read file: R1 first, then R2.
        for my $read (qw(R1 R2)) {
            my ($stem, $dir) = fileparse($HoH{$id}{$read}, "_${read}_001.fastq.gz");

            # Lib is the name of the directory that directly holds the file.
            my $lib = (split m{/}, $dir)[-1];

            print STDOUT join("\t", $HoH{$id}{'uid'}, $lib, $stem, $read, $dir), "\n";
        }
    }
}
else {
    # Default table: NGI ID, user ID, R1 path, R2 path.
    for my $id (@complete_ids) {
        print join("\t", @{ $HoH{$id} }{qw(ngiid uid R1 R2)}), "\n";
    }
}
__END__
# %HoH = (
# 'P27213_116' => {
# 'uid' => 'PS-B6',
# 'ngiid' => 'P27213_116',
# 'R1' => '/proj/nrmdnalab_storage/Pleurozium_NV/ngisthlm00062/files/P27213/P27213_116/02-FASTQ/221014_A01901_0056_AHNLGHDSX3/P27213_116_S68_L002_R1_001.fastq.gz',
# 'R2' => '/proj/nrmdnalab_storage/Pleurozium_NV/ngisthlm00062/files/P27213/P27213_116/02-FASTQ/221014_A01901_0056_AHNLGHDSX3/P27213_116_S68_L002_R2_001.fastq.gz',
# },
# );
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment