nylander/create_sample_sheet.pl

## create_sample_sheet.pl
#!/usr/bin/env perl

=pod

=encoding utf8

=head1 NAME

create_sample_sheet.pl - Create sample sheet from NGI delivery

=head1 SYNOPSIS

    $ create_sample_sheet.pl 00-Reports/S.OmeName_22_01_sample_info.txt

=head1 DESCRIPTION

Change directory to the main delivery directory (e.g. "cd P27213") before
running the script.  Assuming the tab-separated file
"00-Reports/*_sample_info.txt" has the NGI IDs in column 1 and the user-defined
IDs in column 2, the script reads those two columns and then locates the
corresponding .fastq.gz files below the current working directory.

Prints tab separated entries to STDOUT.

Note: currently assumes PE libraries.

=head1 OPTIONS

=over 4

=item B<-a,--atlas>  Output table for the ATLAS workflow (see L<https://github.com/metagenome-atlas/atlas>).

=item B<-e,--eager>  Output table for the nf-core/eager workflow (See L<https://nf-co.re/eager/2.4.7/usage#tsv-input-method>).

=item B<-h,--help>  Display help (B<-h> prints a one-line usage message, B<--help> opens the full documentation with perldoc).

=item B<-v,--version>  Display version.

=back

=head1 AUTHOR

Johan Nylander <johan.nylander@nrm.se>

=head1 VERSION

0.2

=head1 COPYRIGHT AND LICENSE

Copyright 2023 Johan Nylander.
Distributed under the MIT License.

=cut

use strict;
use warnings;
use File::Find;
use File::Basename;
use Cwd;
use Data::Dumper;
use Getopt::Long;
Getopt::Long::Configure("no_ignore_case", "no_auto_abbrev");

# Script version, printed by the -v/--version option.
my $version = '0.2';

# Constant column values written on every row of the nf-core/eager table.
# For the meaning of each column, see
# https://nf-co.re/eager/2.4.7/usage#tsv-input-method
my (
    $eager_colour_chemistry,
    $eager_seqtype,
    $eager_organism,
    $eager_strandedness,
    $eager_udg_treatment,
    $eager_bam,
) = (
    '2',
    'PE',
    'NA',
    'double',
    'full',
    'NA',
);

# Output-format switches, set by --atlas / --eager (empty string = off).
my ($atlas, $eager) = (q{}, q{});

# %HoH maps each NGI ID to a record with the keys 'uid', 'ngiid', 'R1', 'R2'.
# The *_order arrays keep the IDs in the order they were read from the
# sample file; @file_paths collects every .fastq.gz file found on disk.
my (%HoH, @uid_order, @ngiid_order, @file_paths);

# Directory that is searched for .fastq.gz files.
my $cwd = Cwd::getcwd();

# One-line usage message shared by -h, the option-error path and the
# "no arguments on an interactive terminal" check below.
my $usage = "Usage: $0 [OPTIONS][--help] sample_info_file\n";

# Parse the command line.  -v and -h print and exit immediately; --help
# replaces this process with perldoc showing the script's own POD.
my $options_ok = GetOptions(
    'a|atlas'   => \$atlas,
    'e|eager'   => \$eager,
    'v|version' => sub { print "$version\n"; exit(0); },
    'h'         => sub { print $usage; exit(0); },
    'help'      => sub { exec("perldoc", $0); exit(0); },
);
die("$0 Error in command line arguments\n$usage") unless $options_ok;

# Invoked interactively without any argument: show the usage line and stop.
if (!@ARGV && -t STDIN && -t STDERR) {
    print $usage;
    exit(1);
}

# The first remaining argument is the sample info file.
my $sample_info_file = shift @ARGV;
die "Error: need sample file as argument\n" unless $sample_info_file;

# Callback for File::Find::find(): records the full path of every plain
# file whose name ends in ".fastq.gz" in @file_paths.  The current entry
# name is read from $_ and its full path from $File::Find::name.
sub find_files {
    my $entry = $_;
    return unless -f $entry;
    return if $entry !~ /^.*\.fastq\.gz\z/s;
    push @file_paths, $File::Find::name;
}

# Read the tab-separated sample info file: column 1 is the NGI ID, column 2
# the user-defined ID.  Every ID pair is stored in %HoH (keyed by NGI ID) and
# the input order is remembered in @ngiid_order / @uid_order.
#
# The header line (starting with "NGI ID"), blank lines and lines without an
# NGI ID are skipped, so no undefined or empty ID ever reaches the later
# file-matching step.
open(my $SAMPLE, '<', $sample_info_file)
    or die "Error: could not open sample file '$sample_info_file': $!\n";
while (my $line = <$SAMPLE>) {
    # Strip the line ending; also handles files saved with CRLF endings,
    # which would otherwise leave a "\r" glued to the last column.
    $line =~ s/\r?\n\z//;

    next if $line =~ /^NGI ID/;    # header row
    next if $line !~ /\S/;         # blank line

    my ($ngiid, $userid) = split /\t/, $line;
    next unless defined $ngiid && length $ngiid;

    # A row without a second column keeps an empty user ID instead of undef,
    # so printing it later does not raise "uninitialized value" warnings.
    $userid = q{} unless defined $userid;

    push @uid_order,   $userid;
    push @ngiid_order, $ngiid;
    $HoH{$ngiid}{'uid'}   = $userid;
    $HoH{$ngiid}{'ngiid'} = $ngiid;
}
close($SAMPLE);

# Collect every .fastq.gz file below the working directory into @file_paths.
find(\&find_files, $cwd);

# Assign each R1/R2 file to the NGI ID(s) occurring in its path.
for my $file_path (@file_paths) {
    my $read = $file_path =~ /R1_001\.fastq\.gz/ ? 'R1'
             : $file_path =~ /R2_001\.fastq\.gz/ ? 'R2'
             :                                      undef;
    next unless defined $read;    # neither an R1 nor an R2 file

    for my $id (@ngiid_order) {
        # \Q..\E matches the ID literally (no regex metacharacters), and the
        # (?!\d) guard stops an ID such as "P1_11" from claiming the files
        # of "P1_116".
        next unless $file_path =~ /\Q$id\E(?!\d)/;
        $HoH{$id}{$read} = $file_path;
    }
}

# Warn once for each ID that did not get both reads.  This is done after all
# files have been examined; warning inside the matching loop would report
# every file/ID pair that simply belongs to a different sample.
for my $id (@ngiid_order) {
    next if exists $HoH{$id}{'R1'} && exists $HoH{$id}{'R2'};
    print STDERR "Warning: could not find fastq.gz file for ID $id\n";
}

# Print the sample sheet to STDOUT in the requested format.  In every format
# a sample is only written when both its R1 and its R2 file were found.
if ($eager) {
    # nf-core/eager TSV, see https://nf-co.re/eager/2.4.7/usage#tsv-input-method
    print STDOUT "Sample_Name\tLibrary_ID\tLane\tColour_Chemistry\tSeqType\tOrganism\tStrandedness\tUDG_Treatment\tR1\tR2\tBAM\n";
    for my $id (@ngiid_order) {
        next unless exists $HoH{$id}{'R1'} && exists $HoH{$id}{'R2'};

        # The lane number is the digits after "_L" at the end of the R1 file
        # name once the read suffix is removed (e.g. "..._L002" gives "002");
        # it stays empty when the name has no such part.
        my ($filename) = fileparse($HoH{$id}{'R1'}, '_R1_001.fastq.gz');
        my $lane = $filename =~ /.+_L(\d+)$/ ? $1 : '';

        print STDOUT join("\t",
            $HoH{$id}{'uid'},
            $HoH{$id}{'ngiid'},
            $lane,
            $eager_colour_chemistry,
            $eager_seqtype,
            $eager_organism,
            $eager_strandedness,
            $eager_udg_treatment,
            $HoH{$id}{'R1'},
            $HoH{$id}{'R2'},
            $eager_bam,
        ), "\n";
    }
}
elsif ($atlas) {
    # ATLAS table: one row per read file.
    print STDOUT "Sample\tLib\tFile\tRead\tPath\n";
    for my $id (@ngiid_order) {
        # Same guard as the other formats: without it fileparse() is called
        # with an undefined path for an incomplete sample and aborts the
        # script in the middle of the table.
        next unless exists $HoH{$id}{'R1'} && exists $HoH{$id}{'R2'};

        for my $read (qw(R1 R2)) {
            my ($filename, $dirs) =
                fileparse($HoH{$id}{$read}, "_${read}_001.fastq.gz");

            # "Lib" is the name of the directory that holds the file.
            my @d   = split /\//, $dirs;
            my $lib = pop(@d);

            print STDOUT join("\t",
                $HoH{$id}{'uid'}, $lib, $filename, $read, $dirs), "\n";
        }
    }
}
else {
    # Default table: NGI ID, user ID, R1 path, R2 path.
    for my $id (@ngiid_order) {
        next unless exists $HoH{$id}{'R1'} && exists $HoH{$id}{'R2'};
        print join("\t",
            $HoH{$id}{'ngiid'},
            $HoH{$id}{'uid'},
            $HoH{$id}{'R1'},
            $HoH{$id}{'R2'},
        ), "\n";
    }
}

__END__
# %HoH = (
#     'P27213_116' => {
#         'uid'   => 'PS-B6',
#         'ngiid' => 'P27213_116',
#         'R1'    => '/proj/nrmdnalab_storage/Pleurozium_NV/ngisthlm00062/files/P27213/P27213_116/02-FASTQ/221014_A01901_0056_AHNLGHDSX3/P27213_116_S68_L002_R1_001.fastq.gz',
#         'R2'    => '/proj/nrmdnalab_storage/Pleurozium_NV/ngisthlm00062/files/P27213/P27213_116/02-FASTQ/221014_A01901_0056_AHNLGHDSX3/P27213_116_S68_L002_R2_001.fastq.gz',
#     },
# );
	#!/usr/bin/env perl

	=pod

	=encoding utf8

	=head1 NAME

	create_sample_sheet.pl - Create sample sheet from NGI delivery

	=head1 SYNOPSIS

	$ create_sample_sheet.pl 00-Reports/S.OmeName_22_01_sample_info.txt

	=head1 DESCRIPTION

	Change directory to main delivery directory (e.g. "cd P27213"). Assuming there
	is a column with user defined IDs in the file "00-Reports/*_sample_info.txt"
	(tab-separated columns), the script will read that file (and column nrs 1 and
	2), and then locate corresponding .fastq.gz files.

	Prints tab separated entries to STDOUT.

	Note: currently assumes PE libraries.

	=head1 OPTIONS

	=over 4

	=item B<-a,--atlas> Output table for the ATLAS workflow (see L<https://github.com/metagenome-atlas/atlas>).

	=item B<-e,--eager> Output table for the nf-core/eager workflow (See L<https://nf-co.re/eager/2.4.7/usage#tsv-input-method>).

	=item B<-h,--help> Display help.

	=item B<-v,--version> Display version.

	=back

	=head1 AUTHOR

	Johan Nylander <johan.nylander@nrm.se>

	=head1 VERSION

	0.2

	=head1 COPYRIGHT AND LICENSE

	Copyright 2023 Johan Nylander.
	Distributed under the MIT License.

	=cut

	use strict;
	use warnings;
	use File::Find;
	use File::Basename;
	use Cwd;
	use Data::Dumper;
	use Getopt::Long;
	Getopt::Long::Configure("no_ignore_case", "no_auto_abbrev");

	my $version = '0.2';

	# For eager tsv format settings, see https://nf-co.re/eager/2.4.7/usage#tsv-input-method
	my $eager_colour_chemistry = '2';
	my $eager_seqtype = 'PE';
	my $eager_organism = 'NA';
	my $eager_strandedness = 'double';
	my $eager_udg_treatment = 'full';
	my $eager_bam = 'NA';

	my $atlas = q{};
	my $eager = q{};
	my %HoH = ();
	my @uid_order = ();
	my @ngiid_order = ();
	my @file_paths = ();

	my $cwd = getcwd();

	GetOptions(
	'a\|atlas' => \$atlas,
	'e\|eager' => \$eager,
	'v\|version' => sub { print "$version\n"; exit(0); },
	'h' => sub { print "Usage: $0 [OPTIONS][--help] sample_info_file\n"; exit(0); },
	'help' => sub { exec("perldoc", $0); exit(0); },
	) or
	die ("$0 Error in command line arguments\nUsage: $0 [OPTIONS][--help] sample_info_file\n");

	if (@ARGV == 0 && -t STDIN && -t STDERR) {
	print "Usage: $0 [OPTIONS][--help] sample_info_file\n";
	exit(1);
	}

	my $sample_info_file = shift or die "Error: need sample file as argument\n";

	sub find_files {
	return unless -f and /^.*\.fastq\.gz\z/s;
	my $file = $File::Find::name;
	push @file_paths, $file;
	}

	open (my $SAMPLE, "<", $sample_info_file) or die "Error: could not open sample file";
	while (<$SAMPLE>) {
	chomp;
	next if (/^NGI ID/);
	my ($ngiid, $userid, @rest) = split /\t/;
	push (@uid_order, $userid);
	push (@ngiid_order, $ngiid);
	$HoH{$ngiid}{'uid'} = $userid;
	$HoH{$ngiid}{'ngiid'} = $ngiid;
	}
	close($SAMPLE);

	find(\&find_files, $cwd);

	for my $file_path (@file_paths) {
	for my $id (@ngiid_order) {
	if ($file_path =~ /$id/) {
	if ($file_path =~/R1_001\.fastq\.gz/) {
	$HoH{$id}{'R1'} = $file_path;
	}
	elsif ($file_path =~/R2_001\.fastq\.gz/) {
	$HoH{$id}{'R2'} = $file_path;
	}
	}
	else {
	print STDERR "Warning: could not find fastq.gz file for ID $id\n";
	}
	}
	}

	if ($eager) {
	# See https://nf-co.re/eager/2.4.7/usage#tsv-input-method
	print STDOUT "Sample_Name\tLibrary_ID\tLane\tColour_Chemistry\tSeqType\tOrganism\tStrandedness\tUDG_Treatment\tR1\tR2\tBAM\n";
	for my $id (@ngiid_order) {
	if (exists $HoH{$id}{'R1'} && exists $HoH{$id}{'R2'}) {
	print STDOUT $HoH{$id}{'uid'}, "\t";
	print STDOUT $HoH{$id}{'ngiid'}, "\t";
	my ($filename, $dirs, $suffix) = fileparse($HoH{$id}{'R1'}, '_R1_001.fastq.gz');
	my $lane = '';
	if ($filename =~ /.+_L(\d+)$/) {
	$lane = $1;
	}
	print STDOUT $lane, "\t";
	print STDOUT $eager_colour_chemistry, "\t";
	print STDOUT $eager_seqtype, "\t";
	print STDOUT $eager_organism, "\t";
	print STDOUT $eager_strandedness, "\t";
	print STDOUT $eager_udg_treatment, "\t";
	print STDOUT $HoH{$id}{'R1'}, "\t";
	print STDOUT $HoH{$id}{'R2'}, "\t";
	print STDOUT $eager_bam, "\n";
	}
	}
	}
	elsif ($atlas) {
	print STDOUT "Sample\tLib\tFile\tRead\tPath\n";
	for my $id (@ngiid_order) {
	print STDOUT $HoH{$id}{'uid'}, "\t";
	my ($filename, $dirs, $suffix) = fileparse($HoH{$id}{'R1'}, '_R1_001.fastq.gz');
	my @d = split /\//, $dirs;
	my $lib = pop(@d);
	print STDOUT $lib, "\t";
	print STDOUT $filename, "\t";
	print STDOUT "R1", "\t";
	print STDOUT $dirs, "\n";
	print STDOUT $HoH{$id}{'uid'}, "\t";
	($filename, $dirs, $suffix) = fileparse($HoH{$id}{'R2'}, '_R2_001.fastq.gz');
	@d = split /\//, $dirs;
	$lib = pop(@d);
	print STDOUT $lib, "\t";
	print STDOUT $filename, "\t";
	print STDOUT "R2", "\t";
	print STDOUT $dirs, "\n";
	}
	}
	else {
	for my $id (@ngiid_order) {
	if (exists $HoH{$id}{'R1'} && exists $HoH{$id}{'R2'}) {
	#print $HoH{$id}{'uid'}, "\t", $HoH{$id}{'ngiid'}, "\t", $HoH{$id}{'R2'}, "\t", $HoH{$id}{'R2'}, "\n";
	print $HoH{$id}{'ngiid'}, "\t", $HoH{$id}{'uid'}, "\t", $HoH{$id}{'R1'}, "\t", $HoH{$id}{'R2'}, "\n";
	}
	}
	}

	__END__
	# %HoH = (
	# 'P27213_116' => {
	# 'uid' => 'PS-B6',
	# 'ngiid' => 'P27213_116',
	# 'R1' => '/proj/nrmdnalab_storage/Pleurozium_NV/ngisthlm00062/files/P27213/P27213_116/02-FASTQ/221014_A01901_0056_AHNLGHDSX3/P27213_116_S68_L002_R1_001.fastq.gz',
	# 'R2' => '/proj/nrmdnalab_storage/Pleurozium_NV/ngisthlm00062/files/P27213/P27213_116/02-FASTQ/221014_A01901_0056_AHNLGHDSX3/P27213_116_S68_L002_R2_001.fastq.gz',
	# },
	# );