Last active
October 21, 2022 08:15
-
-
Save kantale/3d7b05481fadd60e47ed21b7059f9dc8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
# ./inferHLATypes.pl --BAM /gpfs1/well/gsk_hla/bam_output/AA02O9Q_Z2.bam --graph PRG_MHC_GRCh38_withIMGT --sampleID NA12878Direct | |
use warnings; | |
use strict; | |
use FindBin; | |
use File::Spec; | |
use Getopt::Long; | |
use Data::Dumper; | |
use Sys::Hostname; | |
use Cwd qw/getcwd abs_path/; | |
$| = 1; | |
my $this_bin_dir = $FindBin::RealBin; | |
my $_BAM; | |
my $graph; | |
my $GRAPHDIR; | |
my $sampleID; | |
my $qsub; | |
my $samtools_bin; | |
my $bwa_bin; | |
my $java_bin; | |
my $picard_sam2fastq_bin; | |
my $workingDir_param; | |
my $maxThreads = 1; | |
my $moreReferencesDir; | |
my $extractExonkMerCounts; | |
my $longReads = 0; | |
my $testing = 0; | |
GetOptions ( | |
'BAM:s' => \$_BAM, | |
'graph:s' => \$graph, | |
'GRAPHDIR:s' => \$GRAPHDIR, | |
'sampleID:s' => \$sampleID, | |
'qsub:s' => \$qsub, | |
'workingDir:s' => \$workingDir_param, | |
'samtools_bin:s' => \$samtools_bin, | |
'bwa_bin:s' => \$bwa_bin, | |
'java_bin:s' => \$java_bin, | |
'picard_sam2fastq_bin:s' => \$picard_sam2fastq_bin, | |
'maxThreads:s' => \$maxThreads, | |
'moreReferencesDir:s' => \$moreReferencesDir, | |
'extractExonkMerCounts:s' => \$extractExonkMerCounts, | |
'longReads:s' => \$longReads, | |
'testing:s' => \$testing, | |
); | |
if ($extractExonkMerCounts) | |
{ | |
warn "--extractExonkMerCounts 1: This feature is experimental and unlikely to work on your machine"; | |
die unless(-e '../exonkMerExtraction/GRCh38.forkMers'); | |
die unless(-e '../exonkMerExtraction/exonCoordinates_manual.txt.forExtraction'); | |
} | |
unless((not $longReads) or ($longReads eq 'ont2d') or ($longReads eq 'pacbio')) | |
{ | |
die "Please specify --longReads ont2d or --longReads pacbio"; | |
} | |
my %paths_ini; | |
my $paths_ini = $this_bin_dir . '/paths.ini'; | |
if(-e $paths_ini) | |
{ | |
open(INI, '<', $paths_ini) or die "Cannot open $paths_ini"; | |
while(<INI>) | |
{ | |
chomp; | |
next unless($_); | |
$_ =~ s/[\n\r]//g; | |
next if($_ =~ /^\s+$/); | |
die "Invalid line format in $paths_ini -- expect all lines to be either empty or key=value pairs" unless($_ =~ /^(.+)=(.*)$/); | |
my $id = $1; | |
my @alts = split(/,/, $2); | |
$paths_ini{$id} = \@alts; | |
} | |
close(INI); | |
} | |
$samtools_bin = find_path('samtools_bin', $samtools_bin, 'samtools'); | |
$bwa_bin = find_path('bwa_bin', $bwa_bin, 'bwa'); | |
$java_bin = find_path('java_bin', $java_bin, 'java'); | |
$picard_sam2fastq_bin = find_path('picard_sam2fastq_bin', $picard_sam2fastq_bin, 'picard'); | |
my $FASTQ_extraction_command_part1; | |
if($picard_sam2fastq_bin =~ /SamToFastq\.jar$/) | |
{ | |
$FASTQ_extraction_command_part1 = qq($java_bin -Xmx10g -XX:-UseGCOverheadLimit -jar $picard_sam2fastq_bin); | |
} | |
elsif($picard_sam2fastq_bin =~ /picard-tools$/) | |
{ | |
$FASTQ_extraction_command_part1 = qq($picard_sam2fastq_bin SamToFastq); | |
} | |
elsif($picard_sam2fastq_bin =~ /picard\.jar$/) | |
{ | |
$FASTQ_extraction_command_part1 = qq($java_bin -Xmx10g -XX:-UseGCOverheadLimit -jar $picard_sam2fastq_bin SamToFastq); | |
} | |
elsif($picard_sam2fastq_bin =~ /picard$/) | |
{ | |
$FASTQ_extraction_command_part1 = qq($picard_sam2fastq_bin SamToFastq); | |
} | |
else | |
{ | |
die "I can't interpret the specified Picard command: $picard_sam2fastq_bin"; | |
} | |
if($testing) | |
{ | |
my $previous_dir = getcwd; | |
chdir($this_bin_dir) or die "Cannot cd into $this_bin_dir"; | |
die "Binary ../bin/HLA-LA (from $this_bin_dir) not there!" unless(-e '../bin/HLA-LA'); | |
my $cmd_test = qq(../bin/HLA-LA --action testBinary); | |
system($cmd_test) and die "HLA*LA test command $cmd_test failed"; | |
chdir($previous_dir) or die "Cannot chdir into $previous_dir"; | |
print "HLA-LA.pl test\n\n"; | |
print "\t", "samtools_bin", ": ", $samtools_bin, "\n"; | |
print "\t", "bwa_bin", ": ", $bwa_bin, "\n"; | |
print "\t", "java_bin", ": ", $java_bin, "\n"; | |
print "\t", "picard_sam2fastq_bin", ": ", $picard_sam2fastq_bin, "\n"; | |
exit 0; | |
} | |
my $working_dir; | |
if($paths_ini{workingDir}[0] and not defined $workingDir_param) | |
{ | |
$working_dir = $paths_ini{workingDir}[0]; | |
$working_dir =~ s/\$HLA\-LA\-DIR/$this_bin_dir/; | |
} | |
else | |
{ | |
unless(defined $workingDir_param) | |
{ | |
die "\n\nPlease specify a working directory via --workingDir.\n\nOutput for sample with ID \$sampleID will go a correspondingly named sub-directory of the working directory.\n\nFor example, if --workingDir is /path/working, and --sampleID is mySample, then the output will go into directory /path/working/mySample.\n\n"; | |
} | |
$working_dir = $workingDir_param; | |
} | |
unless(-d $working_dir) | |
{ | |
die "\n\nSpecified working directory $working_dir is either not present or not a directory.\n\nYou might have specified an invalid path via --workingDir.\n\n"; | |
} | |
$working_dir = abs_path($working_dir); | |
unless($sampleID =~ /^\w+$/) | |
{ | |
die "Please use only alphanumeric characters - \\w+ - for --sampleID"; | |
} | |
my $working_dir_thisSample = $working_dir . '/' . $sampleID; | |
print "HLA-LA.pl\n\n"; | |
print "Identified paths:\n"; | |
print "\t", "samtools_bin", ": ", $samtools_bin, "\n"; | |
print "\t", "bwa_bin", ": ", $bwa_bin, "\n"; | |
print "\t", "java_bin", ": ", $java_bin, "\n"; | |
print "\t", "picard_sam2fastq_bin", ": ", $picard_sam2fastq_bin, "\n"; | |
print "\t", "General working directory", ": ", $working_dir, "\n"; | |
print "\t", "Sample-specific working directory", ": ", $working_dir_thisSample, "\n"; | |
print "\n"; | |
unless(-d $working_dir_thisSample) | |
{ | |
mkdir($working_dir_thisSample) or die "Cannot mkdir $working_dir_thisSample"; | |
} | |
my $samtools_version = `$samtools_bin --version` ; | |
die "Can't parse samtools version output" unless($samtools_version =~ /samtools ([\d\.]+)/); | |
$samtools_version = $1; | |
my $samtools_version_numeric = $samtools_version; | |
if($samtools_version_numeric =~ /^(\d+)\.(\d+)\.(\d+)$/) | |
{ | |
$samtools_version_numeric = $1 . '.' . $2 . $3; | |
} | |
#unless($samtools_version_numeric >= 1.3) | |
#{ | |
# die "I need samtools >=1.3"; | |
#} | |
#my $BAM = File::Spec->abs2rel($_BAM); | |
my $BAM = $_BAM; | |
print "1343 BAM: $BAM \n"; | |
unless(-e $BAM) | |
{ | |
die "BAM (or CRAM) $BAM (inferred from input $_BAM) not existing" unless(-e $BAM); | |
unless((-e $BAM . '.bai') or (-e $BAM . '.crai')) | |
{ | |
die "File $BAM does not appear to be indexed"; | |
} | |
} | |
my $full_graph_dir = $GRAPHDIR . '/' . $graph; | |
my $known_references_dir = $full_graph_dir . '/knownReferences'; | |
unless(-e $full_graph_dir) | |
{ | |
die "Graph directory $full_graph_dir not found - valid graph names are subdirectories of the graphs directory in the HLA-LA root"; | |
} | |
unless((-e $full_graph_dir . '/sequences.txt') and ((-e $full_graph_dir . '/extendedReferenceGenomePath.txt') or (-e $full_graph_dir . '/extendedReferenceGenome/extendedReferenceGenome.fa')) and (-d $known_references_dir)) | |
{ | |
die "Graph directory $full_graph_dir does not seem to be complete - does this directory specify a valid graph for HLA-LA?"; | |
} | |
# get index for this BAM | |
my $command_idxstats = qq($samtools_bin idxstats $BAM); | |
my $command_idxstats_output = `$command_idxstats`; | |
unless($command_idxstats_output) | |
{ | |
die "Didn't get a sensible output from command $command_idxstats"; | |
} | |
my %BAM_idx; | |
my @BAM_idx_contigOrder; | |
my @idxstats_lines = split(/\n/, $command_idxstats_output); | |
foreach my $l (@idxstats_lines) | |
{ | |
my @l_fields = split(/\t/, $l); | |
die unless(scalar(@l_fields) >= 1); | |
die if(exists $BAM_idx{$l_fields[0]}); | |
$BAM_idx{$l_fields[0]} = $l_fields[1]; | |
push(@BAM_idx_contigOrder, $l_fields[0]); | |
} | |
my @files_references = glob($known_references_dir . '/*.txt'); | |
die "No known reference files in knownReferences ($full_graph_dir)?" unless(@files_references); | |
my $additional_references_dir = $this_bin_dir . '/additionalReferences/' . $graph; | |
if(-e $additional_references_dir) | |
{ | |
my @additional_files_references = glob($additional_references_dir . '/*.txt'); | |
push(@files_references, @additional_files_references); | |
} | |
if($moreReferencesDir and (-e $moreReferencesDir)) | |
{ | |
my $mD = $moreReferencesDir . '/' . $graph; | |
if(-d $mD) | |
{ | |
my @even_more_files_references = glob($mD . '/*.txt'); | |
push(@files_references, @even_more_files_references); | |
} | |
} | |
my @compatible_files; | |
my %extractContigs_complete_byFile; | |
my %extractContigs_partial_byFile; | |
foreach my $f (@files_references) | |
{ | |
open(F, '<', $f) or die "Cannot open $f"; | |
my $F_firstLine = <F>; | |
chomp($F_firstLine); | |
my @firstLine_fields = split(/\t/, $F_firstLine); | |
my @expected_firstLine_fields = qw/contigID contigLength ExtractCompleteContig PartialExtraction_Start PartialExtraction_Stop/; | |
die "Incorrect header for $f ($#firstLine_fields vs $#expected_firstLine_fields)" unless($#firstLine_fields == $#expected_firstLine_fields); | |
for(my $i = 0; $i <= $#firstLine_fields; $i++) | |
{ | |
die "Incorrect header for $f" unless($firstLine_fields[$i] eq $expected_firstLine_fields[$i]); | |
} | |
my $n_contigs = 0; | |
my $n_contigs_matching = 0; | |
while(<F>) | |
{ | |
my $line = $_; | |
chomp($line); | |
next unless($line); | |
my @line_fields = split(/\t/, $line, -1); | |
$n_contigs++; | |
if((exists $BAM_idx{$line_fields[0]}) and ($BAM_idx{$line_fields[0]} == $line_fields[1])) | |
{ | |
$n_contigs_matching++; | |
} | |
else | |
{ | |
# print "Mismatch $line_fields[0] - $line_fields[1] - " . $BAM_idx{$line_fields[0]} . "\n"; | |
} | |
die if(($line_fields[2]) and (($line_fields[3]) and ($line_fields[4]))); | |
if($line_fields[2]) | |
{ | |
if($line_fields[0] eq '*') | |
{ | |
$extractContigs_complete_byFile{$f}{$line_fields[0]} = 1; | |
} | |
else | |
{ | |
$extractContigs_partial_byFile{$f}{$line_fields[0]} = [1, $line_fields[1]]; | |
} | |
} | |
if(($line_fields[3]) and ($line_fields[4])) | |
{ | |
die if($line_fields[0] eq '*'); | |
die "Coordinate field $line_fields[3] has non-numeric characters" unless($line_fields[3] =~ /^\d+$/); | |
die "Coordinate field $line_fields[4] has non-numeric characters" unless($line_fields[4] =~ /^\d+$/); | |
$extractContigs_partial_byFile{$f}{$line_fields[0]} = [$line_fields[3], $line_fields[4]]; | |
} | |
} | |
close(F); | |
if(($n_contigs_matching == $n_contigs) and ($n_contigs == scalar(@BAM_idx_contigOrder))) | |
{ | |
push(@compatible_files, $f); | |
} | |
} | |
if(scalar(@compatible_files) == 0) | |
{ | |
die "Have found no compatible reference specifications in $known_references_dir - create a file for this BAM file and try again."; | |
} | |
if(scalar(@compatible_files) > 1) | |
{ | |
die "Found more than one compatible reference file in $known_references_dir - a duplicate?\n\n".Dumper(\@compatible_files); | |
} | |
my $compatible_reference_file = $compatible_files[0]; | |
my @refIDs_for_extraction; | |
foreach my $refID (@BAM_idx_contigOrder) | |
{ | |
next if($refID eq '*'); | |
die if((exists $extractContigs_complete_byFile{$compatible_reference_file}{$refID}) and (exists $extractContigs_partial_byFile{$compatible_reference_file}{$refID})); | |
die unless((not defined $extractContigs_complete_byFile{$compatible_reference_file}{$refID}) or ($extractContigs_complete_byFile{$compatible_reference_file}{$refID} eq '0') or ($extractContigs_complete_byFile{$compatible_reference_file}{$refID} eq '1')); | |
if($extractContigs_complete_byFile{$compatible_reference_file}{$refID}) | |
{ | |
push(@refIDs_for_extraction, $refID); | |
} | |
if($extractContigs_partial_byFile{$compatible_reference_file}{$refID}) | |
{ | |
push(@refIDs_for_extraction, $refID . ':' . $extractContigs_partial_byFile{$compatible_reference_file}{$refID}[0] . '-' . $extractContigs_partial_byFile{$compatible_reference_file}{$refID}[1]); | |
} | |
} | |
die "No contigs for extraction specified in $compatible_reference_file?" unless(scalar(@refIDs_for_extraction)); | |
my $target_extraction = $working_dir_thisSample . '/extraction.bam'; | |
my $target_extraction_mapped = $working_dir_thisSample . '/extraction_mapped.bam'; | |
my $extraction_command = qq($samtools_bin view -bo $target_extraction_mapped $BAM ).join(' ', @refIDs_for_extraction); | |
print "Extract reads from ", scalar(@refIDs_for_extraction), " regions...\n"; | |
if(system($extraction_command) != 0) | |
{ | |
die "Extraction command $extraction_command failed"; | |
} | |
if($extractContigs_complete_byFile{$compatible_reference_file}{'*'}) | |
{ | |
my $target_extraction_unmapped = $working_dir_thisSample . '/extraction_unmapped.bam'; | |
my $extraction_command_unmapped = qq($samtools_bin view -b -f 4 $BAM > $target_extraction_unmapped); | |
print "Extract unmapped reads...\n"; | |
print "extraction_command_unmapped=$extraction_command_unmapped\n"; | |
print "BAM: $BAM\n"; | |
if(system($extraction_command_unmapped) != 0) | |
{ | |
die "Extraction command $extraction_command_unmapped failed"; | |
} | |
unlink($target_extraction) if (-e $target_extraction); | |
my $extraction_command_merge = qq($samtools_bin merge $target_extraction $target_extraction_mapped $target_extraction_unmapped); | |
print "Merging...\n"; | |
if(system($extraction_command_merge) != 0) | |
{ | |
die "Merge command $extraction_command_merge failed"; | |
} | |
} | |
else | |
{ | |
my $mv_command = qq(mv $target_extraction_mapped $target_extraction); | |
if(system($mv_command) != 0) | |
{ | |
die "Move command $mv_command failed"; | |
} | |
} | |
my $index_command = qq($samtools_bin index $target_extraction); | |
print "Indexing...\n"; | |
if(system($index_command) != 0) | |
{ | |
die "Index command $index_command failed"; | |
} | |
my $target_FASTQ_1 = $working_dir_thisSample . '/R_1.fastq'; | |
my $target_FASTQ_2 = $working_dir_thisSample . '/R_2.fastq'; | |
my $target_FASTQ_U = $working_dir_thisSample . '/R_U.fastq'; | |
my $target_FASTQ_U_split = $working_dir_thisSample . '/R_U.fastq.splitLongReads'; | |
my $FASTQ_extraction_command = qq($FASTQ_extraction_command_part1 VALIDATION_STRINGENCY=LENIENT I=$target_extraction F=$target_FASTQ_1 F2=$target_FASTQ_2 FU=$target_FASTQ_U 2>&1); | |
print "Extract FASTQ...\n\t$FASTQ_extraction_command\n"; | |
my $FASTQ_extraction_output = `$FASTQ_extraction_command`; | |
#if(($FASTQ_extraction_output =~ /Exception/) or ($FASTQ_extraction_output !~ /net.sf.picard.sam.SamToFastq done/)) | |
if(($FASTQ_extraction_output !~ /picard.sam.SamToFastq done/)) | |
{ | |
die "Picard output: \n\n" . $FASTQ_extraction_output . "\n\nExtraction command $FASTQ_extraction_command $! \n\nAbort because the Picard FASTQ extraction process might have failed. I think so because I could not find the string 'picard.sam.SamToFastq done' in the Picard output.\n\n"; | |
} | |
if($longReads) | |
{ | |
die "You activated --longReads, but the two files $target_FASTQ_1 and $target_FASTQ_2 (which store paired-end reads) are not empty - this is weird, and I will abort." unless(((-s $target_FASTQ_1) == 0) && ((-s $target_FASTQ_2) == 0)); | |
unless((-s $target_FASTQ_U) > 0) | |
{ | |
die "You activated --longReads, but my attempt to extract long unpaired reads from the specified input BAM failed. Abort."; | |
} | |
open(INLONG, '<', $target_FASTQ_U) or die; | |
open(OUTLONG, '>', $target_FASTQ_U_split) or die; | |
while(<INLONG>) | |
{ | |
chomp; | |
my $readID = $_; | |
die unless(substr($readID, 0, 1) eq '@'); | |
my $sequence = <INLONG>; chomp($sequence); | |
my $plus = <INLONG>; | |
my $qualities = <INLONG>; chomp($qualities); | |
die unless(substr($plus, 0, 1) eq '+'); | |
die unless(length($sequence) == length($qualities)); | |
if(length($sequence) < 50000) | |
{ | |
print OUTLONG $readID, "\n", $sequence, "\n", "+\n", $qualities, "\n"; | |
} | |
else | |
{ | |
my $runningI = 0; | |
while(length($sequence)) | |
{ | |
die unless(length($sequence) == length($qualities)); | |
my $thisPart_readID = '>rP' . $runningI . substr($readID, 1); | |
my $extractionLength = (length($sequence) > 50000) ? 50000 : length($sequence); | |
print OUTLONG $thisPart_readID, "\n", substr($sequence, 0, $extractionLength), "\n", "+\n", substr($qualities, 0, $extractionLength), "\n"; | |
substr($sequence, 0, $extractionLength) = ''; | |
substr($qualities, 0, $extractionLength) = ''; | |
$runningI++; | |
} | |
} | |
} | |
close(INLONG); | |
close(OUTLONG); | |
} | |
else | |
{ | |
die "You didn't activate --longReads, but the two files $target_FASTQ_1 and $target_FASTQ_2 (which store paired-end reads) are empty - this is weird, and I will abort." unless(((-s $target_FASTQ_1) > 0) && ((-s $target_FASTQ_2) > 0)); | |
} | |
#if(system($FASTQ_extraction_command) != 0) | |
#{ | |
#} | |
my $mapAgainstCompleteGenome = ($extractContigs_complete_byFile{$compatible_reference_file}{'*'}) ? 1 : 0; | |
#$mapAgainstCompleteGenome = 0; | |
if($extractExonkMerCounts) | |
{ | |
die if($longReads); | |
die unless(-e '../exonkMerExtraction/GRCh38.forkMers'); | |
die unless(-e '../exonkMerExtraction/exonCoordinates_manual.txt'); | |
my $command_extraction = qq(perl extractkMerCounts.pl --sampleID $sampleID --outputDirectory $working_dir_thisSample --referenceGenome ../exonkMerExtraction/GRCh38.forkMers --exonCoordinates ../exonkMerExtraction/exonCoordinates_manual.txt --FASTQ1 $target_FASTQ_1 --FASTQ2 $target_FASTQ_2 --bwa_bin $bwa_bin --samtools_bin $samtools_bin --maxThreads $maxThreads); | |
print "Now executing: $command_extraction\n"; | |
system($command_extraction) and die "Command $command_extraction failed\n"; | |
} | |
else | |
{ | |
my $host = hostname(); | |
# my $MHC_PRG_2_bin = (($host =~ /rescomp/) or ($host =~ /comp[ABC]/)) ? '../bin_cluster3/HLA-LA' : '../bin/HLA-LA'; | |
# my $MHC_PRG_2_bin = '../bin/HLA-LA'; | |
# my $MHC_PRG_2_bin = '/HLA-LA/bin/HLA-LA'; | |
my $MHC_PRG_2_bin = '/usr/local/bin/HLA-LA'; | |
my $previous_dir = getcwd; | |
chdir($this_bin_dir) or die "Cannot cd into $this_bin_dir"; | |
die "Binary $MHC_PRG_2_bin not there!" unless(-e $MHC_PRG_2_bin); | |
my $command_MHC_PRG = qq($MHC_PRG_2_bin --action HLA --maxThreads $maxThreads --sampleID $sampleID --outputDirectory $working_dir_thisSample --PRG_graph_dir $full_graph_dir --FASTQU $target_FASTQ_U_split --FASTQ1 $target_FASTQ_1 --FASTQ2 $target_FASTQ_2 --bwa_bin $bwa_bin --samtools_bin $samtools_bin --mapAgainstCompleteGenome $mapAgainstCompleteGenome --longReads $longReads); | |
print "\nNow executing:\n$command_MHC_PRG\n"; | |
if(system($command_MHC_PRG) != 0) | |
{ | |
die "HLA-LA execution not successful. Command was $command_MHC_PRG\n"; | |
} | |
chdir($previous_dir) or die "Cannot cd into $previous_dir"; | |
} | |
sub find_path | |
{ | |
my $id = shift; | |
my $supplied_value = shift; | |
my $forWhich = shift; | |
if(defined $supplied_value) | |
{ | |
die "Command-line supplied value/file for parameter $id not existing" unless(-e $supplied_value); | |
return $supplied_value; | |
} | |
if(exists $paths_ini{$id}) | |
{ | |
foreach my $alternative (@{$paths_ini{$id}}) | |
{ | |
if(-e $alternative) | |
{ | |
return $alternative; | |
} | |
} | |
} | |
if($forWhich) | |
{ | |
my $which_output = `which $forWhich`; | |
$which_output =~ s/[\n\r]//g; | |
if($which_output and (-e $which_output)) | |
{ | |
return $which_output; | |
} | |
} | |
die "I couldn't figure out a path for ${id}. Order of precedence: check for parameter --${id}; check paths.ini in $this_bin_dir for a key named $id; parse, if command string defined, the output of the command 'which ${forWhich}' ('which ' means that the command string is not defined)."; | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment