Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Last active December 12, 2015 07:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/4738208 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/4738208 to your computer and use it in GitHub Desktop.
Perl script to run CNVnator on the Sanger compute farm
#!/usr/local/bin/perl
=head1 NAME
run_cnvnator_on_farm_wrapper.pl
=head1 SYNOPSIS
run_cnvnator_on_farm_wrapper.pl input_fasta input_bam cnvnator window_size output_gff outputdir
where input_fasta is the input fasta file for the assembly,
input_bam is the input bam file,
cnvnator is the path to the CNVnator installation,
window_size is the window size (bin size) to use for CNVnator,
output_gff is the output gff file,
outputdir is the output directory for writing output files.
=head1 DESCRIPTION
This script takes an input fasta file for an assembly (<input_fasta>), and an
input bam file for that assembly (<input_bam>) and runs CNVnator with a certain
window size (<window_size). It then converts the CNVnator output into an output
gff file (<output_gff>).
=head1 VERSION
Perl script last edited 6-Feb-2013.
=head1 CONTACT
alc@sanger.ac.uk (Avril Coghlan)
=cut
#
# Perl script run_cnvnator_on_farm_wrapper.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 6-Feb-13.
# Last edited 6-Feb-2013.
# SCRIPT SYNOPSIS: run_cnvnator_on_farm_wrapper.pl: wrapper script to run CNVnator on the farm, for an input fasta file and input bam file.
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
use strict;
use warnings;
my $num_args = $#ARGV + 1;
if ($num_args != 6)
{
print "Usage of run_cnvnator_on_farm_wrapper.pl\n\n";
print "perl run_cnvnator_on_farm_wrapper.pl <input_fasta> <input_bam> <cnvnator> <window_size> <output_gff> <outputdir>\n";
print "where <input_fasta> is the input fasta file,\n";
print " <input_bam> is the input bam file,\n";
print " <cnvnator> is the path to the CNVnator installation,\n";
print " <window_size> is the window size (bin size) to use for CNVnator,\n";
print " <output_gff> is the output gff file,\n";
print " <outputdir> is the output directory for writing output files\n";
print "For example, >perl run_cnvnator_on_farm_wrapper.pl /lustre/scratch108/parasites/alt/SVEN/alc_mapping_temp/Cons.fa\n";
print "/lustre/scratch108/parasites/alt/SVEN/alc_mapping_temp/Output/out.sorted.bam /nfs/users/nfs_b/bf3/bin/CNVnator_v0.2.7/src/\n";
print "100 cnvnator_100bp.gff /lustre/scratch108/parasites/alc/StrongyloidesCNVnator/\n";
exit;
}
# FIND THE PATH TO THE INPUT FASTA FILE:
my $input_fasta = $ARGV[0];
# FIND THE PATH TO THE INPUT BAM FILE:
my $input_bam = $ARGV[1];
# FIND THE PATH TO THE CNVNATOR INSTALLATION:
my $cnvnator = $ARGV[2];
# FIND THE WINDOW SIZE TO USE:
my $windowsize = $ARGV[3];
# FIND THE PATH TO THE OUTPUT GFF FILE:
my $output_gff = $ARGV[4];
# FIND THE DIRECTORY TO USE FOR OUTPUT FILES:
my $outputdir = $ARGV[5];
#------------------------------------------------------------------#
# TEST SUBROUTINES:
my $PRINT_TEST_DATA = 0; # SAYS WHETHER TO PRINT DATA USED DURING TESTING.
&test_print_error;
&test_make_shellscript($outputdir);
#------------------------------------------------------------------#
# RUN THE MAIN PART OF THE CODE:
&run_main_program($outputdir,$input_fasta,$input_bam,$cnvnator,$windowsize,$output_gff);
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
# RUN THE MAIN PART OF THE CODE:
sub run_main_program
{
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES IN.
my $input_fasta = $_[1]; # THE INPUT FASTA FILE
my $input_bam = $_[2]; # INPUT BAM FILE
my $cnvnator = $_[3]; # PATH TO CNVNATOR INSTALLATION
my $windowsize = $_[4]; # WINDOW SIZE TO USE
my $output_gff = $_[5]; # THE OUTPUT GFF FILE
my $errorcode; # RETURNED AS 0 IF THERE IS NO ERROR.
my $errormsg; # RETURNED AS 'none' IF THERE IS NO ERROR.
my $tempdir; # TEMPORARY DIRECTORY TO STORE THE DATA IN
my $pattern; # PATTERN OF TEMPORARY FASTA FILES' FILENAMES
my $fasta; # A TEMPORARY FASTA FILE
my $letters; # LETTERS IN THE NAME OF $fasta
my @temp; #
my $shellscript; # SHELL SCRIPT FOR SUBMITTING A JOB TO THE FARM
my $out; # FILE WITH STANDARD OUTPUT FROM FARM JOB
my $err; # FILE WITH STANDARD ERROR FROM FARM JOB
my $cmd; # COMMAND TO RUN
my $jobgroup; # JOB GROUP TO USE
my $random_number; # RANDOM NUMBER TO USE IN JOB GROUP NAME
my $running; # VARIABLE THAT SAYS WHETHER THE JOBS ARE STILL RUNNING
my $jobs; # DETAILS OF THE JOBS THAT ARE STILL RUNNING
my $line; #
my $tempout; # TEMPORARY OUTPUT FILE
my @shellscripts; # ARRAY OF SHELLSCRIPT NAMES
my $i; #
my $memory; # MEMORY TO REQUEST FROM THE FARM
my $memorytimes1000; # MEMORY TO REQUEST FROM THE FARM
# MAKE A TEMPORARY DIRECTORY TO STORE THE DATA IN:
($tempdir,$errorcode,$errormsg) = &make_dirname($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
system "mkdir $tempdir";
# DIVIDE UP THE INPUT FASTA FILE INTO SMALLER FILES WITH 5 SEQUENCES EACH:
$cmd = "split_up_fasta.pl $input_fasta 5 fasta $tempdir";
system "$cmd";
# SPECIFY A JOB GROUP:
$random_number = rand();
$jobgroup = "MyCNVnatorJobGroup".$random_number;
$cmd = "bgadd -L 100 \/$jobgroup"; # DEFINE A JOB GROUP WITH ONLY 100 JOBS RUN AT ONCE
system "$cmd";
# SUBMIT THE JOBS FOR EACH OF THE SMALLER FILES:
chdir( $tempdir ) or die "ERROR: run_main_program: cannot chdir to $tempdir $!";
$pattern = $tempdir."/fasta*";
my $cnt = 0;
open(TMP,"ls $pattern |");
while(<TMP>)
{
$cnt++;
$fasta = $_;
chomp $fasta;
@temp = split(/fasta/,$fasta);
$letters = $temp[$#temp];
# MAKE SHELL SCRIPT TO SUBMIT TO THE FARM AS A JOB:
($shellscript,$errorcode,$errormsg) = &make_shellscript($letters,$fasta,$outputdir,$input_bam,$tempdir,$cnvnator,$windowsize);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
@shellscripts = (@shellscripts,$shellscript);
$out = $shellscript.".o";
$err = $shellscript.".e";
print STDERR "Submitting job $letters (fasta $fasta)...\n";
$memory = 3500; # THIS SEEMS TO WORK OK FOR MOST JOBS
$memorytimes1000 = $memory*1000;
$cmd = "bsub -g \/$jobgroup -o $tempdir/$out -e $tempdir/$err -R \"select[mem > $memory] rusage[mem=$memory]\" -M$memorytimes1000 $outputdir/$shellscript";
system "$cmd";
}
close(TMP);
# CHECK IF THE JOBS IN THE JOB GROUP ARE FINISHED:
sleep(30); # IT TAKES A FEW SECONDS FOR THE JOBS TO APPEAR IN 'bjobs'
$running = 1;
while($running == 1)
{
$jobs = "";
$cmd = "bjobs -g \/$jobgroup";
open(TMP,"$cmd |");
while(<TMP>)
{
$line = $_;
$jobs = $jobs.$line;
}
close(TMP);
# xxx print STDERR "Jobs running: $jobs\n";
if (!($jobs =~ /[a-z]/) || $jobs =~ /No unfinished job found/) { $running = 0;}
sleep(30); # WAIT ANOTHER 30 SECONDS BEFORE LOOKING AT THE QUEUE AGAIN
}
# CHECK IF THERE ARE ERRORS IN THE STANDARD OUTPUT ERROR OR STANDARD OUTPUT FILES:
chdir( $outputdir ) or die "ERROR: run_main_program: cannot chdir to $outputdir $!";
for ($i = 0; $i <= $#shellscripts; $i++)
{
$shellscript = $shellscripts[$i];
$out = $tempdir."/".$shellscript.".o";
$err = $tempdir."/".$shellscript.".e";
open(TMP,"grep Exited $out |");
while(<TMP>)
{
$line = $_;
if ($line =~ /Exited/)
{
$errormsg = "ERROR: run_main_program: found error message in standard output file $out\n";
$errorcode = 9; # ERRORCODE=9 (THIS SHOULDN'T HAPPEN, SO CAN'T TEST FOR)
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
}
}
open(TMP,"grep -i error $out |");
while(<TMP>)
{
$line = $_;
$line =~ tr/[A-Z]/[a-z]/;
if ($line =~ /error/)
{
$errormsg = "ERROR: run_main_program: found error message in standard error file $err\n";
$errorcode = 10; # ERRORCODE=10 (THIS SHOULDN'T HAPPEN, SO CAN'T TEST)
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
}
}
}
# DELETE THE JOB GROUP:
$cmd = "bgdel \/$jobgroup";
system "$cmd";
# PUT ALL THE OUTPUT FILES INTO A TEMPORARY FILE:
($tempout,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
$cmd = "cat $tempdir/*out > $tempout";
system "$cmd";
# MAKE A GFF FILE FILE:
$cmd = "make_cnvnator_gff.pl $tempout $output_gff $outputdir";
system "$cmd";
# DELETE TEMPORARY FILES:
system "rm -f $tempout";
# xxx don't delete as can be deleted before they are run
# for ($i = 0; $i <= $#shellscripts; $i++)
# {
# $shellscript = $shellscripts[$i];
# $shellscript = $outputdir."/".$shellscript;
# system "rm -f $shellscript";
# }
# system "rm -rf $tempdir";
}
#------------------------------------------------------------------#
# TEST &make_shellscript
sub test_make_shellscript
{
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES INTO
my $shellscript; # SHELL SCRIPT FILE
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR
my $expected_shellscript; # FILE WITH EXPECTED CONTENTS OF $shellscript
my $differences; # DIFFERENCES BETWEEN $shellscript AND $expected_shellscript
my $length_differences; # LENGTH OF $differences
my $line; #
($shellscript,$errorcode,$errormsg) = &make_shellscript('aa','~alc/fasta',$outputdir,'~alc/bam','~alc/tempdir','~alc/cnvnator',30);
if ($errorcode != 0) { print STDERR "ERROR: test_make_shellscript: failed test1\n"; exit;}
($expected_shellscript,$errorcode,$errormsg) = &make_dirname($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(EXPECTED,">$expected_shellscript") || die "ERROR: test_make_shellscript: failed test1\n";
print EXPECTED "#!/usr/bin/env tcsh\n";
print EXPECTED "setenv ROOTSYS \"/nfs/users/nfs_b/bf3/bin/root/\"\n";
print EXPECTED "setenv LD_LIBRARY_PATH \${LD_LIBRARY_PATH}:\${ROOTSYS}/lib\n";
print EXPECTED "run_cnvnator_on_assembly.pl ~alc/fasta ~alc/bam myscriptaa.out ~alc/tempdir ~alc/cnvnator 30\n";
close(EXPECTED);
$differences = "";
$shellscript = $outputdir."/".$shellscript;
open(TEMP,"diff $shellscript $expected_shellscript |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_make_shellscript: failed test1 (shellscript $shellscript expected_shellscript $expected_shellscript)\n"; exit;}
system "rm -f $shellscript";
system "rm -f $expected_shellscript";
}
#------------------------------------------------------------------#
# MAKE SHELL SCRIPT TO SUBMIT TO THE FARM AS A JOB:
# SUBROUTINE SYNOPSIS: make_shellscript(): make a shellscript to submit to the farm
sub make_shellscript
{
my $letters = $_[0]; # LETTERS TO USE TO IDENTIFY SHELL SCRIPT, eg. aa
my $fasta = $_[1]; # FASTA FILE
my $outputdir = $_[2]; # DIRECTORY TO WRITE OUTPUT FILES INTO
my $input_bam = $_[3]; # INPUT BAM FILE
my $tempdir = $_[4]; # TEMPORARY DIRECTORY THAT THE OUTPUT FILE WILL APPEAR IN
my $cnvnator = $_[5]; # PATH TO THE CNVNATOR INSTALLATION
my $windowsize = $_[6]; # THE WINDOW SIZE TO USE FOR CNVNATOR
my $shellscript; # SHELL SCRIPT
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $output; # OUTPUT FILE
my @temp; #
$output = "myscript".$letters.".out";
$shellscript = $outputdir."/myscript".$letters;
open(SCRIPT,">$shellscript") || die "ERROR: make_shellscript: cannot open shellscript $shellscript\n";
print SCRIPT "#!/usr/bin/env tcsh\n";
print SCRIPT "setenv ROOTSYS \"/nfs/users/nfs_b/bf3/bin/root/\"\n";
print SCRIPT "setenv LD_LIBRARY_PATH \${LD_LIBRARY_PATH}:\${ROOTSYS}/lib\n";
print SCRIPT "run_cnvnator_on_assembly.pl $fasta $input_bam $output $tempdir $cnvnator $windowsize\n";
close(SCRIPT);
system "chmod a+x $shellscript";
@temp = split(/\//,$shellscript);
$shellscript = $temp[$#temp];
return($shellscript,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# SUBROUTINE TO MAKE A NAME FOR A TEMPORARY DIRECTORY:
# SUBROUTINE SYNOPSIS: make_dirname(): make a directory name for a temporary directory
sub make_dirname
{
my $outputdir = $_[0]; # OUTPUT DIRECTORY TO WRITE TEMPORARY DIRECTORY TO
my $found_name = 0; # SAYS WHETHER WE HAVE FOUND A DIRECTORY NAME YET
my $dirname = "none";# NEW DIRECTORY NAME TO USE
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $poss_dirname; # POSSIBLE DIRECTORY NAME TO USE
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY DIRECTORY NAME
while($found_name == 0)
{
$random_number = rand();
$poss_dirname = $outputdir."/tmp".$random_number;
if (!(-e $poss_dirname))
{
$dirname = $poss_dirname;
$found_name = 1;
}
}
if ($found_name == 0 || $dirname eq 'none')
{
$errormsg = "ERROR: make_dirame: found_name $found_name dirname $dirname\n";
$errorcode = 7; # ERRORCODE=7 (SHOULDN'T HAPPEN, SO CAN'T TEST)
return($dirname,$errorcode,$errormsg);
}
return($dirname,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# SUBROUTINE TO MAKE A FILE NAME FOR A TEMPORARY FILE:
sub make_filename
{
my $outputdir = $_[0]; # OUTPUT DIRECTORY TO WRITE TEMPORARY FILE NAME TO
my $found_name = 0; # SAYS WHETHER WE HAVE FOUND A FILE NAME YET
my $filename = "none";# NEW FILE NAME TO USE
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $poss_filename; # POSSIBLE FILE NAME TO USE
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAME
while($found_name == 0)
{
$random_number = rand();
$poss_filename = $outputdir."/tmp".$random_number;
if (!(-e $poss_filename))
{
$filename = $poss_filename;
$found_name = 1;
}
}
if ($found_name == 0 || $filename eq 'none')
{
$errormsg = "ERROR: make_filename: found_name $found_name filename $filename\n";
$errorcode = 6; # ERRORCODE=6 (SHOULDN'T HAPPEN, SO CAN'T TEST)
return($filename,$errorcode,$errormsg);
}
return($filename,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &print_error
sub test_print_error
{
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE WAS NO ERROR
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE WAS NO ERROR
($errormsg,$errorcode) = &print_error(45,45,1);
if ($errorcode != 12) { print STDERR "ERROR: test_print_error: failed test1\n"; exit;}
($errormsg,$errorcode) = &print_error('My error message','My error message',1);
if ($errorcode != 11) { print STDERR "ERROR: test_print_error: failed test2\n"; exit;}
($errormsg,$errorcode) = &print_error('none',45,1);
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test3\n"; exit;}
($errormsg,$errorcode) = &print_error('My error message', 0, 1);
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test4\n"; exit;}
}
#------------------------------------------------------------------#
# PRINT OUT AN ERROR MESSAGE AND EXIT.
sub print_error
{
my $errormsg = $_[0]; # THIS SHOULD BE NOT 'none' IF AN ERROR OCCURRED.
my $errorcode = $_[1]; # THIS SHOULD NOT BE 0 IF AN ERROR OCCURRED.
my $called_from_test = $_[2]; # SAYS WHETHER THIS WAS CALLED FROM test_print_error OR NOT
if ($errorcode =~ /[A-Z]/ || $errorcode =~ /[a-z]/)
{
if ($called_from_test == 1)
{
$errorcode = 11; $errormsg = "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; # ERRORCODE=11
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: the errorcode is $errorcode, should be a number.\n";
exit;
}
}
if (!($errormsg =~ /[A-Z]/ || $errormsg =~ /[a-z]/))
{
if ($called_from_test == 1)
{
$errorcode = 12; $errormsg = "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; # ERRORCODE=12
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n";
exit;
}
}
if ($errormsg eq 'none' || $errorcode == 0)
{
if ($called_from_test == 1)
{
$errorcode = 13; $errormsg = "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; # ERRORCODE=13
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n";
exit;
}
}
else
{
print STDERR "$errormsg";
exit;
}
return($errormsg,$errorcode);
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment