Skip to content

Instantly share code, notes, and snippets.

@chapmanb
Last active August 29, 2015 14:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save chapmanb/9248ab8409a473249a07 to your computer and use it in GitHub Desktop.
Save chapmanb/9248ab8409a473249a07 to your computer and use it in GitHub Desktop.
#!/usr/bin/perl
###################################################################
# FindSomatic.pl
#
# Tool for detecting somatic mutations in normal/tumor pair
# using microassembly
#
# Author: Giuseppe Narzisi
# Date: December 11, 2013
#
###################################################################
use warnings;
use strict;
use POSIX;
use FindBin qw($Bin);
use lib $Bin; # add $Bin directory to @INC
use Usage;
use Utils qw(:DEFAULT $findVariants $findDenovos $findSomatic $exportTool $bamtools);
use HashesIO;
use Parallel::ForkManager;
use List::Util qw[min max];
#use Math::Random qw(:all);
#use Sys::CPU;
#use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
use File::Spec;
use Getopt::Long;
use MLDBM::Sync; # this gets the default, SDBM_File
use MLDBM qw(DB_File Storable); # use Storable for serializing
use MLDBM qw(MLDBM::Sync::SDBM_File); # use extended SDBM_File, handles values > 1024 bytes
use Fcntl qw(:DEFAULT); # import symbols O_CREAT & O_RDWR for use with DBMs
$|=1;
# required arguments
my $BAMNORMAL;
my $BAMTUMOR;
my $BEDFILE;
my $REF;
my $defaults = getDefaults();
# optional arguments
my $kmer = $defaults->{kmer};
my $cov_threshold = $defaults->{cov_threshold};
my $tip_cov_threshold = $defaults->{tip_cov_threshold};
my $radius = $defaults->{radius};
my $windowSize = $defaults->{windowSize};
my $delta = $defaults->{delta};
my $map_qual = $defaults->{map_qual};
my $maxmismatch = $defaults->{maxmismatch};
my $min_cov = $defaults->{min_cov};
my $covratio = $defaults->{covratio};
my $outratio = $defaults->{outratio};
my $WORK = $defaults->{WORK};
my $MAX_PROCESSES = $defaults->{MAX_PROCESSES};
my $selected = $defaults->{selected};
my $dfs_limit = $defaults->{pathlimit};
my $outformat = $defaults->{format};
my $intarget;
my $help = 0;
my $VERBOSE = 0;
my %candidates;
my %exons;
my %locations;
my %normalCov;
my %tumorCov;
my %normalL2K;
my %tumorL2K;
my %normalSVs;
my %tumorSVs;
my %somaticSVs;
my %commonSVs;
my @members = qw/normal tumor/;
my $argcnt = scalar(@ARGV);
my $start_time = time;
#####################################################
#
# Command line options processing
#
GetOptions(
'help!' => \$help,
'verbose!' => \$VERBOSE,
# required parameters
'normal=s' => \$BAMNORMAL,
'tumor=s' => \$BAMTUMOR,
'bed=s' => \$BEDFILE,
'ref=s' => \$REF,
# optional paramters
'kmer=i' => \$kmer,
'covthr=i' => \$cov_threshold,
'lowcov=i' => \$tip_cov_threshold,
'mincov=i' => \$min_cov,
'covratio=f' => \$covratio,
'radius=i' => \$radius,
'window=i' => \$windowSize,
'step=i' => \$delta,
'mapscore=i' => \$map_qual,
'mismatches=i' => \$maxmismatch,
'dir=s' => \$WORK,
'numprocs=i' => \$MAX_PROCESSES,
'coords=s' => \$selected,
'format=s' => \$outformat,
# ouptut parameters
'intarget!' => \$intarget,
'outratio=f' => \$outratio,
) or usageDenovo("FindDenovo.pl");
#####################################################
#
# Command line options processing
#
sub init()
{
usageSomatic("FindSomatic.pl") if ($argcnt < 1);
usageSomatic("FindSomatic.pl") if( $help );
if( (!defined $BAMNORMAL) || (!defined $BAMTUMOR) || (!defined $BEDFILE) || (!defined $REF) ) {
print STDERR "Required parameter missing!\n";
usageSomatic("FindSomatic.pl");
}
mkdir $WORK if ! -r $WORK;
if($MAX_PROCESSES == 1) { $MAX_PROCESSES = 0; }
}
#####################################################
sub printParams {
my $file = $_[0];
print STDERR "-- Print parameters to $file\n";
open PFILE, "> $file" or die "Can't open $file ($!)\n";
print PFILE "Parameters: \n";
print PFILE "-- K-mer size (bp): $kmer\n";
print PFILE "-- coverage threshold: $cov_threshold\n";
print PFILE "-- low coverage threshold: $tip_cov_threshold\n";
print PFILE "-- size (bp) of the left and right extensions (radius): $radius\n";
print PFILE "-- window-size size (bp): $windowSize\n";
print PFILE "-- step size (bp): $delta\n";
print PFILE "-- minimum mapping-quality: $map_qual\n";
print PFILE "-- minimum coverage for somatic mutation: $min_cov\n";
print PFILE "-- minimum coverage ratio for sequencing error: $covratio\n";
print PFILE "-- minimum coverage ratio for exporting mutation: $outratio\n";
print PFILE "-- max number of mismatches for near-perfect repeats: $maxmismatch\n";
print PFILE "-- output directory: $WORK\n";
print PFILE "-- max number of parallel jobs: $MAX_PROCESSES\n";
print PFILE "-- BAM file normal: $BAMNORMAL\n";
print PFILE "-- BAM file tumor: $BAMTUMOR\n";
print PFILE "-- bedfile: $BEDFILE\n";
print PFILE "-- reference file: $REF\n";
print PFILE "-- file of selected locations: $selected\n";
print PFILE "-- output format for variants: $outformat\n";
if($intarget) { print PFILE "-- output variants in target? yes\n"; }
else { print PFILE "-- output variants in target? no\n"; }
close PFILE;
}
## link input files
#####################################################
sub linkFiles {
print STDERR "-- Linking input bamfile\n";
my $bamfile = $_[0];
my $dirname = $_[1];
my $flag = 0;
print STDERR "$bamfile\n";
my $fdir = "$WORK/$dirname";
# get absolute path
my $bam_abs_path = File::Spec->rel2abs($bamfile);
if (! -r $fdir) {
$flag = 1;
## link the input files
mkdir $fdir;
if (-e "$bam_abs_path") {
symlink "$bam_abs_path", "$fdir/bamfile.bam";
}
else {
print STDERR "Can't find BAM file: $bamfile\n";
next;
}
if (-e "$bam_abs_path.bai") {
symlink "$bam_abs_path.bai", "$fdir/bamfile.bam.bai";
}
else {
print STDERR "Indexing BAM file...\n";
runCmd("index bamfile", "$bamtools index -in $bam_abs_path");
symlink "$bam_abs_path.bai", "$fdir/bamfile.bam.bai";
#print STDERR "Can't find index BAM file: $bamfile.bai\n";
next;
}
}
return $flag;
}
## load mutations and coverage info
#####################################################
sub loadHashes {
print STDERR "-- Loading mutations and coverage info\n";
# clear hashes
for my $k (keys %normalSVs) { delete $normalSVs{$k}; }
for my $k (keys %tumorSVs) { delete $tumorSVs{$k}; }
for my $k (keys %normalCov) { delete $normalCov{$k}; }
for my $k (keys %tumorCov) { delete $tumorCov{$k}; }
# load databases of mutations
print STDERR "load variants for normal...\n";
loadDB("$WORK/normal/variants.db", \%normalSVs, \%exons, 0);
print STDERR "load variants for tumor...\n";
loadDB("$WORK/tumor/variants.db", \%tumorSVs, \%exons, 0);
print STDERR "load coverage for normal...\n";
loadCov("$WORK/normal/refcov.txt", \%normalCov);
print STDERR "load coverage for tumor...\n";
loadCov("$WORK/tumor/refcov.txt", \%tumorCov);
print STDERR "load loc2key for for normal...\n";
loadLoc2Key("$WORK/normal/loc2keys.txt.gz", \%normalL2K);
print STDERR "load loc2key for for tumor...\n";
loadLoc2Key("$WORK/tumor/loc2keys.txt.gz", \%tumorL2K);
#remove reference coverage files:
print STDERR "remove coverage file for normal...\n";
runCmd("remove coverage file", "rm $WORK/normal/refcov.txt.gz");
print STDERR "remove coverage file for tumor...\n";
runCmd("remove coverage file", "rm $WORK/tumor/refcov.txt.gz");
}
## call variants on each family memeber
#####################################################
sub callSVs {
my $bamfile = $_[0];
my $dir = $_[1];
print STDERR "-- Detect mutations on $dir\n";
my $bam_abs_path = File::Spec->rel2abs($bamfile);
my $command = "$findVariants ".
"--bam $bam_abs_path ".
"--bed $BEDFILE ".
"--ref $REF ".
"--kmer $kmer ".
"--covthr $cov_threshold ".
"--lowcov $tip_cov_threshold ".
"--covratio $covratio ".
"--radius $radius ".
"--window $windowSize ".
"--step $delta ".
"--mapscore $map_qual ".
"--mismatches $maxmismatch ".
"--dir $WORK/$dir ".
"--sample ALL ".
"--numprocs $MAX_PROCESSES ".
"--coords $selected ".
"--format $outformat ".
"--cov2file";
if($intarget) { $command .= " --intarget"; }
if($VERBOSE) { $command .= " --verbose"; }
print STDERR "Command: $command\n" if($VERBOSE);
runCmd("findVariants", "$command");
}
## Compute best state for the location of the mutation
#####################################################
sub computeBestState {
my $vars;
my $covthr = 0; # min cov threshold to remove sequencing errors
foreach my $ID (@members) {
if($ID eq "normal") { $vars = \%normalSVs; }
if($ID eq "tumor") { $vars = \%tumorSVs; }
foreach my $currKey (keys %$vars) {
#print STDERR "$currKey\n";
next if($currKey eq "numPaths");
my $var = $vars->{$currKey};
my $chr = $var->{chr};
my $pos = $var->{pos};
my $loc = "$chr:$pos";
my $refState;
#my $mutState;
my $altState;
my $refCov;
my $mutCov;
my $altCov;
foreach my $role (@members) {
#print STDERR "$role: ";
my $varh;
my $refhash;
my $l2k;
if($role eq "normal") { $varh = \%normalSVs; $refhash = \%normalCov; $l2k = \%normalL2K; }
if($role eq "tumor") { $varh = \%tumorSVs; $refhash = \%tumorCov; $l2k = \%tumorL2K; }
$refState->{$role} = 0; $altState->{$role} = 0;
$refCov->{$role} = 0; $mutCov->{$role} = 0; $altCov->{$role} = 0;
my $sum = 0;
my $cnt = 0;
if ( exists($refhash->{$loc}) ) { # if there is reference coverage at location
## compute reference coverage at locus
for (my $t = $pos; $t<($pos+$kmer+1); $t++) { # compute avg coverage over window of size K
my $loc2 = "$chr:$t";
if ( exists $refhash->{$loc2} &&
!( exists($l2k->{$loc2}) && ( (scalar @{$l2k->{$loc2}->{keys}})!=0) ) ) {
$sum += $refhash->{$loc2};
$cnt++;
}
}
}
my $RCov = $sum;
if($cnt>0) { $RCov = floor($sum/$cnt); }
if($RCov > $covthr) { $refState->{$role} = 1; $refCov->{$role} = $RCov; }
else { $refState->{$role} = 0; $refCov->{$role} = 0; }
## compute coverage of mut at locus
my $ACov = 0;
if( exists($varh->{$currKey}) ) {
my $altMut = $varh->{$currKey};
$ACov = $altMut->{mincov};
}
if($ACov > $covthr) { $altState->{$role} = 1; $mutCov->{$role} = $ACov; }
else { $altState->{$role} = 0; $mutCov->{$role} = 0; }
## compute alternative allele coverage
my $OCov = 0;
if( exists($l2k->{$loc}) ) {
foreach my $altKey (@{$l2k->{$loc}->{keys}}) {
if( exists($varh->{$altKey}) && ($altKey ne $currKey) ) {
my $altMut = $varh->{$altKey};
$OCov += $altMut->{mincov};
}
}
}
if($OCov > $covthr) { $altCov->{$role} = $OCov; }
else { $altCov->{$role} = 0; }
}
# 2 2/0 1 (ref: N,T / qry: N,T)
my @RS=(); my @AS=();
my @RC=(); my @AC=(); my @OC=();
foreach my $role (@members) { # the order or members must be N,T
if( ($refState->{$role} == 1) && ($altState->{$role} == 1) ) {
push @RS , 1;
push @AS , 1;
}
elsif( ($refState->{$role} == 1) && ($altState->{$role} == 0) ) {
push @RS , 2;
push @AS , 0;
}
elsif( ($refState->{$role} == 0) && ($altState->{$role} == 1) ) {
push @RS , 0;
push @AS , 2;
}
elsif( ($refState->{$role} == 0) && ($altState->{$role} == 0) ) {
push @RS , 0;
push @AS , 0;
}
push @RC , $refCov->{$role};
push @AC , $mutCov->{$role};
push @OC , $altCov->{$role};
}
my $stateStr = "$RS[0]$RS[1]/$AS[0]$AS[1]";
my $covStr = "$RC[0] $RC[1]/$AC[0] $AC[1]/$OC[0] $OC[1]";
$var->{bestState} = $stateStr;
$var->{covState} = $covStr;
# compute hom/het state and chi2score
parseCovState($var, $ID);
$vars->{$currKey} = $var;
}
}
}
## parse covState and compute hom/het state and chi2score
##########################################
sub parseCovState {
my $sv = $_[0];
my $role = $_[1];
my $BS = $sv->{covState};
my ($REF,$ALT,$OTH) = split("/",$BS);
my @R = split(" ", $REF); # reference
my @A = split(" ", $ALT); # alternative
my @O = split(" ", $OTH); # other
my $id = 0;
if($role eq "normal") { $id = 0; }
if($role eq "tumor") { $id = 1; }
my $alleleCnt = 0;
if($R[$id] > 0) { $alleleCnt++; }
if($A[$id] > 0) { $alleleCnt++; }
if($O[$id] > 0) { $alleleCnt++; }
my $totCov = $R[$id] + $A[$id] + $O[$id];
$sv->{altcov} = $totCov - $sv->{mincov};
if($alleleCnt > 1) { $sv->{zygosity} = "het"; }
else { $sv->{zygosity} = "hom"; }
my $covRatio = 0;
my $chi2Score = "na";
if ($totCov > 0) {
$covRatio = sprintf('%.2f', ($sv->{mincov} / $totCov) );
$sv->{covratio} = $covRatio;
#Chi-squared Test for allele coverage
my $o1 = $sv->{mincov}; # observed 1
my $o2 = $sv->{altcov}; # observed 2
my $e1 = $totCov/2; # expected 1
my $e2 = $totCov/2; # expected 2
my $term1 = (($o1-$e1)*($o1-$e1))/$e1;
my $term2 = (($o2-$e2)*($o2-$e2))/$e2;
$chi2Score = sprintf('%.2f', $term1 + $term2);
}
$sv->{chi2score} = $chi2Score;
}
## parse bestState and update mutation info
##########################################
sub parseBestState {
my $sv = $_[0];
my $BS = $sv->{bestState};
$sv->{somatic} = "no";
$sv->{inheritance} = "no";
if( ($BS eq "12/10") || ($BS eq "02/20") ) { # only in normal
$sv->{inheritance} = "no";
}
else {
my ($REF,$ALT) = split("/",$BS);
my @R = split("", $REF); # reference
my @A = split("", $ALT); # alternative
my $n=0; my $t=1;
#if ( ($A[$n]>0) && ($A[$t]>0) ) { # both in normal and tumor
if ( $A[$n] == $A[$t] ) { # genotypes for ALT identical
$sv->{inheritance} = "normal";
}
#elsif ( ($A[$n]==0) && ($A[$t]>0) ) { # only in tumor
elsif ( $A[$n] != $A[$t] ) { # genotypes for ALT different
$sv->{inheritance} = "no";
$sv->{somatic} = "yes";
}
else { # unknown state
$sv->{inheritance} = "no";
}
#if($A[$t]>0) { $sv->{id} = "tumor"; }
}
}
## find denovo events
#####################################################
sub findSomaticMut {
print STDERR "-- Finding somatic mutations\n";
my $family = $_[0];
if (-e "$WORK/somatic.db.dir") { runCmd("remove database files", "rm $WORK/somatic.db.*"); }
if (-e "$WORK/common.db.dir") { runCmd("remove database files", "rm $WORK/common.db.*"); }
my $somatic_dbm_obj = tie %somaticSVs, 'MLDBM::Sync', "$WORK/somatic.db", O_CREAT|O_RDWR, 0640;
my $common_dbm_obj = tie %commonSVs, 'MLDBM::Sync', "$WORK/common.db", O_CREAT|O_RDWR, 0640;
$somatic_dbm_obj->Lock;
$common_dbm_obj->Lock;
foreach my $k (keys %tumorSVs) {
my $mut = $tumorSVs{$k};
# parse bestState and update info
parseBestState($mut);
my $chr = $mut->{chr};
my $pos = $mut->{pos};
my $key = "$chr:$pos";
# skip mutation if normal not sampled
next if (!exists $normalCov{$key});
next if ($normalCov{$key} < $min_cov); # min cov requirement
# compute inheritance
if($mut->{somatic} eq "yes") {
$somaticSVs{$k} = $mut;
}
else { # non somatic
if ($mut->{inheritance} eq "normal") {
$commonSVs{$k} = $mut;
}
}
$tumorSVs{$k} = $mut;
}
$somatic_dbm_obj->UnLock;
$common_dbm_obj->UnLock;
}
## export family mutations to file
#####################################################
sub exportSVs {
print STDERR "-- Exporting SVs to file\n";
my $max_cov = 10000000;
# export somatic
#runCmd("export all", "$exportTool --db $WORK/somatic.db --bed $BEDFILE --format annovar --type all --mincov $min_cov --maxcov $max_cov --covratio $outratio > $WORK/somatic.${min_cov}x.all.txt");
#runCmd("export snp", "$exportTool --db $WORK/somatic.db --bed $BEDFILE --format annovar --type snp --mincov $min_cov --maxcov $max_cov --covratio $outratio > $WORK/somatic.${min_cov}x.snp.txt");
#runCmd("export indels", "$exportTool --db $WORK/somatic.db --bed $BEDFILE --format annovar --type indel --mincov $min_cov --maxcov $max_cov --covratio $outratio > $WORK/somatic.${min_cov}x.indel.txt");
my $command_somatic = "$exportTool ".
"--db $WORK/somatic.db ".
"--bed $BEDFILE ".
"--format $outformat ".
"--type indel ".
"--mincov $min_cov ".
"--maxcov $max_cov ".
"--covratio $outratio";
if($intarget) { $command_somatic .= " --intarget"; }
if ($outformat eq "annovar") { $command_somatic .= " > $WORK/somatic.${min_cov}x.indel.annovar"; }
elsif ($outformat eq "vcf") { $command_somatic .= " > $WORK/somatic.${min_cov}x.indel.vcf"; }
print STDERR "Command: $command_somatic\n" if($VERBOSE);
runCmd("export somatic", $command_somatic);
# export common variants
#runCmd("export all", "$exportTool --db $WORK/common.db --bed $BEDFILE --format annovar --type all --mincov $min_cov --maxcov $max_cov --covratio $outratio > $WORK/common.${min_cov}x.all.txt");
#runCmd("export snp", "$exportTool --db $WORK/common.db --bed $BEDFILE --format annovar --type snp --mincov $min_cov --maxcov $max_cov --covratio $outratio > $WORK/common.${min_cov}x.snp.txt");
#runCmd("export indels", "$exportTool --db $WORK/common.db --bed $BEDFILE --format annovar --type indel --mincov $min_cov --maxcov $max_cov --covratio $outratio > $WORK/common.${min_cov}x.indel.txt");
my $command_comm = "$exportTool ".
"--db $WORK/common.db ".
"--bed $BEDFILE ".
"--format $outformat ".
"--type indel ".
"--mincov $min_cov ".
"--maxcov $max_cov ".
"--covratio $outratio";
if($intarget) { $command_comm .= " --intarget"; }
if ($outformat eq "annovar") { $command_comm .= " > $WORK/common.${min_cov}x.indel.annovar"; }
elsif ($outformat eq "vcf") { $command_comm .= " > $WORK/common.${min_cov}x.indel.vcf"; }
print STDERR "Command: $command_comm\n" if($VERBOSE);
runCmd("export common", $command_comm);
}
## process single family
#####################################################
sub processBAM {
my $bamfile = $_[0];
my $outdir = $_[1];
#my $toprocess = linkFiles("$bamfile", "$outdir"); # link input files
#if($toprocess == 1) {
callSVs($bamfile, $outdir); # call mutations on each family
#}
#else { print STDERR "$outdir already processed!\n"; }
}
## do the job
##########################################
init();
printParams("$WORK/parameters.txt"); # print parameters
processBAM($BAMNORMAL, "normal"); # process normal
processBAM($BAMTUMOR, "tumor"); # process tumor
loadHashes(); # load mutations and coverage info
computeBestState(); # compute bestState for each mutation
findSomaticMut(); # detect somatic mutations in tumor
exportSVs(); # export mutations to file
##########################################
my $time_taken = time - $start_time;
#if($VERBOSE) {
elapsedTime($time_taken, "FindSomatic");
#}
#!/usr/bin/perl
###################################################################
# FindVariants.pl
#
# Tool for detecting variants in one single exome
# using microassembly
#
# Author: Giuseppe Narzisi
# Date: December 11, 2013
#
###################################################################
use warnings;
use strict;
use POSIX;
use FindBin qw($Bin);
use lib $Bin; # add $Bin directory to @INC
use Usage;
use Utils;
use SequenceIO;
use HashesIO;
use Parallel::ForkManager;
use List::Util qw[min max];
#use MathRandom::Random qw(:all);
use Digest::MD5 qw(md5_hex);
#use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
use Getopt::Long;
use File::Spec;
use MLDBM::Sync; # this gets the default, SDBM_File
use MLDBM qw(DB_File Storable); # use Storable for serializing
use MLDBM qw(MLDBM::Sync::SDBM_File); # use extended SDBM_File, handles values > 1024 bytes
use Fcntl qw(:DEFAULT); # import symbols O_CREAT & O_RDWR for use with DBMs
$|=1;
# required arguments
my $bamfile;
my $bedfile;
my $REF;
my $defaults = getDefaults();
# optional arguments
my $kmer = $defaults->{kmer};
my $cov_threshold = $defaults->{cov_threshold};
my $tip_cov_threshold = $defaults->{tip_cov_threshold};
my $radius = $defaults->{radius};
my $windowSize = $defaults->{windowSize};
my $delta = $defaults->{delta};
my $map_qual = $defaults->{map_qual};
my $maxmismatch = $defaults->{maxmismatch};
my $min_cov = $defaults->{min_cov};
my $covratio = $defaults->{covratio};
my $outratio = $defaults->{outratio};
my $WORK = $defaults->{WORK};
my $MAX_PROCESSES = $defaults->{MAX_PROCESSES};
my $sample = $defaults->{sample};
my $selected = $defaults->{selected};
my $outformat = $defaults->{format};
my $intarget;
#microassembler parameters:
my $dfs_limit = $defaults->{pathlimit};
my $max_tip_len = 25;
my $maxKmer = 90;
my $help = 0;
my $VERBOSE = 0;
my $COV2FILE = 0;
# programs via absolute path
#my $samtools = "samtools";
my $microassembler = "$Bin/Microassembler/Microassembler";
my $exportTool = "$Bin/ExportVariants.pl";
my $bamtools = "$Bin/bamtools-2.3.0/bin/bamtools";
my $rgfile = "readgroups.txt";
#my %pathways;
#my %exonshash;
my %variants;
my %exons;
my %locations;
my %readgroups;
my %refcov;
my %loc2keys;
###my %genome;
my $variants_dbm_obj;
my $argcnt = scalar(@ARGV);
my $start_time = time;
#####################################################
#
# Command line options processing
#
GetOptions(
'help!' => \$help,
'verbose!' => \$VERBOSE,
'cov2file' => \$COV2FILE,
# required parameters
'bam=s' => \$bamfile,
'bed=s' => \$bedfile,
'ref=s' => \$REF,
# optional paramters
'kmer=i' => \$kmer,
'covthr=i' => \$cov_threshold,
'lowcov=i' => \$tip_cov_threshold,
'mincov=i' => \$min_cov,
'covratio=f' => \$covratio,
'radius=i' => \$radius,
'window=i' => \$windowSize,
'step=i' => \$delta,
'mapscore=i' => \$map_qual,
'mismatches=i' => \$maxmismatch,
'pathlimit=i' => \$dfs_limit,
'dir=s' => \$WORK,
'numprocs=i' => \$MAX_PROCESSES,
'sample=s' => \$sample,
'coords=s' => \$selected,
'format=s' => \$outformat,
# ouptut parameters
'intarget!' => \$intarget,
'outratio=f' => \$outratio,
) or usageSingle("FindVariants.pl");
#####################################################
#
# Initilization
#
sub init()
{
usageSingle("FindVariants.pl") if ($argcnt < 1);
usageSingle("FindVariants.pl") if( $help );
if( (!defined $bamfile) || (!defined $bedfile) || (!defined $REF) ) {
print STDERR "Required parameter missing!\n";
usageSingle("FindVariants.pl");
}
mkdir $WORK if ! -r $WORK;
if($MAX_PROCESSES == 1) { $MAX_PROCESSES = 0; }
}
#####################################################
sub printParams {
my $file = $_[0];
print STDERR "-- Print parameters to $file\n";
open PFILE, "> $file" or die "Can't open $file ($!)\n";
print PFILE "Parameters: \n";
print PFILE "-- k-mer size (bp): $kmer\n";
print PFILE "-- coverage threshold: $cov_threshold\n";
print PFILE "-- low coverage threshold: $tip_cov_threshold\n";
print PFILE "-- size (bp) of the left and right extensions (radius): $radius\n";
print PFILE "-- window-size size (bp): $windowSize\n";
print PFILE "-- step size (bp): $delta\n";
print PFILE "-- minimum mapping quality: $map_qual\n";
print PFILE "-- minimum coverage for mutation: $min_cov\n";
print PFILE "-- minimum coverage ratio for sequencing errors: $covratio\n";
print PFILE "-- minimum coverage ratio for exporting mutation: $outratio\n";
print PFILE "-- max number of mismatches for near-perfect repeats: $maxmismatch\n";
print PFILE "-- limit number of sequence paths: $dfs_limit\n";
print PFILE "-- output directory: $WORK\n";
print PFILE "-- max number of parallel jobs: $MAX_PROCESSES\n";
print PFILE "-- bam file: $bamfile\n";
print PFILE "-- bed file: $bedfile\n";
print PFILE "-- reference file: $REF\n";
print PFILE "-- sample: $sample\n";
print PFILE "-- output format for variants: $outformat\n";
print PFILE "-- file of selected coordinates: $selected\n";
if($intarget) { print PFILE "-- output variants in target? yes\n"; }
else { print PFILE "-- output variants in target? no\n"; }
close PFILE;
}
## build directory structure and run assembly
#####################################################
sub run {
my $fdir = "$WORK";
if (! -r $fdir) { mkdir $fdir; }
printParams("$fdir/parameters.txt");
if($selected ne "null") {
loadCoordinates("$selected", \%exons, $VERBOSE);
}
else {
loadExonsBed("$bedfile", \%exons, $radius, $VERBOSE);
}
#load genome from fasta file
###loadGenomeFasta($REF, \%genome);
if (-e "$fdir/variants.db.dir") { runCmd("remove database files", "rm $fdir/variants.db.*"); }
$variants_dbm_obj = tie %variants, 'MLDBM::Sync', "$fdir/variants.db", O_CREAT|O_RDWR, 0640;
#$variants_dbm_obj->SyncCacheSize('1000M'); # 1000 Megabyte max memory used
#print "$bamfile\n";
# get absolute path
my $bam_abs_path = File::Spec->rel2abs($bamfile);
## link the input files
if (-e $bam_abs_path) {
symlink "$bam_abs_path", "$fdir/bamfile.bam";
}
else {
print STDERR "Can't find bamfile: $bamfile\n";
next;
}
if (-e "$bam_abs_path.bai") {
symlink "$bam_abs_path.bai", "$fdir/bamfile.bam.bai";
}
else {
print STDERR "Indexing BAM file...\n";
runCmd("index bamfile", "$bamtools index -in $bam_abs_path");
symlink "$bam_abs_path.bai", "$fdir/bamfile.bam.bai";
}
parseHeader($WORK, \%readgroups);
saveReadGroup("$sample", $WORK, \%readgroups, $rgfile);
print STDERR "Assembly Exons\n";
assemblyExons(\%variants, $variants_dbm_obj, \%refcov, \%loc2keys);
if ($COV2FILE) { printRefCov("$WORK/refcov.txt", \%refcov); }
printLoc2key("$WORK/loc2keys.txt", \%loc2keys);
# export variants to file
exportSVs();
}
## export family mutations to file
#####################################################
sub exportSVs {
#$exportvars --db $DIR/variants.db --bed $BEDFILE --format annovar --type all --mincov $mincov --maxcov $maxcov > $DIR/variants.txt
print STDERR "-- Exporting SVs to file\n";
my $max_cov = 10000000;
# export denovos
#runCmd("export denovos", "$exportTool --db $WORK/variants.db --bed $bedfile --format annovar --type all --mincov $min_cov --maxcov $max_cov --covratio $outratio > $WORK/variants.${min_cov}x.all.txt");
#runCmd("export denovos", "$exportTool --db $WORK/variants.db --bed $bedfile --format annovar --type snp --mincov $min_cov --maxcov $max_cov --covratio $outratio > $WORK/variants.${min_cov}x.snp.txt");
#runCmd("export denovos", "$exportTool --db $WORK/variants.db --bed $bedfile --format annovar --type indel --mincov $min_cov --maxcov $max_cov --covratio $outratio > $WORK/variants.${min_cov}x.indel.txt");
my $command = "$exportTool ".
"--db $WORK/variants.db ".
"--bed $bedfile ".
"--format $outformat ".
"--type indel ".
"--mincov $min_cov ".
"--maxcov $max_cov ".
"--covratio $outratio";
if($intarget) { $command .= " --intarget"; }
if ($outformat eq "annovar") { $command .= " > $WORK/variants.${min_cov}x.indel.annovar"; }
elsif ($outformat eq "vcf") { $command .= " > $WORK/variants.${min_cov}x.indel.vcf"; }
print STDERR "Command: $command\n" if($VERBOSE);
runCmd("ExportVariants", $command);
}
#####################################################
## Write the list of read groups per family member on file
## Extract the sequence around indel from reference
## Create input files for microassembler
## Run microassembler
sub assemblyExons {
my $hash = $_[0];
my $dbm_obj = $_[1];
my $ref_hash = $_[2];
my $loc2keys_hash = $_[3];
my $count_exons = 0;
my $count_regions = 0;
my @regions;
my @region_files;
my $PREFIX = "$WORK";
#foreach my $exonkey (sort {$a <=> $b} keys %exons) { # for each chromosome
foreach my $exonkey (keys %exons) { # for each chromosome
$count_exons = 0;
#foreach my $exon (sort {$a->{start} <=> $b->{start}} @{$exons{$exonkey}}) { # for each exon
foreach my $exon (@{$exons{$exonkey}}) { # for each exon
$count_exons++;
#last if($count_exons > 5);
#my %pathways;
#my %exonshash;
# Forks and returns the pid for the child:
#my $pid = $pm->start and next;
my $start = $exon->{start};
my $end = $exon->{end};
my $chr = $exon->{chr};
#print STDERR "$chr:$start-$end\n";
## extend region left and right by radius
my $left = $start-$radius;
if ($left < 0) { $left = $start; }
my $right = $end+$radius;
#if ($right > $end) { $right = $end; }
#print STDERR "$chr:$left-$right\n";
my $exonSize = $right-$left;
# generate list of regions to examine
my $l = $left;
my $u = $l+$windowSize;
if($u >= $right) {
$u = $right;
my $region;
$region->{chr} = $chr;
$region->{left} = $l;
$region->{right} = $u;
push @regions, $region;
#print "$region->{chr}:$region->{left}-$region->{right}\n";
$count_regions++;
}
else {
while($u < $right) {
my $region;
$region->{chr} = $chr;
$region->{left} = $l;
$region->{right} = $u;
$region->{exonstart} = $left;
$region->{exonend} = $right;
push @regions, $region;
$count_regions++;
$l += $delta;
$u += $delta;
if($u > $right) {
$u = $right;
$l = $u-$windowSize;
my $region;
$region->{chr} = $chr;
$region->{left} = $l;
$region->{right} = $u;
$region->{exonstart} = $left;
$region->{exonend} = $right;
push @regions, $region;
#print "$region->{chr}:$region->{left}-$region->{right}\n";
$count_regions++;
}
}
}
}
}
#update statistics in the database
if(exists $hash->{stats}) {
my $sts = $hash->{stats};
$sts->{num_exceptions} = 0;
$sts->{num_dfs_limit} = 0;
$sts->{num_partial_align} = 0;
$sts->{num_with_cycles} = 0;
$sts->{num_ok} = 0;
$hash->{stats} = $sts;
}
#schedule jobs
schedule_jobs(\@regions, $hash, $dbm_obj, $ref_hash, $loc2keys_hash, "regions");
my $stats = $hash->{stats};
#if ($VERBOSE) {
if(exists $hash->{stats}) {
my $sts = $hash->{stats};
print STDERR "\nDfs limit: $sts->{num_dfs_limit}\n";
print STDERR "Partial alignments: $sts->{num_partial_align}\n";
print STDERR "Cycles: $sts->{num_with_cycles}\n";
print STDERR "OK assemblies: $sts->{num_ok}\n";
print STDERR "Exceptions: $sts->{num_exceptions}\n";
print STDERR "Repeats: $sts->{num_repeats}\n\n";
}
#}
}
# partition the region to assemble into equal size
# random sets and run the parallel jobs
#####################################################
sub schedule_jobs {
my $array = $_[0];
my $hash = $_[1];
my $dbm_obj = $_[2];
my $ref_hash = $_[3];
my $loc2keys_hash = $_[4];
my $fprefix = $_[5];
my $PREFIX = "$WORK";
# random permutation of the regions to assemble
#my @regions = random_permutation(@$array);
my @regions = @$array;
my $arraySize = $#regions + 1;
print STDERR "start assembly of $arraySize regions.\n";
# partition regions
my $step = $arraySize;
if ($MAX_PROCESSES>0) {
$step = ceil($arraySize/$MAX_PROCESSES);
}
print STDERR "stepSize: $step\n";
for(my $i = 0; $i < $arraySize; $i+=$step) {
my $s = $i;
my $e = min( ($arraySize - 1), ($i + $step-1) );
my $ref_file = "$fprefix-$s-$e.fa";
#push @region_files, $ref_file;
open FILE, "> $PREFIX/$ref_file" or die "Can't open $PREFIX/$ref_file ($!)\n";
foreach my $reg (@regions[$s..$e]) { # for each region
my $chr = $reg->{chr};
my $left = $reg->{left};
my $right = $reg->{right};
my $exonstart = $reg->{exonstart};
my $exonend = $reg->{exonend};
#my $chrom = $chr;
#if ($chr =~ /^chr/) { $chrom = substr($chr,3); }
print FILE ">$chr:$left-$right\n";
### die "Undefined sequence ($chr)\n" if (!exists($genome{$chr}));
my ($header, $seq) = split(/\n/, `samtools faidx $REF $chr:$left-$right`, 2);
$seq =~ s/[\n\r\s]+//g;
###my $seq = substr($genome{$chr}->{seq}, $left-1, $right-$left+1);
#my $seq = substr($genome{$chr}->{seq},$exonstart, $exonend-$exonstart+1);
for(my $i=0; $i<length($seq); $i+=80) {
my $str = substr($seq,$i,80);
print FILE "$str\n";
}
}
close FILE;
}
# free genome memory before parallelization
###for my $k (keys %genome) { delete $genome{$k}; }
my $pm = new Parallel::ForkManager($MAX_PROCESSES);
my $count = 0;
for(my $i = 0; $i < $arraySize; $i+=$step) {
$count++;
my $pid = $pm->start and next;
my $s = $i;
my $e = min( ($arraySize - 1), ($i + $step-1) );
my $ref_file = "$fprefix-$s-$e.fa";
print STDERR "$count. [$s..$e]\n";
assemblyRegion($count, $ref_file, \%$hash, $dbm_obj);
$pm->finish; # Terminates the child process
}
$pm->wait_all_children; # wait for all parallel jobs to complete
# load reference coverage info from files
loadRefCovMerge($ref_hash, $count, $WORK);
loadLoc2KeyMerge($loc2keys_hash, $ref_hash, $count, $kmer, $WORK);
# compute zygosity for all mutations
computeHomHetState($hash, $ref_hash, $loc2keys_hash);
}
## Compute homozygous vs heterozygous state and
## flags possible sequencing errors
#####################################################
sub computeHomHetState {
print STDERR "Computing hom/het state...\n";
my $vars = $_[0];
my $refh = $_[1];
my $l2k = $_[2];
my $hom_cnt = 0;
my $het_cnt = 0;
# improve performance: tie once to database and do all read/wrote operations.
$variants_dbm_obj->Lock;
foreach my $currKey (keys %$vars) {
next if($currKey eq "stats"); # skip stats info
my $currMut = $vars->{$currKey};
my $chr = $currMut->{chr};
my $pos = $currMut->{pos};
my $loc = "$chr:$pos";
my $alleleCnt = 0;
my $Rcov = 0;
my $Acov = 0;
my $Ocov = 0;
my $totCov = 0;
# reference coverage
my $sum = 0;
my $cnt = 0;
if ( (exists $refh->{$loc}) ) { # if cov at location
## compute reference coverage at locus
for (my $t = $pos; $t<($pos+$kmer+1); $t++) { # compute avg coverage over window of size K
my $loc2 = "$chr:$t";
if ( (exists $refh->{$loc2}) &&
!( (exists $l2k->{$loc2}) && ( (scalar @{$l2k->{$loc2}->{keys}})!=0 ) ) ) {
$sum += $refh->{$loc2};
$cnt++;
}
}
}
$Rcov=$sum;
if($cnt>0) { $Rcov = floor($sum/$cnt); }
# alternative allele coverage
$Acov = $currMut->{mincov};
# other coverage
$Ocov = 0;
if(exists($l2k->{$loc})) {
foreach my $altKey (@{$l2k->{$loc}->{keys}}) {
if( exists($vars->{$altKey}) && ($altKey ne $currKey) ) {
my $altMut = $vars->{$altKey};
$Ocov += $altMut->{mincov};
}
}
}
$totCov = $Rcov + $Acov + $Ocov;
$currMut->{altcov} = $totCov - $currMut->{mincov};
if($Rcov > 0) { $alleleCnt++; }
if($Acov > 0) { $alleleCnt++; }
if($Ocov > 0) { $alleleCnt++; }
if($alleleCnt > 1) {
$currMut->{zygosity} = "het";
$het_cnt++;
}
else { # alleCnt <= 1
$currMut->{zygosity} = "hom";
$hom_cnt++;
}
# update mutation in db
$vars->{$currKey} = $currMut;
}
$variants_dbm_obj->UnLock;
print STDERR "Num. Homozygous: $hom_cnt\n";
print STDERR "Num. Heterozygous: $het_cnt\n";
}
# assembly a specific region
#####################################################
sub assemblyRegion {
my $num = $_[0];
my $refs_file = $_[1];
my $hash = $_[2];
my $dbm_obj = $_[3];
#my %pathways;
#my %exonshash;
my $PREFIX = "$WORK";
my $log_dir = "$WORK/logs";
if (! -r $log_dir) { mkdir $log_dir; }
my $outfile = "$log_dir/$refs_file.log";
my $command = "$microassembler -R -I -C ";
if($VERBOSE) { $command .= "-v "; }
$command .= "-F $dfs_limit ".
"-k $kmer ".
"-K $maxKmer ".
"-M $maxmismatch ".
"-l $max_tip_len ".
"-c $cov_threshold ".
"-x $covratio ".
"-d $tip_cov_threshold ".
"-b $map_qual ".
"-p $PREFIX ".
"-m bamfile.bam ".
"-g $rgfile ".
" -r $refs_file ".
">& $outfile";
## run microassembler on each fasta file
runCmd("microassembly", "$command");
runCmd("gzip", "gzip -f $outfile");
parseLogFile("$outfile", $num, \%$hash, $dbm_obj);
# remove reference file
runCmd("delete file", "rm $PREFIX/$refs_file");
}
## parse the log file generated by microassembler and populate data structures containing the output statistics
#####################################################
sub parseLogFile {
my $logFile = $_[0];
my $num = $_[1];
my $hash = $_[2];
my $dbm_obj = $_[3];
#my $PATHS = new IO::Uncompress::Gunzip "$logFile.gz" or die "IO::Uncompress::Gunzip failed: $GunzipError\n";
open PATHS, "gunzip -c $logFile.gz|" or die "Can't open $logFile.gz ($!)\n!";
my $curc;
my $curs;
my $cure;
my @pathcov;
my @pathdiff;
my %tmp_hash;
my %tmp_ref_cov;
my %tmp_loc2keys;
my $stats;
$stats->{num_dfs_limit} = 0;
$stats->{num_partial_align} = 0;
$stats->{num_with_cycles} = 0;
$stats->{num_ok} = 0;
$stats->{num_exceptions} = 0;
$stats->{num_repeats} = 0;
my $exon;
my @pathways;
while (<PATHS>)
{
if (/== Processing\s+\d+:\s+([\w\.]+):(\d+)-(\d+)/) {
$curc = $1; # chr
$curs = $2; # start
$cure = $3; # end
}
elsif (/cov: (\S+)/) {
my $cov = $1;
$exon->{start} = $curs;
$exon->{end} = $cure;
$exon->{cov} = $cov;
$exon->{status} = "-";
my $exonkey = sprintf("%s:%d:%d", $curc, $curs, $cure);
#$exonshash->{$exonkey} = $exon;
}
elsif (/ref trim5: (\d+) trim3: (\d+) uncovered: (\d+) ref_dist: (\d+)/) {
$exon->{trim5} = $1;
$exon->{trim3} = $2;
$exon->{uncovered} = $3;
$exon->{ref_dist} = $4;
}
elsif (/allcycles: (\d+)/) {
$exon->{cycles} = $1;
if ($exon->{cycles} >= 1) { $stats->{num_with_cycles} += 1; }
}
elsif (/exception/) {
$exon->{status} = "excp";
$stats->{num_exceptions} += 1;
}
elsif (/Found repeat in/ || /Found cycle in/) {
$exon->{status} = "repeat";
$stats->{num_repeats} += 1;
}
elsif (/WARNING: DFS_LIMIT/) {
$exon->{status} = "dfs";
$stats->{num_dfs_limit} += 1;
}
elsif(/^>sp/) {
$exon->{status} = "partial";
$stats->{num_partial_align} += 1;
}
elsif(/^c':/) {
chomp;
my $covstr = (split(/:/, $_))[1];
@pathcov = split(/ /, $covstr);
}
elsif(/^d':/) {
chomp;
my $diffstr = (split(/:/, $_))[1];
@pathdiff = split(//,$diffstr);
}
elsif (/^>p_([\w\.]+):(\d+)-(\d+)_(\d+)/) {
$exon->{status} = "ok";
my $chr = $1;
my $path;
$path->{chr} = $chr;
$path->{rstart} = $2;
$path->{rend} = $3;
$path->{trim5} = $exon->{trim5};
$path->{trim3} = $exon->{trim3};
$path->{pcnt} = $4;
@{$path->{cov}} = @pathcov;
@{$path->{diff}} = @pathdiff;
chomp;
# >p_9:96439830-96440030_1 cycle: 0 match: 200 snp: 0 ins: 0 del: 1 96439931:T|-|25.7
my ($tag, $cc, $cycle, $mm, $match, $ss, $snp, $ii, $ins, $dd, $del, $info) = split(/\s/, $_, 12);
$path->{cycle} = $cycle;
$path->{match} = $match;
$path->{snp} = $snp;
$path->{ins} = $ins;
$path->{del} = $del;
$path->{info} = $info;
#my $pathkey = sprintf("%s:%d:%d", $chr, $2, $3);
if($path->{cycle} == 0) { ## skip paths with cycles
push @pathways, $path;
}
}
elsif (/FINISHED/) { # end of current region
if((scalar @pathways) > 0) {
if( ($exon->{status} eq "ok") && ($exon->{cycles} == 0) ) { # skip regions hard to assemble and paths with at least one cycle
# at this point the exon status is ok
$stats->{num_ok} += 1;
extractVariantsFromPath(\@pathways, $exon, \%tmp_hash, \%tmp_ref_cov, $stats, \%tmp_loc2keys);
}
undef (@pathways); # free up the memory resources occupied by the array
}
}
}
#copy mutations to database
$dbm_obj->Lock;
foreach my $key (keys %tmp_hash) {
my $mut = $tmp_hash{$key};
if( !(exists $hash->{$key}) ) {
$hash->{$key} = $mut;
}
else { # denovo already in the table
# update the coverage to the highest value found so far
my $mut_old = $hash->{$key};
if ($mut_old->{avgcov} < $mut->{avgcov}) { $mut_old->{avgcov} = $mut->{avgcov}; } # max avg coverage
if ($mut_old->{mincov} < $mut->{mincov}) { $mut_old->{mincov} = $mut->{mincov}; } # max min coverage
$hash->{$key} = $mut_old;
}
}
#update statistics in the database
if( !(exists $hash->{stats}) ) {
$hash->{stats} = $stats;
}
else {
my $old_stats = $hash->{stats};
$old_stats->{num_repeats} += $stats->{num_repeats};
$old_stats->{num_exceptions} += $stats->{num_exceptions};
$old_stats->{num_dfs_limit} += $stats->{num_dfs_limit};
$old_stats->{num_partial_align} += $stats->{num_partial_align};
$old_stats->{num_with_cycles} += $stats->{num_with_cycles};
$old_stats->{num_ok} += $stats->{num_ok};
$hash->{stats} = $old_stats;
}
$dbm_obj->UnLock;
# write reference coverage to file
my $outfile = "$WORK/refcov.$num.txt";
open FILE, "> $outfile" or die "Can't open $outfile ($!)\n";
foreach my $key (keys %tmp_ref_cov) {
print FILE "$key\t$tmp_ref_cov{$key}\n";
}
close FILE;
# write mutations by positions to file
$outfile = "$WORK/loc2keys.$num.txt";
open FILE, "> $outfile" or die "Can't open $outfile ($!)\n";
foreach my $loc (keys %tmp_loc2keys) {
print FILE "$loc";
foreach my $key (@{$tmp_loc2keys{$loc}}) { print FILE "\t$key"; }
print FILE "\n";
}
close FILE;
close PATHS;
}
## extract mutations from path
##########################################
sub extractVariantsFromPath {
my $pathways = $_[0];
my $exon = $_[1];
my $hash = $_[2];
my $ref_cov_hash = $_[3];
my $stats = $_[4];
my $mutbypos_hash = $_[5];
my $outflag = 0;
my $exoncov = $exon->{cov};
my $exonsta = $exon->{status};
my $exoncycle = $exon->{cycles};
$exoncycle = 0 if !defined $exoncycle;
foreach my $path (@{$pathways}) { ## for each path in the region
updateRefCoverage($path, $ref_cov_hash); # updated coverage info for matching reference
my $chr = $path->{chr};
#if ($chr =~ /^chr/) { $chr = substr($chr,3); }
my $info = $path->{info};
$info = "" if !defined $info;
#print "---$info---\n";
my @var = split /\s+/, $info;
foreach my $v (@var) { ## for each variant
my ($pos, $ref, $qry, $avgcov, $mincov, $prevbp) = split /[:|]/, $v;
my $t = "";
my $l = 0;
if ( ($ref !~ /-+/) && ($qry =~ /-+/) ) {$t = "del"; $l = length($qry); } # deletion
elsif( ($ref =~ /-+/) && ($qry !~ /-+/) ) {$t = "ins"; $l = length($qry); } # insertion
elsif( ($ref !~ /-+/) && ($qry !~ /-+/) ) {$t = "snp"; $l = length($qry); } # SNP
$outflag = 1;
my $mut;
$mut->{chr} = $chr;
$mut->{pos} = $pos;
$mut->{type} = $t;
$mut->{len} = $l;
$mut->{ref} = $ref;
$mut->{seq} = $qry;
$mut->{prevbp} = $prevbp;
$mut->{avgcov} = $avgcov;
$mut->{mincov} = $mincov;
$mut->{status} = $exonsta;
$mut->{zygosity} = "na";
$mut->{altcov} = 0;
$mut->{inheritance} = "na";
my $ref_encoded = md5_hex($ref);
my $qry_encoded = md5_hex($qry);
my $key = sprintf("%s:%d:%s:%d:%s:%s", $chr, $pos, $t, $l, $ref_encoded, $qry_encoded);
my $loc = sprintf("%s:%d", $chr, $pos);
#print "$key\n";
## add mutation to the hash table
if( !(exists $hash->{$key}) ) {
$hash->{$key} = $mut;
push @{$mutbypos_hash->{$loc}}, $key;
}
else { # already in the table
# update the coverage to the highest value found so far
my $mut_old = $hash->{$key};
if ($mut_old->{avgcov} < $mut->{avgcov}) {
$mut_old->{avgcov} = $mut->{avgcov}; # max avg coverage
}
if ($mut_old->{mincov} < $mut->{mincov}) {
$mut_old->{mincov} = $mut->{mincov}; # max min coverage
}
$hash->{$key} = $mut_old;
}
}
}
#return 1 if at least one event is detected
return $outflag;
}
## update covearge info for bases matching the reference
##########################################
sub updateRefCoverage {
my $p = $_[0];
my $ref_hash = $_[1];
my @cov = @{$p->{cov}};
my @diff = @{$p->{diff}};
my $chr = $p->{chr};
my $start = $p->{rstart};
my $trim5 = $p->{trim5};
#my $end = $p->{rend};
#my $L = $end-$start+1; # length of the region
my $len = scalar @diff;
my $r = $start+$trim5; # reference index
my $j = 0; # used to scan the coverage array
for (my $i = 0; $i<$len; $i++) {
my $key = "$chr:$r";
if ($diff[$i] eq " ") { # if matching the reference
#if( ($i>0) && ($diff[$i-1] eq " ") ) { # test needed to correctly handle homozygous ins
if( ($i>0) && ($diff[$i-1] ne "^") ) { # test needed to correctly handle homozygous ins
if ( (exists $ref_hash->{$key}) ) { # already in the table
my $cov = $ref_hash->{$key};
if ($cov < $cov[$j]) { # keep max coverage
$cov = $cov[$j];
}
$ref_hash->{$key} = $cov;
}
else { # not in the table
$ref_hash->{$key} = $cov[$j];
}
}
$j++; $r++;
}
elsif ($diff[$i] eq "x") { $j++; $r++; } # snp
elsif ($diff[$i] eq "^") { $j++; } # ins
elsif ($diff[$i] eq "v") { $r++; } # del
}
}
## do the job
##########################################
init();
run();
##########################################
my $time_taken = time - $start_time;
#if($VERBOSE) {
elapsedTime($time_taken, "FindVariants");
#}
diff --git a/FindSomatic.pl b/FindSomatic.pl
old mode 100755
new mode 100644
index 7a42a1e..6e90b4e
--- a/FindSomatic.pl
+++ b/FindSomatic.pl
@@ -508,10 +508,12 @@ sub parseBestState {
my @R = split("", $REF); # reference
my @A = split("", $ALT); # alternative
my $n=0; my $t=1;
- if ( ($A[$n]>0) && ($A[$t]>0) ) { # both in normal and tumor
+ #if ( ($A[$n]>0) && ($A[$t]>0) ) { # both in normal and tumor
+ if ( $A[$n] == $A[$t] ) { # genotypes for ALT identical
$sv->{inheritance} = "normal";
}
- elsif ( ($A[$n]==0) && ($A[$t]>0) ) { # only in tumor
+ #elsif ( ($A[$n]==0) && ($A[$t]>0) ) { # only in tumor
+ elsif ( $A[$n] != $A[$t] ) { # genotypes for ALT different
$sv->{inheritance} = "no";
$sv->{somatic} = "yes";
}
diff --git a/FindVariants.pl b/FindVariants.pl
index 5f0dceb..712095b 100755
--- a/FindVariants.pl
+++ b/FindVariants.pl
@@ -218,7 +218,7 @@ sub run {
}
# decide if loading genome from fasta or use samtools faidx
- if (-e "$bedfile") { $USEFAIDX = 0; }
+ if (-e "$bedfile") { $USEFAIDX = 1; }
elsif ($bedfile =~ m/(\w+):(\d+)-(\d+)/) {
#parse region
my ($chr,$interval) = split(':',$bedfile);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment