laurianesimon/CheckIfReadIsCoveringInterval

## CheckIfReadIsCoveringInterval
#!/usr/bin/perl
use strict;
use warnings;
use Getopt::Long;
use SamUtils;
my $VERSION='0.4';
my $lastModif='2016-Aug-29';

my($limMin,$limSup,$softclip);

# set the options
&GetOptions("h|help"     	=> \&help,
	    "minstart:i"	=> \$limMin,		# beginning of the read must be <= to  limmin
	    "minend:i"	=> \$limSup,		# end of the read must be >= limsup
	    "S|softclip"	=> \$softclip		# activate the checkIfIsCoveringIntervalWithSEnd
);

@ARGV or &help;

# verification of the arg
(-e "$ARGV[0]") or &help2($ARGV[0]);


#########################################################################################################
#						main							#
#########################################################################################################

if(defined($softclip)){
	&mainWithS($ARGV[0]);				# analyze a file
}

else{
	&main($ARGV[0]);				# analyze a file
}

#########################################################################################################
#						SUBS							#
#########################################################################################################

sub main { # read the file

	my $samfile = shift @_;

	my ($totalreads,$coveringreads);

	open (F1, $samfile) || die("opening of the file fails\n");

		while (my$li=<F1>){

			#next if ($li=~/^@/);			# get ride of the header

			chomp $li;

			if ($li=~/^@/){

				print "$li\n";
			}

			else{
				$totalreads++;

				my@tab=split(/\t/,$li);

				my($read,$ref,$start,$cigar,$seq)=($tab[0],$tab[2],$tab[3],$tab[5],$tab[9]);

				if(SamUtils::checkIfIsCoveringInterval($start,$cigar,$limMin,$limSup)){ # (<=$limMin && >=$limSup)

					$coveringreads++;

					print "$li\n";
				}
			}
		}

	close F1;

	print STDERR "total processed reads : $totalreads\n";

	print STDERR "total covering reads :  $coveringreads\n";
}

#########################################################################################################

sub mainWithS { # reqd the file

	my $samfile = shift @_;

	my ($totalreads,$coveringreads);

	open (F1, $samfile) || die("opening of the file fails\n");

		while (my$li=<F1>){

			#next if ($li=~/^@/);			# get ride of the header

			chomp $li;

			if ($li=~/^@/){

				print "$li\n";
			}

			else{
				$totalreads++;

				my@tab=split(/\t/,$li);

				my($read,$ref,$start,$cigar,$seq)=($tab[0],$tab[2],$tab[3],$tab[5],$tab[9]);

				if(SamUtils::checkIfIsCoveringIntervalWithSEnd($start,$cigar,$limMin,$limSup)){ # (<=$limMin && >=$limSup)

					$coveringreads++;

					print "$li\n";
				}
			}
		}

	close F1;

	print STDERR "total processed reads : $totalreads\n";

	print STDERR "total covering reads :  $coveringreads\n";
}

#########################################################################################################

sub help {
	my@tab = split(/\//,$0);
	my $name = pop(@tab);
	print STDERR "\n";
	print STDERR join("-", $0, $VERSION, $lastModif), "\n";
	print STDERR "USAGE: $name [-h|--help]  <sam file>\n";
	exit(1) ;
}

sub help2 {
	my $file = shift @_;
	print STDERR "The file \"$file\" doesn't exist or is not at the specified/current location\n";
	exit(1) ;
}


# end


## CountPolymorphisms_v4
#!/usr/bin/perl
use strict;
use warnings;
use Getopt::Long;
use SamUtils;
my $VERSION='0.2';
my $lastModif='2016-Mar-04';

my $startref=0;
my $endref;
my $meandepth=1;

my $bruit=10;

# set the options
&GetOptions("h|help"     	=> \&help,
	  "startref:i"	=> \$startref,		# position of the beginning of the ref (after the N)
	  "endref:i"	=> \$endref,		# position of the end of the ref (before the N)

	  "meandepth:i"	=> \$meandepth
	);

@ARGV or &help;

# verification of the arg
(-e "$ARGV[0]") or &help2($ARGV[0]);

#########################################################################################################
#						info							#
#########################################################################################################

# count the polymorphims presents for each ref for each position

# arg0 = ref.fasta
# arg1 = file.sam

### apport // v1 => elimination of the background

#########################################################################################################
#						main							#
#########################################################################################################

my@tabname = split(/\//,$0);
my $scriptname = pop(@tabname);

#############################

&main($ARGV[0],$ARGV[1]);				# analyze a file

#########################################################################################################
#						SUBS							#
#########################################################################################################

sub main { # read the file

	my ($refFile,$samfile) = @_;

	# 1/ stockage des refs

	my%ref_seq;
	my%ref_pos_snp_nb;
	my%ref_pos_totalsnps;
	my%totalreadsByRef;
	my%normalbasebypos;
	my%ref_pos_depth;

	open (F1, "$refFile") || die ("$scriptname --sub main : opening fails $refFile");

	my$ref="";
	my$switch=0;

	while (my$li=<F1>) {

		if($li=~/^>/){
			chomp $li;
			$li=~s/^>//;
			$li=~s/^[\W]+//;
			$li=~s/[\W]+$//;
			$li=~s/ .+$//;

			#print STDERR "ref : $li memorised\n";

			$ref=$li;
			$switch=1;
			$ref_seq{$ref}="";
			next;
		}

		elsif ($switch==1) {
			chomp $li;
			$ref_seq{$ref}.=$li;
		}
	}
	close F1;

	foreach my$ref (keys %ref_seq){

		my@tabseq=split(//,$ref_seq{$ref});

		for (my$i=0;$i<scalar(@tabseq);$i++){

			my$pos=$i+1;
			$normalbasebypos{$ref}{$pos}=$tabseq[$i];

			$ref_pos_depth{$ref}{$pos}=0;

			if($pos>=$startref && $pos<=$endref){
				$ref_pos_totalsnps{$ref}{$pos}=0;
			}
		}
	}

	# 2/ reading of the sam file

	open (F2, $samfile) || die("$scriptname --sub main : opening fails $samfile\n");

		print STDERR "file $samfile opened\n";

		while (my$li=<F2>){

			next if ($li=~/^@/);			# get ride of the header

			chomp $li;

			my@tab=split(/\t/,$li);

			my($read,$flag,$ref,$start,$mapq,$cigar,$seq)=($tab[0],$tab[1],$tab[2],$tab[3],$tab[4],$tab[5],$tab[9]);

			$totalreadsByRef{$ref}++;

			my$res_comparaison=SamUtils::compareReadToRef($cigar,$start,$seq,$ref_seq{$ref});

			#print "$res_comparaison\n";

			my$listOfSnps=SamUtils::trimSubseqOfSnplist($res_comparaison,$startref,$endref);

			if(defined($listOfSnps)){

				#print "$listOfSnps\n";

				my@tab_snps= split(/;/,$listOfSnps);

				#print "## @tab_snps\n";

				foreach my$element (@tab_snps){

					my($pos,$snp)=split(/:/,$element);

					$ref_pos_snp_nb{$ref}{$pos}{$snp}++;

					$ref_pos_totalsnps{$ref}{$pos}++;
				}
			}

			### incrementation depth

			my$endpos=SamUtils::returnEndPos($start,$cigar);

			for (my$i=$start;$i<=$endpos;$i++){

				$ref_pos_depth{$ref}{$i}++;
			}

		}
	close F2;


	# # # # # Elimination of background

	foreach my$refc (keys %ref_pos_snp_nb){									# for each ref

		foreach my$posc (keys $ref_pos_snp_nb{$refc}){							# for each position

			$ref_pos_totalsnps{$refc}{$posc}=0;

			foreach my$snpc (keys $ref_pos_snp_nb{$refc}{$posc}){					# for each SNP

				if ($ref_pos_snp_nb{$refc}{$posc}{$snpc}<=$bruit){				# if snp< noize snp=0

					#$ref_pos_snp_nb{$refc}{$posc}{$snpc}=0;

					$ref_pos_depth{$refc}{$posc}-=$ref_pos_snp_nb{$refc}{$posc}{$snpc};

					delete($ref_pos_snp_nb{$refc}{$posc}{$snpc});
				}
				else {									# if not snp = snp-bruit

					$ref_pos_snp_nb{$refc}{$posc}{$snpc}-=$bruit;

					$ref_pos_totalsnps{$refc}{$posc}+=$ref_pos_snp_nb{$refc}{$posc}{$snpc};	# total pos += snp

					$ref_pos_depth{$refc}{$posc}-=$bruit;
				}
			}
		}
	}

	# # # # #

	# 3/ printing of result

	my$outname=$samfile;
	$outname=~s/^.*\///;
	$outname=~s/\.sam$//i;

	foreach my$refc (sort(keys %ref_pos_totalsnps)){										# for each ref

		my$outfile=$refc."_counts_snp_from_".$outname.".txt";

		open (F3, "> $outfile") || die("$scriptname --sub main : pb ouverture $outfile\n");

			print F3 "position\tnumber of reads with snp\ttotal reads\t% snp";
			print F3 "\tnormalised snp\tnormalised total\tnormalised % snp";
			print F3 "\tnormal base\tsnp:number of reads\n";

			foreach my$posc (sort {$a<=>$b}(keys $ref_pos_totalsnps{$refc})){						# pour chaque position

				my$proportion;

				### non normalised

				if($ref_pos_depth{$refc}{$posc}>0){

					$proportion=int($ref_pos_totalsnps{$refc}{$posc}/$ref_pos_depth{$refc}{$posc}*10000)/100;		# calcul de la proportion de reads mutÃ©s
				}
				else{ $proportion=0;}

				my$newpos=$posc-$startref+1;

				print F3 "$newpos\t";

				print F3 "$ref_pos_totalsnps{$refc}{$posc}\t";

				print F3 "$ref_pos_depth{$refc}{$posc}\t";

				print F3 "$proportion\t";


				### normalised


				my$newtotalsnps=int($ref_pos_totalsnps{$refc}{$posc}/$meandepth*100)/100;

				#if($newtotalsnps<1){$newtotalsnps=0;}

				my$newdepth=$ref_pos_depth{$refc}{$posc}/$meandepth;

				if($newdepth<1){$newdepth=0;}

				if($newdepth>0){

					$proportion=int($newtotalsnps/$newdepth*10000)/100;					# calculqtion of the proportion of mutated reads 				}
				else{ $proportion=0;}

				$newdepth=int($newdepth*100)/100;

				print F3 "$newtotalsnps\t";

				print F3 "$newdepth\t";

				print F3 "$proportion\t";


				### other

				print F3 "$normalbasebypos{$refc}{$posc}";

				if(defined($ref_pos_snp_nb{$refc}{$posc})){

					foreach my$snpc (sort(keys $ref_pos_snp_nb{$refc}{$posc})){						# pour chaque snp

						print F3 "\t$snpc:$ref_pos_snp_nb{$refc}{$posc}{$snpc}";
					}
				}


				print F3 "\n";
			}

		close F3;

	}

}


#########################################################################################################

sub help {
	my@tab = split(/\//,$0);
	my $name = pop(@tab);
	print STDERR "\n";
	print STDERR join("-", $0, $VERSION, $lastModif), "\n";
	print STDERR "USAGE: $name [-h|--help] --startref:int --endref:int <ref.fasta> <file.sam>\n\n";
	exit(1) ;
}

sub help2 {
	my $file = shift @_;
	print STDERR "The file \"$file\" doesn't exist or is not at the specified/current location\n";
	exit(1) ;
}


# end


## GetSamWithRefReplacedByTstretchs
#!/usr/bin/perl
use strict;
use warnings;
use Getopt::Long;
use SamUtils;
my $VERSION='0.4';
my $lastModif='2016-Aug-21';

my($refFile,$limMin,$limSup,$size,$endRef,$list,$softclip,$tstretchmode);
my$mismatches=0;

# set the options
&GetOptions("h|help"		=> \&help,
	# required
	  "r|ref:s"		=> \$refFile,
	  "m|mismatches:i"		=> \$mismatches,
	  "minstart:i"		=> \$limMin,		# start of the reads must be <= to  limmin
	  "minend:i"		=> \$limSup,		# end of the read must be >= limsup
	  "size:i"		=> \$size,		# size of the fragment extracted
	  "endref:i"		=> \$endRef,		# position of the end of the ref(before the N)
	  "l|list:s"		=> \$list,		# list of tstretchs
	# optional
	  "t|tstretchmode"		=> \$tstretchmode,		# if we want the name of the T-stretch
	  "S|softclip"		=> \$softclip		# activate  checkIfIsCoveringIntervalWithSEnd
	);

@ARGV or &help;

# verification of arg
(-e "$ARGV[0]") or &help2($ARGV[0]);

#########################################################################################################
#						info							#
#########################################################################################################

# input =  sam file

# -r = ref fasta


#########################################################################################################
#						main							#
#########################################################################################################

my@tab = split(/\//,$0);
my $scriptname = pop(@tab);

my %tstretch_chr;
my %tstretch_name;

&readTstretchsList($list);

if(defined($softclip)){
	&mainWithS($ARGV[0],$refFile,$mismatches);				# analyze a file
}

else{
	&main($ARGV[0],$refFile,$mismatches);					# analyze a file
}


#########################################################################################################
#						SUBS							#
#########################################################################################################

sub main { # read the fichier

	my ($samfile,$refFile,$mismatches) = @_;


	# 1/ stockage of refs

	my%ref_seq;
	my%tstretch_count;
	my$totalreads=0;
	my$coveringreads=0;
	my$okreads=0;

	open (F1, "$refFile") || die ("$scriptname --sub main : openig fails $refFile");

	my$ref="";
	my$switch=0;

	while (my$li=<F1>) {

		if($li=~/^>/){
			chomp $li;
			$li=~s/^>//;
			$li=~s/^[\W]+//;
			$li=~s/[\W]+$//;
			$li=~s/ .+$//;

			print STDERR "ref : $li memorised\n";

			$ref=$li;
			$switch=1;
			$ref_seq{$ref}="";
			next;
		}

		elsif ($switch==1) {
			chomp $li;
			$ref_seq{$ref}.=$li;

			#$switch=0;
		}
	}
	close F1;

	# 2/ reading of the sam file

	open (F2, $samfile) || die("$scriptname --sub main : opening fails $samfile\n");

		print STDERR "file $samfile opened\n";

		while (my$li=<F2>){

			next if ($li=~/^@/);			# get ride of the header

			chomp $li;

			my@tab=split(/\t/,$li);

			#my($read,$flag,$ref,$start,$mapq,$cigar,$seq)=($tab[0],$tab[1],$tab[2],$tab[3],$tab[4],$tab[5],$tab[9]);

			my$read= shift @tab;
			my$flag= shift @tab;
			my$ref= shift @tab;
			my$start= shift @tab;
			my$mapq= shift @tab;
			my$cigar= shift @tab;
			my$other1= shift @tab;
			my$other2= shift @tab;
			my$other3= shift @tab;
			my$other= join ("\t", $other1,$other2,$other3);
			my$seq= shift @tab;
			my$last= join ("\t", @tab);


			if(SamUtils::checkIfIsCoveringInterval($start,$cigar,$limMin,$limSup)){


				my$before=$endRef-$start;	# lenght to pull out before


				# extraire fragment
				my$fragment="youpi";

				$fragment=SamUtils::extractFrag($before,$cigar,$seq,$size);

				if(defined($tstretchmode)){
					if(defined($tstretch_name{$fragment})){
						$ref=$tstretch_name{$fragment};
					}

					else{
						$ref="*";
					}
				}

				else{
					if(defined($tstretch_chr{$fragment})){
						$ref=$tstretch_chr{$fragment};
					}

					else{
						$ref="*";
					}
				}

				print "$read\t$flag\t$ref\t$start\t$mapq\t$cigar\t$other\t$seq\t$last\n";
			}
		}

	close F2;

	print STDERR "end Sam file\n";
}

#########################################################################################################

sub mainWithS { # read the file

	my ($samfile,$refFile,$mismatches) = @_;


	# 1/ stockage of refs

	my%ref_seq;
	my%tstretch_count;
	my$totalreads=0;
	my$coveringreads=0;
	my$okreads=0;

	open (F1, "$refFile") || die ("$scriptname --sub mainWithS : opening fails $refFile");

	my$ref="";
	my$switch=0;

	while (my$li=<F1>) {

		if($li=~/^>/){
			chomp $li;
			$li=~s/^>//;
			$li=~s/^[\W]+//;
			$li=~s/[\W]+$//;
			$li=~s/ .+$//;

			$ref=$li;
			$switch=1;
			$ref_seq{$ref}="";
			next;
		}

		elsif ($switch==1) {
			chomp $li;
			$ref_seq{$ref}.=$li;
		}
	}
	close F1;

	# 2/ reading of sam

	open (F2, $samfile) || die("$scriptname --sub mainWithS : pb ouverture $samfile\n");

		print STDERR "file $samfile opened\n";

		while (my$li=<F2>){

			next if ($li=~/^@/);			# get ride of  the header

			chomp $li;

			my@tab=split(/\t/,$li);

			#my($read,$flag,$ref,$start,$mapq,$cigar,$seq)=($tab[0],$tab[1],$tab[2],$tab[3],$tab[4],$tab[5],$tab[9]);

			my$read= shift @tab;
			my$flag= shift @tab;
			my$ref= shift @tab;
			my$start= shift @tab;
			my$mapq= shift @tab;
			my$cigar= shift @tab;
			my$other1= shift @tab;
			my$other2= shift @tab;
			my$other3= shift @tab;
			my$other= join ("\t", $other1,$other2,$other3);
			my$seq= shift @tab;
			my$last= join ("\t", @tab);


			if(SamUtils::checkIfIsCoveringIntervalWithSEnd($start,$cigar,$limMin,$limSup)){


				my$before=$endRef-$start;	# lenght to pull out before

				# extract fragment
				my$fragment="youpi";

				$fragment=SamUtils::extractFrag($before,$cigar,$seq,$size);


				if(defined($tstretchmode)){
					if(defined($tstretch_name{$fragment})){
						$ref=$tstretch_name{$fragment};
					}

					else{
						$ref="*";
					}
				}

				else{
					if(defined($tstretch_chr{$fragment})){
						$ref=$tstretch_chr{$fragment};
					}

					else{
						$ref="*";
					}
				}


				#print "$read\t$flag\t$ref\t$start\t$mapq\t$cigar\t$seq\n";
				print "$read\t$flag\t$ref\t$start\t$mapq\t$cigar\t$other\t$seq\t$last\n";
			}
		}

	close F2;

	print STDERR "end Sam file\n";
}

#########################################################################################################


sub readTstretchsList { # read thefichier

	my $file = shift @_;

	open (F1, $file) || die("$scriptname --sub readTstretchsList : pb ouverture $file\n");

		while (my$li=<F1>){

			next if ($li=~/^#/);			# get rie of the header

			chomp $li;

			my@tab=split(/\t/,$li);

			$tstretch_chr{$tab[0]}=$tab[1];

			if(scalar(@tab)>2){$tstretch_name{$tab[0]}=$tab[2];}
		}

	close F1;
}

#########################################################################################################

sub help {
	my@tab = split(/\//,$0);
	my $name = pop(@tab);

	print STDERR "\n";
	print STDERR join("-", $0, $VERSION, $lastModif), "\n";
	print STDERR "USAGE: $name\t[options] <sam file>\n";

	print STDERR "\t# required\n";
	print STDERR "\t\t[-r|--ref]<fasta reference file>\n";
	print STDERR "\t\t[-m|--mismatches]<mismatches>\n";
	print STDERR "\t\t[--minstart:int] = minimal start of reads\n";
	print STDERR "\t\t[--minend:int] = minimal end of reads\n";
	print STDERR "\t\t[--size:int] = tstretchs size +1\n";
	print STDERR "\t\t[--endref:int] = end of reference sequence (endpos of tstretch)\n";
	print STDERR "\t\t[-l|--list] <tstretchs tabulated list>\n";

	print STDERR "\t# optional\n";
	print STDERR "\t\t[-h|--help]\n";
	print STDERR "\t\t[-t|--tstretchmode] if tstretchs name wanted for output refs\n";
	print STDERR "\t\t[-S|--softclip] if reads longer than reference\n";

	exit(1) ;
}

sub help2 {
	my $file = shift @_;
	print STDERR "The file \"$file\" doesn't exist or is not at the specified/current location\n";
	exit(1) ;
}


# end


## GrepTstretchsInFastq
#!/usr/bin/perl
use strict;
use warnings;
use Getopt::Long;
my $VERSION='0.2';
my $lastModif='2016-Mar-04';


my($tstretchList,$fastqFile);

# set the options
&GetOptions("h|help"    => \&help,
	    "l|list:s"	=> \$tstretchList,
	    "f|fastq:s"	=> \$fastqFile
	);

#@ARGV or &help;

# verification of the arg
(-e "$tstretchList") or &help2($tstretchList);
(-e "$fastqFile") or &help2($fastqFile);


#########################################################################################################
#						main							#
#########################################################################################################

# goal : make group of reads depending on the T-Stretchs

my %tstretch_chr;				# list of tstretchs to analyze

&readTstretchList($tstretchList);				# analyze a file

foreach my$element (keys %tstretch_chr){

	&searchTstretch($tstretch_chr{$element},$element,$fastqFile);		# analyze @_

}


#########################################################################################################
#						SUBS							#
#########################################################################################################

sub readTstretchList { # reads the t-stretch list format [0]=tstretch [1]=chr

	my $file = shift @_;

	open (F1, $file) || die("file opening failed\n");

		while (my$li=<F1>){

			next if ($li=~/^#/);			# get ride off the header

			chomp $li;

			my@tab=split(/\t/,$li);

			$tstretch_chr{$tab[0]}=$tab[1];		# stock the tstretch

			print STDERR "Tstretch $tab[1] : $tab[0]\n";
		}

	close F1;

	print STDERR "#-#-# Tstretch list memorized #-#-#\n";
}

#########################################################################################################

sub searchTstretch {

	my($chr,$tstretch,$file) = @_;	# => in the order of @_

	my$filename = $file;
	$filename =~ s/.*\///;

	my$fileout = $chr."_".$tstretch."_from_".$filename;

	`grep -B1 -A2 "$tstretch" $file | grep -v "^--\$" > $fileout`;

	#print "$chr : $tstretch =";

	#`grep -c "$tstretch" $file`;

	#print " occurences\n";

	print STDERR "Tstretch $chr : $tstretch done\n";

}

#########################################################################################################

sub help {
	my@tab = split(/\//,$0);
	my $name = pop(@tab);
	print STDERR "\n";
	print STDERR join("-", $0, $VERSION, $lastModif), "\n";
	print STDERR "USAGE: $name [-h|--help]  <all_snp file>\n";
	print STDERR "The snp file made by ListSnp.pl needs to be sliced by refs => one ref at the same time !\n\n";
	exit(1) ;
}

sub help2 {
	my $file = shift @_;
	print STDERR "The file \"$file\" doesn't exist or is not at the specified/current location\n";
	exit(1) ;
}


# end


## ListSnpModules_v2
#!/usr/bin/perl
use strict;
use warnings;
use Getopt::Long;
use SamUtils;
my $VERSION='0.8';
my $lastModif='2016-Aug-21';
my@t = split(/\//,$0);
my $scriptname = pop(@t);

my($liminf,$limsup);

# set the options
&GetOptions("h|help"     => \&help,
	  "liminf:i"   => \$liminf,
	  "limsup:i"   => \$limsup,
	);

# arg0 = ref fasta files
@ARGV or &help;

#########################################################################################################
#						infos							#
#########################################################################################################

# list of all SNP on all reads

#########################################################################################################
#						main							#
#########################################################################################################

# stockage of refs

my%ref_seq;

open (F1, "$ARGV[0]") || die("$scriptname --step open Refs : opening fails $ARGV[0]\n");

my$ref="";
my$switch=0;

while (my$li=<F1>) {

	if($li=~/^>/){
		chomp $li;
		$li=~s/^>//;
		$li=~s/^[\W]+//;
		$li=~s/[\W]+$//;
		$li=~s/ .+$//;

		print STDERR "ref : $li memorised\n";

		$ref=$li;
		$switch=1;
		$ref_seq{$ref}="";
		next;
	}

	elsif ($switch==1) {
		chomp $li;
		$ref_seq{$ref}.=$li;

		#$switch=0;
	}
}
close F1;


#########################################################################################################

my%countSnps;
my$totalreads=0;
my%ref_totalreads;
my%ref_countSnps;

print "#reference\tread\tstart_pos\tend_pos\tSNPs\n";

# while :

while (my$li=<STDIN>) {

	next if ($li=~/^@/);

	chomp $li;

	my@tab=split(/\t/, $li);

	my($read,$flag,$ref,$start,$cigar,$seq)=($tab[0],$tab[1],$tab[2],$tab[3],$tab[5],$tab[9]);

	if ($flag != 12 && $flag != 13 && $flag != 69 && $flag != 77 && $flag != 117 && $flag != 133 && $flag != 141 && $flag != 181 && $flag < 512 || $flag >= 1024){

		$totalreads++;

		$ref_totalreads{$ref}++;

		#if($totalreads/10000==int($totalreads/10000)){print STDERR "$totalreads reads processed\n";}

		if(!(defined($ref_seq{$ref}))){print STDERR "error : no refseq available for \"$ref\"\n";}

		my$res_comparaison;

		$res_comparaison=SamUtils::compareReadToRef($cigar,$start,$seq,$ref_seq{$ref});

		## cut res_comparaison

		if(defined($res_comparaison) && defined($liminf) && defined($limsup)){

			my$newres_comparaison=SamUtils::trimSubseqOfSnplist($res_comparaison,$liminf,$limsup);

			$res_comparaison=$newres_comparaison;

		}

		##

		my$nbSnp=SamUtils::countSnpFromSnplist($res_comparaison);

		$countSnps{$nbSnp}++;

		$ref_countSnps{$ref}{$nbSnp}++;

		my$endpos=SamUtils::returnEndPos($start,$cigar);

		## new start new end
=head
		if(defined($liminf) && defined($limsup)){

			$start=$liminf;

			my$newEnd=$endpos-$liminf;

			if($newEnd>$limsup){$newEnd=$limsup;}

			$endpos=$newEnd;

		}
=cut
		##

		if(defined($res_comparaison)){
			print "$ref\t$read\t$start\t$endpos\t$res_comparaison\n"; # ref ; read ; start ; stop ; SNPs
		}

		else{
			print "$ref\t$read\t$start\t$endpos\t\n"; # ref ; read ; start ; stop ; SNPs
		}

	}

}# end while

my$multiSnp=0;

print STDERR "total reads :\t$totalreads\n";

foreach my$k (sort(keys %countSnps)){

	my$pourcentagec=$countSnps{$k}/$totalreads*100;

	if($k<=2){print STDERR "reads with $k snps :\t$countSnps{$k}\t$pourcentagec%\n";}

	if($k>2){

		$multiSnp+=$countSnps{$k};
	}
}

my$pourcentagec2=$multiSnp/$totalreads*100;

print STDERR "reads with more than 2 snps :\t$multiSnp\t$pourcentagec2%\n";

#########################################################################################################

foreach my$refc (sort(keys %ref_totalreads)){

	my$multiSnp2=0;

	my$totalR=$ref_totalreads{$refc};

	print STDERR "------------------------------------------\n";

	print STDERR "reference : $refc\n";

	print STDERR "total reads :\t$ref_totalreads{$refc}\n";

	foreach my$k (sort(keys $ref_countSnps{$refc})){

		my$pourcentagec=$ref_countSnps{$refc}{$k}/$totalR*100;

		if($k<=2){print STDERR "reads with $k snps :\t$ref_countSnps{$refc}{$k}\t$pourcentagec%\n";}

		if($k>2){

			$multiSnp2+=$ref_countSnps{$refc}{$k};
		}
	}

	my$pourcentagec2b=$multiSnp2/$totalR*100;

	print STDERR "reads with more than 2 snps :\t$multiSnp2\t$pourcentagec2b%\n";

}

#########################################################################################################
#						SUBS							#
#########################################################################################################

sub help {
	print STDERR join("-", $0, $VERSION, $lastModif), "\n";
	print STDERR "USAGE: STDIN \| $0 [-h|--help]  <reference_sequences_used_in_mapping.fasta>\n";
	print STDERR "USAGE: STDIN = cat/awk from .sam file ; filtered by quality\n";
	print STDERR "Example :\n";
	print STDERR "awk '{ if (\$5>=30) { print \$0 ; } }' filein.sam | scripts/ListSnp.pl ref.fasta > fileout.txt\n";
	exit(1) ;
}


## main-script
bash Script
#!/bin/sh
##########################################################################################
# Info #
##########################################################################################
#
# required perl scripts :
#
# GrepTstretchsInFastq.pl
# CheckIfReadIsCoveringInterval.pl
# GetSamWithRefReplacedByTstretchs.pl
# CountPolymorphisms.pl
# ListSnpModules_v2.pl
#
# >> Samutils.pm required !
#
#
##########################################################################################
# Environment variables #
##########################################################################################
refTstretchs=/PATH/liste_tstretchs.csv
sizeTstretchs=16
fastq1=/PATH/File_score10_paired_fused.fastq
fastqName1=File_score10_paired_fused
fastq2=/PATH/File_score10_1P_notFused.fastq
fastqName2=File_score10_1P_notFused
fastq3=/PATH/File_score10_2P_notFused.fastq
fastqName3=File_score10_2P_notFused
fastq4=/PATH/File_score10_1U.fastq
fastqName4=File_score10_1U
fastq5=/PATH/File_score10_2U.fastq
fastqName5=File_score10_2U
##########################################################################################
refFasta=/PATH/reference_transcribed_sequence.fasta
startRef=21
endref=140
endTstretch=156
lim=20
refFasta2=/PATH/reference_transcribed_sequence.fasta
##########################################################################################
# PIPELINE #
##########################################################################################
# A/ Raw reads Grep :
# a)
GrepTstretchsInFastq.pl \
-l $refTstretchs \
-f $fastq1
GrepTstretchsInFastq.pl \
-l $refTstretchs \
-f $fastq2
GrepTstretchsInFastq.pl \
-l $refTstretchs \
-f $fastq3
GrepTstretchsInFastq.pl \
-l $refTstretchs \
-f $fastq4
GrepTstretchsInFastq.pl \
-l $refTstretchs \
-f $fastq5
# b)
fastqName1b=$fastqName1"_withT-stretchs.fastq"
fastqName2b=$fastqName2"_withT-stretchs.fastq"
fastqName3b=$fastqName3"_withT-stretchs.fastq"
fastqName4b=$fastqName4"_withT-stretchs.fastq"
fastqName5b=$fastqName5"_withT-stretchs.fastq"
cat *$fastqName1.fastq > $fastqName1b
cat *$fastqName2.fastq > $fastqName2b
cat *$fastqName3.fastq > $fastqName3b
cat *$fastqName4.fastq > $fastqName4b
cat *$fastqName5.fastq > $fastqName5b
mkdir fastq_par_Tstretchs
mv *$fastqName*.fastq fastq_par_Tstretchs/.
echo "step A done"
##########################################################################################
#B/ Mapping
#a)
bwa mem -t32 -v2 $refFasta $fastqName1b 2> log_mapping.txt | grep -v ^@ \
| awk '{if($2 != 4 && $2 != 12 && $2 != 13 && $2 != 20 && $2 != 69 && $2 != 77 && $2 != 117 && $2 !=
133 && $2 != 141 && $2 != 181 && ($2 < 512 || $2 >= 1024)){print $0;}}' \
> $fastqName1"_vs_transcrit_mapped.sam"
bwa mem -t32 -v2 $refFasta $fastqName2b 2>> log_mapping.txt | grep -v ^@ \
| awk '{if($2 != 4 && $2 != 12 && $2 != 13 && $2 != 20 && $2 != 69 && $2 != 77 && $2 != 117 && $2 !=
133 && $2 != 141 && $2 != 181 && ($2 < 512 || $2 >= 1024)){print $0;}}' \
> $fastqName2"_vs_transcrit_mapped.sam"
bwa mem -t32 -v2 $refFasta $fastqName3b 2>> log_mapping.txt | grep -v ^@ \
| awk '{if($2 != 4 && $2 != 12 && $2 != 13 && $2 != 20 && $2 != 69 && $2 != 77 && $2 != 117 && $2 !=
133 && $2 != 141 && $2 != 181 && ($2 < 512 || $2 >= 1024)){print $0;}}' \
> $fastqName3"_vs_transcrit_mapped.sam"
bwa mem -t32 -v2 $refFasta $fastqName4b 2>> log_mapping.txt | grep -v ^@ \
| awk '{if($2 != 4 && $2 != 12 && $2 != 13 && $2 != 20 && $2 != 69 && $2 != 77 && $2 != 117 && $2 !=
133 && $2 != 141 && $2 != 181 && ($2 < 512 || $2 >= 1024)){print $0;}}' \
> $fastqName4"_vs_transcrit_mapped.sam"
bwa mem -t32 -v2 $refFasta $fastqName5b 2>> log_mapping.txt | grep -v ^@ \
| awk '{if($2 != 4 && $2 != 12 && $2 != 13 && $2 != 20 && $2 != 69 && $2 != 77 && $2 != 117 && $2 !=
133 && $2 != 141 && $2 != 181 && ($2 < 512 || $2 >= 1024)){print $0;}}' \
> $fastqName5"_vs_transcrit_mapped.sam"
#b)
cat *_mapped.sam > all_reads_mapped.sam
echo "step B done"
##########################################################################################
#C/ Selection of reads covering all the transcribed sequence
CheckIfReadIsCoveringInterval.pl --minstart $startRef --minend $endref all_reads_mapped.sam >
all_reads_coveringTranscript_mapped.sam
echo "step C done"
##########################################################################################
#D/ Replace of the mapped references by the corresponding chromosome (give the result by cluster)
#a)
GetSamWithRefReplacedByTstretchs.pl \
-r $refFasta \
-m 0 --minstart $startRef --minend $endTstretch --size $sizeTstretchs --endref $endref \
-l $refTstretchs \
-S all_reads_coveringTranscript_mapped.sam > all_reads_splitByTstretchs_mapped.sam
#(to check)
awk '{print $3;}' all_reads_splitByTstretchs_mapped.sam | sort | uniq -c
#b) elimination of reads « * » in a) = don't have the t-stretch in the right place
awk '{if($3~"chr"){print $0;}}' all_reads_splitByTstretchs_mapped.sam >
all_reads_splitByTstretchs_mapped_filtered.sam
echo "step D done"
##########################################################################################
#E/ Polymorphisms analyze
CountPolymorphisms.pl --startref $startRef --endref $endref $refFasta2
all_reads_splitByTstretchs_mapped_filtered.sam
cat all_reads_splitByTstretchs_mapped_filtered.sam | ListSnpModules_v2.pl --liminf $lim --limsup $endref
$refFasta2 \
> all_reads_splitByTstretchs_polymorphisms.txt 2> all_reads_splitByTstretchs_stats.txt
echo "step E done"
##########################################################################################
##########################################################################################
echo "end"
Script perl CheckIfReadIsCoveringInterval.pl
#!/usr/bin/perl
use strict;
use warnings;
use Getopt::Long;
use SamUtils;
my $VERSION='0.3';
my $lastModif='2016-Jun-09';
my($limMin,$limSup,$softclip);
# set the options
&GetOptions("h|help" => \&help,
"minstart:i" => \$limMin, # read start must be <= limmin
"minend:i" => \$limSup, # read end must be >= limsup
"S|softclip" => \$softclip # if reads mapped exceed the reference sequence
);
@ARGV or &help;
# verification of arg
(-e "$ARGV[0]") or &help2($ARGV[0]);
##########################################################################################
# main #
##########################################################################################
if(defined($softclip)){
&mainWithS($ARGV[0]);
}
else{
&main($ARGV[0]);
}
##########################################################################################
# SUBS #
##########################################################################################
sub main { # read the file
my $samfile = shift @_;
my ($totalreads,$coveringreads); # nb of total reads in sam file, nb of covering reads
open (F1, $samfile) || die("can't open the file\n");
while (my$li=<F1>){
next if ($li=~/^@/); # filter header
$totalreads++;
chomp $li;
my@tab=split(/\t/,$li);
# needed informations
my($read,$ref,$start,$cigar,$seq)=($tab[0],$tab[2],$tab[3],$tab[5],$tab[9]);
# if(<=$limMin && >=$limSup) ==1
if(SamUtils::checkIfIsCoveringInterval($start,$cigar,$limMin,$limSup)){
$coveringreads++; # count read as covering
print "$li\n"; # let read pass through
}
}
close F1;
print STDERR "total processed reads : $totalreads\n";
print STDERR "total covering reads : $coveringreads\n";
}
##########################################################################################
sub mainWithS { # lit le fichier
my $samfile = shift @_;
my ($totalreads,$coveringreads); # nb of total reads in
sam file, nb of covering reads
open (F1, $samfile) || die("pb ouverture fichier\n");
while (my$li=<F1>){
next if ($li=~/^@/); # filter header
$totalreads++;
chomp $li;
my@tab=split(/\t/,$li);
# needed informations
my($read,$ref,$start,$cigar,$seq)=($tab[0],$tab[2],$tab[3],$tab[5],$tab[9]);
# if(<=$limMin && >=$limSup) ==1
if(SamUtils::checkIfIsCoveringIntervalWithSEnd($start,$cigar,$limMin,$limSup)){
$coveringreads++; # count read as covering
print "$li\n"; # let read pass through
}
}
close F1;
print STDERR "total processed reads : $totalreads\n";
print STDERR "total covering reads : $coveringreads\n";
}
##########################################################################################
sub help {
my@tab = split(/\//,$0);
my $name = pop(@tab);
print STDERR "\n";
print STDERR join("-", $0, $VERSION, $lastModif), "\n";
print STDERR "USAGE: $name [-h|--help] --minstart:int --minend:int [-S|--softclip] <sam file>\n";
exit(1) ;
}
sub help2 {
my $file = shift @_;
print STDERR "The file \"$file\" doesn't exist or is not at the specified/current location\n";
exit(1) ;
}
# end
Script perl GetSamWithRefReplacedByTstretchs.pl
#!/usr/bin/perl
use strict;
use warnings;
use Getopt::Long;
use SamUtils;
my $VERSION='0.4';
my $lastModif='2016-Aug-21';
my@tab = split(/\//,$0);
my$scriptname = pop(@tab);
my($refFile,$limMin,$limSup,$size,$endRef,$list,$softclip,$tstretchmode);
my$mismatches=0;
# set the options
&GetOptions("h|help" => \&help,
# required
"r|ref:s" => \$refFile,
"m|mismatches:i" => \$mismatches,
"minstart:i" => \$limMin, # read start must be <= limmin
"minend:i" => \$limSup, # read end must be >= limsup
"size:i"=> \$size, # required extract fragment size
"endref:i" => \$endRef, # end reference position
"l|list:s" => \$list, # tstretchs list
# optional
"t|tstretchmode" => \$tstretchmode, # if we want Tstretch name in new refs
# // normal mode is chr name
"S|softclip" => \$softclip # if wanted fragment is out of reference boundaries
);
@ARGV or &help;
# sam file existence control
(-e "$ARGV[0]") or &help2($ARGV[0]);
##########################################################################################
# info #
##########################################################################################
# input = sam file
# -r = ref.fasta
# -l = tstretchlist.csv/tsv/txt
##########################################################################################
# main #
##########################################################################################
my %tstretch_chr; # store corresponding chr for each T-stretch
my %tstretch_name; # store corresponding T-stretch name for each T-stretch
&readTstretchsList($list);
if(defined($softclip)){
&mainWithS($ARGV[0],$refFile,$mismatches);
}
else{
&main($ARGV[0],$refFile,$mismatches);
}
##########################################################################################
# SUBS #
##########################################################################################
sub main {
my ($samfile,$refFile,$mismatches) = @_;
# 1/ references recording
my%ref_seq; # key=reference name ; value=sequence
my%tstretch_count; # key=Tstretch ; value=counts
my$totalreads=0; # total reads in Sam file
my$coveringreads=0; # total reads covering the interval
open (F1, "$refFile") || die ("$scriptname --sub main : pb opening $refFile");
my$ref="";
my$switch=0;
while (my$li=<F1>) {
if($li=~/^>/){
chomp $li;
$li=~s/^>//;
$li=~s/^[\W]+//;
$li=~s/[\W]+$//;
$li=~s/ .+$//;
print STDERR "ref : $li memorised\n";
$ref=$li;
$switch=1;
$ref_seq{$ref}=""; # store current ref
next;
}
elsif ($switch==1) {
chomp $li;
$ref_seq{$ref}.=$li; # add seq to current ref
}
}
close F1;
# 2/ Sam file processing
open (F2, $samfile) || die("$scriptname --sub main : pb ouverture $samfile\n");
print STDERR "file $samfile opened\n";
while (my$li=<F2>){
next if ($li=~/^@/); # header
chomp $li;
my@tab=split(/\t/,$li);
my$read= shift @tab;
my$flag= shift @tab;
my$ref= shift @tab;
my$start= shift @tab;
my$mapq= shift @tab;
my$cigar= shift @tab;
my$other1= shift @tab;
my$other2= shift @tab;
my$other3= shift @tab;
my$other= join ("\t", $other1,$other2,$other3);
my$seq= shift @tab;
my$last= join ("\t", @tab);
# check if current read is covering the defined interval
if(SamUtils::checkIfIsCoveringInterval($start,$cigar,$limMin,$limSup)){
my$before=$endRef-$start; # read length : start to endref
# extract fragment
my$fragment="";
$fragment=SamUtils::extractFrag($before,$cigar,$seq,$size);
if(defined($tstretchmode)){ # if we want Tstretch name as new refs
if(defined($tstretch_name{$fragment})){
$ref=$tstretch_name{$fragment};
}
else{
$ref="*";
}
}
else{ # if we want chr name as new refs
if(defined($tstretch_chr{$fragment})){
$ref=$tstretch_chr{$fragment};
}
else{
$ref="*";
}
}
print "$read\t$flag\t$ref\t$start\t$mapq\t$cigar\t$other\t$seq\t$last\n";
}
}
close F2;
print STDERR "end Sam file\n";
}
##########################################################################################
sub mainWithS { # lit le fichier
my ($samfile,$refFile,$mismatches) = @_;
# 1/ references recording
my%ref_seq; # key=reference name ; value=sequence
my%tstretch_count; # key=Tstretch ; value=counts
my$totalreads=0; # total reads in Sam file
my$coveringreads=0; # total reads covering the interval
open (F1, "$refFile") || die ("$scriptname --sub main : pb opening $refFile");
my$ref="";
my$switch=0;
while (my$li=<F1>) {
if($li=~/^>/){
chomp $li;
$li=~s/^>//;
$li=~s/^[\W]+//;
$li=~s/[\W]+$//;
$li=~s/ .+$//;
print STDERR "ref : $li memorised\n";
$ref=$li;
$switch=1;
$ref_seq{$ref}=""; # store current ref
next;
}
elsif ($switch==1) {
chomp $li;
$ref_seq{$ref}.=$li; # add seq to current ref
}
}
close F1;
# 2/ Sam file processing
open (F2, $samfile) || die("$scriptname --sub main : pb ouverture $samfile\n");
print STDERR "file $samfile opened\n";
while (my$li=<F2>){
next if ($li=~/^@/); # header
chomp $li;
my@tab=split(/\t/,$li);
my$read= shift @tab;
my$flag= shift @tab;
my$ref= shift @tab;
my$start= shift @tab;
my$mapq= shift @tab;
my$cigar= shift @tab;
my$other1= shift @tab;
my$other2= shift @tab;
my$other3= shift @tab;
my$other= join ("\t", $other1,$other2,$other3);
my$seq= shift @tab;
my$last= join ("\t", @tab);
# check if current read is covering the defined interval
if(SamUtils::checkIfIsCoveringIntervalWithSEnd($start,$cigar,$limMin,$limSup)){
my$before=$endRef-$start; # read length : start to endref
# extract fragment
my$fragment="";
$fragment=SamUtils::extractFrag($before,$cigar,$seq,$size);
if(defined($tstretchmode)){ # if we want Tstretch name as new refs
if(defined($tstretch_name{$fragment})){
$ref=$tstretch_name{$fragment};
}
else{
$ref="*";
}
}
else{ # if we want chr name as new refs
if(defined($tstretch_chr{$fragment})){
$ref=$tstretch_chr{$fragment};
}
else{
$ref="*";
}
}
print "$read\t$flag\t$ref\t$start\t$mapq\t$cigar\t$other\t$seq\t$last\n";
}
}
close F2;
print STDERR "end Sam file\n";
}
##########################################################################################
sub readTstretchsList {
my $file = shift @_;
open (F1, $file) || die("$scriptname --sub readTstretchsList : pb ouverture $file\n");
while (my$li=<F1>){
next if ($li=~/^#/); # for header
chomp $li;
my@tab=split(/\t/,$li);
$tstretch_chr{$tab[0]}=$tab[1]; # store : key=current Tstretch ; val=chr name
if(scalar(@tab)>2){
# store : key=current Tstretch ; val=Tstretch name
$tstretch_name{$tab[0]}=$tab[2];
}
}
close F1;
}
##########################################################################################
sub help {
my@tab = split(/\//,$0);
my $name = pop(@tab);
print STDERR "\n";
print STDERR join("-", $0, $VERSION, $lastModif), "\n";
print STDERR "USAGE: $name\t[options] <sam file>\n";
print STDERR "\t# required\n";
print STDERR "\t\t[-r|--ref]<fasta reference file>\n";
print STDERR "\t\t[-m|--mismatches]<mismatches>\n";
print STDERR "\t\t[--minstart:int] = minimal start of reads\n";
print STDERR "\t\t[--minend:int] = minimal end of reads\n";
print STDERR "\t\t[--size:int] = tstretchs size +1\n";
print STDERR "\t\t[--endref:int] = end of reference sequence (endpos of tstretch)\n";
print STDERR "\t\t[-l|--list] <tstretchs tabulated list>\n";
print STDERR "\t# optional\n";
print STDERR "\t\t[-h|--help]\n";
print STDERR "\t\t[-t|--tstretchmode] if tstretchs name wanted for output refs\n";
print STDERR "\t\t[-S|--softclip] if reads longer than reference\n";
exit(1) ;
}
sub help2 {
my $file = shift @_;
print STDERR "The file \"$file\" doesn't exist or is not at the specified/current location\n";
exit(1) ;
}
# end
Script perl CountPolymorphisms_v4.pl
#!/usr/bin/perl
use strict;
use warnings;
use Getopt::Long;
use SamUtils;
my $VERSION='0.2';
my $lastModif='2016-Aug-21';
my@tabname = split(/\//,$0);
my $scriptname = pop(@tabname);
my $startref=0;
my $endref;
my $meandepth=1;
my $bruit=10;
# set the options
&GetOptions("h|help" => \&help,
"startref:i" => \$startref, # start reference position
"endref:i" => \$endref, # end reference position
"meandepth:i" => \$meandepth # if we want to normalise by sequencing depth
);
@ARGV or &help;
# file existence ctrl
(-e "$ARGV[0]") or &help2($ARGV[0]);
###########################################################################################
# info
###########################################################################################
# count polymorphisms present at each position l
# arg0 = ref.fasta
# arg1 = file.sam
### apport // v1
###########################################################################################
# main
###########################################################################################
&main($ARGV[0],$ARGV[1]);
###########################################################################################
# SUBS
###########################################################################################
sub main {
my ($refFile,$samfile) = @_;
# 1/ references recording
my%ref_seq; # key=reference name ; value=sequence
my%ref_pos_snp_nb; # key=reference name ; key2=position ; key3=snp ;
value=count
my%ref_pos_totalsnps; # key=reference name ; key2=position ; value=total of all snps
my%totalreadsByRef; # key=rererence name ; value= total of correspondig reads
my%normalbasebypos; # key=reference name ; key2=position ; value=base
my%ref_pos_depth; # key=reference name ; key2=position ; value=position
depth
open (F1, "$refFile") || die ("$scriptname --sub main : pb ouverture $refFile");
my$ref="";
my$switch=0;
while (my$li=<F1>) {
if($li=~/^>/){
chomp $li;
$li=~s/^>//;
$li=~s/^[\W]+//;
$li=~s/[\W]+$//;
$li=~s/ .+$//;
$ref=$li;
$switch=1;
$ref_seq{$ref}=""; # store current ref
next;
}
elsif ($switch==1) {
chomp $li;
$ref_seq{$ref}.=$li; # add seq to current ref
}
}
close F1;
foreach my$ref (keys %ref_seq){
my@tabseq=split(//,$ref_seq{$ref});
for (my$i=0;$i<scalar(@tabseq);$i++){
my$pos=$i+1;
$normalbasebypos{$ref}{$pos}=$tabseq[$i];
$ref_pos_depth{$ref}{$pos}=0;
if($pos>=$startref && $pos<=$endref){
$ref_pos_totalsnps{$ref}{$pos}=0;
}
}
}
# 2/ sam file processing
open (F2, $samfile) || die("$scriptname --sub main : pb ouverture $samfile\n");
print STDERR "file $samfile opened\n";
while (my$li=<F2>){
next if ($li=~/^@/);
chomp $li;
my@tab=split(/\t/,$li);
my($read,$flag,$ref,$start,$mapq,$cigar,$seq)=($tab[0],$tab[1],$tab[2],$tab[3],
$tab[4],$tab[5],$tab[9]);
$totalreadsByRef{$ref}++;
# search all existing SNPs
my$res_comparaison=SamUtils::compareReadToRef($cigar,$start,$seq,
$ref_seq{$ref});
# keap only covering interval SNPs
my$listOfSnps=SamUtils::trimSubseqOfSnplist($res_comparaison,$startref,$endref);
if(defined($listOfSnps)){
my@tab_snps= split(/;/,$listOfSnps);
foreach my$element (@tab_snps){
my($pos,$snp)=split(/:/,$element);
$ref_pos_snp_nb{$ref}{$pos}{$snp}++;
$ref_pos_totalsnps{$ref}{$pos}++;
}
}
### depth incrementation
my$endpos=SamUtils::returnEndPos($start,$cigar);
for (my$i=$start;$i<=$endpos;$i++){
$ref_pos_depth{$ref}{$i}++;
}
}
close F2;
# # # # # Sequencing noise elimination
foreach my$refc (keys %ref_pos_snp_nb){
# foreach ref
foreach my$posc (keys $ref_pos_snp_nb{$refc}){
# foreach position
$ref_pos_totalsnps{$refc}{$posc}=0;
foreach my$snpc (keys $ref_pos_snp_nb{$refc}{$posc}){
# foreach existing snp
if ($ref_pos_snp_nb{$refc}{$posc}{$snpc}<=$bruit){
# if snp< bruit snp=0
$ref_pos_depth{$refc}{$posc}-=$ref_pos_snp_nb{$refc}{$posc}
{$snpc};
delete($ref_pos_snp_nb{$refc}{$posc}{$snpc});
}
else {
# else snp = snp-bruit
$ref_pos_snp_nb{$refc}{$posc}{$snpc}-=$bruit;
$ref_pos_totalsnps{$refc}{$posc}+=$ref_pos_snp_nb{$refc}
{$posc}{$snpc}; # total pos += snp
$ref_pos_depth{$refc}{$posc}-=$bruit;
}
}
}
}
# # # # #
# 3/ Results impression
my$outname=$samfile;
$outname=~s/^.*\///;
$outname=~s/\.sam$//i;
foreach my$refc (sort(keys %ref_pos_totalsnps)){
# foreach ref
my$outfile=$refc."_counts_snp_from_".$outname.".txt";
open (F3, "> $outfile") || die("$scriptname --sub main : pb ouverture $outfile\n");
print F3 "position\tnumber of reads with snp\ttotal reads\t% snp";
print F3 "\tnormalised snp\tnormalised total\tnormalised % snp";
print F3 "\tnormal base\tsnp:number of reads\n";
foreach my$posc (sort {$a<=>$b}(keys $ref_pos_totalsnps{$refc})){
# foreach position
my$proportion;
### non normalise
if($ref_pos_depth{$refc}{$posc}>0){
# mutated reads proportion computing
$proportion=int($ref_pos_totalsnps{$refc}{$posc}/
$ref_pos_depth{$refc}{$posc}*10000)/100;
}
else{ $proportion=0;}
my$newpos=$posc-$startref+1;
print F3 "$newpos\t";
print F3 "$ref_pos_totalsnps{$refc}{$posc}\t";
print F3 "$ref_pos_depth{$refc}{$posc}\t";
print F3 "$proportion\t";
### normalise
my$newtotalsnps=int($ref_pos_totalsnps{$refc}{$posc}/
$meandepth*100)/100;
my$newdepth=$ref_pos_depth{$refc}{$posc}/$meandepth;
if($newdepth<1){$newdepth=0;}
if($newdepth>0){
# mutated reads proportion computing
$proportion=int($newtotalsnps/$newdepth*10000)/100;
}
else{ $proportion=0;}
$newdepth=int($newdepth*100)/100;
print F3 "$newtotalsnps\t";
print F3 "$newdepth\t";
print F3 "$proportion\t";
### other
print F3 "$normalbasebypos{$refc}{$posc}";
if(defined($ref_pos_snp_nb{$refc}{$posc})){
foreach my$snpc (sort(keys $ref_pos_snp_nb{$refc}{$posc})){
# foreach existing snp
print F3 "\t$snpc:$ref_pos_snp_nb{$refc}{$posc}{$snpc}";
}
}
print F3 "\n";
}
close F3;
}
}
###########################################################################################
sub help {
my@tab = split(/\//,$0);
my $name = pop(@tab);
print STDERR "\n";
print STDERR join("-", $0, $VERSION, $lastModif), "\n";
print STDERR "USAGE: $name [-h|--help] --startref:int --endref:int <ref.fasta> <file.sam>\n\n";
exit(1) ;
}
sub help2 {
my $file = shift @_;
print STDERR "The file \"$file\" doesn't exist or is not at the specified/current location\n";
exit(1) ;
}
# end
	#!/usr/bin/perl
	use strict;
	use warnings;
	use Getopt::Long;
	use SamUtils;
	my $VERSION='0.4';
	my $lastModif='2016-Aug-29';

	my($limMin,$limSup,$softclip);

	# set the options
	&GetOptions("h\|help" => \&help,
	"minstart:i" => \$limMin, # beginning of the read must be <= to limmin
	"minend:i" => \$limSup, # end of the read must be >= limsup
	"S\|softclip" => \$softclip # activate the checkIfIsCoveringIntervalWithSEnd
	);

	@ARGV or &help;

	# verification of the arg
	(-e "$ARGV[0]") or &help2($ARGV[0]);



	#########################################################################################################
	# main #
	#########################################################################################################

	if(defined($softclip)){
	&mainWithS($ARGV[0]); # analyze a file
	}

	else{
	&main($ARGV[0]); # analyze a file
	}

	#########################################################################################################
	# SUBS #
	#########################################################################################################

	sub main { # read the file

	my $samfile = shift @_;

	my ($totalreads,$coveringreads);

	open (F1, $samfile) \|\| die("opening of the file fails\n");

	while (my$li=<F1>){

	#next if ($li=~/^@/); # get ride of the header

	chomp $li;

	if ($li=~/^@/){

	print "$li\n";
	}

	else{
	$totalreads++;

	my@tab=split(/\t/,$li);

	my($read,$ref,$start,$cigar,$seq)=($tab[0],$tab[2],$tab[3],$tab[5],$tab[9]);

	if(SamUtils::checkIfIsCoveringInterval($start,$cigar,$limMin,$limSup)){ # (<=$limMin && >=$limSup)

	$coveringreads++;

	print "$li\n";
	}
	}
	}

	close F1;

	print STDERR "total processed reads : $totalreads\n";

	print STDERR "total covering reads : $coveringreads\n";
	}

	#########################################################################################################

	sub mainWithS { # reqd the file

	my $samfile = shift @_;

	my ($totalreads,$coveringreads);

	open (F1, $samfile) \|\| die("opening of the file fails\n");

	while (my$li=<F1>){

	#next if ($li=~/^@/); # get ride of the header

	chomp $li;

	if ($li=~/^@/){

	print "$li\n";
	}

	else{
	$totalreads++;

	my@tab=split(/\t/,$li);

	my($read,$ref,$start,$cigar,$seq)=($tab[0],$tab[2],$tab[3],$tab[5],$tab[9]);

	if(SamUtils::checkIfIsCoveringIntervalWithSEnd($start,$cigar,$limMin,$limSup)){ # (<=$limMin && >=$limSup)

	$coveringreads++;

	print "$li\n";
	}
	}
	}

	close F1;

	print STDERR "total processed reads : $totalreads\n";

	print STDERR "total covering reads : $coveringreads\n";
	}

	#########################################################################################################

	sub help {
	my@tab = split(/\//,$0);
	my $name = pop(@tab);
	print STDERR "\n";
	print STDERR join("-", $0, $VERSION, $lastModif), "\n";
	print STDERR "USAGE: $name [-h\|--help] <sam file>\n";
	exit(1) ;
	}

	sub help2 {
	my $file = shift @_;
	print STDERR "The file \"$file\" doesn't exist or is not at the specified/current location\n";
	exit(1) ;
	}






	# end