Last active
October 17, 2017 08:57
-
-
Save laurianesimon/a9fc44aa83305c576e914710cae75f87 to your computer and use it in GitHub Desktop.
extract and count polymophisms ( use the Main script file)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use strict; | |
use warnings; | |
use Getopt::Long; | |
use SamUtils; | |
my $VERSION='0.4'; | |
my $lastModif='2016-Aug-29'; | |
my($limMin,$limSup,$softclip); | |
# set the options | |
&GetOptions("h|help" => \&help, | |
"minstart:i" => \$limMin, # beginning of the read must be <= to limmin | |
"minend:i" => \$limSup, # end of the read must be >= limsup | |
"S|softclip" => \$softclip # activate the checkIfIsCoveringIntervalWithSEnd | |
); | |
@ARGV or &help; | |
# verification of the arg | |
(-e "$ARGV[0]") or &help2($ARGV[0]); | |
######################################################################################################### | |
# main # | |
######################################################################################################### | |
if(defined($softclip)){ | |
&mainWithS($ARGV[0]); # analyze a file | |
} | |
else{ | |
&main($ARGV[0]); # analyze a file | |
} | |
######################################################################################################### | |
# SUBS # | |
######################################################################################################### | |
sub main { # read the file | |
my $samfile = shift @_; | |
my ($totalreads,$coveringreads); | |
open (F1, $samfile) || die("opening of the file fails\n"); | |
while (my$li=<F1>){ | |
#next if ($li=~/^@/); # get ride of the header | |
chomp $li; | |
if ($li=~/^@/){ | |
print "$li\n"; | |
} | |
else{ | |
$totalreads++; | |
my@tab=split(/\t/,$li); | |
my($read,$ref,$start,$cigar,$seq)=($tab[0],$tab[2],$tab[3],$tab[5],$tab[9]); | |
if(SamUtils::checkIfIsCoveringInterval($start,$cigar,$limMin,$limSup)){ # (<=$limMin && >=$limSup) | |
$coveringreads++; | |
print "$li\n"; | |
} | |
} | |
} | |
close F1; | |
print STDERR "total processed reads : $totalreads\n"; | |
print STDERR "total covering reads : $coveringreads\n"; | |
} | |
######################################################################################################### | |
sub mainWithS { # reqd the file | |
my $samfile = shift @_; | |
my ($totalreads,$coveringreads); | |
open (F1, $samfile) || die("opening of the file fails\n"); | |
while (my$li=<F1>){ | |
#next if ($li=~/^@/); # get ride of the header | |
chomp $li; | |
if ($li=~/^@/){ | |
print "$li\n"; | |
} | |
else{ | |
$totalreads++; | |
my@tab=split(/\t/,$li); | |
my($read,$ref,$start,$cigar,$seq)=($tab[0],$tab[2],$tab[3],$tab[5],$tab[9]); | |
if(SamUtils::checkIfIsCoveringIntervalWithSEnd($start,$cigar,$limMin,$limSup)){ # (<=$limMin && >=$limSup) | |
$coveringreads++; | |
print "$li\n"; | |
} | |
} | |
} | |
close F1; | |
print STDERR "total processed reads : $totalreads\n"; | |
print STDERR "total covering reads : $coveringreads\n"; | |
} | |
######################################################################################################### | |
sub help { | |
my@tab = split(/\//,$0); | |
my $name = pop(@tab); | |
print STDERR "\n"; | |
print STDERR join("-", $0, $VERSION, $lastModif), "\n"; | |
print STDERR "USAGE: $name [-h|--help] <sam file>\n"; | |
exit(1) ; | |
} | |
sub help2 { | |
my $file = shift @_; | |
print STDERR "The file \"$file\" doesn't exist or is not at the specified/current location\n"; | |
exit(1) ; | |
} | |
# end | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use strict; | |
use warnings; | |
use Getopt::Long; | |
use SamUtils; | |
my $VERSION='0.2'; | |
my $lastModif='2016-Mar-04'; | |
my $startref=0; | |
my $endref; | |
my $meandepth=1; | |
my $bruit=10; | |
# set the options | |
&GetOptions("h|help" => \&help, | |
"startref:i" => \$startref, # position of the beginning of the ref (after the N) | |
"endref:i" => \$endref, # position of the end of the ref (before the N) | |
"meandepth:i" => \$meandepth | |
); | |
@ARGV or &help; | |
# verification of the arg | |
(-e "$ARGV[0]") or &help2($ARGV[0]); | |
######################################################################################################### | |
# info # | |
######################################################################################################### | |
# count the polymorphims presents for each ref for each position | |
# arg0 = ref.fasta | |
# arg1 = file.sam | |
### apport // v1 => elimination of the background | |
######################################################################################################### | |
# main # | |
######################################################################################################### | |
my@tabname = split(/\//,$0); | |
my $scriptname = pop(@tabname); | |
############################# | |
&main($ARGV[0],$ARGV[1]); # analyze a file | |
######################################################################################################### | |
# SUBS # | |
######################################################################################################### | |
sub main { # read the file | |
my ($refFile,$samfile) = @_; | |
# 1/ stockage des refs | |
my%ref_seq; | |
my%ref_pos_snp_nb; | |
my%ref_pos_totalsnps; | |
my%totalreadsByRef; | |
my%normalbasebypos; | |
my%ref_pos_depth; | |
open (F1, "$refFile") || die ("$scriptname --sub main : opening fails $refFile"); | |
my$ref=""; | |
my$switch=0; | |
while (my$li=<F1>) { | |
if($li=~/^>/){ | |
chomp $li; | |
$li=~s/^>//; | |
$li=~s/^[\W]+//; | |
$li=~s/[\W]+$//; | |
$li=~s/ .+$//; | |
#print STDERR "ref : $li memorised\n"; | |
$ref=$li; | |
$switch=1; | |
$ref_seq{$ref}=""; | |
next; | |
} | |
elsif ($switch==1) { | |
chomp $li; | |
$ref_seq{$ref}.=$li; | |
} | |
} | |
close F1; | |
foreach my$ref (keys %ref_seq){ | |
my@tabseq=split(//,$ref_seq{$ref}); | |
for (my$i=0;$i<scalar(@tabseq);$i++){ | |
my$pos=$i+1; | |
$normalbasebypos{$ref}{$pos}=$tabseq[$i]; | |
$ref_pos_depth{$ref}{$pos}=0; | |
if($pos>=$startref && $pos<=$endref){ | |
$ref_pos_totalsnps{$ref}{$pos}=0; | |
} | |
} | |
} | |
# 2/ reading of the sam file | |
open (F2, $samfile) || die("$scriptname --sub main : opening fails $samfile\n"); | |
print STDERR "file $samfile opened\n"; | |
while (my$li=<F2>){ | |
next if ($li=~/^@/); # get ride of the header | |
chomp $li; | |
my@tab=split(/\t/,$li); | |
my($read,$flag,$ref,$start,$mapq,$cigar,$seq)=($tab[0],$tab[1],$tab[2],$tab[3],$tab[4],$tab[5],$tab[9]); | |
$totalreadsByRef{$ref}++; | |
my$res_comparaison=SamUtils::compareReadToRef($cigar,$start,$seq,$ref_seq{$ref}); | |
#print "$res_comparaison\n"; | |
my$listOfSnps=SamUtils::trimSubseqOfSnplist($res_comparaison,$startref,$endref); | |
if(defined($listOfSnps)){ | |
#print "$listOfSnps\n"; | |
my@tab_snps= split(/;/,$listOfSnps); | |
#print "## @tab_snps\n"; | |
foreach my$element (@tab_snps){ | |
my($pos,$snp)=split(/:/,$element); | |
$ref_pos_snp_nb{$ref}{$pos}{$snp}++; | |
$ref_pos_totalsnps{$ref}{$pos}++; | |
} | |
} | |
### incrementation depth | |
my$endpos=SamUtils::returnEndPos($start,$cigar); | |
for (my$i=$start;$i<=$endpos;$i++){ | |
$ref_pos_depth{$ref}{$i}++; | |
} | |
} | |
close F2; | |
# # # # # Elimination of background | |
foreach my$refc (keys %ref_pos_snp_nb){ # for each ref | |
foreach my$posc (keys $ref_pos_snp_nb{$refc}){ # for each position | |
$ref_pos_totalsnps{$refc}{$posc}=0; | |
foreach my$snpc (keys $ref_pos_snp_nb{$refc}{$posc}){ # for each SNP | |
if ($ref_pos_snp_nb{$refc}{$posc}{$snpc}<=$bruit){ # if snp< noize snp=0 | |
#$ref_pos_snp_nb{$refc}{$posc}{$snpc}=0; | |
$ref_pos_depth{$refc}{$posc}-=$ref_pos_snp_nb{$refc}{$posc}{$snpc}; | |
delete($ref_pos_snp_nb{$refc}{$posc}{$snpc}); | |
} | |
else { # if not snp = snp-bruit | |
$ref_pos_snp_nb{$refc}{$posc}{$snpc}-=$bruit; | |
$ref_pos_totalsnps{$refc}{$posc}+=$ref_pos_snp_nb{$refc}{$posc}{$snpc}; # total pos += snp | |
$ref_pos_depth{$refc}{$posc}-=$bruit; | |
} | |
} | |
} | |
} | |
# # # # # | |
# 3/ printing of result | |
my$outname=$samfile; | |
$outname=~s/^.*\///; | |
$outname=~s/\.sam$//i; | |
foreach my$refc (sort(keys %ref_pos_totalsnps)){ # for each ref | |
my$outfile=$refc."_counts_snp_from_".$outname.".txt"; | |
open (F3, "> $outfile") || die("$scriptname --sub main : pb ouverture $outfile\n"); | |
print F3 "position\tnumber of reads with snp\ttotal reads\t% snp"; | |
print F3 "\tnormalised snp\tnormalised total\tnormalised % snp"; | |
print F3 "\tnormal base\tsnp:number of reads\n"; | |
foreach my$posc (sort {$a<=>$b}(keys $ref_pos_totalsnps{$refc})){ # pour chaque position | |
my$proportion; | |
### non normalised | |
if($ref_pos_depth{$refc}{$posc}>0){ | |
$proportion=int($ref_pos_totalsnps{$refc}{$posc}/$ref_pos_depth{$refc}{$posc}*10000)/100; # calcul de la proportion de reads mutés | |
} | |
else{ $proportion=0;} | |
my$newpos=$posc-$startref+1; | |
print F3 "$newpos\t"; | |
print F3 "$ref_pos_totalsnps{$refc}{$posc}\t"; | |
print F3 "$ref_pos_depth{$refc}{$posc}\t"; | |
print F3 "$proportion\t"; | |
### normalised | |
my$newtotalsnps=int($ref_pos_totalsnps{$refc}{$posc}/$meandepth*100)/100; | |
#if($newtotalsnps<1){$newtotalsnps=0;} | |
my$newdepth=$ref_pos_depth{$refc}{$posc}/$meandepth; | |
if($newdepth<1){$newdepth=0;} | |
if($newdepth>0){ | |
$proportion=int($newtotalsnps/$newdepth*10000)/100; # calculqtion of the proportion of mutated reads } | |
else{ $proportion=0;} | |
$newdepth=int($newdepth*100)/100; | |
print F3 "$newtotalsnps\t"; | |
print F3 "$newdepth\t"; | |
print F3 "$proportion\t"; | |
### other | |
print F3 "$normalbasebypos{$refc}{$posc}"; | |
if(defined($ref_pos_snp_nb{$refc}{$posc})){ | |
foreach my$snpc (sort(keys $ref_pos_snp_nb{$refc}{$posc})){ # pour chaque snp | |
print F3 "\t$snpc:$ref_pos_snp_nb{$refc}{$posc}{$snpc}"; | |
} | |
} | |
print F3 "\n"; | |
} | |
close F3; | |
} | |
} | |
######################################################################################################### | |
sub help { | |
my@tab = split(/\//,$0); | |
my $name = pop(@tab); | |
print STDERR "\n"; | |
print STDERR join("-", $0, $VERSION, $lastModif), "\n"; | |
print STDERR "USAGE: $name [-h|--help] --startref:int --endref:int <ref.fasta> <file.sam>\n\n"; | |
exit(1) ; | |
} | |
sub help2 { | |
my $file = shift @_; | |
print STDERR "The file \"$file\" doesn't exist or is not at the specified/current location\n"; | |
exit(1) ; | |
} | |
# end | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use strict; | |
use warnings; | |
use Getopt::Long; | |
use SamUtils; | |
my $VERSION='0.4'; | |
my $lastModif='2016-Aug-21'; | |
my($refFile,$limMin,$limSup,$size,$endRef,$list,$softclip,$tstretchmode); | |
my$mismatches=0; | |
# set the options | |
&GetOptions("h|help" => \&help, | |
# required | |
"r|ref:s" => \$refFile, | |
"m|mismatches:i" => \$mismatches, | |
"minstart:i" => \$limMin, # start of the reads must be <= to limmin | |
"minend:i" => \$limSup, # end of the read must be >= limsup | |
"size:i" => \$size, # size of the fragment extracted | |
"endref:i" => \$endRef, # position of the end of the ref(before the N) | |
"l|list:s" => \$list, # list of tstretchs | |
# optional | |
"t|tstretchmode" => \$tstretchmode, # if we want the name of the T-stretch | |
"S|softclip" => \$softclip # activate checkIfIsCoveringIntervalWithSEnd | |
); | |
@ARGV or &help; | |
# verification of arg | |
(-e "$ARGV[0]") or &help2($ARGV[0]); | |
######################################################################################################### | |
# info # | |
######################################################################################################### | |
# input = sam file | |
# -r = ref fasta | |
######################################################################################################### | |
# main # | |
######################################################################################################### | |
my@tab = split(/\//,$0); | |
my $scriptname = pop(@tab); | |
my %tstretch_chr; | |
my %tstretch_name; | |
&readTstretchsList($list); | |
if(defined($softclip)){ | |
&mainWithS($ARGV[0],$refFile,$mismatches); # analyze a file | |
} | |
else{ | |
&main($ARGV[0],$refFile,$mismatches); # analyze a file | |
} | |
######################################################################################################### | |
# SUBS # | |
######################################################################################################### | |
sub main { # read the fichier | |
my ($samfile,$refFile,$mismatches) = @_; | |
# 1/ stockage of refs | |
my%ref_seq; | |
my%tstretch_count; | |
my$totalreads=0; | |
my$coveringreads=0; | |
my$okreads=0; | |
open (F1, "$refFile") || die ("$scriptname --sub main : openig fails $refFile"); | |
my$ref=""; | |
my$switch=0; | |
while (my$li=<F1>) { | |
if($li=~/^>/){ | |
chomp $li; | |
$li=~s/^>//; | |
$li=~s/^[\W]+//; | |
$li=~s/[\W]+$//; | |
$li=~s/ .+$//; | |
print STDERR "ref : $li memorised\n"; | |
$ref=$li; | |
$switch=1; | |
$ref_seq{$ref}=""; | |
next; | |
} | |
elsif ($switch==1) { | |
chomp $li; | |
$ref_seq{$ref}.=$li; | |
#$switch=0; | |
} | |
} | |
close F1; | |
# 2/ reading of the sam file | |
open (F2, $samfile) || die("$scriptname --sub main : opening fails $samfile\n"); | |
print STDERR "file $samfile opened\n"; | |
while (my$li=<F2>){ | |
next if ($li=~/^@/); # get ride of the header | |
chomp $li; | |
my@tab=split(/\t/,$li); | |
#my($read,$flag,$ref,$start,$mapq,$cigar,$seq)=($tab[0],$tab[1],$tab[2],$tab[3],$tab[4],$tab[5],$tab[9]); | |
my$read= shift @tab; | |
my$flag= shift @tab; | |
my$ref= shift @tab; | |
my$start= shift @tab; | |
my$mapq= shift @tab; | |
my$cigar= shift @tab; | |
my$other1= shift @tab; | |
my$other2= shift @tab; | |
my$other3= shift @tab; | |
my$other= join ("\t", $other1,$other2,$other3); | |
my$seq= shift @tab; | |
my$last= join ("\t", @tab); | |
if(SamUtils::checkIfIsCoveringInterval($start,$cigar,$limMin,$limSup)){ | |
my$before=$endRef-$start; # lenght to pull out before | |
# extraire fragment | |
my$fragment="youpi"; | |
$fragment=SamUtils::extractFrag($before,$cigar,$seq,$size); | |
if(defined($tstretchmode)){ | |
if(defined($tstretch_name{$fragment})){ | |
$ref=$tstretch_name{$fragment}; | |
} | |
else{ | |
$ref="*"; | |
} | |
} | |
else{ | |
if(defined($tstretch_chr{$fragment})){ | |
$ref=$tstretch_chr{$fragment}; | |
} | |
else{ | |
$ref="*"; | |
} | |
} | |
print "$read\t$flag\t$ref\t$start\t$mapq\t$cigar\t$other\t$seq\t$last\n"; | |
} | |
} | |
close F2; | |
print STDERR "end Sam file\n"; | |
} | |
######################################################################################################### | |
sub mainWithS { # read the file | |
my ($samfile,$refFile,$mismatches) = @_; | |
# 1/ stockage of refs | |
my%ref_seq; | |
my%tstretch_count; | |
my$totalreads=0; | |
my$coveringreads=0; | |
my$okreads=0; | |
open (F1, "$refFile") || die ("$scriptname --sub mainWithS : opening fails $refFile"); | |
my$ref=""; | |
my$switch=0; | |
while (my$li=<F1>) { | |
if($li=~/^>/){ | |
chomp $li; | |
$li=~s/^>//; | |
$li=~s/^[\W]+//; | |
$li=~s/[\W]+$//; | |
$li=~s/ .+$//; | |
$ref=$li; | |
$switch=1; | |
$ref_seq{$ref}=""; | |
next; | |
} | |
elsif ($switch==1) { | |
chomp $li; | |
$ref_seq{$ref}.=$li; | |
} | |
} | |
close F1; | |
# 2/ reading of sam | |
open (F2, $samfile) || die("$scriptname --sub mainWithS : pb ouverture $samfile\n"); | |
print STDERR "file $samfile opened\n"; | |
while (my$li=<F2>){ | |
next if ($li=~/^@/); # get ride of the header | |
chomp $li; | |
my@tab=split(/\t/,$li); | |
#my($read,$flag,$ref,$start,$mapq,$cigar,$seq)=($tab[0],$tab[1],$tab[2],$tab[3],$tab[4],$tab[5],$tab[9]); | |
my$read= shift @tab; | |
my$flag= shift @tab; | |
my$ref= shift @tab; | |
my$start= shift @tab; | |
my$mapq= shift @tab; | |
my$cigar= shift @tab; | |
my$other1= shift @tab; | |
my$other2= shift @tab; | |
my$other3= shift @tab; | |
my$other= join ("\t", $other1,$other2,$other3); | |
my$seq= shift @tab; | |
my$last= join ("\t", @tab); | |
if(SamUtils::checkIfIsCoveringIntervalWithSEnd($start,$cigar,$limMin,$limSup)){ | |
my$before=$endRef-$start; # lenght to pull out before | |
# extract fragment | |
my$fragment="youpi"; | |
$fragment=SamUtils::extractFrag($before,$cigar,$seq,$size); | |
if(defined($tstretchmode)){ | |
if(defined($tstretch_name{$fragment})){ | |
$ref=$tstretch_name{$fragment}; | |
} | |
else{ | |
$ref="*"; | |
} | |
} | |
else{ | |
if(defined($tstretch_chr{$fragment})){ | |
$ref=$tstretch_chr{$fragment}; | |
} | |
else{ | |
$ref="*"; | |
} | |
} | |
#print "$read\t$flag\t$ref\t$start\t$mapq\t$cigar\t$seq\n"; | |
print "$read\t$flag\t$ref\t$start\t$mapq\t$cigar\t$other\t$seq\t$last\n"; | |
} | |
} | |
close F2; | |
print STDERR "end Sam file\n"; | |
} | |
######################################################################################################### | |
sub readTstretchsList { # read thefichier | |
my $file = shift @_; | |
open (F1, $file) || die("$scriptname --sub readTstretchsList : pb ouverture $file\n"); | |
while (my$li=<F1>){ | |
next if ($li=~/^#/); # get rie of the header | |
chomp $li; | |
my@tab=split(/\t/,$li); | |
$tstretch_chr{$tab[0]}=$tab[1]; | |
if(scalar(@tab)>2){$tstretch_name{$tab[0]}=$tab[2];} | |
} | |
close F1; | |
} | |
######################################################################################################### | |
sub help { | |
my@tab = split(/\//,$0); | |
my $name = pop(@tab); | |
print STDERR "\n"; | |
print STDERR join("-", $0, $VERSION, $lastModif), "\n"; | |
print STDERR "USAGE: $name\t[options] <sam file>\n"; | |
print STDERR "\t# required\n"; | |
print STDERR "\t\t[-r|--ref]<fasta reference file>\n"; | |
print STDERR "\t\t[-m|--mismatches]<mismatches>\n"; | |
print STDERR "\t\t[--minstart:int] = minimal start of reads\n"; | |
print STDERR "\t\t[--minend:int] = minimal end of reads\n"; | |
print STDERR "\t\t[--size:int] = tstretchs size +1\n"; | |
print STDERR "\t\t[--endref:int] = end of reference sequence (endpos of tstretch)\n"; | |
print STDERR "\t\t[-l|--list] <tstretchs tabulated list>\n"; | |
print STDERR "\t# optional\n"; | |
print STDERR "\t\t[-h|--help]\n"; | |
print STDERR "\t\t[-t|--tstretchmode] if tstretchs name wanted for output refs\n"; | |
print STDERR "\t\t[-S|--softclip] if reads longer than reference\n"; | |
exit(1) ; | |
} | |
sub help2 { | |
my $file = shift @_; | |
print STDERR "The file \"$file\" doesn't exist or is not at the specified/current location\n"; | |
exit(1) ; | |
} | |
# end | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use strict; | |
use warnings; | |
use Getopt::Long; | |
my $VERSION='0.2'; | |
my $lastModif='2016-Mar-04'; | |
my($tstretchList,$fastqFile); | |
# set the options | |
&GetOptions("h|help" => \&help, | |
"l|list:s" => \$tstretchList, | |
"f|fastq:s" => \$fastqFile | |
); | |
#@ARGV or &help; | |
# verification of the arg | |
(-e "$tstretchList") or &help2($tstretchList); | |
(-e "$fastqFile") or &help2($fastqFile); | |
######################################################################################################### | |
# main # | |
######################################################################################################### | |
# goal : make group of reads depending on the T-Stretchs | |
my %tstretch_chr; # list of tstretchs to analyze | |
&readTstretchList($tstretchList); # analyze a file | |
foreach my$element (keys %tstretch_chr){ | |
&searchTstretch($tstretch_chr{$element},$element,$fastqFile); # analyze @_ | |
} | |
######################################################################################################### | |
# SUBS # | |
######################################################################################################### | |
sub readTstretchList { # reads the t-stretch list format [0]=tstretch [1]=chr | |
my $file = shift @_; | |
open (F1, $file) || die("file opening failed\n"); | |
while (my$li=<F1>){ | |
next if ($li=~/^#/); # get ride off the header | |
chomp $li; | |
my@tab=split(/\t/,$li); | |
$tstretch_chr{$tab[0]}=$tab[1]; # stock the tstretch | |
print STDERR "Tstretch $tab[1] : $tab[0]\n"; | |
} | |
close F1; | |
print STDERR "#-#-# Tstretch list memorized #-#-#\n"; | |
} | |
######################################################################################################### | |
sub searchTstretch { | |
my($chr,$tstretch,$file) = @_; # => in the order of @_ | |
my$filename = $file; | |
$filename =~ s/.*\///; | |
my$fileout = $chr."_".$tstretch."_from_".$filename; | |
`grep -B1 -A2 "$tstretch" $file | grep -v "^--\$" > $fileout`; | |
#print "$chr : $tstretch ="; | |
#`grep -c "$tstretch" $file`; | |
#print " occurences\n"; | |
print STDERR "Tstretch $chr : $tstretch done\n"; | |
} | |
######################################################################################################### | |
sub help { | |
my@tab = split(/\//,$0); | |
my $name = pop(@tab); | |
print STDERR "\n"; | |
print STDERR join("-", $0, $VERSION, $lastModif), "\n"; | |
print STDERR "USAGE: $name [-h|--help] <all_snp file>\n"; | |
print STDERR "The snp file made by ListSnp.pl needs to be sliced by refs => one ref at the same time !\n\n"; | |
exit(1) ; | |
} | |
sub help2 { | |
my $file = shift @_; | |
print STDERR "The file \"$file\" doesn't exist or is not at the specified/current location\n"; | |
exit(1) ; | |
} | |
# end | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use strict; | |
use warnings; | |
use Getopt::Long; | |
use SamUtils; | |
my $VERSION='0.8'; | |
my $lastModif='2016-Aug-21'; | |
my@t = split(/\//,$0); | |
my $scriptname = pop(@t); | |
my($liminf,$limsup); | |
# set the options | |
&GetOptions("h|help" => \&help, | |
"liminf:i" => \$liminf, | |
"limsup:i" => \$limsup, | |
); | |
# arg0 = ref fasta files | |
@ARGV or &help; | |
######################################################################################################### | |
# infos # | |
######################################################################################################### | |
# list of all SNP on all reads | |
######################################################################################################### | |
# main # | |
######################################################################################################### | |
# stockage of refs | |
my%ref_seq; | |
open (F1, "$ARGV[0]") || die("$scriptname --step open Refs : opening fails $ARGV[0]\n"); | |
my$ref=""; | |
my$switch=0; | |
while (my$li=<F1>) { | |
if($li=~/^>/){ | |
chomp $li; | |
$li=~s/^>//; | |
$li=~s/^[\W]+//; | |
$li=~s/[\W]+$//; | |
$li=~s/ .+$//; | |
print STDERR "ref : $li memorised\n"; | |
$ref=$li; | |
$switch=1; | |
$ref_seq{$ref}=""; | |
next; | |
} | |
elsif ($switch==1) { | |
chomp $li; | |
$ref_seq{$ref}.=$li; | |
#$switch=0; | |
} | |
} | |
close F1; | |
######################################################################################################### | |
my%countSnps; | |
my$totalreads=0; | |
my%ref_totalreads; | |
my%ref_countSnps; | |
print "#reference\tread\tstart_pos\tend_pos\tSNPs\n"; | |
# while : | |
while (my$li=<STDIN>) { | |
next if ($li=~/^@/); | |
chomp $li; | |
my@tab=split(/\t/, $li); | |
my($read,$flag,$ref,$start,$cigar,$seq)=($tab[0],$tab[1],$tab[2],$tab[3],$tab[5],$tab[9]); | |
if ($flag != 12 && $flag != 13 && $flag != 69 && $flag != 77 && $flag != 117 && $flag != 133 && $flag != 141 && $flag != 181 && $flag < 512 || $flag >= 1024){ | |
$totalreads++; | |
$ref_totalreads{$ref}++; | |
#if($totalreads/10000==int($totalreads/10000)){print STDERR "$totalreads reads processed\n";} | |
if(!(defined($ref_seq{$ref}))){print STDERR "error : no refseq available for \"$ref\"\n";} | |
my$res_comparaison; | |
$res_comparaison=SamUtils::compareReadToRef($cigar,$start,$seq,$ref_seq{$ref}); | |
## cut res_comparaison | |
if(defined($res_comparaison) && defined($liminf) && defined($limsup)){ | |
my$newres_comparaison=SamUtils::trimSubseqOfSnplist($res_comparaison,$liminf,$limsup); | |
$res_comparaison=$newres_comparaison; | |
} | |
## | |
my$nbSnp=SamUtils::countSnpFromSnplist($res_comparaison); | |
$countSnps{$nbSnp}++; | |
$ref_countSnps{$ref}{$nbSnp}++; | |
my$endpos=SamUtils::returnEndPos($start,$cigar); | |
## new start new end | |
=head | |
if(defined($liminf) && defined($limsup)){ | |
$start=$liminf; | |
my$newEnd=$endpos-$liminf; | |
if($newEnd>$limsup){$newEnd=$limsup;} | |
$endpos=$newEnd; | |
} | |
=cut | |
## | |
if(defined($res_comparaison)){ | |
print "$ref\t$read\t$start\t$endpos\t$res_comparaison\n"; # ref ; read ; start ; stop ; SNPs | |
} | |
else{ | |
print "$ref\t$read\t$start\t$endpos\t\n"; # ref ; read ; start ; stop ; SNPs | |
} | |
} | |
}# end while | |
my$multiSnp=0; | |
print STDERR "total reads :\t$totalreads\n"; | |
foreach my$k (sort(keys %countSnps)){ | |
my$pourcentagec=$countSnps{$k}/$totalreads*100; | |
if($k<=2){print STDERR "reads with $k snps :\t$countSnps{$k}\t$pourcentagec%\n";} | |
if($k>2){ | |
$multiSnp+=$countSnps{$k}; | |
} | |
} | |
my$pourcentagec2=$multiSnp/$totalreads*100; | |
print STDERR "reads with more than 2 snps :\t$multiSnp\t$pourcentagec2%\n"; | |
######################################################################################################### | |
foreach my$refc (sort(keys %ref_totalreads)){ | |
my$multiSnp2=0; | |
my$totalR=$ref_totalreads{$refc}; | |
print STDERR "------------------------------------------\n"; | |
print STDERR "reference : $refc\n"; | |
print STDERR "total reads :\t$ref_totalreads{$refc}\n"; | |
foreach my$k (sort(keys $ref_countSnps{$refc})){ | |
my$pourcentagec=$ref_countSnps{$refc}{$k}/$totalR*100; | |
if($k<=2){print STDERR "reads with $k snps :\t$ref_countSnps{$refc}{$k}\t$pourcentagec%\n";} | |
if($k>2){ | |
$multiSnp2+=$ref_countSnps{$refc}{$k}; | |
} | |
} | |
my$pourcentagec2b=$multiSnp2/$totalR*100; | |
print STDERR "reads with more than 2 snps :\t$multiSnp2\t$pourcentagec2b%\n"; | |
} | |
######################################################################################################### | |
# SUBS # | |
######################################################################################################### | |
sub help { | |
print STDERR join("-", $0, $VERSION, $lastModif), "\n"; | |
print STDERR "USAGE: STDIN \| $0 [-h|--help] <reference_sequences_used_in_mapping.fasta>\n"; | |
print STDERR "USAGE: STDIN = cat/awk from .sam file ; filtered by quality\n"; | |
print STDERR "Example :\n"; | |
print STDERR "awk '{ if (\$5>=30) { print \$0 ; } }' filein.sam | scripts/ListSnp.pl ref.fasta > fileout.txt\n"; | |
exit(1) ; | |
} | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
bash Script | |
#!/bin/sh | |
########################################################################################## | |
# Info # | |
########################################################################################## | |
# | |
# required perl scripts : | |
# | |
# GrepTstretchsInFastq.pl | |
# CheckIfReadIsCoveringInterval.pl | |
# GetSamWithRefReplacedByTstretchs.pl | |
# CountPolymorphisms.pl | |
# ListSnpModules_v2.pl | |
# | |
# >> Samutils.pm required ! | |
# | |
# | |
########################################################################################## | |
# Environment variables # | |
########################################################################################## | |
refTstretchs=/PATH/liste_tstretchs.csv | |
sizeTstretchs=16 | |
fastq1=/PATH/File_score10_paired_fused.fastq | |
fastqName1=File_score10_paired_fused | |
fastq2=/PATH/File_score10_1P_notFused.fastq | |
fastqName2=File_score10_1P_notFused | |
fastq3=/PATH/File_score10_2P_notFused.fastq | |
fastqName3=File_score10_2P_notFused | |
fastq4=/PATH/File_score10_1U.fastq | |
fastqName4=File_score10_1U | |
fastq5=/PATH/File_score10_2U.fastq | |
fastqName5=File_score10_2U | |
########################################################################################## | |
refFasta=/PATH/reference_transcribed_sequence.fasta | |
startRef=21 | |
endref=140 | |
endTstretch=156 | |
lim=20 | |
refFasta2=/PATH/reference_transcribed_sequence.fasta | |
########################################################################################## | |
# PIPELINE # | |
########################################################################################## | |
# A/ Raw reads Grep : | |
# a) | |
GrepTstretchsInFastq.pl \ | |
-l $refTstretchs \ | |
-f $fastq1 | |
GrepTstretchsInFastq.pl \ | |
-l $refTstretchs \ | |
-f $fastq2 | |
GrepTstretchsInFastq.pl \ | |
-l $refTstretchs \ | |
-f $fastq3 | |
GrepTstretchsInFastq.pl \ | |
-l $refTstretchs \ | |
-f $fastq4 | |
GrepTstretchsInFastq.pl \ | |
-l $refTstretchs \ | |
-f $fastq5 | |
# b) | |
fastqName1b=$fastqName1"_withT-stretchs.fastq" | |
fastqName2b=$fastqName2"_withT-stretchs.fastq" | |
fastqName3b=$fastqName3"_withT-stretchs.fastq" | |
fastqName4b=$fastqName4"_withT-stretchs.fastq" | |
fastqName5b=$fastqName5"_withT-stretchs.fastq" | |
cat *$fastqName1.fastq > $fastqName1b | |
cat *$fastqName2.fastq > $fastqName2b | |
cat *$fastqName3.fastq > $fastqName3b | |
cat *$fastqName4.fastq > $fastqName4b | |
cat *$fastqName5.fastq > $fastqName5b | |
mkdir fastq_par_Tstretchs | |
mv *$fastqName*.fastq fastq_par_Tstretchs/. | |
echo "step A done" | |
########################################################################################## | |
#B/ Mapping | |
#a) | |
bwa mem -t32 -v2 $refFasta $fastqName1b 2> log_mapping.txt | grep -v ^@ \ | |
| awk '{if($2 != 4 && $2 != 12 && $2 != 13 && $2 != 20 && $2 != 69 && $2 != 77 && $2 != 117 && $2 != | |
133 && $2 != 141 && $2 != 181 && ($2 < 512 || $2 >= 1024)){print $0;}}' \ | |
> $fastqName1"_vs_transcrit_mapped.sam" | |
bwa mem -t32 -v2 $refFasta $fastqName2b 2>> log_mapping.txt | grep -v ^@ \ | |
| awk '{if($2 != 4 && $2 != 12 && $2 != 13 && $2 != 20 && $2 != 69 && $2 != 77 && $2 != 117 && $2 != | |
133 && $2 != 141 && $2 != 181 && ($2 < 512 || $2 >= 1024)){print $0;}}' \ | |
> $fastqName2"_vs_transcrit_mapped.sam" | |
bwa mem -t32 -v2 $refFasta $fastqName3b 2>> log_mapping.txt | grep -v ^@ \ | |
| awk '{if($2 != 4 && $2 != 12 && $2 != 13 && $2 != 20 && $2 != 69 && $2 != 77 && $2 != 117 && $2 != | |
133 && $2 != 141 && $2 != 181 && ($2 < 512 || $2 >= 1024)){print $0;}}' \ | |
> $fastqName3"_vs_transcrit_mapped.sam" | |
bwa mem -t32 -v2 $refFasta $fastqName4b 2>> log_mapping.txt | grep -v ^@ \ | |
| awk '{if($2 != 4 && $2 != 12 && $2 != 13 && $2 != 20 && $2 != 69 && $2 != 77 && $2 != 117 && $2 != | |
133 && $2 != 141 && $2 != 181 && ($2 < 512 || $2 >= 1024)){print $0;}}' \ | |
> $fastqName4"_vs_transcrit_mapped.sam" | |
bwa mem -t32 -v2 $refFasta $fastqName5b 2>> log_mapping.txt | grep -v ^@ \ | |
| awk '{if($2 != 4 && $2 != 12 && $2 != 13 && $2 != 20 && $2 != 69 && $2 != 77 && $2 != 117 && $2 != | |
133 && $2 != 141 && $2 != 181 && ($2 < 512 || $2 >= 1024)){print $0;}}' \ | |
> $fastqName5"_vs_transcrit_mapped.sam" | |
#b) | |
cat *_mapped.sam > all_reads_mapped.sam | |
echo "step B done" | |
########################################################################################## | |
#C/ Selection of reads covering all the transcribed sequence | |
CheckIfReadIsCoveringInterval.pl --minstart $startRef --minend $endref all_reads_mapped.sam > | |
all_reads_coveringTranscript_mapped.sam | |
echo "step C done" | |
########################################################################################## | |
#D/ Replace of the mapped references by the corresponding chromosome (give the result by cluster) | |
#a) | |
GetSamWithRefReplacedByTstretchs.pl \ | |
-r $refFasta \ | |
-m 0 --minstart $startRef --minend $endTstretch --size $sizeTstretchs --endref $endref \ | |
-l $refTstretchs \ | |
-S all_reads_coveringTranscript_mapped.sam > all_reads_splitByTstretchs_mapped.sam | |
#(to check) | |
awk '{print $3;}' all_reads_splitByTstretchs_mapped.sam | sort | uniq -c | |
#b) elimination of reads « * » in a) = don't have the t-stretch in the right place | |
awk '{if($3~"chr"){print $0;}}' all_reads_splitByTstretchs_mapped.sam > | |
all_reads_splitByTstretchs_mapped_filtered.sam | |
echo "step D done" | |
########################################################################################## | |
#E/ Polymorphisms analyze | |
CountPolymorphisms.pl --startref $startRef --endref $endref $refFasta2 | |
all_reads_splitByTstretchs_mapped_filtered.sam | |
cat all_reads_splitByTstretchs_mapped_filtered.sam | ListSnpModules_v2.pl --liminf $lim --limsup $endref | |
$refFasta2 \ | |
> all_reads_splitByTstretchs_polymorphisms.txt 2> all_reads_splitByTstretchs_stats.txt | |
echo "step E done" | |
########################################################################################## | |
########################################################################################## | |
echo "end" | |
Script perl CheckIfReadIsCoveringInterval.pl | |
#!/usr/bin/perl | |
use strict; | |
use warnings; | |
use Getopt::Long; | |
use SamUtils; | |
my $VERSION='0.3'; | |
my $lastModif='2016-Jun-09'; | |
my($limMin,$limSup,$softclip); | |
# set the options | |
&GetOptions("h|help" => \&help, | |
"minstart:i" => \$limMin, # read start must be <= limmin | |
"minend:i" => \$limSup, # read end must be >= limsup | |
"S|softclip" => \$softclip # if reads mapped exceed the reference sequence | |
); | |
@ARGV or &help; | |
# verification of arg | |
(-e "$ARGV[0]") or &help2($ARGV[0]); | |
########################################################################################## | |
# main # | |
########################################################################################## | |
if(defined($softclip)){ | |
&mainWithS($ARGV[0]); | |
} | |
else{ | |
&main($ARGV[0]); | |
} | |
########################################################################################## | |
# SUBS # | |
########################################################################################## | |
sub main { # read the file | |
my $samfile = shift @_; | |
my ($totalreads,$coveringreads); # nb of total reads in sam file, nb of covering reads | |
open (F1, $samfile) || die("can't open the file\n"); | |
while (my$li=<F1>){ | |
next if ($li=~/^@/); # filter header | |
$totalreads++; | |
chomp $li; | |
my@tab=split(/\t/,$li); | |
# needed informations | |
my($read,$ref,$start,$cigar,$seq)=($tab[0],$tab[2],$tab[3],$tab[5],$tab[9]); | |
# if(<=$limMin && >=$limSup) ==1 | |
if(SamUtils::checkIfIsCoveringInterval($start,$cigar,$limMin,$limSup)){ | |
$coveringreads++; # count read as covering | |
print "$li\n"; # let read pass through | |
} | |
} | |
close F1; | |
print STDERR "total processed reads : $totalreads\n"; | |
print STDERR "total covering reads : $coveringreads\n"; | |
} | |
########################################################################################## | |
sub mainWithS { # lit le fichier | |
my $samfile = shift @_; | |
my ($totalreads,$coveringreads); # nb of total reads in | |
sam file, nb of covering reads | |
open (F1, $samfile) || die("pb ouverture fichier\n"); | |
while (my$li=<F1>){ | |
next if ($li=~/^@/); # filter header | |
$totalreads++; | |
chomp $li; | |
my@tab=split(/\t/,$li); | |
# needed informations | |
my($read,$ref,$start,$cigar,$seq)=($tab[0],$tab[2],$tab[3],$tab[5],$tab[9]); | |
# if(<=$limMin && >=$limSup) ==1 | |
if(SamUtils::checkIfIsCoveringIntervalWithSEnd($start,$cigar,$limMin,$limSup)){ | |
$coveringreads++; # count read as covering | |
print "$li\n"; # let read pass through | |
} | |
} | |
close F1; | |
print STDERR "total processed reads : $totalreads\n"; | |
print STDERR "total covering reads : $coveringreads\n"; | |
} | |
########################################################################################## | |
sub help { | |
my@tab = split(/\//,$0); | |
my $name = pop(@tab); | |
print STDERR "\n"; | |
print STDERR join("-", $0, $VERSION, $lastModif), "\n"; | |
print STDERR "USAGE: $name [-h|--help] --minstart:int --minend:int [-S|--softclip] <sam file>\n"; | |
exit(1) ; | |
} | |
sub help2 { | |
my $file = shift @_; | |
print STDERR "The file \"$file\" doesn't exist or is not at the specified/current location\n"; | |
exit(1) ; | |
} | |
# end | |
Script perl GetSamWithRefReplacedByTstretchs.pl | |
#!/usr/bin/perl | |
use strict; | |
use warnings; | |
use Getopt::Long; | |
use SamUtils; | |
my $VERSION='0.4'; | |
my $lastModif='2016-Aug-21'; | |
my@tab = split(/\//,$0); | |
my$scriptname = pop(@tab); | |
my($refFile,$limMin,$limSup,$size,$endRef,$list,$softclip,$tstretchmode); | |
my$mismatches=0; | |
# set the options | |
&GetOptions("h|help" => \&help, | |
# required | |
"r|ref:s" => \$refFile, | |
"m|mismatches:i" => \$mismatches, | |
"minstart:i" => \$limMin, # read start must be <= limmin | |
"minend:i" => \$limSup, # read end must be >= limsup | |
"size:i"=> \$size, # required extract fragment size | |
"endref:i" => \$endRef, # end reference position | |
"l|list:s" => \$list, # tstretchs list | |
# optional | |
"t|tstretchmode" => \$tstretchmode, # if we want Tstretch name in new refs | |
# // normal mode is chr name | |
"S|softclip" => \$softclip # if wanted fragment is out of reference boundaries | |
); | |
@ARGV or &help; | |
# sam file existence control | |
(-e "$ARGV[0]") or &help2($ARGV[0]); | |
########################################################################################## | |
# info # | |
########################################################################################## | |
# input = sam file | |
# -r = ref.fasta | |
# -l = tstretchlist.csv/tsv/txt | |
########################################################################################## | |
# main # | |
########################################################################################## | |
my %tstretch_chr; # store corresponding chr for each T-stretch | |
my %tstretch_name; # store corresponding T-stretch name for each T-stretch | |
&readTstretchsList($list); | |
if(defined($softclip)){ | |
&mainWithS($ARGV[0],$refFile,$mismatches); | |
} | |
else{ | |
&main($ARGV[0],$refFile,$mismatches); | |
} | |
########################################################################################## | |
# SUBS # | |
########################################################################################## | |
sub main { | |
my ($samfile,$refFile,$mismatches) = @_; | |
# 1/ references recording | |
my%ref_seq; # key=reference name ; value=sequence | |
my%tstretch_count; # key=Tstretch ; value=counts | |
my$totalreads=0; # total reads in Sam file | |
my$coveringreads=0; # total reads covering the interval | |
open (F1, "$refFile") || die ("$scriptname --sub main : pb opening $refFile"); | |
my$ref=""; | |
my$switch=0; | |
while (my$li=<F1>) { | |
if($li=~/^>/){ | |
chomp $li; | |
$li=~s/^>//; | |
$li=~s/^[\W]+//; | |
$li=~s/[\W]+$//; | |
$li=~s/ .+$//; | |
print STDERR "ref : $li memorised\n"; | |
$ref=$li; | |
$switch=1; | |
$ref_seq{$ref}=""; # store current ref | |
next; | |
} | |
elsif ($switch==1) { | |
chomp $li; | |
$ref_seq{$ref}.=$li; # add seq to current ref | |
} | |
} | |
close F1; | |
# 2/ Sam file processing | |
open (F2, $samfile) || die("$scriptname --sub main : pb ouverture $samfile\n"); | |
print STDERR "file $samfile opened\n"; | |
while (my$li=<F2>){ | |
next if ($li=~/^@/); # header | |
chomp $li; | |
my@tab=split(/\t/,$li); | |
my$read= shift @tab; | |
my$flag= shift @tab; | |
my$ref= shift @tab; | |
my$start= shift @tab; | |
my$mapq= shift @tab; | |
my$cigar= shift @tab; | |
my$other1= shift @tab; | |
my$other2= shift @tab; | |
my$other3= shift @tab; | |
my$other= join ("\t", $other1,$other2,$other3); | |
my$seq= shift @tab; | |
my$last= join ("\t", @tab); | |
# check if current read is covering the defined interval | |
if(SamUtils::checkIfIsCoveringInterval($start,$cigar,$limMin,$limSup)){ | |
my$before=$endRef-$start; # read length : start to endref | |
# extract fragment | |
my$fragment=""; | |
$fragment=SamUtils::extractFrag($before,$cigar,$seq,$size); | |
if(defined($tstretchmode)){ # if we want Tstretch name as new refs | |
if(defined($tstretch_name{$fragment})){ | |
$ref=$tstretch_name{$fragment}; | |
} | |
else{ | |
$ref="*"; | |
} | |
} | |
else{ # if we want chr name as new refs | |
if(defined($tstretch_chr{$fragment})){ | |
$ref=$tstretch_chr{$fragment}; | |
} | |
else{ | |
$ref="*"; | |
} | |
} | |
print "$read\t$flag\t$ref\t$start\t$mapq\t$cigar\t$other\t$seq\t$last\n"; | |
} | |
} | |
close F2; | |
print STDERR "end Sam file\n"; | |
} | |
########################################################################################## | |
sub mainWithS { # lit le fichier | |
my ($samfile,$refFile,$mismatches) = @_; | |
# 1/ references recording | |
my%ref_seq; # key=reference name ; value=sequence | |
my%tstretch_count; # key=Tstretch ; value=counts | |
my$totalreads=0; # total reads in Sam file | |
my$coveringreads=0; # total reads covering the interval | |
open (F1, "$refFile") || die ("$scriptname --sub main : pb opening $refFile"); | |
my$ref=""; | |
my$switch=0; | |
while (my$li=<F1>) { | |
if($li=~/^>/){ | |
chomp $li; | |
$li=~s/^>//; | |
$li=~s/^[\W]+//; | |
$li=~s/[\W]+$//; | |
$li=~s/ .+$//; | |
print STDERR "ref : $li memorised\n"; | |
$ref=$li; | |
$switch=1; | |
$ref_seq{$ref}=""; # store current ref | |
next; | |
} | |
elsif ($switch==1) { | |
chomp $li; | |
$ref_seq{$ref}.=$li; # add seq to current ref | |
} | |
} | |
close F1; | |
# 2/ Sam file processing | |
open (F2, $samfile) || die("$scriptname --sub main : pb ouverture $samfile\n"); | |
print STDERR "file $samfile opened\n"; | |
while (my$li=<F2>){ | |
next if ($li=~/^@/); # header | |
chomp $li; | |
my@tab=split(/\t/,$li); | |
my$read= shift @tab; | |
my$flag= shift @tab; | |
my$ref= shift @tab; | |
my$start= shift @tab; | |
my$mapq= shift @tab; | |
my$cigar= shift @tab; | |
my$other1= shift @tab; | |
my$other2= shift @tab; | |
my$other3= shift @tab; | |
my$other= join ("\t", $other1,$other2,$other3); | |
my$seq= shift @tab; | |
my$last= join ("\t", @tab); | |
# check if current read is covering the defined interval | |
if(SamUtils::checkIfIsCoveringIntervalWithSEnd($start,$cigar,$limMin,$limSup)){ | |
my$before=$endRef-$start; # read length : start to endref | |
# extract fragment | |
my$fragment=""; | |
$fragment=SamUtils::extractFrag($before,$cigar,$seq,$size); | |
if(defined($tstretchmode)){ # if we want Tstretch name as new refs | |
if(defined($tstretch_name{$fragment})){ | |
$ref=$tstretch_name{$fragment}; | |
} | |
else{ | |
$ref="*"; | |
} | |
} | |
else{ # if we want chr name as new refs | |
if(defined($tstretch_chr{$fragment})){ | |
$ref=$tstretch_chr{$fragment}; | |
} | |
else{ | |
$ref="*"; | |
} | |
} | |
print "$read\t$flag\t$ref\t$start\t$mapq\t$cigar\t$other\t$seq\t$last\n"; | |
} | |
} | |
close F2; | |
print STDERR "end Sam file\n"; | |
} | |
########################################################################################## | |
sub readTstretchsList { | |
my $file = shift @_; | |
open (F1, $file) || die("$scriptname --sub readTstretchsList : pb ouverture $file\n"); | |
while (my$li=<F1>){ | |
next if ($li=~/^#/); # for header | |
chomp $li; | |
my@tab=split(/\t/,$li); | |
$tstretch_chr{$tab[0]}=$tab[1]; # store : key=current Tstretch ; val=chr name | |
if(scalar(@tab)>2){ | |
# store : key=current Tstretch ; val=Tstretch name | |
$tstretch_name{$tab[0]}=$tab[2]; | |
} | |
} | |
close F1; | |
} | |
########################################################################################## | |
sub help { | |
my@tab = split(/\//,$0); | |
my $name = pop(@tab); | |
print STDERR "\n"; | |
print STDERR join("-", $0, $VERSION, $lastModif), "\n"; | |
print STDERR "USAGE: $name\t[options] <sam file>\n"; | |
print STDERR "\t# required\n"; | |
print STDERR "\t\t[-r|--ref]<fasta reference file>\n"; | |
print STDERR "\t\t[-m|--mismatches]<mismatches>\n"; | |
print STDERR "\t\t[--minstart:int] = minimal start of reads\n"; | |
print STDERR "\t\t[--minend:int] = minimal end of reads\n"; | |
print STDERR "\t\t[--size:int] = tstretchs size +1\n"; | |
print STDERR "\t\t[--endref:int] = end of reference sequence (endpos of tstretch)\n"; | |
print STDERR "\t\t[-l|--list] <tstretchs tabulated list>\n"; | |
print STDERR "\t# optional\n"; | |
print STDERR "\t\t[-h|--help]\n"; | |
print STDERR "\t\t[-t|--tstretchmode] if tstretchs name wanted for output refs\n"; | |
print STDERR "\t\t[-S|--softclip] if reads longer than reference\n"; | |
exit(1) ; | |
} | |
sub help2 { | |
my $file = shift @_; | |
print STDERR "The file \"$file\" doesn't exist or is not at the specified/current location\n"; | |
exit(1) ; | |
} | |
# end | |
Script perl CountPolymorphisms_v4.pl | |
#!/usr/bin/perl | |
use strict; | |
use warnings; | |
use Getopt::Long; | |
use SamUtils; | |
my $VERSION='0.2'; | |
my $lastModif='2016-Aug-21'; | |
my@tabname = split(/\//,$0); | |
my $scriptname = pop(@tabname); | |
my $startref=0; | |
my $endref; | |
my $meandepth=1; | |
my $bruit=10; | |
# set the options | |
&GetOptions("h|help" => \&help, | |
"startref:i" => \$startref, # start reference position | |
"endref:i" => \$endref, # end reference position | |
"meandepth:i" => \$meandepth # if we want to normalise by sequencing depth | |
); | |
@ARGV or &help; | |
# file existence ctrl | |
(-e "$ARGV[0]") or &help2($ARGV[0]); | |
########################################################################################### | |
# info | |
########################################################################################### | |
# count polymorphisms present at each position l | |
# arg0 = ref.fasta | |
# arg1 = file.sam | |
### apport // v1 | |
########################################################################################### | |
# main | |
########################################################################################### | |
&main($ARGV[0],$ARGV[1]); | |
########################################################################################### | |
# SUBS | |
########################################################################################### | |
sub main { | |
my ($refFile,$samfile) = @_; | |
# 1/ references recording | |
my%ref_seq; # key=reference name ; value=sequence | |
my%ref_pos_snp_nb; # key=reference name ; key2=position ; key3=snp ; | |
value=count | |
my%ref_pos_totalsnps; # key=reference name ; key2=position ; value=total of all snps | |
my%totalreadsByRef; # key=rererence name ; value= total of correspondig reads | |
my%normalbasebypos; # key=reference name ; key2=position ; value=base | |
my%ref_pos_depth; # key=reference name ; key2=position ; value=position | |
depth | |
open (F1, "$refFile") || die ("$scriptname --sub main : pb ouverture $refFile"); | |
my$ref=""; | |
my$switch=0; | |
while (my$li=<F1>) { | |
if($li=~/^>/){ | |
chomp $li; | |
$li=~s/^>//; | |
$li=~s/^[\W]+//; | |
$li=~s/[\W]+$//; | |
$li=~s/ .+$//; | |
$ref=$li; | |
$switch=1; | |
$ref_seq{$ref}=""; # store current ref | |
next; | |
} | |
elsif ($switch==1) { | |
chomp $li; | |
$ref_seq{$ref}.=$li; # add seq to current ref | |
} | |
} | |
close F1; | |
foreach my$ref (keys %ref_seq){ | |
my@tabseq=split(//,$ref_seq{$ref}); | |
for (my$i=0;$i<scalar(@tabseq);$i++){ | |
my$pos=$i+1; | |
$normalbasebypos{$ref}{$pos}=$tabseq[$i]; | |
$ref_pos_depth{$ref}{$pos}=0; | |
if($pos>=$startref && $pos<=$endref){ | |
$ref_pos_totalsnps{$ref}{$pos}=0; | |
} | |
} | |
} | |
# 2/ sam file processing | |
open (F2, $samfile) || die("$scriptname --sub main : pb ouverture $samfile\n"); | |
print STDERR "file $samfile opened\n"; | |
while (my$li=<F2>){ | |
next if ($li=~/^@/); | |
chomp $li; | |
my@tab=split(/\t/,$li); | |
my($read,$flag,$ref,$start,$mapq,$cigar,$seq)=($tab[0],$tab[1],$tab[2],$tab[3], | |
$tab[4],$tab[5],$tab[9]); | |
$totalreadsByRef{$ref}++; | |
# search all existing SNPs | |
my$res_comparaison=SamUtils::compareReadToRef($cigar,$start,$seq, | |
$ref_seq{$ref}); | |
# keap only covering interval SNPs | |
my$listOfSnps=SamUtils::trimSubseqOfSnplist($res_comparaison,$startref,$endref); | |
if(defined($listOfSnps)){ | |
my@tab_snps= split(/;/,$listOfSnps); | |
foreach my$element (@tab_snps){ | |
my($pos,$snp)=split(/:/,$element); | |
$ref_pos_snp_nb{$ref}{$pos}{$snp}++; | |
$ref_pos_totalsnps{$ref}{$pos}++; | |
} | |
} | |
### depth incrementation | |
my$endpos=SamUtils::returnEndPos($start,$cigar); | |
for (my$i=$start;$i<=$endpos;$i++){ | |
$ref_pos_depth{$ref}{$i}++; | |
} | |
} | |
close F2; | |
# # # # # Sequencing noise elimination | |
foreach my$refc (keys %ref_pos_snp_nb){ | |
# foreach ref | |
foreach my$posc (keys $ref_pos_snp_nb{$refc}){ | |
# foreach position | |
$ref_pos_totalsnps{$refc}{$posc}=0; | |
foreach my$snpc (keys $ref_pos_snp_nb{$refc}{$posc}){ | |
# foreach existing snp | |
if ($ref_pos_snp_nb{$refc}{$posc}{$snpc}<=$bruit){ | |
# if snp< bruit snp=0 | |
$ref_pos_depth{$refc}{$posc}-=$ref_pos_snp_nb{$refc}{$posc} | |
{$snpc}; | |
delete($ref_pos_snp_nb{$refc}{$posc}{$snpc}); | |
} | |
else { | |
# else snp = snp-bruit | |
$ref_pos_snp_nb{$refc}{$posc}{$snpc}-=$bruit; | |
$ref_pos_totalsnps{$refc}{$posc}+=$ref_pos_snp_nb{$refc} | |
{$posc}{$snpc}; # total pos += snp | |
$ref_pos_depth{$refc}{$posc}-=$bruit; | |
} | |
} | |
} | |
} | |
# # # # # | |
# 3/ Results impression | |
my$outname=$samfile; | |
$outname=~s/^.*\///; | |
$outname=~s/\.sam$//i; | |
foreach my$refc (sort(keys %ref_pos_totalsnps)){ | |
# foreach ref | |
my$outfile=$refc."_counts_snp_from_".$outname.".txt"; | |
open (F3, "> $outfile") || die("$scriptname --sub main : pb ouverture $outfile\n"); | |
print F3 "position\tnumber of reads with snp\ttotal reads\t% snp"; | |
print F3 "\tnormalised snp\tnormalised total\tnormalised % snp"; | |
print F3 "\tnormal base\tsnp:number of reads\n"; | |
foreach my$posc (sort {$a<=>$b}(keys $ref_pos_totalsnps{$refc})){ | |
# foreach position | |
my$proportion; | |
### non normalise | |
if($ref_pos_depth{$refc}{$posc}>0){ | |
# mutated reads proportion computing | |
$proportion=int($ref_pos_totalsnps{$refc}{$posc}/ | |
$ref_pos_depth{$refc}{$posc}*10000)/100; | |
} | |
else{ $proportion=0;} | |
my$newpos=$posc-$startref+1; | |
print F3 "$newpos\t"; | |
print F3 "$ref_pos_totalsnps{$refc}{$posc}\t"; | |
print F3 "$ref_pos_depth{$refc}{$posc}\t"; | |
print F3 "$proportion\t"; | |
### normalise | |
my$newtotalsnps=int($ref_pos_totalsnps{$refc}{$posc}/ | |
$meandepth*100)/100; | |
my$newdepth=$ref_pos_depth{$refc}{$posc}/$meandepth; | |
if($newdepth<1){$newdepth=0;} | |
if($newdepth>0){ | |
# mutated reads proportion computing | |
$proportion=int($newtotalsnps/$newdepth*10000)/100; | |
} | |
else{ $proportion=0;} | |
$newdepth=int($newdepth*100)/100; | |
print F3 "$newtotalsnps\t"; | |
print F3 "$newdepth\t"; | |
print F3 "$proportion\t"; | |
### other | |
print F3 "$normalbasebypos{$refc}{$posc}"; | |
if(defined($ref_pos_snp_nb{$refc}{$posc})){ | |
foreach my$snpc (sort(keys $ref_pos_snp_nb{$refc}{$posc})){ | |
# foreach existing snp | |
print F3 "\t$snpc:$ref_pos_snp_nb{$refc}{$posc}{$snpc}"; | |
} | |
} | |
print F3 "\n"; | |
} | |
close F3; | |
} | |
} | |
########################################################################################### | |
sub help { | |
my@tab = split(/\//,$0); | |
my $name = pop(@tab); | |
print STDERR "\n"; | |
print STDERR join("-", $0, $VERSION, $lastModif), "\n"; | |
print STDERR "USAGE: $name [-h|--help] --startref:int --endref:int <ref.fasta> <file.sam>\n\n"; | |
exit(1) ; | |
} | |
sub help2 { | |
my $file = shift @_; | |
print STDERR "The file \"$file\" doesn't exist or is not at the specified/current location\n"; | |
exit(1) ; | |
} | |
# end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment