Last active
June 10, 2017 18:34
-
-
Save SophieDePaula/6aaa86c01ddd0507b8c24e050ae6be62 to your computer and use it in GitHub Desktop.
FinalProject Bachelor degree - Purl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Script #1 appendix A | |
#this script unites 3 tables from UCSC, in order to add the type of alternative #splicing and the location of exon start and exon end to each gene | |
#***********************- pseudo code -************************* | |
# 1. foreach row in table REFALT | |
# 1.1. extract location data of alt splicing variant | |
# 1.2. find according to this location the matching row in table ALTEV | |
# 1.2.1. if exists | |
# 1.2.1.1 print to TOUT the variant + the alt splicing type column from table ALTEV | |
# 2. foreach row in table TOUT | |
# 2.1 extract gene ID of alt splicing variant | |
# 2.2. find according to this gene ID the matching row in table refseq | |
# 2.2.1. print to OUT the variant + exons data columns from table refseq | |
#*************************************************************** | |
open(REFALT, "RefSeq_Alt") or die "Could not open RefSeq_Alt"; # UCSC table of alternative splicing events and refseq genes - size: 30,289 | |
open(ALTEV, "AltEvents") or die "Could not open AltEvents"; # UCSC table of alternative events - size: 91,223 | |
open(REFSEQ, "refseq") or die "Could not open refseq"; # UCSC table of refseq genes - size: 41,748 | |
open(LOG, ">LOG") or die "Could not open LOG"; # log file of program | |
open(TOUT, ">tempAltEventsWithType") or die "Could not open AltEventsWithType"; # temp output file | |
open(OUT, ">AltEventsWithType") or die "Could not open AltEventsWithType"; # output file | |
# print header of temp result table | |
print TOUT "chrom\trefseqGeneChromStart\trefseqGeneChromEnd\tAltEventsChromStart\tAltEventsChromEnd\tgeneID\trefseqcore\tstrand\trefseqTranscriptStart\trefseqTranscriptEnd\ttranscriptScore\tAltEventType\tnumberOfExons\n"; | |
# column variables for each line in REFALT table | |
$chrom; # Reference sequence chromosome or scaffold | |
$chromStart; # Start position in chromosome | |
$chromEnd; # End position in chromosome | |
$geneID; # Name of gene (usually transcript_id from GTF) | |
$score; # AltEvents table Score, from 0-1000 | |
$strand; # + or - for strand | |
$codingStart; # Transcription start position | |
$codingEnd; # Transcription end position | |
$transcriptScore; # Transcription score | |
$numberOfExons; # Number of exons | |
$line = <REFALT>; | |
print LOG "lines of RefSeq_Alt\n"; | |
while (defined ($line)) # foreach row in table REFALT | |
{ | |
chomp($line); | |
@LineArray = split(/\t/,$line); # splitting the line to the columns | |
$chrom = $LineArray[0]; | |
$chromStart = $LineArray[1]; | |
$chromEnd = $LineArray[2]; | |
$geneID = $LineArray[3]; | |
$score = $LineArray[4]; | |
$strand = $LineArray[5]; | |
$codingStart = $LineArray[6]; | |
$codingEnd = $LineArray[7]; | |
$transcriptScore = $LineArray[8]; | |
$numberOfExons = $LineArray[9]; | |
print LOG "chrom = " . $chrom . ";; "; | |
print LOG "chromStart = " . $chromStart . ";; "; | |
print LOG "chromEnd = " . $chromEnd . ";; "; | |
print LOG "geneID = " . $geneID . ";; "; | |
print LOG "score = " . $score . ";; "; | |
print LOG "strand = " . $strand . ";; "; | |
print LOG "codingStart = " . $codingStart . ";; "; | |
print LOG "codingEnd = " . $codingEnd . ";; "; | |
print LOG "transcriptScore = " . $transcriptScore . ";; "; | |
print LOG "numberOfExons = " . $numberOfExons . "\n"; | |
# 1.1. extract location data of alt splicing variant | |
findAlternativeSplicingTypeByLocation($chrom, $chromStart, $chromEnd, $strand); | |
$line = <REFALT>; # take next line from table | |
} | |
close(TOUT); | |
open(TOUT, "tempAltEventsWithType") or die "Could not open AltEventsWithType"; # temp output file | |
# print header of result table | |
print OUT "chrom\trefseqGeneChromStart\trefseqGeneChromEnd\tAltEventsChromStart\tAltEventsChromEnd\tgeneID\trefseqcore\tstrand\trefseqcodingStart\trefseqcodingEnd\ttranscriptScore\tAltEventType\trefseqTranscriptStart\trefseqTranscriptEnd\trefseqExonCount\trefseqExonStarts\trefseqExonEnds\tsecondName\n"; | |
%refseqMap; # map of exon information - key{$geneID} value{$cdsStart \t $cdsEnd \t $exonCount \t $exonStarts \t $exonEnds \t $secondName} | |
# column variables for each line in REFSEQ table | |
$bin; # Indexing field to speed chromosome range queries. | |
$geneID; # Name of gene (usually transcript_id from GTF) | |
$chrom; # Reference sequence chromosome or scaffold | |
$strand; # + or - for strand | |
$txStart; # Transcription start position | |
$txEnd; # Transcription end position | |
$cdsStart; # Coding region start | |
$cdsEnd; # Coding region end | |
$exonCount; # Number of exons | |
$exonStarts; # Exon start positions | |
$exonEnds; # Exon end positions | |
$score; # score | |
$secondName; # Alternate name (e.g. gene_id from GTF) | |
$line = <REFSEQ>; # jump over header in top of the table | |
$line = <REFSEQ>; | |
print LOG "lines of RefSeq into % refseqMap\n"; | |
while (defined ($line)) # fill data in %refseqMap Map from REFSEQ table | |
{ | |
chomp($line); | |
@LineArray = split(/\t/,$line); # splitting the line to the columns | |
$bin = $LineArray[0]; | |
$geneID = $LineArray[1]; | |
$chrom = $LineArray[2]; | |
$strand = $LineArray[3]; | |
$txStart = $LineArray[4]; | |
$txEnd = $LineArray[5]; | |
$cdsStart = $LineArray[6]; | |
$cdsEnd = $LineArray[7]; | |
$exonCount = $LineArray[8]; | |
$exonStarts = $LineArray[9]; | |
$exonEnds = $LineArray[10]; | |
$score = $LineArray[11]; | |
$secondName = $LineArray[12]; | |
chomp($geneID); | |
$refseqMap{$geneID} = $txStart."\t".$txEnd."\t".$exonCount."\t".$exonStarts."\t".$exonEnds."\t".$secondName; | |
print LOG "refseqMap{".$geneID."} = ".$refseqMap{$geneID}."\n"; | |
$line = <REFSEQ>; # take next line from table | |
} | |
# column variables for each line in TOUT table | |
$chrom; # Reference sequence chromosome or scaffold | |
$refseqChromStart; # Start position in chromosome according to refseq table | |
$refseqChromEnd; # End position in chromosome accodrding to refseq table | |
$altEvChromStart; # Start position in chromosome according to AltEvents table | |
$altEvChromEnd; # End position in chromosome accodrding to AltEvents table | |
$geneID; # Name of gene (usually transcript_id from GTF) | |
$score; # AltEvents table Score, from 0-1000 | |
$strand; # + or - for strand | |
$transcriptStart; # Transcription start position | |
$transcriptEnd; # Transcription end position | |
$transcriptScore; # Transcription score | |
$altEventType; # type of alternative splicing event - more explantions in attached file 'Alternative Splicing type UCSC' | |
$numberOfExons; # Number of exons | |
$line = <TOUT>; # jump over header in top of the table | |
$line = <TOUT>; | |
# 2. foreach row in table TOUT | |
while (defined ($line)) | |
{ | |
chomp($line); | |
@LineArray = split(/\t/,$line); # splitting the line to the columns | |
$chrom = $LineArray[0]; | |
$refseqChromStart = $LineArray[1]; | |
$refseqChromEnd = $LineArray[2]; | |
$altEvChromStart = $LineArray[3]; | |
$altEvChromEnd = $LineArray[4]; | |
$geneID = $LineArray[5]; | |
$score = $LineArray[6]; | |
$strand = $LineArray[7]; | |
$codingStart = $LineArray[8]; | |
$codingEnd = $LineArray[9]; | |
$transcriptScore = $LineArray[10]; | |
$altEventType = $LineArray[11]; | |
$numberOfExons = $LineArray[12]; | |
chomp($geneID); # 2.1 extract gene ID of alt splicing variant | |
# 2.2. find according to this gene ID the matching row in table refseq | |
if(defined($refseqMap{$geneID})) | |
{ | |
# 2.2.1. print to OUT the variant + exons data columns from table refseq | |
print OUT $chrom."\t".$refseqChromStart."\t".$refseqChromEnd."\t".$altEvChromStart."\t".$altEvChromEnd."\t".$geneID."\t".$score."\t".$strand."\t".$codingStart."\t".$codingEnd."\t".$transcriptScore."\t".$altEventType."\t".$refseqMap{$geneID}."\n"; | |
} | |
$line = <TOUT>; # take next line from table | |
} | |
close(REFALT); | |
close(ALTEV); | |
close(REFSEQ); | |
close(LOG); | |
close(OUT); | |
close(TOUT); | |
#findAlternativeSplicingTypeByLocation( $chrom, $chromStart, $chromEnd, $strand) | |
sub findAlternativeSplicingTypeByLocation | |
{ | |
seek(ALTEV, 0, SEEK_SET); # move file cursor to start of file | |
my $i_chrom = $_[0]; # input parameter | |
my $i_chromStart = $_[1]; # input parameter | |
my $i_chromEnd = $_[2]; # input parameter | |
my $i_strand = $_[3]; # input parameter | |
my @i_LineArray; | |
# column variables for each line in ALTEV table | |
my $tempChrom; | |
my $tempChromStart; | |
my $tempChromEnd; | |
my $tempStrand; | |
my $tempAltType; # type of alternative splicing event - more explantions in attached file 'Alternative Splicing type UCSC' | |
print LOG "\nfindAlternativeSplicingTypeByLocation ( ".$i_chrom." , ".$i_chromStart." , ".$i_chromEnd." , ".$i_strand. " )\n\n"; | |
my $i_line = <ALTEV>; # jump over header in top of the table | |
my $i_line = <ALTEV>; | |
while (defined ($i_line)) # 1.2. find according to input location parameters the matching row in table ALTEV | |
{ | |
@i_LineArray = split(/\t/,$i_line); # splitting the line to the columns | |
$tempChrom = $i_LineArray[1]; | |
$tempChromStart = $i_LineArray[2]; | |
$tempChromEnd = $i_LineArray[3]; | |
$tempStrand = $i_LineArray[6]; | |
$tempAltType = $i_LineArray[4]; | |
chomp($i_chrom); | |
chomp($tempChrom); | |
chomp($i_strand); | |
chomp($tempStrand); | |
if (($i_chrom eq $tempChrom) && ($i_strand eq $tempStrand)) | |
{ | |
if (($i_chromStart <= $tempChromStart) && ($i_chromEnd >= $tempChromEnd)) # 1.2.1. if exists | |
{ | |
# 1.2.1.1 print to TOUT the variant + the alt splicing type column from table ALTEV | |
print TOUT $tempChrom."\t".$i_chromStart."\t".$i_chromEnd."\t".$tempChromStart."\t".$tempChromEnd."\t".$geneID."\t".$score."\t".$strand."\t".$codingStart."\t".$codingEnd."\t".$transcriptScore."\t".$tempAltType."\t".$numberOfExons."\n"; | |
} | |
} | |
$i_line = <ALTEV>; # take next line from table | |
} | |
} | |
#Script #2 appendix B | |
#this script unites 2 tables and finds their intersection, in order to find all #the genes that contains synonymous SNPs. | |
#***********************- pseudo code -*************************************************************************************** | |
# 1. foreach row in table AltEventsWithType | |
# 1.1. extract the chromosome number | |
# 1.2 extract the strand | |
# 1.3 extract the transcription start and end | |
# 1.4 extract the rest of the relevent data | |
# 1.5 put everything in a hashtable, the key = gene id, value = the rest of the data | |
#2. foreach row in table snp135 | |
# 2.1 extract the relevent data. | |
# 2.2 call findSnpInAlternativeSplicingEvent function with the chromosome number, strand,SNP location,SNP Id and allels fields. | |
# | |
#findSnpInAlternativeSplicingEvent: | |
# 1.comper the chromosome number of the alternative splicing event and the snp | |
# 1.1 if it mach | |
# 1.1.1 comper the strand of the alternative splicing event and the snp. | |
# 1.1.1.1 if it mach | |
# 1.1.1.1.1 comper the start and end of the transcription area and the snp locatoin | |
# 1.1.1.1.1.1 if it mach | |
# 1.1.1.1.1.1.1 print to OUT the relevent data | |
#****************************************************************************************************************************** | |
open(ALTEV, "AltEventsWithType") or die "Could not open RefSeq_Alt"; # The output from script 4 size: 171,884 | |
open(SNP135, "snp135") or die "Could not open AltEvents"; # UCSC table of synonymous SNPs size: 38,301 | |
open(LOG, ">LOG") or die "Could not open LOG"; # log file of program | |
open(OUT, ">SNP splicing joining") or die "Could not open AltEventsWithType"; # output file | |
open(MAP, ">maps") or die "Could not open AltEventsWithType"; | |
open(TOUT, ">tempSnp") or die "Could not open tempSnp"; | |
$chrom; # Reference sequence chromosome or scaffold | |
$codigSeqStart; # coding start position | |
$codigSeqEnd; # coding end position | |
$strand; # + or - for strand | |
$transcriptStart; # Transcription start position | |
$transcriptEnd; # Transcription end position | |
$geneID; # Name of gene (usually transcript_id from GTF) | |
$AltEvnetType; # alternative splicing type | |
$numberOfExons; # Number of exons | |
$ExonsStart; # start positions of exons | |
$ExonsEnd; # end positions of exons | |
$line = <ALTEV>; #skip the unrelevent row | |
$line = <ALTEV>; | |
while (defined ($line)) # foreach row in table aternative splicing event | |
{ | |
chomp($line); | |
@LineArray = split(/\t/,$line); # splitting the line to the columns | |
$chrom = $LineArray[0]; | |
$chromStart = $LineArray[1]; | |
$chromEnd = $LineArray[2]; | |
$geneId = $LineArray[5]; | |
$strand = $LineArray[7]; | |
$transcriptStart = $LineArray[12]; | |
$transcriptEnd = $LineArray[13]; | |
$AltEvnetType = $LineArray[11]; | |
$codigSeqStart = $LineArray[8]; | |
$codigSeqEnd = $LineArray[9]; | |
$numberOfExons = $LineArray[14]; | |
$ExonsStart = $LineArray[15]; | |
$ExonsEnd = $LineArray[16]; | |
print LOG "chrom = " . $chrom . ";; "; | |
print LOG "strand = " . $strand . ";; "; | |
print LOG "start coding = ".$codigSeqStart.";; "; | |
print LOG "end coding = ".$codigSeqEnd.";; "; | |
print LOG "geneID = " . $geneId . ";; "; | |
print LOG "alterantive splicing type = ". $AltEvnetType. ";;" ; | |
print LOG "score = " . $score . ";; "; | |
print LOG "transcriptStart = " . $transcriptStart . ";; "; | |
print LOG "transcriptEnd = " . $transcriptEnd . ";; "; | |
print LOG "numberOfExons = " . $numberOfExons . ";; "; | |
print LOG "start positions of exons = " . $ExonsStart . ";; "; | |
print LOG "end positions of exons = " . $ExonsEnd . "\n"; | |
$AltEventsMap{$geneId} = $chrom."\t".$strand."\t".$codigSeqStart."\t".$codigSeqEnd."\t".$transcriptStart."\t".$transcriptEnd."\t".$geneId."\t".$numberOfExons."\t".$ExonsStart."\t".$ExonsEnd."\t".$AltEvnetType."\t".$chromStart."\t".$chromEnd; | |
$line = <ALTEV>; # take next line from table | |
} | |
$line = <SNP135>; | |
$line = <SNP135>; #skip unrelevent rows | |
$line = <SNP135>; | |
print OUT "chrom chromStart chromEnd strand geneId codigSeqStart codigSeqEnd transcriptStart transcriptEnd SNPlocation Allels SNPId alternativeSplicingType numberOfExons ExonsStart ExonsEnd\n"; | |
while (defined ($line)) # foreach row in table SNP | |
{ | |
chomp($line); | |
@LineArray2 = split(/\t/,$line); # splitting the line to the columns | |
$chrom = $LineArray2[1]; | |
$SNPlocation = $LineArray2[2]; | |
$SNPId = $LineArray2[4]; | |
$strand = $LineArray2[6]; | |
$Allels = $LineArray2[9]; | |
print LOG "chrom = " . $chrom . ";; "; | |
print LOG "SNP location = ".$SNPlocation.";; "; | |
print LOG "SNP name = ".$SNPname.";; "; | |
print LOG "strand = ".$strand.";; "; | |
print LOG "Allels = ".$Allels."\n "; | |
findSnpInAlternativeSplicingEvent($chrom, $strand, $SNPlocation,$SNPId,$Allels); | |
$line = <SNP135>; | |
} | |
close(OUT); | |
#findAlternativeSplicingTypeByLocation($chrom, $strand, $SNPlocation,$SNPId,$Allels) | |
sub findSnpInAlternativeSplicingEvent | |
{ | |
my $chrom = $_[0]; | |
my $strand = $_[1]; | |
my $SNPlocation = $_[2]; | |
my $SNPId = $_[3]; | |
my $Allels = $_[4]; | |
foreach $geneId(%AltEventsMap) | |
{ | |
$geneInfo = $AltEventsMap{$geneId}; | |
chomp($geneInfo); | |
@AltArray = split(/\t/,$geneInfo); # splitting the line of every value in a line of the map | |
if($AltArray[0] eq $chrom) | |
{ | |
if($AltArray[1] eq $strand) | |
{ | |
if(($AltArray[2] <= $SNPlocation) && ($AltArray[3] >= $SNPlocation)) | |
{ | |
print TOUT $AltArray[0]." == ".$chrom." ".$AltArray[1]." == ".$strand." ".$AltArray[4]." <= ".$SNPlocation." && ".$AltArray[5].">= ".$SNPlocation."\n"; | |
print OUT $AltArray[0]."\t".$AltArray[11]."\t".$AltArray[12]."\t".$AltArray[1]."\t".$AltArray[6]."\t".$AltArray[2]."\t".$AltArray[3]."\t".$AltArray[4]."\t".$AltArray[5]."\t".$SNPlocation."\t".$Allels."\t".$SNPId."\t".$AltArray[10]."\t".$AltArray[7]."\t".$AltArray[8]."\t".$AltArray[9]."\n"; | |
} | |
} | |
} | |
} | |
} | |
#Script #3 appendix C | |
#this script finds all genes from the previous table that contains synonymous #SNPs at the first or last 50 nucleotides of exon | |
#***********************- pseudo code -************************* | |
# 1. foreach row in table SNPSPL | |
# 1.1. extract exon data of alt splicing variant and SNP location | |
# 1.2. foreach exon start and end location check if SNP is in the first 50 bp or last 50 bp of exon | |
# 1.2.1. if SNP is in range of some exon | |
# 1.2.1.1 print to OUT the variant + exon number that SNP is in the range of | |
#*************************************************************** | |
# joining of UCSC tables that hold splicing variants with SNP within - size: 29,788 | |
open(SNPSPL, "SNP splicing joining") or die "Could not open SNP splicing joining"; | |
open(OUT, ">SNP splicing joining exon filtered") or die "Could not open SNP splicing joining exon filtered"; # output file | |
open(LOG, ">LOG") or die "Could not open LOG"; # log file of program | |
$line = <SNPSPL>; # get header of input file | |
$BASE_PAIR_OFFSET = 50; # base pair offset at exon start and end search SNP | |
# print header of result table | |
chop($line); | |
print OUT $line."\tSNPInExonNumber\n"; | |
# column variables for each line in SNPSPL table | |
$chrom; # Reference sequence chromosome or scaffold | |
$chromStart; # chrom position start | |
$chromEnd; # chrom position end | |
$strand; # + or - for strand | |
$geneId; # Name of gene (usually transcript_id from GTF) | |
$codigSeqStart; # Coding region start | |
$codigSeqEnd; # Coding region end | |
$transcriptStart; # Transcription start position | |
$transcriptEnd; # Transcription end position | |
$SNPlocation; # SNP location | |
$Allels; # observed nucleutide allele | |
$SNPId; # unique SNP ID | |
$alternativeSplicingType; # type of alternative splicing event | |
$numberOfExons; # Number of exons | |
$ExonsStart; # Exon start positions | |
$ExonsEnd; # Exon end positions | |
$line = <SNPSPL>; | |
while (defined ($line)) # 1. foreach row in table SNPSPL | |
{ | |
# 1.1. extract exon data of alt splicing variant and SNP location | |
chomp($line); | |
@LineArray = split(/\t/,$line); # splitting the line to the columns | |
$chrom = $LineArray[0]; | |
$chromStart = $LineArray[1]; | |
$chromEnd = $LineArray[2]; | |
$strand = $LineArray[3]; | |
$geneId = $LineArray[4]; | |
$codigSeqStart = $LineArray[5]; | |
$codigSeqEnd = $LineArray[6]; | |
$transcriptStart = $LineArray[7]; | |
$transcriptEnd = $LineArray[8]; | |
$SNPlocation = $LineArray[9]; | |
$Allels = $LineArray[10]; | |
$SNPId = $LineArray[11]; | |
$alternativeSplicingType = $LineArray[12]; | |
$numberOfExons = $LineArray[13]; | |
$ExonsStart = $LineArray[14]; | |
$ExonsEnd = $LineArray[15]; | |
chomp($numberOfExons); | |
chomp($SNPlocation); | |
# print LOG "numberOfExons = ".$numberOfExons."\n"; | |
@ExonStartArray = split(/,/,$ExonsStart); # splitting the exon start positions | |
@ExonEndArray = split(/,/,$ExonsEnd); # splitting the exon end positions | |
# 1.2. foreach exon start and end location check if SNP is in the first 50 bp or last 50 bp of exon | |
for ($i = 0; $i < $numberOfExons; $i++) | |
{ | |
chomp($ExonStartArray[$i]); # chomp current exon's start position | |
chomp($ExonEndArray[$i]); # chomp current exon's end position | |
$ExonStartPos = $ExonStartArray[$i]; | |
$ExonEndPos = $ExonEndArray[$i]; | |
# print LOG "SNPlocation = ".$SNPlocation."\n"; | |
# print LOG "ExonStartPos = ".$ExonStartPos."\n"; | |
# print LOG "ExonEndPos = ".$ExonEndPos."\n"; | |
# 1.2.1. if SNP is in range of some exon | |
if ((($SNPlocation >= $ExonStartPos) && ($SNPlocation <= ($ExonStartPos + $BASE_PAIR_OFFSET))) || | |
(($SNPlocation >= ($ExonEndPos - $BASE_PAIR_OFFSET)) && ($SNPlocation <= $ExonEndPos))) | |
{ | |
# 1.2.1.1 print to OUT the variant + exon number that SNP is in the range of | |
print OUT $line."\t".($i+1)."\n"; | |
} | |
} | |
$line = <SNPSPL>; # take next line from table | |
} | |
close(SNPSPL); | |
close(OUT); | |
close(LOG); | |
#Script #4 appendix D | |
#this script checks the frequency of each type of alternative splicing in 2 #tables – all genes that alternatively spliced, compard with all genes that #contains synonymous SNP at the start or the end of exon | |
#***********************- pseudo code -************************* | |
# 1. foreach row in table AltEventsWithType | |
# 1.1. take the alternative splicing type and put it in map as a key | |
# 1.2. increment the value of the key | |
# 2. foreach row in table SNP splicing joining | |
# 2.1 take the alternative splicing type and put it in map as a key | |
# 2.2. increment the value of the key | |
# 3. calculate the frequency of every type of alternative splicing for each map | |
# 3. print the maps and the fequencies to OUT in two separeted tables | |
#*************************************************************** | |
open(ALTTYPE, "AltEventsWithType") or die "Could not open AltEventsWithType"; # table of alternative splicing events with types - size: 171,883 | |
open(SNPTYPE, "SNP splicing joining") or die "Could not open "; # table of alternative splicing events with SNPs and types - size: 29,788 | |
open(LOG, ">LOG") or die "Could not open LOG"; # log file of program | |
open(OUT, ">AltSplicingTypeFrequency") or die "Could not open AltSplicingTypeFrequency"; # output file | |
%typesFreq; # map of alternative splicing type (key) and number of apearences of the type in the first table (value) | |
$line = <ALTTYPE>; # first line contains the description of the table | |
$line = <ALTTYPE>; # takes the second line | |
while (defined ($line)) # foreach row in table ALTTYPE | |
{ | |
chomp($line); | |
@LineArray = split(/\t/,$line); # splitting the line to the columns | |
$key = $LineArray[11]; # the alternative splicing type | |
if(defined $typesFreq{$key}) # if this type is already exists in the map | |
{ | |
$typesFreq{$key}++; # increment its number of apearances | |
} | |
else | |
{ | |
$typesFreq{$key} = 1; # set as new key and set its number of apearences to 1 | |
} | |
$line = <ALTTYPE>; | |
} | |
%typesFreqWithSNP; # map of alternative splicing type (key) and number of apearences of the type in the second table (value) | |
$line = <SNPTYPE>; # first line contains the description of the table | |
$line = <SNPTYPE>; # takes the second line | |
while (defined ($line)) # foreach row in table SNPTYPE | |
{ | |
chomp($line); | |
@LineArray = split(/\t/,$line); # splitting the line to the columns | |
$key = $LineArray[12]; # the alternative splicing type | |
if(defined $typesFreqWithSNP{$key}) # if this type is already exists in the map | |
{ | |
$typesFreqWithSNP{$key}++; # increment its number of apearances | |
} | |
else | |
{ | |
$typesFreqWithSNP{$key} = 1; # set as new key and set its number of apearences to 1 | |
} | |
$line = <SNPTYPE>; | |
} | |
@keys = keys(%typesFreq); | |
$altFreqSum = 0; | |
foreach $key(@keys) # sum the total alternative splicing events of the first table | |
{ | |
$altFreqSum += $typesFreq{$key}; | |
} | |
print OUT "AltEventsWithType Frequency Table:\n"; | |
print OUT "AltType\tNumberOfApearencesInTable\tTotalFrequency\n"; | |
foreach $key(@keys) # calculate the frequency of every type of alternative splicing in the first table and print the data to OUT | |
{ | |
$freq = $typesFreq{$key}/$altFreqSum; | |
print OUT $key . "\t"; | |
print OUT $typesFreq{$key} . "\t"; | |
print OUT $freq . "\n"; | |
} | |
print OUT "\n"; | |
@keys = keys(%typesFreqWithSNP); | |
$SNPFreqSum = 0; | |
foreach $key(@keys) # sum the total alternative splicing events of the second table | |
{ | |
$SNPFreqSum += $typesFreqWithSNP{$key}; | |
} | |
print OUT "SNP splicing joining Frequency Table:\n"; | |
print OUT "AltTypeWithSNP\tNumberOfApearencesInTable\tTotalFrequency\n"; | |
foreach $key(@keys) # calculate the frequency of every type of alternative splicing of the second table and print the data to OUT | |
{ | |
$freq = $typesFreqWithSNP{$key}/$SNPFreqSum; | |
print OUT $key . "\t"; | |
print OUT $typesFreqWithSNP{$key} . "\t"; | |
print OUT $freq . "\n"; | |
} | |
# Script #5 appendix E | |
#! /usr/bin/perl-w | |
#this script compute the needed table for a Chi Square test between no snp \snp #and no alternative splicing\alternative splicing | |
#***********************- pseudo code -************************* | |
# 1. get all refseq genes and hold them in HASH | |
# 2. get all refseq genes that have AS and hold them in HASH | |
# 3. get all refseq genes that have SNP - was computed by auxiliry script "findRefseqWithSNP.pl" - and hold in HASH | |
# 4. get all refseq genes that have AS and SNP and hold them in HASH | |
# 5. count all refseq genes that have AS and no SNP | |
# 6. count all refseq genes that have SNP and no AS | |
# 7. count all refseq genes that dont have SNP and dont have AS | |
#*************************************************************** | |
# openning the required input and output files | |
open(AS_GENES, "RefSeq_Alt") or die "RefSeq_Alt"; | |
open(ALL_GENES, "refseq") or die "refseq"; | |
open(SNP_GENES, "refseq_genes_with_snp") or die "refseq_genes_with_snp"; # was computed by auxiliry script -findRefseqWithSNP.pl | |
open(OUT, ">instances_count_summary") or die "out"; | |
# init data structures | |
%refseqGenes = (); # holds all refseq genes | |
$refseqGenesCounter = 0; #holds the number of refseq genes | |
%refseqGenesWithASAndSNP = (); # holds all refseq genes that have a SNP and AS | |
$refseqGenesWithASAndSNPCounter = 0; #holds the number of refseq genes that have a SNP and AS | |
%refseqGenesWithAS = (); # holds all refseq genes that have AS | |
$refseqGenesWithASCounter = 0; #holds the number of refseq genes that have AS | |
%refseqGenesWithASAndNoSNP = (); # holds all refseq genes that have AS but no SNP | |
$refseqGenesWithASAndNoSNPCounter = 0; #holds the number of refseq genes that have AS but no SNP | |
%refseqGenesWithSNP = (); # holds all refseq genes that have SNP | |
$refseqGenesWithSNPCounter = 0; #holds the number of refseq genes that have SNP | |
%refseqGenesWithSNPAndNoAS = (); # holds all refseq genes that have SNP but no AS | |
$refseqGenesWithSNPAndNoASCounter = 0; #holds the number of refseq genes that have SNP but no AS | |
%refseqGenesWithNoSNPAndNoAS = (); # holds all refseq genes that have no SNP and no AS | |
$refseqGenesWithNoSNPAndNoASCounter = 0; #holds the number of refseq genes that have no SNP and no AS | |
print OUT "***********************- instances count summary -*************************\n\n"; | |
# get all refseq genes | |
$line = <ALL_GENES>; | |
$line = <ALL_GENES>; | |
while (defined $line) | |
{ | |
chomp($line); | |
@LineArray = split(/\t/,$line); | |
$geneID = $LineArray[1]; # get gene ID | |
if (defined $refseqGenes{$geneID}) | |
{ | |
} | |
else | |
{ | |
$refseqGenes{$geneID} = 1; | |
$refseqGenesCounter++; | |
} | |
$line = <ALL_GENES>; | |
} | |
print OUT "All refseq genes - " . $refseqGenesCounter . "\n\n"; | |
# get all refseq genes that have AS | |
$line = <AS_GENES>; | |
while (defined $line) | |
{ | |
chomp($line); | |
@LineArray = split(/\t/,$line); | |
$geneID = $LineArray[3]; # get gene ID | |
if (defined $refseqGenesWithAS{$geneID}) | |
{ | |
} | |
else | |
{ | |
$refseqGenesWithAS{$geneID} = 1; | |
$refseqGenesWithASCounter++; | |
} | |
$line = <AS_GENES>; | |
} | |
# get all refseq genes that have SNP - was computed by auxiliry script "findRefseqWithSNP.pl" | |
$line = <SNP_GENES>; | |
$line = <SNP_GENES>; | |
while (defined $line) | |
{ | |
chomp($line); | |
@LineArray = split(/\t/,$line); | |
$geneID = $LineArray[4]; # get gene ID | |
if (defined $refseqGenesWithSNP{$geneID}) | |
{ | |
} | |
else | |
{ | |
$refseqGenesWithSNP{$geneID} = 1; | |
$refseqGenesWithSNPCounter++; | |
} | |
$line = <SNP_GENES>; | |
} | |
# get all refseq genes that have AS and SNP | |
foreach $geneID(%refseqGenes) | |
{ | |
if ($refseqGenes{$geneID} == 1) | |
{ | |
if (defined $refseqGenesWithAS{$geneID}) # has a AS | |
{ | |
if (defined $refseqGenesWithSNP{$geneID}) # has a SNP and AS | |
{ | |
$refseqGenesWithASAndSNP{$geneID} = 1; | |
$refseqGenesWithASAndSNPCounter++; | |
} | |
else # no SNP | |
{ | |
} | |
} | |
else # no AS | |
{ | |
} | |
} | |
} | |
print OUT "instances with Alternative splicing and SNP - " . $refseqGenesWithASAndSNPCounter . "\n"; | |
# get all refseq genes that have AS and no SNP | |
foreach $geneID(%refseqGenesWithAS) | |
{ | |
if ($refseqGenesWithAS{$geneID} == 1) | |
{ | |
if (defined $refseqGenesWithSNP{$geneID}) # has a SNP | |
{ | |
} | |
else # no SNP | |
{ | |
$refseqGenesWithASAndNoSNP{$geneID} = 1; | |
$refseqGenesWithASAndNoSNPCounter++; | |
} | |
} | |
} | |
print OUT "instances with Alternative splicing and without SNP - " . $refseqGenesWithASAndNoSNPCounter . "\n"; | |
# get all refseq genes that have SNP and no AS | |
foreach $geneID(%refseqGenesWithSNP) | |
{ | |
if ($refseqGenesWithSNP{$geneID} == 1) | |
{ | |
if (defined $refseqGenesWithAS{$geneID}) # has a AS | |
{ | |
} | |
else # no AS | |
{ | |
$refseqGenesWithSNPAndNoAS{$geneID} = 1; | |
$refseqGenesWithSNPAndNoASCounter++; | |
} | |
} | |
} | |
print OUT "instances without Alternative splicing and with SNP - " . $refseqGenesWithSNPAndNoASCounter . "\n"; | |
# get all refseq genes that dont have SNP and dont have AS | |
foreach $geneID(%refseqGenes) | |
{ | |
if ($refseqGenes{$geneID} == 1) | |
{ | |
if (defined $refseqGenesWithAS{$geneID}) # has a AS | |
{ | |
} | |
else # no AS | |
{ | |
if (defined $refseqGenesWithSNP{$geneID}) # has a SNP | |
{ | |
} | |
else # no SNP and no AS | |
{ | |
$refseqGenesWithNoSNPAndNoAS{$geneID} = 1; | |
$refseqGenesWithNoSNPAndNoASCounter++; | |
} | |
} | |
} | |
} | |
print OUT "instances without Alternative splicing and without SNP - " . $refseqGenesWithNoSNPAndNoASCounter . "\n"; | |
close(SNPAS_GENES); | |
close(AS_GENES); | |
close(ALL_GENES); | |
close(SNP_GENES); | |
close(OUT); | |
# Heper Script regarding script #5 appendix F | |
#! /usr/bin/perl-w | |
# This script finds all the refseq genes which have a SNP inside them | |
#***********************- pseudo code -*************************************************************************************** | |
# 1. foreach row in table refseq table | |
# 1.1. extract the chromosome number | |
# 1.2 extract the strand | |
# 1.3 extract the transcription start and end | |
# 1.4 extract the rest of the relevent data | |
# 1.5 put everything in a hashtable, the key = gene id, value = the rest of the data | |
#2. foreach row in table snp135 | |
# 2.1 extract the relevent data. | |
# 2.2 call findSnpInAlternativeSplicingEvent function with the chromosome number, strand,SNP location,SNP Id and allels fields. | |
# | |
#findSnpInAlternativeSplicingEvent: | |
# 1.comper the chromosome number of the alternative splicing event and the snp | |
# 1.1 if it mach | |
# 1.1.1 comper the strand of the alternative splicing event and the snp. | |
# 1.1.1.1 if it mach | |
# 1.1.1.1.1 comper the start and end of the transcription area and the snp locatoin | |
# 1.1.1.1.1.1 if it mach | |
# 1.1.1.1.1.1.1 print to OUT the relevent data | |
#****************************************************************************************************************************** | |
open(REFSEQ, "refseq") or die "Could not open RefSeq_Alt"; | |
open(SNP135, "snp135") or die "Could not open AltEvents"; # UCSC table of synonymous SNPs size: 38,301 | |
open(OUT, ">refseq_genes_with_snp") or die "Could not open AltEventsWithType"; # output file | |
$chrom; # Reference sequence chromosome or scaffold | |
$codigSeqStart; # coding start position | |
$codigSeqEnd; # coding end position | |
$strand; # + or - for strand | |
$transcriptStart; # Transcription start position | |
$transcriptEnd; # Transcription end position | |
$geneID; # Name of gene (usually transcript_id from GTF) | |
$AltEvnetType; # alternative splicing type | |
$numberOfExons; # Number of exons | |
$ExonsStart; # start positions of exons | |
$ExonsEnd; # end positions of exons | |
$line = <REFSEQ>; #skip the unrelevent row | |
$line = <REFSEQ>; | |
while (defined ($line)) # foreach row in table aternative splicing event | |
{ | |
chomp($line); | |
@LineArray = split(/\t/,$line); # splitting the line to the columns | |
$chrom = $LineArray[2]; | |
$geneId = $LineArray[1]; | |
$strand = $LineArray[3]; | |
$transcriptStart = $LineArray[4]; | |
$transcriptEnd = $LineArray[5]; | |
$codigSeqStart = $LineArray[6]; | |
$codigSeqEnd = $LineArray[7]; | |
$numberOfExons = $LineArray[8]; | |
$ExonsStart = $LineArray[9]; | |
$ExonsEnd = $LineArray[10]; | |
$AltEventsMap{$geneId} = $chrom."\t".$strand."\t".$codigSeqStart."\t".$codigSeqEnd."\t".$transcriptStart."\t".$transcriptEnd."\t".$geneId."\t".$numberOfExons."\t".$ExonsStart."\t".$ExonsEnd."\t".$AltEvnetType; | |
$line = <REFSEQ>; # take next line from table | |
} | |
$line = <SNP135>; | |
$line = <SNP135>; #skip unrelevent rows | |
$line = <SNP135>; | |
print OUT "chrom strand geneId codigSeqStart codigSeqEnd transcriptStart transcriptEnd SNPlocation Allels SNPId numberOfExons ExonsStart ExonsEnd\n"; | |
while (defined ($line)) # foreach row in table SNP | |
{ | |
chomp($line); | |
@LineArray2 = split(/\t/,$line); # splitting the line to the columns | |
$chrom = $LineArray2[1]; | |
$SNPlocation = $LineArray2[2]; | |
$SNPId = $LineArray2[4]; | |
$strand = $LineArray2[6]; | |
$Allels = $LineArray2[9]; | |
findSnpInAlternativeSplicingEvent($chrom, $strand, $SNPlocation,$SNPId,$Allels); | |
$line = <SNP135>; | |
} | |
close(OUT); | |
#findAlternativeSplicingTypeByLocation($chrom, $strand, $SNPlocation,$SNPId,$Allels) | |
sub findSnpInAlternativeSplicingEvent | |
{ | |
my $chrom = $_[0]; | |
my $strand = $_[1]; | |
my $SNPlocation = $_[2]; | |
my $SNPId = $_[3]; | |
my $Allels = $_[4]; | |
foreach $geneId(%AltEventsMap) | |
{ | |
$geneInfo = $AltEventsMap{$geneId}; | |
chomp($geneInfo); | |
@AltArray = split(/\t/,$geneInfo); # splitting the line of every value in a line of the map | |
if($AltArray[0] eq $chrom) | |
{ | |
if($AltArray[1] eq $strand) | |
{ | |
if(($AltArray[2] <= $SNPlocation) && ($AltArray[3] >= $SNPlocation)) | |
{ | |
print OUT $AltArray[0]."\t".$AltArray[11]."\t".$AltArray[12]."\t".$AltArray[1]."\t".$AltArray[6]."\t".$AltArray[2]."\t".$AltArray[3]."\t".$AltArray[4]."\t".$AltArray[5]."\t".$SNPlocation."\t".$Allels."\t".$SNPId."\t".$AltArray[10]."\t".$AltArray[7]."\t".$AltArray[8]."\t".$AltArray[9]."\n"; | |
} | |
} | |
} | |
} | |
} | |
# appendix G | |
#! /usr/bin/perl-w | |
#this script compute the needed table for a Chi Square test between end/start of #exon and no alternative splicing\alternative splicing | |
open(AS, "SNP splicing joining exon filtered") or die "SNP splicing joining exon filtered"; | |
open(SNP, "refseq_genes_with_snp") or die "Could not open RefSeq_Alt"; | |
open(ALT, "RefSeq_Alt") or die "Could not open RefSeq_Alt"; | |
open(OUT,"> test3") or die "could not open test3"; | |
@res; | |
for($i = 0; $i < 4; $i++) | |
{ | |
$res[$i] = 0; | |
} | |
$line = <AS>; | |
$line = <AS>; | |
while(defined $line) | |
{ | |
@geneArr = split(/\t/,$line); | |
$SNPlocation = $geneArr[9]; | |
$exon = $geneArr[16]; | |
$exonsStart = $geneArr[14]; | |
$exonsEnd = $geneArr[15]; | |
$NumberOfExons = $geneArr[13]; | |
@exonsStart = split(/\s/,$exonsStart); | |
@exonsEnd = split(/\s/,$exonsEnd); | |
if( $SNPlocation - $exonsStart[$exon-1] <= 50) # increasing the sum of alternative splising at the begining of the exon | |
{ | |
$res[0]++; | |
} | |
elsif($exonsEnd[$exon-1] - $SNPlocation <= 50) # increasing the sum of alternative splising at the end of the exon | |
{ | |
$res[2]++; | |
} | |
$line = <AS>; | |
} | |
close(AS); | |
$SNPline = <SNP>; | |
$SNPline = <SNP>; | |
$ALTline = <ALT>; | |
@snpArr = split(/\s/,$SNPline); | |
@altArr = split(/\s/,$ALTline); | |
$SNPgeneID = $snpArr[4]; | |
$ALTgeneID = $altArr[3]; | |
$flag = 1; | |
while(defined $SNPline) | |
{ | |
while( defined $ALTline && $flag != 0) | |
{ | |
if($SNPgeneID eq $ALTgeneID) | |
{ | |
$flag = 0; | |
} | |
$ALTline = <ALT>; | |
@altArr = split(/\s/,$ALTline); | |
$ALTgeneID = $altArr[3]; | |
} | |
close(ALT); | |
open(ALT, "RefSeq_Alt") or die "Could not open RefSeq_Alt"; | |
if($flag) | |
{ | |
$NumberOfExons = $snpArr[13]; | |
$exonsStart = $snpArr[14]; | |
@exonsStart = split(/\s/,$exonsStart); | |
$exonsEnd = $snpArr[15]; | |
@exonsEnd = split(/\s/,$exonsEnd); | |
$SNPlocation = $snpArr[9]; | |
for($i = 0; $i < $NumberOfExons; $i++) | |
{ | |
if( $SNPlocation - $exonsStart[$i] <= 50 && $SNPlocation <= $exonsEnd[$i]) | |
{ | |
$res[1]++;#increasing the sum of not alternative splised at the begining of the exon | |
} | |
} | |
for($i = 0; $i < $NumberOfExons; $i++) | |
{ | |
if( $exonsEnd[$i] - $SNPlocation <= 50 && $exonsEnd[$i] - $SNPlocation >= 0 ) | |
{ | |
$res[3]++; #increasing the sum of not alternative splised at the end of the exon | |
} | |
} | |
} | |
$SNPline = <SNP>; | |
@snpArr = split(/\s/,$SNPline); | |
$SNPgeneID = $snpArr[4]; | |
$flag = 1; | |
} | |
print OUT "\t\tAlternative splicing\tno Alternative splicing\n"; | |
print OUT "Begining\t".$res[0]."\t".$res[1]."\n"; | |
print OUT "End\t".$res[2]."\t".$res[3]."\n"; | |
#script #6 appendix H | |
#! /usr/bin/perl-w | |
#***********************- pseudo code -************************* | |
# 1. foreach row in table REFSEQ | |
# 1.1 foreach row in ALTEV | |
# 1.1.1 if an alternative splicing event if found within current gene then they are printed to OUT file | |
#*************************************************************** | |
open(ALTEV, "AltEvents") or die "Could not open AltEvents"; # UCSC table of alternative events - size: 91,223 | |
open(REFSEQ, "refseq") or die "Could not open refseq"; # UCSC table of refseq genes - size: 41,748 | |
open(LOG, ">LOG") or die "Could not open LOG"; # log file of program | |
open(OUT, ">AltEventsWithTypeNew") or die "Could not open AltEventsWithTypeNew"; # output file | |
# print header of result table | |
print OUT "chrom\trefseqGeneChromStart\trefseqGeneChromEnd\tEmptyPlaceHolder\tEmptyPlaceHolder\tgeneID\trefseqcore\tstrand\trefseqcodingStart\trefseqcodingEnd\ttranscriptScore\tEmptyPlaceHolder\trefseqTranscriptStart\trefseqTranscriptEnd\trefseqExonCount\trefseqExonStarts\trefseqExonEnds\tsecondName\n"; | |
print OUT "!\taltEventChromStart\taltEventChromEnd\taltEventType\n"; | |
# column variables for each line in REFSEQ table | |
$bin; # Indexing field to speed chromosome range queries. | |
$geneID; # Name of gene (usually transcript_id from GTF) | |
$chrom; # Reference sequence chromosome or scaffold | |
$strand; # + or - for strand | |
$txStart; # Transcription start position | |
$txEnd; # Transcription end position | |
$cdsStart; # Coding region start | |
$cdsEnd; # Coding region end | |
$exonCount; # Number of exons | |
$exonStarts; # Exon start positions | |
$exonEnds; # Exon end positions | |
$score; # score | |
$secondName; # Alternate name (e.g. gene_id from GTF) | |
$line = <REFSEQ>; # jump over header in top of the table | |
$line = <REFSEQ>; | |
while (defined ($line)) # fill data in %refseqMap Map from REFSEQ table | |
{ | |
chomp($line); | |
@LineArray = split(/\t/,$line); # splitting the line to the columns | |
$bin = $LineArray[0]; | |
$geneID = $LineArray[1]; | |
$chrom = $LineArray[2]; | |
$strand = $LineArray[3]; | |
$txStart = $LineArray[4]; | |
$txEnd = $LineArray[5]; | |
$cdsStart = $LineArray[6]; | |
$cdsEnd = $LineArray[7]; | |
$exonCount = $LineArray[8]; | |
$exonStarts = $LineArray[9]; | |
$exonEnds = $LineArray[10]; | |
$score = $LineArray[11]; | |
$secondName = $LineArray[12]; | |
chomp($geneID); | |
chomp($chrom); | |
chomp($strand); | |
# call helper function that finds all alternative splicing events in current gene and print to outpur file | |
findAlternativeSplicingTypeByLocation($chrom, $txStart, $txEnd, $strand); | |
$line = <REFSEQ>; # take next line from table | |
} | |
close(ALTEV); | |
close(REFSEQ); | |
close(LOG); | |
close(OUT); | |
#findAlternativeSplicingTypeByLocation( $chrom, $chromStart, $chromEnd, $strand) | |
sub findAlternativeSplicingTypeByLocation | |
{ | |
seek(ALTEV, 0, SEEK_SET); # move file cursor to start of file | |
my $i_chrom = $_[0]; # input parameter | |
my $i_chromStart = $_[1]; # input parameter | |
my $i_chromEnd = $_[2]; # input parameter | |
my $i_strand = $_[3]; # input parameter | |
my @i_LineArray; | |
# column variables for each line in ALTEV table | |
my $tempChrom; | |
my $tempChromStart; | |
my $tempChromEnd; | |
my $tempStrand; | |
my $tempAltType; # type of alternative splicing event - more explantions in attached file 'Alternative Splicing type UCSC' | |
my $firstMatchFound = 1; | |
print LOG "\nfindAlternativeSplicingTypeByLocation ( ".$i_chrom." , ".$i_chromStart." , ".$i_chromEnd." , ".$i_strand. " )\n\n"; | |
my $i_line = <ALTEV>; # jump over header in top of the table | |
my $i_line = <ALTEV>; | |
while (defined ($i_line)) # 1.2. find according to input location parameters the matching alternative splicing events | |
{ | |
@i_LineArray = split(/\t/,$i_line); # splitting the line to the columns | |
$tempChrom = $i_LineArray[1]; | |
$tempChromStart = $i_LineArray[2]; | |
$tempChromEnd = $i_LineArray[3]; | |
$tempStrand = $i_LineArray[6]; | |
$tempAltType = $i_LineArray[4]; | |
chomp($tempChrom); | |
chomp($tempStrand); | |
if (($i_chrom eq $tempChrom) && ($i_strand eq $tempStrand)) | |
{ | |
if (($i_chromStart <= $tempChromStart) && ($i_chromEnd >= $tempChromEnd)) # if alternative splicing event whithin gene | |
{ | |
if ($firstMatchFound == 1) # print gene information to output file only if match was found | |
{ | |
$firstMatchFound = 0; | |
# print to OUT the refaeq gene | |
print OUT $chrom."\t".$txStart."\t".$txEnd."\t"."emptyPlaceHolder"."\t"."emptyPlaceHolder"."\t".$geneID."\t".$score."\t".$strand."\t".$cdsStart."\t".$cdsEnd."\t".$score."\t"."emptyPlaceHolder"."\t".$txStart."\t".$txEnd."\t".$exonCount."\t".$exonStarts."\t".$exonEnds."\t".$secondName."\n"; | |
} | |
# print alternative splicing event information | |
print OUT "!\t".$tempChromStart."\t".$tempChromEnd."\t".$tempAltType."\n"; | |
} | |
} | |
$i_line = <ALTEV>; # take next line from table | |
} | |
} | |
#script #7 appendix I | |
#! /usr/bin/perl-w | |
#***********************- Description -*************************************************** | |
# This script finds per each gene the SNP's that are contained within it. | |
# in addition per each match this script finds the relevant alternative splicing events | |
# by overlapping exon and alterantive splicing event. | |
#***************************************************************************************** | |
open(ALTEV, "AltEventsWithTypeNew") or die "Could not open RefSeq_Alt"; # The output from script 1 size: 171,884 | |
open(SNP135, "snp135") or die "Could not open AltEvents"; # UCSC table of synonymous SNPs size: 38,301 | |
open(LOG, ">LOG") or die "Could not open LOG"; # log file of program | |
open(OUT, ">SNP splicing joining new") or die "Could not open AltEventsWithType"; # output file | |
$chrom; # Reference sequence chromosome or scaffold | |
$codigSeqStart; # coding start position | |
$codigSeqEnd; # coding end position | |
$strand; # + or - for strand | |
$transcriptStart; # Transcription start position | |
$transcriptEnd; # Transcription end position | |
$geneID; # Name of gene (usually transcript_id from GTF) | |
$AltEvnetType; # alternative splicing type | |
$numberOfExons; # Number of exons | |
$ExonsStart; # start positions of exons | |
$ExonsEnd; # end positions of exons | |
$line = <ALTEV>; #skip the unrelevent row | |
$line = <ALTEV>; | |
while (defined ($line)) # foreach row in table aternative splicing event | |
{ | |
chomp($line); | |
if ($line =~ /^!/) # if current line holds alternative splicing information - add it to hash | |
{ # hold alternative splicing data in HASH with geneID as key | |
if (defined $AltEventsInformationMap{$geneId}) | |
{ | |
$AltEventsInformationMap{$geneId} = $AltEventsInformationMap{$geneId} . $line . "\n"; | |
} | |
else | |
{ | |
$AltEventsInformationMap{$geneId} = $line . "\n"; | |
} | |
} | |
else # if current line holds gene information - call helper function | |
{ | |
@LineArray = split(/\t/,$line); # splitting the line to the columns | |
$chrom = $LineArray[0]; | |
$chromStart = $LineArray[1]; | |
$chromEnd = $LineArray[2]; | |
$geneId = $LineArray[5]; | |
$strand = $LineArray[7]; | |
$transcriptStart = $LineArray[12]; | |
$transcriptEnd = $LineArray[13]; | |
$AltEvnetType = $LineArray[11]; | |
$codigSeqStart = $LineArray[8]; | |
$codigSeqEnd = $LineArray[9]; | |
$numberOfExons = $LineArray[14]; | |
$ExonsStart = $LineArray[15]; | |
$ExonsEnd = $LineArray[16]; | |
# hold gene data in HASH with geneID as key | |
$RefaeqGenesMap{$geneId} = $chrom."\t".$strand."\t".$codigSeqStart."\t".$codigSeqEnd."\t".$transcriptStart."\t".$transcriptEnd."\t".$geneId."\t".$numberOfExons."\t".$ExonsStart."\t".$ExonsEnd."\t".$AltEvnetType."\t".$chromStart."\t".$chromEnd; | |
} | |
$line = <ALTEV>; # take next line from table | |
} | |
$line = <SNP135>; | |
$line = <SNP135>; #skip unrelevent rows | |
$line = <SNP135>; | |
# print output file headers | |
print OUT "chrom chromStart chromEnd strand geneId codigSeqStart codigSeqEnd transcriptStart transcriptEnd SNPlocation Allels SNPId alternativeSplicingType numberOfExons ExonsStart ExonsEnd\n"; | |
print OUT "!\taltEventChromStart\taltEventChromEnd\taltEventType\n"; | |
while (defined ($line)) # foreach row in table SNP | |
{ | |
chomp($line); | |
@LineArray2 = split(/\t/,$line); # splitting the line to the columns | |
$chrom = $LineArray2[1]; | |
$SNPlocation = $LineArray2[2]; | |
$SNPId = $LineArray2[4]; | |
$strand = $LineArray2[6]; | |
$Allels = $LineArray2[9]; | |
# call helper function to find the genes that are relevant to current SNP | |
findSnpInAlternativeSplicingEvent($chrom, $strand, $SNPlocation,$SNPId,$Allels); | |
$line = <SNP135>; | |
} | |
close(OUT); | |
#findAlternativeSplicingTypeByLocation($chrom, $strand, $SNPlocation,$SNPId,$Allels) | |
sub findSnpInAlternativeSplicingEvent | |
{ | |
my $chrom = $_[0]; | |
my $strand = $_[1]; | |
my $SNPlocation = $_[2]; | |
my $SNPId = $_[3]; | |
my $Allels = $_[4]; | |
foreach $geneId(%RefaeqGenesMap) # foreach gene | |
{ | |
$geneInfo = $RefaeqGenesMap{$geneId}; | |
chomp($geneInfo); | |
@AltArray = split(/\t/,$geneInfo); # splitting the line of every value in a line of the map | |
if($AltArray[0] eq $chrom) | |
{ | |
if($AltArray[1] eq $strand) | |
{ | |
if(($AltArray[2] <= $SNPlocation) && ($AltArray[3] >= $SNPlocation)) # if snp is within the gene | |
{ | |
print OUT $AltArray[0]."\t".$AltArray[11]."\t".$AltArray[12]."\t".$AltArray[1]."\t".$AltArray[6]."\t".$AltArray[2]."\t".$AltArray[3]."\t".$AltArray[4]."\t".$AltArray[5]."\t".$SNPlocation."\t".$Allels."\t".$SNPId."\t".$AltArray[10]."\t".$AltArray[7]."\t".$AltArray[8]."\t".$AltArray[9]."\n"; | |
# find the relevant alterantive splicing events of current gene + SNP match | |
findGeneAlternativeSplicingEventsRelatedToSNP($geneId, $SNPlocation); | |
} | |
} | |
} | |
} | |
} | |
#findGeneAlternativeSplicingEventsRelatedToSNP($geneId, $SNPlocation) | |
sub findGeneAlternativeSplicingEventsRelatedToSNP | |
{ | |
my $geneID = $_[0]; | |
my $SNPlocation = $_[1]; | |
my $altEventsInfo = $AltEventsInformationMap{$geneId}; | |
my $geneInfo = $RefaeqGenesMap{$geneId}; | |
my @geneLineArray = split(/\t/,$geneInfo); # splitting the line to the columns | |
my @altEventsArray = split(/\n/,$altEventsInfo); | |
my $altEventsArraySize = @altEventsArray; | |
my $numberOfExons = $geneLineArray[7]; | |
my $ExonsStart = $geneLineArray[8]; | |
my $ExonsEnd = $geneLineArray[9]; | |
my @ExonStartArray = split(/,/,$ExonsStart); # splitting the exon start positions | |
my @ExonEndArray = split(/,/,$ExonsEnd); # splitting the exon end positions | |
# foreach exon start and end location check if SNP is in the first 50 bp or last 50 bp of exon | |
for (my $i = 0; $i < $numberOfExons; $i++) | |
{ | |
chomp($ExonStartArray[$i]); # chomp current exon's start position | |
chomp($ExonEndArray[$i]); # chomp current exon's end position | |
my $ExonStartPos = $ExonStartArray[$i]; | |
my $ExonEndPos = $ExonEndArray[$i]; | |
# if SNP is in range of some exon | |
if (($SNPlocation >= $ExonStartPos) && ($SNPlocation <= $ExonEndPos)) | |
{ | |
#foreach alternative splicing event - print to output the relevant alternative splicing events of gene and SNP | |
for (my $j = 0; $j < $altEventsArraySize; $j++) | |
{ | |
my $altEvent = @altEventsArray[$j]; | |
my @altEventArray = split(/\t/,$altEvent); | |
my $altEventChromStart = $altEventArray[1]; | |
my $altEventChromEnd = $altEventArray[2]; | |
# if snp is related to alternative splicing then print it to output | |
if (($altEventChromStart <= $ExonStartPos) && ($ExonEndPos <= $altEventChromEnd)) | |
{ | |
print OUT $altEvent . "\n"; | |
} | |
elsif (($altEventChromStart >= $ExonStartPos) && ($ExonEndPos >= $altEventChromEnd)) | |
{ | |
print OUT $altEvent . "\n"; | |
} | |
elsif (($altEventChromStart >= $ExonStartPos) && ($ExonStartPos <= $altEventChromEnd)) | |
{ | |
print OUT $altEvent . "\n"; | |
} | |
elsif (($altEventChromStart >= $ExonEndPos) && ($ExonEndPos <= $altEventChromEnd)) | |
{ | |
print OUT $altEvent . "\n"; | |
} | |
else | |
{ | |
#nothing | |
} | |
} | |
} | |
} | |
} | |
#script #8 appendix J | |
#! /usr/bin/perl-w | |
#***********************- Description -******************************************************* | |
# This script filters the genes by SNPs that are in the first or last 3 positions in an exon. | |
#********************************************************************************************* | |
# joining of UCSC tables that hold splicing variants with SNP within - size: 29,788 | |
open(SNPSPL, "SNP splicing joining new") or die "Could not open SNP splicing joining"; | |
open(OUT, ">SNP splicing joining exon filtered new") or die "Could not open SNP splicing joining exon filtered"; # output file | |
open(LOG, ">LOG") or die "Could not open LOG"; # log file of program | |
$line = <SNPSPL>; # get header of input file | |
$BASE_PAIR_OFFSET = 3; # base pair offset at exon start and end search SNP | |
# print header of result table | |
chop($line); | |
print OUT $line."\tSNPInExonNumber\tSNPoffsetInExon\tSNPStartOrEndExon\n"; | |
print OUT "!\taltEventChromStart\taltEventChromEnd\taltEventType\n"; | |
# column variables for each line in SNPSPL table | |
$chrom; # Reference sequence chromosome or scaffold | |
$chromStart; # chrom position start | |
$chromEnd; # chrom position end | |
$strand; # + or - for strand | |
$geneId; # Name of gene (usually transcript_id from GTF) | |
$codigSeqStart; # Coding region start | |
$codigSeqEnd; # Coding region end | |
$transcriptStart; # Transcription start position | |
$transcriptEnd; # Transcription end position | |
$SNPlocation; # SNP location | |
$Allels; # observed nucleutide allele | |
$SNPId; # unique SNP ID | |
$alternativeSplicingType; # type of alternative splicing event | |
$numberOfExons; # Number of exons | |
$ExonsStart; # Exon start positions | |
$ExonsEnd; # Exon end positions | |
$line = <SNPSPL>; | |
$line = <SNPSPL>; | |
$isInFirstLastThreeNuc = 0; | |
while (defined ($line)) # 1. foreach row in table SNPSPL | |
{ | |
if( $line =~ /^chr/) | |
{ | |
$isInFirstLastThreeNuc = 0; | |
# 1.1. extract exon data of alt splicing variant and SNP location | |
chomp($line); | |
@LineArray = split(/\t/,$line); # splitting the line to the columns | |
$chrom = $LineArray[0]; | |
$chromStart = $LineArray[1]; | |
$chromEnd = $LineArray[2]; | |
$strand = $LineArray[3]; | |
$geneId = $LineArray[4]; | |
$codigSeqStart = $LineArray[5]; | |
$codigSeqEnd = $LineArray[6]; | |
$transcriptStart = $LineArray[7]; | |
$transcriptEnd = $LineArray[8]; | |
$SNPlocation = $LineArray[9]; | |
$Allels = $LineArray[10]; | |
$SNPId = $LineArray[11]; | |
$alternativeSplicingType = $LineArray[12]; | |
$numberOfExons = $LineArray[13]; | |
$ExonsStart = $LineArray[14]; | |
$ExonsEnd = $LineArray[15]; | |
chomp($numberOfExons); | |
chomp($SNPlocation); | |
$StartOrEnd; | |
# print LOG "numberOfExons = ".$numberOfExons."\n"; | |
@ExonStartArray = split(/,/,$ExonsStart); # splitting the exon start positions | |
@ExonEndArray = split(/,/,$ExonsEnd); # splitting the exon end positions | |
# 1.2. foreach exon start and end location check if SNP is in the first 50 bp or last 50 bp of exon | |
for ($i = 0; $i < $numberOfExons; $i++) | |
{ | |
chomp($ExonStartArray[$i]); # chomp current exon's start position | |
chomp($ExonEndArray[$i]); # chomp current exon's end position | |
$ExonStartPos = $ExonStartArray[$i]; | |
$ExonEndPos = $ExonEndArray[$i]; | |
# print LOG "SNPlocation = ".$SNPlocation."\n"; | |
# print LOG "ExonStartPos = ".$ExonStartPos."\n"; | |
# print LOG "ExonEndPos = ".$ExonEndPos."\n"; | |
# 1.2.1. if SNP is in range of some exon | |
if (($SNPlocation >= $ExonStartPos) && ($SNPlocation <= ($ExonStartPos + $BASE_PAIR_OFFSET))) | |
{ | |
# 1.2.1.1 print to OUT the variant + exon number that SNP is in the range of | |
$isInFirstLastThreeNuc = 1; | |
$SNPoffset = $SNPlocation - $ExonStartPos; | |
# if the strand is from 3' to 5' or the other way around | |
if( $strand eq "+") | |
{ | |
$SNPoffset = "+".$SNPoffset; | |
$StartOrEnd = "StartOfExon"; | |
} | |
else | |
{ | |
$SNPoffset = "-".$SNPoffset; | |
$StartOrEnd = "EndOfExon"; | |
} | |
print OUT $line."\t".($i+1)."\t".$SNPoffset."\t".$StartOrEnd."\n"; | |
} | |
elsif (($SNPlocation >= ($ExonEndPos - $BASE_PAIR_OFFSET)) && ($SNPlocation <= $ExonEndPos)) | |
{ | |
$isInFirstLastThreeNuc = 1; | |
$SNPoffset = $ExonEndPos - $SNPlocation; | |
# if the strand is from 3' to 5' or the other way around | |
if( $strand eq "+") | |
{ | |
$SNPoffset = "-".$SNPoffset; | |
$StartOrEnd = "EndOfExon"; | |
} | |
else | |
{ | |
$SNPoffset = "+".$SNPoffset; | |
$StartOrEnd = "StartOfExon"; | |
} | |
print OUT $line."\t".($i+1)."\t".$SNPoffset."\t".$StartOrEnd."\n"; | |
} | |
} | |
} | |
# prints the lines that represent alternative splaicing event if there is a SNP in the first or last 3 Nucleotides | |
else | |
{ | |
if($isInFirstLastThreeNuc == 1) | |
{ | |
print OUT $line; | |
} | |
} | |
$line = <SNPSPL>; # take next line from table | |
} | |
close(SNPSPL); | |
close(OUT); | |
close(LOG); | |
#script #9 appendix K | |
#! /usr/bin/perl-w | |
#***********************- Description -******************************************************* | |
#this script compute the first test by creating 132 tables that combains alernative splicing | |
#types (6), allel variant (11) and location of the snp ( start or end of exon). | |
#********************************************************************************************* | |
open(IN, "SNP splicing joining exon filtered new") or die "Could not open SNP splicing joining exon filtered new"; | |
open(OUT, ">alt_typ_allel_res") or die "Could not openalt_typ_allel_res"; # output file | |
open(YES, ">alt_typ_allel_res_yes") or die "Could not openalt_typ_allel_res"; # output file | |
$line = <IN>; | |
$line = <IN>; | |
$Allels; # observed nucleutide allele | |
$alternativeSplicingType; # type of alternative splicing event | |
$SNPStartOrEndExon; # location of the SNP beginning or end of the exon | |
$counter = 0; | |
# construct chi square table with p.v. = 0.05 | |
%ChiSquareValue = (); # key-d.o.f value-chi square value | |
$ChiSquareValue{1} = 3.84; | |
$ChiSquareValue{2} = 5.99; | |
$ChiSquareValue{3} = 7.81; | |
$ChiSquareValue{4} = 9.49; | |
$ChiSquareValue{5} = 11.1; | |
$ChiSquareValue{6} = 12.6; | |
$ChiSquareValue{7} = 14.1; | |
$ChiSquareValue{8} = 15.5; | |
$ChiSquareValue{9} = 16.9; | |
$ChiSquareValue{10} = 18.3; | |
$ChiSquareValue{11} = 19.7; | |
$ChiSquareValue{12} = 21; | |
$ChiSquareValue{13} = 22.4; | |
$ChiSquareValue{14} = 23.7; | |
$ChiSquareValue{15} = 25; | |
$ChiSquareValue{16} = 26.3; | |
$ChiSquareValue{17} = 27.6; | |
$ChiSquareValue{18} = 28.9; | |
$ChiSquareValue{19} = 30.1; | |
$ChiSquareValue{20} = 31.4; | |
$ChiSquareValue{21} = 32.7; | |
$ChiSquareValue{22} = 33.9; | |
$ChiSquareValue{23} = 35.2; | |
$ChiSquareValue{24} = 36.4; | |
$ChiSquareValue{25} = 37.7; | |
$ChiSquareValue{26} = 38.9; | |
$ChiSquareValue{27} = 40.1; | |
$ChiSquareValue{28} = 41.3; | |
$ChiSquareValue{29} = 42.6; | |
$ChiSquareValue{30} = 43.8; | |
@AltTyps; | |
$AltTyps[0] = "altFinish"; | |
$AltTyps[1] = "cassetteExon"; | |
$AltTyps[2] = "retainedIntron"; | |
$AltTyps[3] = "bleedingExon"; | |
$AltTyps[4] = "altThreePrime"; | |
$AltTyps[5] = "altFivePrime"; | |
$AltTypsLen = @AltTyps; | |
@SNPvariations; | |
$SNPvariations[0] = "A/T"; | |
$SNPvariations[1] = "A/G"; | |
$SNPvariations[2] = "A/C"; | |
$SNPvariations[3] = "G/T"; | |
$SNPvariations[4] = "C/G"; | |
$SNPvariations[5] = "C/T"; | |
$SNPvariations[6] = "A/G/T"; | |
$SNPvariations[7] = "A/C/G"; | |
$SNPvariations[8] = "A/C/T"; | |
$SNPvariations[9] = "C/G/T"; | |
$SNPvariations[10] = "A/C/G/T"; | |
$SNPvariationsLen = @SNPvariations; | |
#saves the value of each cell in the obs chart | |
@ObstableArr; | |
for($k = 0; $k < 4; $k++) | |
{ | |
$ObstableArr[$k] = 0; | |
} | |
$NumOfTests = 0; | |
@BeginningOrEndOfExonArr; | |
$BeginningOrEndOfExonArr[0] = "StartOfExon"; | |
$BeginningOrEndOfExonArr[1] = "EndOfExon"; | |
for($i = 0; $i < $AltTypsLen; $i++) | |
{ | |
for($j = 0; $j < $SNPvariationsLen; $j++) | |
{ | |
$line = <IN>; # get the line of the gene | |
#preforming the twice one for the beginning of the exon and ones for the end | |
for($k = 0; $k < 2; $k++) | |
{ | |
$NumOfTests++; | |
while(defined $line) | |
{ | |
chomp($line); | |
@LineArray = split(/\t/,$line); # splitting the line to the columns | |
$Allels = $LineArray[10]; | |
$SNPStartOrEndOfExon = $LineArray[18]; | |
chomp($SNPStartOrEndOfExon); | |
$line = <IN>; # get the line of the splicing variant | |
while($line =~ /^!/) | |
{ | |
@LineAltTypeArray = split(/\t/,$line); # splitting the line to the columns | |
$alternativeSplicingType = $LineAltTypeArray[3]; | |
chomp($alternativeSplicingType); | |
if(( $Allels eq $SNPvariations[$j]) && ($alternativeSplicingType eq $AltTyps[$i])&& ($SNPStartOrEndOfExon eq $BeginningOrEndOfExonArr[$k])) | |
{ | |
#increasing the right cell of the obs table | |
$ObstableArr[0]++; | |
$sumOfOBSInstances++; | |
} | |
elsif(( $Allels eq $SNPvariations[$j]) && ($alternativeSplicingType ne $AltTyps[$i])&& ($SNPStartOrEndOfExon eq $BeginningOrEndOfExonArr[$k])) | |
{ | |
#increasing the right cell of the obs table | |
$ObstableArr[1]++; | |
$sumOfOBSInstances++; | |
} | |
elsif(( $Allels ne $SNPvariations[$j]) && ($alternativeSplicingType eq $AltTyps[$i])&& ($SNPStartOrEndOfExon eq $BeginningOrEndOfExonArr[$k])) | |
{ | |
#increasing the right cell of the obs table | |
$ObstableArr[2]++; | |
$sumOfOBSInstances++; | |
} | |
elsif(($SNPStartOrEndOfExon eq $BeginningOrEndOfExonArr[$k])) | |
{ | |
#( $Allels ne $SNPvariations[$j]) && ($alternativeSplicingType ne $AltTyps[$i])&& | |
#increasing the right cell of the obs table | |
$ObstableArr[3]++; | |
$sumOfOBSInstances++; | |
} | |
$line = <IN>; | |
} | |
} | |
close(IN); | |
open(IN, "SNP splicing joining exon filtered new") or die "Could not open SNP splicing joining exon filtered new"; | |
$line = <IN>; | |
$line = <IN>; | |
if($sumOfOBSInstances != 0) | |
{ | |
#printing the OBS table data | |
print OUT "statistic test number\t".$NumOfTests."\t".$AltTyps[$i]."\t".$SNPvariations[$j]."\t".$BeginningOrEndOfExonArr[$k]."\n"; | |
print OUT "OBS table\n"; | |
print OUT "\t\t".$AltTyps[$i]."\t no ".$AltTyps[$i]."\n"; | |
print OUT $SNPvariations[$j]."(".$BeginningOrEndOfExonArr[$k].")\t\t".$ObstableArr[0]."\t\t".$ObstableArr[1]."\n"; | |
print OUT "no ".$SNPvariations[$j]."(".$BeginningOrEndOfExonArr[$k].")\t\t".$ObstableArr[2]."\t\t".$ObstableArr[3]."\n\n"; | |
# compute the data needed for the exp table | |
$SumOfRowWithAllel = $ObstableArr[0] + $ObstableArr[1]; | |
$SumOfRowWIthOutAllel = $ObstableArr[2] + $ObstableArr[3]; | |
$SumOfColWithAS = $ObstableArr[0] + $ObstableArr[2]; | |
$SumOfColWithOutAS = $ObstableArr[1] + $ObstableArr[3]; | |
$expTable[0] = ($SumOfRowWithAllel * $SumOfColWithAS)/$sumOfOBSInstances; | |
$expTable[1] = ($SumOfRowWithAllel * $SumOfColWithOutAS)/$sumOfOBSInstances; | |
$expTable[2] = ($SumOfRowWIthOutAllel * $SumOfColWithAS)/$sumOfOBSInstances; | |
$expTable[3] = ($SumOfRowWIthOutAllel * $SumOfColWithOutAS)/$sumOfOBSInstances; | |
$expTableHasValueLowerThanFive = 0; | |
if(($expTable[0] < 5) || ($expTable[1] < 5) || | |
($expTable[2] < 5) || ($expTable[3] < 5)) | |
{ # found at least one box in the expTable with value lower than 5 - no correlation | |
$expTableHasValueLowerThanFive = 1; | |
} | |
#prints out the exp table | |
#print OUT "\n exp table\n"; | |
#print OUT "\t\t".$AltTyps[$i]."\t\t no ".$AltTyps[$i]."\n"; | |
#print OUT $SNPvariations[$j]."\t\t".$expTable[0]."\t".$expTable[1]."\n"; | |
#print OUT "no ".$SNPvariations[$j]."\t".$expTable[2]."\t".$expTable[3]."\n\n "; | |
#compute Chi Square value and print result | |
$ChiSquare = 0; | |
for($h = 0; $h < 4; $h++) | |
{ | |
if($expTable[$h] != 0) | |
{ | |
$ChiSquare = ((($ObstableArr[$h] - $expTable[$h])*($ObstableArr[$h] - $expTable[$h]))/$expTable[$h]) + $ChiSquare; | |
#print OUT $ObstableArr[$h]." - ".$expTable[$h]." * ".$ObstableArr[$h]." - ".$expTable[$h]." / ".$expTable[$h]." + ".$ChiSquare."\n"; | |
} | |
} | |
# the dof is always 1 (all the table are 2X2) | |
$dof = 1; | |
print OUT "X^2 value = " . $ChiSquare . "\n"; | |
print OUT "d.o.f = " .$dof."\n"; | |
# chaeck if computed p value is lower than 0.05 | |
if ($dof >= 1 && $dof <= 30) | |
{ | |
# 2.2 there is a correlation between SNP allele and distinct mRNA - print result | |
$passingChiSquareValue = $ChiSquareValue{$dof}; | |
print OUT "P.V = ".$passingChiSquareValue."\n\n"; | |
if ($ChiSquare >= $passingChiSquareValue) | |
{ | |
$counter++; | |
#print YES "statistic test number\t".$NumOfTests."\t".$AltTyps[$i]."\t".$SNPvariations[$j]."\t".$BeginningOrEndOfExonArr[$k]."\n"; | |
print YES "OBS table\n"; | |
print YES "\t\t".$AltTyps[$i]."\t\t no ".$AltTyps[$i]."\n"; | |
print YES $SNPvariations[$j]."(".$BeginningOrEndOfExonArr[$k].")\t\t".$ObstableArr[0]."\t\t".$ObstableArr[1]."\n"; | |
print YES "no ".$SNPvariations[$j]."(".$BeginningOrEndOfExonArr[$k].")\t\t".$ObstableArr[2]."\t\t".$ObstableArr[3]."\n\n"; | |
print YES "X^2 value = " . $ChiSquare . "\n"; | |
print YES "d.o.f = " .$dof."\n"; | |
print YES "P.V = ".$passingChiSquareValue."\n\n"; | |
#print YES "\n exp table\n"; | |
#print YES "\t\t".$AltTyps[$i]."\t\t no ".$AltTyps[$i]."\n"; | |
#print YES $SNPvariations[$j]."\t\t".$expTable[0]."\t".$expTable[1]."\n"; | |
#print YES "no ".$SNPvariations[$j]."\t".$expTable[2]."\t".$expTable[3]."\n\n "; | |
if($expTableHasValueLowerThanFive == 1) | |
{ | |
print OUT "Corralation found: No\n"; | |
} | |
else | |
{ | |
print OUT "Corralation found: Yes\n"; | |
} | |
} | |
else | |
{ | |
print OUT "Corralation found: No\n"; | |
} | |
} | |
print OUT "\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\n"; | |
for($f = 0; $f < 4; $f++) | |
{ | |
$ObstableArr[$f] = 0; | |
} | |
for($f = 0; $f < 4; $f++) | |
{ | |
$expTable[$f] = 0; | |
} | |
$sumOfOBSInstances = 0; | |
$expTableHasValueLowerThanFive = 0; | |
} | |
} | |
} | |
print OUT "number of yes's".$counter." \n"; | |
} | |
#script #10 appendix L | |
#! /usr/bin/perl-w | |
#***********************- Description -******************************************************* | |
# This script finds all the mRNA and EST sequences that match a certain SNP and its containing | |
# gene. | |
#********************************************************************************************* | |
# open files | |
open(HUMAN_EST, "human_est_UCSC") or die "Could not open HUMAN_EST"; | |
open(HUMAN_MRNA, "human_mRNA_UCSC") or die "Could not open HUMAN_MRNA"; | |
open(SNPGENES, "SNP splicing joining exon filtered new") or die "Could not open HUMAN_EST"; | |
open(OUT, ">human_mrna_est_UCSC_filtered_by_SNP") or die "human_mrna_est_UCSC_filtered_by_SNP"; | |
#print headers | |
print OUT "\$NM_000000\t\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\n"; | |
print OUT "\@SNIP\t\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\n"; | |
print OUT "chrom\tchromStart\tchromEnd\tstrand\tgeneId\tcodigSeqStart\tcodigSeqEnd\ttranscriptStart\ttranscriptEnd\tSNPlocation\tAllels\tSNPId\talternativeSplicingType\tnumberOfExons\tExonsStart\tExonsEnd\tSNPInExonNumber\tSNPoffsetInExon\tSNPStartOrEndExon\n"; | |
print OUT "!\taltEventChromStart\taltEventChromEnd\taltEventType\n"; | |
print OUT "> typeOfSeq\tblockNumber\tOffset\t#bin\tmatches\tmisMatches\trepMatches\tnCount\tqNumInsert\tqBaseInsert\ttNumInsert\ttBaseInsert\tstrand\tqName\tqSize\tqStart\tqEnd\ttName\ttSize\ttStart\ttEnd\tblockCount\tblockSizes\tqStarts\ttStarts\n\n"; | |
$line = <SNPGENES>; | |
$line = <SNPGENES>; | |
$line = <SNPGENES>; | |
while (defined $line) | |
{ | |
$altEventLines = ""; | |
$matchedSequencesLines = ""; # holds the returned result of all matched sequences | |
@LineArray = split(/\t/,$line); # splitting the line to the columns | |
$geneSNPLine = $line; | |
$geneId = $LineArray[4]; | |
$SNPchrom = @LineArray[0]; | |
$SNPstrand = @LineArray[3]; | |
$SNPlocation = @LineArray[9]; | |
$SNPID = @LineArray[11]; | |
# call helper function that finds all the sequences that match gene and SNP | |
$matchedSequencesLines = findSequencesBySNPLocation($SNPchrom, $SNPstrand, $SNPlocation); | |
# concatinate all alternative events | |
$line = <SNPGENES>; | |
while ($line =~ /^!/) | |
{ | |
$altEventLines = $altEventLines . $line; | |
$line = <SNPGENES>; | |
} | |
chomp($altEventLines); | |
chomp($matchedSequencesLines); | |
# print results to output file | |
print OUT "\$". $geneId ."\t\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\n"; | |
print OUT "\@". $SNPID ."\t\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\n"; | |
print OUT $geneSNPLine; | |
print OUT $altEventLines . "\n"; | |
print OUT $matchedSequencesLines . "\n"; | |
} | |
close(HUMAN_EST); | |
close(HUMAN_MRNA); | |
close(SNPGENES); | |
close(OUT); | |
#findSequencesBySNPLocation($chrom, $strand, $SNPlocation) | |
sub findSequencesBySNPLocation() | |
{ | |
seek(HUMAN_EST, 0, SEEK_SET); # move file cursor to start of file | |
seek(HUMAN_MRNA, 0, SEEK_SET); # move file cursor to start of file | |
my $i_chrom = $_[0]; | |
my $i_strand = $_[1]; | |
my $i_SNPlocation = $_[2]; | |
# data structure to hold sequence information | |
my $matchedSequencesResult; | |
my @LineArray; | |
my $chrom; | |
my $strand; | |
my $tStart; | |
my $tEnd; | |
my $numberOfBlocks; | |
my $blocksSize; | |
my @blocksSize; | |
my $blockStart; | |
my @blocksStart; | |
my $qName; | |
my $mRNALine = <HUMAN_MRNA>; | |
$mRNALine = <HUMAN_MRNA>; | |
while (defined $mRNALine) # loop over mRNA sequences | |
{ | |
my @LineArray = split(/\t/,$mRNALine); # splitting the line to the columns | |
my $chrom = @LineArray[14]; | |
my $strand = @LineArray[9]; | |
my $tStart = @LineArray[16]; | |
my $tEnd = @LineArray[17]; | |
my $numberOfBlocks = @LineArray[18]; | |
my $blocksSize = @LineArray[19]; | |
my @blocksSize = split(/,/,$blocksSize); | |
my $blockStart = @LineArray[21]; | |
my @blocksStart = split(/,/,$blockStart); | |
my $qName = @LineArray[10]; | |
if( $chrom eq $i_chrom) #checks if the SNP is in the same chromosome as the mRNA | |
{ | |
if($strand eq $i_strand) #checks if the SNP is in the same strand as the mRNA | |
{ | |
if(($tStart <= $i_SNPlocation) && ($i_SNPlocation <= $tEnd)) #checks if the SNP is in the range of the mRNA | |
{ | |
for(my $i = 0 ; $i < $numberOfBlocks ; $i++) #for each block in the mRNA | |
{ | |
my $blockEnd = $blocksStart[$i] + $blocksSize[$i]; | |
if(($blocksStart[$i] <= $i_SNPlocation) && ($blockEnd >= $i_SNPlocation)) # checks if the SNP is in the range of a block of the mRNA | |
{ | |
my $offsetSnp = $i_SNPlocation - $blocksStart[$i]; # calculates the location of the snp in the mRNA block | |
my $index = $i + 1; # the block that the SNP is in | |
$matchedSequencesResult = $matchedSequencesResult . "> mRNA\t".$index."\t".$offsetSnp."\t".$mRNALine; | |
break; | |
} | |
} | |
} | |
} | |
} | |
$mRNALine = <HUMAN_MRNA>; | |
} | |
my $estLine = <HUMAN_EST>; | |
$estLine = <HUMAN_EST>; | |
while (defined $estLine) # loop over EST sequences | |
{ | |
my @LineArray = split(/\t/,$estLine); # splitting the line to the columns | |
my $chrom = @LineArray[14]; | |
my $strand = @LineArray[9]; | |
my $tStart = @LineArray[16]; | |
my $tEnd = @LineArray[17]; | |
my $numberOfBlocks = @LineArray[18]; | |
my $blocksSize = @LineArray[19]; | |
my @blocksSize = split(/,/,$blocksSize); | |
my $blockStart = @LineArray[21]; | |
my @blocksStart = split(/,/,$blockStart); | |
my $qName = @LineArray[10]; | |
if( $chrom eq $i_chrom) #checks if the SNP is in the same chromosome as the EST | |
{ | |
if($strand eq $i_strand) #checks if the SNP is in the same strand as the EST | |
{ | |
if(($tStart <= $i_SNPlocation) && ($i_SNPlocation <= $tEnd)) #checks if the SNP is in the range of the EST | |
{ | |
for(my $i = 0 ; $i < $numberOfBlocks ; $i++) #for each block in the EST | |
{ | |
my $blockEnd = $blocksStart[$i] + $blocksSize[$i]; | |
if(($blocksStart[$i] <= $i_SNPlocation) && ($blockEnd >= $i_SNPlocation)) # checks if the SNP is in the range of a block of the EST | |
{ | |
my $offsetSnp = $i_SNPlocation - $blocksStart[$i]; # calculates the location of the snp in the EST block | |
my $index = $i + 1; # the block that the SNP is in | |
$matchedSequencesResult = $matchedSequencesResult . "> EST\t".$index."\t".$offsetSnp."\t".$estLine; | |
break; | |
} | |
} | |
} | |
} | |
} | |
$estLine = <HUMAN_EST>; | |
} | |
return $matchedSequencesResult; | |
} | |
#script #11 appendix M | |
#! /usr/bin/perl-w | |
#***********************- Description -************************* | |
# This script uses the computed block number and SNP offset to extract | |
# the SNP allele from the sequence. | |
#*************************************************************** | |
#***********************- pseudo code -************************* | |
#1. foreach sequence in the input file (starts with ">") | |
# 1.1 mark the sequence ID that needed from the sequences database in hashtables - mrnaIDHash and estIDHash | |
#2. extract only marked sequences from databse into hashtables - mrnaHash and estHash | |
#3. foreach sequence in the input file (starts with ">") | |
#4. extract the snip allele an print it to the output file | |
#*************************************************************** | |
# openning the required input and output files | |
open(SEQ, "human_mrna_est_UCSC_filtered_by_SNP") or die "Could not open human_mrna_est_UCSC_filtered_by_SNP"; | |
open(EST, "est.fa") or die "Could not open human_ESTs_sequences_UCSC"; | |
open(MRNA, "mrna.fa") or die "Could not open human_mRNA_sequences_UCSC"; | |
open(OUT, ">human_mrna_est_UCSC_filtered_by_SNP_with_allele") or die "human_mrna_est_UCSC_filtered_by_SNP_with_allele"; # output file | |
open(LOG, ">LOG") or die "Could not open LOG"; # log file of program | |
open(TOUT, ">TOUT") or die "Could not open LOG"; | |
# get header of input file | |
$geneIDHeader = <SEQ>; # get header of input file | |
$SNPHeader = <SEQ>; # get header of input file | |
$SNPInfoHeader = <SEQ>; # get header of input filed | |
$AltTypeHeader = <SEQ>; # get header of input filed | |
$seqHeader = <SEQ>; # get header of input file | |
# set header of output file | |
print OUT $geneIDHeader; | |
print OUT $SNPHeader; | |
print OUT "\%Allel\t\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\n"; | |
print OUT $SNPInfoHeader; | |
print OUT $AltTypeHeader; | |
print OUT $seqHeader; | |
print OUT "\n"; | |
$line = <SEQ>; | |
$line = <SEQ>; | |
%mrnaIDHash = (); | |
%estIDHash = (); | |
# go over the input file, filter out gene withut alternative splicing events or any sequences | |
# and mark the required sequences for extraction from database | |
while (defined $line) | |
{ | |
if($line =~ /^\$/) | |
{ | |
if ($sequencesOfGeneCounter > 0 && $altEventsOfGeneCounter > 0) | |
{ | |
print TOUT $geneLines; | |
} | |
$sequencesOfGeneCounter = 0; | |
$altEventsOfGeneCounter = 0; | |
$geneLines = ""; | |
} | |
elsif($line =~ /^!/) | |
{ | |
$altEventsOfGeneCounter++; | |
} | |
elsif($line =~ /^>/) | |
{ | |
$sequencesOfGeneCounter++; | |
@SeqInfo = split(/\t/,$line); | |
$seqType = substr($SeqInfo[0], 2); | |
$seqName = $SeqInfo[13]; | |
chomp($seqName); | |
if($seqType eq "mRNA") # mark this sequence ID for extraction from database | |
{ | |
$mrnaIDHash{$seqName} = 1; | |
} | |
elsif($seqType eq "EST") | |
{ | |
$estIDHash{$seqName} = 1; | |
} | |
} | |
$geneLines = $geneLines . $line; | |
$line = <SEQ>; | |
} | |
# fill hash table with marked mRNA sequences for extraction | |
%mrnaHash = (); | |
$line = <MRNA>; | |
while (defined $line) | |
{ | |
if($line =~ /^>/) | |
{ | |
$sequenceID = $line; | |
$seq = ""; | |
$line = <MRNA>; | |
while(defined $line) | |
{ | |
if($line =~ /^>/) | |
{ | |
@LineArray = split(/ /,$sequenceID); | |
$tempID = $LineArray[0]; | |
$tempID = substr($tempID, 1); | |
chomp($tempID); | |
if (defined $mrnaIDHash{$tempID}) # if this sequence was marked for extraction | |
{ | |
print LOG "mrnaHash:" . $tempID . "\n"; | |
$mrnaHash{$tempID} = $seq; | |
} | |
last; | |
} | |
else | |
{ | |
chomp($line); | |
$seq = $seq . $line; | |
} | |
$line = <MRNA>; | |
} | |
} | |
} | |
%mrnaIDHash = (); | |
# fill hash table with marked EST sequences for extraction | |
%estHash = (); | |
$line = <EST>; | |
while (defined $line) | |
{ | |
if($line =~ /^>/) | |
{ | |
$sequenceID = $line; | |
$seq = ""; | |
$line = <EST>; | |
while(defined $line) | |
{ | |
if($line =~ /^>/) | |
{ | |
@LineArray = split(/ /,$sequenceID); | |
$tempID = $LineArray[0]; | |
$tempID = substr($tempID, 1); | |
chomp($tempID); | |
if (defined $estIDHash{$tempID}) # if this sequence was marked for extraction | |
{ | |
print LOG "estHash:" . $tempID . "\n"; | |
$estHash{$tempID} = $seq; | |
} | |
last; | |
} | |
else | |
{ | |
chomp($line); | |
$seq = $seq . $line; | |
} | |
$line = <EST>; | |
} | |
} | |
} | |
%estIDHash = (); | |
close(TOUT); | |
open(TOUT, "TOUT") or die "Could not open TOUT"; | |
$line = <TOUT>; | |
while (defined $line) | |
{ | |
chomp($line); | |
print OUT $line."\n"; | |
if($line =~ /^>/) # foreach sequence - extract the smip allele from the sequence | |
{ | |
@SeqInfo = split(/\t/,$line); | |
$seqType = substr($SeqInfo[0], 2); | |
$seqName = $SeqInfo[13]; | |
chomp($seqName); | |
$blockNumber = $SeqInfo[1]; | |
$offset = $SeqInfo[2]; | |
$allBlocks = $SeqInfo[23]; | |
@BlockArray = split(/,/,$allBlocks); | |
$SNPBlock = $BlockArray[$blockNumber-1]; | |
$snipIndex = $SNPBlock + $offset - 1; | |
$strand = $SeqInfo[12]; | |
if($seqType eq "mRNA") | |
{ | |
if (defined $mrnaHash{$seqName}) | |
{ | |
$seq = $mrnaHash{$seqName}; # get sequence from hash | |
if($strand eq "-") # if needed reverse the sequence to get opposite strand | |
{ | |
$reversed = reverse $seq; | |
$seq = $reversed; | |
} | |
$snip = substr($seq, $snipIndex+1, 1); # extract snip allele | |
print OUT "\%" . $snip . "\t\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\n"; | |
} | |
} | |
elsif($seqType eq "EST") | |
{ | |
print LOG "if defined: " . $seqName . "\n"; | |
if (defined $estHash{$seqName}) | |
{ | |
$seq = $estHash{$seqName}; # get sequence from hash | |
if($strand eq "-") # if needed reverse the sequence to get opposite strand | |
{ | |
$reversed = reverse $seq; | |
$seq = $reversed; | |
} | |
$snip = substr($seq, $snipIndex+1, 1); # extract snip allele | |
print OUT "\%" . $snip . "\t\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\n"; | |
} | |
} | |
} | |
$line = <TOUT>; | |
} | |
close(SEQ); | |
close(EST); | |
close(MRNA); | |
close(OUT); | |
close(LOG); | |
close(TOUT); | |
#script #12 appendix N | |
#! /usr/bin/perl-w | |
#****************- Description -*********************************************** | |
# This script finds per each sequence the relevant alternative splicing events | |
# that were found at the sequence. | |
# sequences with no type of alternative splicing are not printed out to the output | |
# file. | |
#****************************************************************************** | |
open(IN, "human_mrna_est_UCSC_filtered_by_SNP_with_allele") or die "human_mrna_est_UCSC_filtered_by_SNP_with_allele"; # output file | |
open(OUT, ">human_mrna_est_UCSC_filtered_by_SNP_with_allele_with_splicing_type") or die "human_mrna_est_UCSC_filtered_by_SNP_with_allele"; # output file | |
# get header of input file | |
$geneIDHeader = <IN>; # get header of input file | |
$SNPHeader = <IN>; # get header of input file | |
$SNPAlleleHeader = <IN>; # get header of input filed | |
$SNPInfoHeader = <IN>; # get header of input filed | |
$AltTypeHeader = <IN>; # get header of input filed | |
$seqHeader = <IN>; # get header of input file | |
# set header of output file | |
print OUT $geneIDHeader; | |
print OUT $SNPHeader; | |
print OUT $SNPInfoHeader; | |
print OUT $AltTypeHeader; | |
print OUT $seqHeader; | |
print OUT $SNPAlleleHeader; | |
print OUT "\#\taltEventChromStart\taltEventChromEnd\taltEventType\n"; | |
print OUT "\n"; | |
$line = <IN>; | |
$line = <IN>; | |
$currGeneLine; | |
# go over the input file | |
while (defined $line) | |
{ | |
if($line =~ /^\$/) # new gene block of data | |
{ | |
$currAltEventsCounter = 0; # clear currrent alternative splicing events counter | |
@currAltEventsArray = (); # clear currrent alternative splicing events array | |
print OUT $line; | |
} | |
elsif($line =~ /^\@/) #line of gene information | |
{ | |
print OUT $line; | |
} | |
elsif($line =~ /^chr/) #line of gene information | |
{ | |
$currGeneLine = $line; | |
print OUT $line; | |
} | |
elsif($line =~ /^!/) #line of gene information | |
{ | |
# insert new alternative splicing event into array | |
$currAltEventsArray[$currAltEventsCounter] = $line; | |
$currAltEventsCounter++; | |
print OUT $line; | |
} | |
elsif($line =~ /^>/) #line of gene information | |
{ | |
$currSeqLine = $line; # get sequence line | |
$line = <IN>; # get sequence SNP allele | |
$currAlleleLine = $line; | |
$matchedAltEventForSeq = ""; # init string holding all matched alternative splicing events of sequence | |
# foreach of current genes alternative splicing events | |
for ($i = 0; $i < $currAltEventsCounter; $i++) | |
{ | |
$tempAltEventLine = $currAltEventsArray[$i]; # copy current alternative splicing event | |
# call helper function to check if alternative splicing event is present in current sequence | |
$bool = iSAltEventPresentInSequence($currSeqLine, $tempAltEventLine, $currGeneLine); | |
if ($bool == 1) # if alternative splicing event is present in current sequence | |
{ # concatinate all the relevant results | |
$matchedAltEventForSeq = $matchedAltEventForSeq . "#\t" . $tempAltEventLine; | |
} | |
} | |
if ($matchedAltEventForSeq ne "") # print out all the alternatove splicing events matched to current sequence | |
{ | |
print OUT $currSeqLine; | |
print OUT $currAlleleLine; | |
print OUT $matchedAltEventForSeq; | |
} | |
} | |
$line = <IN>; | |
} | |
close(IN); | |
close(OUT); | |
#iSAltEventPresentInSequence($sequenceLine, $altEventLine, $geneLine) | |
sub iSAltEventPresentInSequence() | |
{ | |
my $i_sequenceLine = $_[0]; | |
my $i_altEventLine = $_[1]; | |
my $i_geneLine = $_[2]; | |
# split input lines to arrays | |
my @seqLineArray = split(/\t/,$i_sequenceLine); | |
my @altEventLineArray = split(/\t/,$i_altEventLine); | |
my @geneLineArray = split(/\t/,$i_geneLine); | |
my $altEventType = $altEventLineArray[3]; | |
chomp($altEventType); | |
#chomp($altEventType); # temp workaround - when running in windows | |
chop($altEventType); # temp workaround - when running in linux | |
if ($altEventType eq "altFinish") | |
{ | |
# check if alt finish event is present in sequence. | |
# the chromEnd of alternative event represents the end of transcript | |
# check if the transcript end position matches the chromEnd of alternative splicing event | |
my $altEventChromStart = $altEventLineArray[1]; | |
my $altEventChromEnd = $altEventLineArray[2]; | |
my $seqTranscriptEnd = $seqLineArray[20]; | |
chomp($altEventChromEnd); | |
chomp($seqTranscriptEnd); | |
if ($seqTranscriptEnd eq $altEventChromEnd) | |
{ | |
return 1; # alt finish found - return true | |
} | |
} | |
elsif ($altEventType eq "cassetteExon") | |
{ | |
# check if sequence has a matching exon to the alternative splicing event positions | |
# the chromStart of altEvent represent exon start position | |
# the chromEnd of altEvent represents exon end position | |
# extract relevant data | |
my $altEventChromStart = $altEventLineArray[1]; | |
my $altEventChromEnd = $altEventLineArray[2]; | |
my $seqTranscriptStart = $seqLineArray[19]; | |
my $seqTranscriptEnd = $seqLineArray[20]; | |
my $numberOfBlocks = $seqLineArray[21]; | |
my $blockSizesLine = $seqLineArray[22]; | |
my $transStartsLine = $seqLineArray[24]; | |
my @blocksSizesArray = split(/,/,$blockSizesLine); | |
my @blocksStartArray = split(/,/,$transStartsLine); | |
chomp($altEventChromStart); | |
chomp($altEventChromEnd); | |
# if alternative splicing event is within the sequence transcription positions | |
if(($seqTranscriptStart <= $altEventChromStart) && ($altEventChromEnd <= $seqTranscriptEnd)) | |
{ | |
# go over all sequence blocks - if the relevant exon doesnt exist in | |
# transcript then cassetteExon event found | |
for (my $j = 0; $j < $numberOfBlocks; $j++) | |
{ | |
# compute block end position | |
my $blockEndPosition = $blocksStartArray[$j] + $blocksSizesArray[$j]; | |
# if alternative splicing event matches the current block - cassette exon wasnt found in sequence | |
if(($blocksStartArray[$j] <= $altEventChromStart) && ($altEventChromEnd <= $blockEndPosition)) | |
{ | |
return 0; # cassette exon wasnt found - return false | |
} | |
elsif(($altEventChromStart <= $blocksStartArray[$j]) && ($blockEndPosition <= $altEventChromEnd)) | |
{ | |
return 0; # cassette exon wasnt found - return false | |
} | |
} | |
} | |
# after looping over sequence blocks, the matching block to the cassette exon was not found - return true | |
return 1; | |
} | |
elsif ($altEventType eq "retainedIntron") | |
{ | |
# check if in the sequence there is a block which matches the positions of intron start and end | |
# the chromStart of altEvent represent intron start position | |
# the chromEnd of altEvent represents intron end position | |
# extract relevant data | |
my $altEventChromStart = $altEventLineArray[1]; | |
my $altEventChromEnd = $altEventLineArray[2]; | |
my $seqTranscriptStart = $seqLineArray[19]; | |
my $seqTranscriptEnd = $seqLineArray[20]; | |
my $numberOfBlocks = $seqLineArray[21]; | |
my $blockSizesLine = $seqLineArray[22]; | |
my $transStartsLine = $seqLineArray[24]; | |
my @blocksSizesArray = split(/,/,$blockSizesLine); | |
my @blocksStartArray = split(/,/,$transStartsLine); | |
chomp($altEventChromStart); | |
chomp($altEventChromEnd); | |
# if alternative splicing event is within the sequence transcription positions | |
if(($seqTranscriptStart <= $altEventChromStart) && ($altEventChromEnd <= $seqTranscriptEnd)) | |
{ | |
# go over all sequence blocks - if the relevant intron is found in | |
# a block then retained intron event is found | |
for (my $j = 0; $j < $numberOfBlocks; $j++) | |
{ | |
# compute block end position | |
my $blockEndPosition = $blocksStartArray[$j] + $blocksSizesArray[$j]; | |
# if alternative splicing event matches the current block - retained intron was found in sequence | |
if(($blocksStartArray[$j] <= $altEventChromStart) && ($altEventChromEnd <= $blockEndPosition)) | |
{ | |
return 1; # retained intron was found - return true | |
} | |
elsif(($altEventChromStart <= $blocksStartArray[$j]) && ($blockEndPosition <= $altEventChromEnd)) | |
{ | |
return 1; # retained intron was found - return true | |
} | |
} | |
} | |
} | |
elsif ($altEventType eq "bleedingExon") | |
{ | |
# check if initail or terminal exon overlap an intron of the exact positions as given by the | |
# chromStart chromEnd positions of the alternative splicing event. | |
# chromStart represent the start position of additional intronic sequence to the exon. | |
# chromEnd represent the end position of additional intronic sequence to the exon. | |
# extract relevant data | |
my $altEventChromStart = $altEventLineArray[1]; | |
my $altEventChromEnd = $altEventLineArray[2]; | |
my $seqTranscriptStart = $seqLineArray[19]; | |
my $seqTranscriptEnd = $seqLineArray[20]; | |
my $numberOfBlocks = $seqLineArray[21]; | |
my $blockSizesLine = $seqLineArray[22]; | |
my $transStartsLine = $seqLineArray[24]; | |
my @blocksSizesArray = split(/,/,$blockSizesLine); | |
my @blocksStartArray = split(/,/,$transStartsLine); | |
chomp($altEventChromStart); | |
chomp($altEventChromEnd); | |
# if alternative splicing event is within the sequence transcription positions | |
if(($seqTranscriptStart <= $altEventChromStart) && ($altEventChromStart <= $seqTranscriptEnd)) | |
{ | |
my $firstBlockIndex = 0; | |
my $lastBlockIndex = $numberOfBlocks - 1; | |
# compute block end positions | |
my $firstBlockEndPosition = $blocksStartArray[$firstBlockIndex] + $blocksSizesArray[$firstBlockIndex]; | |
my $lastBlockEndPosition = $blocksStartArray[$lastBlockIndex] + $blocksSizesArray[$lastBlockIndex]; | |
# if alternative splicing event matches the initail or terminal exon - bleedingExon was found | |
if(($blocksStartArray[$firstBlockIndex] <= $altEventChromStart) && ($altEventChromEnd <= $firstBlockEndPosition)) | |
{ | |
return 1; # bleedingExon was found - return true | |
} | |
elsif(($blocksStartArray[$lastBlockIndex] <= $altEventChromStart) && ($altEventChromEnd <= $lastBlockEndPosition)) | |
{ | |
return 1; # bleedingExon was found - return true | |
} | |
} | |
} | |
elsif ($altEventType eq "altThreePrime") | |
{ | |
# check if in a sequence there is an intron with alternative 3' position. | |
# this affects the 5' position of the adjacent exon. | |
# the chromStart/chromEnd positions of alt Event represent the 5' positions of adjacent exon. | |
# extract relevant data | |
my $currStrand = $geneLineArray[3]; | |
my $altEventChromStart = $altEventLineArray[1]; | |
my $altEventChromEnd = $altEventLineArray[2]; | |
my $seqTranscriptStart = $seqLineArray[19]; | |
my $seqTranscriptEnd = $seqLineArray[20]; | |
my $numberOfBlocks = $seqLineArray[21]; | |
my $blockSizesLine = $seqLineArray[22]; | |
my $transStartsLine = $seqLineArray[24]; | |
my @blocksSizesArray = split(/,/,$blockSizesLine); | |
my @blocksStartArray = split(/,/,$transStartsLine); | |
my $numberOfExons = $geneLineArray[13]; | |
my $exonStartLine = $geneLineArray[14]; | |
my $exonEndLine = $geneLineArray[15]; | |
my @exonStartsArray = split(/,/,$exonStartLine); | |
my @exonEndsArray = split(/,/,$exonEndLine); | |
my $altEventExonStart; | |
my $altEventExonEnd; | |
chomp($altEventChromStart); | |
chomp($altEventChromEnd); | |
# loop over all gene exons and find the matching gene exon | |
for (my $j = 0; $j < $numberOfExons; $j++) | |
{ | |
if (($exonStartsArray[$j] <= $altEventChromStart) && ($altEventChromEnd <= $exonEndsArray[$j])) | |
{ # found the exon in which the alternative splicing event happens | |
$altEventExonStart = $exonStartsArray[$j]; | |
$altEventExonEnd = $exonEndsArray[$j]; | |
} | |
} | |
if ($currStrand eq "+") | |
{ | |
# go over all sequence blocks | |
for (my $j = 0; $j < $numberOfBlocks; $j++) | |
{ | |
# compute block end position | |
my $blockEndPosition = $blocksStartArray[$j] + $blocksSizesArray[$j]; | |
# if the block is smaller then exon due to altThreePrime event in adjacent intron | |
if (($blocksStartArray[$j] eq $altEventChromEnd) && ($blockEndPosition eq $altEventExonEnd)) | |
{ | |
return 1;# altThreePrime was found - return true | |
} | |
} | |
} | |
elsif ($currStrand eq "-") | |
{ | |
# go over all sequence blocks | |
for (my $j = 0; $j < $numberOfBlocks; $j++) | |
{ | |
# compute block end position | |
my $blockEndPosition = $blocksStartArray[$j] + $blocksSizesArray[$j]; | |
# if the block is smaller then exon due to altThreePrime event in adjacent intron | |
if (($blocksStartArray[$j] eq $altEventExonStart) && ($blockEndPosition eq $altEventChromStart)) | |
{ | |
return 1;# altThreePrime was found - return true | |
} | |
} | |
} | |
} | |
elsif ($altEventType eq "altFivePrime") | |
{ | |
# check if in a sequence there is an intron with alternative 5' position. | |
# this affects the 3' position of the adjacent exon. | |
# the chromStart/chromEnd positions of alt Event represents the 3' positions of adjacent exon. | |
# extract relevant data | |
my $currStrand = $geneLineArray[3]; | |
my $altEventChromStart = $altEventLineArray[1]; | |
my $altEventChromEnd = $altEventLineArray[2]; | |
my $seqTranscriptStart = $seqLineArray[19]; | |
my $seqTranscriptEnd = $seqLineArray[20]; | |
my $numberOfBlocks = $seqLineArray[21]; | |
my $blockSizesLine = $seqLineArray[22]; | |
my $transStartsLine = $seqLineArray[24]; | |
my @blocksSizesArray = split(/,/,$blockSizesLine); | |
my @blocksStartArray = split(/,/,$transStartsLine); | |
my $numberOfExons = $geneLineArray[13]; | |
my $exonStartLine = $geneLineArray[14]; | |
my $exonEndLine = $geneLineArray[15]; | |
my @exonStartsArray = split(/,/,$exonStartLine); | |
my @exonEndsArray = split(/,/,$exonEndLine); | |
my $altEventExonStart; | |
my $altEventExonEnd; | |
chomp($altEventChromStart); | |
chomp($altEventChromEnd); | |
# loop over all gene exons and find the matching gene exon | |
for (my $j = 0; $j < $numberOfExons; $j++) | |
{ | |
if (($exonStartsArray[$j] <= $altEventChromStart) && ($altEventChromEnd <= $exonEndsArray[$j])) | |
{ # found the exon in which the alternative splicing event happens | |
$altEventExonStart = $exonStartsArray[$j]; | |
$altEventExonEnd = $exonEndsArray[$j]; | |
} | |
} | |
if ($currStrand eq "+") | |
{ | |
# go over all sequence blocks | |
for (my $j = 0; $j < $numberOfBlocks; $j++) | |
{ | |
# compute block end position | |
my $blockEndPosition = $blocksStartArray[$j] + $blocksSizesArray[$j]; | |
# if the block is smaller then exon due to altThreePrime event in adjacent intron | |
if (($blocksStartArray[$j] eq $altEventExonStart) && ($blockEndPosition eq $altEventChromStart)) | |
{ | |
return 1;# altFivePrime was found - return true | |
} | |
} | |
} | |
elsif ($currStrand eq "-") | |
{ | |
# go over all sequence blocks | |
for (my $j = 0; $j < $numberOfBlocks; $j++) | |
{ | |
# compute block end position | |
my $blockEndPosition = $blocksStartArray[$j] + $blocksSizesArray[$j]; | |
# if the block is smaller then exon due to altFivePrime event in adjacent intron | |
if (($blocksStartArray[$j] eq $altEventChromEnd) && ($blockEndPosition eq $altEventExonEnd)) | |
{ | |
return 1;# altFivePrime was found - return true | |
} | |
} | |
} | |
} | |
return 0; # return false | |
} | |
#script #13 appendix O | |
#! /usr/bin/perl-w | |
#***********************- Description -******************************************************* | |
#this script compute the second test by creating 11 tables (that had the most distinct results) | |
#and checks the corralation between alernative splicing type and a specific allel. | |
#********************************************************************************************* | |
open(IN, "human_mrna_est_UCSC_filtered_by_SNP_with_allele_with_splicing_type") or die "Could not open human_mrna_est_UCSC_filtered_by_SNP_with_allele_with_splicing_type"; # UCSC table of alternative events - size: 91,223 | |
open(OUT, ">statTest4") or die "Could not open statTest4"; # output file | |
@TableArr; | |
$counter = 1; | |
# construct chi square table with p.v. = 0.05 | |
%ChiSquareValue = (); # key-d.o.f value-chi square value | |
$ChiSquareValue{1} = 3.84; | |
$ChiSquareValue{2} = 5.99; | |
$ChiSquareValue{3} = 7.81; | |
$ChiSquareValue{4} = 9.49; | |
$ChiSquareValue{5} = 11.1; | |
$ChiSquareValue{6} = 12.6; | |
$ChiSquareValue{7} = 14.1; | |
$ChiSquareValue{8} = 15.5; | |
$ChiSquareValue{9} = 16.9; | |
$ChiSquareValue{10} = 18.3; | |
$ChiSquareValue{11} = 19.7; | |
$ChiSquareValue{12} = 21; | |
$ChiSquareValue{13} = 22.4; | |
$ChiSquareValue{14} = 23.7; | |
$ChiSquareValue{15} = 25; | |
$ChiSquareValue{16} = 26.3; | |
$ChiSquareValue{17} = 27.6; | |
$ChiSquareValue{18} = 28.9; | |
$ChiSquareValue{19} = 30.1; | |
$ChiSquareValue{20} = 31.4; | |
$ChiSquareValue{21} = 32.7; | |
$ChiSquareValue{22} = 33.9; | |
$ChiSquareValue{23} = 35.2; | |
$ChiSquareValue{24} = 36.4; | |
$ChiSquareValue{25} = 37.7; | |
$ChiSquareValue{26} = 38.9; | |
$ChiSquareValue{27} = 40.1; | |
$ChiSquareValue{28} = 41.3; | |
$ChiSquareValue{29} = 42.6; | |
$ChiSquareValue{30} = 43.8; | |
# an arr that contains the tables that we will preforme the second statistic test on | |
$TableArr[0] = "cassetteExon\ta!g\tEndOfExon"; | |
$TableArr[1] = "cassetteExon\ta!c\tEndOfExon"; | |
$TableArr[2] = "cassetteExon\tc!t\tEndOfExon"; | |
$TableArr[3] = "retainedIntron\ta!g\tEndOfExon"; | |
$TableArr[4] = "altFinish\ta!c!g\tEndOfExon"; | |
$TableArr[5] = "retainedIntron\tc!t\tEndOfExon"; | |
$TableArr[6] = "bleedingExon\ta!g\tEndOfExon"; | |
$TableArr[7] = "bleedingExon\tc!t\tEndOfExon"; | |
$TableArr[8] = "altThreePrime\tc!t\tEndOfExon"; | |
$TableArr[9] = "cassetteExon\tg!t\tStartOfExon"; | |
$TableArr[10] = "altFinish\ta!c\tEndOfExon"; | |
$TableArrLen = @TableArr; | |
for($i = 0; $i < 9; $i++) | |
{ | |
$line = <IN>; | |
} | |
@resTable; | |
for($i = 0; $i < $TableArrLen; $i++) #a loop that pases on the wanted tabel that are contained in tha arr TableArr above | |
{ | |
$sumOfOBSInstances = 0; # a counter for all the items that are in the needed table | |
for($t = 0; $t < 6; $t++) # reset the result table for each table that we are performing the test | |
{ | |
$resTable[$t] = 0; | |
} | |
while(defined $line) | |
{ | |
if($line =~ /^chr/) | |
{ | |
@GeneArray = split(/\t/,$line); # splitting the line to the columns | |
$StartOrEndOfExon = $GeneArray[18]; | |
chomp($StartOrEndOfExon); | |
$line = <IN>; | |
while( $line =~ /^!/) | |
{ | |
$line = <IN>; | |
} | |
} | |
while( $line =~ /^>/) #if the line represents a sequence (mRNA or EST) | |
{ | |
$line = <IN>; | |
$allel = substr($line,1,1); | |
$line = <IN>; | |
$NumberOfAltTypesPerSeq = 0; | |
while( $line =~ /^#/) #taking all the alternative splicing types of a certain sequence | |
{ | |
@LineArray = split(/\t/,$line); # splitting the line to the columns | |
$altType = $LineArray[4]; | |
chomp($altType); | |
$AltTypeArr[$NumberOfAltTypesPerSeq] = $altType; # an arr that contains all the alternative splicing types | |
$NumberOfAltTypesPerSeq++; | |
$line = <IN>; | |
} | |
#taking the data of the current test from the table arr above | |
@TableArrData = split(/\t/,$TableArr[$i]); | |
$altTypeFromTableArr = $TableArrData[0]; | |
$allels = $TableArrData[1]; | |
@allelsArrFromTableArr = split(/!/,$allels); | |
$numberOfAllelsInArr = @allelsArrFromTableArr; | |
$StartOrEndOfExonFromTableArr = $TableArrData[2]; | |
if( $StartOrEndOfExon eq $StartOrEndOfExonFromTableArr) | |
{ | |
for($j = 0; $j < $numberOfAllelsInArr; $j++) | |
{ | |
for($b = 0; $b < $NumberOfAltTypesPerSeq; $b++) #checking if ther is a match for each type of splicing in the mRNA/EST | |
{ | |
if($allel eq $allelsArrFromTableArr[$j]) | |
{ | |
$sumOfOBSInstances++; | |
if( $AltTypeArr[$b] eq $altTypeFromTableArr) | |
{ | |
#calculating the index in the obs table | |
$TableIndex = ($j * 2); | |
#increasing the right cell of the obs table | |
$resTable[$TableIndex]++; | |
} | |
else | |
{ | |
#calculating the index in the obs table | |
$TableIndex = ($j * 2) + 1; | |
#increasing the right cell of the obs table | |
$resTable[$TableIndex]++; | |
} | |
} | |
} | |
} | |
} | |
} | |
$line = <IN>; | |
} | |
print OUT "table number ".$counter." "; | |
$counter++; | |
print OUT $altTypeFromTableArr." "; | |
#compute the exp table | |
for( $c = 0; $c < $numberOfAllelsInArr; $c++) | |
{ | |
print OUT $allelsArrFromTableArr[$c]." "; | |
} | |
print OUT $StartOrEndOfExonFromTableArr."\n\n"; | |
$expTableHasValueLowerThanFive = 0; | |
if($numberOfAllelsInArr == 2) | |
{ | |
$SumOfRowWithFirstAllel = $resTable[0] + $resTable[1]; | |
$SumOfRowWIthSecondAllel = $resTable[2] + $resTable[3]; | |
$SumOfColWithAS = $resTable[0] + $resTable[2]; | |
$SumOfColWithOutAS = $resTable[1] + $resTable[3]; | |
} | |
else | |
{ | |
$SumOfRowWithFirstAllel = $resTable[0] + $resTable[1]; | |
$SumOfRowWIthSecondAllel = $resTable[2] + $resTable[3]; | |
$SumOfRowWIthThridAllel = $resTable[4] + $resTable[5]; | |
$SumOfColWithAS = $resTable[0] + $resTable[2] + $resTable[4]; | |
$SumOfColWithOutAS = $resTable[1] + $resTable[3] + $resTable[5]; | |
} | |
if($numberOfAllelsInArr == 2) | |
{ | |
$expTable[0] = ($SumOfRowWithFirstAllel * $SumOfColWithAS)/$sumOfOBSInstances; | |
$expTable[1] = ($SumOfRowWithFirstAllel * $SumOfColWithOutAS)/$sumOfOBSInstances; | |
$expTable[2] = ($SumOfRowWIthSecondAllel * $SumOfColWithAS)/$sumOfOBSInstances; | |
$expTable[3] = ($SumOfRowWIthSecondAllel * $SumOfColWithOutAS)/$sumOfOBSInstances; | |
if(($expTable[0] < 5) || ($expTable[1] < 5) || | |
($expTable[2] < 5) || ($expTable[3] < 5)) | |
{ # found at least one box in the expTable with value lower than 5 - no correlation | |
$expTableHasValueLowerThanFive = 1; | |
} | |
} | |
else | |
{ | |
$expTable[0] = ($SumOfRowWithFirstAllel * $SumOfColWithAS)/$sumOfOBSInstances; | |
$expTable[1] = ($SumOfRowWithFirstAllel * $SumOfColWithOutAS)/$sumOfOBSInstances; | |
$expTable[2] = ($SumOfRowWIthSecondAllel * $SumOfColWithAS)/$sumOfOBSInstances; | |
$expTable[3] = ($SumOfRowWIthSecondAllel * $SumOfColWithOutAS)/$sumOfOBSInstances; | |
$expTable[4] = ($SumOfRowWIthThridAllel * $SumOfColWithAS) /$sumOfOBSInstances; | |
$expTable[5] = ($SumOfRowWIthThridAllel * $SumOfColWithOutAS) /$sumOfOBSInstances; | |
if(($expTable[0] < 5) || ($expTable[1] < 5) || | |
($expTable[2] < 5) || ($expTable[3] < 5) || | |
($expTable[4] < 5) || ($expTable[5] < 5)) | |
{ # found at least one box in the expTable with value lower than 5 - no correlation | |
$expTableHasValueLowerThanFive = 1; | |
} | |
} | |
#compute Chi Square value and print result | |
$ChiSquare = 0; | |
if($numberOfAllelsInArr == 2) | |
{ | |
for($h = 0; $h < 4; $h++) | |
{ | |
if($expTable[$h] != 0) | |
{ | |
$ChiSquare = ((($resTable[$h] - $expTable[$h])*($resTable[$h] - $expTable[$h]))/$expTable[$h]) + $ChiSquare; | |
} | |
} | |
} | |
else | |
{ | |
for($h = 0; $h < 6; $h++) | |
{ | |
if($expTable[$h] != 0) | |
{ | |
$ChiSquare = ((($resTable[$h] - $expTable[$h])*($resTable[$h] - $expTable[$h]))/$expTable[$h]) + $ChiSquare; | |
} | |
} | |
} | |
#compute the dof | |
$dof = $numberOfAllelsInArr - 1; | |
#printing to the output file | |
print OUT "OBS table\n"; | |
print OUT "\t".$altTypeFromTableArr."\tno ".$altTypeFromTableArr."\n"; | |
$z = 0; | |
for($h = 0; $h < $numberOfAllelsInArr; $h++) | |
{ | |
print OUT $allelsArrFromTableArr[$h]."\t\t".$resTable[$z]."\t\t".$resTable[$z+1]."\n"; | |
$z++; | |
$z++; | |
} | |
#print OUT "EXP table\n"; | |
#print OUT "\t".$altTypeFromTableArr."\tno ".$altTypeFromTableArr."\n"; | |
#$z = 0; | |
#for($h = 0; $h < $numberOfAllelsInArr; $h++) | |
#{ | |
# print OUT $allelsArrFromTableArr[$h]."\t\t".$expTable[$z]."\t\t".$expTable[$z+1]."\n"; | |
# $z++; | |
# $z++; | |
#} | |
print OUT "\n"; | |
print OUT "X^2 value = " . $ChiSquare . "\n"; | |
print OUT "d.o.f = " .$dof."\n"; | |
# chaeck if computed p value is lower than 0.05 | |
if ($dof >= 1 && $dof <= 30) | |
{ | |
# 2.2 there is a correlation between SNP allele and distinct mRNA - print result | |
$passingChiSquareValue = $ChiSquareValue{$dof}; | |
print OUT "P.V = ".$passingChiSquareValue."\n"; | |
if (($ChiSquare >= $passingChiSquareValue) && ($expTableHasValueLowerThanFive == 0)) | |
{ | |
print OUT "Corralation found: Yes\n"; | |
} | |
else | |
{ | |
print OUT "Corralation found: No\n"; | |
} | |
} | |
print OUT "\n"; | |
close(IN); | |
open(IN, "human_mrna_est_UCSC_filtered_by_SNP_with_allele_with_splicing_type") or die "Could not open human_mrna_est_UCSC_filtered_by_SNP_with_allele_with_splicing_type"; # UCSC table of alternative events - size: 91,223 | |
for($k = 0; $k < 9; $k++) | |
{ | |
$line = <IN>; | |
} | |
} | |
close(OUT); | |
close(IN); | |
#script #14 appendix P | |
#! /usr/bin/perl-w | |
#***********************- Description -************************* | |
# this script finds intersection of gene between the 8 tables | |
#*************************************************************** | |
open(IN, "human_mrna_est_UCSC_filtered_by_SNP_with_allele_with_splicing_type") or die "Could not open human_mrna_est_UCSC_filtered_by_SNP_with_allele_with_splirelevancing_type"; # UCSC table of alternative events - size: 91,223 | |
open(OUT, ">GenePerEachStatTest") or die "Could not open statTest4"; # output file | |
@TableArr; | |
$counter = 1; | |
%totalGeneHash = (); | |
$totalGeneCounter = 0; | |
%totalRepeatedGeneHash = (); | |
$totalRepeatedGeneCounter = 0; | |
# an arr that contains the tables that we will preforme the second statistic test on | |
$TableArr[0] = "cassetteExon\tg!t\tStartOfExon"; | |
$TableArr[1] = "altThreePrime\tc!t\tEndOfExon"; | |
$TableArr[2] = "cassetteExon\tc!t\tEndOfExon"; | |
$TableArr[3] = "retainedIntron\ta!g\tEndOfExon"; | |
$TableArr[4] = "altFinish\ta!c!g\tEndOfExon"; | |
$TableArr[5] = "retainedIntron\tc!t\tEndOfExon"; | |
$TableArr[6] = "bleedingExon\ta!g\tEndOfExon"; | |
$TableArr[7] = "bleedingExon\tc!t\tEndOfExon"; | |
$TableArrLen = @TableArr; | |
@GeneSizeArr; # this array will hold the amount of genes for each table that describs above | |
@GenesByTableArr; # the value is the genes that belongs to this TableArr | |
%HashByGeneId; # the key is the Gene Id and the value is the table that he is in | |
%HashForGeneData; # the key is the Gene Id and the value is the alternative splicing type, allel and end of beginninng of the exon | |
#takes line until the relevant line | |
for($i = 0; $i < 9; $i++) | |
{ | |
$line = <IN>; | |
} | |
#a loop that pases on the wanted tabel that are contained in tha array TableArr above | |
for($i = 0; $i < $TableArrLen; $i++) | |
{ | |
$GeneCounter = 0; #will count the genes in each of the tables | |
@TableArrData = split(/\t/,$TableArr[$i]); #holds the relevant information about the current test | |
$altTypeFromTableArr = $TableArrData[0]; | |
$allels = $TableArrData[1]; | |
@allelsArrFromTableArr = split(/!/,$allels); #thhis array holds all the allels that are relevant for the test by splitig the spcific part of the line | |
$numberOfAllelsInArr = @allelsArrFromTableArr; | |
$StartOrEndOfExonFromTableArr = $TableArrData[2]; | |
# priting a titel | |
print OUT "table number ".$counter."\t"; | |
$counter++; | |
print OUT $altTypeFromTableArr." "; | |
for( $c = 0; $c < $numberOfAllelsInArr; $c++) | |
{ | |
print OUT $allelsArrFromTableArr[$c]." "; | |
} | |
print OUT $StartOrEndOfExonFromTableArr."\n\n"; | |
#pessing on all the line in the input file | |
while(defined $line) | |
{ | |
if($line =~ /^chr/) | |
{ | |
@GeneArray = split(/\t/,$line); # splitting the line to the columns | |
$StartOrEndOfExon = $GeneArray[18]; | |
$geneID = $GeneArray[4]; | |
chomp($StartOrEndOfExon); | |
$GeneLine = $line; | |
$curGeneLine = ""; | |
$line = <IN>; | |
while( $line =~ /^!/) | |
{ | |
$line = <IN>; | |
} | |
} | |
while( $line =~ /^>/) #if the line represents a sequence (mRNA or EST) | |
{ | |
$line = <IN>; | |
$allel = substr($line,1,1); | |
$line = <IN>; | |
$NumberOfAltTypesPerSeq = 0; | |
while( $line =~ /^#/) #taking all the alternative splicing types of a certain sequence | |
{ | |
@LineArray = split(/\t/,$line); # splitting the line to the columns | |
$altType = $LineArray[4]; | |
chomp($altType); | |
$AltTypeArr[$NumberOfAltTypesPerSeq] = $altType; # an arr that contains all the alternative splicing types | |
$NumberOfAltTypesPerSeq++; | |
$line = <IN>; | |
} | |
# checks if the location of the SNP in the exon is compatible to the location in the test information from the array above | |
if( $StartOrEndOfExon eq $StartOrEndOfExonFromTableArr) | |
{ | |
#checks if the the allel appears in one of the relevant allels of the test | |
for($j = 0; $j < $numberOfAllelsInArr; $j++) | |
{ | |
#checks if the the alternative splicing type appears in one of types of the sequence | |
for($b = 0; $b < $NumberOfAltTypesPerSeq; $b++) #checking if ther is a match for each type of splicing in the mRNA/EST | |
{ | |
if($allel eq $allelsArrFromTableArr[$j]) | |
{ | |
# counter for the sum of the table for further calculation | |
$sumOfOBSInstances++; | |
if( $AltTypeArr[$b] eq $altTypeFromTableArr) | |
{ | |
#checks if this gene line was alredy printed | |
if( $curGeneLine ne $GeneLine) | |
{ | |
$curGeneLine = $GeneLine; | |
$GeneCounter++; | |
print OUT $GeneLine."\n"; | |
if( defined $GenesByTableArr[$i]) | |
{ | |
$GenesByTableArr[$i] = $GenesByTableArr[$i]."\t".$geneID; | |
} | |
else | |
{ | |
$GenesByTableArr[$i] = $geneID; | |
} | |
$flag = 0; | |
#here we save the information about the the tables that the gene appears in and the information about the gene it self in a different hash | |
if(defined $HashByGeneId{$geneID}) | |
{ | |
@GeneValue = split(/\s/,$HashByGeneId{$geneID}); # splitting the line to the columns | |
$Len = @GeneValue; | |
for($c = 0; $c < $Len; $c++) | |
{ | |
if( $i == $GeneValue[$c]) | |
{ | |
$flag = 1; | |
} | |
} | |
if($flag == 0) | |
{ | |
$temp = $i + 1; | |
$HashByGeneId{$geneID} = $HashByGeneId{$geneID}." ".$temp; #saves the information about the tables that the gene appears | |
} | |
# saves the relevant information about the gene | |
$HashForGeneData{$geneID} = $HashForGeneData{$geneID}."\t".$AltTypeArr[$b]." "; | |
for($d = 0; $d < @allelsArrFromTableArr; $d++) | |
{ | |
$HashForGeneData{$geneID} = $HashForGeneData{$geneID}.$allelsArrFromTableArr[$d]." "; | |
} | |
$HashForGeneData{$geneID} = $HashForGeneData{$geneID}.$StartOrEndOfExonFromTableArr; | |
} | |
else | |
{ | |
$temp = $i + 1; | |
$HashByGeneId{$geneID} = $temp; #saves the information about the tables that the gene appears | |
# saves the relevant information about the gene | |
$HashForGeneData{$geneID} = $AltTypeArr[$b]." "; | |
for($d = 0; $d < @allelsArrFromTableArr; $d++) | |
{ | |
$HashForGeneData{$geneID} = $HashForGeneData{$geneID}.$allelsArrFromTableArr[$d]." "; | |
} | |
$HashForGeneData{$geneID} = $HashForGeneData{$geneID}.$StartOrEndOfExonFromTableArr; | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
$line = <IN>; | |
} | |
close(IN); | |
open(IN, "human_mrna_est_UCSC_filtered_by_SNP_with_allele_with_splicing_type") or die "Could not open human_mrna_est_UCSC_filtered_by_SNP_with_allele_with_splicing_type"; # UCSC table of alternative events - size: 91,223 | |
for($k = 0; $k < 9; $k++) | |
{ | |
$line = <IN>; | |
} | |
$GeneSizeArr[$i] = $GeneCounter; | |
print OUT "number of genes: ".$GeneCounter."\n"; | |
} | |
# sum up the total amount of all genes with repetition | |
$sum = 0; | |
for($t = 0; $t < @GeneSizeArr; $t++) | |
{ | |
$sum = $sum + $GeneSizeArr[$t]; | |
} | |
print OUT "total sum of all genes with repetition: ".$sum."\n"; | |
print OUT "Overlapping Genes\n"; | |
print OUT "\nGeneId\tNumberOfTables\tTheTables\tTheAltType+Allels\n"; | |
$counterOfOverLapGenes = 0; | |
#prints out the genes that appear in more then 1 table | |
foreach $gene ( keys %HashByGeneId) | |
{ | |
@GeneValue = split(/\s/,$HashByGeneId{$gene}); # splitting the line to the columns | |
$numberOfTables = @GeneValue; | |
if($numberOfTables > 1) | |
{ | |
$counterOfOverLapGenes++; | |
print OUT $gene."\t".$numberOfTables."\t".$HashByGeneId{$gene}."\t".$HashForGeneData{$gene}."\n"; | |
} | |
} | |
#prints out the total anount of gene per test and total amout of overlapping genes per test | |
print OUT "the number of overlapp genes: ".$counterOfOverLapGenes."\n"; | |
for( $i = 0; $i < $TableArrLen; $i++) | |
{ | |
$counter = 0; | |
#checks how many overlapping genes there are in the spcific case | |
@GenesPerTableArr = split(/\t/,$GenesByTableArr[$i]); | |
$genesPerTableArrLen = @GenesPerTableArr; | |
for($j = 0; $j < $genesPerTableArrLen; $j++) | |
{ | |
foreach $item (keys %HashByGeneId) | |
{ | |
@GeneValue = split(/\s/,$HashByGeneId{$item}); # splitting the line to the columns | |
$numberOfTables = @GeneValue; | |
if($numberOfTables > 1) | |
{ | |
if($GenesPerTableArr[$j] eq $item) | |
{ | |
$counter++; | |
} | |
} | |
} | |
} | |
print OUT $TableArr[$i]."\nTotal gene number is: ".$GeneSizeArr[$i]." the number of genes that appears in more the 1 table is: ".$counter."\n"; | |
} | |
close(OUT); | |
close(IN); | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment