Skip to content

Instantly share code, notes, and snippets.

@SophieDePaula
Last active June 10, 2017 18:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save SophieDePaula/6aaa86c01ddd0507b8c24e050ae6be62 to your computer and use it in GitHub Desktop.
Save SophieDePaula/6aaa86c01ddd0507b8c24e050ae6be62 to your computer and use it in GitHub Desktop.
FinalProject Bachelor degree - Purl
#Script #1 appendix A
#this script unites 3 tables from UCSC, in order to add the type of alternative #splicing and the location of exon start and exon end to each gene
#***********************- pseudo code -*************************
# 1. foreach row in table REFALT
# 1.1. extract location data of alt splicing variant
# 1.2. find according to this location the matching row in table ALTEV
# 1.2.1. if exists
# 1.2.1.1 print to TOUT the variant + the alt splicing type column from table ALTEV
# 2. foreach row in table TOUT
# 2.1 extract gene ID of alt splicing variant
# 2.2. find according to this gene ID the matching row in table refseq
# 2.2.1. print to OUT the variant + exons data columns from table refseq
#***************************************************************
open(REFALT, "RefSeq_Alt") or die "Could not open RefSeq_Alt"; # UCSC table of alternative splicing events and refseq genes - size: 30,289
open(ALTEV, "AltEvents") or die "Could not open AltEvents"; # UCSC table of alternative events - size: 91,223
open(REFSEQ, "refseq") or die "Could not open refseq"; # UCSC table of refseq genes - size: 41,748
open(LOG, ">LOG") or die "Could not open LOG"; # log file of program
open(TOUT, ">tempAltEventsWithType") or die "Could not open AltEventsWithType"; # temp output file
open(OUT, ">AltEventsWithType") or die "Could not open AltEventsWithType"; # output file
# print header of temp result table
print TOUT "chrom\trefseqGeneChromStart\trefseqGeneChromEnd\tAltEventsChromStart\tAltEventsChromEnd\tgeneID\trefseqcore\tstrand\trefseqTranscriptStart\trefseqTranscriptEnd\ttranscriptScore\tAltEventType\tnumberOfExons\n";
# column variables for each line in REFALT table
$chrom; # Reference sequence chromosome or scaffold
$chromStart; # Start position in chromosome
$chromEnd; # End position in chromosome
$geneID; # Name of gene (usually transcript_id from GTF)
$score; # AltEvents table Score, from 0-1000
$strand; # + or - for strand
$codingStart; # Transcription start position
$codingEnd; # Transcription end position
$transcriptScore; # Transcription score
$numberOfExons; # Number of exons
$line = <REFALT>;
print LOG "lines of RefSeq_Alt\n";
while (defined ($line)) # foreach row in table REFALT
{
chomp($line);
@LineArray = split(/\t/,$line); # splitting the line to the columns
$chrom = $LineArray[0];
$chromStart = $LineArray[1];
$chromEnd = $LineArray[2];
$geneID = $LineArray[3];
$score = $LineArray[4];
$strand = $LineArray[5];
$codingStart = $LineArray[6];
$codingEnd = $LineArray[7];
$transcriptScore = $LineArray[8];
$numberOfExons = $LineArray[9];
print LOG "chrom = " . $chrom . ";; ";
print LOG "chromStart = " . $chromStart . ";; ";
print LOG "chromEnd = " . $chromEnd . ";; ";
print LOG "geneID = " . $geneID . ";; ";
print LOG "score = " . $score . ";; ";
print LOG "strand = " . $strand . ";; ";
print LOG "codingStart = " . $codingStart . ";; ";
print LOG "codingEnd = " . $codingEnd . ";; ";
print LOG "transcriptScore = " . $transcriptScore . ";; ";
print LOG "numberOfExons = " . $numberOfExons . "\n";
# 1.1. extract location data of alt splicing variant
findAlternativeSplicingTypeByLocation($chrom, $chromStart, $chromEnd, $strand);
$line = <REFALT>; # take next line from table
}
close(TOUT);
open(TOUT, "tempAltEventsWithType") or die "Could not open AltEventsWithType"; # temp output file
# print header of result table
print OUT "chrom\trefseqGeneChromStart\trefseqGeneChromEnd\tAltEventsChromStart\tAltEventsChromEnd\tgeneID\trefseqcore\tstrand\trefseqcodingStart\trefseqcodingEnd\ttranscriptScore\tAltEventType\trefseqTranscriptStart\trefseqTranscriptEnd\trefseqExonCount\trefseqExonStarts\trefseqExonEnds\tsecondName\n";
%refseqMap; # map of exon information - key{$geneID} value{$cdsStart \t $cdsEnd \t $exonCount \t $exonStarts \t $exonEnds \t $secondName}
# column variables for each line in REFSEQ table
$bin; # Indexing field to speed chromosome range queries.
$geneID; # Name of gene (usually transcript_id from GTF)
$chrom; # Reference sequence chromosome or scaffold
$strand; # + or - for strand
$txStart; # Transcription start position
$txEnd; # Transcription end position
$cdsStart; # Coding region start
$cdsEnd; # Coding region end
$exonCount; # Number of exons
$exonStarts; # Exon start positions
$exonEnds; # Exon end positions
$score; # score
$secondName; # Alternate name (e.g. gene_id from GTF)
$line = <REFSEQ>; # jump over header in top of the table
$line = <REFSEQ>;
print LOG "lines of RefSeq into % refseqMap\n";
while (defined ($line)) # fill data in %refseqMap Map from REFSEQ table
{
chomp($line);
@LineArray = split(/\t/,$line); # splitting the line to the columns
$bin = $LineArray[0];
$geneID = $LineArray[1];
$chrom = $LineArray[2];
$strand = $LineArray[3];
$txStart = $LineArray[4];
$txEnd = $LineArray[5];
$cdsStart = $LineArray[6];
$cdsEnd = $LineArray[7];
$exonCount = $LineArray[8];
$exonStarts = $LineArray[9];
$exonEnds = $LineArray[10];
$score = $LineArray[11];
$secondName = $LineArray[12];
chomp($geneID);
$refseqMap{$geneID} = $txStart."\t".$txEnd."\t".$exonCount."\t".$exonStarts."\t".$exonEnds."\t".$secondName;
print LOG "refseqMap{".$geneID."} = ".$refseqMap{$geneID}."\n";
$line = <REFSEQ>; # take next line from table
}
# column variables for each line in TOUT table
$chrom; # Reference sequence chromosome or scaffold
$refseqChromStart; # Start position in chromosome according to refseq table
$refseqChromEnd; # End position in chromosome accodrding to refseq table
$altEvChromStart; # Start position in chromosome according to AltEvents table
$altEvChromEnd; # End position in chromosome accodrding to AltEvents table
$geneID; # Name of gene (usually transcript_id from GTF)
$score; # AltEvents table Score, from 0-1000
$strand; # + or - for strand
$transcriptStart; # Transcription start position
$transcriptEnd; # Transcription end position
$transcriptScore; # Transcription score
$altEventType; # type of alternative splicing event - more explantions in attached file 'Alternative Splicing type UCSC'
$numberOfExons; # Number of exons
$line = <TOUT>; # jump over header in top of the table
$line = <TOUT>;
# 2. foreach row in table TOUT
while (defined ($line))
{
chomp($line);
@LineArray = split(/\t/,$line); # splitting the line to the columns
$chrom = $LineArray[0];
$refseqChromStart = $LineArray[1];
$refseqChromEnd = $LineArray[2];
$altEvChromStart = $LineArray[3];
$altEvChromEnd = $LineArray[4];
$geneID = $LineArray[5];
$score = $LineArray[6];
$strand = $LineArray[7];
$codingStart = $LineArray[8];
$codingEnd = $LineArray[9];
$transcriptScore = $LineArray[10];
$altEventType = $LineArray[11];
$numberOfExons = $LineArray[12];
chomp($geneID); # 2.1 extract gene ID of alt splicing variant
# 2.2. find according to this gene ID the matching row in table refseq
if(defined($refseqMap{$geneID}))
{
# 2.2.1. print to OUT the variant + exons data columns from table refseq
print OUT $chrom."\t".$refseqChromStart."\t".$refseqChromEnd."\t".$altEvChromStart."\t".$altEvChromEnd."\t".$geneID."\t".$score."\t".$strand."\t".$codingStart."\t".$codingEnd."\t".$transcriptScore."\t".$altEventType."\t".$refseqMap{$geneID}."\n";
}
$line = <TOUT>; # take next line from table
}
close(REFALT);
close(ALTEV);
close(REFSEQ);
close(LOG);
close(OUT);
close(TOUT);
#findAlternativeSplicingTypeByLocation( $chrom, $chromStart, $chromEnd, $strand)
sub findAlternativeSplicingTypeByLocation
{
seek(ALTEV, 0, SEEK_SET); # move file cursor to start of file
my $i_chrom = $_[0]; # input parameter
my $i_chromStart = $_[1]; # input parameter
my $i_chromEnd = $_[2]; # input parameter
my $i_strand = $_[3]; # input parameter
my @i_LineArray;
# column variables for each line in ALTEV table
my $tempChrom;
my $tempChromStart;
my $tempChromEnd;
my $tempStrand;
my $tempAltType; # type of alternative splicing event - more explantions in attached file 'Alternative Splicing type UCSC'
print LOG "\nfindAlternativeSplicingTypeByLocation ( ".$i_chrom." , ".$i_chromStart." , ".$i_chromEnd." , ".$i_strand. " )\n\n";
my $i_line = <ALTEV>; # jump over header in top of the table
my $i_line = <ALTEV>;
while (defined ($i_line)) # 1.2. find according to input location parameters the matching row in table ALTEV
{
@i_LineArray = split(/\t/,$i_line); # splitting the line to the columns
$tempChrom = $i_LineArray[1];
$tempChromStart = $i_LineArray[2];
$tempChromEnd = $i_LineArray[3];
$tempStrand = $i_LineArray[6];
$tempAltType = $i_LineArray[4];
chomp($i_chrom);
chomp($tempChrom);
chomp($i_strand);
chomp($tempStrand);
if (($i_chrom eq $tempChrom) && ($i_strand eq $tempStrand))
{
if (($i_chromStart <= $tempChromStart) && ($i_chromEnd >= $tempChromEnd)) # 1.2.1. if exists
{
# 1.2.1.1 print to TOUT the variant + the alt splicing type column from table ALTEV
print TOUT $tempChrom."\t".$i_chromStart."\t".$i_chromEnd."\t".$tempChromStart."\t".$tempChromEnd."\t".$geneID."\t".$score."\t".$strand."\t".$codingStart."\t".$codingEnd."\t".$transcriptScore."\t".$tempAltType."\t".$numberOfExons."\n";
}
}
$i_line = <ALTEV>; # take next line from table
}
}
#Script #2 appendix B
#this script unites 2 tables and finds their intersection, in order to find all #the genes that contains synonymous SNPs.
#***********************- pseudo code -***************************************************************************************
# 1. foreach row in table AltEventsWithType
# 1.1. extract the chromosome number
# 1.2 extract the strand
# 1.3 extract the transcription start and end
# 1.4 extract the rest of the relevent data
# 1.5 put everything in a hashtable, the key = gene id, value = the rest of the data
#2. foreach row in table snp135
# 2.1 extract the relevent data.
# 2.2 call findSnpInAlternativeSplicingEvent function with the chromosome number, strand,SNP location,SNP Id and allels fields.
#
#findSnpInAlternativeSplicingEvent:
# 1.comper the chromosome number of the alternative splicing event and the snp
# 1.1 if it mach
# 1.1.1 comper the strand of the alternative splicing event and the snp.
# 1.1.1.1 if it mach
# 1.1.1.1.1 comper the start and end of the transcription area and the snp locatoin
# 1.1.1.1.1.1 if it mach
# 1.1.1.1.1.1.1 print to OUT the relevent data
#******************************************************************************************************************************
open(ALTEV, "AltEventsWithType") or die "Could not open RefSeq_Alt"; # The output from script 4 size: 171,884
open(SNP135, "snp135") or die "Could not open AltEvents"; # UCSC table of synonymous SNPs size: 38,301
open(LOG, ">LOG") or die "Could not open LOG"; # log file of program
open(OUT, ">SNP splicing joining") or die "Could not open AltEventsWithType"; # output file
open(MAP, ">maps") or die "Could not open AltEventsWithType";
open(TOUT, ">tempSnp") or die "Could not open tempSnp";
$chrom; # Reference sequence chromosome or scaffold
$codigSeqStart; # coding start position
$codigSeqEnd; # coding end position
$strand; # + or - for strand
$transcriptStart; # Transcription start position
$transcriptEnd; # Transcription end position
$geneID; # Name of gene (usually transcript_id from GTF)
$AltEvnetType; # alternative splicing type
$numberOfExons; # Number of exons
$ExonsStart; # start positions of exons
$ExonsEnd; # end positions of exons
$line = <ALTEV>; #skip the unrelevent row
$line = <ALTEV>;
while (defined ($line)) # foreach row in table aternative splicing event
{
chomp($line);
@LineArray = split(/\t/,$line); # splitting the line to the columns
$chrom = $LineArray[0];
$chromStart = $LineArray[1];
$chromEnd = $LineArray[2];
$geneId = $LineArray[5];
$strand = $LineArray[7];
$transcriptStart = $LineArray[12];
$transcriptEnd = $LineArray[13];
$AltEvnetType = $LineArray[11];
$codigSeqStart = $LineArray[8];
$codigSeqEnd = $LineArray[9];
$numberOfExons = $LineArray[14];
$ExonsStart = $LineArray[15];
$ExonsEnd = $LineArray[16];
print LOG "chrom = " . $chrom . ";; ";
print LOG "strand = " . $strand . ";; ";
print LOG "start coding = ".$codigSeqStart.";; ";
print LOG "end coding = ".$codigSeqEnd.";; ";
print LOG "geneID = " . $geneId . ";; ";
print LOG "alterantive splicing type = ". $AltEvnetType. ";;" ;
print LOG "score = " . $score . ";; ";
print LOG "transcriptStart = " . $transcriptStart . ";; ";
print LOG "transcriptEnd = " . $transcriptEnd . ";; ";
print LOG "numberOfExons = " . $numberOfExons . ";; ";
print LOG "start positions of exons = " . $ExonsStart . ";; ";
print LOG "end positions of exons = " . $ExonsEnd . "\n";
$AltEventsMap{$geneId} = $chrom."\t".$strand."\t".$codigSeqStart."\t".$codigSeqEnd."\t".$transcriptStart."\t".$transcriptEnd."\t".$geneId."\t".$numberOfExons."\t".$ExonsStart."\t".$ExonsEnd."\t".$AltEvnetType."\t".$chromStart."\t".$chromEnd;
$line = <ALTEV>; # take next line from table
}
$line = <SNP135>;
$line = <SNP135>; #skip unrelevent rows
$line = <SNP135>;
print OUT "chrom chromStart chromEnd strand geneId codigSeqStart codigSeqEnd transcriptStart transcriptEnd SNPlocation Allels SNPId alternativeSplicingType numberOfExons ExonsStart ExonsEnd\n";
while (defined ($line)) # foreach row in table SNP
{
chomp($line);
@LineArray2 = split(/\t/,$line); # splitting the line to the columns
$chrom = $LineArray2[1];
$SNPlocation = $LineArray2[2];
$SNPId = $LineArray2[4];
$strand = $LineArray2[6];
$Allels = $LineArray2[9];
print LOG "chrom = " . $chrom . ";; ";
print LOG "SNP location = ".$SNPlocation.";; ";
print LOG "SNP name = ".$SNPname.";; ";
print LOG "strand = ".$strand.";; ";
print LOG "Allels = ".$Allels."\n ";
findSnpInAlternativeSplicingEvent($chrom, $strand, $SNPlocation,$SNPId,$Allels);
$line = <SNP135>;
}
close(OUT);
#findAlternativeSplicingTypeByLocation($chrom, $strand, $SNPlocation,$SNPId,$Allels)
sub findSnpInAlternativeSplicingEvent
{
my $chrom = $_[0];
my $strand = $_[1];
my $SNPlocation = $_[2];
my $SNPId = $_[3];
my $Allels = $_[4];
foreach $geneId(%AltEventsMap)
{
$geneInfo = $AltEventsMap{$geneId};
chomp($geneInfo);
@AltArray = split(/\t/,$geneInfo); # splitting the line of every value in a line of the map
if($AltArray[0] eq $chrom)
{
if($AltArray[1] eq $strand)
{
if(($AltArray[2] <= $SNPlocation) && ($AltArray[3] >= $SNPlocation))
{
print TOUT $AltArray[0]." == ".$chrom." ".$AltArray[1]." == ".$strand." ".$AltArray[4]." <= ".$SNPlocation." && ".$AltArray[5].">= ".$SNPlocation."\n";
print OUT $AltArray[0]."\t".$AltArray[11]."\t".$AltArray[12]."\t".$AltArray[1]."\t".$AltArray[6]."\t".$AltArray[2]."\t".$AltArray[3]."\t".$AltArray[4]."\t".$AltArray[5]."\t".$SNPlocation."\t".$Allels."\t".$SNPId."\t".$AltArray[10]."\t".$AltArray[7]."\t".$AltArray[8]."\t".$AltArray[9]."\n";
}
}
}
}
}
#Script #3 appendix C
#this script finds all genes from the previous table that contains synonymous #SNPs at the first or last 50 nucleotides of exon
#***********************- pseudo code -*************************
# 1. foreach row in table SNPSPL
# 1.1. extract exon data of alt splicing variant and SNP location
# 1.2. foreach exon start and end location check if SNP is in the first 50 bp or last 50 bp of exon
# 1.2.1. if SNP is in range of some exon
# 1.2.1.1 print to OUT the variant + exon number that SNP is in the range of
#***************************************************************
# joining of UCSC tables that hold splicing variants with SNP within - size: 29,788
open(SNPSPL, "SNP splicing joining") or die "Could not open SNP splicing joining";
open(OUT, ">SNP splicing joining exon filtered") or die "Could not open SNP splicing joining exon filtered"; # output file
open(LOG, ">LOG") or die "Could not open LOG"; # log file of program
$line = <SNPSPL>; # get header of input file
$BASE_PAIR_OFFSET = 50; # base pair offset at exon start and end search SNP
# print header of result table
chop($line);
print OUT $line."\tSNPInExonNumber\n";
# column variables for each line in SNPSPL table
$chrom; # Reference sequence chromosome or scaffold
$chromStart; # chrom position start
$chromEnd; # chrom position end
$strand; # + or - for strand
$geneId; # Name of gene (usually transcript_id from GTF)
$codigSeqStart; # Coding region start
$codigSeqEnd; # Coding region end
$transcriptStart; # Transcription start position
$transcriptEnd; # Transcription end position
$SNPlocation; # SNP location
$Allels; # observed nucleutide allele
$SNPId; # unique SNP ID
$alternativeSplicingType; # type of alternative splicing event
$numberOfExons; # Number of exons
$ExonsStart; # Exon start positions
$ExonsEnd; # Exon end positions
$line = <SNPSPL>;
while (defined ($line)) # 1. foreach row in table SNPSPL
{
# 1.1. extract exon data of alt splicing variant and SNP location
chomp($line);
@LineArray = split(/\t/,$line); # splitting the line to the columns
$chrom = $LineArray[0];
$chromStart = $LineArray[1];
$chromEnd = $LineArray[2];
$strand = $LineArray[3];
$geneId = $LineArray[4];
$codigSeqStart = $LineArray[5];
$codigSeqEnd = $LineArray[6];
$transcriptStart = $LineArray[7];
$transcriptEnd = $LineArray[8];
$SNPlocation = $LineArray[9];
$Allels = $LineArray[10];
$SNPId = $LineArray[11];
$alternativeSplicingType = $LineArray[12];
$numberOfExons = $LineArray[13];
$ExonsStart = $LineArray[14];
$ExonsEnd = $LineArray[15];
chomp($numberOfExons);
chomp($SNPlocation);
# print LOG "numberOfExons = ".$numberOfExons."\n";
@ExonStartArray = split(/,/,$ExonsStart); # splitting the exon start positions
@ExonEndArray = split(/,/,$ExonsEnd); # splitting the exon end positions
# 1.2. foreach exon start and end location check if SNP is in the first 50 bp or last 50 bp of exon
for ($i = 0; $i < $numberOfExons; $i++)
{
chomp($ExonStartArray[$i]); # chomp current exon's start position
chomp($ExonEndArray[$i]); # chomp current exon's end position
$ExonStartPos = $ExonStartArray[$i];
$ExonEndPos = $ExonEndArray[$i];
# print LOG "SNPlocation = ".$SNPlocation."\n";
# print LOG "ExonStartPos = ".$ExonStartPos."\n";
# print LOG "ExonEndPos = ".$ExonEndPos."\n";
# 1.2.1. if SNP is in range of some exon
if ((($SNPlocation >= $ExonStartPos) && ($SNPlocation <= ($ExonStartPos + $BASE_PAIR_OFFSET))) ||
(($SNPlocation >= ($ExonEndPos - $BASE_PAIR_OFFSET)) && ($SNPlocation <= $ExonEndPos)))
{
# 1.2.1.1 print to OUT the variant + exon number that SNP is in the range of
print OUT $line."\t".($i+1)."\n";
}
}
$line = <SNPSPL>; # take next line from table
}
close(SNPSPL);
close(OUT);
close(LOG);
#Script #4 appendix D
#this script checks the frequency of each type of alternative splicing in 2 #tables – all genes that alternatively spliced, compard with all genes that #contains synonymous SNP at the start or the end of exon
#***********************- pseudo code -*************************
# 1. foreach row in table AltEventsWithType
# 1.1. take the alternative splicing type and put it in map as a key
# 1.2. increment the value of the key
# 2. foreach row in table SNP splicing joining
# 2.1 take the alternative splicing type and put it in map as a key
# 2.2. increment the value of the key
# 3. calculate the frequency of every type of alternative splicing for each map
# 3. print the maps and the fequencies to OUT in two separeted tables
#***************************************************************
open(ALTTYPE, "AltEventsWithType") or die "Could not open AltEventsWithType"; # table of alternative splicing events with types - size: 171,883
open(SNPTYPE, "SNP splicing joining") or die "Could not open "; # table of alternative splicing events with SNPs and types - size: 29,788
open(LOG, ">LOG") or die "Could not open LOG"; # log file of program
open(OUT, ">AltSplicingTypeFrequency") or die "Could not open AltSplicingTypeFrequency"; # output file
%typesFreq; # map of alternative splicing type (key) and number of apearences of the type in the first table (value)
$line = <ALTTYPE>; # first line contains the description of the table
$line = <ALTTYPE>; # takes the second line
while (defined ($line)) # foreach row in table ALTTYPE
{
chomp($line);
@LineArray = split(/\t/,$line); # splitting the line to the columns
$key = $LineArray[11]; # the alternative splicing type
if(defined $typesFreq{$key}) # if this type is already exists in the map
{
$typesFreq{$key}++; # increment its number of apearances
}
else
{
$typesFreq{$key} = 1; # set as new key and set its number of apearences to 1
}
$line = <ALTTYPE>;
}
%typesFreqWithSNP; # map of alternative splicing type (key) and number of apearences of the type in the second table (value)
$line = <SNPTYPE>; # first line contains the description of the table
$line = <SNPTYPE>; # takes the second line
while (defined ($line)) # foreach row in table SNPTYPE
{
chomp($line);
@LineArray = split(/\t/,$line); # splitting the line to the columns
$key = $LineArray[12]; # the alternative splicing type
if(defined $typesFreqWithSNP{$key}) # if this type is already exists in the map
{
$typesFreqWithSNP{$key}++; # increment its number of apearances
}
else
{
$typesFreqWithSNP{$key} = 1; # set as new key and set its number of apearences to 1
}
$line = <SNPTYPE>;
}
@keys = keys(%typesFreq);
$altFreqSum = 0;
foreach $key(@keys) # sum the total alternative splicing events of the first table
{
$altFreqSum += $typesFreq{$key};
}
print OUT "AltEventsWithType Frequency Table:\n";
print OUT "AltType\tNumberOfApearencesInTable\tTotalFrequency\n";
foreach $key(@keys) # calculate the frequency of every type of alternative splicing in the first table and print the data to OUT
{
$freq = $typesFreq{$key}/$altFreqSum;
print OUT $key . "\t";
print OUT $typesFreq{$key} . "\t";
print OUT $freq . "\n";
}
print OUT "\n";
@keys = keys(%typesFreqWithSNP);
$SNPFreqSum = 0;
foreach $key(@keys) # sum the total alternative splicing events of the second table
{
$SNPFreqSum += $typesFreqWithSNP{$key};
}
print OUT "SNP splicing joining Frequency Table:\n";
print OUT "AltTypeWithSNP\tNumberOfApearencesInTable\tTotalFrequency\n";
foreach $key(@keys) # calculate the frequency of every type of alternative splicing of the second table and print the data to OUT
{
$freq = $typesFreqWithSNP{$key}/$SNPFreqSum;
print OUT $key . "\t";
print OUT $typesFreqWithSNP{$key} . "\t";
print OUT $freq . "\n";
}
# Script #5 appendix E
#! /usr/bin/perl-w
#this script compute the needed table for a Chi Square test between no snp \snp #and no alternative splicing\alternative splicing
#***********************- pseudo code -*************************
# 1. get all refseq genes and hold them in HASH
# 2. get all refseq genes that have AS and hold them in HASH
# 3. get all refseq genes that have SNP - was computed by auxiliry script "findRefseqWithSNP.pl" - and hold in HASH
# 4. get all refseq genes that have AS and SNP and hold them in HASH
# 5. count all refseq genes that have AS and no SNP
# 6. count all refseq genes that have SNP and no AS
# 7. count all refseq genes that dont have SNP and dont have AS
#***************************************************************
# openning the required input and output files
open(AS_GENES, "RefSeq_Alt") or die "RefSeq_Alt";
open(ALL_GENES, "refseq") or die "refseq";
open(SNP_GENES, "refseq_genes_with_snp") or die "refseq_genes_with_snp"; # was computed by auxiliry script -findRefseqWithSNP.pl
open(OUT, ">instances_count_summary") or die "out";
# init data structures
%refseqGenes = (); # holds all refseq genes
$refseqGenesCounter = 0; #holds the number of refseq genes
%refseqGenesWithASAndSNP = (); # holds all refseq genes that have a SNP and AS
$refseqGenesWithASAndSNPCounter = 0; #holds the number of refseq genes that have a SNP and AS
%refseqGenesWithAS = (); # holds all refseq genes that have AS
$refseqGenesWithASCounter = 0; #holds the number of refseq genes that have AS
%refseqGenesWithASAndNoSNP = (); # holds all refseq genes that have AS but no SNP
$refseqGenesWithASAndNoSNPCounter = 0; #holds the number of refseq genes that have AS but no SNP
%refseqGenesWithSNP = (); # holds all refseq genes that have SNP
$refseqGenesWithSNPCounter = 0; #holds the number of refseq genes that have SNP
%refseqGenesWithSNPAndNoAS = (); # holds all refseq genes that have SNP but no AS
$refseqGenesWithSNPAndNoASCounter = 0; #holds the number of refseq genes that have SNP but no AS
%refseqGenesWithNoSNPAndNoAS = (); # holds all refseq genes that have no SNP and no AS
$refseqGenesWithNoSNPAndNoASCounter = 0; #holds the number of refseq genes that have no SNP and no AS
print OUT "***********************- instances count summary -*************************\n\n";
# get all refseq genes
$line = <ALL_GENES>;
$line = <ALL_GENES>;
while (defined $line)
{
chomp($line);
@LineArray = split(/\t/,$line);
$geneID = $LineArray[1]; # get gene ID
if (defined $refseqGenes{$geneID})
{
}
else
{
$refseqGenes{$geneID} = 1;
$refseqGenesCounter++;
}
$line = <ALL_GENES>;
}
print OUT "All refseq genes - " . $refseqGenesCounter . "\n\n";
# get all refseq genes that have AS
$line = <AS_GENES>;
while (defined $line)
{
chomp($line);
@LineArray = split(/\t/,$line);
$geneID = $LineArray[3]; # get gene ID
if (defined $refseqGenesWithAS{$geneID})
{
}
else
{
$refseqGenesWithAS{$geneID} = 1;
$refseqGenesWithASCounter++;
}
$line = <AS_GENES>;
}
# get all refseq genes that have SNP - was computed by auxiliry script "findRefseqWithSNP.pl"
$line = <SNP_GENES>;
$line = <SNP_GENES>;
while (defined $line)
{
chomp($line);
@LineArray = split(/\t/,$line);
$geneID = $LineArray[4]; # get gene ID
if (defined $refseqGenesWithSNP{$geneID})
{
}
else
{
$refseqGenesWithSNP{$geneID} = 1;
$refseqGenesWithSNPCounter++;
}
$line = <SNP_GENES>;
}
# get all refseq genes that have AS and SNP
foreach $geneID(%refseqGenes)
{
if ($refseqGenes{$geneID} == 1)
{
if (defined $refseqGenesWithAS{$geneID}) # has a AS
{
if (defined $refseqGenesWithSNP{$geneID}) # has a SNP and AS
{
$refseqGenesWithASAndSNP{$geneID} = 1;
$refseqGenesWithASAndSNPCounter++;
}
else # no SNP
{
}
}
else # no AS
{
}
}
}
print OUT "instances with Alternative splicing and SNP - " . $refseqGenesWithASAndSNPCounter . "\n";
# get all refseq genes that have AS and no SNP
foreach $geneID(%refseqGenesWithAS)
{
if ($refseqGenesWithAS{$geneID} == 1)
{
if (defined $refseqGenesWithSNP{$geneID}) # has a SNP
{
}
else # no SNP
{
$refseqGenesWithASAndNoSNP{$geneID} = 1;
$refseqGenesWithASAndNoSNPCounter++;
}
}
}
print OUT "instances with Alternative splicing and without SNP - " . $refseqGenesWithASAndNoSNPCounter . "\n";
# get all refseq genes that have SNP and no AS
foreach $geneID(%refseqGenesWithSNP)
{
if ($refseqGenesWithSNP{$geneID} == 1)
{
if (defined $refseqGenesWithAS{$geneID}) # has a AS
{
}
else # no AS
{
$refseqGenesWithSNPAndNoAS{$geneID} = 1;
$refseqGenesWithSNPAndNoASCounter++;
}
}
}
print OUT "instances without Alternative splicing and with SNP - " . $refseqGenesWithSNPAndNoASCounter . "\n";
# get all refseq genes that dont have SNP and dont have AS
foreach $geneID(%refseqGenes)
{
if ($refseqGenes{$geneID} == 1)
{
if (defined $refseqGenesWithAS{$geneID}) # has a AS
{
}
else # no AS
{
if (defined $refseqGenesWithSNP{$geneID}) # has a SNP
{
}
else # no SNP and no AS
{
$refseqGenesWithNoSNPAndNoAS{$geneID} = 1;
$refseqGenesWithNoSNPAndNoASCounter++;
}
}
}
}
print OUT "instances without Alternative splicing and without SNP - " . $refseqGenesWithNoSNPAndNoASCounter . "\n";
close(SNPAS_GENES);
close(AS_GENES);
close(ALL_GENES);
close(SNP_GENES);
close(OUT);
# Heper Script regarding script #5 appendix F
#! /usr/bin/perl-w
# This script finds all the refseq genes which have a SNP inside them
#***********************- pseudo code -***************************************************************************************
# 1. foreach row in table refseq table
# 1.1. extract the chromosome number
# 1.2 extract the strand
# 1.3 extract the transcription start and end
# 1.4 extract the rest of the relevent data
# 1.5 put everything in a hashtable, the key = gene id, value = the rest of the data
#2. foreach row in table snp135
# 2.1 extract the relevent data.
# 2.2 call findSnpInAlternativeSplicingEvent function with the chromosome number, strand,SNP location,SNP Id and allels fields.
#
#findSnpInAlternativeSplicingEvent:
# 1.comper the chromosome number of the alternative splicing event and the snp
# 1.1 if it mach
# 1.1.1 comper the strand of the alternative splicing event and the snp.
# 1.1.1.1 if it mach
# 1.1.1.1.1 comper the start and end of the transcription area and the snp locatoin
# 1.1.1.1.1.1 if it mach
# 1.1.1.1.1.1.1 print to OUT the relevent data
#******************************************************************************************************************************
open(REFSEQ, "refseq") or die "Could not open RefSeq_Alt";
open(SNP135, "snp135") or die "Could not open AltEvents"; # UCSC table of synonymous SNPs size: 38,301
open(OUT, ">refseq_genes_with_snp") or die "Could not open AltEventsWithType"; # output file
$chrom; # Reference sequence chromosome or scaffold
$codigSeqStart; # coding start position
$codigSeqEnd; # coding end position
$strand; # + or - for strand
$transcriptStart; # Transcription start position
$transcriptEnd; # Transcription end position
$geneID; # Name of gene (usually transcript_id from GTF)
$AltEvnetType; # alternative splicing type
$numberOfExons; # Number of exons
$ExonsStart; # start positions of exons
$ExonsEnd; # end positions of exons
$line = <REFSEQ>; #skip the unrelevent row
$line = <REFSEQ>;
while (defined ($line)) # foreach row in table aternative splicing event
{
chomp($line);
@LineArray = split(/\t/,$line); # splitting the line to the columns
$chrom = $LineArray[2];
$geneId = $LineArray[1];
$strand = $LineArray[3];
$transcriptStart = $LineArray[4];
$transcriptEnd = $LineArray[5];
$codigSeqStart = $LineArray[6];
$codigSeqEnd = $LineArray[7];
$numberOfExons = $LineArray[8];
$ExonsStart = $LineArray[9];
$ExonsEnd = $LineArray[10];
$AltEventsMap{$geneId} = $chrom."\t".$strand."\t".$codigSeqStart."\t".$codigSeqEnd."\t".$transcriptStart."\t".$transcriptEnd."\t".$geneId."\t".$numberOfExons."\t".$ExonsStart."\t".$ExonsEnd."\t".$AltEvnetType;
$line = <REFSEQ>; # take next line from table
}
$line = <SNP135>;
$line = <SNP135>; #skip unrelevent rows
$line = <SNP135>;
print OUT "chrom strand geneId codigSeqStart codigSeqEnd transcriptStart transcriptEnd SNPlocation Allels SNPId numberOfExons ExonsStart ExonsEnd\n";
while (defined ($line)) # foreach row in table SNP
{
chomp($line);
@LineArray2 = split(/\t/,$line); # splitting the line to the columns
$chrom = $LineArray2[1];
$SNPlocation = $LineArray2[2];
$SNPId = $LineArray2[4];
$strand = $LineArray2[6];
$Allels = $LineArray2[9];
findSnpInAlternativeSplicingEvent($chrom, $strand, $SNPlocation,$SNPId,$Allels);
$line = <SNP135>;
}
close(OUT);
#findAlternativeSplicingTypeByLocation($chrom, $strand, $SNPlocation,$SNPId,$Allels)
sub findSnpInAlternativeSplicingEvent
{
my $chrom = $_[0];
my $strand = $_[1];
my $SNPlocation = $_[2];
my $SNPId = $_[3];
my $Allels = $_[4];
foreach $geneId(%AltEventsMap)
{
$geneInfo = $AltEventsMap{$geneId};
chomp($geneInfo);
@AltArray = split(/\t/,$geneInfo); # splitting the line of every value in a line of the map
if($AltArray[0] eq $chrom)
{
if($AltArray[1] eq $strand)
{
if(($AltArray[2] <= $SNPlocation) && ($AltArray[3] >= $SNPlocation))
{
print OUT $AltArray[0]."\t".$AltArray[11]."\t".$AltArray[12]."\t".$AltArray[1]."\t".$AltArray[6]."\t".$AltArray[2]."\t".$AltArray[3]."\t".$AltArray[4]."\t".$AltArray[5]."\t".$SNPlocation."\t".$Allels."\t".$SNPId."\t".$AltArray[10]."\t".$AltArray[7]."\t".$AltArray[8]."\t".$AltArray[9]."\n";
}
}
}
}
}
# appendix G
#! /usr/bin/perl-w
#this script compute the needed table for a Chi Square test between end/start of #exon and no alternative splicing\alternative splicing
open(AS, "SNP splicing joining exon filtered") or die "SNP splicing joining exon filtered";
open(SNP, "refseq_genes_with_snp") or die "Could not open RefSeq_Alt";
open(ALT, "RefSeq_Alt") or die "Could not open RefSeq_Alt";
open(OUT,"> test3") or die "could not open test3";
@res;
for($i = 0; $i < 4; $i++)
{
$res[$i] = 0;
}
$line = <AS>;
$line = <AS>;
while(defined $line)
{
@geneArr = split(/\t/,$line);
$SNPlocation = $geneArr[9];
$exon = $geneArr[16];
$exonsStart = $geneArr[14];
$exonsEnd = $geneArr[15];
$NumberOfExons = $geneArr[13];
@exonsStart = split(/\s/,$exonsStart);
@exonsEnd = split(/\s/,$exonsEnd);
if( $SNPlocation - $exonsStart[$exon-1] <= 50) # increasing the sum of alternative splising at the begining of the exon
{
$res[0]++;
}
elsif($exonsEnd[$exon-1] - $SNPlocation <= 50) # increasing the sum of alternative splising at the end of the exon
{
$res[2]++;
}
$line = <AS>;
}
close(AS);
$SNPline = <SNP>;
$SNPline = <SNP>;
$ALTline = <ALT>;
@snpArr = split(/\s/,$SNPline);
@altArr = split(/\s/,$ALTline);
$SNPgeneID = $snpArr[4];
$ALTgeneID = $altArr[3];
$flag = 1;
while(defined $SNPline)
{
while( defined $ALTline && $flag != 0)
{
if($SNPgeneID eq $ALTgeneID)
{
$flag = 0;
}
$ALTline = <ALT>;
@altArr = split(/\s/,$ALTline);
$ALTgeneID = $altArr[3];
}
close(ALT);
open(ALT, "RefSeq_Alt") or die "Could not open RefSeq_Alt";
if($flag)
{
$NumberOfExons = $snpArr[13];
$exonsStart = $snpArr[14];
@exonsStart = split(/\s/,$exonsStart);
$exonsEnd = $snpArr[15];
@exonsEnd = split(/\s/,$exonsEnd);
$SNPlocation = $snpArr[9];
for($i = 0; $i < $NumberOfExons; $i++)
{
if( $SNPlocation - $exonsStart[$i] <= 50 && $SNPlocation <= $exonsEnd[$i])
{
$res[1]++;#increasing the sum of not alternative splised at the begining of the exon
}
}
for($i = 0; $i < $NumberOfExons; $i++)
{
if( $exonsEnd[$i] - $SNPlocation <= 50 && $exonsEnd[$i] - $SNPlocation >= 0 )
{
$res[3]++; #increasing the sum of not alternative splised at the end of the exon
}
}
}
$SNPline = <SNP>;
@snpArr = split(/\s/,$SNPline);
$SNPgeneID = $snpArr[4];
$flag = 1;
}
print OUT "\t\tAlternative splicing\tno Alternative splicing\n";
print OUT "Begining\t".$res[0]."\t".$res[1]."\n";
print OUT "End\t".$res[2]."\t".$res[3]."\n";
#script #6 appendix H
#! /usr/bin/perl-w
#***********************- pseudo code -*************************
# 1. foreach row in table REFSEQ
# 1.1 foreach row in ALTEV
# 1.1.1 if an alternative splicing event if found within current gene then they are printed to OUT file
#***************************************************************
open(ALTEV, "AltEvents") or die "Could not open AltEvents"; # UCSC table of alternative events - size: 91,223
open(REFSEQ, "refseq") or die "Could not open refseq"; # UCSC table of refseq genes - size: 41,748
open(LOG, ">LOG") or die "Could not open LOG"; # log file of program
open(OUT, ">AltEventsWithTypeNew") or die "Could not open AltEventsWithTypeNew"; # output file
# print header of result table
print OUT "chrom\trefseqGeneChromStart\trefseqGeneChromEnd\tEmptyPlaceHolder\tEmptyPlaceHolder\tgeneID\trefseqcore\tstrand\trefseqcodingStart\trefseqcodingEnd\ttranscriptScore\tEmptyPlaceHolder\trefseqTranscriptStart\trefseqTranscriptEnd\trefseqExonCount\trefseqExonStarts\trefseqExonEnds\tsecondName\n";
print OUT "!\taltEventChromStart\taltEventChromEnd\taltEventType\n";
# column variables for each line in REFSEQ table
$bin; # Indexing field to speed chromosome range queries.
$geneID; # Name of gene (usually transcript_id from GTF)
$chrom; # Reference sequence chromosome or scaffold
$strand; # + or - for strand
$txStart; # Transcription start position
$txEnd; # Transcription end position
$cdsStart; # Coding region start
$cdsEnd; # Coding region end
$exonCount; # Number of exons
$exonStarts; # Exon start positions
$exonEnds; # Exon end positions
$score; # score
$secondName; # Alternate name (e.g. gene_id from GTF)
$line = <REFSEQ>; # jump over header in top of the table
$line = <REFSEQ>;
while (defined ($line)) # fill data in %refseqMap Map from REFSEQ table
{
chomp($line);
@LineArray = split(/\t/,$line); # splitting the line to the columns
$bin = $LineArray[0];
$geneID = $LineArray[1];
$chrom = $LineArray[2];
$strand = $LineArray[3];
$txStart = $LineArray[4];
$txEnd = $LineArray[5];
$cdsStart = $LineArray[6];
$cdsEnd = $LineArray[7];
$exonCount = $LineArray[8];
$exonStarts = $LineArray[9];
$exonEnds = $LineArray[10];
$score = $LineArray[11];
$secondName = $LineArray[12];
chomp($geneID);
chomp($chrom);
chomp($strand);
# call helper function that finds all alternative splicing events in current gene and print to outpur file
findAlternativeSplicingTypeByLocation($chrom, $txStart, $txEnd, $strand);
$line = <REFSEQ>; # take next line from table
}
close(ALTEV);
close(REFSEQ);
close(LOG);
close(OUT);
#findAlternativeSplicingTypeByLocation( $chrom, $chromStart, $chromEnd, $strand)
sub findAlternativeSplicingTypeByLocation
{
seek(ALTEV, 0, SEEK_SET); # move file cursor to start of file
my $i_chrom = $_[0]; # input parameter
my $i_chromStart = $_[1]; # input parameter
my $i_chromEnd = $_[2]; # input parameter
my $i_strand = $_[3]; # input parameter
my @i_LineArray;
# column variables for each line in ALTEV table
my $tempChrom;
my $tempChromStart;
my $tempChromEnd;
my $tempStrand;
my $tempAltType; # type of alternative splicing event - more explantions in attached file 'Alternative Splicing type UCSC'
my $firstMatchFound = 1;
print LOG "\nfindAlternativeSplicingTypeByLocation ( ".$i_chrom." , ".$i_chromStart." , ".$i_chromEnd." , ".$i_strand. " )\n\n";
my $i_line = <ALTEV>; # jump over header in top of the table
my $i_line = <ALTEV>;
while (defined ($i_line)) # 1.2. find according to input location parameters the matching alternative splicing events
{
@i_LineArray = split(/\t/,$i_line); # splitting the line to the columns
$tempChrom = $i_LineArray[1];
$tempChromStart = $i_LineArray[2];
$tempChromEnd = $i_LineArray[3];
$tempStrand = $i_LineArray[6];
$tempAltType = $i_LineArray[4];
chomp($tempChrom);
chomp($tempStrand);
if (($i_chrom eq $tempChrom) && ($i_strand eq $tempStrand))
{
if (($i_chromStart <= $tempChromStart) && ($i_chromEnd >= $tempChromEnd)) # if alternative splicing event whithin gene
{
if ($firstMatchFound == 1) # print gene information to output file only if match was found
{
$firstMatchFound = 0;
# print to OUT the refaeq gene
print OUT $chrom."\t".$txStart."\t".$txEnd."\t"."emptyPlaceHolder"."\t"."emptyPlaceHolder"."\t".$geneID."\t".$score."\t".$strand."\t".$cdsStart."\t".$cdsEnd."\t".$score."\t"."emptyPlaceHolder"."\t".$txStart."\t".$txEnd."\t".$exonCount."\t".$exonStarts."\t".$exonEnds."\t".$secondName."\n";
}
# print alternative splicing event information
print OUT "!\t".$tempChromStart."\t".$tempChromEnd."\t".$tempAltType."\n";
}
}
$i_line = <ALTEV>; # take next line from table
}
}
#script #7 appendix I
#! /usr/bin/perl-w
#***********************- Description -***************************************************
# This script finds per each gene the SNP's that are contained within it.
# in addition per each match this script finds the relevant alternative splicing events
# by overlapping exon and alterantive splicing event.
#*****************************************************************************************
open(ALTEV, "AltEventsWithTypeNew") or die "Could not open RefSeq_Alt"; # The output from script 1 size: 171,884
open(SNP135, "snp135") or die "Could not open AltEvents"; # UCSC table of synonymous SNPs size: 38,301
open(LOG, ">LOG") or die "Could not open LOG"; # log file of program
open(OUT, ">SNP splicing joining new") or die "Could not open AltEventsWithType"; # output file
$chrom; # Reference sequence chromosome or scaffold
$codigSeqStart; # coding start position
$codigSeqEnd; # coding end position
$strand; # + or - for strand
$transcriptStart; # Transcription start position
$transcriptEnd; # Transcription end position
$geneID; # Name of gene (usually transcript_id from GTF)
$AltEvnetType; # alternative splicing type
$numberOfExons; # Number of exons
$ExonsStart; # start positions of exons
$ExonsEnd; # end positions of exons
$line = <ALTEV>; #skip the unrelevent row
$line = <ALTEV>;
while (defined ($line)) # foreach row in table aternative splicing event
{
chomp($line);
if ($line =~ /^!/) # if current line holds alternative splicing information - add it to hash
{ # hold alternative splicing data in HASH with geneID as key
if (defined $AltEventsInformationMap{$geneId})
{
$AltEventsInformationMap{$geneId} = $AltEventsInformationMap{$geneId} . $line . "\n";
}
else
{
$AltEventsInformationMap{$geneId} = $line . "\n";
}
}
else # if current line holds gene information - call helper function
{
@LineArray = split(/\t/,$line); # splitting the line to the columns
$chrom = $LineArray[0];
$chromStart = $LineArray[1];
$chromEnd = $LineArray[2];
$geneId = $LineArray[5];
$strand = $LineArray[7];
$transcriptStart = $LineArray[12];
$transcriptEnd = $LineArray[13];
$AltEvnetType = $LineArray[11];
$codigSeqStart = $LineArray[8];
$codigSeqEnd = $LineArray[9];
$numberOfExons = $LineArray[14];
$ExonsStart = $LineArray[15];
$ExonsEnd = $LineArray[16];
# hold gene data in HASH with geneID as key
$RefaeqGenesMap{$geneId} = $chrom."\t".$strand."\t".$codigSeqStart."\t".$codigSeqEnd."\t".$transcriptStart."\t".$transcriptEnd."\t".$geneId."\t".$numberOfExons."\t".$ExonsStart."\t".$ExonsEnd."\t".$AltEvnetType."\t".$chromStart."\t".$chromEnd;
}
$line = <ALTEV>; # take next line from table
}
$line = <SNP135>;
$line = <SNP135>; #skip unrelevent rows
$line = <SNP135>;
# print output file headers
print OUT "chrom chromStart chromEnd strand geneId codigSeqStart codigSeqEnd transcriptStart transcriptEnd SNPlocation Allels SNPId alternativeSplicingType numberOfExons ExonsStart ExonsEnd\n";
print OUT "!\taltEventChromStart\taltEventChromEnd\taltEventType\n";
while (defined ($line)) # foreach row in table SNP
{
chomp($line);
@LineArray2 = split(/\t/,$line); # splitting the line to the columns
$chrom = $LineArray2[1];
$SNPlocation = $LineArray2[2];
$SNPId = $LineArray2[4];
$strand = $LineArray2[6];
$Allels = $LineArray2[9];
# call helper function to find the genes that are relevant to current SNP
findSnpInAlternativeSplicingEvent($chrom, $strand, $SNPlocation,$SNPId,$Allels);
$line = <SNP135>;
}
close(OUT);
#findAlternativeSplicingTypeByLocation($chrom, $strand, $SNPlocation,$SNPId,$Allels)
sub findSnpInAlternativeSplicingEvent
{
my $chrom = $_[0];
my $strand = $_[1];
my $SNPlocation = $_[2];
my $SNPId = $_[3];
my $Allels = $_[4];
foreach $geneId(%RefaeqGenesMap) # foreach gene
{
$geneInfo = $RefaeqGenesMap{$geneId};
chomp($geneInfo);
@AltArray = split(/\t/,$geneInfo); # splitting the line of every value in a line of the map
if($AltArray[0] eq $chrom)
{
if($AltArray[1] eq $strand)
{
if(($AltArray[2] <= $SNPlocation) && ($AltArray[3] >= $SNPlocation)) # if snp is within the gene
{
print OUT $AltArray[0]."\t".$AltArray[11]."\t".$AltArray[12]."\t".$AltArray[1]."\t".$AltArray[6]."\t".$AltArray[2]."\t".$AltArray[3]."\t".$AltArray[4]."\t".$AltArray[5]."\t".$SNPlocation."\t".$Allels."\t".$SNPId."\t".$AltArray[10]."\t".$AltArray[7]."\t".$AltArray[8]."\t".$AltArray[9]."\n";
# find the relevant alterantive splicing events of current gene + SNP match
findGeneAlternativeSplicingEventsRelatedToSNP($geneId, $SNPlocation);
}
}
}
}
}
#findGeneAlternativeSplicingEventsRelatedToSNP($geneId, $SNPlocation)
sub findGeneAlternativeSplicingEventsRelatedToSNP
{
my $geneID = $_[0];
my $SNPlocation = $_[1];
my $altEventsInfo = $AltEventsInformationMap{$geneId};
my $geneInfo = $RefaeqGenesMap{$geneId};
my @geneLineArray = split(/\t/,$geneInfo); # splitting the line to the columns
my @altEventsArray = split(/\n/,$altEventsInfo);
my $altEventsArraySize = @altEventsArray;
my $numberOfExons = $geneLineArray[7];
my $ExonsStart = $geneLineArray[8];
my $ExonsEnd = $geneLineArray[9];
my @ExonStartArray = split(/,/,$ExonsStart); # splitting the exon start positions
my @ExonEndArray = split(/,/,$ExonsEnd); # splitting the exon end positions
# foreach exon start and end location check if SNP is in the first 50 bp or last 50 bp of exon
for (my $i = 0; $i < $numberOfExons; $i++)
{
chomp($ExonStartArray[$i]); # chomp current exon's start position
chomp($ExonEndArray[$i]); # chomp current exon's end position
my $ExonStartPos = $ExonStartArray[$i];
my $ExonEndPos = $ExonEndArray[$i];
# if SNP is in range of some exon
if (($SNPlocation >= $ExonStartPos) && ($SNPlocation <= $ExonEndPos))
{
#foreach alternative splicing event - print to output the relevant alternative splicing events of gene and SNP
for (my $j = 0; $j < $altEventsArraySize; $j++)
{
my $altEvent = @altEventsArray[$j];
my @altEventArray = split(/\t/,$altEvent);
my $altEventChromStart = $altEventArray[1];
my $altEventChromEnd = $altEventArray[2];
# if snp is related to alternative splicing then print it to output
if (($altEventChromStart <= $ExonStartPos) && ($ExonEndPos <= $altEventChromEnd))
{
print OUT $altEvent . "\n";
}
elsif (($altEventChromStart >= $ExonStartPos) && ($ExonEndPos >= $altEventChromEnd))
{
print OUT $altEvent . "\n";
}
elsif (($altEventChromStart >= $ExonStartPos) && ($ExonStartPos <= $altEventChromEnd))
{
print OUT $altEvent . "\n";
}
elsif (($altEventChromStart >= $ExonEndPos) && ($ExonEndPos <= $altEventChromEnd))
{
print OUT $altEvent . "\n";
}
else
{
#nothing
}
}
}
}
}
#script #8 appendix J
#! /usr/bin/perl-w
#***********************- Description -*******************************************************
# This script filters the genes by SNPs that are in the first or last 3 positions in an exon.
#*********************************************************************************************
# joining of UCSC tables that hold splicing variants with SNP within - size: 29,788
open(SNPSPL, "SNP splicing joining new") or die "Could not open SNP splicing joining";
open(OUT, ">SNP splicing joining exon filtered new") or die "Could not open SNP splicing joining exon filtered"; # output file
open(LOG, ">LOG") or die "Could not open LOG"; # log file of program
$line = <SNPSPL>; # get header of input file
$BASE_PAIR_OFFSET = 3; # base pair offset at exon start and end search SNP
# print header of result table
chop($line);
print OUT $line."\tSNPInExonNumber\tSNPoffsetInExon\tSNPStartOrEndExon\n";
print OUT "!\taltEventChromStart\taltEventChromEnd\taltEventType\n";
# column variables for each line in SNPSPL table
$chrom; # Reference sequence chromosome or scaffold
$chromStart; # chrom position start
$chromEnd; # chrom position end
$strand; # + or - for strand
$geneId; # Name of gene (usually transcript_id from GTF)
$codigSeqStart; # Coding region start
$codigSeqEnd; # Coding region end
$transcriptStart; # Transcription start position
$transcriptEnd; # Transcription end position
$SNPlocation; # SNP location
$Allels; # observed nucleutide allele
$SNPId; # unique SNP ID
$alternativeSplicingType; # type of alternative splicing event
$numberOfExons; # Number of exons
$ExonsStart; # Exon start positions
$ExonsEnd; # Exon end positions
$line = <SNPSPL>;
$line = <SNPSPL>;
$isInFirstLastThreeNuc = 0;
while (defined ($line)) # 1. foreach row in table SNPSPL
{
if( $line =~ /^chr/)
{
$isInFirstLastThreeNuc = 0;
# 1.1. extract exon data of alt splicing variant and SNP location
chomp($line);
@LineArray = split(/\t/,$line); # splitting the line to the columns
$chrom = $LineArray[0];
$chromStart = $LineArray[1];
$chromEnd = $LineArray[2];
$strand = $LineArray[3];
$geneId = $LineArray[4];
$codigSeqStart = $LineArray[5];
$codigSeqEnd = $LineArray[6];
$transcriptStart = $LineArray[7];
$transcriptEnd = $LineArray[8];
$SNPlocation = $LineArray[9];
$Allels = $LineArray[10];
$SNPId = $LineArray[11];
$alternativeSplicingType = $LineArray[12];
$numberOfExons = $LineArray[13];
$ExonsStart = $LineArray[14];
$ExonsEnd = $LineArray[15];
chomp($numberOfExons);
chomp($SNPlocation);
$StartOrEnd;
# print LOG "numberOfExons = ".$numberOfExons."\n";
@ExonStartArray = split(/,/,$ExonsStart); # splitting the exon start positions
@ExonEndArray = split(/,/,$ExonsEnd); # splitting the exon end positions
# 1.2. foreach exon start and end location check if SNP is in the first 50 bp or last 50 bp of exon
for ($i = 0; $i < $numberOfExons; $i++)
{
chomp($ExonStartArray[$i]); # chomp current exon's start position
chomp($ExonEndArray[$i]); # chomp current exon's end position
$ExonStartPos = $ExonStartArray[$i];
$ExonEndPos = $ExonEndArray[$i];
# print LOG "SNPlocation = ".$SNPlocation."\n";
# print LOG "ExonStartPos = ".$ExonStartPos."\n";
# print LOG "ExonEndPos = ".$ExonEndPos."\n";
# 1.2.1. if SNP is in range of some exon
if (($SNPlocation >= $ExonStartPos) && ($SNPlocation <= ($ExonStartPos + $BASE_PAIR_OFFSET)))
{
# 1.2.1.1 print to OUT the variant + exon number that SNP is in the range of
$isInFirstLastThreeNuc = 1;
$SNPoffset = $SNPlocation - $ExonStartPos;
# if the strand is from 3' to 5' or the other way around
if( $strand eq "+")
{
$SNPoffset = "+".$SNPoffset;
$StartOrEnd = "StartOfExon";
}
else
{
$SNPoffset = "-".$SNPoffset;
$StartOrEnd = "EndOfExon";
}
print OUT $line."\t".($i+1)."\t".$SNPoffset."\t".$StartOrEnd."\n";
}
elsif (($SNPlocation >= ($ExonEndPos - $BASE_PAIR_OFFSET)) && ($SNPlocation <= $ExonEndPos))
{
$isInFirstLastThreeNuc = 1;
$SNPoffset = $ExonEndPos - $SNPlocation;
# if the strand is from 3' to 5' or the other way around
if( $strand eq "+")
{
$SNPoffset = "-".$SNPoffset;
$StartOrEnd = "EndOfExon";
}
else
{
$SNPoffset = "+".$SNPoffset;
$StartOrEnd = "StartOfExon";
}
print OUT $line."\t".($i+1)."\t".$SNPoffset."\t".$StartOrEnd."\n";
}
}
}
# prints the lines that represent alternative splaicing event if there is a SNP in the first or last 3 Nucleotides
else
{
if($isInFirstLastThreeNuc == 1)
{
print OUT $line;
}
}
$line = <SNPSPL>; # take next line from table
}
close(SNPSPL);
close(OUT);
close(LOG);
#script #9 appendix K
#! /usr/bin/perl-w
#***********************- Description -*******************************************************
#this script compute the first test by creating 132 tables that combains alernative splicing
#types (6), allel variant (11) and location of the snp ( start or end of exon).
#*********************************************************************************************
open(IN, "SNP splicing joining exon filtered new") or die "Could not open SNP splicing joining exon filtered new";
open(OUT, ">alt_typ_allel_res") or die "Could not openalt_typ_allel_res"; # output file
open(YES, ">alt_typ_allel_res_yes") or die "Could not openalt_typ_allel_res"; # output file
$line = <IN>;
$line = <IN>;
$Allels; # observed nucleutide allele
$alternativeSplicingType; # type of alternative splicing event
$SNPStartOrEndExon; # location of the SNP beginning or end of the exon
$counter = 0;
# construct chi square table with p.v. = 0.05
%ChiSquareValue = (); # key-d.o.f value-chi square value
$ChiSquareValue{1} = 3.84;
$ChiSquareValue{2} = 5.99;
$ChiSquareValue{3} = 7.81;
$ChiSquareValue{4} = 9.49;
$ChiSquareValue{5} = 11.1;
$ChiSquareValue{6} = 12.6;
$ChiSquareValue{7} = 14.1;
$ChiSquareValue{8} = 15.5;
$ChiSquareValue{9} = 16.9;
$ChiSquareValue{10} = 18.3;
$ChiSquareValue{11} = 19.7;
$ChiSquareValue{12} = 21;
$ChiSquareValue{13} = 22.4;
$ChiSquareValue{14} = 23.7;
$ChiSquareValue{15} = 25;
$ChiSquareValue{16} = 26.3;
$ChiSquareValue{17} = 27.6;
$ChiSquareValue{18} = 28.9;
$ChiSquareValue{19} = 30.1;
$ChiSquareValue{20} = 31.4;
$ChiSquareValue{21} = 32.7;
$ChiSquareValue{22} = 33.9;
$ChiSquareValue{23} = 35.2;
$ChiSquareValue{24} = 36.4;
$ChiSquareValue{25} = 37.7;
$ChiSquareValue{26} = 38.9;
$ChiSquareValue{27} = 40.1;
$ChiSquareValue{28} = 41.3;
$ChiSquareValue{29} = 42.6;
$ChiSquareValue{30} = 43.8;
@AltTyps;
$AltTyps[0] = "altFinish";
$AltTyps[1] = "cassetteExon";
$AltTyps[2] = "retainedIntron";
$AltTyps[3] = "bleedingExon";
$AltTyps[4] = "altThreePrime";
$AltTyps[5] = "altFivePrime";
$AltTypsLen = @AltTyps;
@SNPvariations;
$SNPvariations[0] = "A/T";
$SNPvariations[1] = "A/G";
$SNPvariations[2] = "A/C";
$SNPvariations[3] = "G/T";
$SNPvariations[4] = "C/G";
$SNPvariations[5] = "C/T";
$SNPvariations[6] = "A/G/T";
$SNPvariations[7] = "A/C/G";
$SNPvariations[8] = "A/C/T";
$SNPvariations[9] = "C/G/T";
$SNPvariations[10] = "A/C/G/T";
$SNPvariationsLen = @SNPvariations;
#saves the value of each cell in the obs chart
@ObstableArr;
for($k = 0; $k < 4; $k++)
{
$ObstableArr[$k] = 0;
}
$NumOfTests = 0;
@BeginningOrEndOfExonArr;
$BeginningOrEndOfExonArr[0] = "StartOfExon";
$BeginningOrEndOfExonArr[1] = "EndOfExon";
for($i = 0; $i < $AltTypsLen; $i++)
{
for($j = 0; $j < $SNPvariationsLen; $j++)
{
$line = <IN>; # get the line of the gene
#preforming the twice one for the beginning of the exon and ones for the end
for($k = 0; $k < 2; $k++)
{
$NumOfTests++;
while(defined $line)
{
chomp($line);
@LineArray = split(/\t/,$line); # splitting the line to the columns
$Allels = $LineArray[10];
$SNPStartOrEndOfExon = $LineArray[18];
chomp($SNPStartOrEndOfExon);
$line = <IN>; # get the line of the splicing variant
while($line =~ /^!/)
{
@LineAltTypeArray = split(/\t/,$line); # splitting the line to the columns
$alternativeSplicingType = $LineAltTypeArray[3];
chomp($alternativeSplicingType);
if(( $Allels eq $SNPvariations[$j]) && ($alternativeSplicingType eq $AltTyps[$i])&& ($SNPStartOrEndOfExon eq $BeginningOrEndOfExonArr[$k]))
{
#increasing the right cell of the obs table
$ObstableArr[0]++;
$sumOfOBSInstances++;
}
elsif(( $Allels eq $SNPvariations[$j]) && ($alternativeSplicingType ne $AltTyps[$i])&& ($SNPStartOrEndOfExon eq $BeginningOrEndOfExonArr[$k]))
{
#increasing the right cell of the obs table
$ObstableArr[1]++;
$sumOfOBSInstances++;
}
elsif(( $Allels ne $SNPvariations[$j]) && ($alternativeSplicingType eq $AltTyps[$i])&& ($SNPStartOrEndOfExon eq $BeginningOrEndOfExonArr[$k]))
{
#increasing the right cell of the obs table
$ObstableArr[2]++;
$sumOfOBSInstances++;
}
elsif(($SNPStartOrEndOfExon eq $BeginningOrEndOfExonArr[$k]))
{
#( $Allels ne $SNPvariations[$j]) && ($alternativeSplicingType ne $AltTyps[$i])&&
#increasing the right cell of the obs table
$ObstableArr[3]++;
$sumOfOBSInstances++;
}
$line = <IN>;
}
}
close(IN);
open(IN, "SNP splicing joining exon filtered new") or die "Could not open SNP splicing joining exon filtered new";
$line = <IN>;
$line = <IN>;
if($sumOfOBSInstances != 0)
{
#printing the OBS table data
print OUT "statistic test number\t".$NumOfTests."\t".$AltTyps[$i]."\t".$SNPvariations[$j]."\t".$BeginningOrEndOfExonArr[$k]."\n";
print OUT "OBS table\n";
print OUT "\t\t".$AltTyps[$i]."\t no ".$AltTyps[$i]."\n";
print OUT $SNPvariations[$j]."(".$BeginningOrEndOfExonArr[$k].")\t\t".$ObstableArr[0]."\t\t".$ObstableArr[1]."\n";
print OUT "no ".$SNPvariations[$j]."(".$BeginningOrEndOfExonArr[$k].")\t\t".$ObstableArr[2]."\t\t".$ObstableArr[3]."\n\n";
# compute the data needed for the exp table
$SumOfRowWithAllel = $ObstableArr[0] + $ObstableArr[1];
$SumOfRowWIthOutAllel = $ObstableArr[2] + $ObstableArr[3];
$SumOfColWithAS = $ObstableArr[0] + $ObstableArr[2];
$SumOfColWithOutAS = $ObstableArr[1] + $ObstableArr[3];
$expTable[0] = ($SumOfRowWithAllel * $SumOfColWithAS)/$sumOfOBSInstances;
$expTable[1] = ($SumOfRowWithAllel * $SumOfColWithOutAS)/$sumOfOBSInstances;
$expTable[2] = ($SumOfRowWIthOutAllel * $SumOfColWithAS)/$sumOfOBSInstances;
$expTable[3] = ($SumOfRowWIthOutAllel * $SumOfColWithOutAS)/$sumOfOBSInstances;
$expTableHasValueLowerThanFive = 0;
if(($expTable[0] < 5) || ($expTable[1] < 5) ||
($expTable[2] < 5) || ($expTable[3] < 5))
{ # found at least one box in the expTable with value lower than 5 - no correlation
$expTableHasValueLowerThanFive = 1;
}
#prints out the exp table
#print OUT "\n exp table\n";
#print OUT "\t\t".$AltTyps[$i]."\t\t no ".$AltTyps[$i]."\n";
#print OUT $SNPvariations[$j]."\t\t".$expTable[0]."\t".$expTable[1]."\n";
#print OUT "no ".$SNPvariations[$j]."\t".$expTable[2]."\t".$expTable[3]."\n\n ";
#compute Chi Square value and print result
$ChiSquare = 0;
for($h = 0; $h < 4; $h++)
{
if($expTable[$h] != 0)
{
$ChiSquare = ((($ObstableArr[$h] - $expTable[$h])*($ObstableArr[$h] - $expTable[$h]))/$expTable[$h]) + $ChiSquare;
#print OUT $ObstableArr[$h]." - ".$expTable[$h]." * ".$ObstableArr[$h]." - ".$expTable[$h]." / ".$expTable[$h]." + ".$ChiSquare."\n";
}
}
# the dof is always 1 (all the table are 2X2)
$dof = 1;
print OUT "X^2 value = " . $ChiSquare . "\n";
print OUT "d.o.f = " .$dof."\n";
# chaeck if computed p value is lower than 0.05
if ($dof >= 1 && $dof <= 30)
{
# 2.2 there is a correlation between SNP allele and distinct mRNA - print result
$passingChiSquareValue = $ChiSquareValue{$dof};
print OUT "P.V = ".$passingChiSquareValue."\n\n";
if ($ChiSquare >= $passingChiSquareValue)
{
$counter++;
#print YES "statistic test number\t".$NumOfTests."\t".$AltTyps[$i]."\t".$SNPvariations[$j]."\t".$BeginningOrEndOfExonArr[$k]."\n";
print YES "OBS table\n";
print YES "\t\t".$AltTyps[$i]."\t\t no ".$AltTyps[$i]."\n";
print YES $SNPvariations[$j]."(".$BeginningOrEndOfExonArr[$k].")\t\t".$ObstableArr[0]."\t\t".$ObstableArr[1]."\n";
print YES "no ".$SNPvariations[$j]."(".$BeginningOrEndOfExonArr[$k].")\t\t".$ObstableArr[2]."\t\t".$ObstableArr[3]."\n\n";
print YES "X^2 value = " . $ChiSquare . "\n";
print YES "d.o.f = " .$dof."\n";
print YES "P.V = ".$passingChiSquareValue."\n\n";
#print YES "\n exp table\n";
#print YES "\t\t".$AltTyps[$i]."\t\t no ".$AltTyps[$i]."\n";
#print YES $SNPvariations[$j]."\t\t".$expTable[0]."\t".$expTable[1]."\n";
#print YES "no ".$SNPvariations[$j]."\t".$expTable[2]."\t".$expTable[3]."\n\n ";
if($expTableHasValueLowerThanFive == 1)
{
print OUT "Corralation found: No\n";
}
else
{
print OUT "Corralation found: Yes\n";
}
}
else
{
print OUT "Corralation found: No\n";
}
}
print OUT "\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\n";
for($f = 0; $f < 4; $f++)
{
$ObstableArr[$f] = 0;
}
for($f = 0; $f < 4; $f++)
{
$expTable[$f] = 0;
}
$sumOfOBSInstances = 0;
$expTableHasValueLowerThanFive = 0;
}
}
}
print OUT "number of yes's".$counter." \n";
}
#script #10 appendix L
#! /usr/bin/perl-w
#***********************- Description -*******************************************************
# This script finds all the mRNA and EST sequences that match a certain SNP and its containing
# gene.
#*********************************************************************************************
# open files
open(HUMAN_EST, "human_est_UCSC") or die "Could not open HUMAN_EST";
open(HUMAN_MRNA, "human_mRNA_UCSC") or die "Could not open HUMAN_MRNA";
open(SNPGENES, "SNP splicing joining exon filtered new") or die "Could not open HUMAN_EST";
open(OUT, ">human_mrna_est_UCSC_filtered_by_SNP") or die "human_mrna_est_UCSC_filtered_by_SNP";
#print headers
print OUT "\$NM_000000\t\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\n";
print OUT "\@SNIP\t\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\n";
print OUT "chrom\tchromStart\tchromEnd\tstrand\tgeneId\tcodigSeqStart\tcodigSeqEnd\ttranscriptStart\ttranscriptEnd\tSNPlocation\tAllels\tSNPId\talternativeSplicingType\tnumberOfExons\tExonsStart\tExonsEnd\tSNPInExonNumber\tSNPoffsetInExon\tSNPStartOrEndExon\n";
print OUT "!\taltEventChromStart\taltEventChromEnd\taltEventType\n";
print OUT "> typeOfSeq\tblockNumber\tOffset\t#bin\tmatches\tmisMatches\trepMatches\tnCount\tqNumInsert\tqBaseInsert\ttNumInsert\ttBaseInsert\tstrand\tqName\tqSize\tqStart\tqEnd\ttName\ttSize\ttStart\ttEnd\tblockCount\tblockSizes\tqStarts\ttStarts\n\n";
$line = <SNPGENES>;
$line = <SNPGENES>;
$line = <SNPGENES>;
while (defined $line)
{
$altEventLines = "";
$matchedSequencesLines = ""; # holds the returned result of all matched sequences
@LineArray = split(/\t/,$line); # splitting the line to the columns
$geneSNPLine = $line;
$geneId = $LineArray[4];
$SNPchrom = @LineArray[0];
$SNPstrand = @LineArray[3];
$SNPlocation = @LineArray[9];
$SNPID = @LineArray[11];
# call helper function that finds all the sequences that match gene and SNP
$matchedSequencesLines = findSequencesBySNPLocation($SNPchrom, $SNPstrand, $SNPlocation);
# concatinate all alternative events
$line = <SNPGENES>;
while ($line =~ /^!/)
{
$altEventLines = $altEventLines . $line;
$line = <SNPGENES>;
}
chomp($altEventLines);
chomp($matchedSequencesLines);
# print results to output file
print OUT "\$". $geneId ."\t\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\$\n";
print OUT "\@". $SNPID ."\t\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\n";
print OUT $geneSNPLine;
print OUT $altEventLines . "\n";
print OUT $matchedSequencesLines . "\n";
}
close(HUMAN_EST);
close(HUMAN_MRNA);
close(SNPGENES);
close(OUT);
#findSequencesBySNPLocation($chrom, $strand, $SNPlocation)
sub findSequencesBySNPLocation()
{
seek(HUMAN_EST, 0, SEEK_SET); # move file cursor to start of file
seek(HUMAN_MRNA, 0, SEEK_SET); # move file cursor to start of file
my $i_chrom = $_[0];
my $i_strand = $_[1];
my $i_SNPlocation = $_[2];
# data structure to hold sequence information
my $matchedSequencesResult;
my @LineArray;
my $chrom;
my $strand;
my $tStart;
my $tEnd;
my $numberOfBlocks;
my $blocksSize;
my @blocksSize;
my $blockStart;
my @blocksStart;
my $qName;
my $mRNALine = <HUMAN_MRNA>;
$mRNALine = <HUMAN_MRNA>;
while (defined $mRNALine) # loop over mRNA sequences
{
my @LineArray = split(/\t/,$mRNALine); # splitting the line to the columns
my $chrom = @LineArray[14];
my $strand = @LineArray[9];
my $tStart = @LineArray[16];
my $tEnd = @LineArray[17];
my $numberOfBlocks = @LineArray[18];
my $blocksSize = @LineArray[19];
my @blocksSize = split(/,/,$blocksSize);
my $blockStart = @LineArray[21];
my @blocksStart = split(/,/,$blockStart);
my $qName = @LineArray[10];
if( $chrom eq $i_chrom) #checks if the SNP is in the same chromosome as the mRNA
{
if($strand eq $i_strand) #checks if the SNP is in the same strand as the mRNA
{
if(($tStart <= $i_SNPlocation) && ($i_SNPlocation <= $tEnd)) #checks if the SNP is in the range of the mRNA
{
for(my $i = 0 ; $i < $numberOfBlocks ; $i++) #for each block in the mRNA
{
my $blockEnd = $blocksStart[$i] + $blocksSize[$i];
if(($blocksStart[$i] <= $i_SNPlocation) && ($blockEnd >= $i_SNPlocation)) # checks if the SNP is in the range of a block of the mRNA
{
my $offsetSnp = $i_SNPlocation - $blocksStart[$i]; # calculates the location of the snp in the mRNA block
my $index = $i + 1; # the block that the SNP is in
$matchedSequencesResult = $matchedSequencesResult . "> mRNA\t".$index."\t".$offsetSnp."\t".$mRNALine;
break;
}
}
}
}
}
$mRNALine = <HUMAN_MRNA>;
}
my $estLine = <HUMAN_EST>;
$estLine = <HUMAN_EST>;
while (defined $estLine) # loop over EST sequences
{
my @LineArray = split(/\t/,$estLine); # splitting the line to the columns
my $chrom = @LineArray[14];
my $strand = @LineArray[9];
my $tStart = @LineArray[16];
my $tEnd = @LineArray[17];
my $numberOfBlocks = @LineArray[18];
my $blocksSize = @LineArray[19];
my @blocksSize = split(/,/,$blocksSize);
my $blockStart = @LineArray[21];
my @blocksStart = split(/,/,$blockStart);
my $qName = @LineArray[10];
if( $chrom eq $i_chrom) #checks if the SNP is in the same chromosome as the EST
{
if($strand eq $i_strand) #checks if the SNP is in the same strand as the EST
{
if(($tStart <= $i_SNPlocation) && ($i_SNPlocation <= $tEnd)) #checks if the SNP is in the range of the EST
{
for(my $i = 0 ; $i < $numberOfBlocks ; $i++) #for each block in the EST
{
my $blockEnd = $blocksStart[$i] + $blocksSize[$i];
if(($blocksStart[$i] <= $i_SNPlocation) && ($blockEnd >= $i_SNPlocation)) # checks if the SNP is in the range of a block of the EST
{
my $offsetSnp = $i_SNPlocation - $blocksStart[$i]; # calculates the location of the snp in the EST block
my $index = $i + 1; # the block that the SNP is in
$matchedSequencesResult = $matchedSequencesResult . "> EST\t".$index."\t".$offsetSnp."\t".$estLine;
break;
}
}
}
}
}
$estLine = <HUMAN_EST>;
}
return $matchedSequencesResult;
}
#script #11 appendix M
#! /usr/bin/perl-w
#***********************- Description -*************************
# This script uses the computed block number and SNP offset to extract
# the SNP allele from the sequence.
#***************************************************************
#***********************- pseudo code -*************************
#1. foreach sequence in the input file (starts with ">")
# 1.1 mark the sequence ID that needed from the sequences database in hashtables - mrnaIDHash and estIDHash
#2. extract only marked sequences from databse into hashtables - mrnaHash and estHash
#3. foreach sequence in the input file (starts with ">")
#4. extract the snip allele an print it to the output file
#***************************************************************
# openning the required input and output files
open(SEQ, "human_mrna_est_UCSC_filtered_by_SNP") or die "Could not open human_mrna_est_UCSC_filtered_by_SNP";
open(EST, "est.fa") or die "Could not open human_ESTs_sequences_UCSC";
open(MRNA, "mrna.fa") or die "Could not open human_mRNA_sequences_UCSC";
open(OUT, ">human_mrna_est_UCSC_filtered_by_SNP_with_allele") or die "human_mrna_est_UCSC_filtered_by_SNP_with_allele"; # output file
open(LOG, ">LOG") or die "Could not open LOG"; # log file of program
open(TOUT, ">TOUT") or die "Could not open LOG";
# get header of input file
$geneIDHeader = <SEQ>; # get header of input file
$SNPHeader = <SEQ>; # get header of input file
$SNPInfoHeader = <SEQ>; # get header of input filed
$AltTypeHeader = <SEQ>; # get header of input filed
$seqHeader = <SEQ>; # get header of input file
# set header of output file
print OUT $geneIDHeader;
print OUT $SNPHeader;
print OUT "\%Allel\t\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\n";
print OUT $SNPInfoHeader;
print OUT $AltTypeHeader;
print OUT $seqHeader;
print OUT "\n";
$line = <SEQ>;
$line = <SEQ>;
%mrnaIDHash = ();
%estIDHash = ();
# go over the input file, filter out gene withut alternative splicing events or any sequences
# and mark the required sequences for extraction from database
while (defined $line)
{
if($line =~ /^\$/)
{
if ($sequencesOfGeneCounter > 0 && $altEventsOfGeneCounter > 0)
{
print TOUT $geneLines;
}
$sequencesOfGeneCounter = 0;
$altEventsOfGeneCounter = 0;
$geneLines = "";
}
elsif($line =~ /^!/)
{
$altEventsOfGeneCounter++;
}
elsif($line =~ /^>/)
{
$sequencesOfGeneCounter++;
@SeqInfo = split(/\t/,$line);
$seqType = substr($SeqInfo[0], 2);
$seqName = $SeqInfo[13];
chomp($seqName);
if($seqType eq "mRNA") # mark this sequence ID for extraction from database
{
$mrnaIDHash{$seqName} = 1;
}
elsif($seqType eq "EST")
{
$estIDHash{$seqName} = 1;
}
}
$geneLines = $geneLines . $line;
$line = <SEQ>;
}
# fill hash table with marked mRNA sequences for extraction
%mrnaHash = ();
$line = <MRNA>;
while (defined $line)
{
if($line =~ /^>/)
{
$sequenceID = $line;
$seq = "";
$line = <MRNA>;
while(defined $line)
{
if($line =~ /^>/)
{
@LineArray = split(/ /,$sequenceID);
$tempID = $LineArray[0];
$tempID = substr($tempID, 1);
chomp($tempID);
if (defined $mrnaIDHash{$tempID}) # if this sequence was marked for extraction
{
print LOG "mrnaHash:" . $tempID . "\n";
$mrnaHash{$tempID} = $seq;
}
last;
}
else
{
chomp($line);
$seq = $seq . $line;
}
$line = <MRNA>;
}
}
}
%mrnaIDHash = ();
# fill hash table with marked EST sequences for extraction
%estHash = ();
$line = <EST>;
while (defined $line)
{
if($line =~ /^>/)
{
$sequenceID = $line;
$seq = "";
$line = <EST>;
while(defined $line)
{
if($line =~ /^>/)
{
@LineArray = split(/ /,$sequenceID);
$tempID = $LineArray[0];
$tempID = substr($tempID, 1);
chomp($tempID);
if (defined $estIDHash{$tempID}) # if this sequence was marked for extraction
{
print LOG "estHash:" . $tempID . "\n";
$estHash{$tempID} = $seq;
}
last;
}
else
{
chomp($line);
$seq = $seq . $line;
}
$line = <EST>;
}
}
}
%estIDHash = ();
close(TOUT);
open(TOUT, "TOUT") or die "Could not open TOUT";
$line = <TOUT>;
while (defined $line)
{
chomp($line);
print OUT $line."\n";
if($line =~ /^>/) # foreach sequence - extract the smip allele from the sequence
{
@SeqInfo = split(/\t/,$line);
$seqType = substr($SeqInfo[0], 2);
$seqName = $SeqInfo[13];
chomp($seqName);
$blockNumber = $SeqInfo[1];
$offset = $SeqInfo[2];
$allBlocks = $SeqInfo[23];
@BlockArray = split(/,/,$allBlocks);
$SNPBlock = $BlockArray[$blockNumber-1];
$snipIndex = $SNPBlock + $offset - 1;
$strand = $SeqInfo[12];
if($seqType eq "mRNA")
{
if (defined $mrnaHash{$seqName})
{
$seq = $mrnaHash{$seqName}; # get sequence from hash
if($strand eq "-") # if needed reverse the sequence to get opposite strand
{
$reversed = reverse $seq;
$seq = $reversed;
}
$snip = substr($seq, $snipIndex+1, 1); # extract snip allele
print OUT "\%" . $snip . "\t\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\n";
}
}
elsif($seqType eq "EST")
{
print LOG "if defined: " . $seqName . "\n";
if (defined $estHash{$seqName})
{
$seq = $estHash{$seqName}; # get sequence from hash
if($strand eq "-") # if needed reverse the sequence to get opposite strand
{
$reversed = reverse $seq;
$seq = $reversed;
}
$snip = substr($seq, $snipIndex+1, 1); # extract snip allele
print OUT "\%" . $snip . "\t\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\%\n";
}
}
}
$line = <TOUT>;
}
close(SEQ);
close(EST);
close(MRNA);
close(OUT);
close(LOG);
close(TOUT);
#script #12 appendix N
#! /usr/bin/perl-w
#****************- Description -***********************************************
# This script finds per each sequence the relevant alternative splicing events
# that were found at the sequence.
# sequences with no type of alternative splicing are not printed out to the output
# file.
#******************************************************************************
open(IN, "human_mrna_est_UCSC_filtered_by_SNP_with_allele") or die "human_mrna_est_UCSC_filtered_by_SNP_with_allele"; # output file
open(OUT, ">human_mrna_est_UCSC_filtered_by_SNP_with_allele_with_splicing_type") or die "human_mrna_est_UCSC_filtered_by_SNP_with_allele"; # output file
# get header of input file
$geneIDHeader = <IN>; # get header of input file
$SNPHeader = <IN>; # get header of input file
$SNPAlleleHeader = <IN>; # get header of input filed
$SNPInfoHeader = <IN>; # get header of input filed
$AltTypeHeader = <IN>; # get header of input filed
$seqHeader = <IN>; # get header of input file
# set header of output file
print OUT $geneIDHeader;
print OUT $SNPHeader;
print OUT $SNPInfoHeader;
print OUT $AltTypeHeader;
print OUT $seqHeader;
print OUT $SNPAlleleHeader;
print OUT "\#\taltEventChromStart\taltEventChromEnd\taltEventType\n";
print OUT "\n";
$line = <IN>;
$line = <IN>;
$currGeneLine;
# go over the input file
while (defined $line)
{
if($line =~ /^\$/) # new gene block of data
{
$currAltEventsCounter = 0; # clear currrent alternative splicing events counter
@currAltEventsArray = (); # clear currrent alternative splicing events array
print OUT $line;
}
elsif($line =~ /^\@/) #line of gene information
{
print OUT $line;
}
elsif($line =~ /^chr/) #line of gene information
{
$currGeneLine = $line;
print OUT $line;
}
elsif($line =~ /^!/) #line of gene information
{
# insert new alternative splicing event into array
$currAltEventsArray[$currAltEventsCounter] = $line;
$currAltEventsCounter++;
print OUT $line;
}
elsif($line =~ /^>/) #line of gene information
{
$currSeqLine = $line; # get sequence line
$line = <IN>; # get sequence SNP allele
$currAlleleLine = $line;
$matchedAltEventForSeq = ""; # init string holding all matched alternative splicing events of sequence
# foreach of current genes alternative splicing events
for ($i = 0; $i < $currAltEventsCounter; $i++)
{
$tempAltEventLine = $currAltEventsArray[$i]; # copy current alternative splicing event
# call helper function to check if alternative splicing event is present in current sequence
$bool = iSAltEventPresentInSequence($currSeqLine, $tempAltEventLine, $currGeneLine);
if ($bool == 1) # if alternative splicing event is present in current sequence
{ # concatinate all the relevant results
$matchedAltEventForSeq = $matchedAltEventForSeq . "#\t" . $tempAltEventLine;
}
}
if ($matchedAltEventForSeq ne "") # print out all the alternatove splicing events matched to current sequence
{
print OUT $currSeqLine;
print OUT $currAlleleLine;
print OUT $matchedAltEventForSeq;
}
}
$line = <IN>;
}
close(IN);
close(OUT);
#iSAltEventPresentInSequence($sequenceLine, $altEventLine, $geneLine)
sub iSAltEventPresentInSequence()
{
my $i_sequenceLine = $_[0];
my $i_altEventLine = $_[1];
my $i_geneLine = $_[2];
# split input lines to arrays
my @seqLineArray = split(/\t/,$i_sequenceLine);
my @altEventLineArray = split(/\t/,$i_altEventLine);
my @geneLineArray = split(/\t/,$i_geneLine);
my $altEventType = $altEventLineArray[3];
chomp($altEventType);
#chomp($altEventType); # temp workaround - when running in windows
chop($altEventType); # temp workaround - when running in linux
if ($altEventType eq "altFinish")
{
# check if alt finish event is present in sequence.
# the chromEnd of alternative event represents the end of transcript
# check if the transcript end position matches the chromEnd of alternative splicing event
my $altEventChromStart = $altEventLineArray[1];
my $altEventChromEnd = $altEventLineArray[2];
my $seqTranscriptEnd = $seqLineArray[20];
chomp($altEventChromEnd);
chomp($seqTranscriptEnd);
if ($seqTranscriptEnd eq $altEventChromEnd)
{
return 1; # alt finish found - return true
}
}
elsif ($altEventType eq "cassetteExon")
{
# check if sequence has a matching exon to the alternative splicing event positions
# the chromStart of altEvent represent exon start position
# the chromEnd of altEvent represents exon end position
# extract relevant data
my $altEventChromStart = $altEventLineArray[1];
my $altEventChromEnd = $altEventLineArray[2];
my $seqTranscriptStart = $seqLineArray[19];
my $seqTranscriptEnd = $seqLineArray[20];
my $numberOfBlocks = $seqLineArray[21];
my $blockSizesLine = $seqLineArray[22];
my $transStartsLine = $seqLineArray[24];
my @blocksSizesArray = split(/,/,$blockSizesLine);
my @blocksStartArray = split(/,/,$transStartsLine);
chomp($altEventChromStart);
chomp($altEventChromEnd);
# if alternative splicing event is within the sequence transcription positions
if(($seqTranscriptStart <= $altEventChromStart) && ($altEventChromEnd <= $seqTranscriptEnd))
{
# go over all sequence blocks - if the relevant exon doesnt exist in
# transcript then cassetteExon event found
for (my $j = 0; $j < $numberOfBlocks; $j++)
{
# compute block end position
my $blockEndPosition = $blocksStartArray[$j] + $blocksSizesArray[$j];
# if alternative splicing event matches the current block - cassette exon wasnt found in sequence
if(($blocksStartArray[$j] <= $altEventChromStart) && ($altEventChromEnd <= $blockEndPosition))
{
return 0; # cassette exon wasnt found - return false
}
elsif(($altEventChromStart <= $blocksStartArray[$j]) && ($blockEndPosition <= $altEventChromEnd))
{
return 0; # cassette exon wasnt found - return false
}
}
}
# after looping over sequence blocks, the matching block to the cassette exon was not found - return true
return 1;
}
elsif ($altEventType eq "retainedIntron")
{
# check if in the sequence there is a block which matches the positions of intron start and end
# the chromStart of altEvent represent intron start position
# the chromEnd of altEvent represents intron end position
# extract relevant data
my $altEventChromStart = $altEventLineArray[1];
my $altEventChromEnd = $altEventLineArray[2];
my $seqTranscriptStart = $seqLineArray[19];
my $seqTranscriptEnd = $seqLineArray[20];
my $numberOfBlocks = $seqLineArray[21];
my $blockSizesLine = $seqLineArray[22];
my $transStartsLine = $seqLineArray[24];
my @blocksSizesArray = split(/,/,$blockSizesLine);
my @blocksStartArray = split(/,/,$transStartsLine);
chomp($altEventChromStart);
chomp($altEventChromEnd);
# if alternative splicing event is within the sequence transcription positions
if(($seqTranscriptStart <= $altEventChromStart) && ($altEventChromEnd <= $seqTranscriptEnd))
{
# go over all sequence blocks - if the relevant intron is found in
# a block then retained intron event is found
for (my $j = 0; $j < $numberOfBlocks; $j++)
{
# compute block end position
my $blockEndPosition = $blocksStartArray[$j] + $blocksSizesArray[$j];
# if alternative splicing event matches the current block - retained intron was found in sequence
if(($blocksStartArray[$j] <= $altEventChromStart) && ($altEventChromEnd <= $blockEndPosition))
{
return 1; # retained intron was found - return true
}
elsif(($altEventChromStart <= $blocksStartArray[$j]) && ($blockEndPosition <= $altEventChromEnd))
{
return 1; # retained intron was found - return true
}
}
}
}
elsif ($altEventType eq "bleedingExon")
{
# check if initail or terminal exon overlap an intron of the exact positions as given by the
# chromStart chromEnd positions of the alternative splicing event.
# chromStart represent the start position of additional intronic sequence to the exon.
# chromEnd represent the end position of additional intronic sequence to the exon.
# extract relevant data
my $altEventChromStart = $altEventLineArray[1];
my $altEventChromEnd = $altEventLineArray[2];
my $seqTranscriptStart = $seqLineArray[19];
my $seqTranscriptEnd = $seqLineArray[20];
my $numberOfBlocks = $seqLineArray[21];
my $blockSizesLine = $seqLineArray[22];
my $transStartsLine = $seqLineArray[24];
my @blocksSizesArray = split(/,/,$blockSizesLine);
my @blocksStartArray = split(/,/,$transStartsLine);
chomp($altEventChromStart);
chomp($altEventChromEnd);
# if alternative splicing event is within the sequence transcription positions
if(($seqTranscriptStart <= $altEventChromStart) && ($altEventChromStart <= $seqTranscriptEnd))
{
my $firstBlockIndex = 0;
my $lastBlockIndex = $numberOfBlocks - 1;
# compute block end positions
my $firstBlockEndPosition = $blocksStartArray[$firstBlockIndex] + $blocksSizesArray[$firstBlockIndex];
my $lastBlockEndPosition = $blocksStartArray[$lastBlockIndex] + $blocksSizesArray[$lastBlockIndex];
# if alternative splicing event matches the initail or terminal exon - bleedingExon was found
if(($blocksStartArray[$firstBlockIndex] <= $altEventChromStart) && ($altEventChromEnd <= $firstBlockEndPosition))
{
return 1; # bleedingExon was found - return true
}
elsif(($blocksStartArray[$lastBlockIndex] <= $altEventChromStart) && ($altEventChromEnd <= $lastBlockEndPosition))
{
return 1; # bleedingExon was found - return true
}
}
}
elsif ($altEventType eq "altThreePrime")
{
# check if in a sequence there is an intron with alternative 3' position.
# this affects the 5' position of the adjacent exon.
# the chromStart/chromEnd positions of alt Event represent the 5' positions of adjacent exon.
# extract relevant data
my $currStrand = $geneLineArray[3];
my $altEventChromStart = $altEventLineArray[1];
my $altEventChromEnd = $altEventLineArray[2];
my $seqTranscriptStart = $seqLineArray[19];
my $seqTranscriptEnd = $seqLineArray[20];
my $numberOfBlocks = $seqLineArray[21];
my $blockSizesLine = $seqLineArray[22];
my $transStartsLine = $seqLineArray[24];
my @blocksSizesArray = split(/,/,$blockSizesLine);
my @blocksStartArray = split(/,/,$transStartsLine);
my $numberOfExons = $geneLineArray[13];
my $exonStartLine = $geneLineArray[14];
my $exonEndLine = $geneLineArray[15];
my @exonStartsArray = split(/,/,$exonStartLine);
my @exonEndsArray = split(/,/,$exonEndLine);
my $altEventExonStart;
my $altEventExonEnd;
chomp($altEventChromStart);
chomp($altEventChromEnd);
# loop over all gene exons and find the matching gene exon
for (my $j = 0; $j < $numberOfExons; $j++)
{
if (($exonStartsArray[$j] <= $altEventChromStart) && ($altEventChromEnd <= $exonEndsArray[$j]))
{ # found the exon in which the alternative splicing event happens
$altEventExonStart = $exonStartsArray[$j];
$altEventExonEnd = $exonEndsArray[$j];
}
}
if ($currStrand eq "+")
{
# go over all sequence blocks
for (my $j = 0; $j < $numberOfBlocks; $j++)
{
# compute block end position
my $blockEndPosition = $blocksStartArray[$j] + $blocksSizesArray[$j];
# if the block is smaller then exon due to altThreePrime event in adjacent intron
if (($blocksStartArray[$j] eq $altEventChromEnd) && ($blockEndPosition eq $altEventExonEnd))
{
return 1;# altThreePrime was found - return true
}
}
}
elsif ($currStrand eq "-")
{
# go over all sequence blocks
for (my $j = 0; $j < $numberOfBlocks; $j++)
{
# compute block end position
my $blockEndPosition = $blocksStartArray[$j] + $blocksSizesArray[$j];
# if the block is smaller then exon due to altThreePrime event in adjacent intron
if (($blocksStartArray[$j] eq $altEventExonStart) && ($blockEndPosition eq $altEventChromStart))
{
return 1;# altThreePrime was found - return true
}
}
}
}
elsif ($altEventType eq "altFivePrime")
{
# check if in a sequence there is an intron with alternative 5' position.
# this affects the 3' position of the adjacent exon.
# the chromStart/chromEnd positions of alt Event represents the 3' positions of adjacent exon.
# extract relevant data
my $currStrand = $geneLineArray[3];
my $altEventChromStart = $altEventLineArray[1];
my $altEventChromEnd = $altEventLineArray[2];
my $seqTranscriptStart = $seqLineArray[19];
my $seqTranscriptEnd = $seqLineArray[20];
my $numberOfBlocks = $seqLineArray[21];
my $blockSizesLine = $seqLineArray[22];
my $transStartsLine = $seqLineArray[24];
my @blocksSizesArray = split(/,/,$blockSizesLine);
my @blocksStartArray = split(/,/,$transStartsLine);
my $numberOfExons = $geneLineArray[13];
my $exonStartLine = $geneLineArray[14];
my $exonEndLine = $geneLineArray[15];
my @exonStartsArray = split(/,/,$exonStartLine);
my @exonEndsArray = split(/,/,$exonEndLine);
my $altEventExonStart;
my $altEventExonEnd;
chomp($altEventChromStart);
chomp($altEventChromEnd);
# loop over all gene exons and find the matching gene exon
for (my $j = 0; $j < $numberOfExons; $j++)
{
if (($exonStartsArray[$j] <= $altEventChromStart) && ($altEventChromEnd <= $exonEndsArray[$j]))
{ # found the exon in which the alternative splicing event happens
$altEventExonStart = $exonStartsArray[$j];
$altEventExonEnd = $exonEndsArray[$j];
}
}
if ($currStrand eq "+")
{
# go over all sequence blocks
for (my $j = 0; $j < $numberOfBlocks; $j++)
{
# compute block end position
my $blockEndPosition = $blocksStartArray[$j] + $blocksSizesArray[$j];
# if the block is smaller then exon due to altThreePrime event in adjacent intron
if (($blocksStartArray[$j] eq $altEventExonStart) && ($blockEndPosition eq $altEventChromStart))
{
return 1;# altFivePrime was found - return true
}
}
}
elsif ($currStrand eq "-")
{
# go over all sequence blocks
for (my $j = 0; $j < $numberOfBlocks; $j++)
{
# compute block end position
my $blockEndPosition = $blocksStartArray[$j] + $blocksSizesArray[$j];
# if the block is smaller then exon due to altFivePrime event in adjacent intron
if (($blocksStartArray[$j] eq $altEventChromEnd) && ($blockEndPosition eq $altEventExonEnd))
{
return 1;# altFivePrime was found - return true
}
}
}
}
return 0; # return false
}
#script #13 appendix O
#! /usr/bin/perl-w
#***********************- Description -*******************************************************
#this script compute the second test by creating 11 tables (that had the most distinct results)
#and checks the corralation between alernative splicing type and a specific allel.
#*********************************************************************************************
open(IN, "human_mrna_est_UCSC_filtered_by_SNP_with_allele_with_splicing_type") or die "Could not open human_mrna_est_UCSC_filtered_by_SNP_with_allele_with_splicing_type"; # UCSC table of alternative events - size: 91,223
open(OUT, ">statTest4") or die "Could not open statTest4"; # output file
@TableArr;
$counter = 1;
# construct chi square table with p.v. = 0.05
%ChiSquareValue = (); # key-d.o.f value-chi square value
$ChiSquareValue{1} = 3.84;
$ChiSquareValue{2} = 5.99;
$ChiSquareValue{3} = 7.81;
$ChiSquareValue{4} = 9.49;
$ChiSquareValue{5} = 11.1;
$ChiSquareValue{6} = 12.6;
$ChiSquareValue{7} = 14.1;
$ChiSquareValue{8} = 15.5;
$ChiSquareValue{9} = 16.9;
$ChiSquareValue{10} = 18.3;
$ChiSquareValue{11} = 19.7;
$ChiSquareValue{12} = 21;
$ChiSquareValue{13} = 22.4;
$ChiSquareValue{14} = 23.7;
$ChiSquareValue{15} = 25;
$ChiSquareValue{16} = 26.3;
$ChiSquareValue{17} = 27.6;
$ChiSquareValue{18} = 28.9;
$ChiSquareValue{19} = 30.1;
$ChiSquareValue{20} = 31.4;
$ChiSquareValue{21} = 32.7;
$ChiSquareValue{22} = 33.9;
$ChiSquareValue{23} = 35.2;
$ChiSquareValue{24} = 36.4;
$ChiSquareValue{25} = 37.7;
$ChiSquareValue{26} = 38.9;
$ChiSquareValue{27} = 40.1;
$ChiSquareValue{28} = 41.3;
$ChiSquareValue{29} = 42.6;
$ChiSquareValue{30} = 43.8;
# an arr that contains the tables that we will preforme the second statistic test on
$TableArr[0] = "cassetteExon\ta!g\tEndOfExon";
$TableArr[1] = "cassetteExon\ta!c\tEndOfExon";
$TableArr[2] = "cassetteExon\tc!t\tEndOfExon";
$TableArr[3] = "retainedIntron\ta!g\tEndOfExon";
$TableArr[4] = "altFinish\ta!c!g\tEndOfExon";
$TableArr[5] = "retainedIntron\tc!t\tEndOfExon";
$TableArr[6] = "bleedingExon\ta!g\tEndOfExon";
$TableArr[7] = "bleedingExon\tc!t\tEndOfExon";
$TableArr[8] = "altThreePrime\tc!t\tEndOfExon";
$TableArr[9] = "cassetteExon\tg!t\tStartOfExon";
$TableArr[10] = "altFinish\ta!c\tEndOfExon";
$TableArrLen = @TableArr;
for($i = 0; $i < 9; $i++)
{
$line = <IN>;
}
@resTable;
for($i = 0; $i < $TableArrLen; $i++) #a loop that pases on the wanted tabel that are contained in tha arr TableArr above
{
$sumOfOBSInstances = 0; # a counter for all the items that are in the needed table
for($t = 0; $t < 6; $t++) # reset the result table for each table that we are performing the test
{
$resTable[$t] = 0;
}
while(defined $line)
{
if($line =~ /^chr/)
{
@GeneArray = split(/\t/,$line); # splitting the line to the columns
$StartOrEndOfExon = $GeneArray[18];
chomp($StartOrEndOfExon);
$line = <IN>;
while( $line =~ /^!/)
{
$line = <IN>;
}
}
while( $line =~ /^>/) #if the line represents a sequence (mRNA or EST)
{
$line = <IN>;
$allel = substr($line,1,1);
$line = <IN>;
$NumberOfAltTypesPerSeq = 0;
while( $line =~ /^#/) #taking all the alternative splicing types of a certain sequence
{
@LineArray = split(/\t/,$line); # splitting the line to the columns
$altType = $LineArray[4];
chomp($altType);
$AltTypeArr[$NumberOfAltTypesPerSeq] = $altType; # an arr that contains all the alternative splicing types
$NumberOfAltTypesPerSeq++;
$line = <IN>;
}
#taking the data of the current test from the table arr above
@TableArrData = split(/\t/,$TableArr[$i]);
$altTypeFromTableArr = $TableArrData[0];
$allels = $TableArrData[1];
@allelsArrFromTableArr = split(/!/,$allels);
$numberOfAllelsInArr = @allelsArrFromTableArr;
$StartOrEndOfExonFromTableArr = $TableArrData[2];
if( $StartOrEndOfExon eq $StartOrEndOfExonFromTableArr)
{
for($j = 0; $j < $numberOfAllelsInArr; $j++)
{
for($b = 0; $b < $NumberOfAltTypesPerSeq; $b++) #checking if ther is a match for each type of splicing in the mRNA/EST
{
if($allel eq $allelsArrFromTableArr[$j])
{
$sumOfOBSInstances++;
if( $AltTypeArr[$b] eq $altTypeFromTableArr)
{
#calculating the index in the obs table
$TableIndex = ($j * 2);
#increasing the right cell of the obs table
$resTable[$TableIndex]++;
}
else
{
#calculating the index in the obs table
$TableIndex = ($j * 2) + 1;
#increasing the right cell of the obs table
$resTable[$TableIndex]++;
}
}
}
}
}
}
$line = <IN>;
}
print OUT "table number ".$counter." ";
$counter++;
print OUT $altTypeFromTableArr." ";
#compute the exp table
for( $c = 0; $c < $numberOfAllelsInArr; $c++)
{
print OUT $allelsArrFromTableArr[$c]." ";
}
print OUT $StartOrEndOfExonFromTableArr."\n\n";
$expTableHasValueLowerThanFive = 0;
if($numberOfAllelsInArr == 2)
{
$SumOfRowWithFirstAllel = $resTable[0] + $resTable[1];
$SumOfRowWIthSecondAllel = $resTable[2] + $resTable[3];
$SumOfColWithAS = $resTable[0] + $resTable[2];
$SumOfColWithOutAS = $resTable[1] + $resTable[3];
}
else
{
$SumOfRowWithFirstAllel = $resTable[0] + $resTable[1];
$SumOfRowWIthSecondAllel = $resTable[2] + $resTable[3];
$SumOfRowWIthThridAllel = $resTable[4] + $resTable[5];
$SumOfColWithAS = $resTable[0] + $resTable[2] + $resTable[4];
$SumOfColWithOutAS = $resTable[1] + $resTable[3] + $resTable[5];
}
if($numberOfAllelsInArr == 2)
{
$expTable[0] = ($SumOfRowWithFirstAllel * $SumOfColWithAS)/$sumOfOBSInstances;
$expTable[1] = ($SumOfRowWithFirstAllel * $SumOfColWithOutAS)/$sumOfOBSInstances;
$expTable[2] = ($SumOfRowWIthSecondAllel * $SumOfColWithAS)/$sumOfOBSInstances;
$expTable[3] = ($SumOfRowWIthSecondAllel * $SumOfColWithOutAS)/$sumOfOBSInstances;
if(($expTable[0] < 5) || ($expTable[1] < 5) ||
($expTable[2] < 5) || ($expTable[3] < 5))
{ # found at least one box in the expTable with value lower than 5 - no correlation
$expTableHasValueLowerThanFive = 1;
}
}
else
{
$expTable[0] = ($SumOfRowWithFirstAllel * $SumOfColWithAS)/$sumOfOBSInstances;
$expTable[1] = ($SumOfRowWithFirstAllel * $SumOfColWithOutAS)/$sumOfOBSInstances;
$expTable[2] = ($SumOfRowWIthSecondAllel * $SumOfColWithAS)/$sumOfOBSInstances;
$expTable[3] = ($SumOfRowWIthSecondAllel * $SumOfColWithOutAS)/$sumOfOBSInstances;
$expTable[4] = ($SumOfRowWIthThridAllel * $SumOfColWithAS) /$sumOfOBSInstances;
$expTable[5] = ($SumOfRowWIthThridAllel * $SumOfColWithOutAS) /$sumOfOBSInstances;
if(($expTable[0] < 5) || ($expTable[1] < 5) ||
($expTable[2] < 5) || ($expTable[3] < 5) ||
($expTable[4] < 5) || ($expTable[5] < 5))
{ # found at least one box in the expTable with value lower than 5 - no correlation
$expTableHasValueLowerThanFive = 1;
}
}
#compute Chi Square value and print result
$ChiSquare = 0;
if($numberOfAllelsInArr == 2)
{
for($h = 0; $h < 4; $h++)
{
if($expTable[$h] != 0)
{
$ChiSquare = ((($resTable[$h] - $expTable[$h])*($resTable[$h] - $expTable[$h]))/$expTable[$h]) + $ChiSquare;
}
}
}
else
{
for($h = 0; $h < 6; $h++)
{
if($expTable[$h] != 0)
{
$ChiSquare = ((($resTable[$h] - $expTable[$h])*($resTable[$h] - $expTable[$h]))/$expTable[$h]) + $ChiSquare;
}
}
}
#compute the dof
$dof = $numberOfAllelsInArr - 1;
#printing to the output file
print OUT "OBS table\n";
print OUT "\t".$altTypeFromTableArr."\tno ".$altTypeFromTableArr."\n";
$z = 0;
for($h = 0; $h < $numberOfAllelsInArr; $h++)
{
print OUT $allelsArrFromTableArr[$h]."\t\t".$resTable[$z]."\t\t".$resTable[$z+1]."\n";
$z++;
$z++;
}
#print OUT "EXP table\n";
#print OUT "\t".$altTypeFromTableArr."\tno ".$altTypeFromTableArr."\n";
#$z = 0;
#for($h = 0; $h < $numberOfAllelsInArr; $h++)
#{
# print OUT $allelsArrFromTableArr[$h]."\t\t".$expTable[$z]."\t\t".$expTable[$z+1]."\n";
# $z++;
# $z++;
#}
print OUT "\n";
print OUT "X^2 value = " . $ChiSquare . "\n";
print OUT "d.o.f = " .$dof."\n";
# chaeck if computed p value is lower than 0.05
if ($dof >= 1 && $dof <= 30)
{
# 2.2 there is a correlation between SNP allele and distinct mRNA - print result
$passingChiSquareValue = $ChiSquareValue{$dof};
print OUT "P.V = ".$passingChiSquareValue."\n";
if (($ChiSquare >= $passingChiSquareValue) && ($expTableHasValueLowerThanFive == 0))
{
print OUT "Corralation found: Yes\n";
}
else
{
print OUT "Corralation found: No\n";
}
}
print OUT "\n";
close(IN);
open(IN, "human_mrna_est_UCSC_filtered_by_SNP_with_allele_with_splicing_type") or die "Could not open human_mrna_est_UCSC_filtered_by_SNP_with_allele_with_splicing_type"; # UCSC table of alternative events - size: 91,223
for($k = 0; $k < 9; $k++)
{
$line = <IN>;
}
}
close(OUT);
close(IN);
#script #14 appendix P
#! /usr/bin/perl-w
#***********************- Description -*************************
# this script finds intersection of gene between the 8 tables
#***************************************************************
open(IN, "human_mrna_est_UCSC_filtered_by_SNP_with_allele_with_splicing_type") or die "Could not open human_mrna_est_UCSC_filtered_by_SNP_with_allele_with_splirelevancing_type"; # UCSC table of alternative events - size: 91,223
open(OUT, ">GenePerEachStatTest") or die "Could not open statTest4"; # output file
@TableArr;
$counter = 1;
%totalGeneHash = ();
$totalGeneCounter = 0;
%totalRepeatedGeneHash = ();
$totalRepeatedGeneCounter = 0;
# an arr that contains the tables that we will preforme the second statistic test on
$TableArr[0] = "cassetteExon\tg!t\tStartOfExon";
$TableArr[1] = "altThreePrime\tc!t\tEndOfExon";
$TableArr[2] = "cassetteExon\tc!t\tEndOfExon";
$TableArr[3] = "retainedIntron\ta!g\tEndOfExon";
$TableArr[4] = "altFinish\ta!c!g\tEndOfExon";
$TableArr[5] = "retainedIntron\tc!t\tEndOfExon";
$TableArr[6] = "bleedingExon\ta!g\tEndOfExon";
$TableArr[7] = "bleedingExon\tc!t\tEndOfExon";
$TableArrLen = @TableArr;
@GeneSizeArr; # this array will hold the amount of genes for each table that describs above
@GenesByTableArr; # the value is the genes that belongs to this TableArr
%HashByGeneId; # the key is the Gene Id and the value is the table that he is in
%HashForGeneData; # the key is the Gene Id and the value is the alternative splicing type, allel and end of beginninng of the exon
#takes line until the relevant line
for($i = 0; $i < 9; $i++)
{
$line = <IN>;
}
#a loop that pases on the wanted tabel that are contained in tha array TableArr above
for($i = 0; $i < $TableArrLen; $i++)
{
$GeneCounter = 0; #will count the genes in each of the tables
@TableArrData = split(/\t/,$TableArr[$i]); #holds the relevant information about the current test
$altTypeFromTableArr = $TableArrData[0];
$allels = $TableArrData[1];
@allelsArrFromTableArr = split(/!/,$allels); #thhis array holds all the allels that are relevant for the test by splitig the spcific part of the line
$numberOfAllelsInArr = @allelsArrFromTableArr;
$StartOrEndOfExonFromTableArr = $TableArrData[2];
# priting a titel
print OUT "table number ".$counter."\t";
$counter++;
print OUT $altTypeFromTableArr." ";
for( $c = 0; $c < $numberOfAllelsInArr; $c++)
{
print OUT $allelsArrFromTableArr[$c]." ";
}
print OUT $StartOrEndOfExonFromTableArr."\n\n";
#pessing on all the line in the input file
while(defined $line)
{
if($line =~ /^chr/)
{
@GeneArray = split(/\t/,$line); # splitting the line to the columns
$StartOrEndOfExon = $GeneArray[18];
$geneID = $GeneArray[4];
chomp($StartOrEndOfExon);
$GeneLine = $line;
$curGeneLine = "";
$line = <IN>;
while( $line =~ /^!/)
{
$line = <IN>;
}
}
while( $line =~ /^>/) #if the line represents a sequence (mRNA or EST)
{
$line = <IN>;
$allel = substr($line,1,1);
$line = <IN>;
$NumberOfAltTypesPerSeq = 0;
while( $line =~ /^#/) #taking all the alternative splicing types of a certain sequence
{
@LineArray = split(/\t/,$line); # splitting the line to the columns
$altType = $LineArray[4];
chomp($altType);
$AltTypeArr[$NumberOfAltTypesPerSeq] = $altType; # an arr that contains all the alternative splicing types
$NumberOfAltTypesPerSeq++;
$line = <IN>;
}
# checks if the location of the SNP in the exon is compatible to the location in the test information from the array above
if( $StartOrEndOfExon eq $StartOrEndOfExonFromTableArr)
{
#checks if the the allel appears in one of the relevant allels of the test
for($j = 0; $j < $numberOfAllelsInArr; $j++)
{
#checks if the the alternative splicing type appears in one of types of the sequence
for($b = 0; $b < $NumberOfAltTypesPerSeq; $b++) #checking if ther is a match for each type of splicing in the mRNA/EST
{
if($allel eq $allelsArrFromTableArr[$j])
{
# counter for the sum of the table for further calculation
$sumOfOBSInstances++;
if( $AltTypeArr[$b] eq $altTypeFromTableArr)
{
#checks if this gene line was alredy printed
if( $curGeneLine ne $GeneLine)
{
$curGeneLine = $GeneLine;
$GeneCounter++;
print OUT $GeneLine."\n";
if( defined $GenesByTableArr[$i])
{
$GenesByTableArr[$i] = $GenesByTableArr[$i]."\t".$geneID;
}
else
{
$GenesByTableArr[$i] = $geneID;
}
$flag = 0;
#here we save the information about the the tables that the gene appears in and the information about the gene it self in a different hash
if(defined $HashByGeneId{$geneID})
{
@GeneValue = split(/\s/,$HashByGeneId{$geneID}); # splitting the line to the columns
$Len = @GeneValue;
for($c = 0; $c < $Len; $c++)
{
if( $i == $GeneValue[$c])
{
$flag = 1;
}
}
if($flag == 0)
{
$temp = $i + 1;
$HashByGeneId{$geneID} = $HashByGeneId{$geneID}." ".$temp; #saves the information about the tables that the gene appears
}
# saves the relevant information about the gene
$HashForGeneData{$geneID} = $HashForGeneData{$geneID}."\t".$AltTypeArr[$b]." ";
for($d = 0; $d < @allelsArrFromTableArr; $d++)
{
$HashForGeneData{$geneID} = $HashForGeneData{$geneID}.$allelsArrFromTableArr[$d]." ";
}
$HashForGeneData{$geneID} = $HashForGeneData{$geneID}.$StartOrEndOfExonFromTableArr;
}
else
{
$temp = $i + 1;
$HashByGeneId{$geneID} = $temp; #saves the information about the tables that the gene appears
# saves the relevant information about the gene
$HashForGeneData{$geneID} = $AltTypeArr[$b]." ";
for($d = 0; $d < @allelsArrFromTableArr; $d++)
{
$HashForGeneData{$geneID} = $HashForGeneData{$geneID}.$allelsArrFromTableArr[$d]." ";
}
$HashForGeneData{$geneID} = $HashForGeneData{$geneID}.$StartOrEndOfExonFromTableArr;
}
}
}
}
}
}
}
}
$line = <IN>;
}
close(IN);
open(IN, "human_mrna_est_UCSC_filtered_by_SNP_with_allele_with_splicing_type") or die "Could not open human_mrna_est_UCSC_filtered_by_SNP_with_allele_with_splicing_type"; # UCSC table of alternative events - size: 91,223
for($k = 0; $k < 9; $k++)
{
$line = <IN>;
}
$GeneSizeArr[$i] = $GeneCounter;
print OUT "number of genes: ".$GeneCounter."\n";
}
# sum up the total amount of all genes with repetition
$sum = 0;
for($t = 0; $t < @GeneSizeArr; $t++)
{
$sum = $sum + $GeneSizeArr[$t];
}
print OUT "total sum of all genes with repetition: ".$sum."\n";
print OUT "Overlapping Genes\n";
print OUT "\nGeneId\tNumberOfTables\tTheTables\tTheAltType+Allels\n";
$counterOfOverLapGenes = 0;
#prints out the genes that appear in more then 1 table
foreach $gene ( keys %HashByGeneId)
{
@GeneValue = split(/\s/,$HashByGeneId{$gene}); # splitting the line to the columns
$numberOfTables = @GeneValue;
if($numberOfTables > 1)
{
$counterOfOverLapGenes++;
print OUT $gene."\t".$numberOfTables."\t".$HashByGeneId{$gene}."\t".$HashForGeneData{$gene}."\n";
}
}
#prints out the total anount of gene per test and total amout of overlapping genes per test
print OUT "the number of overlapp genes: ".$counterOfOverLapGenes."\n";
for( $i = 0; $i < $TableArrLen; $i++)
{
$counter = 0;
#checks how many overlapping genes there are in the spcific case
@GenesPerTableArr = split(/\t/,$GenesByTableArr[$i]);
$genesPerTableArrLen = @GenesPerTableArr;
for($j = 0; $j < $genesPerTableArrLen; $j++)
{
foreach $item (keys %HashByGeneId)
{
@GeneValue = split(/\s/,$HashByGeneId{$item}); # splitting the line to the columns
$numberOfTables = @GeneValue;
if($numberOfTables > 1)
{
if($GenesPerTableArr[$j] eq $item)
{
$counter++;
}
}
}
}
print OUT $TableArr[$i]."\nTotal gene number is: ".$GeneSizeArr[$i]." the number of genes that appears in more the 1 table is: ".$counter."\n";
}
close(OUT);
close(IN);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment