Created
May 3, 2018 21:05
-
-
Save guineverelee/4efe7d457d6a50bb42887ae5218fb185 to your computer and use it in GitHub Desktop.
Hypermut Plus: Statistical and graphic tool for the impact of APOBEC-3G/F on HIV genomes. 2018
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#To calculate Fisher's p and look at the genome distribution of hypermutated sites in your amplicon, please input an aligned fasta with reference sequence at the top | |
#Step 00. Setup R | |
rm(list=ls()) | |
closeAllConnections() | |
graphics.off() | |
cat("\014") #send CTRL+L to R, clean console | |
StartSysTime <- Sys.time() | |
MyWD <- getwd() | |
setwd(MyWD) | |
library(Biostrings) #PC@work Biostrings_2.38.1 | |
#User Input: | |
MyFASTAFileName <- "20171105_Hypermut_n574.fasta_Raligned_GLtrimmed.fasta" | |
dir.create("Output") | |
#STEP1. LOAD THE FASTA FILE and CONVERT into DF | |
MyFASTA <- readDNAStringSet(MyFASTAFileName) | |
MyOutput01_SampleID <- c() | |
MyOutput02_RawSeq <- c() | |
for (i in 1:length(MyFASTA)){ | |
MyOutput01_SampleID <- append(MyOutput01_SampleID,names(MyFASTA[i])) | |
MyOutput02_RawSeq <- append(MyOutput02_RawSeq,as.character(MyFASTA[i])) | |
} | |
#STEP2. Index REF for 3G and 3F sites | |
MyREF01_Length <- nchar(MyOutput02_RawSeq[1]) | |
MyResult <- "" | |
for (j in 1:MyREF01_Length){ | |
#2a. Count and Output category per-base-position: C=Control, N=not-G, G=3G (GGD), F=3F (GAD) | |
#Site1. Is it G? | |
if (substring(MyOutput02_RawSeq[1],j,j) == "G"){ | |
#Site2. Is it -? If so, check next position until it is not -. Then see if it is G or A. | |
k=1 | |
while(substring(MyOutput02_RawSeq[1],j+k,j+k) == "-"){k <- k+1} | |
if (substring(MyOutput02_RawSeq[1],j+k,j+k) == "G" | substring(MyOutput02_RawSeq[1],j+k,j+k) == "A"){ | |
#Site3. Is it -? If so, check next position until it is not -. Then see if it is !C. | |
n=1 | |
while(substring(MyOutput02_RawSeq[1],j+k+n,j+k+n) == "-"){n <- n+1} | |
if (substring(MyOutput02_RawSeq[1],j+k,j+k) == "G" & substring(MyOutput02_RawSeq[1],j+k+n,j+k+n) != "C"){ | |
MyResult <- paste(MyResult,"G",sep="") | |
} else if (substring(MyOutput02_RawSeq[1],j+k,j+k) == "A" & substring(MyOutput02_RawSeq[1],j+k+n,j+k+n) != "C"){ | |
MyResult <- paste(MyResult,"F",sep="") | |
} else {MyResult <- paste(MyResult,"C",sep="")} | |
} else if (substring(MyOutput02_RawSeq[1],j+k,j+k) == "-"){ | |
MyResult <- paste(MyResult,"N",sep="") | |
} else {MyResult <- paste(MyResult,"C",sep="")} | |
} else {MyResult <- paste(MyResult,"N",sep="")} | |
} | |
write(MyResult,file="./Output/Output01_REFCategory.txt") | |
MyREFIndex_G <- unlist(gregexpr(pattern="G",MyResult)) | |
MyREFIndex_F <- unlist(gregexpr(pattern="F",MyResult)) | |
MyREFIndex_C <- unlist(gregexpr(pattern="C",MyResult)) | |
MyREFIndex_N <- unlist(gregexpr(pattern="N",MyResult)) | |
#STEP3. COUNT NUMBERS of 3G and 3F ## PLus ## revert A>G in original | |
MyOutput03a1_APOBEC3G2A_count <- c() | |
MyOutput03b1_APOBEC3F2A_count <- c() | |
MyOutput03c1_APOBEC3FGsum2A_count <- c() | |
MyOutput03d1_Control_count <- c() | |
MyRevertedSEQ_A2G <- c() | |
MyQuery_GsumIndex_AllSamples <- c() | |
MyQuery_FsumIndex_AllSamples <- c() | |
MyQuery_GFsumIndex_AllSamples <- c() | |
for (m in 1:length(MyOutput02_RawSeq)){ | |
MyQuery_Gindex <- c() | |
MyQuery_Findex <- c() | |
#3a. APOBEC 3G Count | |
g=0 | |
for (o in 1:length(MyREFIndex_G)){ | |
if(substring(MyOutput02_RawSeq[m],MyREFIndex_G[o],MyREFIndex_G[o]) == "A"){ | |
g <- g+1 | |
MyQuery_Gindex <- append(MyQuery_Gindex,MyREFIndex_G[o]) | |
} | |
} | |
MyOutput03a1_APOBEC3G2A_count <- append(MyOutput03a1_APOBEC3G2A_count,g) | |
if (m!=1){ | |
MyQuery_GsumIndex_AllSamples <- append(MyQuery_GsumIndex_AllSamples,MyQuery_Gindex) | |
} | |
#3b. APOBEC 3F Count | |
f=0 | |
for (p in 1:length(MyREFIndex_F)){ | |
if(substring(MyOutput02_RawSeq[m],MyREFIndex_F[p],MyREFIndex_F[p]) == "A"){ | |
f <- f+1 | |
MyQuery_Findex <- append(MyQuery_Findex,MyREFIndex_F[p]) | |
} | |
} | |
MyOutput03b1_APOBEC3F2A_count <- append(MyOutput03b1_APOBEC3F2A_count,f) | |
MyOutput03c1_APOBEC3FGsum2A_count <- append(MyOutput03c1_APOBEC3FGsum2A_count,f+g) | |
if (m!=1){ | |
MyQuery_FsumIndex_AllSamples <- append(MyQuery_FsumIndex_AllSamples,MyQuery_Findex) | |
} | |
#3c. Control Count | |
c=0 | |
for (q in 1:length(MyREFIndex_C)){ | |
if(substring(MyOutput02_RawSeq[m],MyREFIndex_C[q],MyREFIndex_C[q]) == "A"){c <- c+1} | |
} | |
MyOutput03d1_Control_count <- append(MyOutput03d1_Control_count,c) | |
#3d. Revert A2G in hypermuted positions | |
if (length(append(MyQuery_Gindex,MyQuery_Findex))==0){ | |
MyRevertedSEQ_A2G <- append(MyRevertedSEQ_A2G,MyOutput02_RawSeq[m]) | |
} else { | |
#REVERT | |
MyCurrentSeq <- MyOutput02_RawSeq[m] | |
MyQuery_GFsumIndex <- sort(append(MyQuery_Gindex,MyQuery_Findex)) | |
for (x in 1:length(MyQuery_GFsumIndex)){ | |
Start <- MyQuery_GFsumIndex[x] | |
End <- MyQuery_GFsumIndex[x] | |
substr(MyCurrentSeq,Start,End) <- "G" | |
} | |
MyRevertedSEQ_A2G <- append(MyRevertedSEQ_A2G,MyCurrentSeq) | |
#3e. Genome Distribution append | |
MyQuery_GFsumIndex_AllSamples <- append(MyQuery_GFsumIndex_AllSamples,MyQuery_GFsumIndex) | |
} | |
} | |
MyFASTA_Fixed <- DNAStringSet(MyRevertedSEQ_A2G) | |
names(MyFASTA_Fixed) <- names(MyFASTA) | |
writeXStringSet(MyFASTA_Fixed,file="./Output/Output02_MyFASTA_Reverted.fasta") | |
#STEP3.1 Summary data: NOT | |
MyOutput03a2_APOBEC3G2A_NOTcount <- length(MyREFIndex_G)-MyOutput03a1_APOBEC3G2A_count | |
MyOutput03b2_APOBEC3F2A_NOTcount <- length(MyREFIndex_F)-MyOutput03b1_APOBEC3F2A_count | |
MyOutput03c2_APOBEC3FGsum2A_NOTcount <- length(MyREFIndex_G)+length(MyREFIndex_F)-MyOutput03c1_APOBEC3FGsum2A_count | |
MyOutput03d2_Control_NOTcount <- length(MyREFIndex_C)-MyOutput03d1_Control_count | |
#STEP3.2 Summary data: TOTAL | |
MyOutput03a3_APOBEC3G2A_TOTALcount <- rep(length(MyREFIndex_G),length(MyOutput01_SampleID)) | |
MyOutput03b3_APOBEC3F2A_TOTALcount <- rep(length(MyREFIndex_F),length(MyOutput01_SampleID)) | |
MyOutput03c3_APOBEC3FGsum2A_TOTALcount <- rep(length(MyREFIndex_G)+length(MyREFIndex_F),length(MyOutput01_SampleID)) | |
MyOutput03d3_Control_TOTALcount <- rep(length(MyREFIndex_C),length(MyOutput01_SampleID)) | |
#STEP3.3 Summary data: PERCENT HypermutedG>A/TotalG | |
MyOutput03a4_APOBEC3G2A_PERC <- MyOutput03a1_APOBEC3G2A_count/length(MyREFIndex_G)*100 | |
MyOutput03b4_APOBEC3F2A_PERC <- MyOutput03b1_APOBEC3F2A_count/length(MyREFIndex_F)*100 | |
MyOutput03c4_APOBEC3FGsum2A_PERC <- MyOutput03c1_APOBEC3FGsum2A_count/(length(MyREFIndex_G)+length(MyREFIndex_F))*100 | |
MyOutput03d4_Control_PERC <- MyOutput03d1_Control_count/length(MyREFIndex_C)*100 | |
#STEP3.4 Summary data: PERCENT Contribution G sites / total mutated sites | |
MyOutput03a5_APOBEC3G2A_PERC <- MyOutput03a1_APOBEC3G2A_count/(MyOutput03a1_APOBEC3G2A_count+MyOutput03b1_APOBEC3F2A_count)*100 | |
MyOutput03b5_APOBEC3F2A_PERC <- MyOutput03b1_APOBEC3F2A_count/(MyOutput03a1_APOBEC3G2A_count+MyOutput03b1_APOBEC3F2A_count)*100 | |
#STEP 4. Reproduce WebHypermut 2.0 tool: FISHER'S | |
MyFisherVector <- c() | |
for (m in 1:length(MyOutput02_RawSeq)){ | |
Q1 <- MyOutput03c1_APOBEC3FGsum2A_count[m] | |
Q2 <- MyOutput03c2_APOBEC3FGsum2A_NOTcount[m] | |
Q3 <- MyOutput03d1_Control_count[m] | |
Q4 <- MyOutput03d2_Control_NOTcount[m] | |
table <- rbind(c(Q1,Q2),c(Q3,Q4)) | |
MyFisher <- fisher.test(table,alternative="greater") | |
MyFisherVector <- append(MyFisherVector,MyFisher$p.value) | |
} | |
#STEP 5. Genome distribution | |
pdf("./Output/Output03_GenomeDistribHistogram.pdf") | |
par(mfrow=c(3,1)) #nrow,ncol | |
hist(sort(MyQuery_GsumIndex_AllSamples) | |
,main="" | |
,xlab="HIV Genome Coordinate - APOBEC 3G") | |
title("Distribution of Hypermutations in HIV Genome") #set main title here# | |
hist(sort(MyQuery_FsumIndex_AllSamples) | |
,main="" | |
,xlab="HIV Genome Coordinate - APOBEC 3F") | |
hist(sort(MyQuery_GFsumIndex_AllSamples) | |
,main="" | |
,xlab="HIV Genome Coordinate - APOBEC 3G + 3F") | |
dev.off() | |
#STEP 6. Export DF | |
MyDF <- as.data.frame(cbind( | |
MyOutput01_SampleID | |
, MyOutput02_RawSeq | |
, MyOutput03a1_APOBEC3G2A_count | |
, MyOutput03a2_APOBEC3G2A_NOTcount | |
, MyOutput03a3_APOBEC3G2A_TOTALcount | |
, MyOutput03a4_APOBEC3G2A_PERC | |
, MyOutput03a5_APOBEC3G2A_PERC | |
, MyOutput03b1_APOBEC3F2A_count | |
, MyOutput03b2_APOBEC3F2A_NOTcount | |
, MyOutput03b3_APOBEC3F2A_TOTALcount | |
, MyOutput03b4_APOBEC3F2A_PERC | |
, MyOutput03b5_APOBEC3F2A_PERC | |
, MyOutput03c1_APOBEC3FGsum2A_count | |
, MyOutput03c2_APOBEC3FGsum2A_NOTcount | |
, MyOutput03c3_APOBEC3FGsum2A_TOTALcount | |
, MyOutput03c4_APOBEC3FGsum2A_PERC | |
, MyOutput03d1_Control_count | |
, MyOutput03d2_Control_NOTcount | |
, MyOutput03d3_Control_TOTALcount | |
, MyOutput03d4_Control_PERC | |
, MyRevertedSEQ_A2G | |
, MyFisherVector | |
)) | |
write.csv(MyDF,file="./Output/Output04_MyDF_Summary.csv") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment