guineverelee/R_Hypermut_ver01.R

## R_Hypermut_ver01.R
#To calculate Fisher's p and look at the genome distribution of hypermutated sites in your amplicon, please input an aligned fasta with reference sequence at the top

#Step 00. Setup R
rm(list=ls())
closeAllConnections()
graphics.off()
cat("\014") #send CTRL+L to R, clean console
StartSysTime <- Sys.time()
MyWD <- getwd()
setwd(MyWD)
library(Biostrings) #PC@work Biostrings_2.38.1

#User Input:
MyFASTAFileName <- "20171105_Hypermut_n574.fasta_Raligned_GLtrimmed.fasta"
dir.create("Output")

#STEP1.  LOAD THE FASTA FILE and CONVERT into DF
MyFASTA <- readDNAStringSet(MyFASTAFileName)
MyOutput01_SampleID <- c()
MyOutput02_RawSeq <- c()
for (i in 1:length(MyFASTA)){
  MyOutput01_SampleID <- append(MyOutput01_SampleID,names(MyFASTA[i]))
  MyOutput02_RawSeq <- append(MyOutput02_RawSeq,as.character(MyFASTA[i]))
}

#STEP2.  Index REF for 3G and 3F sites
MyREF01_Length <- nchar(MyOutput02_RawSeq[1])
MyResult <- ""
for (j in 1:MyREF01_Length){
  #2a. Count and Output category per-base-position: C=Control, N=not-G, G=3G (GGD), F=3F (GAD)
  #Site1.  Is it G?
  if (substring(MyOutput02_RawSeq[1],j,j) == "G"){
    #Site2.  Is it -?  If so, check next position until it is not -.  Then see if it is G or A.
    k=1
    while(substring(MyOutput02_RawSeq[1],j+k,j+k) == "-"){k <- k+1}
    if (substring(MyOutput02_RawSeq[1],j+k,j+k) == "G" | substring(MyOutput02_RawSeq[1],j+k,j+k) == "A"){
      #Site3.  Is it -?  If so, check next position until it is not -.  Then see if it is !C.
      n=1
      while(substring(MyOutput02_RawSeq[1],j+k+n,j+k+n) == "-"){n <- n+1}
      if (substring(MyOutput02_RawSeq[1],j+k,j+k) == "G" & substring(MyOutput02_RawSeq[1],j+k+n,j+k+n) != "C"){
        MyResult <- paste(MyResult,"G",sep="")
      } else if (substring(MyOutput02_RawSeq[1],j+k,j+k) == "A" & substring(MyOutput02_RawSeq[1],j+k+n,j+k+n) != "C"){
        MyResult <- paste(MyResult,"F",sep="")
      } else {MyResult <- paste(MyResult,"C",sep="")}
    } else if (substring(MyOutput02_RawSeq[1],j+k,j+k) == "-"){
      MyResult <- paste(MyResult,"N",sep="")
    } else {MyResult <- paste(MyResult,"C",sep="")}
  } else {MyResult <- paste(MyResult,"N",sep="")}
}
write(MyResult,file="./Output/Output01_REFCategory.txt")
MyREFIndex_G <- unlist(gregexpr(pattern="G",MyResult))
MyREFIndex_F <- unlist(gregexpr(pattern="F",MyResult))
MyREFIndex_C <- unlist(gregexpr(pattern="C",MyResult))
MyREFIndex_N <- unlist(gregexpr(pattern="N",MyResult))

#STEP3.  COUNT NUMBERS of 3G and 3F ## PLus ## revert A>G in original
MyOutput03a1_APOBEC3G2A_count <- c()
MyOutput03b1_APOBEC3F2A_count <- c()
MyOutput03c1_APOBEC3FGsum2A_count <- c()
MyOutput03d1_Control_count <- c()
MyRevertedSEQ_A2G <- c()
MyQuery_GsumIndex_AllSamples <- c()
MyQuery_FsumIndex_AllSamples <- c()
MyQuery_GFsumIndex_AllSamples <- c()

for (m in 1:length(MyOutput02_RawSeq)){
  MyQuery_Gindex <- c()
  MyQuery_Findex <- c()
  #3a. APOBEC 3G Count
  g=0
  for (o in 1:length(MyREFIndex_G)){
    if(substring(MyOutput02_RawSeq[m],MyREFIndex_G[o],MyREFIndex_G[o]) == "A"){
      g <- g+1
      MyQuery_Gindex <- append(MyQuery_Gindex,MyREFIndex_G[o])
    }
  }
  MyOutput03a1_APOBEC3G2A_count <- append(MyOutput03a1_APOBEC3G2A_count,g)
  if (m!=1){
    MyQuery_GsumIndex_AllSamples <- append(MyQuery_GsumIndex_AllSamples,MyQuery_Gindex)
  }
  #3b. APOBEC 3F Count
  f=0
  for (p in 1:length(MyREFIndex_F)){
    if(substring(MyOutput02_RawSeq[m],MyREFIndex_F[p],MyREFIndex_F[p]) == "A"){
      f <- f+1
      MyQuery_Findex <- append(MyQuery_Findex,MyREFIndex_F[p])
    }
  }
  MyOutput03b1_APOBEC3F2A_count <- append(MyOutput03b1_APOBEC3F2A_count,f)
  MyOutput03c1_APOBEC3FGsum2A_count <- append(MyOutput03c1_APOBEC3FGsum2A_count,f+g)
  if (m!=1){
    MyQuery_FsumIndex_AllSamples <- append(MyQuery_FsumIndex_AllSamples,MyQuery_Findex)
  }
  #3c. Control Count
  c=0
  for (q in 1:length(MyREFIndex_C)){
    if(substring(MyOutput02_RawSeq[m],MyREFIndex_C[q],MyREFIndex_C[q]) == "A"){c <- c+1}
  }
  MyOutput03d1_Control_count <- append(MyOutput03d1_Control_count,c)
  #3d. Revert A2G in hypermuted positions
  if (length(append(MyQuery_Gindex,MyQuery_Findex))==0){
    MyRevertedSEQ_A2G <- append(MyRevertedSEQ_A2G,MyOutput02_RawSeq[m])
  } else {
    #REVERT
    MyCurrentSeq <- MyOutput02_RawSeq[m]
    MyQuery_GFsumIndex <- sort(append(MyQuery_Gindex,MyQuery_Findex))
    for (x in 1:length(MyQuery_GFsumIndex)){
      Start <- MyQuery_GFsumIndex[x]
      End <- MyQuery_GFsumIndex[x]
      substr(MyCurrentSeq,Start,End) <- "G"
    }
    MyRevertedSEQ_A2G <- append(MyRevertedSEQ_A2G,MyCurrentSeq)
    #3e. Genome Distribution append
    MyQuery_GFsumIndex_AllSamples <- append(MyQuery_GFsumIndex_AllSamples,MyQuery_GFsumIndex)
  }
}
MyFASTA_Fixed <- DNAStringSet(MyRevertedSEQ_A2G)
names(MyFASTA_Fixed) <- names(MyFASTA)
writeXStringSet(MyFASTA_Fixed,file="./Output/Output02_MyFASTA_Reverted.fasta")


#STEP3.1 Summary data: NOT
MyOutput03a2_APOBEC3G2A_NOTcount <- length(MyREFIndex_G)-MyOutput03a1_APOBEC3G2A_count
MyOutput03b2_APOBEC3F2A_NOTcount <- length(MyREFIndex_F)-MyOutput03b1_APOBEC3F2A_count
MyOutput03c2_APOBEC3FGsum2A_NOTcount <- length(MyREFIndex_G)+length(MyREFIndex_F)-MyOutput03c1_APOBEC3FGsum2A_count
MyOutput03d2_Control_NOTcount <- length(MyREFIndex_C)-MyOutput03d1_Control_count
#STEP3.2 Summary data: TOTAL
MyOutput03a3_APOBEC3G2A_TOTALcount <- rep(length(MyREFIndex_G),length(MyOutput01_SampleID))
MyOutput03b3_APOBEC3F2A_TOTALcount <- rep(length(MyREFIndex_F),length(MyOutput01_SampleID))
MyOutput03c3_APOBEC3FGsum2A_TOTALcount <- rep(length(MyREFIndex_G)+length(MyREFIndex_F),length(MyOutput01_SampleID))
MyOutput03d3_Control_TOTALcount <- rep(length(MyREFIndex_C),length(MyOutput01_SampleID))
#STEP3.3 Summary data:  PERCENT HypermutedG>A/TotalG
MyOutput03a4_APOBEC3G2A_PERC <- MyOutput03a1_APOBEC3G2A_count/length(MyREFIndex_G)*100
MyOutput03b4_APOBEC3F2A_PERC <- MyOutput03b1_APOBEC3F2A_count/length(MyREFIndex_F)*100
MyOutput03c4_APOBEC3FGsum2A_PERC <- MyOutput03c1_APOBEC3FGsum2A_count/(length(MyREFIndex_G)+length(MyREFIndex_F))*100
MyOutput03d4_Control_PERC <- MyOutput03d1_Control_count/length(MyREFIndex_C)*100
#STEP3.4 Summary data:  PERCENT Contribution G sites / total mutated sites
MyOutput03a5_APOBEC3G2A_PERC <- MyOutput03a1_APOBEC3G2A_count/(MyOutput03a1_APOBEC3G2A_count+MyOutput03b1_APOBEC3F2A_count)*100
MyOutput03b5_APOBEC3F2A_PERC <- MyOutput03b1_APOBEC3F2A_count/(MyOutput03a1_APOBEC3G2A_count+MyOutput03b1_APOBEC3F2A_count)*100


#STEP 4.  Reproduce WebHypermut 2.0 tool:  FISHER'S
MyFisherVector <- c()
for (m in 1:length(MyOutput02_RawSeq)){
  Q1 <- MyOutput03c1_APOBEC3FGsum2A_count[m]
  Q2 <- MyOutput03c2_APOBEC3FGsum2A_NOTcount[m]
  Q3 <- MyOutput03d1_Control_count[m]
  Q4 <- MyOutput03d2_Control_NOTcount[m]
  table <- rbind(c(Q1,Q2),c(Q3,Q4))
  MyFisher <- fisher.test(table,alternative="greater")
  MyFisherVector <- append(MyFisherVector,MyFisher$p.value)
}

#STEP 5.  Genome distribution
pdf("./Output/Output03_GenomeDistribHistogram.pdf")
par(mfrow=c(3,1)) #nrow,ncol
hist(sort(MyQuery_GsumIndex_AllSamples)
     ,main=""
     ,xlab="HIV Genome Coordinate - APOBEC 3G")
title("Distribution of Hypermutations in HIV Genome") #set main title here#
hist(sort(MyQuery_FsumIndex_AllSamples)
     ,main=""
     ,xlab="HIV Genome Coordinate - APOBEC 3F")
hist(sort(MyQuery_GFsumIndex_AllSamples)
     ,main=""
     ,xlab="HIV Genome Coordinate - APOBEC 3G + 3F")
dev.off()

#STEP 6.  Export DF
MyDF <- as.data.frame(cbind(
  MyOutput01_SampleID
  , MyOutput02_RawSeq
  , MyOutput03a1_APOBEC3G2A_count
  , MyOutput03a2_APOBEC3G2A_NOTcount
  , MyOutput03a3_APOBEC3G2A_TOTALcount
  , MyOutput03a4_APOBEC3G2A_PERC
  , MyOutput03a5_APOBEC3G2A_PERC
  , MyOutput03b1_APOBEC3F2A_count
  , MyOutput03b2_APOBEC3F2A_NOTcount
  , MyOutput03b3_APOBEC3F2A_TOTALcount
  , MyOutput03b4_APOBEC3F2A_PERC
  , MyOutput03b5_APOBEC3F2A_PERC
  , MyOutput03c1_APOBEC3FGsum2A_count
  , MyOutput03c2_APOBEC3FGsum2A_NOTcount
  , MyOutput03c3_APOBEC3FGsum2A_TOTALcount
  , MyOutput03c4_APOBEC3FGsum2A_PERC
  , MyOutput03d1_Control_count
  , MyOutput03d2_Control_NOTcount
  , MyOutput03d3_Control_TOTALcount
  , MyOutput03d4_Control_PERC
  , MyRevertedSEQ_A2G
  , MyFisherVector
))
write.csv(MyDF,file="./Output/Output04_MyDF_Summary.csv")
	#To calculate Fisher's p and look at the genome distribution of hypermutated sites in your amplicon, please input an aligned fasta with reference sequence at the top

	#Step 00. Setup R
	rm(list=ls())
	closeAllConnections()
	graphics.off()
	cat("\014") #send CTRL+L to R, clean console
	StartSysTime <- Sys.time()
	MyWD <- getwd()
	setwd(MyWD)
	library(Biostrings) #PC@work Biostrings_2.38.1

	#User Input:
	MyFASTAFileName <- "20171105_Hypermut_n574.fasta_Raligned_GLtrimmed.fasta"
	dir.create("Output")

	#STEP1. LOAD THE FASTA FILE and CONVERT into DF
	MyFASTA <- readDNAStringSet(MyFASTAFileName)
	MyOutput01_SampleID <- c()
	MyOutput02_RawSeq <- c()
	for (i in 1:length(MyFASTA)){
	MyOutput01_SampleID <- append(MyOutput01_SampleID,names(MyFASTA[i]))
	MyOutput02_RawSeq <- append(MyOutput02_RawSeq,as.character(MyFASTA[i]))
	}

	#STEP2. Index REF for 3G and 3F sites
	MyREF01_Length <- nchar(MyOutput02_RawSeq[1])
	MyResult <- ""
	for (j in 1:MyREF01_Length){
	#2a. Count and Output category per-base-position: C=Control, N=not-G, G=3G (GGD), F=3F (GAD)
	#Site1. Is it G?
	if (substring(MyOutput02_RawSeq[1],j,j) == "G"){
	#Site2. Is it -? If so, check next position until it is not -. Then see if it is G or A.
	k=1
	while(substring(MyOutput02_RawSeq[1],j+k,j+k) == "-"){k <- k+1}
	if (substring(MyOutput02_RawSeq[1],j+k,j+k) == "G" \| substring(MyOutput02_RawSeq[1],j+k,j+k) == "A"){
	#Site3. Is it -? If so, check next position until it is not -. Then see if it is !C.
	n=1
	while(substring(MyOutput02_RawSeq[1],j+k+n,j+k+n) == "-"){n <- n+1}
	if (substring(MyOutput02_RawSeq[1],j+k,j+k) == "G" & substring(MyOutput02_RawSeq[1],j+k+n,j+k+n) != "C"){
	MyResult <- paste(MyResult,"G",sep="")
	} else if (substring(MyOutput02_RawSeq[1],j+k,j+k) == "A" & substring(MyOutput02_RawSeq[1],j+k+n,j+k+n) != "C"){
	MyResult <- paste(MyResult,"F",sep="")
	} else {MyResult <- paste(MyResult,"C",sep="")}
	} else if (substring(MyOutput02_RawSeq[1],j+k,j+k) == "-"){
	MyResult <- paste(MyResult,"N",sep="")
	} else {MyResult <- paste(MyResult,"C",sep="")}
	} else {MyResult <- paste(MyResult,"N",sep="")}
	}
	write(MyResult,file="./Output/Output01_REFCategory.txt")
	MyREFIndex_G <- unlist(gregexpr(pattern="G",MyResult))
	MyREFIndex_F <- unlist(gregexpr(pattern="F",MyResult))
	MyREFIndex_C <- unlist(gregexpr(pattern="C",MyResult))
	MyREFIndex_N <- unlist(gregexpr(pattern="N",MyResult))

	#STEP3. COUNT NUMBERS of 3G and 3F ## PLus ## revert A>G in original
	MyOutput03a1_APOBEC3G2A_count <- c()
	MyOutput03b1_APOBEC3F2A_count <- c()
	MyOutput03c1_APOBEC3FGsum2A_count <- c()
	MyOutput03d1_Control_count <- c()
	MyRevertedSEQ_A2G <- c()
	MyQuery_GsumIndex_AllSamples <- c()
	MyQuery_FsumIndex_AllSamples <- c()
	MyQuery_GFsumIndex_AllSamples <- c()

	for (m in 1:length(MyOutput02_RawSeq)){
	MyQuery_Gindex <- c()
	MyQuery_Findex <- c()
	#3a. APOBEC 3G Count
	g=0
	for (o in 1:length(MyREFIndex_G)){
	if(substring(MyOutput02_RawSeq[m],MyREFIndex_G[o],MyREFIndex_G[o]) == "A"){
	g <- g+1
	MyQuery_Gindex <- append(MyQuery_Gindex,MyREFIndex_G[o])
	}
	}
	MyOutput03a1_APOBEC3G2A_count <- append(MyOutput03a1_APOBEC3G2A_count,g)
	if (m!=1){
	MyQuery_GsumIndex_AllSamples <- append(MyQuery_GsumIndex_AllSamples,MyQuery_Gindex)
	}
	#3b. APOBEC 3F Count
	f=0
	for (p in 1:length(MyREFIndex_F)){
	if(substring(MyOutput02_RawSeq[m],MyREFIndex_F[p],MyREFIndex_F[p]) == "A"){
	f <- f+1
	MyQuery_Findex <- append(MyQuery_Findex,MyREFIndex_F[p])
	}
	}
	MyOutput03b1_APOBEC3F2A_count <- append(MyOutput03b1_APOBEC3F2A_count,f)
	MyOutput03c1_APOBEC3FGsum2A_count <- append(MyOutput03c1_APOBEC3FGsum2A_count,f+g)
	if (m!=1){
	MyQuery_FsumIndex_AllSamples <- append(MyQuery_FsumIndex_AllSamples,MyQuery_Findex)
	}
	#3c. Control Count
	c=0
	for (q in 1:length(MyREFIndex_C)){
	if(substring(MyOutput02_RawSeq[m],MyREFIndex_C[q],MyREFIndex_C[q]) == "A"){c <- c+1}
	}
	MyOutput03d1_Control_count <- append(MyOutput03d1_Control_count,c)
	#3d. Revert A2G in hypermuted positions
	if (length(append(MyQuery_Gindex,MyQuery_Findex))==0){
	MyRevertedSEQ_A2G <- append(MyRevertedSEQ_A2G,MyOutput02_RawSeq[m])
	} else {
	#REVERT
	MyCurrentSeq <- MyOutput02_RawSeq[m]
	MyQuery_GFsumIndex <- sort(append(MyQuery_Gindex,MyQuery_Findex))
	for (x in 1:length(MyQuery_GFsumIndex)){
	Start <- MyQuery_GFsumIndex[x]
	End <- MyQuery_GFsumIndex[x]
	substr(MyCurrentSeq,Start,End) <- "G"
	}
	MyRevertedSEQ_A2G <- append(MyRevertedSEQ_A2G,MyCurrentSeq)
	#3e. Genome Distribution append
	MyQuery_GFsumIndex_AllSamples <- append(MyQuery_GFsumIndex_AllSamples,MyQuery_GFsumIndex)
	}
	}
	MyFASTA_Fixed <- DNAStringSet(MyRevertedSEQ_A2G)
	names(MyFASTA_Fixed) <- names(MyFASTA)
	writeXStringSet(MyFASTA_Fixed,file="./Output/Output02_MyFASTA_Reverted.fasta")


	#STEP3.1 Summary data: NOT
	MyOutput03a2_APOBEC3G2A_NOTcount <- length(MyREFIndex_G)-MyOutput03a1_APOBEC3G2A_count
	MyOutput03b2_APOBEC3F2A_NOTcount <- length(MyREFIndex_F)-MyOutput03b1_APOBEC3F2A_count
	MyOutput03c2_APOBEC3FGsum2A_NOTcount <- length(MyREFIndex_G)+length(MyREFIndex_F)-MyOutput03c1_APOBEC3FGsum2A_count
	MyOutput03d2_Control_NOTcount <- length(MyREFIndex_C)-MyOutput03d1_Control_count
	#STEP3.2 Summary data: TOTAL
	MyOutput03a3_APOBEC3G2A_TOTALcount <- rep(length(MyREFIndex_G),length(MyOutput01_SampleID))
	MyOutput03b3_APOBEC3F2A_TOTALcount <- rep(length(MyREFIndex_F),length(MyOutput01_SampleID))
	MyOutput03c3_APOBEC3FGsum2A_TOTALcount <- rep(length(MyREFIndex_G)+length(MyREFIndex_F),length(MyOutput01_SampleID))
	MyOutput03d3_Control_TOTALcount <- rep(length(MyREFIndex_C),length(MyOutput01_SampleID))
	#STEP3.3 Summary data: PERCENT HypermutedG>A/TotalG
	MyOutput03a4_APOBEC3G2A_PERC <- MyOutput03a1_APOBEC3G2A_count/length(MyREFIndex_G)*100
	MyOutput03b4_APOBEC3F2A_PERC <- MyOutput03b1_APOBEC3F2A_count/length(MyREFIndex_F)*100
	MyOutput03c4_APOBEC3FGsum2A_PERC <- MyOutput03c1_APOBEC3FGsum2A_count/(length(MyREFIndex_G)+length(MyREFIndex_F))*100
	MyOutput03d4_Control_PERC <- MyOutput03d1_Control_count/length(MyREFIndex_C)*100
	#STEP3.4 Summary data: PERCENT Contribution G sites / total mutated sites
	MyOutput03a5_APOBEC3G2A_PERC <- MyOutput03a1_APOBEC3G2A_count/(MyOutput03a1_APOBEC3G2A_count+MyOutput03b1_APOBEC3F2A_count)*100
	MyOutput03b5_APOBEC3F2A_PERC <- MyOutput03b1_APOBEC3F2A_count/(MyOutput03a1_APOBEC3G2A_count+MyOutput03b1_APOBEC3F2A_count)*100


	#STEP 4. Reproduce WebHypermut 2.0 tool: FISHER'S
	MyFisherVector <- c()
	for (m in 1:length(MyOutput02_RawSeq)){
	Q1 <- MyOutput03c1_APOBEC3FGsum2A_count[m]
	Q2 <- MyOutput03c2_APOBEC3FGsum2A_NOTcount[m]
	Q3 <- MyOutput03d1_Control_count[m]
	Q4 <- MyOutput03d2_Control_NOTcount[m]
	table <- rbind(c(Q1,Q2),c(Q3,Q4))
	MyFisher <- fisher.test(table,alternative="greater")
	MyFisherVector <- append(MyFisherVector,MyFisher$p.value)
	}

	#STEP 5. Genome distribution
	pdf("./Output/Output03_GenomeDistribHistogram.pdf")
	par(mfrow=c(3,1)) #nrow,ncol
	hist(sort(MyQuery_GsumIndex_AllSamples)
	,main=""
	,xlab="HIV Genome Coordinate - APOBEC 3G")
	title("Distribution of Hypermutations in HIV Genome") #set main title here#
	hist(sort(MyQuery_FsumIndex_AllSamples)
	,main=""
	,xlab="HIV Genome Coordinate - APOBEC 3F")
	hist(sort(MyQuery_GFsumIndex_AllSamples)
	,main=""
	,xlab="HIV Genome Coordinate - APOBEC 3G + 3F")
	dev.off()

	#STEP 6. Export DF
	MyDF <- as.data.frame(cbind(
	MyOutput01_SampleID
	, MyOutput02_RawSeq
	, MyOutput03a1_APOBEC3G2A_count
	, MyOutput03a2_APOBEC3G2A_NOTcount
	, MyOutput03a3_APOBEC3G2A_TOTALcount
	, MyOutput03a4_APOBEC3G2A_PERC
	, MyOutput03a5_APOBEC3G2A_PERC
	, MyOutput03b1_APOBEC3F2A_count
	, MyOutput03b2_APOBEC3F2A_NOTcount
	, MyOutput03b3_APOBEC3F2A_TOTALcount
	, MyOutput03b4_APOBEC3F2A_PERC
	, MyOutput03b5_APOBEC3F2A_PERC
	, MyOutput03c1_APOBEC3FGsum2A_count
	, MyOutput03c2_APOBEC3FGsum2A_NOTcount
	, MyOutput03c3_APOBEC3FGsum2A_TOTALcount
	, MyOutput03c4_APOBEC3FGsum2A_PERC
	, MyOutput03d1_Control_count
	, MyOutput03d2_Control_NOTcount
	, MyOutput03d3_Control_TOTALcount
	, MyOutput03d4_Control_PERC
	, MyRevertedSEQ_A2G
	, MyFisherVector
	))
	write.csv(MyDF,file="./Output/Output04_MyDF_Summary.csv")