Skip to content

Instantly share code, notes, and snippets.

@guineverelee
Created May 3, 2018 21:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save guineverelee/4efe7d457d6a50bb42887ae5218fb185 to your computer and use it in GitHub Desktop.
Save guineverelee/4efe7d457d6a50bb42887ae5218fb185 to your computer and use it in GitHub Desktop.
Hypermut Plus: Statistical and graphic tool for the impact of APOBEC-3G/F on HIV genomes. 2018
#To calculate Fisher's p and look at the genome distribution of hypermutated sites in your amplicon, please input an aligned fasta with reference sequence at the top
#Step 00. Setup R
rm(list=ls())
closeAllConnections()
graphics.off()
cat("\014") #send CTRL+L to R, clean console
StartSysTime <- Sys.time()
MyWD <- getwd()
setwd(MyWD)
library(Biostrings) #PC@work Biostrings_2.38.1
#User Input:
MyFASTAFileName <- "20171105_Hypermut_n574.fasta_Raligned_GLtrimmed.fasta"
dir.create("Output")
#STEP1. LOAD THE FASTA FILE and CONVERT into DF
MyFASTA <- readDNAStringSet(MyFASTAFileName)
MyOutput01_SampleID <- c()
MyOutput02_RawSeq <- c()
for (i in 1:length(MyFASTA)){
MyOutput01_SampleID <- append(MyOutput01_SampleID,names(MyFASTA[i]))
MyOutput02_RawSeq <- append(MyOutput02_RawSeq,as.character(MyFASTA[i]))
}
#STEP2. Index REF for 3G and 3F sites
MyREF01_Length <- nchar(MyOutput02_RawSeq[1])
MyResult <- ""
for (j in 1:MyREF01_Length){
#2a. Count and Output category per-base-position: C=Control, N=not-G, G=3G (GGD), F=3F (GAD)
#Site1. Is it G?
if (substring(MyOutput02_RawSeq[1],j,j) == "G"){
#Site2. Is it -? If so, check next position until it is not -. Then see if it is G or A.
k=1
while(substring(MyOutput02_RawSeq[1],j+k,j+k) == "-"){k <- k+1}
if (substring(MyOutput02_RawSeq[1],j+k,j+k) == "G" | substring(MyOutput02_RawSeq[1],j+k,j+k) == "A"){
#Site3. Is it -? If so, check next position until it is not -. Then see if it is !C.
n=1
while(substring(MyOutput02_RawSeq[1],j+k+n,j+k+n) == "-"){n <- n+1}
if (substring(MyOutput02_RawSeq[1],j+k,j+k) == "G" & substring(MyOutput02_RawSeq[1],j+k+n,j+k+n) != "C"){
MyResult <- paste(MyResult,"G",sep="")
} else if (substring(MyOutput02_RawSeq[1],j+k,j+k) == "A" & substring(MyOutput02_RawSeq[1],j+k+n,j+k+n) != "C"){
MyResult <- paste(MyResult,"F",sep="")
} else {MyResult <- paste(MyResult,"C",sep="")}
} else if (substring(MyOutput02_RawSeq[1],j+k,j+k) == "-"){
MyResult <- paste(MyResult,"N",sep="")
} else {MyResult <- paste(MyResult,"C",sep="")}
} else {MyResult <- paste(MyResult,"N",sep="")}
}
write(MyResult,file="./Output/Output01_REFCategory.txt")
MyREFIndex_G <- unlist(gregexpr(pattern="G",MyResult))
MyREFIndex_F <- unlist(gregexpr(pattern="F",MyResult))
MyREFIndex_C <- unlist(gregexpr(pattern="C",MyResult))
MyREFIndex_N <- unlist(gregexpr(pattern="N",MyResult))
#STEP3. COUNT NUMBERS of 3G and 3F ## PLus ## revert A>G in original
MyOutput03a1_APOBEC3G2A_count <- c()
MyOutput03b1_APOBEC3F2A_count <- c()
MyOutput03c1_APOBEC3FGsum2A_count <- c()
MyOutput03d1_Control_count <- c()
MyRevertedSEQ_A2G <- c()
MyQuery_GsumIndex_AllSamples <- c()
MyQuery_FsumIndex_AllSamples <- c()
MyQuery_GFsumIndex_AllSamples <- c()
for (m in 1:length(MyOutput02_RawSeq)){
MyQuery_Gindex <- c()
MyQuery_Findex <- c()
#3a. APOBEC 3G Count
g=0
for (o in 1:length(MyREFIndex_G)){
if(substring(MyOutput02_RawSeq[m],MyREFIndex_G[o],MyREFIndex_G[o]) == "A"){
g <- g+1
MyQuery_Gindex <- append(MyQuery_Gindex,MyREFIndex_G[o])
}
}
MyOutput03a1_APOBEC3G2A_count <- append(MyOutput03a1_APOBEC3G2A_count,g)
if (m!=1){
MyQuery_GsumIndex_AllSamples <- append(MyQuery_GsumIndex_AllSamples,MyQuery_Gindex)
}
#3b. APOBEC 3F Count
f=0
for (p in 1:length(MyREFIndex_F)){
if(substring(MyOutput02_RawSeq[m],MyREFIndex_F[p],MyREFIndex_F[p]) == "A"){
f <- f+1
MyQuery_Findex <- append(MyQuery_Findex,MyREFIndex_F[p])
}
}
MyOutput03b1_APOBEC3F2A_count <- append(MyOutput03b1_APOBEC3F2A_count,f)
MyOutput03c1_APOBEC3FGsum2A_count <- append(MyOutput03c1_APOBEC3FGsum2A_count,f+g)
if (m!=1){
MyQuery_FsumIndex_AllSamples <- append(MyQuery_FsumIndex_AllSamples,MyQuery_Findex)
}
#3c. Control Count
c=0
for (q in 1:length(MyREFIndex_C)){
if(substring(MyOutput02_RawSeq[m],MyREFIndex_C[q],MyREFIndex_C[q]) == "A"){c <- c+1}
}
MyOutput03d1_Control_count <- append(MyOutput03d1_Control_count,c)
#3d. Revert A2G in hypermuted positions
if (length(append(MyQuery_Gindex,MyQuery_Findex))==0){
MyRevertedSEQ_A2G <- append(MyRevertedSEQ_A2G,MyOutput02_RawSeq[m])
} else {
#REVERT
MyCurrentSeq <- MyOutput02_RawSeq[m]
MyQuery_GFsumIndex <- sort(append(MyQuery_Gindex,MyQuery_Findex))
for (x in 1:length(MyQuery_GFsumIndex)){
Start <- MyQuery_GFsumIndex[x]
End <- MyQuery_GFsumIndex[x]
substr(MyCurrentSeq,Start,End) <- "G"
}
MyRevertedSEQ_A2G <- append(MyRevertedSEQ_A2G,MyCurrentSeq)
#3e. Genome Distribution append
MyQuery_GFsumIndex_AllSamples <- append(MyQuery_GFsumIndex_AllSamples,MyQuery_GFsumIndex)
}
}
MyFASTA_Fixed <- DNAStringSet(MyRevertedSEQ_A2G)
names(MyFASTA_Fixed) <- names(MyFASTA)
writeXStringSet(MyFASTA_Fixed,file="./Output/Output02_MyFASTA_Reverted.fasta")
#STEP3.1 Summary data: NOT
MyOutput03a2_APOBEC3G2A_NOTcount <- length(MyREFIndex_G)-MyOutput03a1_APOBEC3G2A_count
MyOutput03b2_APOBEC3F2A_NOTcount <- length(MyREFIndex_F)-MyOutput03b1_APOBEC3F2A_count
MyOutput03c2_APOBEC3FGsum2A_NOTcount <- length(MyREFIndex_G)+length(MyREFIndex_F)-MyOutput03c1_APOBEC3FGsum2A_count
MyOutput03d2_Control_NOTcount <- length(MyREFIndex_C)-MyOutput03d1_Control_count
#STEP3.2 Summary data: TOTAL
MyOutput03a3_APOBEC3G2A_TOTALcount <- rep(length(MyREFIndex_G),length(MyOutput01_SampleID))
MyOutput03b3_APOBEC3F2A_TOTALcount <- rep(length(MyREFIndex_F),length(MyOutput01_SampleID))
MyOutput03c3_APOBEC3FGsum2A_TOTALcount <- rep(length(MyREFIndex_G)+length(MyREFIndex_F),length(MyOutput01_SampleID))
MyOutput03d3_Control_TOTALcount <- rep(length(MyREFIndex_C),length(MyOutput01_SampleID))
#STEP3.3 Summary data: PERCENT HypermutedG>A/TotalG
MyOutput03a4_APOBEC3G2A_PERC <- MyOutput03a1_APOBEC3G2A_count/length(MyREFIndex_G)*100
MyOutput03b4_APOBEC3F2A_PERC <- MyOutput03b1_APOBEC3F2A_count/length(MyREFIndex_F)*100
MyOutput03c4_APOBEC3FGsum2A_PERC <- MyOutput03c1_APOBEC3FGsum2A_count/(length(MyREFIndex_G)+length(MyREFIndex_F))*100
MyOutput03d4_Control_PERC <- MyOutput03d1_Control_count/length(MyREFIndex_C)*100
#STEP3.4 Summary data: PERCENT Contribution G sites / total mutated sites
MyOutput03a5_APOBEC3G2A_PERC <- MyOutput03a1_APOBEC3G2A_count/(MyOutput03a1_APOBEC3G2A_count+MyOutput03b1_APOBEC3F2A_count)*100
MyOutput03b5_APOBEC3F2A_PERC <- MyOutput03b1_APOBEC3F2A_count/(MyOutput03a1_APOBEC3G2A_count+MyOutput03b1_APOBEC3F2A_count)*100
#STEP 4. Reproduce WebHypermut 2.0 tool: FISHER'S
MyFisherVector <- c()
for (m in 1:length(MyOutput02_RawSeq)){
Q1 <- MyOutput03c1_APOBEC3FGsum2A_count[m]
Q2 <- MyOutput03c2_APOBEC3FGsum2A_NOTcount[m]
Q3 <- MyOutput03d1_Control_count[m]
Q4 <- MyOutput03d2_Control_NOTcount[m]
table <- rbind(c(Q1,Q2),c(Q3,Q4))
MyFisher <- fisher.test(table,alternative="greater")
MyFisherVector <- append(MyFisherVector,MyFisher$p.value)
}
#STEP 5. Genome distribution
pdf("./Output/Output03_GenomeDistribHistogram.pdf")
par(mfrow=c(3,1)) #nrow,ncol
hist(sort(MyQuery_GsumIndex_AllSamples)
,main=""
,xlab="HIV Genome Coordinate - APOBEC 3G")
title("Distribution of Hypermutations in HIV Genome") #set main title here#
hist(sort(MyQuery_FsumIndex_AllSamples)
,main=""
,xlab="HIV Genome Coordinate - APOBEC 3F")
hist(sort(MyQuery_GFsumIndex_AllSamples)
,main=""
,xlab="HIV Genome Coordinate - APOBEC 3G + 3F")
dev.off()
#STEP 6. Export DF
MyDF <- as.data.frame(cbind(
MyOutput01_SampleID
, MyOutput02_RawSeq
, MyOutput03a1_APOBEC3G2A_count
, MyOutput03a2_APOBEC3G2A_NOTcount
, MyOutput03a3_APOBEC3G2A_TOTALcount
, MyOutput03a4_APOBEC3G2A_PERC
, MyOutput03a5_APOBEC3G2A_PERC
, MyOutput03b1_APOBEC3F2A_count
, MyOutput03b2_APOBEC3F2A_NOTcount
, MyOutput03b3_APOBEC3F2A_TOTALcount
, MyOutput03b4_APOBEC3F2A_PERC
, MyOutput03b5_APOBEC3F2A_PERC
, MyOutput03c1_APOBEC3FGsum2A_count
, MyOutput03c2_APOBEC3FGsum2A_NOTcount
, MyOutput03c3_APOBEC3FGsum2A_TOTALcount
, MyOutput03c4_APOBEC3FGsum2A_PERC
, MyOutput03d1_Control_count
, MyOutput03d2_Control_NOTcount
, MyOutput03d3_Control_TOTALcount
, MyOutput03d4_Control_PERC
, MyRevertedSEQ_A2G
, MyFisherVector
))
write.csv(MyDF,file="./Output/Output04_MyDF_Summary.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment