Skip to content

Instantly share code, notes, and snippets.

@SamBuckberry
Created June 27, 2017 08:29
Show Gist options
  • Save SamBuckberry/1fb57d8e6f9927a086516cb536c172f2 to your computer and use it in GitHub Desktop.
Save SamBuckberry/1fb57d8e6f9927a086516cb536c172f2 to your computer and use it in GitHub Desktop.
library(data.table)
library(R.utils)
aggregateStrandsCG <- function(CGmap){
# Do not output scientific notation
options(scipen=999)
# Create the temp directory for uncompressed CGmap file
tmpFile <- tempfile(tmpdir = tempdir(), fileext = ".CGmap.tmp")
command <- paste("zcat ", CGmap, " > ", tmpFile, sep = "")
system(command)
# Read the file
dat <- fread(input = tmpFile, sep = "\t",
select = c(1,2,3,5,7,8),
col.names = c("chr", "base", "position",
"context", "C_reads", "CT_reads"))
# Subset to CG context only
dat <- dat[dat$context == "CG", ]
# Set up the locus id
dat$locus <- NA
# Get a locus id relative to forward strand
dat$locus <- ifelse(test = dat$base == "G",
yes = paste(dat$chr,
dat$position - 1, sep = ":"),
no = paste(dat$chr,
dat$position, sep = ":"))
# Drop the unused columns
dat <- dat[ ,c("chr", "base", "position", "context") := NULL]
# Sum the read counts for + and - strand
dat <- dat[, lapply(.SD, sum), by=.(locus),
.SDcols=c("C_reads", "CT_reads")]
# Format for DSS
dat <- dat[ ,c("chr", "pos") := tstrsplit(locus, ":", fixed=TRUE)]
dat <- dat[ ,c(4,5,3,2), with=FALSE]
dat <- dat[ , pos:=as.integer(pos)]
setDF(dat)
colnames(dat) <- c("chr", "pos", "N", "X")
# Delete the temp file
file.remove(tmpFile)
# return the aggregated data object
return(dat)
}
# Apply the function
dat <- aggregateStrandsCG(CGmap = "sample.CGmap")
# Write the output file
out <- gzfile("sample.CGmap_aggregated.gz")
write.table(x = dat, file = out, quote = FALSE, sep = "\t",
row.names = FALSE, col.names = TRUE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment