SamBuckberry/aggregateCG.R

## aggregateCG.R
library(data.table)
library(R.utils)

aggregateStrandsCG <- function(CGmap){

    # Do not output scientific notation
    options(scipen=999)

    # Create the temp directory for uncompressed CGmap file
    tmpFile <- tempfile(tmpdir = tempdir(), fileext = ".CGmap.tmp")

    command <- paste("zcat ", CGmap, " > ", tmpFile, sep = "")
    system(command)

    # Read the file
    dat <- fread(input = tmpFile, sep = "\t",
                    select = c(1,2,3,5,7,8),
                    col.names = c("chr", "base", "position",
                        "context", "C_reads", "CT_reads"))

    # Subset to CG context only
    dat <- dat[dat$context == "CG", ]

    # Set up the locus id
    dat$locus <- NA

    # Get a locus id relative to forward strand
    dat$locus <- ifelse(test = dat$base == "G",
                        yes = paste(dat$chr,
                            dat$position - 1, sep = ":"),
                        no = paste(dat$chr,
                            dat$position, sep = ":"))

    # Drop the unused columns
    dat <- dat[ ,c("chr", "base", "position", "context") := NULL]

    # Sum the read counts for + and - strand
    dat <- dat[, lapply(.SD, sum), by=.(locus),
        .SDcols=c("C_reads", "CT_reads")]

    # Format for DSS
    dat <- dat[ ,c("chr", "pos") := tstrsplit(locus, ":", fixed=TRUE)]
    dat <- dat[ ,c(4,5,3,2), with=FALSE]
    dat <- dat[ , pos:=as.integer(pos)]
    setDF(dat)
    colnames(dat) <- c("chr", "pos", "N", "X")

    # Delete the temp file
    file.remove(tmpFile)

    # return the aggregated data object
    return(dat)
}

# Apply the function
dat <- aggregateStrandsCG(CGmap = "sample.CGmap")

# Write the output file
out <- gzfile("sample.CGmap_aggregated.gz")
write.table(x = dat, file = out, quote = FALSE, sep = "\t",
    row.names = FALSE, col.names = TRUE)
	library(data.table)
	library(R.utils)

	aggregateStrandsCG <- function(CGmap){

	# Do not output scientific notation
	options(scipen=999)

	# Create the temp directory for uncompressed CGmap file
	tmpFile <- tempfile(tmpdir = tempdir(), fileext = ".CGmap.tmp")

	command <- paste("zcat ", CGmap, " > ", tmpFile, sep = "")
	system(command)

	# Read the file
	dat <- fread(input = tmpFile, sep = "\t",
	select = c(1,2,3,5,7,8),
	col.names = c("chr", "base", "position",
	"context", "C_reads", "CT_reads"))

	# Subset to CG context only
	dat <- dat[dat$context == "CG", ]

	# Set up the locus id
	dat$locus <- NA

	# Get a locus id relative to forward strand
	dat$locus <- ifelse(test = dat$base == "G",
	yes = paste(dat$chr,
	dat$position - 1, sep = ":"),
	no = paste(dat$chr,
	dat$position, sep = ":"))

	# Drop the unused columns
	dat <- dat[ ,c("chr", "base", "position", "context") := NULL]

	# Sum the read counts for + and - strand
	dat <- dat[, lapply(.SD, sum), by=.(locus),
	.SDcols=c("C_reads", "CT_reads")]

	# Format for DSS
	dat <- dat[ ,c("chr", "pos") := tstrsplit(locus, ":", fixed=TRUE)]
	dat <- dat[ ,c(4,5,3,2), with=FALSE]
	dat <- dat[ , pos:=as.integer(pos)]
	setDF(dat)
	colnames(dat) <- c("chr", "pos", "N", "X")

	# Delete the temp file
	file.remove(tmpFile)

	# return the aggregated data object
	return(dat)
	}

	# Apply the function
	dat <- aggregateStrandsCG(CGmap = "sample.CGmap")

	# Write the output file
	out <- gzfile("sample.CGmap_aggregated.gz")
	write.table(x = dat, file = out, quote = FALSE, sep = "\t",
	row.names = FALSE, col.names = TRUE)