Skip to content

Instantly share code, notes, and snippets.

@MicTech
Last active August 29, 2015 14:06
Show Gist options
  • Save MicTech/edfbba90971bebd58650 to your computer and use it in GitHub Desktop.
Save MicTech/edfbba90971bebd58650 to your computer and use it in GitHub Desktop.
library(plyr)
# Block size in bytes (128 MiB). Each file is accounted below as a whole
# number of blocks of this size (see the `blocks` / `hdfs.size` columns).
block.size <- 128 * 1024 * 1024

# Load the listing. It has no header row, so read.csv names the columns
# V1, V2, ...
files.informations <- read.csv("workflow.csv", header = FALSE,
                               stringsAsFactors = FALSE)

# Keep only the size column (V5, treated as bytes by the conversions below)
# and the full path column (V8).
files <- files.informations[c("V5", "V8")]
colnames(files) <- c("size", "filepath")

# Split every path on "/" once, then derive the last component (file name)
# and everything before it (containing directory).
path.parts <- strsplit(files$filepath, "/", fixed = TRUE)
files$filename <- vapply(path.parts, function(p) tail(p, 1), character(1))
# head(p, -1) drops the last component and yields an empty vector for a path
# without any "/", so such a file gets directory "" rather than its own name
# (which is what indexing with 1:(length(p) - 1) == c(1, 0) would produce).
files$path <- vapply(
  path.parts,
  function(p) paste(head(p, -1), collapse = "/"),
  character(1)
)

# Raw size in MiB and GiB.
files$size.mb <- files$size / 1024 / 1024
files$size.gb <- files$size / 1024 / 1024 / 1024

# Number of blocks needed to hold the file (a partial block counts as one),
# and the resulting block-aligned size in bytes, MiB and GiB.
files$blocks <- ceiling(files$size / block.size)
files$hdfs.size <- files$blocks * block.size
files$hdfs.size.mb <- files$hdfs.size / 1024 / 1024
files$hdfs.size.gb <- files$hdfs.size / 1024 / 1024 / 1024
# Per-directory summary: one row per distinct `path`, with total raw size
# (bytes and GiB), number of files, mean file size in MiB, total block count
# and total block-aligned size (bytes and GiB).
# NOTE: the result is stored as `aggregate`, which masks stats::aggregate for
# the rest of the session.
aggregate <- ddply(files, "path", function(x) {
  c(
    # `size` is an integer column when every value fits in 32 bits; summing
    # it as integer returns NA (with a warning) once a directory exceeds
    # .Machine$integer.max bytes, so accumulate as double instead.
    size = sum(as.numeric(x$size)),
    size.gb = sum(x$size.gb),
    file.count = nrow(x),
    size.mean.mb = mean(x$size.mb),
    hdfs.blocks = sum(x$blocks),
    hdfs.size = sum(x$hdfs.size),
    hdfs.size.gb = sum(x$hdfs.size.gb)
  )
})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment