Skip to content

Instantly share code, notes, and snippets.

@fawda123
Last active August 29, 2015 14:00
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save fawda123/20688ace86604259de4e to your computer and use it in GitHub Desktop.
# Summarise the size of source files found under a directory.
#
# Arguments:
# 'root' character string of directory to search
# 'file_typs' character vector of file types (extensions without the dot) to
#   search, matched case-insensitively against the end of each file name
# 'omit_blank' logical indicating if blank lines are left out of the line count
#   (only used when 'lns' is TRUE)
# 'recursive' logical indicating if all directories within 'root' are searched
# 'lns' logical indicating if lines are counted, use FALSE for counting
#   characters
# 'trace' logical for monitoring progress
#
# Value:
# NULL (after printing 'No files found') when nothing matches, otherwise a
# data frame with one row per file and the columns
#   'fl'      file name without its directory
#   'Length'  number of lines (or characters) in the file
#   'Date'    modification date of the file (class Date, local time zone)
#   'cum_len' running total of 'Length' within the file type, ordered by 'Date'
#   'Type'    lower-cased file extension
# Rows are grouped by 'Type' and ordered by 'Date' within each group.
#
# Only base R is used; the reshape2 package is no longer needed or attached.
file.lens <- function(root, file_typs, omit_blank = FALSE, recursive = TRUE,
                      lns = TRUE, trace = TRUE) {

  # get files by recursive search, uses 'list.files'
  if (trace) cat(paste0('Finding files in ', root, '...\n'))
  files <- list.files(
    root,
    pattern = paste(paste0('\\.', file_typs, '$'), collapse = '|'),
    recursive = recursive,
    full.names = TRUE,
    ignore.case = TRUE
  )

  # stop if no files found
  if (length(files) == 0) {
    return(cat('No files found\n'))
  }

  # line or character total for a single file
  measure_file <- function(fl) {
    if (trace) cat(fl, '\n')
    txt <- readLines(fl, warn = FALSE)
    if (!lns) {
      return(sum(nchar(txt)))
    }
    if (omit_blank) sum(txt != '') else length(txt)
  }
  lens <- vapply(files, measure_file, integer(1), USE.NAMES = FALSE)

  # Take the calendar date straight from the modification time in the local
  # time zone. Round-tripping through as.character() and a date-time format
  # gave NA for files stamped exactly at midnight (the time part is dropped
  # from the text), and as.Date() on a POSIXct could shift the day via UTC.
  mtimes <- file.info(files)$mtime
  out_df <- data.frame(
    fl = basename(files),
    Length = lens,
    Date = as.Date(format(mtimes, '%Y-%m-%d')),
    stringsAsFactors = FALSE
  )

  # file type is whatever follows the last dot of the file name
  fl_typ <- tolower(sub('^.*\\.', '', out_df$fl))

  # get cumulative sums by file type, label each piece with its type
  by_typ <- split(out_df, fl_typ)
  by_typ <- Map(
    function(x, typ) {
      x <- x[order(x$Date), ]
      x$cum_len <- cumsum(x$Length)
      x$Type <- typ
      x
    },
    by_typ,
    names(by_typ)
  )

  # convert list to a single data frame with plain 1..n row names
  out_df <- do.call('rbind', unname(by_typ))
  rownames(out_df) <- NULL

  out_df
}
@albertosantini
Copy link

out_df <- melt(out_df, id.var = names(out_df[[1]])) displays Error in measure.attributes[[1]] : subscript out of bounds

R version 3.1.0 (2014-04-10)
Platform: x86_64-w64-mingw32/x64 (64-bit)

locale:
[1] LC_COLLATE=English_United States.1252 
[2] LC_CTYPE=English_United States.1252   
[3] LC_MONETARY=English_United States.1252
[4] LC_NUMERIC=C                          
[5] LC_TIME=English_United States.1252    

attached base packages:
[1] grid      stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
[1] gridExtra_0.9.1 ggplot2_0.9.3.1 reshape2_1.4   

loaded via a namespace (and not attached):
 [1] colorspace_1.2-4 digest_0.6.4     gtable_0.1.2     labeling_0.2    
 [5] MASS_7.3-31      munsell_0.4.2    plyr_1.8.1       proto_0.3-10    
 [9] Rcpp_0.11.1      scales_0.2.4     stringr_0.6.2    tools_3.1.0     

@fawda123
Copy link
Author

Just updated, try it now....

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment