Skip to content

Instantly share code, notes, and snippets.

@fawda123 fawda123/file_lens.r
Last active Aug 29, 2015

Embed
What would you like to do?
# Count lines (or characters) of the files of given types found under a
# directory, with a running total per file type ordered by modification date.
#
# Arguments:
# 'root' character string of directory to search
# 'file_typs' character vector of file types to search (extensions without
#   the leading dot, matched case-insensitively; each value is pasted into a
#   regular expression as-is)
# 'omit_blank' logical indicating if blank lines are left out of line counts
# 'recursive' logical indicating if all directories within 'root' are searched
# 'lns' logical indicating if lines are counted, use FALSE for counting
#   characters
# 'trace' logical for monitoring progress (file names are printed with 'cat')
#
# Value:
# a data frame with one row per file and columns
#   'fl'      file name without its directory
#   'Length'  number of lines (or characters if 'lns' is FALSE)
#   'Date'    modification date of the file (class 'Date', local time zone)
#   'cum_len' cumulative sum of 'Length' within the file type, by 'Date'
#   'Type'    lower-case file extension
# Rows are grouped by 'Type' and sorted by 'Date' within each type.
# If no files are found, 'No files found' is printed and NULL is returned.
file.lens <- function(root, file_typs, omit_blank = FALSE, recursive = TRUE,
                      lns = TRUE, trace = TRUE){

  # get files by (optionally recursive) search, uses 'list.files'
  if(trace) cat(paste0('Finding files in ', root, '...\n'))
  files <- list.files(
    root,
    pattern = paste(paste0('\\.', file_typs, '$'), collapse = '|'),
    recursive = recursive,
    full.names = TRUE,
    ignore.case = TRUE
  )

  # one vectorised 'file.info' call gives both the directory flag and mtime;
  # a non-recursive listing can return directories whose names match the
  # pattern, and those cannot be read with 'readLines'
  info <- file.info(files)
  keep <- !(info$isdir %in% TRUE)
  files <- files[keep]
  info <- info[keep, , drop = FALSE]

  # stop if no files found
  if(length(files) == 0){
    cat('No files found\n')
    return(invisible(NULL))
  }

  # read each file and reduce it to a single count
  lens <- vapply(files, function(fl){
    if(trace) cat(fl, '\n')
    tmp <- readLines(fl, warn = FALSE)
    if(!lns) return(sum(nchar(tmp)))
    if(omit_blank) sum(tmp != '') else length(tmp)
  }, integer(1), USE.NAMES = FALSE)

  # format() renders the mtime in the local time zone, so the calendar date
  # is the one the user sees on disk; converting the POSIXct value with
  # as.Date() directly may shift it to the UTC date instead
  out_df <- data.frame(
    fl = basename(files),
    Length = lens,
    Date = as.Date(format(info$mtime, '%Y-%m-%d')),
    stringsAsFactors = FALSE
  )

  # file type is whatever follows the last dot, lower-cased so that e.g.
  # '.R' and '.r' are pooled
  fl_typ <- tolower(sub('^.*\\.', '', out_df$fl))

  # get cumulative sums by file type, oldest file first
  by_typ <- lapply(
    split(out_df, fl_typ),
    function(x){
      x <- x[order(x$Date), , drop = FALSE]
      x$cum_len <- cumsum(x$Length)
      x
    })

  # add the file type as a column and stack the pieces; done in base R so
  # the function does not depend on reshape2::melt handling a list of data
  # frames that have no measure variables
  by_typ <- Map(
    function(x, typ){
      x$Type <- typ
      x
    },
    by_typ, names(by_typ))
  out_df <- do.call('rbind', by_typ)
  rownames(out_df) <- NULL

  out_df
}
@albertosantini

This comment has been minimized.

Copy link

commented May 8, 2014

out_df <- melt(out_df, id.var = names(out_df[[1]])) displays Error in measure.attributes[[1]] : subscript out of bounds

R version 3.1.0 (2014-04-10)
Platform: x86_64-w64-mingw32/x64 (64-bit)

locale:
[1] LC_COLLATE=English_United States.1252 
[2] LC_CTYPE=English_United States.1252   
[3] LC_MONETARY=English_United States.1252
[4] LC_NUMERIC=C                          
[5] LC_TIME=English_United States.1252    

attached base packages:
[1] grid      stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
[1] gridExtra_0.9.1 ggplot2_0.9.3.1 reshape2_1.4   

loaded via a namespace (and not attached):
 [1] colorspace_1.2-4 digest_0.6.4     gtable_0.1.2     labeling_0.2    
 [5] MASS_7.3-31      munsell_0.4.2    plyr_1.8.1       proto_0.3-10    
 [9] Rcpp_0.11.1      scales_0.2.4     stringr_0.6.2    tools_3.1.0     

@fawda123

This comment has been minimized.

Copy link
Owner Author

commented May 18, 2014

Just updated, try it now....

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.