Created
June 23, 2015 04:32
-
-
Save jwdink/e81d49057341bd64c1cf to your computer and use it in GitHub Desktop.
Take a folder of .txt files, each consisting of lines of tab-separated Key,Value pairs; convert and merge them into an R data frame
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Takes a folder of session files and converts them into a single data frame.
## MAIN: | |
# Read every ".txt" session file in `path` and stack the rows built from them
# into one data frame.
#
# Each file is read as two tab-separated columns (Key, Value). Rows of the
# output are assembled key by key: every occurrence of `trial_advance_str`
# closes the row currently being filled. That row is either merged into the
# queued row (when none of its identifier values differ from the queued row's)
# or the queued row is flushed to the output and replaced by it.
#
# Arguments:
#   path                         Directory that is searched for ".txt" files.
#   trial_advance_str            Key whose occurrence closes the current row.
#   identifier_colnames          Columns whose values decide whether a row is
#                                new or a continuation of the queued row.
#   participant_string           Key holding the participant id; its value is
#                                copied into every output row.
#   exp_time_string              Key holding the session start time; its value
#                                is copied into every output row.
#   overwrite_conflict_function  NULL, or function(old, new, col) used by
#                                merge_element() to resolve conflicting values.
#   echo                         If TRUE, print each file name while processing.
#
# Returns a data frame combining the rows of all files (via plyr::rbind.fill).
# Stops with an error if `path` contains no matching file.
convert_session_files = function(path,
                                 trial_advance_str = "TrialNum",
                                 identifier_colnames = c("TrialNum", "BlockNum", "PhaseNum"),
                                 participant_string = "subject_code",
                                 exp_time_string = "start_time",
                                 overwrite_conflict_function = NULL,
                                 echo = TRUE) {
  # Not used directly in this function; kept so the package is still attached
  # exactly as before for any calling code that relied on that.
  require("stringr")
  # rbind.fill() is called unqualified here and in merge_rows(), so plyr has
  # to be attached; library() fails immediately if it is not installed.
  library("plyr")

  df = data.frame(stringsAsFactors = FALSE)
  # `pattern` is a regular expression: escape the dot and anchor at the end so
  # that only names ending in ".txt" are picked up.
  files = list.files(path = path, pattern = "\\.txt$", all.files = TRUE, full.names = TRUE)
  if (length(files) == 0) stop("Directory was empty!")

  for (file in files) {
    if (echo) cat(sprintf("\nProcessing file: %s", file))

    # read in the raw data file
    df_session = read.table(file, sep = "\t", header = FALSE,
                            col.names = c('Key', 'Value'),
                            stringsAsFactors = FALSE)

    # Participant:
    part_str = df_session$Value[df_session$Key == participant_string]
    if (length(part_str) == 0) {
      warning("Could not find 'participant' in \n", file, "\n Check 'participant_string'.")
      part_str = 'NULL'
    }

    # Exp Time (the check must look at time_str, not part_str):
    time_str = df_session$Value[df_session$Key == exp_time_string]
    if (length(time_str) == 0) {
      warning("Could not find time in \n", file, "\n Check 'exp_time_string'.")
      time_str = 'NULL'
    }

    # Fresh one-row frame that already carries the per-file constants.
    blank_row = function() {
      row = data.frame(stringsAsFactors = FALSE)
      row[1, fcoln(participant_string)] = part_str
      row[1, fcoln(exp_time_string)] = time_str
      row
    }

    # Make Queue:
    this_row_is_new = FALSE
    this_row = NULL            # stays NULL until the first trial_advance_str
    the_row_queue = blank_row()
    rq_identifiers = c()       # identifier values of the queued row, by name

    # Loop thru:
    for (srow in seq_len(nrow(df_session))) {
      key = df_session$Key[srow]
      value = df_session$Value[srow]

      if (key == trial_advance_str) {
        # The row collected so far is complete. It differs from the queued row
        # iff this_row_is_new was set while it was being collected.
        if (this_row_is_new) {
          # Flush the queued row and queue the finished one instead.
          df = rbind.fill(df, the_row_queue)
          the_row_queue = this_row
          # drop = FALSE keeps the names even with a single identifier column.
          rq_identifiers = as.list(the_row_queue[, identifier_colnames, drop = FALSE])
        } else {
          # Same identifiers: fold the finished row into the queued row.
          the_row_queue = merge_rows(the_row_queue, this_row, overwrite_conflict_function)
          if (all(identifier_colnames %in% colnames(the_row_queue))) {
            rq_identifiers = as.list(the_row_queue[, identifier_colnames, drop = FALSE])
          }
        }
        # reset:
        this_row = blank_row()
        this_row_is_new = FALSE
      }

      if (key %in% names(rq_identifiers)) {
        # This key is a row identifier; a value different from the queued
        # row's marks the row being collected as new.
        if (rq_identifiers[[key]] != value) {
          this_row_is_new = TRUE
        }
      }

      if (!is.null(this_row)) { # ignores first few lines of session file with date etc.
        column_name = fcoln(key)
        # Selecting a column that does not exist yet yields NULL here.
        existing_val = this_row[1, column_name]
        if (is.null(existing_val)) {
          this_row[1, column_name] = value
        } else {
          this_row[1, column_name] = merge_element(c(existing_val, value),
                                                   key,
                                                   overwrite_conflict_function)
        }
      }
    } # /loop thru rows

    # Flush whatever is still pending for this file.
    if (this_row_is_new) {
      df = rbind.fill(df, the_row_queue, this_row)
    } else {
      # not new? merge into a single row_queue
      the_row_queue = merge_rows(the_row_queue, this_row, overwrite_conflict_function)
      df = rbind.fill(df, the_row_queue)
    }
  } # /loop thru files

  return(df)
}
## HELPERS: | |
# Turn a session key into the column name used for it: every space in `coln`
# is replaced by an underscore. Works element-wise on character vectors.
fcoln = function(coln) {
  gsub(pattern = " ", replacement = "_", coln, fixed = TRUE)
}
# Collapse the queued row and the row just collected into a single row.
#
# the_row_queue / this_row: one-row data frames (this_row may be NULL, in
#   which case the_row_queue is returned untouched).
# overwrite_conflict_function: passed on to merge_element() for columns in
#   which the two rows hold different values.
#
# Returns a data frame with one value per column of the combined rows.
merge_rows = function(the_row_queue, this_row, overwrite_conflict_function) {
  if (is.null(this_row)) {
    return(the_row_queue)
  }

  # Stack both rows; columns missing from one of them are filled with NA.
  two_rows = rbind.fill(the_row_queue, this_row)

  out = list()
  for (col in colnames(two_rows)) {
    out[[col]] = merge_element(two_rows[[col]], col, overwrite_conflict_function)
  }

  # optional = TRUE keeps the column names exactly as they are. Without it the
  # names are passed through make.names(), so a merged row would carry
  # different names than an unmerged one for any non-syntactic key and
  # rbind.fill() would put their values into separate columns.
  return(as.data.frame(out, optional = TRUE, stringsAsFactors = FALSE))
}
# Decide which of two candidate values ends up in one output cell.
#
# column: vector whose first element is the existing value and whose second
#   element is the incoming value.
# col: name of the output column / session key; used in the warning text and
#   handed to the conflict callback.
# overwrite_conflict_function: NULL, or function(old, new, col) returning the
#   value to keep when both candidates are present and differ.
#
# Returns the other value if one candidate is NA, the common value if both are
# equal, and otherwise the callback's result. Without a callback a warning is
# raised and the values in `column` are pasted together, separated by "; ".
merge_element = function(column, col, overwrite_conflict_function) {
  existing = column[1]
  incoming = column[2]

  if (is.na(existing)) return(incoming)
  if (is.na(incoming)) return(existing)
  if (existing == incoming) {
    return(existing)
  }

  if (is.null(overwrite_conflict_function)) {
    warning(paste0("\nOverwrite in output-column / session-key '", col,
                   "'. Consider a different trial_advance_str or specify overwrite_conflict_function."))
    return(paste(column, collapse = "; "))
  }

  return(overwrite_conflict_function(existing, incoming, col))
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment