Skip to content

Instantly share code, notes, and snippets.

@jwdink
Created June 23, 2015 04:32
Show Gist options
  • Save jwdink/e81d49057341bd64c1cf to your computer and use it in GitHub Desktop.
Save jwdink/e81d49057341bd64c1cf to your computer and use it in GitHub Desktop.
Take a folder of .txt files, each consisting of lines of tab-separated Key,Value pairs; convert and merge them into an R data frame
# Takes a folder of session files and converts them into a single data frame.
## MAIN:
#' Convert a folder of session files into one data frame
#'
#' Each file is read as a two-column, tab-separated table of Key/Value pairs.
#' Pairs are accumulated into rows: every occurrence of `trial_advance_str`
#' closes the row being built, and that row is either merged into the queued
#' row (same identifiers) or emitted as a new output row (an identifier
#' value changed).
#'
#' @param path Directory that is searched for `.txt` session files.
#' @param trial_advance_str Key whose appearance marks a possible row boundary.
#' @param identifier_colnames Output columns that identify a row. A change in
#'   the value of any of these keys starts a new output row.
#' @param participant_string Key holding the participant id; its value is
#'   copied into every row built from the file.
#' @param exp_time_string Key holding the session start time; its value is
#'   copied into every row built from the file.
#' @param overwrite_conflict_function Optional function called as
#'   `f(old, new, name)` when two different non-NA values collide. It is
#'   forwarded to `merge_rows()` / `merge_element()`.
#' @param echo If `TRUE`, print the name of each file as it is processed.
#' @return A data frame holding the rows collected from all files.
convert_session_files <- function(path,
                                  trial_advance_str = "TrialNum",
                                  identifier_colnames = c("TrialNum", "BlockNum", "PhaseNum"),
                                  participant_string = "subject_code",
                                  exp_time_string = "start_time",
                                  overwrite_conflict_function = NULL,
                                  echo = TRUE) {
  require("stringr")
  require("plyr") # for rbind.fill

  # Start an empty one-row frame that already carries the per-file metadata.
  blank_row <- function(participant, exp_time) {
    row <- data.frame(stringsAsFactors = FALSE)
    row[1, fcoln(participant_string)] <- participant
    row[1, fcoln(exp_time_string)] <- exp_time
    row
  }

  df <- data.frame(stringsAsFactors = FALSE)
  # `pattern` is a regular expression: anchor it so only names that really
  # end in ".txt" are picked up.
  files <- list.files(path = path, pattern = "\\.txt$", all.files = TRUE, full.names = TRUE)
  if (length(files) == 0) stop("Directory was empty!")

  for (file in files) {
    if (echo) cat(sprintf("\nProcessing file: %s", file))

    # Read in the raw data file.
    df_session <- read.table(file, sep = "\t", header = FALSE,
                             col.names = c("Key", "Value"),
                             stringsAsFactors = FALSE)

    # Participant:
    part_str <- df_session$Value[df_session$Key == participant_string]
    if (length(part_str) == 0) {
      warning("Could not find 'participant' in \n", file, "\n Check 'participant_string'.")
      part_str <- "NULL"
    }

    # Exp Time (the check must look at time_str, not part_str):
    time_str <- df_session$Value[df_session$Key == exp_time_string]
    if (length(time_str) == 0) {
      warning("Could not find time in \n", file, "\n Check 'exp_time_string'.")
      time_str <- "NULL"
    }

    # Queue state for this file.
    this_row_is_new <- FALSE
    this_row <- NULL # stays NULL until the first trial_advance_str is seen
    the_row_queue <- blank_row(part_str, time_str)
    rq_identifiers <- c()

    for (srow in seq_len(nrow(df_session))) {
      key <- df_session$Key[srow]
      value <- df_session$Value[srow]

      if (key == trial_advance_str) {
        # We might be entering a new trial. The row built so far is distinct
        # from the queued row iff this_row_is_new is TRUE.
        if (this_row_is_new) {
          # Flush the queued row to the output and queue the current row.
          df <- rbind.fill(df, the_row_queue)
          the_row_queue <- this_row
          # drop = FALSE keeps the names even with a single identifier column.
          rq_identifiers <- as.list(the_row_queue[, identifier_colnames, drop = FALSE])
        } else {
          # Same identifiers: fold the current row into the queued row.
          the_row_queue <- merge_rows(the_row_queue, this_row, overwrite_conflict_function)
          if (all(identifier_colnames %in% colnames(the_row_queue))) {
            rq_identifiers <- as.list(the_row_queue[, identifier_colnames, drop = FALSE])
          }
        }
        # Reset the row under construction.
        this_row <- blank_row(part_str, time_str)
        this_row_is_new <- FALSE
      }

      # An identifier key whose value differs from the queued row's value
      # marks the row under construction as a new output row.
      if (key %in% names(rq_identifiers)) {
        if (rq_identifiers[[key]] != value) {
          this_row_is_new <- TRUE
        }
      }

      # Lines before the first trial_advance_str (date etc.) are ignored.
      if (!is.null(this_row)) {
        coln <- fcoln(key)
        # Matrix-style indexing of a missing column yields NULL here.
        existing_val <- this_row[1, coln]
        if (is.null(existing_val)) {
          this_row[1, coln] <- value
        } else {
          this_row[1, coln] <- merge_element(c(existing_val, value),
                                             key,
                                             overwrite_conflict_function)
        }
      }
    } # /loop thru rows

    # Flush whatever is left for this file.
    if (this_row_is_new) {
      df <- rbind.fill(df, the_row_queue, this_row)
    } else {
      the_row_queue <- merge_rows(the_row_queue, this_row, overwrite_conflict_function)
      df <- rbind.fill(df, the_row_queue)
    }
  } # /loop thru files

  return(df)
}
## HELPERS:
#' Turn a session key into a column name by replacing every space with "_".
#'
#' @param coln Character vector of key names.
#' @return `coln` with each " " replaced by "_".
fcoln <- function(coln) {
  gsub(" ", "_", coln)
}
#' Collapse the queued row and the row under construction into a single row
#'
#' The two rows are stacked with `rbind.fill()` (columns missing from one row
#' are filled with NA) and every column is then reduced to one value with
#' `merge_element()`.
#'
#' @param the_row_queue One-row data frame currently queued for output.
#' @param this_row One-row data frame to fold in, or `NULL` if there is none.
#' @param overwrite_conflict_function Forwarded to `merge_element()`.
#' @return A one-row data frame; `the_row_queue` unchanged when `this_row`
#'   is `NULL`.
merge_rows <- function(the_row_queue, this_row, overwrite_conflict_function) {
  if (is.null(this_row)) {
    return(the_row_queue)
  }
  two_rows <- rbind.fill(the_row_queue, this_row)
  out <- list()
  for (col in colnames(two_rows)) {
    out[[col]] <- merge_element(two_rows[[col]], col, overwrite_conflict_function)
  }
  # check.names = FALSE: keep column names exactly as they were built, so a
  # merged row lines up with unmerged rows even for non-syntactic names.
  as.data.frame(out, stringsAsFactors = FALSE, check.names = FALSE)
}
#' Reduce a pair of values for one column to a single value
#'
#' @param column Vector whose first two elements are the candidate values.
#' @param col Name of the column / session key (used in the warning and
#'   passed to the conflict function).
#' @param overwrite_conflict_function `NULL`, or a function called as
#'   `f(first, second, col)` when both values are present and differ.
#' @return The non-NA value if one is NA, the shared value if both agree,
#'   otherwise the conflict function's result, or (with no conflict function)
#'   all of `column` pasted together with "; " after a warning.
merge_element <- function(column, col, overwrite_conflict_function) {
  first <- column[1]
  second <- column[2]

  # A missing value on one side simply yields the other side.
  if (is.na(first)) {
    return(second)
  }
  if (is.na(second) || first == second) {
    return(first)
  }

  # Genuine conflict: delegate if a resolver was supplied.
  if (!is.null(overwrite_conflict_function)) {
    return(overwrite_conflict_function(first, second, col))
  }

  warning("\nOverwrite in ouptut-column / session-key '", col, "'. Consider diff advance_str or specify overwrite_function.")
  paste(column, collapse = "; ")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment