Created
November 13, 2012 23:02
-
-
Save sburns/4069030 to your computer and use it in GitHub Desktop.
A process for converting eprime txt files to pandas.DataFrames
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "eprime_txt_to_dataframe" | |
}, | |
"nbformat": 2, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"Warning: Work-In-Progress", | |
"", | |
"Imports and constants" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": true, | |
"input": [ | |
"import pandas as pd", | |
"", | |
"LFS = '*** LogFrame Start ***'", | |
"LFE = '*** LogFrame End ***'" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 46 | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"Open/read/format" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": true, | |
"input": [ | |
"ep = '/fs0/New_Server/RCV/In_Behavioral/098_208247/PassagesV2_ListB-8247-1.txt'", | |
"with open(ep, 'r') as f:", | |
" raw = [r.strip() for r in f.read().decode('utf-16').splitlines()]" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 28 | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"We expect every `*** LogFrame Start ***` line to be immediately preceded by a `Level:` line; check that assumption:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"level_ind = [i for i, r in enumerate(raw) if 'Level:' in r]", | |
"lfs_ind = [i for i, r in enumerate(raw) if 'LogFrame Start' in r]", | |
"assert all([(r-1) in level_ind for r in lfs_ind])" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 30 | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"Grab all the levels", | |
"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"levels = sorted(set([int(r.split(': ')[1]) for r in raw if 'Level: ' in r]))", | |
"print levels" | |
], | |
"language": "python", | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"[1, 2, 3]" | |
] | |
} | |
], | |
"prompt_number": 31 | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"Grab (level, index) tuples for each trial" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"level_indices = [(int(r.split('Level: ')[1]), i) for i, r in enumerate(raw) if 'Level: ' in r]", | |
"# print level_indices" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 142 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def level_data_from_ind(ind, raw):", | |
" \"Given the index of a Level: row, grab all data before the next Level:\"", | |
" data = {}", | |
" lfs_ind = raw.index(LFS, ind)", | |
" lfe_ind = raw.index(LFE, ind)", | |
" for row_ind in range(lfs_ind+1, lfe_ind):", | |
" try:", | |
" key, value = raw[row_ind].split(':', 1)", | |
" data[str(key)] = value.strip()", | |
" except ValueError:", | |
" raise ValueError(\"Trouble with index %d, %s\" % (ind, raw[row_ind]))", | |
" return data" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 143 | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"Just checking we don't raise anywhere." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"for level, ind in level_indices:", | |
" try:", | |
" level_data_from_ind(ind, raw)", | |
" except ValueError as e:", | |
" print str(e)" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 144 | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"In a nutshell:", | |
"", | |
"For every trial (which is basically between a `Level: ?` and `*** LogFrame End ***`):", | |
"", | |
"- Figure out the level", | |
"- Grab the data", | |
"- If the new level is less than the old level:", | |
" - Update all the rows since we last saw this level", | |
"- Otherwise:", | |
" - Add the trial to the big list", | |
" - If the new level is greater than the current level:", | |
" - Update a \"last index\" counter for this level, taking into account how many level changes have occurred" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"last_index = {lev: 0 for lev in levels}", | |
"data = []", | |
"current_level = 0", | |
"level_down = 0", | |
"for i, row_data in enumerate(level_indices):", | |
" new_level, lev_index = row_data", | |
" if not current_level: current_level = new_level", | |
" # grab data", | |
" row_data = level_data_from_ind(lev_index, raw)", | |
" if new_level < current_level: # move up a level", | |
" # update previous data rows, beginning at the index held in last_index for new_level", | |
" for data_row in data[last_index[new_level]:]:", | |
" data_row.update(row_data)", | |
" level_down += 1", | |
" else:", | |
" data.append(row_data)", | |
" if new_level > current_level: # moving up a level", | |
" # update last_index", | |
" last_index[current_level] = i-level_down", | |
" current_level = new_level" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 151 | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"Make a DataFrame and write it out to a CSV file." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": true, | |
"input": [ | |
"df = pd.DataFrame(data)", | |
"df.to_csv('test.csv')", | |
"!open test.csv" | |
], | |
"language": "python", | |
"outputs": [], | |
"prompt_number": 150 | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"Warning: This is totally untested. Don't use this on your E-Prime data (yet)." | |
] | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment