josef-pkt/ci_within.py

## ci_within.py
# Author Denis A. Engemann <d.engemann@gmail.com>
# Adjustments: Josef Perktold
#
# License: BSD (3-clause)

import numpy as np
import scipy.stats
import pandas as pd

def ci_within(df, indexvar, withinvars, measvar, confint=0.95,
                      copy=True):
    """ Compute CI / SEM correction factor

    Morey 2008, Cousinaueu 2005, Loftus & Masson, 1994
    Also see R-cookbook http://goo.gl/QdwJl

    Note. This functions helps to generate appropriate confidence
    intervals for repeated measure designs.
    Standard confidence intervals are are computed on normalized data
    and a correction factor is applied that prevents insanely small values.

    df : instance of pandas.DataFrame
        The data frame objetct.
    indexvar : str
        The column name of of the identifier variable that representing
        subjects or repeated measures
    withinvars : str | list of str
        The column names of the categorial data identifying random effects
    measvar : str
        The column name of the response measure
    confint : float
        The confidence interval
    copy : bool
        Whether to copy the data frame or not.
    """
    if copy:
        df = df.copy()

    # Apply Cousinaueu's method:
    # compute grand mean
    mean_ = df[measvar].mean()

    # compute subject means
    subj_means = df.groupby(indexvar)[measvar].mean().values
    for subj, smean_ in zip(df[indexvar].unique(), subj_means):
        # center
        df[measvar][df[indexvar] == subj] -= smean_
        # add grand average
        df[measvar][df[indexvar] == subj] += mean_

    def sem(x):
        return x.std() / np.sqrt(len(x))

    def ci(x):
        se = sem(x)
        return se * scipy.stats.t.interval(confint, len(x - 1))[1]

    aggfuncs = [np.mean, np.std, sem, ci, len]
    out = df.groupby(withinvars)[measvar].agg(aggfuncs)

    # compute & apply correction factor
    n_within = np.prod([len(df[k].unique()) for k in withinvars],
                       dtype= df[measvar].dtype)
    cf = np.sqrt(n_within / (n_within - 1.))
    for k in ['sem', 'std', 'ci']:
        out[k] *= cf

    out['ci'] = stats.t.isf((1 - confint) / 2., out['len'] - 1) * out['sem']

    return out

ss = '''
 subject condition value
       1   pretest  59.4
       2   pretest  46.4
       3   pretest  46.0
       4   pretest  49.0
       5   pretest  32.5
       6   pretest  45.2
       7   pretest  60.3
       8   pretest  54.3
       9   pretest  45.4
      10   pretest  38.9
       1  posttest  64.5
       2  posttest  52.4
       3  posttest  49.7
       4  posttest  48.7
       5  posttest  37.4
       6  posttest  49.5
       7  posttest  59.9
       8  posttest  54.1
       9  posttest  49.6
      10  posttest  48.5'''

import StringIO
df = pd.read_fwf(StringIO.StringIO(ss), widths=[8, 10, 6], header=1)
res = ci_within(df2, 'subject', ['condition'], 'value', confint=0.95)
print res
print res[['len', 'mean', 'std', 'sem', 'ci']]

#ci is different from R
#http://www.cookbook-r.com/Graphs/Plotting_means_and_error_bars_%28ggplot2%29/#error-bars-for-within-subjects-variables

#dfwc <- summarySEwithin(dfw.long, measurevar="value", withinvars="condition",
#                        idvar="subject", na.rm=FALSE, conf.interval=.95)
# condition  N value value_norm       sd        se       ci
#  posttest 10 51.43      51.43 2.262361 0.7154214 1.618396
#   pretest 10 47.74      47.74 2.262361 0.7154214 1.618396
	# Author Denis A. Engemann <d.engemann@gmail.com>
	# Adjustments: Josef Perktold
	#
	# License: BSD (3-clause)

	import numpy as np
	import scipy.stats
	import pandas as pd

	def ci_within(df, indexvar, withinvars, measvar, confint=0.95,
	copy=True):
	""" Compute CI / SEM correction factor

	Morey 2008, Cousinaueu 2005, Loftus & Masson, 1994
	Also see R-cookbook http://goo.gl/QdwJl

	Note. This functions helps to generate appropriate confidence
	intervals for repeated measure designs.
	Standard confidence intervals are are computed on normalized data
	and a correction factor is applied that prevents insanely small values.

	df : instance of pandas.DataFrame
	The data frame objetct.
	indexvar : str
	The column name of of the identifier variable that representing
	subjects or repeated measures
	withinvars : str \| list of str
	The column names of the categorial data identifying random effects
	measvar : str
	The column name of the response measure
	confint : float
	The confidence interval
	copy : bool
	Whether to copy the data frame or not.
	"""
	if copy:
	df = df.copy()

	# Apply Cousinaueu's method:
	# compute grand mean
	mean_ = df[measvar].mean()

	# compute subject means
	subj_means = df.groupby(indexvar)[measvar].mean().values
	for subj, smean_ in zip(df[indexvar].unique(), subj_means):
	# center
	df[measvar][df[indexvar] == subj] -= smean_
	# add grand average
	df[measvar][df[indexvar] == subj] += mean_

	def sem(x):
	return x.std() / np.sqrt(len(x))

	def ci(x):
	se = sem(x)
	return se * scipy.stats.t.interval(confint, len(x - 1))[1]

	aggfuncs = [np.mean, np.std, sem, ci, len]
	out = df.groupby(withinvars)[measvar].agg(aggfuncs)

	# compute & apply correction factor
	n_within = np.prod([len(df[k].unique()) for k in withinvars],
	dtype= df[measvar].dtype)
	cf = np.sqrt(n_within / (n_within - 1.))
	for k in ['sem', 'std', 'ci']:
	out[k] *= cf

	out['ci'] = stats.t.isf((1 - confint) / 2., out['len'] - 1) * out['sem']

	return out

	ss = '''
	subject condition value
	1 pretest 59.4
	2 pretest 46.4
	3 pretest 46.0
	4 pretest 49.0
	5 pretest 32.5
	6 pretest 45.2
	7 pretest 60.3
	8 pretest 54.3
	9 pretest 45.4
	10 pretest 38.9
	1 posttest 64.5
	2 posttest 52.4
	3 posttest 49.7
	4 posttest 48.7
	5 posttest 37.4
	6 posttest 49.5
	7 posttest 59.9
	8 posttest 54.1
	9 posttest 49.6
	10 posttest 48.5'''

	import StringIO
	df = pd.read_fwf(StringIO.StringIO(ss), widths=[8, 10, 6], header=1)
	res = ci_within(df2, 'subject', ['condition'], 'value', confint=0.95)
	print res
	print res[['len', 'mean', 'std', 'sem', 'ci']]

	#ci is different from R
	#http://www.cookbook-r.com/Graphs/Plotting_means_and_error_bars_%28ggplot2%29/#error-bars-for-within-subjects-variables

	#dfwc <- summarySEwithin(dfw.long, measurevar="value", withinvars="condition",
	# idvar="subject", na.rm=FALSE, conf.interval=.95)
	# condition N value value_norm sd se ci
	# posttest 10 51.43 51.43 2.262361 0.7154214 1.618396
	# pretest 10 47.74 47.74 2.262361 0.7154214 1.618396