Skip to content

Instantly share code, notes, and snippets.

@duckworthd
Created September 16, 2013 17:46
Show Gist options
  • Save duckworthd/6584027 to your computer and use it in GitHub Desktop.
Save duckworthd/6584027 to your computer and use it in GitHub Desktop.
Drop-in replacement for `sklearn.cross_validation.KFold` for sequential data.
'''
Cross validation selectors
@author: duckworthd
'''
import numpy as np
class SequentialFolds(object):
    '''
    Cut data into training and testing sets by creating a rolling time window.

    The samples are ordered by `times` and cut into `n + ratio` contiguous
    chunks of (nearly) equal size. Fold `i` tests on chunk `i + ratio` and
    trains on the `ratio` chunks immediately preceding it (or, when
    `cumulative` is true, on every chunk preceding it).

    For example, with ratio=2,

        train = [January, February], test = [March]
        train = [February, March],   test = [April]
        train = [March, April],      test = [May]
    '''

    def __init__(self, times, n=5, ratio=3, cumulative=False, indices=True):
        '''
        Parameters
        ----------
        times : array of sortable
            index by which data is sorted.
        n : integer
            number of splits desired
        ratio : integer
            number of chunks in the training window for every one chunk in
            the test window
        cumulative : boolean
            if False, the training window spans exactly `ratio` chunks. If
            True, use all samples appearing before the test samples
        indices : boolean
            False results in outputting a boolean mask, True in
            an array of integer indices
        '''
        # Positions into the original data, ordered by ascending time.
        self.index = np.argsort(times)
        self.n = n
        self.ratio = ratio
        self.indices = indices
        self.cumulative = cumulative

    def __iter__(self):
        '''
        Yields
        ------
        train : array of boolean or integers
            samples to use for training, either as a boolean mask over the
            original data or as integer indices in ascending order
        test : array of boolean or integers
            samples to use for testing, in the same form as `train`
        '''
        n_samples = len(self.index)

        # n + ratio chunks need n + ratio + 1 boundaries; truncating to int
        # keeps the first boundary at 0 and the last at n_samples.
        split_points = np.linspace(start=0,
                                   stop=n_samples,
                                   num=self.n + self.ratio + 1)
        split_points = split_points.astype(int)

        for i in range(self.n):
            train_start = 0 if self.cumulative else split_points[i]
            train_end = split_points[i + self.ratio]
            # The test window starts exactly where the training window ends.
            test_start = train_end
            test_end = split_points[i + self.ratio + 1]

            # Builtin `bool` rather than `np.bool`: the alias was removed
            # from NumPy, while `bool` works on every version.
            train = np.zeros(n_samples, dtype=bool)
            train[self.index[train_start:train_end]] = True

            test = np.zeros(n_samples, dtype=bool)
            test[self.index[test_start:test_end]] = True

            if self.indices:
                # Convert masks to ascending integer positions.
                train = np.flatnonzero(train)
                test = np.flatnonzero(test)

            yield (train, test)

    def __len__(self):
        '''Return the number of folds produced by iteration.'''
        return self.n
if __name__ == '__main__':
    # Demo of SequentialFolds: shuffle 50 distinct keys, then print the sorted
    # keys selected for training and testing in each fold, first with the
    # rolling window and then with the cumulative window.
    import random

    keys = np.arange(50)
    random.shuffle(keys)

    # Single-argument print(...) calls behave identically on Python 2 and 3.
    for title, cumulative in (('Without cumulative:', False),
                              ('With cumulative:', True)):
        print(title)
        folds = SequentialFolds(keys, n=5, ratio=2, indices=True,
                                cumulative=cumulative)
        for (train, test) in folds:
            # train/test are integer index arrays, so they index keys directly.
            print('train: ' + str(np.sort(keys[train])))
            print('test: ' + str(np.sort(keys[test])))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment