Skip to content

Instantly share code, notes, and snippets.

@ragulpr
Created February 23, 2017 22:44
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save ragulpr/9c4eab7b1e48063f0797d0220d0f6f58 to your computer and use it in GitHub Desktop.
Save ragulpr/9c4eab7b1e48063f0797d0220d0f6f58 to your computer and use it in GitHub Desktop.
def load_challenge_data(df, start_at, truncate_at):
    """Reshape flat per-timestep run-to-failure data into padded 3d tensors.

    Args:
        df: 2d array, one row per timestep. Column 0 is the unit (engine)
            number (1-based), column 1 is the time step, remaining columns
            are feature values.
        start_at: first timestep (inclusive) kept after padding.
        truncate_at: end timestep (exclusive) kept after padding.

    Returns:
        feature_data: [truncate_at-start_at, n_series, n_vars] array,
            zero-padded past each series' end.
        times_to_event: same leading dims, single channel counting down the
            remaining steps to failure (last observed step = 0).
        seq_lengths: int array [n_series]; clipped to truncate_at, then
            shifted down by start_at.
        mask: [truncate_at-start_at, n_series, 1]; 1 = real data, 0 = padding.
        n_series: number of distinct units.
        n_vars: number of feature columns.
    """
    # Pad time axis to at least truncate_at so the slicing below never under-runs.
    seq_len = np.max([truncate_at, df[:, 1].max().astype(int) + 1])
    n_vars = df.shape[1] - 2  # drop unit_number and time columns
    n_series = int(df[:, 0].max())
    feature_data = np.zeros([seq_len, n_series, n_vars])
    times_to_event = np.zeros([seq_len, n_series, 1])
    seq_lengths = np.zeros([n_series])
    mask = np.ones([seq_len, n_series, 1])
    for s in range(n_series):  # range: py3-compatible (was py2-only xrange)
        # floor() tolerates fractional unit ids -- TODO confirm they occur upstream.
        this_seq = df[np.floor(df[:, 0]) == s + 1, 2:]
        this_seq_len = this_seq.shape[0]
        feature_data[0:this_seq_len, s, :] = this_seq
        seq_lengths[s] = this_seq_len
        mask[this_seq_len:, s, 0] = 0
        # Countdown to failure: first step has this_seq_len-1 remaining, last has 0.
        times_to_event[0:this_seq_len, s, 0] = np.linspace(this_seq_len - 1, 0, this_seq_len)
    seq_lengths = seq_lengths.astype(int)
    feature_data = feature_data[start_at:truncate_at, :, :]
    times_to_event = times_to_event[start_at:truncate_at, :, :]
    seq_lengths[truncate_at < seq_lengths] = truncate_at
    seq_lengths = seq_lengths - start_at
    mask = mask[start_at:truncate_at, :, :]
    # (removed two dead no-op expression statements: bare `n_series` / `n_vars`)
    return feature_data, times_to_event, seq_lengths, mask, n_series, n_vars
def make_one_array(dflist):
    """Concatenate several datasets into one array with non-colliding unit ids.

    Each array's unit-number column (column 0) is shifted by the running
    maximum of the preceding array's (already shifted) unit numbers, so ids
    stay globally unique after concatenation.

    NOTE: mutates the arrays inside dflist in place (their column 0 changes).

    Args:
        dflist: list of 2d arrays whose column 0 holds unit numbers.

    Returns:
        A single array: np.concatenate of the (shifted) inputs along axis 0.
    """
    for i in range(1, len(dflist)):  # range: py3-compatible (was py2-only xrange)
        dflist[i][:, 0] = dflist[i - 1][:, 0].max() + dflist[i][:, 0]
    return np.concatenate(dflist)
def get_normalization_coef(feature_data, mask):
    """Per-feature mean and std over unmasked (non-padding) timesteps only.

    Args:
        feature_data: [time, series, n_vars] array.
        mask: [time, series, 1] array of 0/1; 1 marks real observations.

    Returns:
        (mean, std): each of shape [n_vars], computed over the rows
        where mask == 1.
    """
    n_vars = feature_data.shape[2]
    # -1 lets numpy infer the flattened length. (The original used -2, which
    # numpy happens to treat like -1, but -1 is the documented form.)
    flat = feature_data.reshape((-1, n_vars))
    keep = mask.reshape((-1, mask.shape[2])).flatten() == 1
    observed = flat[keep, :]
    return np.mean(observed, 0), np.std(observed, 0)
# Data: https://github.com/hankroark/Turbofan-Engine-Degradation/tree/master/CMAPSSData
path = '/CMAPSSData/'

# Use only the datasets with a mixture of six different flight conditions
# (FD002/FD004); FD001/FD003 (single condition) are deliberately excluded.
train = [
    np.loadtxt(path + 'train_FD002.txt'),
    np.loadtxt(path + 'train_FD004.txt'),
]
# RUL_FD00x.txt and test_FD00x.txt are intentionally unused: the 'test' split
# seems to come from another distribution than the training data.

# Merge the datasets so unit numbers stay globally unique.
train = make_one_array(train)

truncate_at = 382  # keep at most the first 382 timesteps of each engine...
start_at = 128     # ...then drop the first 128 of those
n = seq_len = truncate_at - start_at

# Seed both RNGs for a reproducible shuffle/split.
random.seed(0)
np.random.seed(0)

feature_data, times_to_event, seq_lengths, mask, n_series, n_vars = \
    load_challenge_data(train, start_at, truncate_at)

# Shuffle the series, then the first third becomes the held-out test set.
new_idx = random.sample(range(n_series), n_series)  # range: py3 (was xrange)
test_set_size = n_series // 3  # explicit integer division (py2 `/` on ints)
feature_data = np.copy(feature_data[:, new_idx, :])
times_to_event = np.copy(times_to_event[:, new_idx, :])
seq_lengths = np.copy(seq_lengths[new_idx])
mask = np.copy(mask[:, new_idx, :])

# Normalization stats from the training portion only -- no test-set leakage.
mean_normalize, std_normalize = get_normalization_coef(
    feature_data[:, test_set_size:, :], mask[:, test_set_size:, :])
del train  # release the raw merged array

# Normalize; multiplying by mask keeps padded timesteps at exactly 0.
feature_data = mask * (feature_data - mean_normalize) / std_normalize
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment