# andrewm4894/keras_lstm_multi_helper_functions.py

## keras_lstm_multi_helper_functions.py
def data_reshape_for_model(data_in,n_timesteps,n_features,print_info=True):
    ''' Function to reshape the data into model ready format, either for training or prediction.

    A window of up to n_timesteps consecutive rows is taken starting at every row of
    data_in. Each window is transposed, and only windows whose transposed shape is
    exactly (n_timesteps, n_features) are kept.

    Args:
        data_in: array exposing .shape, len() and data_in[start:stop,] slicing
            (presumably a 2-d numpy array - confirm against the caller).
        n_timesteps: number of rows taken per window; also the size of axis 1 of the output.
        n_features: size of axis 2 of the output.
        print_info: when True, print '<input shape> -> <output shape>'.

    Returns:
        Array of shape (n_windows, n_timesteps, n_features). n_windows is 0 when no
        window has the expected shape.
    '''
    # get original data shape
    data_in_shape = data_in.shape
    # NOTE(review): the window is transposed to (columns, rows) before it is compared
    # with (n_timesteps, n_features), so a window is only kept when data_in has
    # n_timesteps columns and the slice holds n_features rows - confirm this axis
    # order is the intended one.
    expected_shape = (1,n_timesteps,n_features)
    # zero-length float seed: keeps the result 3-d and float-promoted even if no window qualifies
    windows = [np.zeros((0,n_timesteps,n_features))]
    # loop through each row of data and reshape accordingly
    for row in range(len(data_in)):
        # for each row look ahead as many timesteps as needed and then transpose the data
        tmp_array = np.array([data_in[row:(row+n_timesteps),].transpose()])
        # short windows near the end of the data fail this check and are skipped
        if tmp_array.shape == expected_shape:
            windows.append(tmp_array)
    # concatenate once at the end instead of re-copying the growing output on every row
    data_out = np.concatenate(windows)
    # get output data shape
    data_out_shape = data_out.shape
    if print_info: print(f'{data_in_shape} -> {data_out_shape}')

    return data_out


def train(model,data,n_epochs=10,batch_size=50,print_info=False,callbacks=None,shuffle=False,verbose=1):
    ''' Fit the model on data, using data as both the input and the target, and return the model.

    The same (data, data) pair is also handed to fit as validation_data.
    print_info is accepted but is not used by this function.
    '''
    # collect the keyword settings passed through to fit
    fit_settings = {
        'epochs': n_epochs,
        'batch_size': batch_size,
        'validation_data': (data, data),
        'verbose': verbose,
        'shuffle': shuffle,
        'callbacks': callbacks,
    }
    # input and target are the same object
    model.fit(data, data, **fit_settings)

    return model


def predict(model,data,print_info=True):
    ''' Run model.predict on data and hand back whatever it returns.

    When print_info is True the .shape of the prediction is printed before returning.
    '''
    predictions = model.predict(data)
    # optional shape report
    if print_info:
        print(predictions.shape)

    return predictions


def model_data_to_df_long(data,n_timesteps,n_features):
    ''' Flatten a model data array into a long format dataframe, one line per (row, feature, timestep).

    Each element of data is indexed as element[f, t] for f in range(n_features) and
    t in range(n_timesteps). The result has the columns 'row', 'feature' ('f<f>'),
    'timestep' ('t<t>'), 'value' and 'label' ('f<f>_t<t>').
    '''
    # one record per row / feature / timestep, features outermost within a row
    records = [
        [row_idx, f'f{feat_idx}', f't{step_idx}', window[feat_idx, step_idx]]
        for row_idx, window in enumerate(data)
        for feat_idx in range(n_features)
        for step_idx in range(n_timesteps)
    ]
    # turn the collected records into a pandas df
    long_frame = pd.DataFrame(records, columns=['row','feature','timestep','value'])
    # label col is the key used later to pivot from long format to wide
    long_frame['label'] = long_frame['feature'] + '_' + long_frame['timestep']

    return long_frame


def model_df_long_to_wide(df_long,key_col='label'):
    ''' Function that can translate a long format model data df into a wide version of it.

    Args:
        df_long: dataframe with a 'row' column, a 'value' column and the key_col column.
        key_col: column whose values become the wide column names.

    Returns:
        Dataframe indexed by 'row' with one column per distinct key_col value.
    '''
    # select by key_col rather than a hard-coded 'label', otherwise any other key_col
    # is missing from the subset and pivot fails with a KeyError
    df_wide = df_long[['row',key_col,'value']].pivot(index='row',columns=key_col,values='value')

    return df_wide


def df_out_add_errors(df_out,n_timesteps,n_features):
    ''' Function to take in a df_out type df and add in error columns

    For every feature f<f> and timestep t<t> adds 'f<f>_t<t>_error', computed as the
    f<f> column shifted up by t+1 rows minus the 'f<f>_t<t>_yhat' column. For every
    timestep it then adds the row-wise mean, median, min, max and range of that
    timestep's error columns as 't<t>_error_avg/_med/_min/_max/_rng'.

    Args:
        df_out: dataframe holding the 'f<f>' and 'f<f>_t<t>_yhat' columns; it is
            modified in place.
        n_timesteps: number of timesteps (t0 .. t<n_timesteps-1>).
        n_features: number of features (f0 .. f<n_features-1>).

    Returns:
        The same df_out object with the error columns appended.
    '''
    f_cols = [f'f{f}' for f in range(n_features)]
    t_cols = [f't{t}' for t in range(n_timesteps)]
    # loop through to get errors; the lag for timestep t is t+1 rows
    for f_col in f_cols:
        for lag, t_col in enumerate(t_cols, start=1):
            df_out[f'{f_col}_{t_col}_error'] = df_out[f_col].shift(-lag) - df_out[f'{f_col}_{t_col}_yhat']

    # get summary error metrics by timestep across all features
    for t_col in t_cols:
        # take the exact per-feature error columns once, before any summary column exists;
        # matching on the substring '<t>_error' would also pick up '<t>_error_avg' etc.
        # and skew the median
        errors = df_out[[f'{f_col}_{t_col}_error' for f_col in f_cols]]
        error_min = errors.min(axis=1)
        error_max = errors.max(axis=1)
        df_out[f'{t_col}_error_avg'] = errors.mean(axis=1)
        df_out[f'{t_col}_error_med'] = errors.median(axis=1)
        df_out[f'{t_col}_error_min'] = error_min
        df_out[f'{t_col}_error_max'] = error_max
        df_out[f'{t_col}_error_rng'] = error_max - error_min

    return df_out


def yhat_to_df_out(data_train,yhat,n_timesteps,n_features):
    ''' Combine the training data array and the yhat prediction array into the final df_out.

    Both arrays go through model_data_to_df_long and model_df_long_to_wide. The
    training columns for the last timestep are kept and renamed to the part of their
    label before the first '_', the prediction columns get a '_yhat' suffix, and the
    error columns are added by df_out_add_errors.
    '''
    # long then wide versions of both inputs
    train_long = model_data_to_df_long(data_train,n_timesteps,n_features)
    yhat_long = model_data_to_df_long(yhat,n_timesteps,n_features)
    train_wide = model_df_long_to_wide(train_long)
    yhat_wide = model_df_long_to_wide(yhat_long)
    # mark every prediction column as such
    yhat_wide.columns = [f'{label}_yhat' for label in yhat_wide.columns]

    # keep only the training columns belonging to the latest timestep
    latest_tag = f't{n_timesteps-1}'
    latest_cols = [label for label in train_wide.columns if latest_tag in label]
    observed = train_wide[latest_cols]
    # reduce 'f<f>_t<t>' to just 'f<f>'
    observed.columns = [label.split('_')[0] for label in observed.columns]

    # observed values side by side with the predictions
    combined = pd.concat([observed,yhat_wide],axis=1)

    # finish by adding the error cols
    return df_out_add_errors(combined,n_timesteps,n_features)
	def data_reshape_for_model(data_in,n_timesteps,n_features,print_info=True):
		''' Function to reshape the data into model ready format, either for training or prediction.

		A window of up to n_timesteps consecutive rows is taken starting at every row of
		data_in. Each window is transposed, and only windows whose transposed shape is
		exactly (n_timesteps, n_features) are kept.

		Args:
			data_in: array exposing .shape, len() and data_in[start:stop,] slicing
				(presumably a 2-d numpy array - confirm against the caller).
			n_timesteps: number of rows taken per window; also the size of axis 1 of the output.
			n_features: size of axis 2 of the output.
			print_info: when True, print '<input shape> -> <output shape>'.

		Returns:
			Array of shape (n_windows, n_timesteps, n_features). n_windows is 0 when no
			window has the expected shape.
		'''
		# get original data shape
		data_in_shape = data_in.shape
		# NOTE(review): the window is transposed to (columns, rows) before it is compared
		# with (n_timesteps, n_features) - confirm this axis order is the intended one.
		expected_shape = (1,n_timesteps,n_features)
		# zero-length float seed: keeps the result 3-d and float-promoted even if no window qualifies
		windows = [np.zeros((0,n_timesteps,n_features))]
		# loop through each row of data and reshape accordingly
		for row in range(len(data_in)):
			# for each row look ahead as many timesteps as needed and then transpose the data
			tmp_array = np.array([data_in[row:(row+n_timesteps),].transpose()])
			# short windows near the end of the data fail this check and are skipped
			if tmp_array.shape == expected_shape:
				windows.append(tmp_array)
		# concatenate once at the end instead of re-copying the growing output on every row
		data_out = np.concatenate(windows)
		# get output data shape
		data_out_shape = data_out.shape
		if print_info: print(f'{data_in_shape} -> {data_out_shape}')

		return data_out


	def train(model,data,n_epochs=10,batch_size=50,print_info=False,callbacks=None,shuffle=False,verbose=1):
		''' Function to take in model and data and train the model using defined params.

		data is passed to model.fit as both the input and the target, and the same
		(data, data) pair is used as validation_data. print_info is accepted but is not
		used by this function. Returns the model.
		'''
		# fit the model to the data
		model.fit(data, data, epochs=n_epochs, batch_size=batch_size,
				  validation_data=(data, data), verbose=verbose, shuffle=shuffle,
				  callbacks=callbacks)

		return model


	def predict(model,data,print_info=True):
		''' Function to take in model and data and return predictions in model data format.

		Calls model.predict(data) and returns the result unchanged. When print_info is
		True the .shape of the prediction is printed first.
		'''
		# get prediction from model
		yhat = model.predict(data)
		if print_info:
			print(yhat.shape)

		return yhat


	def model_data_to_df_long(data,n_timesteps,n_features):
		''' Function to take model data numpy array and translate it into a long format dataframe.

		Each element of data is indexed as row[f, t] for f in range(n_features) and
		t in range(n_timesteps). The result has the columns 'row', 'feature' ('f<f>'),
		'timestep' ('t<t>'), 'value' and 'label' ('f<f>_t<t>').
		'''
		# define empty list to collect data into
		data_tmp = []
		# for each row in the data
		for r, row in enumerate(data):
			# for each feature in each row
			for f in range(n_features):
				# for each timestep of each feature in each row
				for t in range(n_timesteps):
					# add an element to the list decoding what it represents
					data_tmp.append([r,f'f{f}',f't{t}',row[f,t]])
		# now use the collected data to create a pandas df
		df_long = pd.DataFrame(data_tmp,columns=['row','feature','timestep','value'])
		# add a label col that can be used to go from long format to wide
		df_long['label'] = df_long['feature'] + '_' + df_long['timestep']

		return df_long


	def model_df_long_to_wide(df_long,key_col='label'):
		''' Function that can translate a long format model data df into a wide version of it.

		Args:
			df_long: dataframe with a 'row' column, a 'value' column and the key_col column.
			key_col: column whose values become the wide column names.

		Returns:
			Dataframe indexed by 'row' with one column per distinct key_col value.
		'''
		# select by key_col rather than a hard-coded 'label', otherwise any other key_col
		# is missing from the subset and pivot fails with a KeyError
		df_wide = df_long[['row',key_col,'value']].pivot(index='row',columns=key_col,values='value')

		return df_wide


	def df_out_add_errors(df_out,n_timesteps,n_features):
		''' Function to take in a df_out type df and add in error columns

		For every feature f<f> and timestep t<t> adds 'f<f>_t<t>_error', computed as the
		f<f> column shifted up by t+1 rows minus the 'f<f>_t<t>_yhat' column. For every
		timestep it then adds the row-wise mean, median, min, max and range of that
		timestep's error columns as 't<t>_error_avg/_med/_min/_max/_rng'.

		df_out is modified in place and also returned.
		'''
		f_cols = [f'f{f}' for f in range(n_features)]
		t_cols = [f't{t}' for t in range(n_timesteps)]
		# loop through to get errors; the lag for timestep t is t+1 rows
		for f_col in f_cols:
			for lag, t_col in enumerate(t_cols, start=1):
				df_out[f'{f_col}_{t_col}_error'] = df_out[f_col].shift(-lag) - df_out[f'{f_col}_{t_col}_yhat']

		# get summary error metrics by timestep across all features
		for t_col in t_cols:
			# take the exact per-feature error columns once, before any summary column exists;
			# matching on the substring '<t>_error' would also pick up '<t>_error_avg' etc.
			# and skew the median
			errors = df_out[[f'{f_col}_{t_col}_error' for f_col in f_cols]]
			error_min = errors.min(axis=1)
			error_max = errors.max(axis=1)
			df_out[f'{t_col}_error_avg'] = errors.mean(axis=1)
			df_out[f'{t_col}_error_med'] = errors.median(axis=1)
			df_out[f'{t_col}_error_min'] = error_min
			df_out[f'{t_col}_error_max'] = error_max
			df_out[f'{t_col}_error_rng'] = error_max - error_min

		return df_out


	def yhat_to_df_out(data_train,yhat,n_timesteps,n_features):
		''' Function to take data train and yhat prediction output array from model and turn it into final form of df_out.

		Both arrays go through model_data_to_df_long and model_df_long_to_wide. The
		training columns for the last timestep are kept and renamed to the part of their
		label before the first '_', the prediction columns get a '_yhat' suffix, and the
		error columns are added by df_out_add_errors.
		'''

		# helper df's
		df_train_long = model_data_to_df_long(data_train,n_timesteps,n_features)
		df_yhat_long = model_data_to_df_long(yhat,n_timesteps,n_features)
		df_train_wide = model_df_long_to_wide(df_train_long)
		df_yhat_wide = model_df_long_to_wide(df_yhat_long)
		df_yhat_wide.columns = [f'{col}_yhat' for col in df_yhat_wide.columns]

		# begin process to collect final data frame

		# make df_out from the training columns of the latest timestep
		train_cols_latest = [col for col in df_train_wide.columns if f't{n_timesteps-1}' in col]
		df_out = df_train_wide[train_cols_latest]
		# clean up col names: 'f<f>_t<t>' becomes 'f<f>'
		df_out.columns = [col.split('_')[0] for col in df_out.columns]
		# now concat train cols and cols from df_yhat_wide
		df_out = pd.concat([df_out,df_yhat_wide],axis=1)

		# add in error cols
		df_out = df_out_add_errors(df_out,n_timesteps,n_features)

		return df_out