# liannewriting/define_ts_function_time_series.py

## define_ts_function_time_series.py
# Goal of the model:
#  Predict Global_active_power at a specified time in the future.
#   E.g., we want to predict what Global_active_power will be ten minutes from now.
#       We can use the values at t-1, t-2, t-3, ..., t-history_length to predict the value at t+10.


def create_ts_files(dataset,
                    start_index,
                    end_index,
                    history_length,
                    step_size,
                    target_step,
                    num_rows_per_file,
                    data_folder):
    """Cut a series into lagged-history rows and pickle them in chunks.

    For each timestep ``j`` in ``[start_index + history_length, end_index)``
    one row is produced.  Its features are the values at positions
    ``j-1, j-1-step_size, ...`` (never earlier than ``j-history_length``),
    ordered oldest first, and its last entry is the target value at
    position ``j + target_step``.  Rows are grouped into DataFrames of at
    most ``num_rows_per_file`` rows and written to
    ``{data_folder}/ts_file{i}.pkl``.

    Args:
        dataset: Series to slice.  It is indexed with a list of integer
            positions, so presumably a 1-D NumPy array -- confirm with
            the caller.
        start_index: Position of the earliest value that may appear in a
            history window; must be >= 0.
        end_index: Exclusive upper bound for ``j``.  ``None`` means
            ``len(dataset) - target_step``, i.e. stop at the last ``j``
            that still has a target.
        history_length: How far back (in positions) a history window reaches.
        step_size: Spacing between sampled history positions; must be > 0.
        target_step: Offset from ``j`` to the target position.
        num_rows_per_file: Maximum number of rows per pickle file; must
            be > 0.
        data_folder: Output directory; created if it does not exist.

    Returns:
        The number of feature columns per row (the ``y`` column excluded).
    """
    assert step_size > 0
    assert start_index >= 0
    assert num_rows_per_file > 0

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(data_folder, exist_ok=True)

    # Each feature column is named after its distance to the target position.
    time_lags = sorted(range(target_step+1, target_step+history_length+1, step_size), reverse=True)
    col_names = [f'x_lag{i}' for i in time_lags] + ['y']

    # Positions of one row relative to j (history oldest first, then the
    # target).  They do not depend on j, so they are computed only once.
    row_offsets = sorted(range(-1, -history_length - 1, -step_size)) + [target_step]

    # The first usable j needs a full history window behind it.
    start_index = start_index + history_length
    if end_index is None:
        end_index = len(dataset) - target_step

    num_rows = len(range(start_index, end_index))
    num_files = math.ceil(num_rows / num_rows_per_file)

    # for each file.
    print(f'Creating {num_files} files.')
    for i in range(num_files):
        filename = f'{data_folder}/ts_file{i}.pkl'

        # Progress output for every tenth file only.
        if i % 10 == 0:
            print(f'{filename}')

        # get the start and end indices; the last file may be shorter.
        ind0 = i * num_rows_per_file + start_index
        ind1 = min(ind0 + num_rows_per_file, end_index)

        # j is the current timestep: j-history_length .. j-1 supply the
        # history and j + target_step supplies the target.
        data_list = [dataset[[j + offset for offset in row_offsets]]
                     for j in range(ind0, ind1)]

        df_ts = pd.DataFrame(data=data_list, columns=col_names)
        df_ts.to_pickle(filename)

    return len(col_names) - 1
	# Goal of the model:
	# Predict Global_active_power at a specified time in the future.
	# E.g., we want to predict what Global_active_power will be ten minutes from now.
	# We can use the values at t-1, t-2, t-3, ..., t-history_length to predict the value at t+10.


	def create_ts_files(dataset,
			start_index,
			end_index,
			history_length,
			step_size,
			target_step,
			num_rows_per_file,
			data_folder):
		"""Cut a series into lagged-history rows and pickle them in chunks.

		For each timestep ``j`` in ``[start_index + history_length, end_index)``
		one row is produced.  Its features are the values at positions
		``j-1, j-1-step_size, ...`` (never earlier than ``j-history_length``),
		ordered oldest first, and its last entry is the target value at
		position ``j + target_step``.  Rows are grouped into DataFrames of at
		most ``num_rows_per_file`` rows and written to
		``{data_folder}/ts_file{i}.pkl``.

		Args:
			dataset: Series to slice.  It is indexed with a list of integer
				positions, so presumably a 1-D NumPy array -- confirm with
				the caller.
			start_index: Position of the earliest value that may appear in a
				history window; must be >= 0.
			end_index: Exclusive upper bound for ``j``.  ``None`` means
				``len(dataset) - target_step``, i.e. stop at the last ``j``
				that still has a target.
			history_length: How far back (in positions) a history window reaches.
			step_size: Spacing between sampled history positions; must be > 0.
			target_step: Offset from ``j`` to the target position.
			num_rows_per_file: Maximum number of rows per pickle file; must
				be > 0.
			data_folder: Output directory; created if it does not exist.

		Returns:
			The number of feature columns per row (the ``y`` column excluded).
		"""
		assert step_size > 0
		assert start_index >= 0
		assert num_rows_per_file > 0

		# exist_ok avoids the check-then-create race of os.path.exists + makedirs.
		os.makedirs(data_folder, exist_ok=True)

		# Each feature column is named after its distance to the target position.
		time_lags = sorted(range(target_step+1, target_step+history_length+1, step_size), reverse=True)
		col_names = [f'x_lag{i}' for i in time_lags] + ['y']

		# Positions of one row relative to j (history oldest first, then the
		# target).  They do not depend on j, so they are computed only once.
		row_offsets = sorted(range(-1, -history_length - 1, -step_size)) + [target_step]

		# The first usable j needs a full history window behind it.
		start_index = start_index + history_length
		if end_index is None:
			end_index = len(dataset) - target_step

		num_rows = len(range(start_index, end_index))
		num_files = math.ceil(num_rows / num_rows_per_file)

		# for each file.
		print(f'Creating {num_files} files.')
		for i in range(num_files):
			filename = f'{data_folder}/ts_file{i}.pkl'

			# Progress output for every tenth file only.
			if i % 10 == 0:
				print(f'{filename}')

			# get the start and end indices; the last file may be shorter.
			ind0 = i * num_rows_per_file + start_index
			ind1 = min(ind0 + num_rows_per_file, end_index)

			# j is the current timestep: j-history_length .. j-1 supply the
			# history and j + target_step supplies the target.
			data_list = [
				dataset[[j + offset for offset in row_offsets]]
				for j in range(ind0, ind1)
			]

			df_ts = pd.DataFrame(data=data_list, columns=col_names)
			df_ts.to_pickle(filename)

		return len(col_names) - 1