# royerk/fill_set_points_2.py

## fill_set_points_2.py
import pandas as pd
import numpy as np

# Sample input: one device ("id_0") over five consecutive days, with three
# set-point columns ("0", "1", "2") that contain gaps to be filled.
df = pd.DataFrame(
    data=[
        ["id_0", "2023-11-01", np.nan, 1, 0.5],
        ["id_0", "2023-11-02", 0.5, 2, np.nan],
        # device in baseline
        ["id_0", "2023-11-03", np.nan, np.nan, np.nan],
        # device in baseline
        ["id_0", "2023-11-04", np.nan, np.nan, np.nan],
        ["id_0", "2023-11-05", np.nan, 1.0, np.nan],
    ],
    columns=["id", "date", "0", "1", "2"],
).assign(date=lambda frame: pd.to_datetime(frame["date"]))  # parse date strings

print(df)

# Hand-written result the fill is expected to produce for the sample input.
expected_df = pd.DataFrame(
    data=[
        ["id_0", "2023-11-01", np.nan, 1, 0.5],
        ["id_0", "2023-11-02", 0.5, 2.0, 2.0],
        # device in baseline
        ["id_0", "2023-11-03", 2.0, 2.0, 2.0],
        # device in baseline
        ["id_0", "2023-11-04", 2.0, 2.0, 2.0],
        ["id_0", "2023-11-05", 2.0, 1.0, 1.0],
    ],
    columns=["id", "date", "0", "1", "2"],
).assign(date=lambda frame: pd.to_datetime(frame["date"]))  # parse date strings


def find_last_value(row, columns):
    """Return the last non-null value of ``row`` among ``columns``.

    Args:
        row: A pandas Series indexed by column label (one DataFrame row).
        columns: Ordered labels to inspect; the right-most one holding a
            non-null value wins.

    Returns:
        The right-most non-null value, or ``np.nan`` when every inspected
        entry is null.
    """
    non_null_values = row[columns].dropna()
    if non_null_values.empty:
        return np.nan
    # Positional access must be explicit: ``series[-1]`` on a Series with
    # string labels is deprecated as a positional lookup and is treated as a
    # label lookup by newer pandas.
    return non_null_values.iloc[-1]


def fill_row_set_points(row, columns):
    """Forward-fill the null entries of ``row`` across ``columns``, in place.

    The carried value starts as ``row["first_value"]`` and is replaced by each
    non-null entry met while walking ``columns`` in order, so a null entry
    receives the closest non-null value to its left, or ``first_value`` when
    there is none.

    Args:
        row: Label-indexed row holding a ``"first_value"`` entry and every
            label in ``columns``.
        columns: Labels to walk, in order.

    Returns:
        The same ``row`` object after modification.
    """
    carried = row["first_value"]

    for label in columns:
        current = row[label]
        if not pd.isnull(current):
            # A real value: keep it and carry it to the right.
            carried = current
            continue
        row[label] = carried
    return row


def fill_df_set_points(df, columns):
    """Run one fill pass over the set-point columns of ``df``.

    For every row the last non-null value among ``columns`` is recorded and
    joined onto the row with the same ``id`` whose ``date`` equals that row's
    ``next_date``. Each row is then forward-filled across ``columns``, seeded
    with the joined value.

    Args:
        df: DataFrame with ``id``, ``date`` and ``next_date`` columns plus
            every label in ``columns``. It is left untouched.
        columns: Ordered labels of the set-point columns.

    Returns:
        A new DataFrame with ``id`` and ``date`` as leading columns and one
        pass of filling applied.
    """
    # Copy first: the steps below add a helper column and move id/date into
    # the index, which would otherwise leak into the caller's frame.
    df = df.copy()
    df["last_value"] = df.apply(lambda x: find_last_value(x, columns), axis=1)

    # Re-key each row's last value by (id, next_date) so that a join on
    # (id, date) hands it to the row dated ``next_date``.
    df_join = (
        df[["id", "next_date", "last_value"]]
        .rename(columns={"next_date": "date", "last_value": "first_value"})
        .set_index(["id", "date"])
    )

    df = df.drop(columns=["last_value"]).set_index(["id", "date"])

    # Left join: rows without a matching predecessor get a null first_value.
    df = pd.merge(df, df_join, how="left", left_index=True, right_index=True)
    df = df.reset_index()

    df = df.apply(fill_row_set_points, axis=1, columns=columns)

    return df.drop(columns=["first_value"])


def populate_set_points(df, columns=None):
    """Fill missing set points until a pass no longer fills anything.

    Fill passes are repeated until the number of null entries in ``columns``
    stops changing. ``"iteration N"`` is printed before each pass.

    Args:
        df: DataFrame with ``id`` and ``date`` columns plus every label in
            ``columns``. It is left untouched.
        columns: Ordered labels of the set-point columns. Defaults to
            ``["0", "1", "2"]``.

    Returns:
        A new DataFrame with the set-point columns filled.
    """
    if columns is None:
        columns = ["0", "1", "2"]  # todo: get columns from df

    # assign() builds a new frame, so the helper column never appears on the
    # caller's DataFrame.
    df = df.assign(
        next_date=df["date"].apply(lambda x: x + pd.DateOffset(days=1))
    )

    missing_values = -1
    iteration = 0
    while True:
        current_missing = df[columns].isnull().sum().sum()
        if current_missing == missing_values:
            # The previous pass filled nothing: a fixed point is reached.
            break

        print(f"iteration {iteration}")
        iteration += 1

        missing_values = current_missing
        # ``columns`` is a required argument of fill_df_set_points.
        df = fill_df_set_points(df, columns)

    return df.drop(columns=["next_date"])


# Run the fill on the sample frame; the name ``df`` is rebound to the result.
df = populate_set_points(df)

print(df)

# Raises AssertionError if the filled frame differs from the expected one.
pd.testing.assert_frame_equal(df, expected_df)
	import pandas as pd
	import numpy as np

	# Sample input: one device ("id_0") over five consecutive days, with three
	# set-point columns ("0", "1", "2") that contain gaps to be filled.
	df = pd.DataFrame(
		data=[
			["id_0", "2023-11-01", np.nan, 1, 0.5],
			["id_0", "2023-11-02", 0.5, 2, np.nan],
			# device in baseline
			["id_0", "2023-11-03", np.nan, np.nan, np.nan],
			# device in baseline
			["id_0", "2023-11-04", np.nan, np.nan, np.nan],
			["id_0", "2023-11-05", np.nan, 1.0, np.nan],
		],
		columns=["id", "date", "0", "1", "2"],
	).assign(date=lambda frame: pd.to_datetime(frame["date"]))  # parse date strings

	print(df)

	# Hand-written result the fill is expected to produce for the sample input.
	expected_df = pd.DataFrame(
		data=[
			["id_0", "2023-11-01", np.nan, 1, 0.5],
			["id_0", "2023-11-02", 0.5, 2.0, 2.0],
			# device in baseline
			["id_0", "2023-11-03", 2.0, 2.0, 2.0],
			# device in baseline
			["id_0", "2023-11-04", 2.0, 2.0, 2.0],
			["id_0", "2023-11-05", 2.0, 1.0, 1.0],
		],
		columns=["id", "date", "0", "1", "2"],
	).assign(date=lambda frame: pd.to_datetime(frame["date"]))  # parse date strings


	def find_last_value(row, columns):
		"""Return the last non-null value of ``row`` among ``columns``.

		Args:
			row: A pandas Series indexed by column label (one DataFrame row).
			columns: Ordered labels to inspect; the right-most one holding a
				non-null value wins.

		Returns:
			The right-most non-null value, or ``np.nan`` when every inspected
			entry is null.
		"""
		non_null_values = row[columns].dropna()
		if non_null_values.empty:
			return np.nan
		# Positional access must be explicit: ``series[-1]`` on a Series with
		# string labels is deprecated as a positional lookup and is treated as
		# a label lookup by newer pandas.
		return non_null_values.iloc[-1]


	def fill_row_set_points(row, columns):
		"""Forward-fill the null entries of ``row`` across ``columns``, in place.

		The carried value starts as ``row["first_value"]`` and is replaced by
		each non-null entry met while walking ``columns`` in order, so a null
		entry receives the closest non-null value to its left, or
		``first_value`` when there is none.

		Args:
			row: Label-indexed row holding a ``"first_value"`` entry and every
				label in ``columns``.
			columns: Labels to walk, in order.

		Returns:
			The same ``row`` object after modification.
		"""
		last_value = row["first_value"]

		for i in columns:
			if pd.isnull(row[i]):
				row[i] = last_value
			else:
				# A real value: keep it and carry it to the right.
				last_value = row[i]
		return row


	def fill_df_set_points(df, columns):
		"""Run one fill pass over the set-point columns of ``df``.

		For every row the last non-null value among ``columns`` is recorded
		and joined onto the row with the same ``id`` whose ``date`` equals
		that row's ``next_date``. Each row is then forward-filled across
		``columns``, seeded with the joined value.

		Args:
			df: DataFrame with ``id``, ``date`` and ``next_date`` columns plus
				every label in ``columns``. It is left untouched.
			columns: Ordered labels of the set-point columns.

		Returns:
			A new DataFrame with ``id`` and ``date`` as leading columns and
			one pass of filling applied.
		"""
		# Copy first: the steps below add a helper column and move id/date
		# into the index, which would otherwise leak into the caller's frame.
		df = df.copy()
		df["last_value"] = df.apply(lambda x: find_last_value(x, columns), axis=1)

		# Re-key each row's last value by (id, next_date) so that a join on
		# (id, date) hands it to the row dated ``next_date``.
		df_join = (
			df[["id", "next_date", "last_value"]]
			.rename(columns={"next_date": "date", "last_value": "first_value"})
			.set_index(["id", "date"])
		)

		df = df.drop(columns=["last_value"]).set_index(["id", "date"])

		# Left join: rows without a matching predecessor get a null first_value.
		df = pd.merge(df, df_join, how="left", left_index=True, right_index=True)
		df = df.reset_index()

		df = df.apply(fill_row_set_points, axis=1, columns=columns)

		return df.drop(columns=["first_value"])


	def populate_set_points(df, columns=None):
		"""Fill missing set points until a pass no longer fills anything.

		Fill passes are repeated until the number of null entries in
		``columns`` stops changing. ``"iteration N"`` is printed before each
		pass.

		Args:
			df: DataFrame with ``id`` and ``date`` columns plus every label in
				``columns``. It is left untouched.
			columns: Ordered labels of the set-point columns. Defaults to
				``["0", "1", "2"]``.

		Returns:
			A new DataFrame with the set-point columns filled.
		"""
		if columns is None:
			columns = ["0", "1", "2"]  # todo: get columns from df

		# assign() builds a new frame, so the helper column never appears on
		# the caller's DataFrame.
		df = df.assign(
			next_date=df["date"].apply(lambda x: x + pd.DateOffset(days=1))
		)

		missing_values = -1
		iteration = 0
		while True:
			current_missing = df[columns].isnull().sum().sum()
			if current_missing == missing_values:
				# The previous pass filled nothing: a fixed point is reached.
				break

			print(f"iteration {iteration}")
			iteration += 1

			missing_values = current_missing
			# ``columns`` is a required argument of fill_df_set_points.
			df = fill_df_set_points(df, columns)

		return df.drop(columns=["next_date"])


	# Run the fill on the sample frame; the name ``df`` is rebound to the result.
	df = populate_set_points(df)

	print(df)

	# Raises AssertionError if the filled frame differs from the expected one.
	pd.testing.assert_frame_equal(df, expected_df)