Skip to content

Instantly share code, notes, and snippets.

@royerk
Created November 21, 2023 02:59
Show Gist options
  • Save royerk/0f795c2dfadc20744a86dffac445105a to your computer and use it in GitHub Desktop.
Save royerk/0f795c2dfadc20744a86dffac445105a to your computer and use it in GitHub Desktop.
AB infer missing values, pick first value from day before
import pandas as pd
import numpy as np
# Sample input: one row per (id, date). Columns "0", "1" and "2" carry the
# values to be filled; NaN marks a missing entry.
df = pd.DataFrame(
    data=[
        ["id_0", "2023-11-01", np.nan, 1, 0.5],
        ["id_0", "2023-11-02", 0.5, 2, np.nan],
        ["id_0", "2023-11-03", np.nan, np.nan, np.nan],  # device in baseline
        ["id_0", "2023-11-04", np.nan, np.nan, np.nan],  # device in baseline
        ["id_0", "2023-11-05", np.nan, 1.0, np.nan],
    ],
    columns=["id", "date", "0", "1", "2"],
).assign(date=lambda frame: pd.to_datetime(frame["date"]))
print(df)

# Hand-written frame that the filled version of ``df`` is compared against.
expected_df = pd.DataFrame(
    data=[
        ["id_0", "2023-11-01", np.nan, 1, 0.5],
        ["id_0", "2023-11-02", 0.5, 2.0, 2.0],
        ["id_0", "2023-11-03", 2.0, 2.0, 2.0],  # device in baseline
        ["id_0", "2023-11-04", 2.0, 2.0, 2.0],  # device in baseline
        ["id_0", "2023-11-05", 2.0, 1.0, 1.0],
    ],
    columns=["id", "date", "0", "1", "2"],
).assign(date=lambda frame: pd.to_datetime(frame["date"]))
def find_last_value(row, columns):
    """Return the last non-null value of ``row`` among ``columns``.

    Args:
        row: A pandas Series (one DataFrame row) containing at least the
            labels listed in ``columns``.
        columns: Labels to inspect, in the order that defines "last".

    Returns:
        The value of the right-most non-null entry among ``columns``, or
        ``np.nan`` when every one of them is null.
    """
    non_null_values = row[columns].dropna()
    if non_null_values.empty:
        return np.nan
    # Use .iloc for positional access: the labels here are strings, and
    # integer keys passed to Series[...] are no longer a reliable way to
    # index by position.
    return non_null_values.iloc[-1]
def fill_row_set_points(row, columns):
    """Forward-fill the null entries of ``row`` across ``columns``.

    The fill is seeded with ``row["first_value"]``: a null entry takes the
    most recent non-null value seen to its left, or the seed when nothing
    non-null has been seen yet. If the seed itself is null, leading nulls
    stay null.

    Args:
        row: A pandas Series holding ``"first_value"`` and every label in
            ``columns``.
        columns: Labels to fill, in left-to-right order.

    Returns:
        A new Series with the filled values; ``row`` itself is not modified.
    """
    # Fill a copy: pandas does not support functions passed to apply()
    # mutating the object they are given.
    filled = row.copy()
    last_value = filled["first_value"]
    for column in columns:
        if pd.isnull(filled[column]):
            filled[column] = last_value
        else:
            last_value = filled[column]
    return filled
def fill_df_set_points(df, columns):
    """Run one fill pass over ``df`` and return the result as a new frame.

    For every row, the last non-null value among ``columns`` is computed and
    re-keyed under that row's ``next_date``. Left-joining this back on
    ``(id, date)`` gives each row a ``first_value`` taken from the row of the
    same ``id`` whose ``next_date`` equals its ``date``. Each row is then
    forward-filled across ``columns`` starting from that seed.

    Args:
        df: Frame with ``"id"``, ``"date"`` and ``"next_date"`` columns plus
            every label in ``columns``.
        columns: Labels of the value columns to fill, in left-to-right order.

    Returns:
        A new DataFrame with a fresh default index and ``"id"``/``"date"`` as
        the leading columns. The helper ``first_value`` column is removed
        before returning. ``df`` itself is not modified.
    """
    if df.empty:
        # A row-wise apply on an empty frame does not yield a Series, so
        # there is nothing to join or fill.
        return df.copy()

    last_values = df.apply(find_last_value, axis=1, columns=columns)

    # Seed table: the last value of a row, indexed by the date it feeds into.
    seeds = (
        df[["id", "next_date"]]
        .assign(first_value=last_values)
        .rename(columns={"next_date": "date"})
        .set_index(["id", "date"])
    )

    merged = pd.merge(
        df.set_index(["id", "date"]),
        seeds,
        how="left",
        left_index=True,
        right_index=True,
    ).reset_index()

    filled = merged.apply(fill_row_set_points, axis=1, columns=columns)
    return filled.drop(columns=["first_value"])
def populate_set_points(df, columns=None):
    """Fill missing values in ``df`` until a pass no longer changes the count.

    Each pass is delegated to ``fill_df_set_points``. Passes are repeated
    because a row can only be seeded from the previous date once that
    previous row has itself been filled. The loop stops when the number of
    nulls in ``columns`` is the same as before the previous pass.

    Args:
        df: Frame with ``"id"`` and ``"date"`` columns plus the value
            columns. ``"date"`` must support adding ``pd.DateOffset``.
        columns: Labels of the value columns to fill, in left-to-right order.
            Defaults to ``["0", "1", "2"]``.

    Returns:
        A new DataFrame with the filled values; the temporary ``next_date``
        column is removed. ``df`` itself is not modified.
    """
    columns = ["0", "1", "2"] if columns is None else list(columns)

    # Work on a copy so the helper column never leaks into the caller's frame.
    df = df.copy()
    df["next_date"] = df["date"] + pd.DateOffset(days=1)

    missing_values = -1  # sentinel: guarantees at least one pass
    iteration = 0
    while missing_values != df[columns].isnull().sum().sum():
        print(f"iteration {iteration}")
        iteration += 1
        missing_values = df[columns].isnull().sum().sum()
        df = fill_df_set_points(df, columns)
    return df.drop(columns=["next_date"])
# Run the fill on the sample frame and show the result.
df = populate_set_points(df)
print(df)
# Raises AssertionError if the filled frame differs from the expected one.
pd.testing.assert_frame_equal(df, expected_df)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment