Skip to content

Instantly share code, notes, and snippets.

@alexeygrigorev
Created December 13, 2022 13:17
Show Gist options
  • Save alexeygrigorev/70b66a42c99925e219c62c91c191b89c to your computer and use it in GitHub Desktop.
Save alexeygrigorev/70b66a42c99925e219c62c91c191b89c to your computer and use it in GitHub Desktop.
Linear imputer for missing values
import pandas as pd
from datetime import datetime
from sklearn.linear_model import LinearRegression
def impute_linear(df, X_cols, y_col):
    """Fill missing values of ``y_col`` with linear-regression predictions.

    A ``LinearRegression`` model is fitted on the rows where ``y_col`` is
    present, using ``X_cols`` as features, and is then used to predict
    ``y_col`` for the rows where it is missing. Missing feature values are
    replaced with 0 both for fitting and for predicting.

    Args:
        df: Input DataFrame; it is copied, not modified.
        X_cols: List of feature column labels.
        y_col: Label of the column to impute.

    Returns:
        A copy of ``df`` in which the missing entries of ``y_col`` are
        replaced by the model's predictions. If ``y_col`` has no missing
        entries, the copy is returned unchanged.

    Raises:
        ValueError: If every value of ``y_col`` is missing, so there are no
            rows to fit the model on.
    """
    df = df.copy()
    null_values = df[y_col].isnull()

    # Nothing to impute (also true for an empty frame): skip fitting, since
    # predicting on a zero-row feature matrix makes scikit-learn raise.
    if not null_values.any():
        return df

    # No observed targets means there is nothing to train on; fail with a
    # clear message rather than an opaque "0 samples" error from the fit.
    if null_values.all():
        raise ValueError(
            f"cannot impute {y_col!r}: every value is missing, "
            "so there is no data to fit the model on"
        )

    X_train = df.loc[~null_values, X_cols].fillna(0).values
    y_train = df.loc[~null_values, y_col].values
    model = LinearRegression().fit(X_train, y_train)

    X_test = df.loc[null_values, X_cols].fillna(0).values
    y_pred = model.predict(X_test)
    df.loc[null_values, y_col] = y_pred
    return df
# Sample sensor readings as [timestamp, temperature, humidity] rows; the last
# row has no measurements and is the one to be imputed.
data = [
    [datetime(2022, 12, 12, 10), 18.8, 19],
    [datetime(2022, 12, 12, 16, 0), 23.6, 14],
    [datetime(2022, 12, 12, 16, 50), 24, 14],
    [datetime(2022, 12, 13, 9), None, None],
]
df = pd.DataFrame(data, columns=['ts', 'temperature', 'humidity'])

# Seconds elapsed since the first reading, used as the single model feature.
elapsed = df['ts'] - df['ts'][0]
df['diff'] = elapsed.dt.total_seconds()

# Impute each measurement column independently from the elapsed time.
df = impute_linear(df, ['diff'], 'temperature')
df = impute_linear(df, ['diff'], 'humidity')

# Bare expression: displays the resulting frame when run in a notebook/REPL.
df
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment