Skip to content

Instantly share code, notes, and snippets.

@icexelloss
Last active March 1, 2018 15:26
Show Gist options
  • Save icexelloss/16d7956f96ba8aba2245bbb7bf958f43 to your computer and use it in GitHub Desktop.
Save icexelloss/16d7956f96ba8aba2245bbb7bf958f43 to your computer and use it in GitHub Desktop.
import statsmodels.api as sm
# Expected input: a Spark DataFrame `df` with four columns -- id, y, x1, x2.
group_column = 'id'
y_column = 'y'
x_columns = ['x1', 'x2']
# The UDF output carries the group key plus one value per regressor, so its
# schema is taken from those same columns of df.
# NOTE(review): this reuses the x columns' input types for the fitted
# coefficients; presumably they are doubles -- confirm.
schema = df.select([group_column] + x_columns).schema
@pandas_udf(schema, PandasUDFType.GROUPED_MAP)
def ols(pdf):
    """Fit an OLS regression of ``y_column`` on ``x_columns`` for one group.

    Registered as a grouped-map pandas UDF, so both the input and the
    output are a ``pandas.DataFrame``.

    Args:
        pdf: The rows handed to the UDF for a single group.

    Returns:
        A one-row ``pandas.DataFrame`` whose columns are ``group_column``
        followed by ``x_columns``: the group key and the fitted coefficient
        of each regressor. The constant term added by ``sm.add_constant``
        is part of the fit but is not returned.
    """
    # pandas is not imported at the top of this file; importing it here
    # keeps the function self-contained.
    import pandas as pd

    # The key is read from the first row; this assumes every row of pdf
    # shares the same group_column value (true when applied per group).
    group_key = pdf[group_column].iloc[0]

    response = pdf[y_column]
    # add_constant appends an intercept column so the slopes are not forced
    # through the origin.
    design = sm.add_constant(pdf[x_columns])
    model = sm.OLS(response, design).fit()

    # params is indexed by regressor name; select the slopes in the same
    # order as the output columns.
    coefficients = model.params[x_columns].tolist()
    return pd.DataFrame(
        [[group_key] + coefficients],
        columns=[group_column] + x_columns,
    )
# Apply ols to each group of rows sharing a group_column value; beta is the
# resulting DataFrame of per-group outputs.
beta = df.groupBy(group_column).apply(ols)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment