Skip to content

Instantly share code, notes, and snippets.

Created October 30, 2020 00:19
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
What would you like to do?
def time_split(df, validation_pct=0.2):
    """Split the row labels of ``df`` chronologically on its "Dates" column.

    The earliest ``int(len(df) * (1 - validation_pct))`` rows (ordered by
    "Dates") define the training period. Every row dated after the last of
    those rows goes to validation. Rows that share the boundary date all stay
    in training, so a single timestamp never appears in both sets.

    Args:
        df: DataFrame with a "Dates" column of mutually comparable values.
        validation_pct: Fraction of rows, within [0, 1], to hold out as the
            most recent (validation) period.

    Returns:
        A ``(train_index, validation_index)`` tuple of pandas Index objects
        holding labels taken from ``df.index``, ordered by date.

    Raises:
        ValueError: If ``validation_pct`` is outside [0, 1].
    """
    if not 0 <= validation_pct <= 1:
        raise ValueError(
            f"validation_pct must be between 0 and 1, got {validation_pct!r}"
        )

    # Only the date column is needed; a stable sort keeps ties deterministic.
    dates = df["Dates"].sort_values(kind="stable")
    n_train = int(len(dates) * (1 - validation_pct))
    if n_train == 0:
        # Nothing to train on (empty frame or everything held out).
        return dates.index[:0], dates.index

    # Positional lookup of the LAST training row: using the row at position
    # n_train instead would pull the first validation row into training and
    # would index out of bounds when validation_pct is 0. `.iloc` also stays
    # scalar when index labels are duplicated.
    split_date = dates.iloc[n_train - 1]
    return dates.index[dates <= split_date], dates.index[dates > split_date]
# Chronological hold-out: the most recent 20% of rows become the validation set.
train_idx, validation_idx = time_split(train, validation_pct=0.2)

# Report the size and date range of each split as a sanity check.
print(f"Training data has {len(train_idx)} samples from {train['Dates'].loc[train_idx].min()} to {train['Dates'].loc[train_idx].max()}")
print(f"Validation data has {len(validation_idx)} samples from {train['Dates'].loc[validation_idx].min()} to {train['Dates'].loc[validation_idx].max()}")

# "Dates" was only needed to compute the split; remove it in place before
# handing the frame to TabularPandas.
train.drop(columns="Dates", inplace=True)

# NOTE(review): the splits are index *labels* returned by time_split. If
# TabularPandas interprets splits as row positions, this presumably relies on
# `train` having a default 0..n-1 index - confirm against the caller.
to = TabularPandas(
    train,
    procs=[Categorify, FillMissing, Normalize],
    splits=[list(train_idx), list(validation_idx)],
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment