# Source: GitHub Gist by @simplymathematics
# Gist id: simplymathematics/f3c1a8910f1d1397a9a59d10e3bc4e99 (last active September 21, 2022 14:07)
# Title: deckard data yaml
import collections
from pathlib import Path

import numpy as np
import pandas as pd
import yaml
from sklearn.datasets import (
    load_boston,
    load_diabetes,
    load_iris,
    load_wine,
    make_blobs,
    make_classification,
    make_moons,
)
from sklearn.model_selection import train_test_split
# Synthetic dataset generators, looked up by the ``name`` field of a Data spec.
# Each callable is invoked with the caller's keyword arguments and must return
# an ``(X, y)`` pair.
generated = dict(
    blobs=make_blobs,
    moons=make_moons,
    classification=make_classification,
)
# Bundled dataset loaders, looked up by the ``name`` field of a Data spec.
# Each loader is called with ``return_X_y=True`` so that it yields ``(X, y)``.
# NOTE(review): ``load_boston`` appears to have been removed in scikit-learn
# 1.2 -- confirm which scikit-learn version this file is expected to run on.
real = dict(
    boston=load_boston,
    iris=load_iris,
    diabetes=load_diabetes,
    wine=load_wine,
)
class Data(collections.namedtuple('Data', 'name, params')):
    """Dataset specification built from a ``!deckard.Data`` YAML mapping.

    ``name`` selects the data source: a key of ``real`` or ``generated``, or
    the path (``str`` or ``Path``) of an existing ``.npz``, ``.csv`` or
    ``.json`` file. ``params`` holds the keyword arguments forwarded to
    ``train_test_split`` plus a few keys that are consumed here and never
    forwarded:

    * ``target`` -- label column name, required for ``.csv``/``.json`` files.
    * ``input_noise`` -- std-dev of Gaussian noise added to ``X_train``.
    * ``output_noise`` -- std-dev of Gaussian noise added to ``X_test``.
    * ``train_noise`` / ``test_noise`` -- legacy keys, accepted and ignored.
    * ``stratify`` -- ``True`` stratifies on the labels, ``False`` disables it.

    Calling an instance loads the data, splits it and stores the result in the
    ``X_train``, ``X_test``, ``y_train`` and ``y_test`` attributes.
    """

    def __new__(cls, loader, node):
        """Construct the record from a YAML mapping node.

        Args:
            loader: The YAML loader/constructor handling the document.
            node: Mapping node whose keys are the fields ``name`` and ``params``.
        """
        # deep=True builds the nested ``params`` mapping right away instead of
        # leaving it to be filled in after this constructor has returned.
        return super().__new__(cls, **loader.construct_mapping(node, deep=True))

    @staticmethod
    def _load(name, params, loader_kwargs):
        """Return ``(X, y)`` for a registered dataset name or a data file."""
        if name in real:
            return real[name](return_X_y=True, **loader_kwargs)
        if name in generated:
            return generated[name](**loader_kwargs)
        # YAML yields plain strings, so accept ``str`` paths as well as ``Path``.
        if isinstance(name, (str, Path)) and Path(name).exists():
            path = Path(name)
            filename = str(path)
            if filename.endswith(".npz"):
                # Iterating an NpzFile yields key names, so index the archive
                # explicitly to obtain the arrays themselves.
                with np.load(path) as archive:
                    keys = list(archive.files)
                    if "X" in keys and "y" in keys:
                        return archive["X"], archive["y"]
                    if len(keys) != 2:
                        raise ValueError(
                            f"Expected exactly two arrays in {filename}, found {len(keys)}"
                        )
                    return archive[keys[0]], archive[keys[1]]
            if filename.endswith((".csv", ".json")):
                if "target" not in params:
                    raise ValueError("target column must be specified")
                read = pd.read_csv if filename.endswith(".csv") else pd.read_json
                frame = read(path)
                target = params["target"]
                return frame.drop(target, axis=1), frame[target]
        raise ValueError(f'Unknown dataset: {name}')

    def __call__(self, name=None, params=None, **kwargs):
        """Load the dataset, split it into train/test parts and attach them.

        Args:
            name: Optional replacement for the ``name`` field.
            params: Optional replacement for the ``params`` field.
            **kwargs: Keyword arguments for the dataset loader/generator. A
                keyword that names an existing attribute of the object is set
                as that attribute instead of being passed to the loader.

        Returns:
            The object carrying ``X_train``, ``X_test``, ``y_train`` and
            ``y_test``. This is ``self`` unless ``name`` or ``params`` was
            given; namedtuple fields are read-only, so in that case an updated
            copy is returned and ``self`` is left untouched.

        Raises:
            ValueError: If ``name`` is not a known dataset or supported file,
                if ``target`` is missing for a ``.csv``/``.json`` file, or if
                an ``.npz`` archive does not provide two arrays.
        """
        overrides = {}
        if name is not None:
            overrides["name"] = name
        if params is not None:
            overrides["params"] = params
        data = self._replace(**overrides) if overrides else self

        # Iterate over a snapshot of the keys: entries are removed in the loop.
        for key in list(kwargs):
            if hasattr(data, key):
                setattr(data, key, kwargs.pop(key))

        # Work on a copy so the stored params survive repeated calls unchanged.
        split_params = dict(data.params or {})
        big_X, big_y = self._load(data.name, split_params, kwargs)

        # Strip every key that train_test_split does not understand.
        split_params.pop("target", None)
        input_noise = split_params.pop("input_noise", 0)
        output_noise = split_params.pop("output_noise", 0)
        split_params.pop("train_noise", None)
        split_params.pop("test_noise", None)

        stratify = split_params.get("stratify")
        if stratify is True:
            split_params["stratify"] = big_y
        elif stratify is False:
            split_params["stratify"] = None

        X_train, X_test, y_train, y_test = train_test_split(big_X, big_y, **split_params)

        # Out-of-place addition avoids casting errors on integer feature arrays.
        if input_noise:
            X_train = X_train + np.random.normal(0, input_noise, X_train.shape)
        if output_noise:
            X_test = X_test + np.random.normal(0, output_noise, X_test.shape)

        data.X_train = X_train
        data.X_test = X_test
        data.y_train = y_train
        data.y_test = y_test
        return data
# Register Data as the constructor for nodes tagged ``!deckard.Data``.
yaml.add_constructor('!deckard.Data', Data)

# Example specification: a ``blobs`` dataset with the split parameters below.
document = """
!deckard.Data
name: 'blobs'
params: {
"shuffle" : True,
"random_state" : 42,
"test_size" : 0.2,
"stratify" : True,
"input_noise" : 1,
}
"""
# NOTE(security): yaml.Loader can construct arbitrary Python objects. That is
# acceptable for the literal document above; do not use it on untrusted input.
data = yaml.load(document, Loader = yaml.Loader)
# Keyword arguments are forwarded to the generator selected by ``name``.
data = data(n_samples = 1000, n_features = 2, centers =3)
# A bare expression only displays its value in a REPL/notebook; print it so the
# shape is also shown when the file is run as a script.
print(data.X_train.shape)
# GitHub page footer: sign up or sign in on GitHub to comment on this gist.