gokart のサンプルコード
データファイルは Kaggle Titanic を用いています。
import gokart | |
import luigi | |
import pandas as pd | |
class ReadCSV(gokart.TaskOnKart):
    """Task that reads the CSV file at ``data_path`` and dumps it as a DataFrame."""

    # Path of the CSV file to read.
    data_path: str = luigi.Parameter()

    def run(self):
        """Print a progress message, read the CSV with pandas and dump the result."""
        print("reading.... this is slow")
        self.dump(pd.read_csv(self.data_path))
if __name__ == "__main__":
    # Build the ReadCSV task for test.csv and report the length of what it returns.
    loaded = gokart.build(ReadCSV(data_path="test.csv"))
    print("data_size", len(loaded))
import gokart | |
import luigi | |
import pandas as pd | |
from sklearn.preprocessing import LabelEncoder | |
class ReadCSV(gokart.TaskOnKart):
    """Task that reads the CSV file at ``data_path`` and dumps it as a DataFrame."""

    # Path of the CSV file to read.
    data_path: str = luigi.Parameter()

    def run(self):
        """Print a progress message, read the CSV with pandas and dump the result."""
        print("reading.... this is slow")
        csv_frame = pd.read_csv(self.data_path)
        self.dump(csv_frame)
class GenerateLabelEncoder(gokart.TaskOnKart):
    """Fit label encoders for the "Embarked" and "Sex" columns.

    Dumps a dict that maps each column name to a fitted ``LabelEncoder``.
    """

    # Path of the CSV file; forwarded to the upstream ReadCSV task.
    data_path: str = luigi.Parameter()

    def requires(self):
        """Depend on the ReadCSV task for the same ``data_path``."""
        return ReadCSV(data_path=self.data_path)

    def run(self):
        """Load the upstream DataFrame, fit one encoder per column and dump them."""
        df = self.load_data_frame()
        self.dump(
            {
                # Preprocess fills missing "Embarked" values before it calls
                # transform(), so fit on the observed values only. This keeps
                # NaN from being handed to the encoder as if it were a category.
                "Embarked": LabelEncoder().fit(df["Embarked"].dropna()),
                "Sex": LabelEncoder().fit(df["Sex"]),
            }
        )
class Preprocess(gokart.TaskOnKart):
    """Turn the raw CSV data into a numeric feature table.

    Fills missing "Age" with the column median and missing "Embarked" with
    "S", label-encodes "Embarked" and "Sex", and dumps the columns
    "Pclass", "Sex", "Age" and "Fare".
    """

    # Path of the CSV file; forwarded to both upstream tasks.
    data_path: str = luigi.Parameter()

    def requires(self):
        """Depend on the raw data ("data") and on the fitted encoders ("le")."""
        path = self.data_path
        return {
            "data": ReadCSV(data_path=path),
            "le": GenerateLabelEncoder(data_path=path),
        }

    def run(self):
        """Impute, encode and dump the selected feature columns."""
        frame = self.load_data_frame("data")
        encoders = self.load("le")

        # Impute missing values before encoding.
        frame["Age"] = frame["Age"].fillna(frame["Age"].median())
        frame["Embarked"] = frame["Embarked"].fillna("S")

        # Replace the categorical columns with their integer labels.
        for column in ("Embarked", "Sex"):
            frame[column] = encoders[column].transform(frame[column])

        self.dump(frame[["Pclass", "Sex", "Age", "Fare"]])
if __name__ == "__main__":
    # Build the preprocessing task for train.csv and print what it returns.
    features = gokart.build(Preprocess(data_path="train.csv"))
    print(features)
import datetime | |
import gokart | |
import luigi | |
import pandas as pd | |
from sklearn.preprocessing import LabelEncoder | |
class ReadCSV(gokart.TaskOnKart):
    """Task that reads the CSV file at ``data_path`` and dumps it as a DataFrame."""

    # Path of the CSV file to read.
    data_path: str = luigi.Parameter()

    def run(self):
        """Read the CSV with pandas and dump the resulting DataFrame."""
        self.dump(pd.read_csv(self.data_path))
class GenerateLabelEncoder(gokart.TaskOnKart):
    """Fit label encoders for the "Embarked" and "Sex" columns.

    Dumps a dict that maps each column name to a fitted ``LabelEncoder``.
    """

    # Upstream task whose output DataFrame the encoders are fitted on.
    data = gokart.TaskInstanceParameter()

    def run(self):
        """Load the upstream DataFrame, fit one encoder per column and dump them."""
        df = self.load_data_frame()
        self.dump(
            {
                # Preprocess fills missing "Embarked" values before it calls
                # transform(), so fit on the observed values only. This keeps
                # NaN from being handed to the encoder as if it were a category.
                "Embarked": LabelEncoder().fit(df["Embarked"].dropna()),
                "Sex": LabelEncoder().fit(df["Sex"]),
            }
        )
class Preprocess(gokart.TaskOnKart):
    """Turn the raw data into a numeric feature table without missing values.

    Fills missing "Age" and "Fare" with their column medians and missing
    "Embarked" with "S", label-encodes "Embarked" and "Sex", and dumps the
    columns "Pclass", "Sex", "Age" and "Fare".
    """

    # Upstream task providing the raw DataFrame.
    data: ReadCSV = gokart.TaskInstanceParameter()
    # Upstream task providing the dict of fitted label encoders.
    le: GenerateLabelEncoder = gokart.TaskInstanceParameter()

    def run(self):
        """Impute, encode, verify and dump the selected feature columns."""
        frame = self.load_data_frame("data")
        encoders = self.load("le")

        # Impute missing values.
        for column in ("Age", "Fare"):
            frame[column] = frame[column].fillna(frame[column].median())
        frame["Embarked"] = frame["Embarked"].fillna("S")

        # Replace the categorical columns with their integer labels.
        for column in ("Embarked", "Sex"):
            frame[column] = encoders[column].transform(frame[column])

        selected = frame[["Pclass", "Sex", "Age", "Fare"]]
        # Sanity check: no missing value may remain in the dumped columns.
        assert not selected.isnull().values.any()
        self.dump(selected)
class Pipeline(gokart.TaskOnKart):
    """Top-level task that wires the tasks together and re-dumps the final output."""

    # The default is the timestamp taken when this class body is evaluated.
    # NOTE(review): presumably this gives every process a fresh parameter value
    # so the pipeline is not served from a previous run's cache - confirm.
    __cache_time = luigi.FloatParameter(default=datetime.datetime.now().timestamp())

    def requires(self):
        """Build the task graph for train.csv and return the Preprocess task."""
        raw = ReadCSV(data_path="train.csv")
        encoders = GenerateLabelEncoder(data=raw)
        return Preprocess(data=raw, le=encoders)

    def run(self):
        """Load the required task's output and dump it unchanged."""
        self.dump(self.load())
if __name__ == "__main__":
    # Build the whole pipeline and print what it returns.
    features = gokart.build(Pipeline())
    print(features)
import datetime | |
import gokart | |
import luigi | |
import pandas as pd | |
from sklearn import tree | |
from sklearn.preprocessing import LabelEncoder | |
class ReadCSV(gokart.TaskOnKart):
    """Task that reads the CSV file at ``data_path`` and dumps it as a DataFrame."""

    # Path of the CSV file to read.
    data_path: str = luigi.Parameter()

    def run(self):
        """Read the CSV with pandas and dump the resulting DataFrame."""
        csv_frame = pd.read_csv(self.data_path)
        self.dump(csv_frame)
class GenerateLabelEncoder(gokart.TaskOnKart):
    """Fit label encoders for the "Embarked" and "Sex" columns.

    Dumps a dict that maps each column name to a fitted ``LabelEncoder``.
    """

    # Upstream task whose output DataFrame the encoders are fitted on.
    data = gokart.TaskInstanceParameter()

    def run(self):
        """Load the upstream DataFrame, fit one encoder per column and dump them."""
        df = self.load_data_frame()
        self.dump(
            {
                # Preprocess fills missing "Embarked" values before it calls
                # transform(), so fit on the observed values only. This keeps
                # NaN from being handed to the encoder as if it were a category.
                "Embarked": LabelEncoder().fit(df["Embarked"].dropna()),
                "Sex": LabelEncoder().fit(df["Sex"]),
            }
        )
class Preprocess(gokart.TaskOnKart):
    """Turn the raw data into a numeric feature table without missing values.

    Fills missing "Age" and "Fare" with their column medians and missing
    "Embarked" with "S", label-encodes "Embarked" and "Sex", and dumps the
    columns "Pclass", "Sex", "Age" and "Fare".
    """

    # Integer version parameter of this task.
    # NOTE(review): presumably bumped by hand to invalidate cached output when
    # the preprocessing logic changes - confirm.
    __version: int = luigi.IntParameter(default=3)
    # Upstream task providing the raw DataFrame.
    data = gokart.TaskInstanceParameter()
    # Upstream task providing the dict of fitted label encoders.
    le = gokart.TaskInstanceParameter()

    def run(self):
        """Impute, encode, verify and dump the selected feature columns."""
        frame = self.load_data_frame("data")
        encoders = self.load("le")

        # Impute missing values.
        for column in ("Age", "Fare"):
            frame[column] = frame[column].fillna(frame[column].median())
        frame["Embarked"] = frame["Embarked"].fillna("S")

        # Replace the categorical columns with their integer labels.
        for column in ("Embarked", "Sex"):
            frame[column] = encoders[column].transform(frame[column])

        selected = frame[["Pclass", "Sex", "Age", "Fare"]]
        # Sanity check: no missing value may remain in the dumped columns.
        assert not selected.isnull().values.any()
        self.dump(selected)
class TrainClassifier(gokart.TaskOnKart):
    """Fit a decision tree on the preprocessed features and dump the model.

    The target is the "Survived" column of the raw data task's output.
    """

    # Integer version parameter of this task. Declared as IntParameter because
    # the default is an int; a plain luigi.Parameter is string-typed. This also
    # matches how Preprocess declares its own version parameter.
    __version: int = luigi.IntParameter(default=2)
    # Upstream task providing the raw DataFrame (source of the target column).
    data = gokart.TaskInstanceParameter()
    # Upstream task providing the preprocessed feature DataFrame.
    features = gokart.TaskInstanceParameter()

    def run(self):
        """Load features and target as arrays, fit the tree and dump it."""
        features = self.load_data_frame("features").values
        target = self.load_data_frame("data")["Survived"].values
        classifier = tree.DecisionTreeClassifier()
        classifier.fit(features, target)
        self.dump(classifier)
class Predict(gokart.TaskOnKart):
    """Apply a trained classifier to a feature table and dump the predictions."""

    # Upstream task providing the preprocessed feature DataFrame.
    features = gokart.TaskInstanceParameter()
    # Upstream task providing the fitted classifier.
    classifier = gokart.TaskInstanceParameter()

    def run(self):
        """Load features and classifier, predict, and dump the result."""
        columns = ["Pclass", "Sex", "Age", "Fare"]
        # TrainClassifier fits on a bare array (``.values``), so predict on a
        # bare array too; this keeps the input type the same on both sides.
        features = self.load_data_frame("features")[columns].values
        classifier = self.load("classifier")
        self.dump(classifier.predict(features))
class Pipeline(gokart.TaskOnKart):
    """Top-level task: train on train.csv, predict on test.csv, re-dump the result."""

    # The default is the timestamp taken when this class body is evaluated.
    # NOTE(review): presumably this gives every process a fresh parameter value
    # so the pipeline is not served from a previous run's cache - confirm.
    __cache_time = luigi.FloatParameter(default=datetime.datetime.now().timestamp())

    def requires(self):
        """Build the train/predict task graph and return the Predict task."""
        # Training branch: raw data -> encoders -> features -> classifier.
        train_csv = ReadCSV(data_path="train.csv")
        encoders = GenerateLabelEncoder(data=train_csv)
        train_matrix = Preprocess(data=train_csv, le=encoders)
        model = TrainClassifier(data=train_csv, features=train_matrix)

        # Prediction branch: the test data reuses the encoders fitted on train.
        test_csv = ReadCSV(data_path="test.csv")
        test_matrix = Preprocess(data=test_csv, le=encoders)
        return Predict(features=test_matrix, classifier=model)

    def run(self):
        """Load the required task's output and dump it unchanged."""
        self.dump(self.load())
if __name__ == "__main__":
    # Build the whole pipeline and print what it returns.
    predictions = gokart.build(Pipeline())
    print(predictions)
gokart のサンプルコード
データファイルは Kaggle Titanic を用いています。