Skip to content

Instantly share code, notes, and snippets.

@snowhork
Created April 7, 2023 04:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save snowhork/a9405e4877d98da1d7b4c7ed6c1ab1ca to your computer and use it in GitHub Desktop.
Save snowhork/a9405e4877d98da1d7b4c7ed6c1ab1ca to your computer and use it in GitHub Desktop.
gokartのサンプルコード(宣言方式・パイプライン方式)
import gokart
import luigi
import pandas as pd
class ReadCSV(gokart.TaskOnKart):
    """Task that reads a CSV file with pandas and dumps the resulting DataFrame."""

    # Path handed to ``pd.read_csv``.
    data_path: str = luigi.Parameter()

    def run(self):
        """Print a progress message, read ``data_path`` and dump the frame."""
        # Emitted before the read so it is visible whenever the read is executed.
        print("reading.... this is slow")
        self.dump(pd.read_csv(self.data_path))
if __name__ == "__main__":
    # Build the ReadCSV task for 'test.csv' and print the length of what comes back.
    loaded = gokart.build(ReadCSV(data_path='test.csv'))
    print("data_size", len(loaded))
import gokart
import luigi
import pandas as pd
from sklearn.preprocessing import LabelEncoder
class ReadCSV(gokart.TaskOnKart):
    """Task that reads a CSV file with pandas and dumps the resulting DataFrame."""

    # Path handed to ``pd.read_csv``.
    data_path: str = luigi.Parameter()

    def run(self):
        """Print a progress message, read ``data_path`` and dump the frame."""
        # Emitted before the read so it is visible whenever the read is executed.
        print("reading.... this is slow")
        self.dump(pd.read_csv(self.data_path))
class GenerateLabelEncoder(gokart.TaskOnKart):
    """Task that fits one ``LabelEncoder`` per categorical column and dumps them as a dict."""

    # Forwarded unchanged to the upstream ReadCSV task.
    data_path: str = luigi.Parameter()

    def requires(self):
        """Depend on the ReadCSV task for the same ``data_path``."""
        return ReadCSV(data_path=self.data_path)

    def run(self):
        """Fit encoders on "Embarked" and "Sex" and dump them keyed by column name."""
        frame = self.load_data_frame()
        # NOTE(review): the columns are fitted exactly as loaded; nothing is filled
        # or dropped here before fitting -- confirm that is intended.
        encoders = {
            column: LabelEncoder().fit(frame[column])
            for column in ("Embarked", "Sex")
        }
        self.dump(encoders)
class Preprocess(gokart.TaskOnKart):
    """Task that fills missing values, label-encodes categoricals and dumps the feature columns."""

    # Forwarded unchanged to both upstream tasks.
    data_path: str = luigi.Parameter()

    def requires(self):
        """Depend on the raw data ("data") and the fitted label encoders ("le")."""
        raw_data = ReadCSV(data_path=self.data_path)
        encoders = GenerateLabelEncoder(data_path=self.data_path)
        return {"data": raw_data, "le": encoders}

    def run(self):
        """Build the feature frame and dump the "Pclass", "Sex", "Age", "Fare" columns."""
        frame = self.load_data_frame("data")
        encoders = self.load("le")

        # Missing ages are replaced by the median age of the loaded frame.
        median_age = frame["Age"].median()
        frame["Age"] = frame["Age"].fillna(median_age)

        # Missing ports are set to "S" before the column is encoded.
        frame["Embarked"] = frame["Embarked"].fillna("S")
        frame["Embarked"] = encoders["Embarked"].transform(frame['Embarked'])
        frame["Sex"] = encoders["Sex"].transform(frame['Sex'])

        feature_columns = ["Pclass", "Sex", "Age", "Fare"]
        self.dump(frame[feature_columns])
if __name__ == "__main__":
    # Build the Preprocess task for 'train.csv' and print what it returns.
    features = gokart.build(Preprocess(data_path='train.csv'))
    print(features)
import datetime
import gokart
import luigi
import pandas as pd
from sklearn.preprocessing import LabelEncoder
class ReadCSV(gokart.TaskOnKart):
    """Task that reads a CSV file with pandas and dumps the resulting DataFrame."""

    # Path handed to ``pd.read_csv``.
    data_path: str = luigi.Parameter()

    def run(self):
        """Read ``data_path`` and dump the frame."""
        self.dump(pd.read_csv(self.data_path))
class GenerateLabelEncoder(gokart.TaskOnKart):
    """Task that fits one ``LabelEncoder`` per categorical column and dumps them as a dict."""

    # Upstream task instance whose output is loaded in ``run``.
    data = gokart.TaskInstanceParameter()

    def run(self):
        """Fit encoders on "Embarked" and "Sex" and dump them keyed by column name."""
        # No target name is passed; presumably this resolves to the single ``data``
        # input -- confirm against gokart's ``load_data_frame`` behaviour.
        frame = self.load_data_frame()
        encoders = {
            column: LabelEncoder().fit(frame[column])
            for column in ("Embarked", "Sex")
        }
        self.dump(encoders)
class Preprocess(gokart.TaskOnKart):
    """Task that fills missing values, label-encodes categoricals and dumps the feature columns."""

    # Upstream task instances, loaded in ``run`` under the names "data" and "le".
    data: ReadCSV = gokart.TaskInstanceParameter()
    le: GenerateLabelEncoder = gokart.TaskInstanceParameter()

    def run(self):
        """Build the feature frame and dump the "Pclass", "Sex", "Age", "Fare" columns."""
        frame = self.load_data_frame("data")
        encoders = self.load("le")

        # Missing ages are replaced by the median age of the loaded frame.
        median_age = frame["Age"].median()
        frame["Age"] = frame["Age"].fillna(median_age)

        # Missing ports are set to "S" before the column is encoded.
        frame["Embarked"] = frame["Embarked"].fillna("S")
        frame["Embarked"] = encoders["Embarked"].transform(frame['Embarked'])
        frame["Sex"] = encoders["Sex"].transform(frame['Sex'])

        # Missing fares are replaced by the median fare of the loaded frame.
        median_fare = frame["Fare"].median()
        frame["Fare"] = frame["Fare"].fillna(median_fare)

        feature_columns = ["Pclass", "Sex", "Age", "Fare"]
        selected = frame[feature_columns]
        # Sanity check: none of the selected columns may still contain nulls.
        assert not selected.isnull().values.any()
        self.dump(selected)
class Pipeline(gokart.TaskOnKart):
    """Top-level task that wires ReadCSV -> GenerateLabelEncoder -> Preprocess for 'train.csv'."""

    # The default is evaluated once, when this class body is executed, so it holds
    # the timestamp of that moment rather than of task instantiation.
    # NOTE(review): presumably this gives each process run a distinct parameter
    # value so this task is not served from a previous run's output -- confirm.
    __cache_time = luigi.FloatParameter(default=datetime.datetime.now().timestamp())

    def requires(self):
        """Return the Preprocess task built on the training data."""
        raw_train = ReadCSV(data_path='train.csv')
        encoders = GenerateLabelEncoder(data=raw_train)
        return Preprocess(data=raw_train, le=encoders)

    def run(self):
        """Load the required task's output and dump it unchanged."""
        self.dump(self.load())
if __name__ == "__main__":
    # Build the whole pipeline and print what it returns.
    pipeline_output = gokart.build(Pipeline())
    print(pipeline_output)
import datetime
import gokart
import luigi
import pandas as pd
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
class ReadCSV(gokart.TaskOnKart):
    """Task that reads a CSV file with pandas and dumps the resulting DataFrame."""

    # Path handed to ``pd.read_csv``.
    data_path: str = luigi.Parameter()

    def run(self):
        """Read ``data_path`` and dump the frame."""
        self.dump(pd.read_csv(self.data_path))
class GenerateLabelEncoder(gokart.TaskOnKart):
    """Task that fits one ``LabelEncoder`` per categorical column and dumps them as a dict."""

    # Upstream task instance whose output is loaded in ``run``.
    data = gokart.TaskInstanceParameter()

    def run(self):
        """Fit encoders on "Embarked" and "Sex" and dump them keyed by column name."""
        # No target name is passed; presumably this resolves to the single ``data``
        # input -- confirm against gokart's ``load_data_frame`` behaviour.
        frame = self.load_data_frame()
        encoders = {
            column: LabelEncoder().fit(frame[column])
            for column in ("Embarked", "Sex")
        }
        self.dump(encoders)
class Preprocess(gokart.TaskOnKart):
    """Task that fills missing values, label-encodes categoricals and dumps the feature columns."""

    # NOTE(review): not read anywhere in this class; presumably bumped by hand to
    # change the task's parameters when the preprocessing logic changes -- confirm.
    __version: int = luigi.IntParameter(default=3)
    # Upstream task instances, loaded in ``run`` under the names "data" and "le".
    data = gokart.TaskInstanceParameter()
    le = gokart.TaskInstanceParameter()

    def run(self):
        """Build the feature frame and dump the "Pclass", "Sex", "Age", "Fare" columns."""
        frame = self.load_data_frame("data")
        encoders = self.load("le")

        # Missing ages are replaced by the median age of the loaded frame.
        median_age = frame["Age"].median()
        frame["Age"] = frame["Age"].fillna(median_age)

        # Missing ports are set to "S" before the column is encoded.
        frame["Embarked"] = frame["Embarked"].fillna("S")
        frame["Embarked"] = encoders["Embarked"].transform(frame['Embarked'])
        frame["Sex"] = encoders["Sex"].transform(frame['Sex'])

        # Missing fares are replaced by the median fare of the loaded frame.
        median_fare = frame["Fare"].median()
        frame["Fare"] = frame["Fare"].fillna(median_fare)

        feature_columns = ["Pclass", "Sex", "Age", "Fare"]
        selected = frame[feature_columns]
        # Sanity check: none of the selected columns may still contain nulls.
        assert not selected.isnull().values.any()
        self.dump(selected)
class TrainClassifier(gokart.TaskOnKart):
    """Task that fits a decision tree on the preprocessed features and dumps the fitted model."""

    # Declared as an IntParameter because the default is an int: the plain
    # ``luigi.Parameter`` is string-typed and warns about a non-string value.
    # This also matches how ``Preprocess`` declares its own version parameter.
    # NOTE(review): not read anywhere in this class; presumably bumped by hand to
    # change the task's parameters when the training logic changes -- confirm.
    __version: int = luigi.IntParameter(default=2)
    # Upstream task instances, loaded in ``run`` under the names "data" and "features".
    data = gokart.TaskInstanceParameter()
    features = gokart.TaskInstanceParameter()

    def run(self):
        """Fit ``DecisionTreeClassifier`` on the features against "Survived" and dump it."""
        features = self.load_data_frame("features").values
        # The target column comes from the raw data, not from the feature frame.
        target = self.load_data_frame("data")["Survived"].values
        # NOTE(review): no random_state is passed, so repeated fits are not
        # guaranteed to produce identical trees -- confirm that is acceptable.
        classifier = tree.DecisionTreeClassifier()
        classifier = classifier.fit(features, target)
        self.dump(classifier)
class Predict(gokart.TaskOnKart):
    """Task that applies a fitted classifier to a feature frame and dumps the predictions."""

    # Upstream task instances, loaded in ``run`` under the names "features" and "classifier".
    features = gokart.TaskInstanceParameter()
    classifier = gokart.TaskInstanceParameter()

    def run(self):
        """Predict on the "Pclass", "Sex", "Age", "Fare" columns and dump the result."""
        feature_columns = ["Pclass", "Sex", "Age", "Fare"]
        model_inputs = self.load_data_frame("features")[feature_columns]
        model = self.load("classifier")
        predictions = model.predict(model_inputs)
        self.dump(predictions)
class Pipeline(gokart.TaskOnKart):
    """Top-level task: train on 'train.csv', then predict on 'test.csv'."""

    # The default is evaluated once, when this class body is executed, so it holds
    # the timestamp of that moment rather than of task instantiation.
    # NOTE(review): presumably this gives each process run a distinct parameter
    # value so this task is not served from a previous run's output -- confirm.
    __cache_time = luigi.FloatParameter(default=datetime.datetime.now().timestamp())

    def requires(self):
        """Assemble the task graph and return the final Predict task."""
        # Training branch: raw data -> encoders -> features -> fitted model.
        raw_train = ReadCSV(data_path='train.csv')
        encoders = GenerateLabelEncoder(data=raw_train)
        model = TrainClassifier(
            data=raw_train,
            features=Preprocess(data=raw_train, le=encoders),
        )

        # Test branch reuses the encoders that were fitted on the training data.
        raw_test = ReadCSV(data_path='test.csv')
        return Predict(
            features=Preprocess(data=raw_test, le=encoders),
            classifier=model,
        )

    def run(self):
        """Load the required task's output and dump it unchanged."""
        self.dump(self.load())
if __name__ == "__main__":
    # Build the whole pipeline and print what it returns.
    pipeline_output = gokart.build(Pipeline())
    print(pipeline_output)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment