Ivan Trusov (renardeinside)
Make the elephant dance!
import inspect
import tempfile
from pathlib import Path
from typing import Optional, Dict, Any, Tuple
import mlflow.sklearn
from hyperopt import STATUS_OK, tpe, fmin, Trials
from mlflow.models.signature import infer_signature
from pyspark.cloudpickle import dump
from sklearn.metrics import cohen_kappa_score, roc_auc_score, f1_score
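These imports suggest a typical MLflow training loop: fit a sklearn model, compute the imported metrics, and log the model with an inferred signature. Below is a minimal sketch of that combination; the RandomForestClassifier, the synthetic data, and the run layout are illustrative assumptions, not part of the gist.

import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import cohen_kappa_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

with mlflow.start_run():
    model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
    preds = model.predict(X_test)
    mlflow.log_metric("f1", f1_score(y_test, preds))
    mlflow.log_metric("kappa", cohen_kappa_score(y_test, preds))
    mlflow.log_metric(
        "roc_auc", roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    )
    # infer input/output schema so the logged model is self-describing
    signature = infer_signature(X_test, preds)
    mlflow.sklearn.log_model(model, "model", signature=signature)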
import numpy as np
from hyperopt import hp

class Provider:
    @staticmethod
    def get_search_space() -> SearchSpace:
        # hyperopt search space over the classifier step of the pipeline;
        # the SearchSpace alias is defined in the companion snippet below
        search_space = {
            "classifier": {
                "max_depth": hp.choice("max_depth", np.arange(3, 10, dtype=int)),
                "n_estimators": hp.choice(
                    "n_estimators", np.arange(10, 100, dtype=int)
                ),
            }
        }
        return search_space
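A hedged sketch of how such a search space is typically consumed: hyperopt's fmin samples candidate parameters, an objective function trains and scores the pipeline, and the best trial comes back. The objective body and the train_and_score helper are assumptions for illustration, not part of the gist.

from hyperopt import STATUS_OK, Trials, fmin, tpe

def objective(params: SearchSpace) -> dict:
    # train the pipeline with the sampled params and return a loss to
    # minimize; train_and_score is a hypothetical project-level helper
    loss = train_and_score(params)
    return {"loss": loss, "status": STATUS_OK}

trials = Trials()
best = fmin(
    fn=objective,
    space=Provider.get_search_space(),
    algo=tpe.suggest,
    max_evals=20,
    trials=trials,
)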
class Provider:
    @classmethod
    def get_data(
        cls,
        data: pd.DataFrame,
        source_metadata: SourceMetadata,
        logger: Optional[Any] = None,
        limit: Optional[int] = None,
    ) -> ModelData:
        # body truncated in the gist preview; SourceMetadata and ModelData
        # are project-level pydantic models (see FlexibleBaseModel below)
        ...
from typing import Tuple

import pandas as pd
from delta.tables import DeltaTable

class ModelBuilderTask(Task):
    def _read_data(self) -> Tuple[pd.DataFrame, SourceMetadata]:
        db = self.conf["input"]["database"]
        table_name = self.conf["input"]["table"]
        full_table_name = f"{db}.{table_name}"
        table = DeltaTable.forName(self.spark, full_table_name)
        # pin the read to the latest Delta table version so the training
        # snapshot is reproducible
        last_version = table.history(limit=1).toPandas()["version"][0]
        _data = self.spark.sql(
            f"select * from {full_table_name} VERSION AS OF {last_version}"
        ).toPandas()
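        # a hedged sketch of the likely final step, not shown in the preview:
        # return the pinned snapshot together with its source metadata
        # (the SourceMetadata field names here are assumptions)
        metadata = SourceMetadata(
            database=db, table=table_name, version=last_version
        )
        return _data, metadata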
from typing import Any, Dict

import pandas as pd
from pydantic import BaseModel

class FlexibleBaseModel(BaseModel):
    class Config:
        # allow non-pydantic field types, e.g. pandas DataFrames
        arbitrary_types_allowed = True
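A small sketch of why arbitrary_types_allowed matters: it lets pydantic models carry fields of non-pydantic types such as pandas DataFrames, which the default config rejects at class definition time. The ModelData field names below are illustrative assumptions.

class ModelData(FlexibleBaseModel):
    train: pd.DataFrame
    test: pd.DataFrame

data = ModelData(
    train=pd.DataFrame({"x": [1, 2]}),
    test=pd.DataFrame({"x": [3]}),
)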
import openml

class DatasetLoaderTask(Task):
    def get_data(self, limit: Optional[int] = None) -> PandasDataFrame:
        self.logger.info("Loading the dataset")
        dataset = openml.datasets.get_dataset(
            "CreditCardFraudDetection", download_qualities=False
        )
        X, y, _, _ = dataset.get_data(dataset_format="dataframe")
        _df = X
        # sanitize column names
        _df.rename(columns={"Class": "TARGET"}, inplace=True)
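        # a hedged guess at the final step, truncated in the preview: the
        # limit argument presumably caps the number of returned rows
        return _df if limit is None else _df.head(limit)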
from typing import Any, Dict, Optional

from sklearn.pipeline import Pipeline

SearchSpace = Dict[str, Dict[str, Any]]

class Provider:
    @staticmethod
    def get_pipeline(params: Optional[SearchSpace] = None) -> Pipeline:
        if not params:
            params = {}
        # pipeline definition truncated in the gist preview; sketch below
        pipeline = Pipeline(
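# A sketch, under assumed step names, of the likely shape of such a pipeline
# and of how the nested search-space params map onto it: nested keys become
# the familiar "step__param" arguments of sklearn's set_params.
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

pipeline = Pipeline(
    [("scaler", StandardScaler()), ("classifier", RandomForestClassifier())]
)
params = {"classifier": {"max_depth": 5, "n_estimators": 50}}
flat = {
    f"{step}__{name}": value
    for step, step_params in params.items()
    for name, value in step_params.items()
}
pipeline.set_params(**flat)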
class DatasetLoaderTask(Task):
    def get_data(self, limit: Optional[int] = None) -> SparkDataFrame:
        self.logger.info("Loading the dataset")
        dataset = openml.datasets.get_dataset("CreditCardFraudDetection")
        X, y, _, _ = dataset.get_data(dataset_format="dataframe")
        _df = X
        # sanitize column names
        _df.rename(columns={"Class": "TARGET"}, inplace=True)
        _df.columns = [c.lower() for c in _df.columns]
        # drop the raw "time" offset column
        _df.drop(columns=["time"], inplace=True)
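        # the signature promises a SparkDataFrame, so the cleaned pandas frame
        # presumably gets converted before returning; a hedged sketch of that
        # final step, assuming the Task exposes a SparkSession as self.spark
        return self.spark.createDataFrame(_df)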
package net.renarde.demos.apps

import com.typesafe.config.{Config, ConfigFactory}
import net.renarde.demos.common.ConfigProvider
import net.renarde.demos.utils.{KafkaSupport, SparkSupport}
import org.scalatest.funsuite.AnyFunSuite

import scala.collection.JavaConverters._

class AppTests extends AnyFunSuite with KafkaSupport with SparkSupport {
package net.renarde.demos.apps

import net.renarde.demos.common.GenericApp

object ReaderApp extends GenericApp {
  val appName = "reader"

  log.info(s"Running the application $appName")
  log.info(s"Current application config: $config")
  log.info("Application finished")
}