pycaret’s gists

## lead_conversion.py
# Work on an independent copy so the original `data` frame is left unmodified.
data2 = data.copy()
# Drop every row that contains at least one missing value.
data2.dropna(axis=0, inplace=True)
# Recode the 1/0 target as 'Yes'/'No' labels (used below as the scatter colour).
data2['Converted'] = data2['Converted'].replace({1 : 'Yes', 0 : 'No'})

# plotly visual
import plotly.express as px

fig = px.scatter(x=data2['Total Time Spent on Website'], y=data2['Asymmetrique Activity Score'],
                 color = data2['Converted'], template = 'plotly_white',

## pycaret_github_main.yml
# Workflow name shown in the repository's Actions tab.
name: PyCaret AutoML Git Action
# Trigger: run on every push to the master branch.
on:
  push:
    branches: [ master ]
jobs:
  build:
    # Execute the job on a GitHub-hosted Ubuntu runner.
    runs-on: ubuntu-latest
    steps:
      - name: PyCaret AutoML Git Action
        # Step identifier; lets other steps refer to this step.
        id: model

## github_action_action.yml
# Action metadata: display name, summary and author.
name: "PyCaret AutoML Git Action"
description: "A simple example of AutoML created using PyCaret 2.0"
author: "Moez Ali"
# Inputs supplied by the calling workflow. github_action_app.py reads them
# from the environment as INPUT_DATASET / INPUT_TARGET.
inputs:
  DATASET:
    description: "Dataset for Training"
    # Mandatory input; "juice" is the declared default value.
    required: true
    default: "juice"
  TARGET:
    description: "Name of Target variable"

## Dockerfile
# Slim Python 3.7 base image.
FROM python:3.7-slim

# All following instructions run relative to /app.
WORKDIR /app

# COPY is preferred over ADD for plain local files: ADD additionally
# auto-extracts local archives and can fetch URLs, neither of which is wanted here.
COPY . /app

# libgomp1 is the GNU OpenMP runtime (presumably needed by a compiled
# dependency in requirements.txt -- confirm). Skip recommended packages and
# remove the apt package lists in the same layer so they do not bloat the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# --no-cache-dir stops pip from storing its download cache in the image layer.
RUN pip install --no-cache-dir --trusted-host pypi.python.org -r requirements.txt

## github_action_app.py
import os, ast
import pandas as pd

# Action inputs are read from INPUT_<NAME> environment variables;
# a missing variable raises KeyError immediately.
dataset = os.environ["INPUT_DATASET"]
target = os.environ["INPUT_TARGET"]
usecase = os.environ["INPUT_USECASE"]

# Raw-content URL of <dataset>.csv on the master branch of the calling repository.
repo = os.environ["GITHUB_REPOSITORY"]
dataset_path = "https://raw.githubusercontent.com/" + repo + "/master/" + dataset + '.csv'
data = pd.read_csv(dataset_path)
data.head()

## script.py
# import libraries
import pandas as pd
import sys

# define command line parameters
# Expected invocation: python script.py <data> <target>
# Both values are raw strings; an IndexError is raised if either argument is missing.
data = sys.argv[1]
target = sys.argv[2]

# load data (replace this part with your own script)
from pycaret.datasets import get_data

## mlflow.py
# import classification module
from pycaret.classification import *

# init setup
# 'name-of-target' and 'exp-name-here' look like placeholders to be replaced with
# the real target column and experiment name. log_experiment = True is passed
# here — presumably this enables MLflow logging (confirm in PyCaret docs).
clf1 = setup(data, target = 'name-of-target', log_experiment = True, experiment_name = 'exp-name-here')

# compare models; the value returned by compare_models() is kept in `best`
best = compare_models()

# start mlflow server on localhost:5000 (when using notebook)

## utils.py
# select and finalize the best model in the active run
best_model = automl() #returns the best model based on CV score

# select and finalize the best model based on 'F1' on hold_out set
best_model_holdout = automl(optimize = 'F1', use_holdout = True)

# save model
# Persist the model selected above; the original passed `model`, a name this
# snippet never defines, which would raise NameError when run as written.
# The path is a placeholder — replace it with a real location.
save_model(best_model, 'c:/path-to-directory/model-name')

# load model

## setup.py
# Import module
from pycaret.classification import *

# Initialize setup (when using Notebook environment)
# NOTE: the setup() calls in this snippet are alternatives for different
# environments; each one rebinds `clf1`, so keep only the one that applies.
clf1 = setup(data, target = 'target-variable')

# Initialize setup (outside of Notebook environment); html = False is passed
# here — presumably it disables the notebook-style HTML output (confirm in PyCaret docs).
clf1 = setup(data, target = 'target-variable', html = False)

# Initialize setup (When using remote execution such as Kaggle / GitHub actions / CI-CD pipelines)

## predict_model.py
# train a catboost model; the result of create_model() is bound to `catboost`
catboost = create_model('catboost')

# predict on holdout set (when no data is passed)
pred_holdout = predict_model(catboost)

# predict on new dataset
# NOTE(review): `pd` (pandas) is not imported in this snippet; it must already be in scope.
new_data = pd.read_csv('new-data.csv')
pred_new = predict_model(catboost, data = new_data)
	# Work on an independent copy so the original `data` frame is left unmodified.
	data2 = data.copy()
	# Drop every row that contains at least one missing value.
	data2.dropna(axis=0, inplace=True)
	# Recode the 1/0 target as 'Yes'/'No' labels (used below as the scatter colour).
	data2['Converted'] = data2['Converted'].replace({1 : 'Yes', 0 : 'No'})

	# plotly visual
	import plotly.express as px

	fig = px.scatter(x=data2['Total Time Spent on Website'], y=data2['Asymmetrique Activity Score'],
	color = data2['Converted'], template = 'plotly_white',
	name: PyCaret AutoML Git Action
	on:
	push :
	branches: [ master ]
	jobs:
	build:
	runs-on: ubuntu-latest
	steps:
	- name: PyCaret AutoML Git Action
	id: model
	name: "PyCaret AutoML Git Action"
	description: "A simple example of AutoML created using PyCaret 2.0"
	author: "Moez Ali"
	inputs:
	DATASET:
	description: "Dataset for Training"
	required: true
	default: "juice"
	TARGET:
	description: "Name of Target variable"
	# Slim Python 3.7 base image.
	FROM python:3.7-slim

	# All following instructions run relative to /app.
	WORKDIR /app

	# COPY is preferred over ADD for plain local files: ADD additionally
	# auto-extracts local archives and can fetch URLs, neither of which is wanted here.
	COPY . /app

	# libgomp1 is the GNU OpenMP runtime (presumably needed by a compiled
	# dependency in requirements.txt -- confirm). Skip recommended packages and
	# remove the apt package lists in the same layer so they do not bloat the image.
	RUN apt-get update \
	    && apt-get install -y --no-install-recommends libgomp1 \
	    && rm -rf /var/lib/apt/lists/*

	# --no-cache-dir stops pip from storing its download cache in the image layer.
	RUN pip install --no-cache-dir --trusted-host pypi.python.org -r requirements.txt
	import os, ast
	import pandas as pd

	# Action inputs are read from INPUT_<NAME> environment variables;
	# a missing variable raises KeyError immediately.
	dataset = os.environ["INPUT_DATASET"]
	target = os.environ["INPUT_TARGET"]
	usecase = os.environ["INPUT_USECASE"]

	# Raw-content URL of <dataset>.csv on the master branch of the calling repository.
	repo = os.environ["GITHUB_REPOSITORY"]
	dataset_path = "https://raw.githubusercontent.com/" + repo + "/master/" + dataset + '.csv'
	data = pd.read_csv(dataset_path)
	data.head()
	# import libraries
	import pandas as pd
	import sys

	# define command line parameters
	# Expected invocation: python script.py <data> <target>
	# Both values are raw strings; an IndexError is raised if either argument is missing.
	data = sys.argv[1]
	target = sys.argv[2]

	# load data (replace this part with your own script)
	from pycaret.datasets import get_data
	# import classification module
	from pycaret.classification import *

	# init setup
	# 'name-of-target' and 'exp-name-here' look like placeholders to be replaced with
	# the real target column and experiment name. log_experiment = True is passed
	# here — presumably this enables MLflow logging (confirm in PyCaret docs).
	clf1 = setup(data, target = 'name-of-target', log_experiment = True, experiment_name = 'exp-name-here')

	# compare models; the value returned by compare_models() is kept in `best`
	best = compare_models()

	# start mlflow server on localhost:5000 (when using notebook)
	# select and finalize the best model in the active run
	best_model = automl() #returns the best model based on CV score

	# select and finalize the best model based on 'F1' on hold_out set
	best_model_holdout = automl(optimize = 'F1', use_holdout = True)

	# save model
	# Persist the model selected above; the original passed `model`, a name this
	# snippet never defines, which would raise NameError when run as written.
	# The path is a placeholder — replace it with a real location.
	save_model(best_model, 'c:/path-to-directory/model-name')

	# load model
	# Import module
	from pycaret.classification import *

	# Initialize setup (when using Notebook environment)
	# NOTE: the setup() calls in this snippet are alternatives for different
	# environments; each one rebinds `clf1`, so keep only the one that applies.
	clf1 = setup(data, target = 'target-variable')

	# Initialize setup (outside of Notebook environment); html = False is passed
	# here — presumably it disables the notebook-style HTML output (confirm in PyCaret docs).
	clf1 = setup(data, target = 'target-variable', html = False)

	# Initialize setup (When using remote execution such as Kaggle / GitHub actions / CI-CD pipelines)
	# train a catboost model; the result of create_model() is bound to `catboost`
	catboost = create_model('catboost')

	# predict on holdout set (when no data is passed)
	pred_holdout = predict_model(catboost)

	# predict on new dataset
	# NOTE(review): `pd` (pandas) is not imported in this snippet; it must already be in scope.
	new_data = pd.read_csv('new-data.csv')
	pred_new = predict_model(catboost, data = new_data)