Skip to content

Instantly share code, notes, and snippets.

@csiebler
Created May 5, 2020 09:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save csiebler/50a1192012a81d58f145ac65feef8858 to your computer and use it in GitHub Desktop.
Save csiebler/50a1192012a81d58f145ac65feef8858 to your computer and use it in GitHub Desktop.
A short example for train.py
import os
import sys
import argparse
import joblib
import pandas as pd
from azureml.core import Run
from azureml.core.run import Run
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
# All the imports go here...
def getRuntimeArgs(argv=None):
    """Parse the command-line arguments of the training script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, so
            existing zero-argument callers keep working.

    Returns:
        argparse.Namespace: parsed arguments; ``data_path`` holds the
        directory that contains the training data.

    Raises:
        SystemExit: raised by argparse when ``--data-path`` is missing or an
            unrecognised argument is supplied.
    """
    parser = argparse.ArgumentParser()
    # Required on purpose: main() joins this value with the CSV file name,
    # and a missing value (None) would only surface there as a TypeError.
    parser.add_argument(
        '--data-path',
        type=str,
        required=True,
        help='Directory containing the training data',
    )
    # Add more arguments here
    return parser.parse_args(argv)
def main():
    """Run the training script end to end.

    Reads the command-line arguments, obtains the Azure Machine Learning
    run context, loads ``german_credit_data.csv`` from the given data
    directory, trains the model and stores it as ``./outputs/model.pkl``.
    """
    # Command-line configuration (data directory)
    cli_args = getRuntimeArgs()

    # Run context of the current Azure Machine Learning experiment
    aml_run = Run.get_context()

    # Load the training data from the data directory
    csv_path = os.path.join(cli_args.data_path, 'german_credit_data.csv')
    credit_data_df = pd.read_csv(csv_path)

    # Train the model; metrics are logged to the run inside model_train
    trained_model = model_train(credit_data_df, aml_run)

    # Anything written to the "outputs" directory is uploaded to Azure ML
    # automatically, so persist the fitted model there.
    output_dir = './outputs/'
    os.makedirs(output_dir, exist_ok=True)
    model_path = os.path.join(output_dir, 'model.pkl')
    joblib.dump(value=trained_model, filename=model_path)
# Do your training here
def _make_dense_one_hot_encoder():
    """Build a dense-output OneHotEncoder that works across scikit-learn versions."""
    try:
        # scikit-learn >= 1.2 names the dense-output switch ``sparse_output``.
        return OneHotEncoder(categories='auto', handle_unknown='ignore',
                             sparse_output=False)
    except TypeError:
        # Older releases only accept the (later removed) ``sparse`` keyword.
        return OneHotEncoder(categories='auto', handle_unknown='ignore',
                             sparse=False)


def model_train(ds_df, run):
    """Train a logistic-regression credit-risk classifier and log its accuracy.

    Args:
        ds_df: pandas DataFrame with the target column ``Risk`` and,
            optionally, a serial-number column ``Sno`` that is ignored.
            The frame is not modified.
        run: object with a ``log(name, value)`` method (the Azure ML run
            context in ``main``) that receives the metrics
            ``'Train accuracy'`` and ``'Test accuracy'``.

    Returns:
        sklearn.pipeline.Pipeline: the fitted pipeline (preprocessing followed
        by ``LogisticRegression``). The label encoder is not returned, so the
        pipeline predicts the integer-encoded ``Risk`` classes.

    Raises:
        KeyError: if ``ds_df`` has no ``Risk`` column.
    """
    # Work on a copy without the serial number instead of dropping in place,
    # so the caller's DataFrame stays intact; tolerate data without "Sno".
    data_df = ds_df.drop(columns=["Sno"], errors="ignore")

    y_raw = data_df['Risk']
    X_raw = data_df.drop('Risk', axis=1)

    categorical_features = X_raw.select_dtypes(include=['object']).columns
    numeric_features = X_raw.select_dtypes(include=['int64', 'float']).columns

    # Categorical columns: fill gaps with a "missing" category, then one-hot
    # encode. Unknown categories are ignored so that scoring the held-out
    # split cannot fail on a value that was absent from the training split.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value="missing")),
        ('onehotencoder', _make_dense_one_hot_encoder()),
    ])

    # Numeric columns: standardise only.
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler()),
    ])

    # Columns that are neither numeric nor categorical are dropped.
    feature_engineering_pipeline = ColumnTransformer(
        transformers=[
            ('numeric', numeric_transformer, numeric_features),
            ('categorical', categorical_transformer, categorical_features),
        ],
        remainder="drop",
    )

    # Encode labels
    le = LabelEncoder()
    encoded_y = le.fit_transform(y_raw)

    # Stratified 80/20 train/test split with a fixed seed for reproducibility
    X_train, X_test, y_train, y_test = train_test_split(
        X_raw, encoded_y, test_size=0.20, stratify=encoded_y, random_state=42)

    # Create the sklearn pipeline: preprocessing + classifier
    lr_clf = Pipeline(steps=[
        ('preprocessor', feature_engineering_pipeline),
        ('classifier', LogisticRegression(solver="lbfgs")),
    ])

    # Train the model
    lr_clf.fit(X_train, y_train)

    # Capture metrics
    train_acc = lr_clf.score(X_train, y_train)
    test_acc = lr_clf.score(X_test, y_test)
    print(f"Training accuracy: {train_acc:.3f}")
    print(f"Test data accuracy: {test_acc:.3f}")

    # Log metrics to Azure ML
    run.log('Train accuracy', train_acc)
    run.log('Test accuracy', test_acc)

    return lr_clf
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment