Skip to content

Instantly share code, notes, and snippets.

@ravenscroftj
Created December 29, 2020 14:22
Show Gist options
  • Save ravenscroftj/1167487c0262b8dd1d92bcf4c2b7efd2 to your computer and use it in GitHub Desktop.
Save ravenscroftj/1167487c0262b8dd1d92bcf4c2b7efd2 to your computer and use it in GitHub Desktop.
MLflow + NLP training script using TF-IDF and Random Forest on the 20 Newsgroups dataset
#!/usr/bin/env python
# coding: utf-8
import mlflow
import mlflow.sklearn
import json
import os
import tempfile
import pandas as pd
from sklearn.metrics import f1_score, classification_report, plot_confusion_matrix
from mlflow.models.signature import infer_signature
from sklearn.datasets import fetch_20newsgroups
def df_from_20ng(subset):
    """Load one split of the 20 newsgroups corpus into a DataFrame.

    Args:
        subset: Which split to fetch; forwarded unchanged to
            ``fetch_20newsgroups`` (this script calls the function with
            ``'train'`` and ``'test'``).

    Returns:
        A DataFrame with one row per document and three columns:
        ``text`` (the document text), ``target`` (the integer class index)
        and ``target_name`` (the newsgroup name looked up from that index).
    """
    # Honour the requested split. The previous version hard-coded
    # subset='train', so the "test" frame was just a second copy of the
    # training data and every evaluation metric was computed on documents
    # the model had already been fitted on.
    newsgroups = fetch_20newsgroups(subset=subset)
    ngdata = {"text": newsgroups.data, "target": newsgroups.target}
    df = pd.DataFrame.from_dict(ngdata)
    # Translate each integer label into its human-readable newsgroup name.
    df['target_name'] = df.target.apply(lambda x: newsgroups.target_names[x])
    return df
# Build the train and test frames, then separate the model input (what is
# left after dropping both label columns) from the label the classifier is
# trained on (the newsgroup name).
df_train, df_test = df_from_20ng('train'), df_from_20ng('test')

X_train, y_train = df_train.drop(columns=['target', 'target_name']), df_train['target_name']
X_test, y_test = df_test.drop(columns=['target', 'target_name']), df_test['target_name']
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
# Feature extraction: a TF-IDF vectorizer applied to the 'text' column,
# configured with max_features=5000.
tfidf_step = ('tfidf', TfidfVectorizer(max_features=5000), 'text')
ct = ColumnTransformer([tfidf_step])

# Classifier: a random forest with 10 estimators and max_depth=20.
forest = RandomForestClassifier(n_estimators=10, max_depth=20)

# Chain both stages so fit/predict can be called on the DataFrame directly.
pipe = Pipeline([
    ('ctransformer', ct),
    ('clf', forest),
])
mlflow.set_experiment("My NLP Model")

# Everything below is recorded against a single MLflow run.
with mlflow.start_run(run_name="TFIDF + Random Forest"):
    # Fit on the training frame and predict on the test frame.
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    mlflow.set_tag('client', 'That Email Company')

    signature = infer_signature(X_test, y_test)

    # Log the micro-averaged F1 score, then the fitted pipeline itself.
    micro_f1 = f1_score(y_test, y_pred, average='micro')
    mlflow.log_metric('f1', micro_f1)
    mlflow.sklearn.log_model(pipe, "model", signature=signature)

    # Write the per-class report to a scratch directory and upload that
    # directory as run artifacts under "reporting".
    with tempfile.TemporaryDirectory() as tmpdir:
        report = classification_report(y_test, y_pred, output_dict=True)

        report_path = os.path.join(tmpdir, "classification_report.json")
        with open(report_path, 'w') as report_file:
            json.dump(report, report_file, indent=2)

        # Upload after the file has been closed but while the temporary
        # directory still exists.
        mlflow.log_artifacts(tmpdir, "reporting")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment