Created
December 29, 2020 14:22
-
-
Save ravenscroftj/1167487c0262b8dd1d92bcf4c2b7efd2 to your computer and use it in GitHub Desktop.
MLflow + NLP training script using TF-IDF and Random Forest on the 20 Newsgroups dataset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# coding: utf-8 | |
import mlflow | |
import mlflow.sklearn | |
import json | |
import os | |
import tempfile | |
import pandas as pd | |
from sklearn.metrics import f1_score, classification_report, plot_confusion_matrix | |
from mlflow.models.signature import infer_signature | |
from sklearn.datasets import fetch_20newsgroups | |
def df_from_20ng(subset):
    """Load one split of the 20 newsgroups corpus into a DataFrame.

    Args:
        subset: Name of the split to load. It is forwarded unchanged to
            ``fetch_20newsgroups``; this script calls it with ``'train'``
            and ``'test'``.

    Returns:
        A DataFrame with three columns: ``text`` (the document), ``target``
        (the integer class index) and ``target_name`` (the newsgroup name
        looked up from that index).
    """
    # Honour the requested split. 'train' used to be hard-coded here, which
    # meant the "test" frame was silently a second copy of the training data
    # and every evaluation metric was computed on data the model had seen.
    newsgroups = fetch_20newsgroups(subset=subset)
    ngdata = {"text": newsgroups.data, "target": newsgroups.target}
    df = pd.DataFrame.from_dict(ngdata)
    # Translate each integer label into its human-readable newsgroup name.
    df['target_name'] = df.target.apply(lambda x: newsgroups.target_names[x])
    return df
# Request the 'train' and 'test' splits as DataFrames.
df_train, df_test = df_from_20ng('train'), df_from_20ng('test')

# Features are whatever remains once both label columns are removed;
# the prediction target is the newsgroup name rather than its integer index.
X_train = df_train.drop(['target', 'target_name'], axis=1)
y_train = df_train.loc[:, 'target_name']

X_test = df_test.drop(['target', 'target_name'], axis=1)
y_test = df_test.loc[:, 'target_name']
from sklearn.compose import ColumnTransformer | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.pipeline import Pipeline | |
from sklearn.ensemble import RandomForestClassifier | |
# Feature extraction: run a TF-IDF vectoriser (capped at 5000 features)
# over the 'text' column of the input frame.
ct = ColumnTransformer(
    transformers=[
        ('tfidf', TfidfVectorizer(max_features=5000), 'text'),
    ],
)

# Full model: column transformer followed by a small random forest
# (10 trees, each limited to depth 20).
pipe = Pipeline(
    steps=[
        ('ctransformer', ct),
        ('clf', RandomForestClassifier(n_estimators=10, max_depth=20)),
    ],
)
mlflow.set_experiment("My NLP Model")

# Everything logged inside this context is attached to a single MLflow run.
with mlflow.start_run(run_name="TFIDF + Random Forest"):
    # Train on the training frame, then predict labels for the test frame.
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    mlflow.set_tag('client', 'That Email Company')

    # NOTE(review): the signature's output side is inferred from the true
    # labels (y_test), not from the model's predictions (y_pred) -- confirm
    # this is intended.
    signature = infer_signature(X_test, y_test)

    micro_f1 = f1_score(y_test, y_pred, average='micro')
    mlflow.log_metric('f1', micro_f1)

    mlflow.sklearn.log_model(pipe, "model", signature=signature)

    # Write the per-class report into a throwaway directory and upload that
    # directory's contents under the "reporting" artifact path. The upload
    # happens inside the context so the file still exists when it is logged.
    with tempfile.TemporaryDirectory() as tmpdir:
        report = classification_report(y_test, y_pred, output_dict=True)

        report_path = os.path.join(tmpdir, "classification_report.json")
        with open(report_path, 'w') as report_file:
            report_file.write(json.dumps(report, indent=2))

        mlflow.log_artifacts(tmpdir, "reporting")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment