Skip to content

Instantly share code, notes, and snippets.

@mndrake
Last active June 3, 2022 17:51
Show Gist options
  • Save mndrake/de5f9b5929715b823f15c844d47d308e to your computer and use it in GitHub Desktop.
Save mndrake/de5f9b5929715b823f15c844d47d308e to your computer and use it in GitHub Desktop.
Extract Cluster Comments for Anomaly
from urllib.parse import urlparse
import re
import dataiku
import pandas as pd
# Identifiers passed to the Dataiku API below: the project, the analysis and
# ML task inside it, and the trained model whose facts are read.
PROJECT_ID = 'CUSTOMERSEGMENTATION'
ANALYSIS_ID = 'UjW24hJ1'
ML_TASK_ID = 'LsiobCLw'
MODEL_ID = 'A-CUSTOMERSEGMENTATION-UjW24hJ1-LsiobCLw-s1-pp1-m1'

# Name of the Dataiku dataset the generated comments are written to.
OUTPUT_TABLE = 'cluster_comments'

# Walk from the API client down to the trained model's snippet.
client = dataiku.api_client()
project = client.get_project(PROJECT_ID)
ml_task = project.get_ml_task(
    analysis_id=ANALYSIS_ID,
    mltask_id=ML_TASK_ID,
)
trained_model_info = ml_task.get_trained_model_snippet(MODEL_ID)

# One DataFrame row per entry found under facts -> global -> facts.
# NOTE(review): presumably each entry is a dict describing one observation
# about a cluster -- confirm against the Dataiku snippet payload.
global_facts = trained_model_info['facts']['global']['facts']
anomoly_df = pd.DataFrame(list(global_facts))
def process_observation_record(row):
    """Build a human-readable comment for one observation record.

    Args:
        row: Mapping-like record read by key (for example a pandas row or a
            dict). When ``row['type'] == 'numerical'`` it must provide
            ``mean``, ``global_mean`` and ``feature_label``. Any other type
            is treated as categorical and must provide ``current_ratio``,
            ``global_ratio``, ``category_value`` and ``feature_label``.

    Returns:
        str: A sentence comparing the record's value with the global value.
    """
    if row['type'] == 'numerical':
        mean = row['mean']
        global_mean = row['global_mean']
        # Compare the two means directly: the sign of ``mean / global_mean - 1``
        # flips when the global mean is negative and would report the wrong
        # direction. Equal means keep the original "smaller" wording.
        direction = 'greater' if mean > global_mean else 'smaller'
        if global_mean == 0:
            # A relative difference against zero is undefined, so the
            # percentage is omitted instead of dividing by zero.
            result = f"{row['feature_label']} is in average {direction}"
        else:
            difference = mean / global_mean - 1
            result = f"{row['feature_label']} is in average {abs(difference):.2%} {direction}"
        result += f" : mean of {mean:.2f} against {global_mean:.2f} globally"
    else:  # categorical
        result = f"{row['current_ratio']:.0%} of the cluster has {row['category_value']} for {row['feature_label']} "
        result += f"(against {row['global_ratio']:.0%} globally)"
    return result
# Generate one comment string per record (row-wise apply).
anomoly_df['comment'] = anomoly_df.apply(
    process_observation_record,
    axis=1,
)

# Only the cluster label and its generated comment are written out.
output_columns = ['cluster_label', 'comment']
result_df = anomoly_df[output_columns]

# Write the result to the output dataset, setting its schema from the frame.
anomoly_cluster_comments = dataiku.Dataset(OUTPUT_TABLE)
anomoly_cluster_comments.write_with_schema(result_df)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment