# mndrake/anomaly_comments_recipe.py

## anomaly_comments_recipe.py
from urllib.parse import urlparse
import re
import dataiku
import pandas as pd

# Identifiers used below to locate the trained model through the Dataiku API:
# project, then analysis + ML task, then the model itself.
PROJECT_ID = 'CUSTOMERSEGMENTATION'
ANALYSIS_ID = 'UjW24hJ1'
ML_TASK_ID = 'LsiobCLw'
MODEL_ID = 'A-CUSTOMERSEGMENTATION-UjW24hJ1-LsiobCLw-s1-pp1-m1'

# Name of the dataset the generated comments are written to.
OUTPUT_TABLE = 'cluster_comments'

# Walk the API from the client down to the trained-model snippet.
client = dataiku.api_client()
project = client.get_project(PROJECT_ID)
ml_task = project.get_ml_task(
    analysis_id=ANALYSIS_ID,
    mltask_id=ML_TASK_ID,
)
trained_model_info = ml_task.get_trained_model_snippet(MODEL_ID)

# One DataFrame row per entry of the snippet's global facts collection.
# NOTE(review): presumably each entry is a dict describing one cluster
# observation -- confirm against the Dataiku API documentation.
anomoly_df = pd.DataFrame(list(trained_model_info['facts']['global']['facts']))

def process_observation_record(row) -> str:
    """Turn one fact record into a human-readable comment.

    Args:
        row: Record indexable by key (a DataFrame row or a plain dict).
            'type' and 'feature_label' are always read. When 'type' is
            'numerical', 'mean' and 'global_mean' are read; otherwise
            'current_ratio', 'category_value' and 'global_ratio' are read.

    Returns:
        The comment sentence for this record.
    """
    if row['type'] == 'numerical':
        mean = row['mean']
        global_mean = row['global_mean']
        # Compare the two values directly: the sign of mean/global_mean - 1
        # is inverted whenever global_mean is negative.
        direction = 'greater' if mean > global_mean else 'smaller'
        result = f"{row['feature_label']} is in average "
        if global_mean != 0:
            difference = mean / global_mean - 1
            result += f"{abs(difference):.2%} "
        # With a global mean of 0 the relative difference is undefined, so
        # the percentage is left out instead of dividing by zero.
        result += f"{direction}"
        result += f" : mean of {mean:.2f} against {global_mean:.2f} globally"
    else:  # categorical
        result = f"{row['current_ratio']:.0%} of the cluster has {row['category_value']} for {row['feature_label']} "
        result += f"(against {row['global_ratio']:.0%} globally)"
    return result

# Build the comment sentence for every fact row (row-wise apply).
anomoly_df['comment'] = anomoly_df.apply(process_observation_record, axis=1)

# Only the cluster label and its comment go to the output.
result_df = anomoly_df[['cluster_label', 'comment']]

# Write the result to the output dataset.
# NOTE(review): write_with_schema presumably also sets the dataset schema
# from the DataFrame -- confirm in the Dataiku documentation.
anomoly_cluster_comments = dataiku.Dataset(OUTPUT_TABLE)
anomoly_cluster_comments.write_with_schema(result_df)
	from urllib.parse import urlparse
	import re
	import dataiku
	import pandas as pd

	# Identifiers used below to locate the trained model through the Dataiku API:
	# project, then analysis + ML task, then the model itself.
	PROJECT_ID = 'CUSTOMERSEGMENTATION'
	ANALYSIS_ID = 'UjW24hJ1'
	ML_TASK_ID = 'LsiobCLw'
	MODEL_ID = 'A-CUSTOMERSEGMENTATION-UjW24hJ1-LsiobCLw-s1-pp1-m1'

	# Name of the dataset the generated comments are written to.
	OUTPUT_TABLE = 'cluster_comments'

	# Walk the API from the client down to the trained-model snippet.
	client = dataiku.api_client()
	project = client.get_project(PROJECT_ID)
	ml_task = project.get_ml_task(
		analysis_id=ANALYSIS_ID,
		mltask_id=ML_TASK_ID,
	)
	trained_model_info = ml_task.get_trained_model_snippet(MODEL_ID)

	# One DataFrame row per entry of the snippet's global facts collection.
	# NOTE(review): presumably each entry is a dict describing one cluster
	# observation -- confirm against the Dataiku API documentation.
	anomoly_df = pd.DataFrame(list(trained_model_info['facts']['global']['facts']))

	def process_observation_record(row) -> str:
		"""Turn one fact record into a human-readable comment.

		Args:
			row: Record indexable by key (a DataFrame row or a plain dict).
				'type' and 'feature_label' are always read. When 'type' is
				'numerical', 'mean' and 'global_mean' are read; otherwise
				'current_ratio', 'category_value' and 'global_ratio' are read.

		Returns:
			The comment sentence for this record.
		"""
		if row['type'] == 'numerical':
			mean = row['mean']
			global_mean = row['global_mean']
			# Compare the two values directly: the sign of mean/global_mean - 1
			# is inverted whenever global_mean is negative.
			direction = 'greater' if mean > global_mean else 'smaller'
			result = f"{row['feature_label']} is in average "
			if global_mean != 0:
				difference = mean / global_mean - 1
				result += f"{abs(difference):.2%} "
			# With a global mean of 0 the relative difference is undefined, so
			# the percentage is left out instead of dividing by zero.
			result += f"{direction}"
			result += f" : mean of {mean:.2f} against {global_mean:.2f} globally"
		else:  # categorical
			result = f"{row['current_ratio']:.0%} of the cluster has {row['category_value']} for {row['feature_label']} "
			result += f"(against {row['global_ratio']:.0%} globally)"
		return result

	# Build the comment sentence for every fact row (row-wise apply).
	anomoly_df['comment'] = anomoly_df.apply(process_observation_record, axis=1)

	# Only the cluster label and its comment go to the output.
	result_df = anomoly_df[['cluster_label', 'comment']]

	# Write the result to the output dataset.
	# NOTE(review): write_with_schema presumably also sets the dataset schema
	# from the DataFrame -- confirm in the Dataiku documentation.
	anomoly_cluster_comments = dataiku.Dataset(OUTPUT_TABLE)
	anomoly_cluster_comments.write_with_schema(result_df)