Created
November 20, 2023 15:24
-
-
Save paddymul/6a5ccfb94b8421a7c9e01734e597a662 to your computer and use it in GitHub Desktop.
Showing how to add counts for unique and nan to buckaroo
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%pip install buckaroo\n", | |
"try:\n", | |
" from google.colab import output\n", | |
" output.enable_custom_widget_manager()\n", | |
"except Exception as e:\n", | |
" print(e)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"import buckaroo\n", | |
"from buckaroo.pluggable_analysis_framework.pluggable_analysis_framework import (ColAnalysis)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Demonstrating Buckaroo on Citibike data.\n", | |
"This might take a little time to download\n", | |
"\n", | |
"*once the view loads click 0's and 1's on the top left to toggle different parts of the UI*" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df = pd.read_csv(\"/Users/paddy/code/example-notebooks/citibike-trips.csv\") #for airplane work\n", | |
"w = buckaroo.BuckarooWidget(df)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [], | |
"source": [ | |
"class Variance(ColAnalysis):\n", | |
" provides_summary = [\"unique_count\", \"nan_count\"]\n", | |
" @staticmethod\n", | |
" def summary(sampled_ser, summary_ser, ser):\n", | |
" l = len(ser)\n", | |
" val_counts = ser.value_counts()\n", | |
" return dict(unique_count=len(val_counts[val_counts==1]), \n", | |
" nan_count=(l - len(ser.dropna())))\n", | |
" summary_stats_display = [\n", | |
" 'dtype', 'length', 'nan_count', 'distinct_count', 'empty_count',\n", | |
" 'empty_per', 'unique_per', 'nan_per', \n", | |
" 'is_numeric', 'is_integer', 'is_datetime',\n", | |
" 'mode', 'min', 'max', 'mean', \n", | |
" # we must add unique_count and nan_count to the list of summary_stats_display, otherwise our new stat won't be displayed\n", | |
" 'unique_count', 'nan_count']\n", | |
"w.add_analysis(Variance)\n", | |
"w" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"analysis is added interactively, toggle the summary stats view on the widget above and notice that `unique_count` and `nan_count` has been added" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.11.5" | |
}, | |
"widgets": { | |
"application/vnd.jupyter.widget-state+json": { | |
"state": {}, | |
"version_major": 2, | |
"version_minor": 0 | |
} | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment