Skip to content

Instantly share code, notes, and snippets.

@trainorpj trainorpj/PJDF.py
Created Mar 3, 2019

Embed
What would you like to do?
extending pandas dataframes
import pandas as pd
class PJDF(pd.DataFrame):
    """A ``pandas.DataFrame`` subclass carrying a few extra ``pj_*`` helpers.

    ``_constructor`` is overridden to return ``PJDF`` so that pandas
    operations which build their result through it keep the subclass, which
    lets the helpers below be chained with ordinary DataFrame methods.
    """

    def __init__(self, *args, **kwargs):
        super(PJDF, self).__init__(*args, **kwargs)

    @property
    def _constructor(self):
        return PJDF

    # == ^ don't change anything above this line ^ ==
    # ===============================================

    # =================================
    # == add your own methods below! ==
    # =================================

    def pj_decode_one_hot_encoding(self, columns, new_column_name="DECODED_OHE"):
        """Combine one-hot encoded columns into one.

        Parameters
        ==========
        columns : list of columns that are one-hot encoded
        new_column_name : name of column with combined categories, default is "DECODED_OHE"

        Returns
        =======
        a new PJDF without `columns`, and with `new_column_name` appended as
        the last column. Each row holds the name of the column in `columns`
        that has the largest value in that row (pandas' `idxmax` picks the
        first one on ties, so a row with no flag set decodes to the first
        column in `columns`). The original frame is left untouched.
        """
        decoded = self[columns].idxmax(axis=1)
        result = self.drop(columns=columns)
        # Insert under the final name directly instead of going through a
        # temporary "_decoded" column: the temporary would overwrite (and
        # then rename away) a genuine column that happens to be called
        # "_decoded". `allow_duplicates=True` keeps the previous behaviour
        # when `new_column_name` already exists among the remaining columns.
        result.insert(
            len(result.columns), new_column_name, decoded, allow_duplicates=True
        )
        return result

    def pj_encode_one_hot_encoding(self, column, prefix=""):
        """Split a categorical column into one-hot encoded columns.

        Parameters
        ==========
        column : column that contains categories
        prefix : prefix for new columns, after ohe

        Returns
        =======
        a PJDF, without `column`, and with the one-hot encoded columns
        appended. If there were 4 distinct categories in `column`, then there
        would be four new columns, comprised of 0s and 1s (dtype uint8).
        These columns will be named after the categories, along with the
        `prefix`, if it's specified.

        Raises
        ======
        ValueError : if a generated column name already exists in the frame
        """
        # Pin the dtype: newer pandas releases default to boolean dummies,
        # which would contradict the "0s and 1s" contract above.
        ohe = pd.get_dummies(self[column], dtype="uint8").rename(
            columns=lambda cat: "{0}{1}".format(prefix, cat)
        )
        remaining = self.drop(columns=column)

        overlap = remaining.columns.intersection(ohe.columns)
        if len(overlap) > 0:
            raise ValueError(
                "one-hot columns overlap with existing columns: {0}".format(
                    list(overlap)
                )
            )

        # `ohe` shares this frame's index row for row, so glue the pieces
        # together side by side. An index-on-index `join` would multiply rows
        # whenever the index contains repeated labels.
        return self._constructor(pd.concat([remaining, ohe], axis=1))

    def pj_filter_by_value_counts(self, column, min_count=1):
        """Filter out categories based on how many are present.

        Parameters
        ==========
        column : column to group on, think of df[column].value_counts()
        min_count : minimum number of occurrences for filtering, e.g. keep anything with at
            least this many occurrences

        Returns
        =======
        a PJDF, with categories that didn't meet the criteria removed.
        Note: the index remains the same, so you can still use df.loc
        """
        return (
            self
            .groupby(column)
            .filter(lambda xs: len(xs) >= min_count)
        )
from PJDF import PJDF
from seaborn import load_dataset
# Shared fixture: the seaborn iris dataset wrapped as a PJDF.
iris = PJDF(load_dataset("iris"))
def test_df():
    """PJDF wraps iris and stays a PJDF through ordinary pandas operations."""
    all_columns = [
        "sepal_length",
        "sepal_width",
        "petal_length",
        "petal_width",
        "species",
    ]

    assert isinstance(iris, PJDF)
    assert list(iris.columns) == all_columns

    # exercise plain pandas functionality
    dropped_df = iris.drop(columns="petal_length")

    # the result must still be a PJDF, not a plain pd.DataFrame
    assert isinstance(dropped_df, PJDF)

    # drop must not have modified iris itself
    assert list(iris.columns) == all_columns

    # and the dropped column must be gone from the result
    assert list(dropped_df.columns) == [
        name for name in all_columns if name != "petal_length"
    ]
def test_ohe():
    """One-hot encoding species and decoding it again yields the expected columns."""
    measurements = [
        "sepal_length",
        "sepal_width",
        "petal_length",
        "petal_width",
    ]
    species = ["setosa", "versicolor", "virginica"]

    # encode without a prefix: new columns are named after the categories
    ohe1 = iris.pj_encode_one_hot_encoding("species")
    assert ohe1.shape == (150, 7)
    assert list(ohe1.columns) == measurements + species

    # encode with a prefix: the prefix is prepended to every category name
    ohe2 = iris.pj_encode_one_hot_encoding("species", prefix="species_")
    assert list(ohe2.columns) == measurements + [
        "species_" + name for name in species
    ]

    # decode: the three indicator columns collapse into a single column
    ohe1 = ohe1.pj_decode_one_hot_encoding(species, "SPECIES")
    assert list(ohe1.columns) == measurements + ["SPECIES"]
def test_value_count_filter():
    """Categories with fewer than `min_count` rows are filtered out."""
    # skip the first 10 rows so the species are no longer evenly distributed
    uneven = iris.loc[10:]
    assert uneven.shape == (140, 5)

    # setosa now appears only 40 times, one short of the 41 required, so its
    # 40 rows should be removed
    # note: this also suggests that PJDF is preserved under .loc indexing
    filtered = uneven.pj_filter_by_value_counts("species", min_count=41)
    assert filtered.shape == (100, 5)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.