BenjaminWolfe/test_chaining_series.py

## test_chaining_series.py
import numpy as np, pandas as pd

df_len = 1000  # integer multiple of 4, so df_len // 4 below is exact
np.random.seed(42)  # fixed seed: every run builds the same frame

# Create a random data frame: two grouping keys drawn from [0, df_len // 4)
# and one 0/1 column. Floor division keeps randint's upper bound an int;
# `df_len / 4` would hand it a float.
df = pd.DataFrame(
    {
        "group_a": np.random.randint(0, df_len // 4, size=df_len),
        "group_b": np.random.randint(0, df_len // 4, size=df_len),
        "binary": np.random.randint(0, 2, size=df_len),
    }
)
print(df)

# look for non-unique values
def get_non_unique(df=df, groupby=["group_a", "group_b"], check="binary"):
    """Return the groups in which `check` takes more than one distinct value.

    Args:
        df: Frame to inspect; defaults to the module-level `df` as it was
            bound when this function was defined.
        groupby: Column label(s) to group on.
        check: Column whose distinct values are counted within each group.

    Returns:
        The per-group distinct-value counts of `check`, restricted to the
        groups where that count exceeds 1.
    """
    distinct_per_group = df.groupby(groupby)[check].nunique()
    has_several = distinct_per_group > 1  # boolean mask, built in one step
    return distinct_per_group[has_several]

# show the groups found with the default arguments (boolean-mask version)
print(get_non_unique())

# look for non-unique values, but chain using .loc[] and a lambda function
def get_non_unique_chained(df=df, groupby=["group_a", "group_b"], check="binary"):
    """Return the groups in which `check` takes more than one distinct value.

    The whole computation is one method chain: the filter is a callable
    passed to `.loc[]`, so no intermediate variable is needed.

    Args:
        df: Frame to inspect; defaults to the module-level `df` as it was
            bound when this function was defined.
        groupby: Column label(s) to group on.
        check: Column whose distinct values are counted within each group.

    Returns:
        The per-group distinct-value counts of `check`, restricted to the
        groups where that count exceeds 1.
    """
    return (
        df.groupby(groupby)[check]
        .nunique()
        # .loc calls the lambda once with the whole Series of counts
        .loc[lambda n_distinct: n_distinct > 1]
    )

print(get_non_unique_chained())

# Check that the two get the same result. assert_series_equal compares index,
# values, dtype and name and reports what differs; an elementwise `!=` raises
# ValueError on differently-labeled Series instead of failing the check, and a
# bare `assert` is stripped under `python -O`.
pd.testing.assert_series_equal(get_non_unique(), get_non_unique_chained())

# Time the two. I was concerned that a lambda function would be non-vectorized
# and slower; it appears that, in at least some simple cases, performance is
# equivalent.
# `%timeit` is an IPython magic and a syntax error under plain `python`, so the
# stdlib timer is used instead; it runs in IPython/Jupyter as well.
import timeit

for timed_func in (get_non_unique, get_non_unique_chained):
    # autorange() picks the loop count itself and returns (loops, total seconds)
    loops, elapsed = timeit.Timer(timed_func).autorange()
    print(
        f"{timed_func.__name__}: {elapsed / loops * 1e3:.3f} ms per loop "
        f"({loops} loops)"
    )
	import numpy as np, pandas as pd

	df_len = 1000  # integer multiple of 4, so df_len // 4 below is exact
	np.random.seed(42)  # fixed seed: every run builds the same frame

	# Create a random data frame: two grouping keys drawn from [0, df_len // 4)
	# and one 0/1 column. Floor division keeps randint's upper bound an int;
	# `df_len / 4` would hand it a float.
	df = pd.DataFrame(
		{
			"group_a": np.random.randint(0, df_len // 4, size=df_len),
			"group_b": np.random.randint(0, df_len // 4, size=df_len),
			"binary": np.random.randint(0, 2, size=df_len),
		}
	)
	print(df)

	# look for non-unique values
	def get_non_unique(df=df, groupby=["group_a", "group_b"], check="binary"):
		"""Return the groups in which `check` takes more than one distinct value.

		Args:
			df: Frame to inspect; defaults to the `df` in scope when this
				function was defined.
			groupby: Column label(s) to group on.
			check: Column whose distinct values are counted within each group.

		Returns:
			The per-group distinct-value counts of `check`, restricted to the
			groups where that count exceeds 1.
		"""
		# the body must be indented under the def; it had been flattened
		counts = df.groupby(groupby)[check].nunique()
		return counts[counts > 1]

	# show the groups found with the default arguments (boolean-mask version)
	print(get_non_unique())

	# look for non-unique values, but chain using .loc[] and a lambda function
	def get_non_unique_chained(df=df, groupby=["group_a", "group_b"], check="binary"):
		"""Return the groups in which `check` takes more than one distinct value.

		The whole computation is one method chain: the filter is a callable
		passed to `.loc[]`, so no intermediate variable is needed.

		Args:
			df: Frame to inspect; defaults to the `df` in scope when this
				function was defined.
			groupby: Column label(s) to group on.
			check: Column whose distinct values are counted within each group.

		Returns:
			The per-group distinct-value counts of `check`, restricted to the
			groups where that count exceeds 1.
		"""
		# the body must be indented under the def; it had been flattened
		return df.groupby(groupby)[check].nunique().loc[lambda x: x > 1]

	print(get_non_unique_chained())

	# Check that the two get the same result. assert_series_equal compares index,
	# values, dtype and name and reports what differs; an elementwise `!=` raises
	# ValueError on differently-labeled Series instead of failing the check, and a
	# bare `assert` is stripped under `python -O`.
	pd.testing.assert_series_equal(get_non_unique(), get_non_unique_chained())

	# Time the two. I was concerned that a lambda function would be non-vectorized
	# and slower; it appears that, in at least some simple cases, performance is
	# equivalent.
	# `%timeit` is an IPython magic and a syntax error under plain `python`, so the
	# stdlib timer is used instead; it runs in IPython/Jupyter as well.
	import timeit

	for timed_func in (get_non_unique, get_non_unique_chained):
		# autorange() picks the loop count itself and returns (loops, total seconds)
		loops, elapsed = timeit.Timer(timed_func).autorange()
		print(
			f"{timed_func.__name__}: {elapsed / loops * 1e3:.3f} ms per loop "
			f"({loops} loops)"
		)