Skip to content

Instantly share code, notes, and snippets.

@jlln
Last active June 3, 2016 06:55
Show Gist options
  • Save jlln/fe8259c56636a59b62c1c102c5ff6141 to your computer and use it in GitHub Desktop.
Save jlln/fe8259c56636a59b62c1c102c5ff6141 to your computer and use it in GitHub Desktop.
Pandas/python function for determining the fractions of examples falling into different groups, taking into account other grouping criteria.
def groupCountFractionals(dataframe, target, outer):
    """Return, per outer group, the fraction of rows falling in each target value.

    Args:
        dataframe: a pandas DataFrame. It is not modified.
        target: name of the column of interest.
        outer: list of column names that define the conditioning (outer)
            groups.

    Returns:
        A new DataFrame with one row per combination of the distinct values
        of the ``outer`` columns and the ``target`` column, with columns
        ``outer + [target, "InnerCount", "OuterCount", "Fraction"]``:

        * ``InnerCount`` - rows matching the outer group AND the target value.
        * ``OuterCount`` - rows in the outer group.
        * ``Fraction``   - ``InnerCount / OuterCount``.

        Combinations that occur in the data come first; combinations that
        never occur follow with ``InnerCount`` and ``Fraction`` set to 0
        (and ``OuterCount`` 0 when the outer group itself never occurs).

    Note:
        Take ``OuterCount`` into account when reading ``Fraction``: with few
        rows in an outer group the fraction is coarse and can look
        misleadingly large.
    """
    outer = list(outer)
    key_columns = outer + [target]

    # size() counts rows per group directly, so no helper "Count" column has
    # to be written into the caller's DataFrame.
    outer_counts = dataframe.groupby(outer).size().reset_index(name="OuterCount")
    inner_counts = (
        dataframe.groupby(key_columns).size().reset_index(name="InnerCount")
    )

    observed = pandas.merge(inner_counts, outer_counts, on=outer, how="right")
    observed["Fraction"] = observed["InnerCount"] / observed["OuterCount"]

    # Enumerate every combination of the distinct key values so that the
    # combinations with no rows can be reported with a count of zero.
    # NaN is skipped because groupby never counts NaN keys either.
    levels = [list(dataframe[column].dropna().unique()) for column in key_columns]
    every_combination = pandas.DataFrame(
        list(itertools.product(*levels)), columns=key_columns
    )

    # Identify the unobserved combinations by key. A left merge keeps exactly
    # one row per combination, so the indicator column lines up with the rows
    # it describes (a mask taken from a differently ordered frame would not).
    flagged = every_combination.merge(
        observed[key_columns], on=key_columns, how="left", indicator=True
    )
    missing = flagged.loc[flagged["_merge"] == "left_only", key_columns]
    if missing.empty:
        return observed

    missing = missing.merge(outer_counts, on=outer, how="left")
    # Outer combinations that never occur have no OuterCount; report 0.
    missing["OuterCount"] = missing["OuterCount"].fillna(0)
    missing["InnerCount"] = 0
    missing["Fraction"] = 0.0
    missing = missing[list(observed.columns)]

    return pandas.concat([observed, missing], ignore_index=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment