# fcunhaneto/pandas-utils-2.py

## pandas-utils-2.py
import numpy as np
import pandas as pd

def frequency_by_buckets(dataframe, column, r, ini, end):
    """
    Creates a dataframe with the absolute, relative and cumulative
    frequencies of a column grouped into fixed-width ranges, which are
    called buckets.

    Buckets are left-closed and right-open (``[a, b)``) and their edges are
    generated with ``range(ini, end + r, r)``, so ``r``, ``ini`` and ``end``
    must be integers. Values that fall outside every bucket (including a
    value equal to the last edge, and NaN) are not counted.

    Dataframe output example (shown for ``column='score'``):

               score  freq  tot_ac  freq_ac
    [40, 50)       8  0.16       8     0.16
    [50, 60)      22  0.44      30     0.60
    [60, 70)       8  0.16      38     0.76
    [70, 80)       6  0.12      44     0.88
    [80, 90)       5  0.10      49     0.98
    [90, 100)      1  0.02      50     1.00

    :param dataframe: the dataframe holding the data
    :param column: the column in which the data will be grouped; it is also
        used as the name of the count column in the result
    :param r: the range between values (bucket width), a positive integer
    :param ini: where the interval begin
    :param end: where the interval end
    :return: dataframe indexed by bucket with the columns ``column`` (count
        per bucket), ``freq`` (relative frequency), ``tot_ac`` (cumulative
        count) and ``freq_ac`` (cumulative relative frequency)
    :raises ValueError: if ``r`` is not positive
    """
    if r <= 0:
        raise ValueError("r must be a positive integer")

    edges = list(range(ini, end + r, r))
    binned = pd.cut(dataframe[column], edges, right=False)

    # value_counts() on the categorical result keeps empty buckets (count 0);
    # sort_index() puts the buckets back in ascending interval order.
    counts = binned.value_counts().sort_index()
    tot_ac = counts.cumsum()

    total = counts.sum()
    if total:
        freq = np.array(counts) / total
    else:
        # No value landed in any bucket: report zero frequencies instead of
        # dividing by zero (which would produce NaN and a RuntimeWarning).
        freq = np.zeros(len(counts))
    freq_ac = freq.cumsum()

    return pd.DataFrame({
        column: counts,
        'freq': freq,
        'tot_ac': tot_ac,
        'freq_ac': freq_ac,
    })
	import numpy as np
	import pandas as pd

	def frequency_by_buckets(dataframe, column, r, ini, end):
	    """
	    Creates a dataframe with the absolute, relative and cumulative
	    frequencies of a column grouped into fixed-width ranges, which are
	    called buckets.

	    Buckets are left-closed and right-open (``[a, b)``) and their edges
	    are generated with ``range(ini, end + r, r)``, so ``r``, ``ini`` and
	    ``end`` must be integers. Values that fall outside every bucket
	    (including a value equal to the last edge, and NaN) are not counted.

	    Dataframe output example (shown for ``column='score'``):

	               score  freq  tot_ac  freq_ac
	    [40, 50)       8  0.16       8     0.16
	    [50, 60)      22  0.44      30     0.60
	    [60, 70)       8  0.16      38     0.76
	    [70, 80)       6  0.12      44     0.88
	    [80, 90)       5  0.10      49     0.98
	    [90, 100)      1  0.02      50     1.00

	    :param dataframe: the dataframe holding the data
	    :param column: the column in which the data will be grouped; it is
	        also used as the name of the count column in the result
	    :param r: the range between values (bucket width), a positive integer
	    :param ini: where the interval begin
	    :param end: where the interval end
	    :return: dataframe indexed by bucket with the columns ``column``
	        (count per bucket), ``freq`` (relative frequency), ``tot_ac``
	        (cumulative count) and ``freq_ac`` (cumulative relative frequency)
	    :raises ValueError: if ``r`` is not positive
	    """
	    if r <= 0:
	        raise ValueError("r must be a positive integer")

	    s_ini = dataframe[column]

	    buckets = list(range(ini, end + r, r))
	    s_buckets = pd.cut(s_ini, buckets, right=False)

	    # value_counts() on the categorical result keeps empty buckets
	    # (count 0); sort_index() restores ascending interval order.
	    s_buckets = s_buckets.value_counts()
	    s_buckets = s_buckets.sort_index()
	    tot_ac = s_buckets.cumsum()

	    s_tot = s_buckets.sum()
	    if s_tot:
	        freq = np.array(s_buckets) / s_tot
	    else:
	        # No value landed in any bucket: report zero frequencies instead
	        # of dividing by zero (NaN plus a RuntimeWarning).
	        freq = np.zeros(len(s_buckets))
	    freq_ac = freq.cumsum()

	    df = pd.DataFrame({
	        column: s_buckets,
	        'freq': freq,
	        'tot_ac': tot_ac,
	        'freq_ac': freq_ac,
	    })

	    return df