Last active
September 4, 2018 17:00
-
-
Save fcunhaneto/faf636c179a8e78a9da391275db160e3 to your computer and use it in GitHub Desktop.
Cria um dataframe com valores cumulativos dentro de um intervalo, chamados de buckets.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
def frequency_by_buckets(dataframe, column, r, ini, end):
    """
    Build a frequency table of ``dataframe[column]`` grouped into buckets.

    The values are binned into left-closed, right-open intervals of width
    ``r`` starting at ``ini``: ``[ini, ini + r), [ini + r, ini + 2r), ...``.
    Edges are generated with ``range(ini, end + r, r)``, so ``r``, ``ini``
    and ``end`` must be integers, and the last bucket may extend past
    ``end`` when ``end - ini`` is not a multiple of ``r``. Values that fall
    outside every bucket are not counted.

    Dataframe output example (``column='score'``, ``r=10``, ``ini=40``,
    ``end=100``):

                   score  freq  tot_ac  freq_ac
        [40, 50)       8  0.16       8     0.16
        [50, 60)      22  0.44      30     0.60
        [60, 70)       8  0.16      38     0.76
        [70, 80)       6  0.12      44     0.88
        [80, 90)       5  0.10      49     0.98
        [90, 100)      1  0.02      50     1.00

    :param dataframe: the dataframe holding the data
    :param column: the column in which the data will be grouped; also used
        as the name of the absolute-count column in the result
    :param r: the width of each bucket (positive integer)
    :param ini: where the first interval begins
    :param end: where the intervals end
    :return: dataframe indexed by bucket with the columns ``column``
        (count), ``freq`` (relative frequency), ``tot_ac`` (cumulative
        count) and ``freq_ac`` (cumulative relative frequency)
    :raises ValueError: if ``r`` is not positive
    """
    if r <= 0:
        # A zero or negative step cannot produce increasing bucket edges.
        raise ValueError("r must be a positive integer, got {!r}".format(r))

    edges = list(range(ini, end + r, r))
    # right=False makes every bucket closed on the left: [a, b).
    binned = pd.cut(dataframe[column], edges, right=False)
    # value_counts() orders by count; sort_index() restores bucket order.
    counts = binned.value_counts().sort_index()

    total = counts.sum()
    if total:
        freq = np.asarray(counts) / total
    else:
        # No value landed in any bucket: report 0.0 frequencies rather than
        # dividing by zero (which would give NaN and a RuntimeWarning).
        freq = np.zeros(len(counts))

    return pd.DataFrame({
        column: counts,
        'freq': freq,
        'tot_ac': counts.cumsum(),
        'freq_ac': freq.cumsum(),
    })
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment