Skip to content

Instantly share code, notes, and snippets.

@songqiang
Last active June 9, 2017 07:46
Show Gist options
  • Save songqiang/409ea6ed9807fef251fd6408901f854c to your computer and use it in GitHub Desktop.
Save songqiang/409ea6ed9807fef251fd6408901f854c to your computer and use it in GitHub Desktop.
python imlementation of SAS proc freq 1D & 2D tables
from pandas import crosstab
def pyfreq(v1, v2 = None, weight = None, order = ''):
if v2 is not None:
if weight is not None:
cnt = crosstab(v1, v2, weight, aggfunc = sum, margins = True, dropna = False)
else:
cnt = crosstab(v1, v2, margins = True, dropna = False)
if order == 'freq':
cnt.iloc[:-1, :] = cnt.iloc[:-1, :].sort_values('All', ascending = False).values
cnt = cnt.transpose()
cnt.iloc[:-1, :] = cnt.iloc[:-1, :].sort_values('All', ascending = False).values
cnt = cnt.transpose()
pct = cnt / cnt.iat[-1, -1] * 100
col_pct = cnt.divide(cnt.iloc[-1, :]) * 100
row_pct =cnt.divide(cnt.iloc[:, -1], axis = 0) * 100
nrow, ncol = cnt.shape
# the following displace should also be done with multiindex
col_width = max(8, *[len(str(x)) for x in cnt.columns])
col1_width = max(8, *[len(str(x)) for x in cnt.index])
header_tmplt = '{:>{w1}} |' + '{:>{w}} |' * ncol
cnt_tmplt = '{:>{w1}} |' + '{:{w}.0f} |' * ncol
pct_tmplt = ('{:>{w1}} |' + '{:{w}.2f} |' * ncol)
rowcol_pct_tmplt = ('{:>{w1}} |' + '{:{w}.2f} |' * (ncol - 1) + '{:{w}} |')
print('-' * ((col_width + 2) * ncol + col1_width + 2))
print(header_tmplt.format(cnt.columns.name, *list(cnt.columns ), w = col_width, w1 = col1_width))
print(header_tmplt.format(cnt.index.name, *([' '] * ncol), w = col_width, w1 = col1_width))
print('-' * ((col_width + 2) * ncol + col1_width + 2))
for i in range(cnt.shape[0]):
print(cnt_tmplt.format(cnt.index[i], *list(cnt.iloc[i, :]), w = col_width, w1 = col1_width))
print(pct_tmplt.format(' ', *list(pct.iloc[i, :]), w = col_width, w1 = col1_width))
if i == 0:
print(rowcol_pct_tmplt.format('Row Pct', *list(row_pct.iloc[i, :-1]), ' ', w = col_width, w1 = col1_width))
print(rowcol_pct_tmplt.format('Col Pct', *list(col_pct.iloc[i, :-1]), ' ', w = col_width, w1 = col1_width))
elif i < nrow - 1:
print(rowcol_pct_tmplt.format(' ', *list(row_pct.iloc[i, :-1]), ' ', w = col_width, w1 = col1_width))
print(rowcol_pct_tmplt.format(' ', *list(col_pct.iloc[i, :-1]), ' ', w=col_width, w1 = col1_width))
print('-' * ((col_width + 2) * ncol + col1_width + 2))
# 1d frequency table
else:
if weight is not None:
cnt = crosstab(v1, 'cnt', weight, aggfunc = sum, margins = True, dropna = False)
else:
cnt = crosstab(v1, 'cnt', margins = True, dropna = False)
del cnt['All']
cnt.columns.name = ''
cnt['pct'] = cnt.cnt / cnt.iat[-1, 0] * 100
print(cnt)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment