Skip to content

Instantly share code, notes, and snippets.

@mesejo
Last active November 17, 2020 14:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mesejo/15dbc9132ca0f3d15f35f0f7ad2ff190 to your computer and use it in GitHub Desktop.
Save mesejo/15dbc9132ca0f3d15f35f0f7ad2ff190 to your computer and use it in GitHub Desktop.
import pandas as pd
import numpy as np
arr = pd.read_csv("test.txt", delim_whitespace=True).values
print(arr)
class Cube:
def __init__(self, row_index, col_index, data):
self.row_index = {r: i for i, r in enumerate(row_index)}
self.col_index = {c: i for i, c in enumerate(col_index)}
self.data = data
def __getitem__(self, item):
row, col = item
return self.data[self.row_index[row], self.col_index[col]]
def __repr__(self):
return repr(self.data)
def _x_tab(rows, columns, values):
"""Function for computing the cross-tab of simple arrays"""
unique_values_all_cols, idx = zip(*(np.unique(col, return_inverse=True) for col in [rows, columns]))
shape_xt = [uniq_vals_col.size for uniq_vals_col in unique_values_all_cols]
xt = np.zeros(shape_xt, dtype=np.float)
np.add.at(xt, idx, values)
return unique_values_all_cols, xt
def make_index(a, r):
"""Make array of tuples"""
l = [tuple(row) for row in a[:, r]]
return make_object_array(l)
def make_object_array(l):
a = np.empty(len(l), dtype=object)
a[:] = l
return a
def fill_label(ar, le):
"""Fill missing parts with ALL label"""
missing = tuple(["ALL"] * le)
return [(e + missing)[:le] for e in ar]
def x_tab(rows, cols, values):
"""Main function for cross tabulation"""
_, l_cols = rows.shape
total_agg = []
total_idx = []
for i in range(l_cols + 1):
(idx, _), agg = _x_tab(make_index(rows, list(range(i))), cols, values)
total_idx.extend(fill_label(idx, l_cols))
total_agg.append(agg)
stacked_agg = np.vstack(total_agg)
stacked_agg_total = stacked_agg.sum(axis=1).reshape(-1, 1)
return Cube(total_idx, list(dict.fromkeys(cols)), np.concatenate((stacked_agg, stacked_agg_total), axis=1))
result = x_tab(arr[:, [0, 1]], arr[:, 2], arr[:, 3])
print(result)
print(result[('Dante', 'ALL'), 2001])
print(result[('Dante', 'Inferno'), 2001])
print(result[('Shakespeare', 'Hamlet'), 2000])
AUTHOR BOOK YEAR SALES
Shakespeare Hamlet 2000 104.2
Shakespeare Hamlet 2001 99.0
Shakespeare Romeo 2000 27.0
Shakespeare Romeo 2001 19.0
Dante Inferno 2000 11.6
Dante Inferno 2001 12.6
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment