Skip to content

Instantly share code, notes, and snippets.

@chilang
Created May 25, 2020 17:05
Show Gist options
  • Save chilang/3bc95e63578cf6fbb033644ea76b0246 to your computer and use it in GitHub Desktop.
Save chilang/3bc95e63578cf6fbb033644ea76b0246 to your computer and use it in GitHub Desktop.
def label_distribution_skew(x, y, partitions, skew=1):
    """Partition a dataset so that each runner only sees a subset of labels.

    The sorted unique values of ``y`` are cut into consecutive groups of
    labels.  For every group, a random fraction ``skew`` of the samples
    carrying those labels is drawn (without replacement), split as evenly
    as ``np.array_split`` allows between the runners assigned to that
    group, and removed from the remaining pool.

    Args:
        x: Sample array, indexed along axis 0 in step with ``y``.
        y: Label array (one label per sample).  Labels may be any values
            ``np.unique`` can sort; they do not have to be ``0..N-1``.
        partitions: Requested number of runners.  Because the labels-per-
            group and runners-per-group counts are rounded, the number of
            partitions actually returned can differ from this value.
        skew: Fraction (0..1) of each label group's samples handed to the
            runners; the rest stays in the returned leftover arrays.

    Returns:
        A tuple ``(runner_data, x_rest, y_rest)`` where ``runner_data`` is a
        list of ``(x_part, y_part)`` tuples, one per runner, and
        ``x_rest`` / ``y_rest`` hold the samples that were not assigned.

    Note:
        Sampling uses the global ``np.random`` state; seed it for
        reproducible partitions.  The caller's ``x`` and ``y`` are not
        modified in place (``np.delete`` returns new arrays).
    """

    def runner_split(N_labels, N_runners):
        """Return (labels per group, runners per group), each at least 1."""
        labels_per_group = round(max(1, N_labels / N_runners))
        runners_per_group = round(max(1, N_runners / N_labels))
        return labels_per_group, runners_per_group

    # Work with the actual label values rather than assuming 0..N-1.
    labels = np.unique(y)
    N_labels = labels.shape[0]
    if N_labels == 0:
        # Nothing to distribute; also avoids dividing by zero labels.
        return [], x, y

    n_labels, n_runners = runner_split(N_labels, partitions)

    runner_data = []
    for label_idx in range(0, N_labels, n_labels):
        group_labels = labels[label_idx:label_idx + n_labels]
        # Indices are relative to the *current* (already shrunk) x and y.
        mask = np.isin(y, group_labels)
        subset_idx = np.argwhere(mask)[:, 0]
        sample_size = math.floor(skew * subset_idx.shape[0])
        subset_idx = np.random.choice(subset_idx, sample_size, replace=False)

        x_subset = x[subset_idx]
        y_subset = y[subset_idx]
        runner_data.extend(
            zip(
                np.array_split(x_subset, n_runners),
                np.array_split(y_subset, n_runners),
            )
        )

        # Drop the assigned samples so they cannot be handed out twice.
        x = np.delete(x, subset_idx, axis=0)
        y = np.delete(y, subset_idx)

    return runner_data, x, y
@amitport
Copy link

what is runner_idx doing there?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment