Skip to content

Instantly share code, notes, and snippets.

@rcalsaverini
Created October 29, 2018 19:12
Show Gist options
  • Save rcalsaverini/8d793d3059dbc6f44c0635ad3717bac3 to your computer and use it in GitHub Desktop.
Save rcalsaverini/8d793d3059dbc6f44c0635ad3717bac3 to your computer and use it in GitHub Desktop.
import pandas
import numpy as np
import string
# Alphabet for random labels: the 26 uppercase ASCII letters followed by the
# 10 decimal digits, one character per list element.
CHARACTERS = [*string.ascii_uppercase, *string.digits]
def generate_random_label(label_size):
    """Return a random string of ``label_size`` characters.

    Each character is drawn independently (with replacement) from
    ``CHARACTERS`` using NumPy's global random state.
    """
    drawn = np.random.choice(CHARACTERS, size=label_size)
    return ''.join(drawn)
def generate_random_labels(num_labels, label_size=5):
    """Return ``num_labels`` distinct random labels as a NumPy array.

    Labels are produced by ``generate_random_label`` and collected in a set
    until enough unique ones exist, so the order of the returned array is
    the set's iteration order (arbitrary, not sorted).

    Args:
        num_labels: Number of distinct labels to generate.
        label_size: Length of each label in characters.

    Returns:
        A NumPy array holding ``num_labels`` unique label strings.

    Raises:
        ValueError: If more distinct labels are requested than can exist
            for the given ``label_size``.
    """
    # Only len(CHARACTERS) ** label_size different labels exist; without this
    # guard the loop below would spin forever waiting for an impossible one.
    max_unique = len(CHARACTERS) ** label_size
    if num_labels > max_unique:
        raise ValueError(
            'cannot generate {} unique labels of size {}: only {} exist'.format(
                num_labels, label_size, max_unique
            )
        )

    labels = set()
    while len(labels) < num_labels:
        labels.add(generate_random_label(label_size))
    return np.array(list(labels))
def generate_random_categorical_values(size, num_categories):
    """Draw ``size`` random category indices in ``range(num_categories)``.

    The per-category probabilities are themselves random: they are sampled
    from a Dirichlet distribution whose concentration parameters are all
    1.0, then used to sample the indices with replacement.
    """
    category_ids = range(num_categories)
    concentration = [1.0] * num_categories
    probabilities = np.random.dirichlet(concentration)
    return np.random.choice(
        category_ids, size=size, replace=True, p=probabilities
    )
def generate_pair_of_identical_categorical_series(size, num_categories):
    """Build two categorical series that differ only in their label names.

    Two independent label sets are generated, then a single random sequence
    of category indices is mapped through each of them, so both series share
    the same underlying pattern of values.

    Returns:
        A 2-tuple of pandas Series with ``category`` dtype.
    """
    # Generate both label sets before the index sequence so the random
    # draws happen in a fixed order: labels, labels, values.
    label_sets = [generate_random_labels(num_categories) for _ in range(2)]
    indices = generate_random_categorical_values(size, num_categories)
    first, second = (
        pandas.Series(label_set[indices]).astype('category')
        for label_set in label_sets
    )
    return (first, second)
def compare_series(series_1, series_2):
    """Return True if the two series are identical up to a relabeling.

    Two series match when there is a one-to-one renaming of the labels that
    turns one into the other, i.e. equal values in one series sit at exactly
    the positions where equal values sit in the other.

    Comparing ``.cat.codes`` directly does not work: pandas infers the
    categories in sorted label order, so the codes depend on how the label
    names happen to sort rather than on the structure of the data. Instead,
    each series is re-encoded with ``pandas.factorize``, which numbers the
    distinct values in order of first appearance; those encodings are equal
    exactly when the series are identical up to relabeling.

    Args:
        series_1: First pandas Series (categorical or otherwise).
        series_2: Second pandas Series.

    Returns:
        bool: True when both series have the same length and the same
        first-appearance encoding. The comparison is positional; the index
        is ignored. Missing values are encoded as -1 by ``factorize`` and
        therefore only match other missing values.
    """
    if len(series_1) != len(series_2):
        return False
    codes_1, _ = pandas.factorize(series_1)
    codes_2, _ = pandas.factorize(series_2)
    return bool(np.array_equal(codes_1, codes_2))
def main(size, num_categories, num_trials=1000):
    """Check ``compare_series`` against randomly generated series pairs.

    Each trial generates a fresh pair of series that share the same
    structure but use different labels, and requires ``compare_series`` to
    report them as matching.

    Args:
        size: Length of each generated series.
        num_categories: Number of distinct categories per series.
        num_trials: How many random pairs to check (default 1000).

    Raises:
        AssertionError: If ``compare_series`` returns a falsy value for any
            generated pair.
    """
    for trial in range(num_trials):
        series_1, series_2 = generate_pair_of_identical_categorical_series(
            size, num_categories
        )
        # Raise explicitly rather than using ``assert`` so the check still
        # runs when Python is started with -O.
        if not compare_series(series_1, series_2):
            raise AssertionError(
                'compare_series rejected a structurally identical pair '
                'on trial {}'.format(trial)
            )
# Script entry point: run the randomized check on short series (length 10)
# with 3 categories.
if __name__ == '__main__':
    main(size=10, num_categories=3)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment