Last active
May 27, 2020 13:19
-
-
Save bigtonylewis/eb2913814869416ccbb82944c3662d32 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def test_pystore(n: int, nrows: int = 1000, ncols: int = 5) -> None:
    """Round-trip ``n`` random DataFrames through a pystore collection.

    Builds ``n`` frames of ``nrows`` rows, each with an integer column ``i``
    (values 0-4) followed by ``ncols`` random float columns.  Every frame is
    grouped by ``i`` and each group is appended to the item ``number<i>`` in
    collection ``testcollection`` of store ``teststore`` under
    ``./test_pystore``; if the append raises ``ValueError`` the group is
    written instead.  All items are then read back, concatenated, and
    compared with the concatenation of the generated frames.

    The outcome is only printed; the function always returns ``None``.

    Args:
        n: Number of random frames to generate and store.
        nrows: Rows per generated frame.
        ncols: Number of random float columns per frame.
    """
    pystore.set_path('./test_pystore')
    store = pystore.store('teststore')
    c = store.collection('testcollection')

    # Clean out any past test data.  The item names are snapshotted first
    # because items are deleted while we iterate.
    for item in list(c.list_items()):
        c.delete_item(item)

    # Names of the random float columns; reused below as the sort key.
    cols = ['col{}'.format(k) for k in range(ncols)]

    df_arr = []
    for _ in range(n):
        # Random floats, one column per entry in ``cols``.
        rands = pd.DataFrame(np.random.rand(nrows, ncols), columns=cols)
        # Random ints in [0, 5) so there is something to group by later.
        ints = pd.DataFrame(np.random.randint(0, 5, (nrows, 1)), columns=['i'])
        df_arr.append(pd.concat([ints, rands], axis=1))
    print(df_arr[-1].shape)

    for df in df_arr:
        print('next df from array')
        # Store each group under an item named after its ``i`` value.
        for key, group in df.groupby('i'):
            print(key, group.shape)
            item_name = 'number{}'.format(key)
            try:
                c.append(item_name, group, npartitions=1)
            except ValueError:
                # NOTE(review): presumably raised when the item does not
                # exist yet -- confirm against the pystore documentation.
                c.write(item_name, group)

    # Read every item back and sort both sides by the float columns so the
    # comparison does not depend on storage or grouping order.
    from_pystore = pd.concat(
        [c.item(item).to_pandas() for item in c.list_items()]
    ).sort_values(cols)
    orig = pd.concat(df_arr).sort_values(cols)

    if from_pystore.shape != orig.shape:
        print('Sizes do not match: {} != {} when n={}'.format(from_pystore.shape, orig.shape, n))
        return
    if not from_pystore.equals(orig):
        print('Sorted dataframes are not equal when n={}'.format(n))
        return
    print('The dataframes are the same when n={}'.format(n))
# Run the reproduction for n=1 and n=2.  Per the banners, the first run is
# expected to pass and the second to report a mismatch.
for runs, expectation in ((1, 'work'), (2, 'fail')):
    print('\n---------\nThis will {}, n={}'.format(expectation, runs))
    test_pystore(runs)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment