Skip to content

Instantly share code, notes, and snippets.

@drin
Last active March 11, 2023 02:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save drin/b8fe9c5bb239b27bece81d817c83f0f4 to your computer and use it in GitHub Desktop.
Save drin/b8fe9c5bb239b27bece81d817c83f0f4 to your computer and use it in GitHub Desktop.
Example Arrow code
import itertools
import pyarrow
# ------------------------------
# Arrow types for the example table. Each nested level gets its own name so
# the final schema reads top-down.

# group_id column -> double
groupid_type = pyarrow.float64()

# id1 -> list<int64>
id1_type = pyarrow.list_(pyarrow.int64())

# id2 -> struct<type: string, value: string>
field_id2type = pyarrow.field('type', pyarrow.string())
field_id2val = pyarrow.field('value', pyarrow.string())
id2_type = pyarrow.struct([field_id2type, field_id2val])

# item -> struct<id1: list<int64>, id2: struct<type, value>>
field_item_id1 = pyarrow.field('id1', id1_type)
field_item_id2 = pyarrow.field('id2', id2_type)
item_type = pyarrow.struct([field_item_id1, field_item_id2])

# groups column -> list<item>
groups_type = pyarrow.list_(item_type)

# Table schema: a double `group_id` column followed by the nested `groups`
# column.
groups_schema = pyarrow.schema([
    pyarrow.field('group_id', groupid_type),
    pyarrow.field('groups', groups_type),
])
# ------------------------------
# Convenience functions
def CreateItem(gid_list, val_type, val):
    """Bundle the pieces of one item into a nested tuple.

    The tuple is laid out in the same order as the fields of ``item_type``
    (``id1`` then ``id2``), and the inner tuple in the same order as the
    fields of ``id2_type`` (``type`` then ``value``).

    Args:
        gid_list: Values for the item's ``id1`` field.
        val_type: Value for the ``type`` field of ``id2``.
        val: Value for the ``value`` field of ``id2``.

    Returns:
        The tuple ``(gid_list, (val_type, val))``.
    """
    return (gid_list, (val_type, val))
def CreateGroups(groups_list):
    """Build the Arrow array for the ``groups`` column.

    Args:
        groups_list: Python sequence with one entry per table row; each entry
            is a list of items (for example, as produced by ``CreateItem``).

    Returns:
        The result of ``pyarrow.array`` for ``groups_list`` using the
        module-level ``groups_type``.
    """
    return pyarrow.array(groups_list, groups_type)
def CreateGroupIDs(group_ids):
    """Build the Arrow array for the ``group_id`` column.

    Args:
        group_ids: Python sequence of numeric group identifiers, one per
            table row.

    Returns:
        The result of ``pyarrow.array`` for ``group_ids`` using the
        module-level ``groupid_type`` (float64).
    """
    return pyarrow.array(group_ids, groupid_type)
def CreateTable(group_ids, groups):
    """Assemble the two column arrays into a table using ``groups_schema``.

    Args:
        group_ids: Array for the ``group_id`` column (see ``CreateGroupIDs``).
        groups: Array for the ``groups`` column (see ``CreateGroups``).

    Returns:
        The result of ``pyarrow.table`` for the two columns, passed in schema
        order (``group_id`` first, then ``groups``).
    """
    return pyarrow.table([group_ids, groups], schema=groups_schema)
# ------------------------------
# Create example table
if __name__ == '__main__':
    # Shared counter supplying the id2 'value' of every item (1, 2, 3, ...).
    # It is not reset between groups, so values keep increasing across them.
    id2_val = itertools.count(start=1)

    # Python-side data for the `groups` column: one inner list per table row.
    groups_py = []

    # First group: three items, each with a single 'id1' element (10, 11, 12).
    groups_py.append([
        CreateItem([tmp_id], 'test-single', str(next(id2_val)))
        for tmp_id in range(10, 13)
    ])

    # Second group: two items with three consecutive 'id1' elements each;
    # the step of 3 yields [13, 14, 15] and [16, 17, 18].
    groups_py.append([
        CreateItem([tmp_id, tmp_id + 1, tmp_id + 2], 'test-triple', str(next(id2_val)))
        for tmp_id in range(13, 18, 3)
    ])

    # Third group: a single item with two 'id1' elements, [19, 20].
    groups_py.append([
        CreateItem([tmp_id, tmp_id + 1], 'test-double', str(next(id2_val)))
        for tmp_id in range(19, 20, 2)
    ])

    # One group id per group, numbered from 0; stored as doubles per
    # groupid_type.
    gids = list(range(len(groups_py)))

    group_table = CreateTable(
        CreateGroupIDs(gids),
        CreateGroups(groups_py),
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment