Skip to content

Instantly share code, notes, and snippets.

@sbinet
Created April 23, 2019 14:40
Show Gist options
  • Save sbinet/8ec09eb95bc89c6d71cc192dfcce677b to your computer and use it in GitHub Desktop.
Save sbinet/8ec09eb95bc89c6d71cc192dfcce677b to your computer and use it in GitHub Desktop.
pyarrow test scripts
import numpy as np
import pyarrow as pa
# (column name, pyarrow type alias) pairs, listed in column order: one boolean
# column followed by every signed/unsigned integer width and both float widths.
_PRIMITIVE_COLUMNS = [
    ("bools", "bool"),
    ("int8s", "i1"),
    ("int16s", "i2"),
    ("int32s", "i4"),
    ("int64s", "i8"),
    ("uint8s", "u1"),
    ("uint16s", "u2"),
    ("uint32s", "u4"),
    ("uint64s", "u8"),
    ("float32s", "f4"),
    ("float64s", "f8"),
]

# Every column is declared nullable.
schema = pa.schema(
    [pa.field(name, alias, nullable=True) for name, alias in _PRIMITIVE_COLUMNS]
)
# Mask shared by every column. Per the pyarrow.array documentation a True
# entry marks the corresponding slot as null, i.e. slots 1 and 2 here.
mask = np.array([False, True, True, False, False], dtype=bool)

# Type aliases, in schema order, for the columns that hold negative values and
# for the columns that hold positive values.
_NEGATIVE_TYPES = ("i1", "i2", "i4", "i8")
_POSITIVE_TYPES = ("u1", "u2", "u4", "u8", "f4", "f8")


def _make_columns(offset):
    """Return the list of arrays for one record batch, in schema order.

    The boolean column is the same for every batch.  The signed-integer
    columns hold ``-(offset + 1) .. -(offset + 5)`` and the unsigned-integer
    and float columns hold ``offset + 1 .. offset + 5``.  All arrays use the
    module-level ``mask``.
    """
    positives = [offset + step for step in range(1, 6)]
    negatives = [-value for value in positives]

    columns = [pa.array([True, False, True, False, True], type="bool", mask=mask)]
    columns.extend(pa.array(negatives, type=alias, mask=mask) for alias in _NEGATIVE_TYPES)
    columns.extend(pa.array(positives, type=alias, mask=mask) for alias in _POSITIVE_TYPES)
    return columns


# Three batches whose values start at 1, 11 and 21 respectively.
data = [_make_columns(offset) for offset in (0, 10, 20)]
# Serialize every batch with RecordBatchFileWriter (the Arrow IPC *file*
# format) into an in-memory buffer, then dump that buffer to disk.
sink = pa.BufferOutputStream()
writer = pa.RecordBatchFileWriter(sink, schema)
for d in data:
    # Pass the schema by keyword: the second positional parameter of
    # RecordBatch.from_arrays is `names` in current pyarrow releases.
    batch = pa.RecordBatch.from_arrays(d, schema=schema)
    writer.write_batch(batch)
writer.close()
buf = sink.getvalue()
b = buf.to_pybytes()  # the complete file-format payload as Python bytes
# The context manager closes the output file even if the write fails.
with open("primitives.file.data", "wb") as f:
    f.write(b)
# Serialize the same batches with RecordBatchStreamWriter (the Arrow IPC
# *streaming* format) into an in-memory buffer, then dump it to disk.
sink = pa.BufferOutputStream()
writer = pa.RecordBatchStreamWriter(sink, schema)
for d in data:
    # Pass the schema by keyword: the second positional parameter of
    # RecordBatch.from_arrays is `names` in current pyarrow releases.
    batch = pa.RecordBatch.from_arrays(d, schema=schema)
    writer.write_batch(batch)
writer.close()
buf = sink.getvalue()
b = buf.to_pybytes()  # the complete streaming-format payload as Python bytes
# The context manager closes the output file even if the write fails.
with open("primitives.stream.data", "wb") as f:
    f.write(b)
import pyarrow as pa
# A single nullable column, "f1", typed as pa.binary(4).
schema = pa.schema([pa.field("f1", pa.binary(4), nullable=True)])
# Write one batch of pa.binary(4) values with RecordBatchFileWriter (the Arrow
# IPC *file* format) into an in-memory buffer, then dump it to disk.
sink = pa.BufferOutputStream()
writer = pa.RecordBatchFileWriter(sink, schema)
data = [
    # NOTE(review): u"éé" is presumably here because it is two characters but
    # four bytes once UTF-8 encoded, so it still fits the 4-byte width -- confirm.
    [pa.array(["aaaa", "bbbb", "cccc", u"éé"], type=pa.binary(4))],
]
# Only the first entry of `data` is written.
for d in data[:1]:
    # Pass the schema by keyword: the second positional parameter of
    # RecordBatch.from_arrays is `names` in current pyarrow releases.
    batch = pa.RecordBatch.from_arrays(d, schema=schema)
    print("rows: {}".format(batch.num_rows))
    print("cols: {}".format(batch.num_columns))
    print("schema: {}".format(batch.schema))
    print("write...")
    writer.write_batch(batch)
writer.close()
buf = sink.getvalue()
print("buf.size={}".format(buf.size))
b = buf.to_pybytes()  # the complete file-format payload as Python bytes
# The context manager closes the output file even if the write fails.
with open("fixedwidth.dat", "wb") as f:
    f.write(b)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment