Skip to content

Instantly share code, notes, and snippets.

@andy-thomason
Created November 28, 2019 15:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save andy-thomason/ba9436823b4cead9ba70c32fc47f271d to your computer and use it in GitHub Desktop.
Save andy-thomason/ba9436823b4cead9ba70c32fc47f271d to your computer and use it in GitHub Desktop.
import numpy as np
import pyarrow as pa
import pyarrow.feather as pf
import pandas as pd
# t1: one all-ones column of 20 rows per NumPy numeric dtype code, written
# out as Feather, CSV, TSV and an Arrow IPC file.
dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8', 'f4', 'f8']
# NOTE(review): `strings` is not referenced anywhere in the visible script.
strings = ['a', 'bc', 'de', 'efgh', 'five']
cols = {d: np.ones(dtype=d, shape=(20,)) for d in dtypes}
df = pd.DataFrame(data=cols)
pf.write_feather(df, "tests/t1.feather")
df.to_csv("tests/t1.csv", index=False)
df.to_csv("tests/t1.tsv", sep="\t", index=False)

# Same columns as Arrow arrays; the dtype codes double as the column names.
data = [pa.array(cols[d]) for d in dtypes]
batch = pa.RecordBatch.from_arrays(data, dtypes)
# pa.ipc is the documented home of RecordBatchFileWriter.
writer = pa.ipc.RecordBatchFileWriter("tests/t1.arrow", batch.schema)
try:
    writer.write_batch(batch)
finally:
    # Close even when the write fails so the file handle is not leaked.
    writer.close()
# test nulls
# t2: every column mixes real values with missing ones (np.nan / None).
# Each 3-row pattern is repeated 5 times, giving 15 rows. Column 'two' also
# carries commas and a double quote, which exercises CSV quoting.
df = pd.DataFrame(
    {
        'one': [-1, np.nan, 2.5] * 5,
        'two': [',', None, ',",'] * 5,
        'three': [True, False, None] * 5,
        'four': [None, 2, 3] * 5,
        'five': ['a', None, 'b'] * 5,
    }
)
pf.write_feather(df, "tests/t2.feather")
df.to_csv("tests/t2.csv", index=False)
df.to_csv("tests/t2.tsv", sep="\t", index=False)

batch = pa.RecordBatch.from_pandas(df)
# pa.ipc is the documented home of RecordBatchFileWriter.
writer = pa.ipc.RecordBatchFileWriter("tests/t2.arrow", batch.schema)
try:
    writer.write_batch(batch)
finally:
    # Close even when the write fails so the file handle is not leaked.
    writer.close()
# test categories
# t3: two categorical string columns (one containing a missing value) and a
# categorical integer column, followed by a plain unsigned 64-bit column.
df = pd.DataFrame(
    {
        'pad1': pd.Series(np.array([1, 1, 1, 2], dtype='u8'), dtype="category"),
        'cat1': pd.Series(["a", "b", "a", "a"], dtype="category"),
        'cat2': pd.Series(["aa", None, "cc", "cc"], dtype="category"),
        # -1 is not representable as an unsigned 64-bit value; spell out the
        # all-ones bit pattern (what -1 would wrap to) instead of relying on
        # implicit overflow, which newer NumPy/pandas reject with OverflowError.
        'pad2': pd.Series(
            [0x123456789abcdef, 0xffffffffffffffff, 0, 4], dtype='u8'
        ),
    },
)
pf.write_feather(df, "tests/t3.feather")
df.to_csv("tests/t3.csv", index=False)
df.to_csv("tests/t3.tsv", sep="\t", index=False)

batch = pa.RecordBatch.from_pandas(df)
# pa.ipc is the documented home of RecordBatchFileWriter.
writer = pa.ipc.RecordBatchFileWriter("tests/t3.arrow", batch.schema)
try:
    writer.write_batch(batch)
finally:
    # Close even when the write fails so the file handle is not leaked.
    writer.close()
# arrow only: multiple record batches.
# t4: the most recently built frame (`df`) is converted once and the same
# record batch is written four times into a single Arrow IPC file.
batch = pa.RecordBatch.from_pandas(df)
# pa.ipc is the documented home of RecordBatchFileWriter.
writer = pa.ipc.RecordBatchFileWriter("tests/t4.arrow", batch.schema)
try:
    for _ in range(4):
        writer.write_batch(batch)
finally:
    # Close even when a write fails so the file handle is not leaked.
    writer.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment