Skip to content

Instantly share code, notes, and snippets.

@scravy
Created November 23, 2021 05:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save scravy/54805e8ed9d4a805c0011404170b9ffe to your computer and use it in GitHub Desktop.
import pandas as pd
import pyarrow as pa
def main():
    """Demonstrate how pandas dtypes vs. an explicit pyarrow schema
    determine the parquet schema that ends up on disk.

    Writes two parquet files into the working directory and prints the
    dtypes observed before writing and after reading each file back.
    """
    # "foo" contains a None, so pandas promotes the whole column to
    # float64; "bar" is float64 to begin with.
    frame = pd.DataFrame(
        {
            "foo": [1, 3, None, 7, 9],
            "bar": [1.0, 2.0, 4.0, 8.0, 16.0],
        }
    )
    print(frame["foo"].dtype,  # float64
          frame["bar"].dtype,  # float64
          )

    # No schema given: the file simply mirrors the pandas dtypes.
    # $ parquet-tools schema file1-dtypes.parquet
    # message schema {
    #   optional double foo;
    #   optional double bar;
    # }
    path_dtype = "file1-dtypes.parquet"
    frame.to_parquet(path_dtype)

    # Explicit pyarrow schema: "foo" is stored as int64 even though its
    # pandas dtype is float64 (the None becomes a parquet null).
    # $ parquet-tools schema file1-pyarrow.parquet
    # message schema {
    #   optional int64 foo;
    #   optional double bar;
    # }
    explicit_schema = pa.schema([
        ('foo', pa.int64()),  # a dtype-derived schema would say double here
        ('bar', pa.float64()),
    ])
    path_pyarrow = "file1-pyarrow.parquet"
    frame.to_parquet(path_pyarrow, schema=explicit_schema)

    # Reading back: the dtype-schema file was stored with both columns
    # as `double`, so it comes out float64/float64.
    roundtrip_dtype: pd.DataFrame = pd.read_parquet(path_dtype)
    print(roundtrip_dtype["foo"].dtype,  # float64
          roundtrip_dtype["bar"].dtype,  # float64
          )

    # The pyarrow-schema file was stored as int64/double, yet the null
    # in "foo" forces the column back to float64 on read as well.
    roundtrip_pyarrow: pd.DataFrame = pd.read_parquet(path_pyarrow)
    print(roundtrip_pyarrow["foo"].dtype,  # float64
          roundtrip_pyarrow["bar"].dtype,  # float64
          )

    # Writing either dataframe out again would reproduce the dtype-derived
    # schema, even though the two files on disk differ — so always specify
    # the desired schema explicitly.
# Script entry point: run the demo only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment