Created
November 23, 2021 05:31
-
-
Save scravy/54805e8ed9d4a805c0011404170b9ffe to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import pyarrow as pa | |
def main():
    """Show how a parquet file's schema interacts with pandas dtypes.

    Writes the same DataFrame twice — once letting pandas pick the schema
    from its dtypes, once forcing an explicit pyarrow schema — then reads
    both files back and prints the resulting dtypes. The takeaway: a null
    in an integer column makes pandas hold it as float64, and only an
    explicit schema preserves int64 on disk.
    """
    rows = [
        [1, 1.0],
        [3, 2.0],
        [None, 4.0],  # the null here forces pandas to store "foo" as float64
        [7, 8.0],
        [9, 16.0],
    ]
    df1: pd.DataFrame = pd.DataFrame(rows, columns=["foo", "bar"])
    print(df1['foo'].dtype,  # float64
          df1['bar'].dtype,  # float64
          )

    # Write #1: schema inferred from the in-memory dtypes.
    dtype_path = "file1-dtypes.parquet"
    df1.to_parquet(dtype_path)
    # $ parquet-tools schema file1-dtypes.parquet
    # message schema {
    #   optional double foo;
    #   optional double bar;
    # }

    # Write #2: schema pinned explicitly via pyarrow.
    pyarrow_path = "file1-pyarrow.parquet"
    df1.to_parquet(pyarrow_path, schema=pa.schema([
        ('foo', pa.int64()),  # if you select these based on the dtypes you will write float64 as above
        ('bar', pa.float64()),
    ]))
    # $ parquet-tools schema file1-pyarrow.parquet
    # message schema {
    #   optional int64 foo;
    #   optional double bar;
    # }

    df1_dtype: pd.DataFrame = pd.read_parquet(dtype_path)
    print(df1_dtype['foo'].dtype,  # float64
          df1_dtype['bar'].dtype,  # float64
          )
    # This file had been written with both columns as `double`.

    df1_pyarrow: pd.DataFrame = pd.read_parquet(pyarrow_path)
    print(df1_pyarrow['foo'].dtype,  # float64
          df1_pyarrow['bar'].dtype,  # float64
          )
    # This file had been written as `int64` and `double` — due to the NaN/null
    # both come back as float64 in pandas.
    # If you write out these dataframes again, although they had different
    # schemas on disk, you will end up with dtype-inferred (all-double)
    # schema. You should always specify the desired schema explicitly.
# Run the demo only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment