Skip to content

Instantly share code, notes, and snippets.

@scravy
Created November 23, 2021 05:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save scravy/54805e8ed9d4a805c0011404170b9ffe to your computer and use it in GitHub Desktop.
import pandas as pd
import pyarrow as pa
def main():
    """Demonstrate how pandas dtypes vs. an explicit pyarrow schema
    determine the parquet schema that ends up on disk.

    Writes two parquet files into the working directory and prints the
    dtypes observed before writing and after reading each file back.
    """
    # "foo" contains a None, so pandas promotes the whole column to
    # float64; "bar" is float64 to begin with.
    frame = pd.DataFrame(
        {
            "foo": [1, 3, None, 7, 9],
            "bar": [1.0, 2.0, 4.0, 8.0, 16.0],
        }
    )
    print(frame["foo"].dtype,  # float64
          frame["bar"].dtype,  # float64
          )

    # No schema given: the file simply mirrors the pandas dtypes.
    # $ parquet-tools schema file1-dtypes.parquet
    # message schema {
    #   optional double foo;
    #   optional double bar;
    # }
    path_dtype = "file1-dtypes.parquet"
    frame.to_parquet(path_dtype)

    # Explicit pyarrow schema: "foo" is stored as int64 even though its
    # pandas dtype is float64 (the None becomes a parquet null).
    # $ parquet-tools schema file1-pyarrow.parquet
    # message schema {
    #   optional int64 foo;
    #   optional double bar;
    # }
    explicit_schema = pa.schema([
        ('foo', pa.int64()),  # a dtype-derived schema would say double here
        ('bar', pa.float64()),
    ])
    path_pyarrow = "file1-pyarrow.parquet"
    frame.to_parquet(path_pyarrow, schema=explicit_schema)

    # Reading back: the dtype-schema file was stored with both columns
    # as `double`, so it comes out float64/float64.
    roundtrip_dtype: pd.DataFrame = pd.read_parquet(path_dtype)
    print(roundtrip_dtype["foo"].dtype,  # float64
          roundtrip_dtype["bar"].dtype,  # float64
          )

    # The pyarrow-schema file was stored as int64/double, yet the null
    # in "foo" forces the column back to float64 on read as well.
    roundtrip_pyarrow: pd.DataFrame = pd.read_parquet(path_pyarrow)
    print(roundtrip_pyarrow["foo"].dtype,  # float64
          roundtrip_pyarrow["bar"].dtype,  # float64
          )

    # Writing either dataframe out again would reproduce the dtype-derived
    # schema, even though the two files on disk differ — so always specify
    # the desired schema explicitly.
# Script entry point: run the demo only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment