Last active
October 13, 2023 18:10
-
-
Save ianmcook/0290092a4a9c6a34bc16f73be7996a6c to your computer and use it in GitHub Desktop.
Write Parquet file with float32 column
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <random> | |
#include <arrow/api.h> | |
#include <arrow/io/api.h> | |
#include <parquet/arrow/writer.h> | |
// Returns a uniformly distributed random float in [0, 1).
// The engine and distribution are function-local statics, so successive calls
// advance one shared sequence; the default-seeded engine makes runs repeatable.
float GetRandomFloat()
{
  static std::default_random_engine e;
  // Use a float-typed distribution: the original used the default
  // uniform_real_distribution<> (double), which silently narrowed on return.
  static std::uniform_real_distribution<float> dis(0.0f, 1.0f);
  return dis(e);
}
// Builds a one-column table ("f": float32) of 10 random values and writes it
// to "test.parquet" with Snappy compression and BYTE_STREAM_SPLIT encoding.
// Dictionary encoding is disabled so the requested encoding actually applies
// to the column instead of the default dictionary encoder.
// Returns arrow::Status::OK() on success, or the first Arrow/Parquet error.
arrow::Status WriteTableToParquetFile() {
  // Build the float32 column from 10 random values.
  arrow::FloatBuilder builder;
  for (int i = 0; i < 10; i++) {
    ARROW_RETURN_NOT_OK(builder.Append(GetRandomFloat()));
  }
  std::shared_ptr<arrow::Array> array;
  ARROW_RETURN_NOT_OK(builder.Finish(&array));

  // One-field schema: "f" -> float32. arrow::schema(...) replaces the manual
  // std::vector<Field> + make_shared<Schema> boilerplate.
  auto schema = arrow::schema({arrow::field("f", arrow::float32())});
  std::shared_ptr<arrow::Table> table = arrow::Table::Make(schema, {array});

  // The first call is on the Builder value (returns Builder*), so the
  // remaining calls chain with '->'.
  std::shared_ptr<parquet::WriterProperties> props =
      parquet::WriterProperties::Builder()
          .compression(arrow::Compression::SNAPPY)
          ->disable_dictionary()
          ->encoding(parquet::Encoding::BYTE_STREAM_SPLIT)
          ->build();

  std::shared_ptr<arrow::io::FileOutputStream> outfile;
  ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open("test.parquet"));
  // Dereference the shared_ptr directly (*table) rather than *table.get().
  ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table,
      arrow::default_memory_pool(), outfile, /*chunk_size=*/10, props));
  return arrow::Status::OK();
}
int main(int, char**) { | |
auto status = WriteTableToParquetFile(); | |
if (!status.ok()) { | |
std::cerr << "Error occurred : " << status.message() << std::endl; | |
return EXIT_FAILURE; | |
} | |
return EXIT_SUCCESS; | |
} |
Examine the column encodings, compression, etc. of the resulting Parquet file using pqrs:
pqrs schema test.parquet --detailed
To try this with float16 columns:
- Add this include:
#include <arrow/util/float16.h>
- Change
FloatBuilder
to HalfFloatBuilder
- Change
builder.Append(GetRandomFloat())
to builder.Append(arrow::util::Float16(GetRandomFloat()).bits())
- Change
float32()
to float16()
- Comment out these two lines:
// ->disable_dictionary()
// ->encoding(parquet::Encoding::BYTE_STREAM_SPLIT)
- Build Arrow from the float16 PR branch at apache/arrow#36073
When I use the above steps to write a Parquet file with a float16 column, then try to read it with an older version of PyArrow, I get the error
Metadata contains Thrift LogicalType that is not recognized
When I try to read the Parquet file using Spark 3.4.1, I see a big Java stack trace, with the notable parts being:
org.apache.spark.SparkException: Exception thrown in awaitResult
Caused by: org.apache.spark.SparkException: [CANNOT_READ_FILE_FOOTER] Could not read footer for file...
Caused by: java.lang.NullPointerException: Cannot invoke "org.apache.parquet.format.LogicalType$_Fields.ordinal()" because the return value of "org.apache.parquet.format.LogicalType.getSetField()" is null
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Compile and run with
clang++ write_parquet_float.cpp -std=c++17 -I/usr/local/include -L/usr/local/lib -larrow -lparquet -o write_parquet_float && ./write_parquet_float