wesm/parquet-changes.diff

## parquet-changes.diff
diff --git a/cpp/src/parquet/.parquetcppversion b/cpp/src/parquet/.parquetcppversion
index d65937f10..f825f7c7f 100644
--- a/cpp/src/parquet/.parquetcppversion
+++ b/cpp/src/parquet/.parquetcppversion
@@ -1 +1 @@
-1.4.1-SNAPSHOT
+1.5.1-SNAPSHOT
diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
index 5f4e12349..086672711 100644
--- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -1391,6 +1391,16 @@ TEST(TestArrowReadWrite, CoerceTimestampsLosePrecision) {
   ASSERT_RAISES(Invalid, WriteTable(*t4, ::arrow::default_memory_pool(), sink, 10,
                                     default_writer_properties(), coerce_millis));

+  // OK to lose precision if we explicitly allow it
+  auto allow_truncation = (ArrowWriterProperties::Builder()
+                               .coerce_timestamps(TimeUnit::MILLI)
+                               ->allow_truncated_timestamps()
+                               ->build());
+  ASSERT_OK_NO_THROW(WriteTable(*t3, ::arrow::default_memory_pool(), sink, 10,
+                                default_writer_properties(), allow_truncation));
+  ASSERT_OK_NO_THROW(WriteTable(*t4, ::arrow::default_memory_pool(), sink, 10,
+                                default_writer_properties(), allow_truncation));
+
   // OK to write micros to micros
   auto coerce_micros =
       (ArrowWriterProperties::Builder().coerce_timestamps(TimeUnit::MICRO)->build());
@@ -2316,11 +2326,11 @@ TEST(TestArrowReaderAdHoc, Int96BadMemoryAccess) {
   ASSERT_OK_NO_THROW(arrow_reader->ReadTable(&table));
 }

-class TestArrowReaderAdHocSpark
+class TestArrowReaderAdHocSparkAndHvr
     : public ::testing::TestWithParam<
           std::tuple<std::string, std::shared_ptr<::DataType>>> {};

-TEST_P(TestArrowReaderAdHocSpark, ReadDecimals) {
+TEST_P(TestArrowReaderAdHocSparkAndHvr, ReadDecimals) {
   std::string path(test::get_data_dir());

   std::string filename;
@@ -2364,12 +2374,13 @@ TEST_P(TestArrowReaderAdHocSpark, ReadDecimals) {
 }

 INSTANTIATE_TEST_CASE_P(
-    ReadDecimals, TestArrowReaderAdHocSpark,
+    ReadDecimals, TestArrowReaderAdHocSparkAndHvr,
     ::testing::Values(
         std::make_tuple("int32_decimal.parquet", ::arrow::decimal(4, 2)),
         std::make_tuple("int64_decimal.parquet", ::arrow::decimal(10, 2)),
         std::make_tuple("fixed_length_decimal.parquet", ::arrow::decimal(25, 2)),
-        std::make_tuple("fixed_length_decimal_legacy.parquet", ::arrow::decimal(13, 2))));
+        std::make_tuple("fixed_length_decimal_legacy.parquet", ::arrow::decimal(13, 2)),
+        std::make_tuple("byte_array_decimal.parquet", ::arrow::decimal(4, 2))));

 }  // namespace arrow

diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc
index 11fb20cd1..2006025ac 100644
--- a/cpp/src/parquet/arrow/reader.cc
+++ b/cpp/src/parquet/arrow/reader.cc
@@ -18,26 +18,30 @@
 #include "parquet/arrow/reader.h"

 #include <algorithm>
-#include <atomic>
-#include <chrono>
-#include <mutex>
-#include <queue>
+#include <climits>
+#include <cstring>
+#include <future>
+#include <ostream>
 #include <string>
-#include <thread>
 #include <type_traits>
 #include <utility>
 #include <vector>

 #include "arrow/api.h"
 #include "arrow/util/bit-util.h"
-#include "arrow/util/decimal.h"
 #include "arrow/util/logging.h"
 #include "arrow/util/thread-pool.h"

 #include "parquet/arrow/record_reader.h"
 #include "parquet/arrow/schema.h"
 #include "parquet/column_reader.h"
+#include "parquet/exception.h"
+#include "parquet/file_reader.h"
+#include "parquet/metadata.h"
+#include "parquet/properties.h"
 #include "parquet/schema.h"
+#include "parquet/types.h"
+#include "parquet/util/memory.h"
 #include "parquet/util/schema-util.h"

 using arrow::Array;
@@ -1221,6 +1225,64 @@ struct TransferFunctor<::arrow::Decimal128Type, FLBAType> {
   }
 };

+/// \brief Convert BYTE_ARRAY-encoded decimals into an arrow::Decimal128Array
+///
+/// The conversion proceeds in three steps:
+/// 1. Finish the RecordReader's builder into a temporary arrow::BinaryArray
+/// 2. Allocate an output buffer with one fixed-width slot per entry
+/// 3. Zero each slot and, for non-null entries, hand the entry's bytes to
+///    RawBytesToDecimalBytes to fill the slot
+template <>
+struct TransferFunctor<::arrow::Decimal128Type, ByteArrayType> {
+  Status operator()(RecordReader* reader, MemoryPool* pool,
+                    const std::shared_ptr<::arrow::DataType>& type,
+                    std::shared_ptr<Array>* out) {
+    DCHECK_EQ(type->id(), ::arrow::Type::DECIMAL);
+
+    // Materialize the values accumulated by the reader's builder
+    std::shared_ptr<Array> finished;
+    RETURN_NOT_OK(reader->builder()->Finish(&finished));
+    const auto& source = static_cast<const ::arrow::BinaryArray&>(*finished);
+    const int64_t num_values = source.length();
+
+    // Width in bytes of a single output slot
+    const int64_t slot_width =
+        static_cast<const ::arrow::Decimal128Type&>(*type).byte_width();
+
+    std::shared_ptr<Buffer> storage;
+    RETURN_NOT_OK(::arrow::AllocateBuffer(pool, num_values * slot_width, &storage));
+    const int64_t null_count = source.null_count();
+
+    // Walk the output buffer one slot at a time, in step with the input entries
+    uint8_t* slot = storage->mutable_data();
+    for (int64_t i = 0; i < num_values; ++i, slot += slot_width) {
+      int32_t value_len = 0;
+      const uint8_t* value = source.GetValue(i, &value_len);
+
+      // An entry must fit inside one slot
+      const bool length_ok = (value_len >= 0) && (value_len <= slot_width);
+      if (!length_ok) {
+        return Status::Invalid("Invalid BYTE_ARRAY size");
+      }
+
+      // Clear the slot's two 64-bit words before any conversion
+      auto words = reinterpret_cast<uint64_t*>(slot);
+      words[0] = 0;
+      words[1] = 0;
+
+      // Skip conversion only when nulls exist and this entry is one of them
+      const bool is_null_entry = (null_count > 0) && source.IsNull(i);
+      if (!is_null_entry) {
+        RawBytesToDecimalBytes(value, value_len, slot);
+      }
+    }
+
+    *out = std::make_shared<::arrow::Decimal128Array>(
+        type, num_values, storage, source.null_bitmap(), null_count);
+    return Status::OK();
+  }
+};
+
 /// \brief Convert an Int32 or Int64 array into a Decimal128Array
 /// The parquet spec allows systems to write decimals in int32, int64 if the values are
 /// small enough to fit in less 4 bytes or less than 8 bytes, respectively.
@@ -1353,12 +1415,16 @@ Status PrimitiveImpl::NextBatch(int64_t records_to_read, std::shared_ptr<Array>*
         case ::parquet::Type::INT64: {
           TRANSFER_DATA(::arrow::Decimal128Type, Int64Type);
         } break;
+        case ::parquet::Type::BYTE_ARRAY: {
+          TRANSFER_DATA(::arrow::Decimal128Type, ByteArrayType);
+        } break;
         case ::parquet::Type::FIXED_LEN_BYTE_ARRAY: {
           TRANSFER_DATA(::arrow::Decimal128Type, FLBAType);
         } break;
         default:
           return Status::Invalid(
-              "Physical type for decimal must be int32, int64, or fixed length binary");
+              "Physical type for decimal must be int32, int64, byte array, or fixed "
+              "length binary");
       }
     } break;
     case ::arrow::Type::TIMESTAMP: {
diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h
index 6eee0f6e2..2cd94ca28 100644
--- a/cpp/src/parquet/arrow/reader.h
+++ b/cpp/src/parquet/arrow/reader.h
@@ -18,25 +18,32 @@
 #ifndef PARQUET_ARROW_READER_H
 #define PARQUET_ARROW_READER_H

+#include <cstdint>
 #include <memory>
 #include <vector>

-#include "parquet/api/reader.h"
-#include "parquet/api/schema.h"
+#include "parquet/util/visibility.h"

 #include "arrow/io/interfaces.h"
+#include "arrow/util/macros.h"

 namespace arrow {

 class Array;
 class MemoryPool;
 class RecordBatchReader;
+class Schema;
 class Status;
 class Table;
+
 }  // namespace arrow

 namespace parquet {

+class FileMetaData;
+class ParquetFileReader;
+class ReaderProperties;
+
 namespace arrow {

 class ColumnChunkReader;
diff --git a/cpp/src/parquet/arrow/record_reader.cc b/cpp/src/parquet/arrow/record_reader.cc
index 3fbdfd586..ce6fa2a5b 100644
--- a/cpp/src/parquet/arrow/record_reader.cc
+++ b/cpp/src/parquet/arrow/record_reader.cc
@@ -19,21 +19,29 @@

 #include <algorithm>
 #include <cstdint>
+#include <cstring>
 #include <memory>
 #include <sstream>
+#include <unordered_map>
 #include <utility>

-#include <arrow/buffer.h>
-#include <arrow/memory_pool.h>
-#include <arrow/status.h>
-#include <arrow/util/bit-util.h>
-#include <arrow/util/rle-encoding.h>
+#include "arrow/buffer.h"
+#include "arrow/builder.h"
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/bit-util.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/rle-encoding.h"

 #include "parquet/column_page.h"
 #include "parquet/column_reader.h"
 #include "parquet/encoding-internal.h"
+#include "parquet/encoding.h"
 #include "parquet/exception.h"
 #include "parquet/properties.h"
+#include "parquet/schema.h"
+#include "parquet/types.h"

 using arrow::MemoryPool;

diff --git a/cpp/src/parquet/arrow/record_reader.h b/cpp/src/parquet/arrow/record_reader.h
index 4935713a2..8da070999 100644
--- a/cpp/src/parquet/arrow/record_reader.h
+++ b/cpp/src/parquet/arrow/record_reader.h
@@ -19,22 +19,24 @@
 #define PARQUET_RECORD_READER_H

 #include <cstdint>
-#include <cstring>
-#include <iostream>
 #include <memory>
-#include <unordered_map>
-#include <vector>

-#include <arrow/buffer.h>
-#include <arrow/builder.h>
-#include <arrow/memory_pool.h>
-#include <arrow/util/bit-util.h>
+#include "arrow/memory_pool.h"

-#include "parquet/column_reader.h"
-#include "parquet/schema.h"
 #include "parquet/util/macros.h"
+#include "parquet/util/memory.h"
+
+namespace arrow {
+
+class ArrayBuilder;
+
+}  // namespace arrow

 namespace parquet {
+
+class ColumnDescriptor;
+class PageReader;
+
 namespace internal {

 /// \brief Stateful column reader that delimits semantic records for both flat
diff --git a/cpp/src/parquet/arrow/schema.h b/cpp/src/parquet/arrow/schema.h
index 3b212da7e..8e920850c 100644
--- a/cpp/src/parquet/arrow/schema.h
+++ b/cpp/src/parquet/arrow/schema.h
@@ -18,14 +18,16 @@
 #ifndef PARQUET_ARROW_SCHEMA_H
 #define PARQUET_ARROW_SCHEMA_H

+#include <cstdint>
 #include <memory>
 #include <vector>

 #include "arrow/api.h"

-#include "parquet/api/schema.h"
-#include "parquet/api/writer.h"
 #include "parquet/arrow/writer.h"
+#include "parquet/metadata.h"
+#include "parquet/schema.h"
+#include "parquet/util/visibility.h"

 namespace arrow {

@@ -35,8 +37,12 @@ class Status;

 namespace parquet {

+class WriterProperties;
+
 namespace arrow {

+class ArrowWriterProperties;
+
 PARQUET_EXPORT
 ::arrow::Status NodeToField(const schema::Node& node,
                             std::shared_ptr<::arrow::Field>* out);
diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc
index 9247b84cf..923f13294 100644
--- a/cpp/src/parquet/arrow/writer.cc
+++ b/cpp/src/parquet/arrow/writer.cc
@@ -366,8 +366,9 @@ class ArrowColumnWriter {
   Status WriteTimestamps(const Array& data, int64_t num_levels, const int16_t* def_levels,
                          const int16_t* rep_levels);

-  Status WriteTimestampsCoerce(const Array& data, int64_t num_levels,
-                               const int16_t* def_levels, const int16_t* rep_levels);
+  Status WriteTimestampsCoerce(const bool truncated_timestamps_allowed, const Array& data,
+                               int64_t num_levels, const int16_t* def_levels,
+                               const int16_t* rep_levels);

   template <typename ParquetType, typename ArrowType>
   Status WriteNonNullableBatch(const ArrowType& type, int64_t num_values,
@@ -626,7 +627,8 @@ Status ArrowColumnWriter::WriteTimestamps(const Array& values, int64_t num_level
     // Casting is required. This covers several cases
     // * Nanoseconds -> cast to microseconds
     // * coerce_timestamps_enabled_, cast all timestamps to requested unit
-    return WriteTimestampsCoerce(values, num_levels, def_levels, rep_levels);
+    return WriteTimestampsCoerce(ctx_->properties->truncated_timestamps_allowed(), values,
+                                 num_levels, def_levels, rep_levels);
   } else {
     // No casting of timestamps is required, take the fast path
     return TypedWriteBatch<Int64Type, ::arrow::TimestampType>(values, num_levels,
@@ -634,7 +636,8 @@ Status ArrowColumnWriter::WriteTimestamps(const Array& values, int64_t num_level
   }
 }

-Status ArrowColumnWriter::WriteTimestampsCoerce(const Array& array, int64_t num_levels,
+Status ArrowColumnWriter::WriteTimestampsCoerce(const bool truncated_timestamps_allowed,
+                                                const Array& array, int64_t num_levels,
                                                 const int16_t* def_levels,
                                                 const int16_t* rep_levels) {
   int64_t* buffer;
@@ -652,7 +655,7 @@ Status ArrowColumnWriter::WriteTimestampsCoerce(const Array& array, int64_t num_

   auto DivideBy = [&](const int64_t factor) {
     for (int64_t i = 0; i < array.length(); i++) {
-      if (!data.IsNull(i) && (values[i] % factor != 0)) {
+      if (!truncated_timestamps_allowed && !data.IsNull(i) && (values[i] % factor != 0)) {
         std::stringstream ss;
         ss << "Casting from " << type.ToString() << " to " << target_type->ToString()
            << " would lose data: " << values[i];
diff --git a/cpp/src/parquet/arrow/writer.h b/cpp/src/parquet/arrow/writer.h
index ad6f1d52d..7e4b2287b 100644
--- a/cpp/src/parquet/arrow/writer.h
+++ b/cpp/src/parquet/arrow/writer.h
@@ -44,7 +44,10 @@ class PARQUET_EXPORT ArrowWriterProperties {
  public:
   class Builder {
    public:
-    Builder() : write_nanos_as_int96_(false), coerce_timestamps_enabled_(false) {}
+    Builder()
+        : write_nanos_as_int96_(false),
+          coerce_timestamps_enabled_(false),
+          truncated_timestamps_allowed_(false) {}
     virtual ~Builder() {}

     Builder* disable_deprecated_int96_timestamps() {
@@ -63,9 +66,20 @@ class PARQUET_EXPORT ArrowWriterProperties {
       return this;
     }

+    Builder* allow_truncated_timestamps() {
+      truncated_timestamps_allowed_ = true;
+      return this;
+    }
+
+    Builder* disallow_truncated_timestamps() {
+      truncated_timestamps_allowed_ = false;
+      return this;
+    }
+
     std::shared_ptr<ArrowWriterProperties> build() {
       return std::shared_ptr<ArrowWriterProperties>(new ArrowWriterProperties(
-          write_nanos_as_int96_, coerce_timestamps_enabled_, coerce_timestamps_unit_));
+          write_nanos_as_int96_, coerce_timestamps_enabled_, coerce_timestamps_unit_,
+          truncated_timestamps_allowed_));
     }

    private:
@@ -73,6 +87,7 @@ class PARQUET_EXPORT ArrowWriterProperties {

     bool coerce_timestamps_enabled_;
     ::arrow::TimeUnit::type coerce_timestamps_unit_;
+    bool truncated_timestamps_allowed_;
   };

   bool support_deprecated_int96_timestamps() const { return write_nanos_as_int96_; }
@@ -82,17 +97,22 @@ class PARQUET_EXPORT ArrowWriterProperties {
     return coerce_timestamps_unit_;
   }

+  bool truncated_timestamps_allowed() const { return truncated_timestamps_allowed_; }
+
  private:
   explicit ArrowWriterProperties(bool write_nanos_as_int96,
                                  bool coerce_timestamps_enabled,
-                                 ::arrow::TimeUnit::type coerce_timestamps_unit)
+                                 ::arrow::TimeUnit::type coerce_timestamps_unit,
+                                 bool truncated_timestamps_allowed)
       : write_nanos_as_int96_(write_nanos_as_int96),
         coerce_timestamps_enabled_(coerce_timestamps_enabled),
-        coerce_timestamps_unit_(coerce_timestamps_unit) {}
+        coerce_timestamps_unit_(coerce_timestamps_unit),
+        truncated_timestamps_allowed_(truncated_timestamps_allowed) {}

   const bool write_nanos_as_int96_;
   const bool coerce_timestamps_enabled_;
   const ::arrow::TimeUnit::type coerce_timestamps_unit_;
+  const bool truncated_timestamps_allowed_;
 };

 std::shared_ptr<ArrowWriterProperties> PARQUET_EXPORT default_arrow_writer_properties();
diff --git a/cpp/src/parquet/bloom_filter-test.cc b/cpp/src/parquet/bloom_filter-test.cc
index 96d2e065f..945f80b7b 100644
--- a/cpp/src/parquet/bloom_filter-test.cc
+++ b/cpp/src/parquet/bloom_filter-test.cc
@@ -17,13 +17,21 @@

 #include <gtest/gtest.h>

-#include <algorithm>
+#include <cstdint>
+#include <limits>
+#include <memory>
 #include <random>
 #include <string>
+#include <vector>

+#include "arrow/buffer.h"
 #include "arrow/io/file.h"
+#include "arrow/status.h"
+
 #include "parquet/bloom_filter.h"
+#include "parquet/exception.h"
 #include "parquet/murmur3.h"
+#include "parquet/types.h"
 #include "parquet/util/memory.h"
 #include "parquet/util/test-common.h"

diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc
index 173292ecd..7fbf9babd 100644
--- a/cpp/src/parquet/column_reader.cc
+++ b/cpp/src/parquet/column_reader.cc
@@ -52,7 +52,8 @@ int LevelDecoder::SetData(Encoding::type encoding, int16_t max_level,
       num_bytes = *reinterpret_cast<const int32_t*>(data);
       const uint8_t* decoder_data = data + sizeof(int32_t);
       if (!rle_decoder_) {
-        rle_decoder_.reset(new ::arrow::RleDecoder(decoder_data, num_bytes, bit_width_));
+        rle_decoder_.reset(
+            new ::arrow::util::RleDecoder(decoder_data, num_bytes, bit_width_));
       } else {
         rle_decoder_->Reset(decoder_data, num_bytes, bit_width_);
       }
@@ -62,7 +63,7 @@ int LevelDecoder::SetData(Encoding::type encoding, int16_t max_level,
       num_bytes =
           static_cast<int32_t>(BitUtil::BytesForBits(num_buffered_values * bit_width_));
       if (!bit_packed_decoder_) {
-        bit_packed_decoder_.reset(new ::arrow::BitReader(data, num_bytes));
+        bit_packed_decoder_.reset(new ::arrow::BitUtil::BitReader(data, num_bytes));
       } else {
         bit_packed_decoder_->Reset(data, num_bytes);
       }
@@ -123,7 +124,7 @@ class SerializedPageReader : public PageReader {
   std::shared_ptr<Page> current_page_;

   // Compression codec to use.
-  std::unique_ptr<::arrow::Codec> decompressor_;
+  std::unique_ptr<::arrow::util::Codec> decompressor_;
   std::shared_ptr<ResizableBuffer> decompression_buffer_;

   // Maximum allowed page size
diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h
index d1b4d2ef5..960f2107d 100644
--- a/cpp/src/parquet/column_reader.h
+++ b/cpp/src/parquet/column_reader.h
@@ -44,8 +44,13 @@

 namespace arrow {

+namespace BitUtil {
 class BitReader;
+}  // namespace BitUtil
+
+namespace util {
 class RleDecoder;
+}  // namespace util

 }  // namespace arrow

@@ -76,8 +81,8 @@ class PARQUET_EXPORT LevelDecoder {
   int bit_width_;
   int num_values_remaining_;
   Encoding::type encoding_;
-  std::unique_ptr<::arrow::RleDecoder> rle_decoder_;
-  std::unique_ptr<::arrow::BitReader> bit_packed_decoder_;
+  std::unique_ptr<::arrow::util::RleDecoder> rle_decoder_;
+  std::unique_ptr<::arrow::BitUtil::BitReader> bit_packed_decoder_;
 };

 // Abstract page iterator interface. This way, we can feed column pages to the
diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc
index 9c7a39bfe..a45613f1b 100644
--- a/cpp/src/parquet/column_writer.cc
+++ b/cpp/src/parquet/column_writer.cc
@@ -34,8 +34,8 @@

 namespace parquet {

-using BitWriter = ::arrow::BitWriter;
-using RleEncoder = ::arrow::RleEncoder;
+using BitWriter = ::arrow::BitUtil::BitWriter;
+using RleEncoder = ::arrow::util::RleEncoder;

 LevelEncoder::LevelEncoder() {}
 LevelEncoder::~LevelEncoder() {}
@@ -271,7 +271,7 @@ class SerializedPageWriter : public PageWriter {
   int64_t total_compressed_size_;

   // Compression codec to use.
-  std::unique_ptr<::arrow::Codec> compressor_;
+  std::unique_ptr<::arrow::util::Codec> compressor_;
 };

 // This implementation of the PageWriter writes to the final sink on Close .
diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h
index e3bfcf0ae..457c532bb 100644
--- a/cpp/src/parquet/column_writer.h
+++ b/cpp/src/parquet/column_writer.h
@@ -34,8 +34,13 @@

 namespace arrow {

+namespace BitUtil {
 class BitWriter;
+}  // namespace BitUtil
+
+namespace util {
 class RleEncoder;
+}  // namespace util

 }  // namespace arrow

@@ -67,8 +72,8 @@ class PARQUET_EXPORT LevelEncoder {
   int bit_width_;
   int rle_length_;
   Encoding::type encoding_;
-  std::unique_ptr<::arrow::RleEncoder> rle_encoder_;
-  std::unique_ptr<::arrow::BitWriter> bit_packed_encoder_;
+  std::unique_ptr<::arrow::util::RleEncoder> rle_encoder_;
+  std::unique_ptr<::arrow::BitUtil::BitWriter> bit_packed_encoder_;
 };

 class PageWriter {
diff --git a/cpp/src/parquet/encoding-internal.h b/cpp/src/parquet/encoding-internal.h
index 0bfd26fbd..93d499300 100644
--- a/cpp/src/parquet/encoding-internal.h
+++ b/cpp/src/parquet/encoding-internal.h
@@ -143,7 +143,7 @@ class PlainDecoder<BooleanType> : public Decoder<BooleanType> {

   virtual void SetData(int num_values, const uint8_t* data, int len) {
     num_values_ = num_values;
-    bit_reader_ = ::arrow::BitReader(data, len);
+    bit_reader_ = BitUtil::BitReader(data, len);
   }

   // Two flavors of bool decoding
@@ -175,7 +175,7 @@ class PlainDecoder<BooleanType> : public Decoder<BooleanType> {
   }

  private:
-  ::arrow::BitReader bit_reader_;
+  BitUtil::BitReader bit_reader_;
 };

 // ----------------------------------------------------------------------
@@ -210,7 +210,7 @@ class PlainEncoder<BooleanType> : public Encoder<BooleanType> {
         bits_available_(kInMemoryDefaultCapacity * 8),
         bits_buffer_(AllocateBuffer(pool, kInMemoryDefaultCapacity)),
         values_sink_(new InMemoryOutputStream(pool)) {
-    bit_writer_.reset(new ::arrow::BitWriter(bits_buffer_->mutable_data(),
+    bit_writer_.reset(new BitUtil::BitWriter(bits_buffer_->mutable_data(),
                                              static_cast<int>(bits_buffer_->size())));
   }

@@ -274,7 +274,7 @@ class PlainEncoder<BooleanType> : public Encoder<BooleanType> {

  protected:
   int bits_available_;
-  std::unique_ptr<::arrow::BitWriter> bit_writer_;
+  std::unique_ptr<BitUtil::BitWriter> bit_writer_;
   std::shared_ptr<ResizableBuffer> bits_buffer_;
   std::unique_ptr<InMemoryOutputStream> values_sink_;
 };
@@ -341,7 +341,7 @@ class DictionaryDecoder : public Decoder<Type> {
     uint8_t bit_width = *data;
     ++data;
     --len;
-    idx_decoder_ = ::arrow::RleDecoder(data, len, bit_width);
+    idx_decoder_ = ::arrow::util::RleDecoder(data, len, bit_width);
   }

   int Decode(T* buffer, int max_values) override {
@@ -376,7 +376,7 @@ class DictionaryDecoder : public Decoder<Type> {
   // pointers).
   std::shared_ptr<ResizableBuffer> byte_array_data_;

-  ::arrow::RleDecoder idx_decoder_;
+  ::arrow::util::RleDecoder idx_decoder_;
 };

 template <typename Type>
@@ -468,7 +468,7 @@ class DictEncoder : public Encoder<DType> {
         dict_encoded_size_(0),
         type_length_(desc->type_length()) {
     hash_slots_.Assign(hash_table_size_, HASH_SLOT_EMPTY);
-    cpu_info_ = ::arrow::CpuInfo::GetInstance();
+    cpu_info_ = ::arrow::internal::CpuInfo::GetInstance();
   }

   ~DictEncoder() override { DCHECK(buffered_indices_.empty()); }
@@ -487,9 +487,9 @@ class DictEncoder : public Encoder<DType> {
     // an extra "RleEncoder::MinBufferSize" bytes. These extra bytes won't be used
     // but not reserving them would cause the encoder to fail.
     return 1 +
-           ::arrow::RleEncoder::MaxBufferSize(
+           ::arrow::util::RleEncoder::MaxBufferSize(
                bit_width(), static_cast<int>(buffered_indices_.size())) +
-           ::arrow::RleEncoder::MinBufferSize(bit_width());
+           ::arrow::util::RleEncoder::MinBufferSize(bit_width());
   }

   /// The minimum bit width required to encode the currently buffered indices.
@@ -580,7 +580,7 @@ class DictEncoder : public Encoder<DType> {
   // For ByteArray / FixedLenByteArray data. Not owned
   ChunkedAllocator* pool_;

-  ::arrow::CpuInfo* cpu_info_;
+  ::arrow::internal::CpuInfo* cpu_info_;

   /// Size of the table. Must be a power of 2.
   int hash_table_size_;
@@ -791,7 +791,7 @@ inline int DictEncoder<DType>::WriteIndices(uint8_t* buffer, int buffer_len) {
   ++buffer;
   --buffer_len;

-  ::arrow::RleEncoder encoder(buffer, buffer_len, bit_width());
+  ::arrow::util::RleEncoder encoder(buffer, buffer_len, bit_width());
   for (int index : buffered_indices_) {
     if (!encoder.Put(index)) return -1;
   }
@@ -819,7 +819,7 @@ class DeltaBitPackDecoder : public Decoder<DType> {

   virtual void SetData(int num_values, const uint8_t* data, int len) {
     num_values_ = num_values;
-    decoder_ = ::arrow::BitReader(data, len);
+    decoder_ = BitUtil::BitReader(data, len);
     values_current_block_ = 0;
     values_current_mini_block_ = 0;
   }
@@ -885,7 +885,7 @@ class DeltaBitPackDecoder : public Decoder<DType> {
   }

   ::arrow::MemoryPool* pool_;
-  ::arrow::BitReader decoder_;
+  BitUtil::BitReader decoder_;
   int32_t values_current_block_;
   int32_t num_mini_blocks_;
   uint64_t values_per_mini_block_;
diff --git a/cpp/src/parquet/file-deserialize-test.cc b/cpp/src/parquet/file-deserialize-test.cc
index b766eedf5..17dfe387f 100644
--- a/cpp/src/parquet/file-deserialize-test.cc
+++ b/cpp/src/parquet/file-deserialize-test.cc
@@ -17,16 +17,11 @@

 #include <gtest/gtest.h>

-#include <algorithm>
 #include <cstdint>
-#include <cstdlib>
 #include <cstring>
-#include <exception>
 #include <memory>
-#include <string>
-#include <vector>

-#include "parquet/column_reader.h"
+#include "parquet/column_page.h"
 #include "parquet/exception.h"
 #include "parquet/file_reader.h"
 #include "parquet/thrift.h"
@@ -34,6 +29,8 @@
 #include "parquet/util/memory.h"
 #include "parquet/util/test-common.h"

+#include "arrow/io/memory.h"
+#include "arrow/status.h"
 #include "arrow/util/compression.h"

 namespace parquet {
@@ -196,7 +193,7 @@ TEST_F(TestPageSerde, Compression) {
     test::random_bytes(page_size, 0, &faux_data[i]);
   }
   for (auto codec_type : codec_types) {
-    std::unique_ptr<::arrow::Codec> codec = GetCodecFromArrow(codec_type);
+    auto codec = GetCodecFromArrow(codec_type);

     std::vector<uint8_t> buffer;
     for (int i = 0; i < num_pages; ++i) {
diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc
index ea518fd98..5be1a8623 100644
--- a/cpp/src/parquet/file_reader.cc
+++ b/cpp/src/parquet/file_reader.cc
@@ -41,12 +41,6 @@

 using std::string;

-namespace arrow {
-
-class Codec;
-
-}  // namespace arrow
-
 namespace parquet {

 // PARQUET-978: Minimize footer reads by reading 64 KB from the end of the file
diff --git a/cpp/src/parquet/metadata-test.cc b/cpp/src/parquet/metadata-test.cc
index 53653bd78..bcf911eab 100644
--- a/cpp/src/parquet/metadata-test.cc
+++ b/cpp/src/parquet/metadata-test.cc
@@ -16,9 +16,12 @@
 // under the License.

 #include "parquet/metadata.h"
+
 #include <gtest/gtest.h>
+
 #include "parquet/schema.h"
 #include "parquet/statistics.h"
+#include "parquet/thrift.h"
 #include "parquet/types.h"

 namespace parquet {
@@ -219,12 +222,36 @@ TEST(ApplicationVersion, Basics) {

   ASSERT_EQ(true, version.VersionLt(version1));

-  ASSERT_FALSE(version1.HasCorrectStatistics(Type::INT96, SortOrder::UNKNOWN));
-  ASSERT_TRUE(version.HasCorrectStatistics(Type::INT32, SortOrder::SIGNED));
-  ASSERT_FALSE(version.HasCorrectStatistics(Type::BYTE_ARRAY, SortOrder::SIGNED));
-  ASSERT_TRUE(version1.HasCorrectStatistics(Type::BYTE_ARRAY, SortOrder::SIGNED));
+  EncodedStatistics stats;
+  ASSERT_FALSE(version1.HasCorrectStatistics(Type::INT96, stats, SortOrder::UNKNOWN));
+  ASSERT_TRUE(version.HasCorrectStatistics(Type::INT32, stats, SortOrder::SIGNED));
+  ASSERT_FALSE(version.HasCorrectStatistics(Type::BYTE_ARRAY, stats, SortOrder::SIGNED));
+  ASSERT_TRUE(version1.HasCorrectStatistics(Type::BYTE_ARRAY, stats, SortOrder::SIGNED));
+  ASSERT_FALSE(
+      version1.HasCorrectStatistics(Type::BYTE_ARRAY, stats, SortOrder::UNSIGNED));
+  ASSERT_TRUE(version3.HasCorrectStatistics(Type::FIXED_LEN_BYTE_ARRAY, stats,
+                                            SortOrder::SIGNED));
+
+  // Check that the old stats are correct if min and max are the same
+  // regardless of sort order
+  EncodedStatistics stats_str;
+  stats_str.set_min("a").set_max("b");
+  ASSERT_FALSE(
+      version1.HasCorrectStatistics(Type::BYTE_ARRAY, stats_str, SortOrder::UNSIGNED));
+  stats_str.set_max("a");
+  ASSERT_TRUE(
+      version1.HasCorrectStatistics(Type::BYTE_ARRAY, stats_str, SortOrder::UNSIGNED));
+
+  // Check that the same holds true for ints
+  int32_t int_min = 100, int_max = 200;
+  EncodedStatistics stats_int;
+  stats_int.set_min(std::string(reinterpret_cast<const char*>(&int_min), 4))
+      .set_max(std::string(reinterpret_cast<const char*>(&int_max), 4));
+  ASSERT_FALSE(
+      version1.HasCorrectStatistics(Type::BYTE_ARRAY, stats_int, SortOrder::UNSIGNED));
+  stats_int.set_max(std::string(reinterpret_cast<const char*>(&int_min), 4));
   ASSERT_TRUE(
-      version3.HasCorrectStatistics(Type::FIXED_LEN_BYTE_ARRAY, SortOrder::SIGNED));
+      version1.HasCorrectStatistics(Type::BYTE_ARRAY, stats_int, SortOrder::UNSIGNED));
 }

 }  // namespace metadata
diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc
index 9c66c7aab..f49393b60 100644
--- a/cpp/src/parquet/metadata.cc
+++ b/cpp/src/parquet/metadata.cc
@@ -55,7 +55,8 @@ static std::shared_ptr<RowGroupStatistics> MakeTypedColumnStats(
     return std::make_shared<TypedRowGroupStatistics<DType>>(
         descr, metadata.statistics.min_value, metadata.statistics.max_value,
         metadata.num_values - metadata.statistics.null_count,
-        metadata.statistics.null_count, metadata.statistics.distinct_count, true);
+        metadata.statistics.null_count, metadata.statistics.distinct_count,
+        metadata.statistics.__isset.max_value || metadata.statistics.__isset.min_value);
   }
   // Default behavior
   return std::make_shared<TypedRowGroupStatistics<DType>>(
@@ -100,7 +101,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
     for (auto encoding : meta_data.encodings) {
       encodings_.push_back(FromThrift(encoding));
     }
-    stats_ = nullptr;
+    possible_stats_ = nullptr;
   }
   ~ColumnChunkMetaDataImpl() {}

@@ -125,15 +126,22 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
   //    Eg: UTF8
   inline bool is_stats_set() const {
     DCHECK(writer_version_ != nullptr);
-    return column_->meta_data.__isset.statistics &&
-           writer_version_->HasCorrectStatistics(type(), descr_->sort_order());
+    // Stats are unusable when the chunk carries none or when the column's
+    // sort order is unknown
+    const bool has_stats = column_->meta_data.__isset.statistics;
+    if (!has_stats || descr_->sort_order() == SortOrder::UNKNOWN) {
+      return false;
+    }
+    if (!possible_stats_) {
+      possible_stats_ = MakeColumnStats(column_->meta_data, descr_);
+    }
+    // Named local: HasCorrectStatistics takes its statistics by non-const reference
+    EncodedStatistics encoded = possible_stats_->Encode();
+    return writer_version_->HasCorrectStatistics(type(), encoded, descr_->sort_order());
   }

   inline std::shared_ptr<RowGroupStatistics> statistics() const {
-    if (stats_ == nullptr && is_stats_set()) {
-      stats_ = MakeColumnStats(column_->meta_data, descr_);
-    }
-    return stats_;
+    return is_stats_set() ? possible_stats_ : nullptr;
   }

   inline Compression::type compression() const {
@@ -169,7 +177,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
   }

  private:
-  mutable std::shared_ptr<RowGroupStatistics> stats_;
+  mutable std::shared_ptr<RowGroupStatistics> possible_stats_;
   std::vector<Encoding::type> encodings_;
   const format::ColumnChunk* column_;
   const ColumnDescriptor* descr_;
@@ -530,11 +538,16 @@ bool ApplicationVersion::VersionEq(const ApplicationVersion& other_version) cons
 // parquet-mr/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java
 // PARQUET-686 has more disussion on statistics
 bool ApplicationVersion::HasCorrectStatistics(Type::type col_type,
+                                              EncodedStatistics& statistics,
                                               SortOrder::type sort_order) const {
   // Parquet cpp version 1.3.0 onwards stats are computed correctly for all types
   if ((application_ != "parquet-cpp") || (VersionLt(PARQUET_CPP_FIXED_STATS_VERSION()))) {
-    // Only SIGNED are valid
-    if (SortOrder::SIGNED != sort_order) {
+    // Only SIGNED are valid unless max and min are the same
+    // (in which case the sort order does not matter)
+    bool max_equals_min = statistics.has_min && statistics.has_max
+                              ? statistics.min() == statistics.max()
+                              : false;
+    if (SortOrder::SIGNED != sort_order && !max_equals_min) {
       return false;
     }

diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h
index 79f4fdb35..7e29fe91a 100644
--- a/cpp/src/parquet/metadata.h
+++ b/cpp/src/parquet/metadata.h
@@ -85,7 +85,7 @@ class ApplicationVersion {
   bool VersionEq(const ApplicationVersion& other_version) const;

   // Checks if the Version has the correct statistics for a given column
-  bool HasCorrectStatistics(Type::type primitive,
+  bool HasCorrectStatistics(Type::type primitive, EncodedStatistics& statistics,
                             SortOrder::type sort_order = SortOrder::SIGNED) const;
 };

diff --git a/cpp/src/parquet/public-api-test.cc b/cpp/src/parquet/public-api-test.cc
index 958e97016..c0ef97a70 100644
--- a/cpp/src/parquet/public-api-test.cc
+++ b/cpp/src/parquet/public-api-test.cc
@@ -17,10 +17,10 @@

 #include <gtest/gtest.h>

-#include "parquet/api/io.h"
-#include "parquet/api/reader.h"
-#include "parquet/api/schema.h"
-#include "parquet/api/writer.h"
+#include "parquet/api/io.h"      // IWYU pragma: keep
+#include "parquet/api/reader.h"  // IWYU pragma: keep
+#include "parquet/api/schema.h"  // IWYU pragma: keep
+#include "parquet/api/writer.h"  // IWYU pragma: keep

 TEST(TestPublicAPI, DoesNotIncludeThrift) {
 #ifdef _THRIFT_THRIFT_H_
diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h
index 217cc76c0..9c665acfa 100644
--- a/cpp/src/parquet/thrift.h
+++ b/cpp/src/parquet/thrift.h
@@ -44,7 +44,7 @@
 #include "parquet/exception.h"
 #include "parquet/util/memory.h"

-#include "parquet/parquet_types.h"
+#include "parquet/parquet_types.h"  // IWYU pragma: export

 namespace parquet {

diff --git a/cpp/src/parquet/util/memory.cc b/cpp/src/parquet/util/memory.cc
index 5c76cd8a6..fde424aaf 100644
--- a/cpp/src/parquet/util/memory.cc
+++ b/cpp/src/parquet/util/memory.cc
@@ -32,31 +32,32 @@
 #include "parquet/types.h"

 using arrow::MemoryPool;
+using arrow::util::Codec;

 namespace parquet {

-std::unique_ptr<::arrow::Codec> GetCodecFromArrow(Compression::type codec) {
-  std::unique_ptr<::arrow::Codec> result;
+std::unique_ptr<Codec> GetCodecFromArrow(Compression::type codec) {
+  std::unique_ptr<Codec> result;
   switch (codec) {
     case Compression::UNCOMPRESSED:
       break;
     case Compression::SNAPPY:
-      PARQUET_THROW_NOT_OK(::arrow::Codec::Create(::arrow::Compression::SNAPPY, &result));
+      PARQUET_THROW_NOT_OK(Codec::Create(::arrow::Compression::SNAPPY, &result));
       break;
     case Compression::GZIP:
-      PARQUET_THROW_NOT_OK(::arrow::Codec::Create(::arrow::Compression::GZIP, &result));
+      PARQUET_THROW_NOT_OK(Codec::Create(::arrow::Compression::GZIP, &result));
       break;
     case Compression::LZO:
-      PARQUET_THROW_NOT_OK(::arrow::Codec::Create(::arrow::Compression::LZO, &result));
+      PARQUET_THROW_NOT_OK(Codec::Create(::arrow::Compression::LZO, &result));
       break;
     case Compression::BROTLI:
-      PARQUET_THROW_NOT_OK(::arrow::Codec::Create(::arrow::Compression::BROTLI, &result));
+      PARQUET_THROW_NOT_OK(Codec::Create(::arrow::Compression::BROTLI, &result));
       break;
     case Compression::LZ4:
-      PARQUET_THROW_NOT_OK(::arrow::Codec::Create(::arrow::Compression::LZ4, &result));
+      PARQUET_THROW_NOT_OK(Codec::Create(::arrow::Compression::LZ4, &result));
       break;
     case Compression::ZSTD:
-      PARQUET_THROW_NOT_OK(::arrow::Codec::Create(::arrow::Compression::ZSTD, &result));
+      PARQUET_THROW_NOT_OK(Codec::Create(::arrow::Compression::ZSTD, &result));
       break;
     default:
       break;
diff --git a/cpp/src/parquet/util/memory.h b/cpp/src/parquet/util/memory.h
index 2eadb3326..cccafe8cb 100644
--- a/cpp/src/parquet/util/memory.h
+++ b/cpp/src/parquet/util/memory.h
@@ -37,15 +37,17 @@
 #include "parquet/util/visibility.h"

 namespace arrow {
+namespace util {

 class Codec;

+}  // namespace util
 }  // namespace arrow

 namespace parquet {

 PARQUET_EXPORT
-std::unique_ptr<::arrow::Codec> GetCodecFromArrow(Compression::type codec);
+std::unique_ptr<::arrow::util::Codec> GetCodecFromArrow(Compression::type codec);

 static constexpr int64_t kInMemoryDefaultCapacity = 1024;