Created October 10, 2022 16:05
Save flowpoint/08e76e9a90544009b298e5bea9219236 to your computer and use it in GitHub Desktop.
pyarrow_big_string_patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/cpp/src/arrow/compute/kernels/row_encoder.cc b/cpp/src/arrow/compute/kernels/row_encoder.cc | |
index f553708cc..6a0862c0b 100644 | |
--- a/cpp/src/arrow/compute/kernels/row_encoder.cc | |
+++ b/cpp/src/arrow/compute/kernels/row_encoder.cc | |
@@ -63,13 +63,13 @@ Status KeyEncoder::DecodeNulls(MemoryPool* pool, int32_t length, uint8_t** encod | |
} | |
void BooleanKeyEncoder::AddLength(const ExecValue&, int64_t batch_length, | |
- int32_t* lengths) { | |
+ int64_t* lengths) { | |
for (int64_t i = 0; i < batch_length; ++i) { | |
lengths[i] += kByteWidth + kExtraByteForNull; | |
} | |
} | |
-void BooleanKeyEncoder::AddLengthNull(int32_t* length) { | |
+void BooleanKeyEncoder::AddLengthNull(int64_t* length) { | |
*length += kByteWidth + kExtraByteForNull; | |
} | |
@@ -128,13 +128,13 @@ Result<std::shared_ptr<ArrayData>> BooleanKeyEncoder::Decode(uint8_t** encoded_b | |
} | |
void FixedWidthKeyEncoder::AddLength(const ExecValue&, int64_t batch_length, | |
- int32_t* lengths) { | |
+ int64_t* lengths) { | |
for (int64_t i = 0; i < batch_length; ++i) { | |
lengths[i] += byte_width_ + kExtraByteForNull; | |
} | |
} | |
-void FixedWidthKeyEncoder::AddLengthNull(int32_t* length) { | |
+void FixedWidthKeyEncoder::AddLengthNull(int64_t* length) { | |
*length += byte_width_ + kExtraByteForNull; | |
} | |
@@ -304,7 +304,7 @@ void RowEncoder::Init(const std::vector<TypeHolder>& column_types, ExecContext* | |
ARROW_DCHECK(false); | |
} | |
- int32_t total_length = 0; | |
+ int64_t total_length = 0; | |
for (size_t i = 0; i < column_types.size(); ++i) { | |
encoders_[i]->AddLengthNull(&total_length); | |
} | |
@@ -325,17 +325,17 @@ Status RowEncoder::EncodeAndAppend(const ExecSpan& batch) { | |
offsets_.resize(1); | |
offsets_[0] = 0; | |
} | |
- size_t length_before = offsets_.size() - 1; | |
+ int64_t length_before = offsets_.size() - 1; | |
offsets_.resize(length_before + batch.length + 1); | |
for (int64_t i = 0; i < batch.length; ++i) { | |
offsets_[length_before + 1 + i] = 0; | |
} | |
for (int i = 0; i < batch.num_values(); ++i) { | |
- encoders_[i]->AddLength(batch[i], batch.length, offsets_.data() + length_before + 1); | |
+ encoders_[i]->AddLength(batch[i], batch.length, (int64_t*) offsets_.data() + length_before + 1); | |
} | |
- int32_t total_length = offsets_[length_before]; | |
+ int64_t total_length = offsets_[length_before]; | |
for (int64_t i = 0; i < batch.length; ++i) { | |
total_length += offsets_[length_before + 1 + i]; | |
offsets_[length_before + 1 + i] = total_length; | |
diff --git a/cpp/src/arrow/compute/kernels/row_encoder.h b/cpp/src/arrow/compute/kernels/row_encoder.h | |
index 5fe80e0f5..49f82d661 100644 | |
--- a/cpp/src/arrow/compute/kernels/row_encoder.h | |
+++ b/cpp/src/arrow/compute/kernels/row_encoder.h | |
@@ -40,9 +40,9 @@ struct KeyEncoder { | |
virtual ~KeyEncoder() = default; | |
virtual void AddLength(const ExecValue& value, int64_t batch_length, | |
- int32_t* lengths) = 0; | |
+ int64_t* lengths) = 0; | |
- virtual void AddLengthNull(int32_t* length) = 0; | |
+ virtual void AddLengthNull(int64_t* length) = 0; | |
virtual Status Encode(const ExecValue&, int64_t batch_length, | |
uint8_t** encoded_bytes) = 0; | |
@@ -64,9 +64,9 @@ struct KeyEncoder { | |
struct BooleanKeyEncoder : KeyEncoder { | |
static constexpr int kByteWidth = 1; | |
- void AddLength(const ExecValue& data, int64_t batch_length, int32_t* lengths) override; | |
+ void AddLength(const ExecValue& data, int64_t batch_length, int64_t* lengths) override; | |
- void AddLengthNull(int32_t* length) override; | |
+ void AddLengthNull(int64_t* length) override; | |
Status Encode(const ExecValue& data, int64_t batch_length, | |
uint8_t** encoded_bytes) override; | |
@@ -82,9 +82,9 @@ struct FixedWidthKeyEncoder : KeyEncoder { | |
: type_(std::move(type)), | |
byte_width_(checked_cast<const FixedWidthType&>(*type_).bit_width() / 8) {} | |
- void AddLength(const ExecValue& data, int64_t batch_length, int32_t* lengths) override; | |
+ void AddLength(const ExecValue& data, int64_t batch_length, int64_t* lengths) override; | |
- void AddLengthNull(int32_t* length) override; | |
+ void AddLengthNull(int64_t* length) override; | |
Status Encode(const ExecValue& data, int64_t batch_length, | |
uint8_t** encoded_bytes) override; | |
@@ -116,7 +116,7 @@ template <typename T> | |
struct VarLengthKeyEncoder : KeyEncoder { | |
using Offset = typename T::offset_type; | |
- void AddLength(const ExecValue& data, int64_t batch_length, int32_t* lengths) override { | |
+ void AddLength(const ExecValue& data, int64_t batch_length, int64_t* lengths) override { | |
if (data.is_array()) { | |
int64_t i = 0; | |
VisitArraySpanInline<T>( | |
@@ -137,7 +137,7 @@ struct VarLengthKeyEncoder : KeyEncoder { | |
} | |
} | |
- void AddLengthNull(int32_t* length) override { | |
+ void AddLengthNull(int64_t* length) override { | |
*length += kExtraByteForNull + sizeof(Offset); | |
} | |
@@ -234,9 +234,9 @@ struct VarLengthKeyEncoder : KeyEncoder { | |
}; | |
struct NullKeyEncoder : KeyEncoder { | |
- void AddLength(const ExecValue&, int64_t batch_length, int32_t* lengths) override {} | |
+ void AddLength(const ExecValue&, int64_t batch_length, int64_t* lengths) override {} | |
- void AddLengthNull(int32_t* length) override {} | |
+ void AddLengthNull(int64_t* length) override {} | |
Status Encode(const ExecValue& data, int64_t batch_length, | |
uint8_t** encoded_bytes) override { | |
@@ -277,7 +277,7 @@ class ARROW_EXPORT RowEncoder { | |
private: | |
ExecContext* ctx_; | |
std::vector<std::shared_ptr<KeyEncoder>> encoders_; | |
- std::vector<int32_t> offsets_; | |
+ std::vector<int64_t> offsets_; | |
std::vector<uint8_t> bytes_; | |
std::vector<uint8_t> encoded_nulls_; | |
std::vector<std::shared_ptr<ExtensionType>> extension_types_; | |
diff --git a/cpp/src/arrow/compute/row/grouper.cc b/cpp/src/arrow/compute/row/grouper.cc | |
index cf0aa7dd4..4e442c6c3 100644 | |
--- a/cpp/src/arrow/compute/row/grouper.cc | |
+++ b/cpp/src/arrow/compute/row/grouper.cc | |
@@ -98,7 +98,7 @@ struct GrouperImpl : Grouper { | |
Result<Datum> Consume(const ExecSpan& batch) override { | |
std::vector<int32_t> offsets_batch(batch.length + 1); | |
for (int i = 0; i < batch.num_values(); ++i) { | |
- encoders_[i]->AddLength(batch[i], batch.length, offsets_batch.data()); | |
+ encoders_[i]->AddLength(batch[i], batch.length, (int64_t*)offsets_batch.data()); | |
} | |
int32_t total_length = 0; |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.