Skip to content

Instantly share code, notes, and snippets.

@flowpoint
Created October 10, 2022 16:05
Show Gist options
  • Save flowpoint/08e76e9a90544009b298e5bea9219236 to your computer and use it in GitHub Desktop.
pyarrow_big_string_patch
diff --git a/cpp/src/arrow/compute/kernels/row_encoder.cc b/cpp/src/arrow/compute/kernels/row_encoder.cc
index f553708cc..6a0862c0b 100644
--- a/cpp/src/arrow/compute/kernels/row_encoder.cc
+++ b/cpp/src/arrow/compute/kernels/row_encoder.cc
@@ -63,13 +63,13 @@ Status KeyEncoder::DecodeNulls(MemoryPool* pool, int32_t length, uint8_t** encod
}
void BooleanKeyEncoder::AddLength(const ExecValue&, int64_t batch_length,
- int32_t* lengths) {
+ int64_t* lengths) {
for (int64_t i = 0; i < batch_length; ++i) {
lengths[i] += kByteWidth + kExtraByteForNull;
}
}
-void BooleanKeyEncoder::AddLengthNull(int32_t* length) {
+void BooleanKeyEncoder::AddLengthNull(int64_t* length) {
*length += kByteWidth + kExtraByteForNull;
}
@@ -128,13 +128,13 @@ Result<std::shared_ptr<ArrayData>> BooleanKeyEncoder::Decode(uint8_t** encoded_b
}
void FixedWidthKeyEncoder::AddLength(const ExecValue&, int64_t batch_length,
- int32_t* lengths) {
+ int64_t* lengths) {
for (int64_t i = 0; i < batch_length; ++i) {
lengths[i] += byte_width_ + kExtraByteForNull;
}
}
-void FixedWidthKeyEncoder::AddLengthNull(int32_t* length) {
+void FixedWidthKeyEncoder::AddLengthNull(int64_t* length) {
*length += byte_width_ + kExtraByteForNull;
}
@@ -304,7 +304,7 @@ void RowEncoder::Init(const std::vector<TypeHolder>& column_types, ExecContext*
ARROW_DCHECK(false);
}
- int32_t total_length = 0;
+ int64_t total_length = 0;
for (size_t i = 0; i < column_types.size(); ++i) {
encoders_[i]->AddLengthNull(&total_length);
}
@@ -325,17 +325,17 @@ Status RowEncoder::EncodeAndAppend(const ExecSpan& batch) {
offsets_.resize(1);
offsets_[0] = 0;
}
- size_t length_before = offsets_.size() - 1;
+ int64_t length_before = offsets_.size() - 1;
offsets_.resize(length_before + batch.length + 1);
for (int64_t i = 0; i < batch.length; ++i) {
offsets_[length_before + 1 + i] = 0;
}
for (int i = 0; i < batch.num_values(); ++i) {
- encoders_[i]->AddLength(batch[i], batch.length, offsets_.data() + length_before + 1);
+ encoders_[i]->AddLength(batch[i], batch.length, (int64_t*) offsets_.data() + length_before + 1);
}
- int32_t total_length = offsets_[length_before];
+ int64_t total_length = offsets_[length_before];
for (int64_t i = 0; i < batch.length; ++i) {
total_length += offsets_[length_before + 1 + i];
offsets_[length_before + 1 + i] = total_length;
diff --git a/cpp/src/arrow/compute/kernels/row_encoder.h b/cpp/src/arrow/compute/kernels/row_encoder.h
index 5fe80e0f5..49f82d661 100644
--- a/cpp/src/arrow/compute/kernels/row_encoder.h
+++ b/cpp/src/arrow/compute/kernels/row_encoder.h
@@ -40,9 +40,9 @@ struct KeyEncoder {
virtual ~KeyEncoder() = default;
virtual void AddLength(const ExecValue& value, int64_t batch_length,
- int32_t* lengths) = 0;
+ int64_t* lengths) = 0;
- virtual void AddLengthNull(int32_t* length) = 0;
+ virtual void AddLengthNull(int64_t* length) = 0;
virtual Status Encode(const ExecValue&, int64_t batch_length,
uint8_t** encoded_bytes) = 0;
@@ -64,9 +64,9 @@ struct KeyEncoder {
struct BooleanKeyEncoder : KeyEncoder {
static constexpr int kByteWidth = 1;
- void AddLength(const ExecValue& data, int64_t batch_length, int32_t* lengths) override;
+ void AddLength(const ExecValue& data, int64_t batch_length, int64_t* lengths) override;
- void AddLengthNull(int32_t* length) override;
+ void AddLengthNull(int64_t* length) override;
Status Encode(const ExecValue& data, int64_t batch_length,
uint8_t** encoded_bytes) override;
@@ -82,9 +82,9 @@ struct FixedWidthKeyEncoder : KeyEncoder {
: type_(std::move(type)),
byte_width_(checked_cast<const FixedWidthType&>(*type_).bit_width() / 8) {}
- void AddLength(const ExecValue& data, int64_t batch_length, int32_t* lengths) override;
+ void AddLength(const ExecValue& data, int64_t batch_length, int64_t* lengths) override;
- void AddLengthNull(int32_t* length) override;
+ void AddLengthNull(int64_t* length) override;
Status Encode(const ExecValue& data, int64_t batch_length,
uint8_t** encoded_bytes) override;
@@ -116,7 +116,7 @@ template <typename T>
struct VarLengthKeyEncoder : KeyEncoder {
using Offset = typename T::offset_type;
- void AddLength(const ExecValue& data, int64_t batch_length, int32_t* lengths) override {
+ void AddLength(const ExecValue& data, int64_t batch_length, int64_t* lengths) override {
if (data.is_array()) {
int64_t i = 0;
VisitArraySpanInline<T>(
@@ -137,7 +137,7 @@ struct VarLengthKeyEncoder : KeyEncoder {
}
}
- void AddLengthNull(int32_t* length) override {
+ void AddLengthNull(int64_t* length) override {
*length += kExtraByteForNull + sizeof(Offset);
}
@@ -234,9 +234,9 @@ struct VarLengthKeyEncoder : KeyEncoder {
};
struct NullKeyEncoder : KeyEncoder {
- void AddLength(const ExecValue&, int64_t batch_length, int32_t* lengths) override {}
+ void AddLength(const ExecValue&, int64_t batch_length, int64_t* lengths) override {}
- void AddLengthNull(int32_t* length) override {}
+ void AddLengthNull(int64_t* length) override {}
Status Encode(const ExecValue& data, int64_t batch_length,
uint8_t** encoded_bytes) override {
@@ -277,7 +277,7 @@ class ARROW_EXPORT RowEncoder {
private:
ExecContext* ctx_;
std::vector<std::shared_ptr<KeyEncoder>> encoders_;
- std::vector<int32_t> offsets_;
+ std::vector<int64_t> offsets_;
std::vector<uint8_t> bytes_;
std::vector<uint8_t> encoded_nulls_;
std::vector<std::shared_ptr<ExtensionType>> extension_types_;
diff --git a/cpp/src/arrow/compute/row/grouper.cc b/cpp/src/arrow/compute/row/grouper.cc
index cf0aa7dd4..4e442c6c3 100644
--- a/cpp/src/arrow/compute/row/grouper.cc
+++ b/cpp/src/arrow/compute/row/grouper.cc
@@ -98,7 +98,7 @@ struct GrouperImpl : Grouper {
Result<Datum> Consume(const ExecSpan& batch) override {
std::vector<int32_t> offsets_batch(batch.length + 1);
for (int i = 0; i < batch.num_values(); ++i) {
- encoders_[i]->AddLength(batch[i], batch.length, offsets_batch.data());
+ encoders_[i]->AddLength(batch[i], batch.length, (int64_t*)offsets_batch.data());
}
int32_t total_length = 0;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment