Skip to content

Instantly share code, notes, and snippets.

@kszucs
Created July 22, 2019 18:28
Show Gist options
  • Star (0): you must be signed in to star a gist
  • Fork (0): you must be signed in to fork a gist
  • Save kszucs/2172802c04c43540068067692febab38 to your computer and use it in GitHub Desktop.
diff --git a/cpp/build-support/get_apache_mirror.py b/cpp/build-support/get_apache_mirror.py
index 38ea6f4d2..ac55abad4 100755
--- a/cpp/build-support/get_apache_mirror.py
+++ b/cpp/build-support/get_apache_mirror.py
@@ -20,6 +20,8 @@
# mirror for downloading dependencies, e.g. in CMake
import json
+import warnings
+
try:
import requests
@@ -35,6 +37,14 @@ except ImportError:
def get_url(url):
return urlopen(url).read()
-suggested_mirror = get_url('https://www.apache.org/dyn/'
- 'closer.cgi?as_json=1')
-print(json.loads(suggested_mirror.decode('utf-8'))['preferred'])
+url = 'https://www.apache.org/dyn/closer.cgi?as_json=1'
+
+try:
+ suggested_mirror = get_url(url)
+except Exception as e:
+ warnings.warn("Failed loading {url!r}: {e}".format(**locals()),
+ RuntimeWarning)
+ # Well-known mirror, in case the URL above fails loading
+ print("http://apache.osuosl.org/")
+else:
+ print(json.loads(suggested_mirror.decode('utf-8'))['preferred'])
diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake
index aa1e557af..15c8b6e07 100644
--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake
@@ -40,11 +40,10 @@ set(APACHE_MIRROR "")
macro(get_apache_mirror)
if(APACHE_MIRROR STREQUAL "")
- exec_program(${PYTHON_EXECUTABLE}
- ARGS
- ${CMAKE_SOURCE_DIR}/build-support/get_apache_mirror.py
- OUTPUT_VARIABLE
- APACHE_MIRROR)
+ execute_process(COMMAND ${PYTHON_EXECUTABLE}
+ ${CMAKE_SOURCE_DIR}/build-support/get_apache_mirror.py
+ OUTPUT_VARIABLE APACHE_MIRROR
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
endif()
endmacro()
diff --git a/cpp/src/arrow/array-binary-test.cc b/cpp/src/arrow/array-binary-test.cc
index cb8d6d530..71fb81ebb 100644
--- a/cpp/src/arrow/array-binary-test.cc
+++ b/cpp/src/arrow/array-binary-test.cc
@@ -40,6 +40,9 @@ namespace arrow {
using internal::checked_cast;
+using StringTypes =
+ ::testing::Types<StringType, LargeStringType, BinaryType, LargeBinaryType>;
+
// ----------------------------------------------------------------------
// String / Binary tests
@@ -67,8 +70,14 @@ void CheckStringArray(const ArrayType& array, const std::vector<std::string>& st
}
}
+template <typename T>
class TestStringArray : public ::testing::Test {
public:
+ using TypeClass = T;
+ using offset_type = typename TypeClass::offset_type;
+ using ArrayType = typename TypeTraits<TypeClass>::ArrayType;
+ using BuilderType = typename TypeTraits<TypeClass>::BuilderType;
+
void SetUp() {
chars_ = {'a', 'b', 'b', 'c', 'c', 'c'};
offsets_ = {0, 1, 1, 1, 3, 6};
@@ -85,268 +94,132 @@ class TestStringArray : public ::testing::Test {
ASSERT_OK(BitUtil::BytesToBits(valid_bytes_, default_memory_pool(), &null_bitmap_));
null_count_ = CountNulls(valid_bytes_);
- strings_ = std::make_shared<StringArray>(length_, offsets_buf_, value_buf_,
- null_bitmap_, null_count_);
- }
-
- protected:
- std::vector<int32_t> offsets_;
- std::vector<char> chars_;
- std::vector<uint8_t> valid_bytes_;
-
- std::vector<std::string> expected_;
-
- std::shared_ptr<Buffer> value_buf_;
- std::shared_ptr<Buffer> offsets_buf_;
- std::shared_ptr<Buffer> null_bitmap_;
-
- int64_t null_count_;
- int64_t length_;
-
- std::shared_ptr<StringArray> strings_;
-};
-
-TEST_F(TestStringArray, TestArrayBasics) {
- ASSERT_EQ(length_, strings_->length());
- ASSERT_EQ(1, strings_->null_count());
- ASSERT_OK(ValidateArray(*strings_));
-}
-
-TEST_F(TestStringArray, TestType) {
- std::shared_ptr<DataType> type = strings_->type();
-
- ASSERT_EQ(Type::STRING, type->id());
- ASSERT_EQ(Type::STRING, strings_->type_id());
-}
-
-TEST_F(TestStringArray, TestListFunctions) {
- int pos = 0;
- for (size_t i = 0; i < expected_.size(); ++i) {
- ASSERT_EQ(pos, strings_->value_offset(i));
- ASSERT_EQ(static_cast<int>(expected_[i].size()), strings_->value_length(i));
- pos += static_cast<int>(expected_[i].size());
- }
-}
-
-TEST_F(TestStringArray, TestDestructor) {
- auto arr = std::make_shared<StringArray>(length_, offsets_buf_, value_buf_,
+ strings_ = std::make_shared<ArrayType>(length_, offsets_buf_, value_buf_,
null_bitmap_, null_count_);
-}
+ }
-TEST_F(TestStringArray, TestGetString) {
- for (size_t i = 0; i < expected_.size(); ++i) {
- if (valid_bytes_[i] == 0) {
- ASSERT_TRUE(strings_->IsNull(i));
+ void _TestArrayBasics() {
+ ASSERT_EQ(length_, strings_->length());
+ ASSERT_EQ(1, strings_->null_count());
+ ASSERT_OK(ValidateArray(*strings_));
+ TestInitialized(*strings_);
+ AssertZeroPadded(*strings_);
+ }
+
+ void _TestType() {
+ std::shared_ptr<DataType> type = this->strings_->type();
+
+ if (std::is_same<TypeClass, StringType>::value) {
+ ASSERT_EQ(Type::STRING, type->id());
+ ASSERT_EQ(Type::STRING, this->strings_->type_id());
+ } else if (std::is_same<TypeClass, LargeStringType>::value) {
+ ASSERT_EQ(Type::LARGE_STRING, type->id());
+ ASSERT_EQ(Type::LARGE_STRING, this->strings_->type_id());
+ } else if (std::is_same<TypeClass, BinaryType>::value) {
+ ASSERT_EQ(Type::BINARY, type->id());
+ ASSERT_EQ(Type::BINARY, this->strings_->type_id());
+ } else if (std::is_same<TypeClass, LargeBinaryType>::value) {
+ ASSERT_EQ(Type::LARGE_BINARY, type->id());
+ ASSERT_EQ(Type::LARGE_BINARY, this->strings_->type_id());
} else {
- ASSERT_EQ(expected_[i], strings_->GetString(i));
+ FAIL();
}
}
-}
-
-TEST_F(TestStringArray, TestEmptyStringComparison) {
- offsets_ = {0, 0, 0, 0, 0, 0};
- offsets_buf_ = Buffer::Wrap(offsets_);
- length_ = static_cast<int64_t>(offsets_.size() - 1);
-
- auto strings_a = std::make_shared<StringArray>(length_, offsets_buf_, nullptr,
- null_bitmap_, null_count_);
- auto strings_b = std::make_shared<StringArray>(length_, offsets_buf_, nullptr,
- null_bitmap_, null_count_);
- ASSERT_TRUE(strings_a->Equals(strings_b));
-}
-
-TEST_F(TestStringArray, CompareNullByteSlots) {
- StringBuilder builder;
- StringBuilder builder2;
- StringBuilder builder3;
-
- ASSERT_OK(builder.Append("foo"));
- ASSERT_OK(builder2.Append("foo"));
- ASSERT_OK(builder3.Append("foo"));
-
- ASSERT_OK(builder.Append("bar"));
- ASSERT_OK(builder2.AppendNull());
-
- // same length, but different
- ASSERT_OK(builder3.Append("xyz"));
-
- ASSERT_OK(builder.Append("baz"));
- ASSERT_OK(builder2.Append("baz"));
- ASSERT_OK(builder3.Append("baz"));
-
- std::shared_ptr<Array> array, array2, array3;
- FinishAndCheckPadding(&builder, &array);
- ASSERT_OK(builder2.Finish(&array2));
- ASSERT_OK(builder3.Finish(&array3));
-
- const auto& a1 = checked_cast<const StringArray&>(*array);
- const auto& a2 = checked_cast<const StringArray&>(*array2);
- const auto& a3 = checked_cast<const StringArray&>(*array3);
-
- // The validity bitmaps are the same, the data is different, but the unequal
- // portion is masked out
- StringArray equal_array(3, a1.value_offsets(), a1.value_data(), a2.null_bitmap(), 1);
- StringArray equal_array2(3, a3.value_offsets(), a3.value_data(), a2.null_bitmap(), 1);
- ASSERT_TRUE(equal_array.Equals(equal_array2));
- ASSERT_TRUE(a2.RangeEquals(equal_array2, 0, 3, 0));
-
- ASSERT_TRUE(equal_array.Array::Slice(1)->Equals(equal_array2.Array::Slice(1)));
- ASSERT_TRUE(
- equal_array.Array::Slice(1)->RangeEquals(0, 2, 0, equal_array2.Array::Slice(1)));
-}
-
-TEST_F(TestStringArray, TestSliceGetString) {
- StringBuilder builder;
-
- ASSERT_OK(builder.Append("a"));
- ASSERT_OK(builder.Append("b"));
- ASSERT_OK(builder.Append("c"));
-
- std::shared_ptr<Array> array;
- ASSERT_OK(builder.Finish(&array));
- auto s = array->Slice(1, 10);
- auto arr = std::dynamic_pointer_cast<StringArray>(s);
- ASSERT_EQ(arr->GetString(0), "b");
-}
-
-// ----------------------------------------------------------------------
-// String builder tests
-
-class TestStringBuilder : public TestBuilder {
- public:
- void SetUp() {
- TestBuilder::SetUp();
- builder_.reset(new StringBuilder(pool_));
+ void _TestListFunctions() {
+ int64_t pos = 0;
+ for (size_t i = 0; i < expected_.size(); ++i) {
+ ASSERT_EQ(pos, strings_->value_offset(i));
+ ASSERT_EQ(expected_[i].size(), strings_->value_length(i));
+ pos += expected_[i].size();
+ }
}
- void Done() {
- std::shared_ptr<Array> out;
- FinishAndCheckPadding(builder_.get(), &out);
-
- result_ = std::dynamic_pointer_cast<StringArray>(out);
- ASSERT_OK(ValidateArray(*result_));
+ void _TestDestructor() {
+ auto arr = std::make_shared<ArrayType>(length_, offsets_buf_, value_buf_,
+ null_bitmap_, null_count_);
}
- protected:
- std::unique_ptr<StringBuilder> builder_;
- std::shared_ptr<StringArray> result_;
-};
-
-TEST_F(TestStringBuilder, TestScalarAppend) {
- std::vector<std::string> strings = {"", "bb", "a", "", "ccc"};
- std::vector<uint8_t> is_valid = {1, 1, 1, 0, 1};
-
- int N = static_cast<int>(strings.size());
- int reps = 1000;
-
- for (int j = 0; j < reps; ++j) {
- for (int i = 0; i < N; ++i) {
- if (!is_valid[i]) {
- ASSERT_OK(builder_->AppendNull());
+ void _TestGetString() {
+ for (size_t i = 0; i < expected_.size(); ++i) {
+ if (valid_bytes_[i] == 0) {
+ ASSERT_TRUE(strings_->IsNull(i));
} else {
- ASSERT_OK(builder_->Append(strings[i]));
+ ASSERT_FALSE(strings_->IsNull(i));
+ ASSERT_EQ(expected_[i], strings_->GetString(i));
}
}
}
- Done();
-
- ASSERT_EQ(reps * N, result_->length());
- ASSERT_EQ(reps, result_->null_count());
- ASSERT_EQ(reps * 6, result_->value_data()->size());
- CheckStringArray(*result_, strings, is_valid, reps);
-}
-
-TEST_F(TestStringBuilder, TestAppendVector) {
- std::vector<std::string> strings = {"", "bb", "a", "", "ccc"};
- std::vector<uint8_t> valid_bytes = {1, 1, 1, 0, 1};
-
- int N = static_cast<int>(strings.size());
- int reps = 1000;
-
- for (int j = 0; j < reps; ++j) {
- ASSERT_OK(builder_->AppendValues(strings, valid_bytes.data()));
- }
- Done();
-
- ASSERT_EQ(reps * N, result_->length());
- ASSERT_EQ(reps, result_->null_count());
- ASSERT_EQ(reps * 6, result_->value_data()->size());
-
- CheckStringArray(*result_, strings, valid_bytes, reps);
-}
-
-TEST_F(TestStringBuilder, TestAppendCStringsWithValidBytes) {
- const char* strings[] = {nullptr, "aaa", nullptr, "ignored", ""};
- std::vector<uint8_t> valid_bytes = {1, 1, 1, 0, 1};
-
- int N = static_cast<int>(sizeof(strings) / sizeof(strings[0]));
- int reps = 1000;
+ void _TestEmptyStringComparison() {
+ offsets_ = {0, 0, 0, 0, 0, 0};
+ offsets_buf_ = Buffer::Wrap(offsets_);
+ length_ = static_cast<int64_t>(offsets_.size() - 1);
- for (int j = 0; j < reps; ++j) {
- ASSERT_OK(builder_->AppendValues(strings, N, valid_bytes.data()));
+ auto strings_a = std::make_shared<ArrayType>(length_, offsets_buf_, nullptr,
+ null_bitmap_, null_count_);
+ auto strings_b = std::make_shared<ArrayType>(length_, offsets_buf_, nullptr,
+ null_bitmap_, null_count_);
+ ASSERT_TRUE(strings_a->Equals(strings_b));
}
- Done();
- ASSERT_EQ(reps * N, result_->length());
- ASSERT_EQ(reps * 3, result_->null_count());
- ASSERT_EQ(reps * 3, result_->value_data()->size());
+ void _TestCompareNullByteSlots() {
+ BuilderType builder;
+ BuilderType builder2;
+ BuilderType builder3;
- CheckStringArray(*result_, {"", "aaa", "", "", ""}, {0, 1, 0, 0, 1}, reps);
-}
+ ASSERT_OK(builder.Append("foo"));
+ ASSERT_OK(builder2.Append("foo"));
+ ASSERT_OK(builder3.Append("foo"));
-TEST_F(TestStringBuilder, TestAppendCStringsWithoutValidBytes) {
- const char* strings[] = {"", "bb", "a", nullptr, "ccc"};
+ ASSERT_OK(builder.Append("bar"));
+ ASSERT_OK(builder2.AppendNull());
- int N = static_cast<int>(sizeof(strings) / sizeof(strings[0]));
- int reps = 1000;
+ // same length, but different
+ ASSERT_OK(builder3.Append("xyz"));
- for (int j = 0; j < reps; ++j) {
- ASSERT_OK(builder_->AppendValues(strings, N));
- }
- Done();
+ ASSERT_OK(builder.Append("baz"));
+ ASSERT_OK(builder2.Append("baz"));
+ ASSERT_OK(builder3.Append("baz"));
- ASSERT_EQ(reps * N, result_->length());
- ASSERT_EQ(reps, result_->null_count());
- ASSERT_EQ(reps * 6, result_->value_data()->size());
+ std::shared_ptr<Array> array, array2, array3;
+ FinishAndCheckPadding(&builder, &array);
+ ASSERT_OK(builder2.Finish(&array2));
+ ASSERT_OK(builder3.Finish(&array3));
- CheckStringArray(*result_, {"", "bb", "a", "", "ccc"}, {1, 1, 1, 0, 1}, reps);
-}
+ const auto& a1 = checked_cast<const ArrayType&>(*array);
+ const auto& a2 = checked_cast<const ArrayType&>(*array2);
+ const auto& a3 = checked_cast<const ArrayType&>(*array3);
-TEST_F(TestStringBuilder, TestZeroLength) {
- // All buffers are null
- Done();
-}
+ // The validity bitmaps are the same, the data is different, but the unequal
+ // portion is masked out
+ ArrayType equal_array(3, a1.value_offsets(), a1.value_data(), a2.null_bitmap(), 1);
+ ArrayType equal_array2(3, a3.value_offsets(), a3.value_data(), a2.null_bitmap(), 1);
-// Binary container type
-// TODO(emkornfield) there should be some way to refactor these to avoid code duplicating
-// with String
-class TestBinaryArray : public ::testing::Test {
- public:
- void SetUp() {
- chars_ = {'a', 'b', 'b', 'c', 'c', 'c'};
- offsets_ = {0, 1, 1, 1, 3, 6};
- valid_bytes_ = {1, 1, 0, 1, 1};
- expected_ = {"a", "", "", "bb", "ccc"};
+ ASSERT_TRUE(equal_array.Equals(equal_array2));
+ ASSERT_TRUE(a2.RangeEquals(equal_array2, 0, 3, 0));
- MakeArray();
+ ASSERT_TRUE(equal_array.Array::Slice(1)->Equals(equal_array2.Array::Slice(1)));
+ ASSERT_TRUE(
+ equal_array.Array::Slice(1)->RangeEquals(0, 2, 0, equal_array2.Array::Slice(1)));
}
- void MakeArray() {
- length_ = static_cast<int64_t>(offsets_.size() - 1);
- value_buf_ = Buffer::Wrap(chars_);
- offsets_buf_ = Buffer::Wrap(offsets_);
+ void _TestSliceGetString() {
+ BuilderType builder;
- ASSERT_OK(BitUtil::BytesToBits(valid_bytes_, default_memory_pool(), &null_bitmap_));
- null_count_ = CountNulls(valid_bytes_);
+ ASSERT_OK(builder.Append("a"));
+ ASSERT_OK(builder.Append("b"));
+ ASSERT_OK(builder.Append("c"));
- strings_ = std::make_shared<BinaryArray>(length_, offsets_buf_, value_buf_,
- null_bitmap_, null_count_);
+ std::shared_ptr<Array> array;
+ ASSERT_OK(builder.Finish(&array));
+ auto s = array->Slice(1, 10);
+ auto arr = std::dynamic_pointer_cast<ArrayType>(s);
+ ASSERT_EQ(arr->GetString(0), "b");
}
protected:
- std::vector<int32_t> offsets_;
+ std::vector<offset_type> offsets_;
std::vector<char> chars_;
std::vector<uint8_t> valid_bytes_;
@@ -359,300 +232,161 @@ class TestBinaryArray : public ::testing::Test {
int64_t null_count_;
int64_t length_;
- std::shared_ptr<BinaryArray> strings_;
+ std::shared_ptr<ArrayType> strings_;
};
-TEST_F(TestBinaryArray, TestArrayBasics) {
- ASSERT_EQ(length_, strings_->length());
- ASSERT_EQ(1, strings_->null_count());
- ASSERT_OK(ValidateArray(*strings_));
-}
+TYPED_TEST_CASE(TestStringArray, StringTypes);
-TEST_F(TestBinaryArray, TestType) {
- std::shared_ptr<DataType> type = strings_->type();
+TYPED_TEST(TestStringArray, TestArrayBasics) { this->_TestArrayBasics(); }
- ASSERT_EQ(Type::BINARY, type->id());
- ASSERT_EQ(Type::BINARY, strings_->type_id());
-}
+TYPED_TEST(TestStringArray, TestType) { this->_TestType(); }
-TEST_F(TestBinaryArray, TestListFunctions) {
- size_t pos = 0;
- for (size_t i = 0; i < expected_.size(); ++i) {
- ASSERT_EQ(pos, strings_->value_offset(i));
- ASSERT_EQ(static_cast<int>(expected_[i].size()), strings_->value_length(i));
- pos += expected_[i].size();
- }
-}
+TYPED_TEST(TestStringArray, TestListFunctions) { this->_TestListFunctions(); }
-TEST_F(TestBinaryArray, TestDestructor) {
- auto arr = std::make_shared<BinaryArray>(length_, offsets_buf_, value_buf_,
- null_bitmap_, null_count_);
-}
+TYPED_TEST(TestStringArray, TestDestructor) { this->_TestDestructor(); }
-TEST_F(TestBinaryArray, TestGetValue) {
- for (size_t i = 0; i < expected_.size(); ++i) {
- if (valid_bytes_[i] == 0) {
- ASSERT_TRUE(strings_->IsNull(i));
- } else {
- ASSERT_FALSE(strings_->IsNull(i));
- ASSERT_EQ(strings_->GetString(i), expected_[i]);
- }
- }
-}
+TYPED_TEST(TestStringArray, TestGetString) { this->_TestGetString(); }
-TEST_F(TestBinaryArray, TestNullValuesInitialized) {
- for (size_t i = 0; i < expected_.size(); ++i) {
- if (valid_bytes_[i] == 0) {
- ASSERT_TRUE(strings_->IsNull(i));
- } else {
- ASSERT_FALSE(strings_->IsNull(i));
- ASSERT_EQ(strings_->GetString(i), expected_[i]);
- }
- }
- TestInitialized(*strings_);
+TYPED_TEST(TestStringArray, TestEmptyStringComparison) {
+ this->_TestEmptyStringComparison();
}
-TEST_F(TestBinaryArray, TestPaddingZeroed) { AssertZeroPadded(*strings_); }
+TYPED_TEST(TestStringArray, CompareNullByteSlots) { this->_TestCompareNullByteSlots(); }
-TEST_F(TestBinaryArray, TestGetString) {
- for (size_t i = 0; i < expected_.size(); ++i) {
- if (valid_bytes_[i] == 0) {
- ASSERT_TRUE(strings_->IsNull(i));
- } else {
- std::string val = strings_->GetString(i);
- ASSERT_EQ(0, std::memcmp(expected_[i].data(), val.c_str(), val.size()));
- }
- }
-}
+TYPED_TEST(TestStringArray, TestSliceGetString) { this->_TestSliceGetString(); }
-TEST_F(TestBinaryArray, TestEqualsEmptyStrings) {
- BinaryBuilder builder;
-
- std::string empty_string("");
- for (int i = 0; i < 5; ++i) {
- ASSERT_OK(builder.Append(empty_string));
- }
-
- std::shared_ptr<Array> left_arr;
- FinishAndCheckPadding(&builder, &left_arr);
-
- const BinaryArray& left = checked_cast<const BinaryArray&>(*left_arr);
- std::shared_ptr<Array> right =
- std::make_shared<BinaryArray>(left.length(), left.value_offsets(), nullptr,
- left.null_bitmap(), left.null_count());
-
- ASSERT_TRUE(left.Equals(right));
- ASSERT_TRUE(left.RangeEquals(0, left.length(), 0, right));
-}
+// ----------------------------------------------------------------------
+// String builder tests
-class TestBinaryBuilder : public TestBuilder {
+template <typename T>
+class TestStringBuilder : public TestBuilder {
public:
+ using TypeClass = T;
+ using offset_type = typename TypeClass::offset_type;
+ using ArrayType = typename TypeTraits<TypeClass>::ArrayType;
+ using BuilderType = typename TypeTraits<TypeClass>::BuilderType;
+
void SetUp() {
TestBuilder::SetUp();
- builder_.reset(new BinaryBuilder(pool_));
+ builder_.reset(new BuilderType(pool_));
}
void Done() {
std::shared_ptr<Array> out;
FinishAndCheckPadding(builder_.get(), &out);
- result_ = std::dynamic_pointer_cast<BinaryArray>(out);
+ result_ = std::dynamic_pointer_cast<ArrayType>(out);
ASSERT_OK(ValidateArray(*result_));
}
- protected:
- std::unique_ptr<BinaryBuilder> builder_;
- std::shared_ptr<BinaryArray> result_;
-};
-
-TEST_F(TestBinaryBuilder, TestScalarAppend) {
- std::vector<std::string> strings = {"", "bb", "a", "", "ccc"};
- std::vector<uint8_t> is_valid = {1, 1, 1, 0, 1};
+ void _TestScalarAppend() {
+ std::vector<std::string> strings = {"", "bb", "a", "", "ccc"};
+ std::vector<uint8_t> is_valid = {1, 1, 1, 0, 1};
- int N = static_cast<int>(strings.size());
- int reps = 10;
+ int N = static_cast<int>(strings.size());
+ int reps = 1000;
- for (int j = 0; j < reps; ++j) {
- for (int i = 0; i < N; ++i) {
- if (!is_valid[i]) {
- ASSERT_OK(builder_->AppendNull());
- } else {
- ASSERT_OK(builder_->Append(strings[i]));
+ for (int j = 0; j < reps; ++j) {
+ for (int i = 0; i < N; ++i) {
+ if (!is_valid[i]) {
+ ASSERT_OK(builder_->AppendNull());
+ } else {
+ ASSERT_OK(builder_->Append(strings[i]));
+ }
}
}
- }
- Done();
- ASSERT_OK(ValidateArray(*result_));
- ASSERT_EQ(reps * N, result_->length());
- ASSERT_EQ(reps, result_->null_count());
- ASSERT_EQ(reps * 6, result_->value_data()->size());
-
- CheckStringArray(*result_, strings, is_valid, reps);
-}
-
-TEST_F(TestBinaryBuilder, TestAppendNulls) {
- ASSERT_OK(builder_->Append("bow"));
- ASSERT_OK(builder_->AppendNulls(3));
- ASSERT_OK(builder_->Append("arrow"));
- Done();
- ASSERT_OK(ValidateArray(*result_));
-
- ASSERT_EQ(5, result_->length());
- ASSERT_EQ(3, result_->null_count());
- ASSERT_EQ(8, result_->value_data()->size());
-
- CheckStringArray(*result_, {"bow", "", "", "", "arrow"}, {1, 0, 0, 0, 1});
-}
+ Done();
-TEST_F(TestBinaryBuilder, TestScalarAppendUnsafe) {
- std::vector<std::string> strings = {"", "bb", "a", "", "ccc"};
- std::vector<uint8_t> is_valid = {1, 1, 1, 0, 1};
+ ASSERT_EQ(reps * N, result_->length());
+ ASSERT_EQ(reps, result_->null_count());
+ ASSERT_EQ(reps * 6, result_->value_data()->size());
- int N = static_cast<int>(strings.size());
- int reps = 13;
- int total_length = 0;
- for (auto&& s : strings) total_length += static_cast<int>(s.size());
-
- ASSERT_OK(builder_->Reserve(N * reps));
- ASSERT_OK(builder_->ReserveData(total_length * reps));
-
- for (int j = 0; j < reps; ++j) {
- for (int i = 0; i < N; ++i) {
- if (!is_valid[i]) {
- builder_->UnsafeAppendNull();
- } else {
- builder_->UnsafeAppend(strings[i]);
- }
- }
+ CheckStringArray(*result_, strings, is_valid, reps);
}
- ASSERT_EQ(builder_->value_data_length(), total_length * reps);
- Done();
- ASSERT_OK(ValidateArray(*result_));
- ASSERT_EQ(reps * N, result_->length());
- ASSERT_EQ(reps, result_->null_count());
- ASSERT_EQ(reps * total_length, result_->value_data()->size());
-
- CheckStringArray(*result_, strings, is_valid, reps);
-}
-
-TEST_F(TestBinaryBuilder, TestCapacityReserve) {
- std::vector<std::string> strings = {"aaaaa", "bbbbbbbbbb", "ccccccccccccccc",
- "dddddddddd"};
- int N = static_cast<int>(strings.size());
- int reps = 15;
- int64_t length = 0;
- int64_t capacity = 1000;
- int64_t expected_capacity = BitUtil::RoundUpToMultipleOf64(capacity);
-
- ASSERT_OK(builder_->ReserveData(capacity));
- ASSERT_EQ(length, builder_->value_data_length());
- ASSERT_EQ(expected_capacity, builder_->value_data_capacity());
+ void _TestVectorAppend() {
+ std::vector<std::string> strings = {"", "bb", "a", "", "ccc"};
+ std::vector<uint8_t> valid_bytes = {1, 1, 1, 0, 1};
- for (int j = 0; j < reps; ++j) {
- for (int i = 0; i < N; ++i) {
- ASSERT_OK(builder_->Append(strings[i]));
- length += static_cast<int>(strings[i].size());
+ int N = static_cast<int>(strings.size());
+ int reps = 1000;
- ASSERT_EQ(length, builder_->value_data_length());
- ASSERT_EQ(expected_capacity, builder_->value_data_capacity());
+ for (int j = 0; j < reps; ++j) {
+ ASSERT_OK(builder_->AppendValues(strings, valid_bytes.data()));
}
- }
-
- int extra_capacity = 500;
- expected_capacity = BitUtil::RoundUpToMultipleOf64(length + extra_capacity);
+ Done();
- ASSERT_OK(builder_->ReserveData(extra_capacity));
+ ASSERT_EQ(reps * N, result_->length());
+ ASSERT_EQ(reps, result_->null_count());
+ ASSERT_EQ(reps * 6, result_->value_data()->size());
- ASSERT_EQ(length, builder_->value_data_length());
- int64_t actual_capacity = builder_->value_data_capacity();
- ASSERT_GE(actual_capacity, expected_capacity);
- ASSERT_EQ(actual_capacity & 63, 0);
-
- Done();
-
- ASSERT_EQ(reps * N, result_->length());
- ASSERT_EQ(0, result_->null_count());
- ASSERT_EQ(reps * 40, result_->value_data()->size());
+ CheckStringArray(*result_, strings, valid_bytes, reps);
+ }
- // Capacity is shrunk after `Finish`
- ASSERT_EQ(640, result_->value_data()->capacity());
-}
+ void _TestAppendCStringsWithValidBytes() {
+ const char* strings[] = {nullptr, "aaa", nullptr, "ignored", ""};
+ std::vector<uint8_t> valid_bytes = {1, 1, 1, 0, 1};
-TEST_F(TestBinaryBuilder, TestZeroLength) {
- // All buffers are null
- Done();
-}
+ int N = static_cast<int>(sizeof(strings) / sizeof(strings[0]));
+ int reps = 1000;
-// ----------------------------------------------------------------------
-// Slice tests
+ for (int j = 0; j < reps; ++j) {
+ ASSERT_OK(builder_->AppendValues(strings, N, valid_bytes.data()));
+ }
+ Done();
-template <typename TYPE>
-void CheckSliceEquality() {
- using Traits = TypeTraits<TYPE>;
- using BuilderType = typename Traits::BuilderType;
+ ASSERT_EQ(reps * N, result_->length());
+ ASSERT_EQ(reps * 3, result_->null_count());
+ ASSERT_EQ(reps * 3, result_->value_data()->size());
- BuilderType builder;
+ CheckStringArray(*result_, {"", "aaa", "", "", ""}, {0, 1, 0, 0, 1}, reps);
+ }
- std::vector<std::string> strings = {"foo", "", "bar", "baz", "qux", ""};
- std::vector<uint8_t> is_null = {0, 1, 0, 1, 0, 0};
+ void _TestAppendCStringsWithoutValidBytes() {
+ const char* strings[] = {"", "bb", "a", nullptr, "ccc"};
- int N = static_cast<int>(strings.size());
- int reps = 10;
+ int N = static_cast<int>(sizeof(strings) / sizeof(strings[0]));
+ int reps = 1000;
- for (int j = 0; j < reps; ++j) {
- for (int i = 0; i < N; ++i) {
- if (is_null[i]) {
- ASSERT_OK(builder.AppendNull());
- } else {
- ASSERT_OK(builder.Append(strings[i]));
- }
+ for (int j = 0; j < reps; ++j) {
+ ASSERT_OK(builder_->AppendValues(strings, N));
}
- }
+ Done();
- std::shared_ptr<Array> array;
- FinishAndCheckPadding(&builder, &array);
+ ASSERT_EQ(reps * N, result_->length());
+ ASSERT_EQ(reps, result_->null_count());
+ ASSERT_EQ(reps * 6, result_->value_data()->size());
- std::shared_ptr<Array> slice, slice2;
-
- slice = array->Slice(5);
- slice2 = array->Slice(5);
- ASSERT_EQ(N * reps - 5, slice->length());
-
- ASSERT_TRUE(slice->Equals(slice2));
- ASSERT_TRUE(array->RangeEquals(5, slice->length(), 0, slice));
+ CheckStringArray(*result_, {"", "bb", "a", "", "ccc"}, {1, 1, 1, 0, 1}, reps);
+ }
- // Chained slices
- slice2 = array->Slice(2)->Slice(3);
- ASSERT_TRUE(slice->Equals(slice2));
+ void _TestZeroLength() {
+ // All buffers are null
+ Done();
+ ASSERT_EQ(result_->length(), 0);
+ ASSERT_EQ(result_->null_count(), 0);
+ }
- slice = array->Slice(5, 20);
- slice2 = array->Slice(5, 20);
- ASSERT_EQ(20, slice->length());
+ protected:
+ std::unique_ptr<BuilderType> builder_;
+ std::shared_ptr<ArrayType> result_;
+};
- ASSERT_TRUE(slice->Equals(slice2));
- ASSERT_TRUE(array->RangeEquals(5, 25, 0, slice));
+TYPED_TEST_CASE(TestStringBuilder, StringTypes);
- ASSERT_OK(builder.Append("a"));
- for (int j = 0; j < reps; ++j) {
- ASSERT_OK(builder.Append(""));
- }
- FinishAndCheckPadding(&builder, &array);
- slice = array->Slice(1);
+TYPED_TEST(TestStringBuilder, TestScalarAppend) { this->_TestScalarAppend(); }
- for (int j = 0; j < reps; ++j) {
- ASSERT_OK(builder.Append(""));
- }
- FinishAndCheckPadding(&builder, &array);
+TYPED_TEST(TestStringBuilder, TestVectorAppend) { this->_TestVectorAppend(); }
- AssertArraysEqual(*slice, *array);
+TYPED_TEST(TestStringBuilder, TestAppendCStringsWithValidBytes) {
+ this->_TestAppendCStringsWithValidBytes();
}
-TEST_F(TestBinaryArray, TestSliceEquality) { CheckSliceEquality<BinaryType>(); }
-
-TEST_F(TestStringArray, TestSliceEquality) { CheckSliceEquality<BinaryType>(); }
+TYPED_TEST(TestStringBuilder, TestAppendCStringsWithoutValidBytes) {
+ this->_TestAppendCStringsWithoutValidBytes();
+}
-TEST_F(TestBinaryArray, LengthZeroCtor) { BinaryArray array(0, nullptr, nullptr); }
+TYPED_TEST(TestStringBuilder, TestZeroLength) { this->_TestZeroLength(); }
// ----------------------------------------------------------------------
// ChunkedBinaryBuilder tests
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index 5f76f0839..0b7d8f170 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -386,31 +386,26 @@ BinaryArray::BinaryArray(const std::shared_ptr<ArrayData>& data) {
SetData(data);
}
-void BinaryArray::SetData(const std::shared_ptr<ArrayData>& data) {
- ARROW_CHECK_EQ(data->buffers.size(), 3);
- auto value_offsets = data->buffers[1];
- auto value_data = data->buffers[2];
- this->Array::SetData(data);
- raw_data_ = value_data == nullptr ? nullptr : value_data->data();
- raw_value_offsets_ = value_offsets == nullptr
- ? nullptr
- : reinterpret_cast<const int32_t*>(value_offsets->data());
-}
-
BinaryArray::BinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
- const std::shared_ptr<Buffer>& data,
- const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count,
- int64_t offset)
- : BinaryArray(binary(), length, value_offsets, data, null_bitmap, null_count,
- offset) {}
-
-BinaryArray::BinaryArray(const std::shared_ptr<DataType>& type, int64_t length,
- const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count,
int64_t offset) {
- SetData(ArrayData::Make(type, length, {null_bitmap, value_offsets, data}, null_count,
- offset));
+ SetData(ArrayData::Make(binary(), length, {null_bitmap, value_offsets, data},
+ null_count, offset));
+}
+
+LargeBinaryArray::LargeBinaryArray(const std::shared_ptr<ArrayData>& data) {
+ ARROW_CHECK_EQ(data->type->id(), Type::LARGE_BINARY);
+ SetData(data);
+}
+
+LargeBinaryArray::LargeBinaryArray(int64_t length,
+ const std::shared_ptr<Buffer>& value_offsets,
+ const std::shared_ptr<Buffer>& data,
+ const std::shared_ptr<Buffer>& null_bitmap,
+ int64_t null_count, int64_t offset) {
+ SetData(ArrayData::Make(large_binary(), length, {null_bitmap, value_offsets, data},
+ null_count, offset));
}
StringArray::StringArray(const std::shared_ptr<ArrayData>& data) {
@@ -421,8 +416,24 @@ StringArray::StringArray(const std::shared_ptr<ArrayData>& data) {
StringArray::StringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count,
- int64_t offset)
- : BinaryArray(utf8(), length, value_offsets, data, null_bitmap, null_count, offset) {}
+ int64_t offset) {
+ SetData(ArrayData::Make(utf8(), length, {null_bitmap, value_offsets, data}, null_count,
+ offset));
+}
+
+LargeStringArray::LargeStringArray(const std::shared_ptr<ArrayData>& data) {
+ ARROW_CHECK_EQ(data->type->id(), Type::LARGE_STRING);
+ SetData(data);
+}
+
+LargeStringArray::LargeStringArray(int64_t length,
+ const std::shared_ptr<Buffer>& value_offsets,
+ const std::shared_ptr<Buffer>& data,
+ const std::shared_ptr<Buffer>& null_bitmap,
+ int64_t null_count, int64_t offset) {
+ SetData(ArrayData::Make(large_utf8(), length, {null_bitmap, value_offsets, data},
+ null_count, offset));
+}
// ----------------------------------------------------------------------
// Fixed width binary
@@ -1148,20 +1159,14 @@ struct ValidateVisitor {
return ValidateOffsets(array);
}
- Status Visit(const ListArray& array) {
- if (array.length() < 0) {
- return Status::Invalid("Length was negative");
- }
-
- auto value_offsets = array.value_offsets();
- if (array.length() && !value_offsets) {
- return Status::Invalid("value_offsets_ was null");
- }
- if (value_offsets->size() / static_cast<int>(sizeof(int32_t)) < array.length()) {
- return Status::Invalid("offset buffer size (bytes): ", value_offsets->size(),
- " isn't large enough for length: ", array.length());
+ Status Visit(const LargeBinaryArray& array) {
+ if (array.data()->buffers.size() != 3) {
+ return Status::Invalid("number of buffers was != 3");
}
+ return ValidateOffsets(array);
+ }
+ Status Visit(const ListArray& array) {
if (!array.values()) {
return Status::Invalid("values was null");
}
@@ -1181,19 +1186,6 @@ struct ValidateVisitor {
}
Status Visit(const MapArray& array) {
- if (array.length() < 0) {
- return Status::Invalid("Length was negative");
- }
-
- auto value_offsets = array.value_offsets();
- if (array.length() && !value_offsets) {
- return Status::Invalid("value_offsets_ was null");
- }
- if (value_offsets->size() / static_cast<int>(sizeof(int32_t)) < array.length()) {
- return Status::Invalid("offset buffer size (bytes): ", value_offsets->size(),
- " isn't large enough for length: ", array.length());
- }
-
if (!array.keys()) {
return Status::Invalid("keys was null");
}
@@ -1224,9 +1216,6 @@ struct ValidateVisitor {
}
Status Visit(const FixedSizeListArray& array) {
- if (array.length() < 0) {
- return Status::Invalid("Length was negative");
- }
if (!array.values()) {
return Status::Invalid("values was null");
}
@@ -1240,14 +1229,6 @@ struct ValidateVisitor {
}
Status Visit(const StructArray& array) {
- if (array.length() < 0) {
- return Status::Invalid("Length was negative");
- }
-
- if (array.null_count() > array.length()) {
- return Status::Invalid("Null count exceeds the length of this struct");
- }
-
if (array.num_fields() > 0) {
// Validate fields
int64_t array_length = array.field(0)->length();
@@ -1274,16 +1255,7 @@ struct ValidateVisitor {
return Status::OK();
}
- Status Visit(const UnionArray& array) {
- if (array.length() < 0) {
- return Status::Invalid("Length was negative");
- }
-
- if (array.null_count() > array.length()) {
- return Status::Invalid("Null count exceeds the length of this struct");
- }
- return Status::OK();
- }
+ Status Visit(const UnionArray& array) { return Status::OK(); }
Status Visit(const DictionaryArray& array) {
Type::type index_type_id = array.indices()->type()->id();
@@ -1310,12 +1282,23 @@ struct ValidateVisitor {
protected:
template <typename ArrayType>
Status ValidateOffsets(ArrayType& array) {
- int32_t prev_offset = array.value_offset(0);
+ using offset_type = typename ArrayType::offset_type;
+
+ auto value_offsets = array.value_offsets();
+ if (array.length() && !value_offsets) {
+ return Status::Invalid("value_offsets_ was null");
+ }
+ if (value_offsets->size() / static_cast<int>(sizeof(offset_type)) < array.length()) {
+ return Status::Invalid("offset buffer size (bytes): ", value_offsets->size(),
+ " isn't large enough for length: ", array.length());
+ }
+
+ auto prev_offset = array.value_offset(0);
if (array.offset() == 0 && prev_offset != 0) {
return Status::Invalid("The first offset wasn't zero");
}
for (int64_t i = 1; i <= array.length(); ++i) {
- int32_t current_offset = array.value_offset(i);
+ auto current_offset = array.value_offset(i);
if (array.IsNull(i - 1) && current_offset != prev_offset) {
return Status::Invalid("Offset invariant failure at: ", i,
" inconsistent value_offsets for null slot",
@@ -1340,6 +1323,14 @@ Status ValidateArray(const Array& array) {
const auto layout = type.layout();
const ArrayData& data = *array.data();
+ if (array.length() < 0) {
+ return Status::Invalid("Array length is negative");
+ }
+
+ if (array.null_count() > array.length()) {
+ return Status::Invalid("Null count exceeds array length");
+ }
+
if (data.buffers.size() != layout.bit_widths.size()) {
return Status::Invalid("Expected ", layout.bit_widths.size(),
" buffers in array "
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index 599a6ea62..e13088c65 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -492,6 +492,7 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray {
class ARROW_EXPORT ListArray : public Array {
public:
using TypeClass = ListType;
+ using offset_type = ListType::offset_type;
explicit ListArray(const std::shared_ptr<ArrayData>& data);
@@ -635,24 +636,20 @@ class ARROW_EXPORT FixedSizeListArray : public Array {
// ----------------------------------------------------------------------
// Binary and String
-/// Concrete Array class for variable-size binary data
-class ARROW_EXPORT BinaryArray : public FlatArray {
+/// Base class for variable-sized binary arrays, regardless of offset size
+/// and logical interpretation.
+template <typename TYPE>
+class BaseBinaryArray : public FlatArray {
public:
- using TypeClass = BinaryType;
-
- explicit BinaryArray(const std::shared_ptr<ArrayData>& data);
-
- BinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
- const std::shared_ptr<Buffer>& data,
- const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
- int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+ using TypeClass = TYPE;
+ using offset_type = typename TypeClass::offset_type;
/// Return the pointer to the given elements bytes
// XXX should GetValue(int64_t i) return a string_view?
- const uint8_t* GetValue(int64_t i, int32_t* out_length) const {
+ const uint8_t* GetValue(int64_t i, offset_type* out_length) const {
// Account for base offset
i += data_->offset;
- const int32_t pos = raw_value_offsets_[i];
+ const offset_type pos = raw_value_offsets_[i];
*out_length = raw_value_offsets_[i + 1] - pos;
return raw_data_ + pos;
}
@@ -664,7 +661,7 @@ class ARROW_EXPORT BinaryArray : public FlatArray {
util::string_view GetView(int64_t i) const {
// Account for base offset
i += data_->offset;
- const int32_t pos = raw_value_offsets_[i];
+ const offset_type pos = raw_value_offsets_[i];
return util::string_view(reinterpret_cast<const char*>(raw_data_ + pos),
raw_value_offsets_[i + 1] - pos);
}
@@ -681,31 +678,52 @@ class ARROW_EXPORT BinaryArray : public FlatArray {
/// Note that this buffer does not account for any slice offset
std::shared_ptr<Buffer> value_data() const { return data_->buffers[2]; }
- const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; }
+ const offset_type* raw_value_offsets() const {
+ return raw_value_offsets_ + data_->offset;
+ }
// Neither of these functions will perform boundschecking
- int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; }
- int32_t value_length(int64_t i) const {
+ offset_type value_offset(int64_t i) const {
+ return raw_value_offsets_[i + data_->offset];
+ }
+ offset_type value_length(int64_t i) const {
i += data_->offset;
return raw_value_offsets_[i + 1] - raw_value_offsets_[i];
}
protected:
// For subclasses
- BinaryArray() : raw_value_offsets_(NULLPTR), raw_data_(NULLPTR) {}
+ BaseBinaryArray() : raw_value_offsets_(NULLPTR), raw_data_(NULLPTR) {}
- /// Protected method for constructors
- void SetData(const std::shared_ptr<ArrayData>& data);
+ // Protected method for constructors
+ void SetData(const std::shared_ptr<ArrayData>& data) {
+ auto value_offsets = data->buffers[1];
+ auto value_data = data->buffers[2];
+ this->Array::SetData(data);
+ raw_data_ = value_data == NULLPTR ? NULLPTR : value_data->data();
+ raw_value_offsets_ =
+ value_offsets == NULLPTR
+ ? NULLPTR
+ : reinterpret_cast<const offset_type*>(value_offsets->data());
+ }
- // Constructor to allow sub-classes/builders to substitute their own logical type
- BinaryArray(const std::shared_ptr<DataType>& type, int64_t length,
- const std::shared_ptr<Buffer>& value_offsets,
+ const offset_type* raw_value_offsets_;
+ const uint8_t* raw_data_;
+};
+
+/// Concrete Array class for variable-size binary data
+class ARROW_EXPORT BinaryArray : public BaseBinaryArray<BinaryType> {
+ public:
+ explicit BinaryArray(const std::shared_ptr<ArrayData>& data);
+
+ BinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
- const int32_t* raw_value_offsets_;
- const uint8_t* raw_data_;
+ protected:
+ // For subclasses such as StringArray
+ BinaryArray() : BaseBinaryArray() {}
};
/// Concrete Array class for variable-size string (utf-8) data
@@ -721,6 +739,34 @@ class ARROW_EXPORT StringArray : public BinaryArray {
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
};
+/// Concrete Array class for large variable-size binary data
+class ARROW_EXPORT LargeBinaryArray : public BaseBinaryArray<LargeBinaryType> {
+ public:
+ explicit LargeBinaryArray(const std::shared_ptr<ArrayData>& data);
+
+ LargeBinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
+ const std::shared_ptr<Buffer>& data,
+ const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+ int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+ protected:
+ // For subclasses such as LargeStringArray
+ LargeBinaryArray() : BaseBinaryArray() {}
+};
+
+/// Concrete Array class for large variable-size string (utf-8) data
+class ARROW_EXPORT LargeStringArray : public LargeBinaryArray {
+ public:
+ using TypeClass = LargeStringType;
+
+ explicit LargeStringArray(const std::shared_ptr<ArrayData>& data);
+
+ LargeStringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
+ const std::shared_ptr<Buffer>& data,
+ const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+ int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+};
+
// ----------------------------------------------------------------------
// Fixed width binary
diff --git a/cpp/src/arrow/array/builder_binary.cc b/cpp/src/arrow/array/builder_binary.cc
index 818ad1559..b83897d7e 100644
--- a/cpp/src/arrow/array/builder_binary.cc
+++ b/cpp/src/arrow/array/builder_binary.cc
@@ -43,173 +43,15 @@ using internal::checked_cast;
// ----------------------------------------------------------------------
// String and binary
-BinaryBuilder::BinaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
- : ArrayBuilder(type, pool), offsets_builder_(pool), value_data_builder_(pool) {}
-
-BinaryBuilder::BinaryBuilder(MemoryPool* pool) : BinaryBuilder(binary(), pool) {}
-
-Status BinaryBuilder::Resize(int64_t capacity) {
- if (capacity > kListMaximumElements) {
- return Status::CapacityError(
- "BinaryBuilder cannot reserve space for more then 2^31 - 1 child elements, got ",
- capacity);
- }
- RETURN_NOT_OK(CheckCapacity(capacity, capacity_));
-
- // one more then requested for offsets
- RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1));
- return ArrayBuilder::Resize(capacity);
-}
-
-Status BinaryBuilder::ReserveData(int64_t elements) {
- const int64_t size = value_data_length() + elements;
- ARROW_RETURN_IF(
- size > kBinaryMemoryLimit,
- Status::CapacityError("Cannot reserve capacity larger than 2^31 - 1 for binary"));
-
- return (size > value_data_capacity()) ? value_data_builder_.Reserve(elements)
- : Status::OK();
-}
-
-Status BinaryBuilder::AppendOverflow(int64_t num_bytes) {
- return Status::CapacityError("BinaryArray cannot contain more than ",
- kBinaryMemoryLimit, " bytes, have ", num_bytes);
-}
-
-Status BinaryBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
- // Write final offset (values length)
- RETURN_NOT_OK(AppendNextOffset());
-
- // These buffers' padding zeroed by BufferBuilder
- std::shared_ptr<Buffer> offsets, value_data, null_bitmap;
- RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
- RETURN_NOT_OK(value_data_builder_.Finish(&value_data));
- RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
-
- *out =
- ArrayData::Make(type_, length_, {null_bitmap, offsets, value_data}, null_count_, 0);
- Reset();
- return Status::OK();
-}
-
-void BinaryBuilder::Reset() {
- ArrayBuilder::Reset();
- offsets_builder_.Reset();
- value_data_builder_.Reset();
-}
-
-const uint8_t* BinaryBuilder::GetValue(int64_t i, int32_t* out_length) const {
- const int32_t* offsets = offsets_builder_.data();
- int32_t offset = offsets[i];
- if (i == (length_ - 1)) {
- *out_length = static_cast<int32_t>(value_data_builder_.length()) - offset;
- } else {
- *out_length = offsets[i + 1] - offset;
- }
- return value_data_builder_.data() + offset;
-}
-
-util::string_view BinaryBuilder::GetView(int64_t i) const {
- const int32_t* offsets = offsets_builder_.data();
- int32_t offset = offsets[i];
- int32_t value_length;
- if (i == (length_ - 1)) {
- value_length = static_cast<int32_t>(value_data_builder_.length()) - offset;
- } else {
- value_length = offsets[i + 1] - offset;
- }
- return util::string_view(
- reinterpret_cast<const char*>(value_data_builder_.data() + offset), value_length);
-}
+BinaryBuilder::BinaryBuilder(MemoryPool* pool) : BaseBinaryBuilder(binary(), pool) {}
StringBuilder::StringBuilder(MemoryPool* pool) : BinaryBuilder(utf8(), pool) {}
-Status StringBuilder::AppendValues(const std::vector<std::string>& values,
- const uint8_t* valid_bytes) {
- std::size_t total_length = std::accumulate(
- values.begin(), values.end(), 0ULL,
- [](uint64_t sum, const std::string& str) { return sum + str.size(); });
- RETURN_NOT_OK(Reserve(values.size()));
- RETURN_NOT_OK(value_data_builder_.Reserve(total_length));
- RETURN_NOT_OK(offsets_builder_.Reserve(values.size()));
-
- if (valid_bytes) {
- for (std::size_t i = 0; i < values.size(); ++i) {
- UnsafeAppendNextOffset();
- if (valid_bytes[i]) {
- value_data_builder_.UnsafeAppend(
- reinterpret_cast<const uint8_t*>(values[i].data()), values[i].size());
- }
- }
- } else {
- for (std::size_t i = 0; i < values.size(); ++i) {
- UnsafeAppendNextOffset();
- value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i].data()),
- values[i].size());
- }
- }
+LargeBinaryBuilder::LargeBinaryBuilder(MemoryPool* pool)
+ : BaseBinaryBuilder(large_binary(), pool) {}
- UnsafeAppendToBitmap(valid_bytes, values.size());
- return Status::OK();
-}
-
-Status StringBuilder::AppendValues(const char** values, int64_t length,
- const uint8_t* valid_bytes) {
- std::size_t total_length = 0;
- std::vector<std::size_t> value_lengths(length);
- bool have_null_value = false;
- for (int64_t i = 0; i < length; ++i) {
- if (values[i]) {
- auto value_length = strlen(values[i]);
- value_lengths[i] = value_length;
- total_length += value_length;
- } else {
- have_null_value = true;
- }
- }
- RETURN_NOT_OK(Reserve(length));
- RETURN_NOT_OK(value_data_builder_.Reserve(total_length));
- RETURN_NOT_OK(offsets_builder_.Reserve(length));
-
- if (valid_bytes) {
- int64_t valid_bytes_offset = 0;
- for (int64_t i = 0; i < length; ++i) {
- UnsafeAppendNextOffset();
- if (valid_bytes[i]) {
- if (values[i]) {
- value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
- value_lengths[i]);
- } else {
- UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, i - valid_bytes_offset);
- UnsafeAppendToBitmap(false);
- valid_bytes_offset = i + 1;
- }
- }
- }
- UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, length - valid_bytes_offset);
- } else {
- if (have_null_value) {
- std::vector<uint8_t> valid_vector(length, 0);
- for (int64_t i = 0; i < length; ++i) {
- UnsafeAppendNextOffset();
- if (values[i]) {
- value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
- value_lengths[i]);
- valid_vector[i] = 1;
- }
- }
- UnsafeAppendToBitmap(valid_vector.data(), length);
- } else {
- for (int64_t i = 0; i < length; ++i) {
- UnsafeAppendNextOffset();
- value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
- value_lengths[i]);
- }
- UnsafeAppendToBitmap(nullptr, length);
- }
- }
- return Status::OK();
-}
+LargeStringBuilder::LargeStringBuilder(MemoryPool* pool)
+ : LargeBinaryBuilder(large_utf8(), pool) {}
// ----------------------------------------------------------------------
// Fixed width binary
diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h
index 47d3bae4b..5bf4e747b 100644
--- a/cpp/src/arrow/array/builder_binary.h
+++ b/cpp/src/arrow/array/builder_binary.h
@@ -17,8 +17,11 @@
#pragma once
+#include <algorithm>
+#include <cstdint>
#include <limits>
#include <memory>
+#include <numeric>
#include <string>
#include <vector>
@@ -37,15 +40,16 @@ constexpr int64_t kBinaryMemoryLimit = std::numeric_limits<int32_t>::max() - 1;
// ----------------------------------------------------------------------
// Binary and String
-/// \class BinaryBuilder
-/// \brief Builder class for variable-length binary data
-class ARROW_EXPORT BinaryBuilder : public ArrayBuilder {
+template <typename TYPE>
+class BaseBinaryBuilder : public ArrayBuilder {
public:
- explicit BinaryBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT);
+ using TypeClass = TYPE;
+ using offset_type = typename TypeClass::offset_type;
- BinaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool);
+ BaseBinaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
+ : ArrayBuilder(type, pool), offsets_builder_(pool), value_data_builder_(pool) {}
- Status Append(const uint8_t* value, int32_t length) {
+ Status Append(const uint8_t* value, offset_type length) {
ARROW_RETURN_NOT_OK(Reserve(1));
ARROW_RETURN_NOT_OK(AppendNextOffset());
// Safety check for UBSAN.
@@ -57,14 +61,22 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder {
return Status::OK();
}
+ Status Append(const char* value, offset_type length) {
+ return Append(reinterpret_cast<const uint8_t*>(value), length);
+ }
+
+ Status Append(util::string_view value) {
+ return Append(value.data(), static_cast<offset_type>(value.size()));
+ }
+
Status AppendNulls(int64_t length) final {
const int64_t num_bytes = value_data_builder_.length();
- if (ARROW_PREDICT_FALSE(num_bytes > kBinaryMemoryLimit)) {
+ if (ARROW_PREDICT_FALSE(num_bytes > memory_limit())) {
return AppendOverflow(num_bytes);
}
ARROW_RETURN_NOT_OK(Reserve(length));
for (int64_t i = 0; i < length; ++i) {
- offsets_builder_.UnsafeAppend(static_cast<int32_t>(num_bytes));
+ offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
}
UnsafeAppendToBitmap(length, false);
return Status::OK();
@@ -77,56 +89,184 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder {
return Status::OK();
}
- Status Append(const char* value, int32_t length) {
- return Append(reinterpret_cast<const uint8_t*>(value), length);
- }
-
- Status Append(util::string_view value) {
- return Append(value.data(), static_cast<int32_t>(value.size()));
- }
-
/// \brief Append without checking capacity
///
/// Offsets and data should have been presized using Reserve() and
/// ReserveData(), respectively.
- void UnsafeAppend(const uint8_t* value, int32_t length) {
+ void UnsafeAppend(const uint8_t* value, offset_type length) {
UnsafeAppendNextOffset();
value_data_builder_.UnsafeAppend(value, length);
UnsafeAppendToBitmap(true);
}
- void UnsafeAppend(const char* value, int32_t length) {
+ void UnsafeAppend(const char* value, offset_type length) {
UnsafeAppend(reinterpret_cast<const uint8_t*>(value), length);
}
void UnsafeAppend(const std::string& value) {
- UnsafeAppend(value.c_str(), static_cast<int32_t>(value.size()));
+ UnsafeAppend(value.c_str(), static_cast<offset_type>(value.size()));
}
void UnsafeAppend(util::string_view value) {
- UnsafeAppend(value.data(), static_cast<int32_t>(value.size()));
+ UnsafeAppend(value.data(), static_cast<offset_type>(value.size()));
}
void UnsafeAppendNull() {
const int64_t num_bytes = value_data_builder_.length();
- offsets_builder_.UnsafeAppend(static_cast<int32_t>(num_bytes));
+ offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
UnsafeAppendToBitmap(false);
}
- void Reset() override;
- Status Resize(int64_t capacity) override;
+ /// \brief Append a sequence of strings in one shot.
+ ///
+ /// \param[in] values a vector of strings
+ /// \param[in] valid_bytes an optional sequence of bytes where non-zero
+ /// indicates a valid (non-null) value
+ /// \return Status
+ Status AppendValues(const std::vector<std::string>& values,
+ const uint8_t* valid_bytes = NULLPTR) {
+ std::size_t total_length = std::accumulate(
+ values.begin(), values.end(), 0ULL,
+ [](uint64_t sum, const std::string& str) { return sum + str.size(); });
+ ARROW_RETURN_NOT_OK(Reserve(values.size()));
+ ARROW_RETURN_NOT_OK(value_data_builder_.Reserve(total_length));
+ ARROW_RETURN_NOT_OK(offsets_builder_.Reserve(values.size()));
+
+ if (valid_bytes) {
+ for (std::size_t i = 0; i < values.size(); ++i) {
+ UnsafeAppendNextOffset();
+ if (valid_bytes[i]) {
+ value_data_builder_.UnsafeAppend(
+ reinterpret_cast<const uint8_t*>(values[i].data()), values[i].size());
+ }
+ }
+ } else {
+ for (std::size_t i = 0; i < values.size(); ++i) {
+ UnsafeAppendNextOffset();
+ value_data_builder_.UnsafeAppend(
+ reinterpret_cast<const uint8_t*>(values[i].data()), values[i].size());
+ }
+ }
+
+ UnsafeAppendToBitmap(valid_bytes, values.size());
+ return Status::OK();
+ }
+
+ /// \brief Append a sequence of nul-terminated strings in one shot.
+ /// If one of the values is NULL, it is processed as a null
+ /// value even if the corresponding valid_bytes entry is 1.
+ ///
+ /// \param[in] values a contiguous C array of nul-terminated char *
+ /// \param[in] length the number of values to append
+ /// \param[in] valid_bytes an optional sequence of bytes where non-zero
+ /// indicates a valid (non-null) value
+ /// \return Status
+ Status AppendValues(const char** values, int64_t length,
+ const uint8_t* valid_bytes = NULLPTR) {
+ std::size_t total_length = 0;
+ std::vector<std::size_t> value_lengths(length);
+ bool have_null_value = false;
+ for (int64_t i = 0; i < length; ++i) {
+ if (values[i]) {
+ auto value_length = strlen(values[i]);
+ value_lengths[i] = value_length;
+ total_length += value_length;
+ } else {
+ have_null_value = true;
+ }
+ }
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ ARROW_RETURN_NOT_OK(value_data_builder_.Reserve(total_length));
+ ARROW_RETURN_NOT_OK(offsets_builder_.Reserve(length));
+
+ if (valid_bytes) {
+ int64_t valid_bytes_offset = 0;
+ for (int64_t i = 0; i < length; ++i) {
+ UnsafeAppendNextOffset();
+ if (valid_bytes[i]) {
+ if (values[i]) {
+ value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
+ value_lengths[i]);
+ } else {
+ UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset,
+ i - valid_bytes_offset);
+ UnsafeAppendToBitmap(false);
+ valid_bytes_offset = i + 1;
+ }
+ }
+ }
+ UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, length - valid_bytes_offset);
+ } else {
+ if (have_null_value) {
+ std::vector<uint8_t> valid_vector(length, 0);
+ for (int64_t i = 0; i < length; ++i) {
+ UnsafeAppendNextOffset();
+ if (values[i]) {
+ value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
+ value_lengths[i]);
+ valid_vector[i] = 1;
+ }
+ }
+ UnsafeAppendToBitmap(valid_vector.data(), length);
+ } else {
+ for (int64_t i = 0; i < length; ++i) {
+ UnsafeAppendNextOffset();
+ value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
+ value_lengths[i]);
+ }
+ UnsafeAppendToBitmap(NULLPTR, length);
+ }
+ }
+ return Status::OK();
+ }
+
+ void Reset() override {
+ ArrayBuilder::Reset();
+ offsets_builder_.Reset();
+ value_data_builder_.Reset();
+ }
+
+ Status Resize(int64_t capacity) override {
+ if (capacity > kListMaximumElements) {
+ return Status::CapacityError(
+ "BinaryBuilder cannot reserve space for more than 2^31 - 1 child elements, "
+ "got ",
+ capacity);
+ }
+ ARROW_RETURN_NOT_OK(CheckCapacity(capacity, capacity_));
+
+ // One more than requested for offsets
+ ARROW_RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1));
+ return ArrayBuilder::Resize(capacity);
+ }
/// \brief Ensures there is enough allocated capacity to append the indicated
/// number of bytes to the value data buffer without additional allocations
- Status ReserveData(int64_t elements);
+ Status ReserveData(int64_t elements) {
+ const int64_t size = value_data_length() + elements;
+ ARROW_RETURN_IF(size > memory_limit(),
+ Status::CapacityError("Cannot reserve capacity larger than ",
+ memory_limit(), " bytes"));
+
+ return (size > value_data_capacity()) ? value_data_builder_.Reserve(elements)
+ : Status::OK();
+ }
- Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+ Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
+ // Write final offset (values length)
+ ARROW_RETURN_NOT_OK(AppendNextOffset());
- /// \cond FALSE
- using ArrayBuilder::Finish;
- /// \endcond
+ // These buffers' padding zeroed by BufferBuilder
+ std::shared_ptr<Buffer> offsets, value_data, null_bitmap;
+ ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
+ ARROW_RETURN_NOT_OK(value_data_builder_.Finish(&value_data));
+ ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
- Status Finish(std::shared_ptr<BinaryArray>* out) { return FinishTyped(out); }
+ *out = ArrayData::Make(type_, length_, {null_bitmap, offsets, value_data},
+ null_count_, 0);
+ Reset();
+ return Status::OK();
+ }
/// \return size of values buffer so far
int64_t value_data_length() const { return value_data_builder_.length(); }
@@ -136,33 +276,77 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder {
/// Temporary access to a value.
///
/// This pointer becomes invalid on the next modifying operation.
- const uint8_t* GetValue(int64_t i, int32_t* out_length) const;
+ const uint8_t* GetValue(int64_t i, offset_type* out_length) const {
+ const offset_type* offsets = offsets_builder_.data();
+ const auto offset = offsets[i];
+ if (i == (length_ - 1)) {
+ *out_length = static_cast<offset_type>(value_data_builder_.length()) - offset;
+ } else {
+ *out_length = offsets[i + 1] - offset;
+ }
+ return value_data_builder_.data() + offset;
+ }
/// Temporary access to a value.
///
/// This view becomes invalid on the next modifying operation.
- util::string_view GetView(int64_t i) const;
+ util::string_view GetView(int64_t i) const {
+ const offset_type* offsets = offsets_builder_.data();
+ const auto offset = offsets[i];
+ offset_type value_length;
+ if (i == (length_ - 1)) {
+ value_length = static_cast<offset_type>(value_data_builder_.length()) - offset;
+ } else {
+ value_length = offsets[i + 1] - offset;
+ }
+ return util::string_view(
+ reinterpret_cast<const char*>(value_data_builder_.data() + offset), value_length);
+ }
protected:
- TypedBufferBuilder<int32_t> offsets_builder_;
+ TypedBufferBuilder<offset_type> offsets_builder_;
TypedBufferBuilder<uint8_t> value_data_builder_;
- Status AppendOverflow(int64_t num_bytes);
+ Status AppendOverflow(int64_t num_bytes) {
+ return Status::CapacityError("array cannot contain more than ", memory_limit(),
+ " bytes, have ", num_bytes);
+ }
Status AppendNextOffset() {
const int64_t num_bytes = value_data_builder_.length();
- if (ARROW_PREDICT_FALSE(num_bytes > kBinaryMemoryLimit)) {
+ if (ARROW_PREDICT_FALSE(num_bytes > memory_limit())) {
return AppendOverflow(num_bytes);
}
- return offsets_builder_.Append(static_cast<int32_t>(num_bytes));
+ return offsets_builder_.Append(static_cast<offset_type>(num_bytes));
}
void UnsafeAppendNextOffset() {
const int64_t num_bytes = value_data_builder_.length();
- offsets_builder_.UnsafeAppend(static_cast<int32_t>(num_bytes));
+ offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
+ }
+
+ // Cannot make this a static attribute because of linking issues
+ static constexpr int64_t memory_limit() {
+ return std::numeric_limits<offset_type>::max() - 1;
}
};
+/// \class BinaryBuilder
+/// \brief Builder class for variable-length binary data
+class ARROW_EXPORT BinaryBuilder : public BaseBinaryBuilder<BinaryType> {
+ public:
+ explicit BinaryBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT);
+
+ /// \cond FALSE
+ using ArrayBuilder::Finish;
+ /// \endcond
+
+ Status Finish(std::shared_ptr<BinaryArray>* out) { return FinishTyped(out); }
+
+ protected:
+ using BaseBinaryBuilder::BaseBinaryBuilder;
+};
+
/// \class StringBuilder
/// \brief Builder class for UTF8 strings
class ARROW_EXPORT StringBuilder : public BinaryBuilder {
@@ -170,36 +354,41 @@ class ARROW_EXPORT StringBuilder : public BinaryBuilder {
using BinaryBuilder::BinaryBuilder;
explicit StringBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT);
- using BinaryBuilder::Append;
- using BinaryBuilder::Reset;
- using BinaryBuilder::UnsafeAppend;
+ /// \cond FALSE
+ using ArrayBuilder::Finish;
+ /// \endcond
- /// \brief Append a sequence of strings in one shot.
- ///
- /// \param[in] values a vector of strings
- /// \param[in] valid_bytes an optional sequence of bytes where non-zero
- /// indicates a valid (non-null) value
- /// \return Status
- Status AppendValues(const std::vector<std::string>& values,
- const uint8_t* valid_bytes = NULLPTR);
+ Status Finish(std::shared_ptr<StringArray>* out) { return FinishTyped(out); }
+};
- /// \brief Append a sequence of nul-terminated strings in one shot.
- /// If one of the values is NULL, it is processed as a null
- /// value even if the corresponding valid_bytes entry is 1.
- ///
- /// \param[in] values a contiguous C array of nul-terminated char *
- /// \param[in] length the number of values to append
- /// \param[in] valid_bytes an optional sequence of bytes where non-zero
- /// indicates a valid (non-null) value
- /// \return Status
- Status AppendValues(const char** values, int64_t length,
- const uint8_t* valid_bytes = NULLPTR);
+/// \class LargeBinaryBuilder
+/// \brief Builder class for large variable-length binary data
+class ARROW_EXPORT LargeBinaryBuilder : public BaseBinaryBuilder<LargeBinaryType> {
+ public:
+ explicit LargeBinaryBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT);
/// \cond FALSE
using ArrayBuilder::Finish;
/// \endcond
- Status Finish(std::shared_ptr<StringArray>* out) { return FinishTyped(out); }
+ Status Finish(std::shared_ptr<LargeBinaryArray>* out) { return FinishTyped(out); }
+
+ protected:
+ using BaseBinaryBuilder::BaseBinaryBuilder;
+};
+
+/// \class LargeStringBuilder
+/// \brief Builder class for large UTF8 strings
+class ARROW_EXPORT LargeStringBuilder : public LargeBinaryBuilder {
+ public:
+ using LargeBinaryBuilder::LargeBinaryBuilder;
+ explicit LargeStringBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT);
+
+ /// \cond FALSE
+ using ArrayBuilder::Finish;
+ /// \endcond
+
+ Status Finish(std::shared_ptr<LargeStringArray>* out) { return FinishTyped(out); }
};
// ----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array/concatenate-test.cc b/cpp/src/arrow/array/concatenate-test.cc
index cf105ceb6..730b25ab8 100644
--- a/cpp/src/arrow/array/concatenate-test.cc
+++ b/cpp/src/arrow/array/concatenate-test.cc
@@ -48,10 +48,11 @@ class ConcatenateTest : public ::testing::Test {
sizes_({0, 1, 2, 4, 16, 31, 1234}),
null_probabilities_({0.0, 0.1, 0.5, 0.9, 1.0}) {}
- std::vector<int32_t> Offsets(int32_t length, int32_t slice_count) {
- std::vector<int32_t> offsets(static_cast<std::size_t>(slice_count + 1));
+ template <typename OffsetType>
+ std::vector<OffsetType> Offsets(int32_t length, int32_t slice_count) {
+ std::vector<OffsetType> offsets(static_cast<std::size_t>(slice_count + 1));
std::default_random_engine gen(seed_);
- std::uniform_int_distribution<int32_t> dist(0, length);
+ std::uniform_int_distribution<OffsetType> dist(0, length);
std::generate(offsets.begin(), offsets.end(), [&] { return dist(gen); });
std::sort(offsets.begin(), offsets.end());
return offsets;
@@ -85,7 +86,7 @@ class ConcatenateTest : public ::testing::Test {
template <typename ArrayFactory>
void Check(ArrayFactory&& factory) {
for (auto size : this->sizes_) {
- auto offsets = this->Offsets(size, 3);
+ auto offsets = this->Offsets<int32_t>(size, 3);
for (auto null_probability : this->null_probabilities_) {
std::shared_ptr<Array> array;
factory(size, null_probability, &array);
@@ -146,16 +147,16 @@ TYPED_TEST(PrimitiveConcatenateTest, Primitives) {
TEST_F(ConcatenateTest, StringType) {
Check([this](int32_t size, double null_probability, std::shared_ptr<Array>* out) {
- auto values_size = size * 4;
- auto char_array = this->GeneratePrimitive<Int8Type>(values_size, null_probability);
- std::shared_ptr<Buffer> offsets;
- auto offsets_vector = this->Offsets(values_size, size);
- // ensure the first offset is 0, which is expected for StringType
- offsets_vector[0] = 0;
- ASSERT_OK(CopyBufferFromVector(offsets_vector, default_memory_pool(), &offsets));
- *out = MakeArray(ArrayData::Make(
- utf8(), size,
- {char_array->data()->buffers[0], offsets, char_array->data()->buffers[1]}));
+ *out = rng_.String(size, /*min_length =*/0, /*max_length =*/15, null_probability);
+ ASSERT_OK(ValidateArray(**out));
+ });
+}
+
+TEST_F(ConcatenateTest, LargeStringType) {
+ Check([this](int32_t size, double null_probability, std::shared_ptr<Array>* out) {
+ *out =
+ rng_.LargeString(size, /*min_length =*/0, /*max_length =*/15, null_probability);
+ ASSERT_OK(ValidateArray(**out));
});
}
@@ -163,7 +164,7 @@ TEST_F(ConcatenateTest, ListType) {
Check([this](int32_t size, double null_probability, std::shared_ptr<Array>* out) {
auto values_size = size * 4;
auto values = this->GeneratePrimitive<Int8Type>(values_size, null_probability);
- auto offsets_vector = this->Offsets(values_size, size);
+ auto offsets_vector = this->Offsets<int32_t>(values_size, size);
// ensure the first offset is 0, which is expected for ListType
offsets_vector[0] = 0;
std::shared_ptr<Array> offsets;
diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc
index 60da0d3f8..a20b157ac 100644
--- a/cpp/src/arrow/array/concatenate.cc
+++ b/cpp/src/arrow/array/concatenate.cc
@@ -184,14 +184,21 @@ class ConcatenateImpl {
Status Visit(const BinaryType&) {
std::vector<Range> value_ranges;
- RETURN_NOT_OK(ConcatenateOffsets<int32_t>(Buffers(1, *offset_type), pool_,
+ RETURN_NOT_OK(ConcatenateOffsets<int32_t>(Buffers(1, sizeof(int32_t)), pool_,
+ &out_.buffers[1], &value_ranges));
+ return ConcatenateBuffers(Buffers(2, value_ranges), pool_, &out_.buffers[2]);
+ }
+
+ Status Visit(const LargeBinaryType&) {
+ std::vector<Range> value_ranges;
+ RETURN_NOT_OK(ConcatenateOffsets<int64_t>(Buffers(1, sizeof(int64_t)), pool_,
&out_.buffers[1], &value_ranges));
return ConcatenateBuffers(Buffers(2, value_ranges), pool_, &out_.buffers[2]);
}
Status Visit(const ListType&) {
std::vector<Range> value_ranges;
- RETURN_NOT_OK(ConcatenateOffsets<int32_t>(Buffers(1, *offset_type), pool_,
+ RETURN_NOT_OK(ConcatenateOffsets<int32_t>(Buffers(1, sizeof(int32_t)), pool_,
&out_.buffers[1], &value_ranges));
return ConcatenateImpl(ChildData(0, value_ranges), pool_)
.Concatenate(out_.child_data[0].get());
@@ -277,13 +284,11 @@ class ConcatenateImpl {
}
// Gather the index-th buffer of each input into a vector.
- // Buffers are assumed to contain elements of fixed.bit_width(),
+ // Buffers are assumed to contain elements of the given byte_width,
// those elements are sliced with that input's offset and length.
// Note that BufferVector will not contain the buffer of in_[i] if it's
// nullptr.
- BufferVector Buffers(size_t index, const FixedWidthType& fixed) {
- DCHECK_EQ(fixed.bit_width() % 8, 0);
- auto byte_width = fixed.bit_width() / 8;
+ BufferVector Buffers(size_t index, int byte_width) {
BufferVector buffers;
buffers.reserve(in_.size());
for (const ArrayData& array_data : in_) {
@@ -296,6 +301,16 @@ class ConcatenateImpl {
return buffers;
}
+ // Gather the index-th buffer of each input into a vector.
+ // Buffers are assumed to contain elements of fixed.bit_width(),
+ // those elements are sliced with that input's offset and length.
+ // Note that BufferVector will not contain the buffer of in_[i] if it's
+ // nullptr.
+ BufferVector Buffers(size_t index, const FixedWidthType& fixed) {
+ DCHECK_EQ(fixed.bit_width() % 8, 0);
+ return Buffers(index, fixed.bit_width() / 8);
+ }
+
// Gather the index-th buffer of each input as a Bitmap
// into a vector of Bitmaps.
std::vector<Bitmap> Bitmaps(size_t index) {
@@ -328,15 +343,11 @@ class ConcatenateImpl {
return child_data;
}
- static const std::shared_ptr<FixedWidthType> offset_type;
const std::vector<ArrayData>& in_;
MemoryPool* pool_;
ArrayData out_;
};
-const std::shared_ptr<FixedWidthType> ConcatenateImpl::offset_type =
- std::static_pointer_cast<FixedWidthType>(int32());
-
Status Concatenate(const ArrayVector& arrays, MemoryPool* pool,
std::shared_ptr<Array>* out) {
if (arrays.size() == 0) {
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index cee443c48..44b0d041b 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -107,6 +107,8 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
BUILDER_CASE(DOUBLE, DoubleBuilder);
BUILDER_CASE(STRING, StringBuilder);
BUILDER_CASE(BINARY, BinaryBuilder);
+ BUILDER_CASE(LARGE_STRING, LargeStringBuilder);
+ BUILDER_CASE(LARGE_BINARY, LargeBinaryBuilder);
BUILDER_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryBuilder);
BUILDER_CASE(DECIMAL, Decimal128Builder);
case Type::DICTIONARY: {
diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc
index 097bc8f76..590ab6e4a 100644
--- a/cpp/src/arrow/compare.cc
+++ b/cpp/src/arrow/compare.cc
@@ -144,8 +144,9 @@ class RangeEqualsVisitor {
return Status::OK();
}
- bool CompareBinaryRange(const BinaryArray& left) const {
- const auto& right = checked_cast<const BinaryArray&>(right_);
+ template <typename ArrayType>
+ bool CompareBinaryRange(const ArrayType& left) const {
+ const auto& right = checked_cast<const ArrayType&>(right_);
for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
++i, ++o_i) {
@@ -154,10 +155,10 @@ class RangeEqualsVisitor {
return false;
}
if (is_null) continue;
- const int32_t begin_offset = left.value_offset(i);
- const int32_t end_offset = left.value_offset(i + 1);
- const int32_t right_begin_offset = right.value_offset(o_i);
- const int32_t right_end_offset = right.value_offset(o_i + 1);
+ const auto begin_offset = left.value_offset(i);
+ const auto end_offset = left.value_offset(i + 1);
+ const auto right_begin_offset = right.value_offset(o_i);
+ const auto right_end_offset = right.value_offset(o_i + 1);
// Underlying can't be equal if the size isn't equal
if (end_offset - begin_offset != right_end_offset - right_begin_offset) {
return false;
@@ -278,6 +279,11 @@ class RangeEqualsVisitor {
return Status::OK();
}
+ Status Visit(const LargeBinaryArray& left) {
+ result_ = CompareBinaryRange(left);
+ return Status::OK();
+ }
+
Status Visit(const FixedSizeBinaryArray& left) {
const auto& right = checked_cast<const FixedSizeBinaryArray&>(right_);
@@ -489,18 +495,21 @@ class ArrayEqualsVisitor : public RangeEqualsVisitor {
template <typename ArrayType>
bool ValueOffsetsEqual(const ArrayType& left) {
+ using offset_type = typename ArrayType::offset_type;
+
const auto& right = checked_cast<const ArrayType&>(right_);
if (left.offset() == 0 && right.offset() == 0) {
return left.value_offsets()->Equals(*right.value_offsets(),
- (left.length() + 1) * sizeof(int32_t));
+ (left.length() + 1) * sizeof(offset_type));
} else {
// One of the arrays is sliced; logic is more complicated because the
// value offsets are not both 0-based
auto left_offsets =
- reinterpret_cast<const int32_t*>(left.value_offsets()->data()) + left.offset();
+ reinterpret_cast<const offset_type*>(left.value_offsets()->data()) +
+ left.offset();
auto right_offsets =
- reinterpret_cast<const int32_t*>(right.value_offsets()->data()) +
+ reinterpret_cast<const offset_type*>(right.value_offsets()->data()) +
right.offset();
for (int64_t i = 0; i < left.length() + 1; ++i) {
@@ -512,10 +521,11 @@ class ArrayEqualsVisitor : public RangeEqualsVisitor {
}
}
- bool CompareBinary(const BinaryArray& left) {
- const auto& right = checked_cast<const BinaryArray&>(right_);
+ template <typename ArrayType>
+ bool CompareBinary(const ArrayType& left) {
+ const auto& right = checked_cast<const ArrayType&>(right_);
- bool equal_offsets = ValueOffsetsEqual<BinaryArray>(left);
+ bool equal_offsets = ValueOffsetsEqual<ArrayType>(left);
if (!equal_offsets) {
return false;
}
@@ -544,8 +554,8 @@ class ArrayEqualsVisitor : public RangeEqualsVisitor {
}
} else {
// ARROW-537: Only compare data in non-null slots
- const int32_t* left_offsets = left.raw_value_offsets();
- const int32_t* right_offsets = right.raw_value_offsets();
+ auto left_offsets = left.raw_value_offsets();
+ auto right_offsets = right.raw_value_offsets();
for (int64_t i = 0; i < left.length(); ++i) {
if (left.IsNull(i)) {
continue;
@@ -564,6 +574,11 @@ class ArrayEqualsVisitor : public RangeEqualsVisitor {
return Status::OK();
}
+ Status Visit(const LargeBinaryArray& left) {
+ result_ = CompareBinary(left);
+ return Status::OK();
+ }
+
Status Visit(const ListArray& left) {
const auto& right = checked_cast<const ListArray&>(right_);
bool equal_offsets = ValueOffsetsEqual<ListArray>(left);
@@ -822,6 +837,15 @@ class ScalarEqualsVisitor {
return Status::OK();
}
+ template <typename T>
+ typename std::enable_if<std::is_base_of<LargeBinaryScalar, T>::value, Status>::type
+ Visit(const T& left_) {
+ const auto& left = checked_cast<const LargeBinaryScalar&>(left_);
+ const auto& right = checked_cast<const LargeBinaryScalar&>(right_);
+ result_ = internal::SharedPtrEquals(left.value, right.value);
+ return Status::OK();
+ }
+
Status Visit(const Decimal128Scalar& left) {
const auto& right = checked_cast<const Decimal128Scalar&>(right_);
result_ = left.value == right.value;
diff --git a/cpp/src/arrow/compute/kernels/cast-test.cc b/cpp/src/arrow/compute/kernels/cast-test.cc
index 6bf4f9417..80538f20e 100644
--- a/cpp/src/arrow/compute/kernels/cast-test.cc
+++ b/cpp/src/arrow/compute/kernels/cast-test.cc
@@ -52,6 +52,8 @@ namespace compute {
using internal::checked_cast;
+static constexpr const char* kInvalidUtf8 = "\xa0\xa1";
+
static std::vector<std::shared_ptr<DataType>> kNumericTypes = {
uint8(), int8(), uint16(), int16(), uint32(),
int32(), uint64(), int64(), float32(), float64()};
@@ -131,6 +133,132 @@ class TestCast : public ComputeFixture, public TestBase {
CheckPass(*input->Slice(1), *expected->Slice(1), out_type, options);
}
}
+
+ template <typename SourceType, typename DestType>
+ void TestCastBinaryToString() {
+ CastOptions options;
+ auto src_type = TypeTraits<SourceType>::type_singleton();
+ auto dest_type = TypeTraits<DestType>::type_singleton();
+
+ // All valid except the last one
+ std::vector<bool> all = {1, 1, 1, 1, 1};
+ std::vector<bool> valid = {1, 1, 1, 1, 0};
+ std::vector<std::string> strings = {"Hi", "olá mundo", "你好世界", "", kInvalidUtf8};
+
+ std::shared_ptr<Array> array;
+
+ // Should accept when invalid but null.
+ ArrayFromVector<SourceType, std::string>(src_type, valid, strings, &array);
+ CheckZeroCopy(*array, dest_type);
+
+ // Should refuse due to invalid utf8 payload
+ CheckFails<SourceType, std::string>(src_type, strings, all, dest_type, options);
+
+ // Should accept due to option override
+ options.allow_invalid_utf8 = true;
+ CheckCase<SourceType, std::string, DestType, std::string>(
+ src_type, strings, all, dest_type, strings, options);
+ }
+
+ template <typename SourceType>
+ void TestCastStringToNumber() {
+ CastOptions options;
+ auto src_type = TypeTraits<SourceType>::type_singleton();
+
+ std::vector<bool> is_valid = {true, false, true, true, true};
+
+ // string to int
+ std::vector<std::string> v_int = {"0", "1", "127", "-1", "0"};
+ std::vector<int8_t> e_int8 = {0, 1, 127, -1, 0};
+ std::vector<int16_t> e_int16 = {0, 1, 127, -1, 0};
+ std::vector<int32_t> e_int32 = {0, 1, 127, -1, 0};
+ std::vector<int64_t> e_int64 = {0, 1, 127, -1, 0};
+ CheckCase<SourceType, std::string, Int8Type, int8_t>(src_type, v_int, is_valid,
+ int8(), e_int8, options);
+ CheckCase<SourceType, std::string, Int16Type, int16_t>(src_type, v_int, is_valid,
+ int16(), e_int16, options);
+ CheckCase<SourceType, std::string, Int32Type, int32_t>(src_type, v_int, is_valid,
+ int32(), e_int32, options);
+ CheckCase<SourceType, std::string, Int64Type, int64_t>(src_type, v_int, is_valid,
+ int64(), e_int64, options);
+
+ v_int = {"2147483647", "0", "-2147483648", "0", "0"};
+ e_int32 = {2147483647, 0, -2147483648LL, 0, 0};
+ CheckCase<SourceType, std::string, Int32Type, int32_t>(src_type, v_int, is_valid,
+ int32(), e_int32, options);
+ v_int = {"9223372036854775807", "0", "-9223372036854775808", "0", "0"};
+ e_int64 = {9223372036854775807LL, 0, (-9223372036854775807LL - 1), 0, 0};
+ CheckCase<SourceType, std::string, Int64Type, int64_t>(src_type, v_int, is_valid,
+ int64(), e_int64, options);
+
+ // string to uint
+ std::vector<std::string> v_uint = {"0", "1", "127", "255", "0"};
+ std::vector<uint8_t> e_uint8 = {0, 1, 127, 255, 0};
+ std::vector<uint16_t> e_uint16 = {0, 1, 127, 255, 0};
+ std::vector<uint32_t> e_uint32 = {0, 1, 127, 255, 0};
+ std::vector<uint64_t> e_uint64 = {0, 1, 127, 255, 0};
+ CheckCase<SourceType, std::string, UInt8Type, uint8_t>(src_type, v_uint, is_valid,
+ uint8(), e_uint8, options);
+ CheckCase<SourceType, std::string, UInt16Type, uint16_t>(src_type, v_uint, is_valid,
+ uint16(), e_uint16, options);
+ CheckCase<SourceType, std::string, UInt32Type, uint32_t>(src_type, v_uint, is_valid,
+ uint32(), e_uint32, options);
+ CheckCase<SourceType, std::string, UInt64Type, uint64_t>(src_type, v_uint, is_valid,
+ uint64(), e_uint64, options);
+
+ v_uint = {"4294967295", "0", "0", "0", "0"};
+ e_uint32 = {4294967295, 0, 0, 0, 0};
+ CheckCase<SourceType, std::string, UInt32Type, uint32_t>(src_type, v_uint, is_valid,
+ uint32(), e_uint32, options);
+ v_uint = {"18446744073709551615", "0", "0", "0", "0"};
+ e_uint64 = {18446744073709551615ULL, 0, 0, 0, 0};
+ CheckCase<SourceType, std::string, UInt64Type, uint64_t>(src_type, v_uint, is_valid,
+ uint64(), e_uint64, options);
+
+ // string to float
+ std::vector<std::string> v_float = {"0.1", "1.2", "127.3", "200.4", "0.5"};
+ std::vector<float> e_float = {0.1f, 1.2f, 127.3f, 200.4f, 0.5f};
+ std::vector<double> e_double = {0.1, 1.2, 127.3, 200.4, 0.5};
+ CheckCase<SourceType, std::string, FloatType, float>(src_type, v_float, is_valid,
+ float32(), e_float, options);
+ CheckCase<SourceType, std::string, DoubleType, double>(src_type, v_float, is_valid,
+ float64(), e_double, options);
+
+ // Test that casting is locale-independent
+ auto global_locale = std::locale();
+ try {
+ // French locale uses the comma as decimal point
+ std::locale::global(std::locale("fr_FR.UTF-8"));
+ } catch (std::runtime_error&) {
+ // Locale unavailable, ignore
+ }
+ CheckCase<SourceType, std::string, FloatType, float>(src_type, v_float, is_valid,
+ float32(), e_float, options);
+ CheckCase<SourceType, std::string, DoubleType, double>(src_type, v_float, is_valid,
+ float64(), e_double, options);
+ std::locale::global(global_locale);
+ }
+
+ template <typename SourceType>
+ void TestCastStringToTimestamp() {
+ CastOptions options;
+ auto src_type = TypeTraits<SourceType>::type_singleton();
+
+ std::vector<bool> is_valid = {true, false, true};
+ std::vector<std::string> strings = {"1970-01-01", "xxx", "2000-02-29"};
+
+ auto type = timestamp(TimeUnit::SECOND);
+ std::vector<int64_t> e = {0, 0, 951782400};
+ CheckCase<SourceType, std::string, TimestampType, int64_t>(
+ src_type, strings, is_valid, type, e, options);
+
+ type = timestamp(TimeUnit::MICRO);
+ e = {0, 0, 951782400000000LL};
+ CheckCase<SourceType, std::string, TimestampType, int64_t>(
+ src_type, strings, is_valid, type, e, options);
+
+ // NOTE: timestamp parsing is tested comprehensively in parsing-util-test.cc
+ }
};
TEST_F(TestCast, SameTypeZeroCopy) {
@@ -922,6 +1050,10 @@ TEST_F(TestCast, StringToBoolean) {
e, options);
CheckCase<StringType, std::string, BooleanType, bool>(utf8(), v2, is_valid, boolean(),
e, options);
+
+ // Same with LargeStringType
+ CheckCase<LargeStringType, std::string, BooleanType, bool>(large_utf8(), v1, is_valid,
+ boolean(), e, options);
}
TEST_F(TestCast, StringToBooleanErrors) {
@@ -931,84 +1063,13 @@ TEST_F(TestCast, StringToBooleanErrors) {
CheckFails<StringType, std::string>(utf8(), {"false "}, is_valid, boolean(), options);
CheckFails<StringType, std::string>(utf8(), {"T"}, is_valid, boolean(), options);
+ CheckFails<LargeStringType, std::string>(large_utf8(), {"T"}, is_valid, boolean(),
+ options);
}
-TEST_F(TestCast, StringToNumber) {
- CastOptions options;
+TEST_F(TestCast, StringToNumber) { TestCastStringToNumber<StringType>(); }
- std::vector<bool> is_valid = {true, false, true, true, true};
-
- // string to int
- std::vector<std::string> v_int = {"0", "1", "127", "-1", "0"};
- std::vector<int8_t> e_int8 = {0, 1, 127, -1, 0};
- std::vector<int16_t> e_int16 = {0, 1, 127, -1, 0};
- std::vector<int32_t> e_int32 = {0, 1, 127, -1, 0};
- std::vector<int64_t> e_int64 = {0, 1, 127, -1, 0};
- CheckCase<StringType, std::string, Int8Type, int8_t>(utf8(), v_int, is_valid, int8(),
- e_int8, options);
- CheckCase<StringType, std::string, Int16Type, int16_t>(utf8(), v_int, is_valid, int16(),
- e_int16, options);
- CheckCase<StringType, std::string, Int32Type, int32_t>(utf8(), v_int, is_valid, int32(),
- e_int32, options);
- CheckCase<StringType, std::string, Int64Type, int64_t>(utf8(), v_int, is_valid, int64(),
- e_int64, options);
-
- v_int = {"2147483647", "0", "-2147483648", "0", "0"};
- e_int32 = {2147483647, 0, -2147483648LL, 0, 0};
- CheckCase<StringType, std::string, Int32Type, int32_t>(utf8(), v_int, is_valid, int32(),
- e_int32, options);
- v_int = {"9223372036854775807", "0", "-9223372036854775808", "0", "0"};
- e_int64 = {9223372036854775807LL, 0, (-9223372036854775807LL - 1), 0, 0};
- CheckCase<StringType, std::string, Int64Type, int64_t>(utf8(), v_int, is_valid, int64(),
- e_int64, options);
-
- // string to uint
- std::vector<std::string> v_uint = {"0", "1", "127", "255", "0"};
- std::vector<uint8_t> e_uint8 = {0, 1, 127, 255, 0};
- std::vector<uint16_t> e_uint16 = {0, 1, 127, 255, 0};
- std::vector<uint32_t> e_uint32 = {0, 1, 127, 255, 0};
- std::vector<uint64_t> e_uint64 = {0, 1, 127, 255, 0};
- CheckCase<StringType, std::string, UInt8Type, uint8_t>(utf8(), v_uint, is_valid,
- uint8(), e_uint8, options);
- CheckCase<StringType, std::string, UInt16Type, uint16_t>(utf8(), v_uint, is_valid,
- uint16(), e_uint16, options);
- CheckCase<StringType, std::string, UInt32Type, uint32_t>(utf8(), v_uint, is_valid,
- uint32(), e_uint32, options);
- CheckCase<StringType, std::string, UInt64Type, uint64_t>(utf8(), v_uint, is_valid,
- uint64(), e_uint64, options);
-
- v_uint = {"4294967295", "0", "0", "0", "0"};
- e_uint32 = {4294967295, 0, 0, 0, 0};
- CheckCase<StringType, std::string, UInt32Type, uint32_t>(utf8(), v_uint, is_valid,
- uint32(), e_uint32, options);
- v_uint = {"18446744073709551615", "0", "0", "0", "0"};
- e_uint64 = {18446744073709551615ULL, 0, 0, 0, 0};
- CheckCase<StringType, std::string, UInt64Type, uint64_t>(utf8(), v_uint, is_valid,
- uint64(), e_uint64, options);
-
- // string to float
- std::vector<std::string> v_float = {"0.1", "1.2", "127.3", "200.4", "0.5"};
- std::vector<float> e_float = {0.1f, 1.2f, 127.3f, 200.4f, 0.5f};
- std::vector<double> e_double = {0.1, 1.2, 127.3, 200.4, 0.5};
- CheckCase<StringType, std::string, FloatType, float>(utf8(), v_float, is_valid,
- float32(), e_float, options);
- CheckCase<StringType, std::string, DoubleType, double>(utf8(), v_float, is_valid,
- float64(), e_double, options);
-
- // Test that casting is locale-independent
- auto global_locale = std::locale();
- try {
- // French locale uses the comma as decimal point
- std::locale::global(std::locale("fr_FR.UTF-8"));
- } catch (std::runtime_error&) {
- // Locale unavailable, ignore
- }
- CheckCase<StringType, std::string, FloatType, float>(utf8(), v_float, is_valid,
- float32(), e_float, options);
- CheckCase<StringType, std::string, DoubleType, double>(utf8(), v_float, is_valid,
- float64(), e_double, options);
- std::locale::global(global_locale);
-}
+TEST_F(TestCast, LargeStringToNumber) { TestCastStringToNumber<LargeStringType>(); }
TEST_F(TestCast, StringToNumberErrors) {
CastOptions options;
@@ -1027,24 +1088,9 @@ TEST_F(TestCast, StringToNumberErrors) {
CheckFails<StringType, std::string>(utf8(), {"z"}, is_valid, float32(), options);
}
-TEST_F(TestCast, StringToTimestamp) {
- CastOptions options;
-
- std::vector<bool> is_valid = {true, false, true};
- std::vector<std::string> strings = {"1970-01-01", "xxx", "2000-02-29"};
+TEST_F(TestCast, StringToTimestamp) { TestCastStringToTimestamp<StringType>(); }
- auto type = timestamp(TimeUnit::SECOND);
- std::vector<int64_t> e = {0, 0, 951782400};
- CheckCase<StringType, std::string, TimestampType, int64_t>(utf8(), strings, is_valid,
- type, e, options);
-
- type = timestamp(TimeUnit::MICRO);
- e = {0, 0, 951782400000000LL};
- CheckCase<StringType, std::string, TimestampType, int64_t>(utf8(), strings, is_valid,
- type, e, options);
-
- // NOTE: timestamp parsing is tested comprehensively in parsing-util-test.cc
-}
+TEST_F(TestCast, LargeStringToTimestamp) { TestCastStringToTimestamp<LargeStringType>(); }
TEST_F(TestCast, StringToTimestampErrors) {
CastOptions options;
@@ -1058,29 +1104,10 @@ TEST_F(TestCast, StringToTimestampErrors) {
}
}
-constexpr const char* kInvalidUtf8 = "\xa0\xa1";
-
-TEST_F(TestCast, BinaryToString) {
- CastOptions options;
-
- // All valid except the last one
- std::vector<bool> all = {1, 1, 1, 1, 1};
- std::vector<bool> valid = {1, 1, 1, 1, 0};
- std::vector<std::string> strings = {"Hi", "olá mundo", "你好世界", "", kInvalidUtf8};
-
- std::shared_ptr<Array> array;
-
- // Should accept when invalid but null.
- ArrayFromVector<BinaryType, std::string>(binary(), valid, strings, &array);
- CheckZeroCopy(*array, utf8());
-
- // Should refuse due to invalid utf8 payload
- CheckFails<BinaryType, std::string>(binary(), strings, all, utf8(), options);
+TEST_F(TestCast, BinaryToString) { TestCastBinaryToString<BinaryType, StringType>(); }
- // Should accept due to option override
- options.allow_invalid_utf8 = true;
- CheckCase<BinaryType, std::string, StringType, std::string>(binary(), strings, all,
- utf8(), strings, options);
+TEST_F(TestCast, LargeBinaryToLargeString) {
+ TestCastBinaryToString<LargeBinaryType, LargeStringType>();
}
TEST_F(TestCast, ListToList) {
diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc
index 88a4f3087..a8b661599 100644
--- a/cpp/src/arrow/compute/kernels/cast.cc
+++ b/cpp/src/arrow/compute/kernels/cast.cc
@@ -905,13 +905,15 @@ struct CastFunctor<T, DictionaryType> {
// ----------------------------------------------------------------------
// String to Number
-template <typename O>
-struct CastFunctor<O, StringType, enable_if_number<O>> {
+template <typename I, typename O>
+struct CastFunctor<O, I,
+ typename std::enable_if<is_any_string_type<I>::value &&
+ is_number_type<O>::value>::type> {
void operator()(FunctionContext* ctx, const CastOptions& options,
const ArrayData& input, ArrayData* output) {
using out_type = typename O::c_type;
- StringArray input_array(input.Copy());
+ typename TypeTraits<I>::ArrayType input_array(input.Copy());
auto out_data = output->GetMutableValues<out_type>(1);
internal::StringConverter<O> converter;
@@ -933,15 +935,15 @@ struct CastFunctor<O, StringType, enable_if_number<O>> {
// ----------------------------------------------------------------------
// String to Boolean
-template <typename O>
-struct CastFunctor<O, StringType,
- typename std::enable_if<std::is_same<BooleanType, O>::value>::type> {
+template <typename I>
+struct CastFunctor<BooleanType, I,
+ typename std::enable_if<is_any_string_type<I>::value>::type> {
void operator()(FunctionContext* ctx, const CastOptions& options,
const ArrayData& input, ArrayData* output) {
- StringArray input_array(input.Copy());
+ typename TypeTraits<I>::ArrayType input_array(input.Copy());
internal::FirstTimeBitmapWriter writer(output->buffers[1]->mutable_data(),
output->offset, input.length);
- internal::StringConverter<O> converter;
+ internal::StringConverter<BooleanType> converter;
for (int64_t i = 0; i < input.length; ++i) {
if (input_array.IsNull(i)) {
@@ -972,13 +974,14 @@ struct CastFunctor<O, StringType,
// ----------------------------------------------------------------------
// String to Timestamp
-template <>
-struct CastFunctor<TimestampType, StringType> {
+template <typename I>
+struct CastFunctor<TimestampType, I,
+ typename std::enable_if<is_any_string_type<I>::value>::type> {
void operator()(FunctionContext* ctx, const CastOptions& options,
const ArrayData& input, ArrayData* output) {
using out_type = TimestampType::c_type;
- StringArray input_array(input.Copy());
+ typename TypeTraits<I>::ArrayType input_array(input.Copy());
auto out_data = output->GetMutableValues<out_type>(1);
internal::StringConverter<TimestampType> converter(output->type);
@@ -1001,47 +1004,51 @@ struct CastFunctor<TimestampType, StringType> {
// Binary to String
//
-template <typename I>
-struct CastFunctor<StringType, I,
- typename std::enable_if<std::is_same<BinaryType, I>::value>::type> {
+#if defined(_MSC_VER)
+// Silence warning: """'visitor': unreferenced local variable"""
+#pragma warning(push)
+#pragma warning(disable : 4101)
+#endif
+
+template <typename I, typename O>
+struct BinaryToStringSameWidthCastFunctor {
void operator()(FunctionContext* ctx, const CastOptions& options,
const ArrayData& input, ArrayData* output) {
- BinaryArray binary(input.Copy());
+ if (!options.allow_invalid_utf8) {
+ util::InitializeUTF8();
- if (options.allow_invalid_utf8) {
- ZeroCopyData(input, output);
- return;
+ ArrayDataVisitor<I> visitor;
+ Status st = visitor.Visit(input, this);
+ if (!st.ok()) {
+ ctx->SetStatus(st);
+ return;
+ }
}
+ ZeroCopyData(input, output);
+ }
- util::InitializeUTF8();
-
- if (binary.null_count() != 0) {
- for (int64_t i = 0; i < input.length; i++) {
- if (binary.IsNull(i)) {
- continue;
- }
-
- const auto str = binary.GetView(i);
- if (ARROW_PREDICT_FALSE(!arrow::util::ValidateUTF8(str))) {
- ctx->SetStatus(Status::Invalid("Invalid UTF8 payload"));
- return;
- }
- }
+ Status VisitNull() { return Status::OK(); }
- } else {
- for (int64_t i = 0; i < input.length; i++) {
- const auto str = binary.GetView(i);
- if (ARROW_PREDICT_FALSE(!arrow::util::ValidateUTF8(str))) {
- ctx->SetStatus(Status::Invalid("Invalid UTF8 payload"));
- return;
- }
- }
+ Status VisitValue(util::string_view str) {
+ if (ARROW_PREDICT_FALSE(!arrow::util::ValidateUTF8(str))) {
+ return Status::Invalid("Invalid UTF8 payload");
}
-
- ZeroCopyData(input, output);
+ return Status::OK();
}
};
+template <>
+struct CastFunctor<StringType, BinaryType>
+ : public BinaryToStringSameWidthCastFunctor<StringType, BinaryType> {};
+
+template <>
+struct CastFunctor<LargeStringType, LargeBinaryType>
+ : public BinaryToStringSameWidthCastFunctor<LargeStringType, LargeBinaryType> {};
+
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+
// ----------------------------------------------------------------------
typedef std::function<void(FunctionContext*, const CastOptions& options, const ArrayData&,
@@ -1142,6 +1149,8 @@ GET_CAST_FUNCTION(TIME64_CASES, Time64Type)
GET_CAST_FUNCTION(TIMESTAMP_CASES, TimestampType)
GET_CAST_FUNCTION(BINARY_CASES, BinaryType)
GET_CAST_FUNCTION(STRING_CASES, StringType)
+GET_CAST_FUNCTION(LARGEBINARY_CASES, LargeBinaryType)
+GET_CAST_FUNCTION(LARGESTRING_CASES, LargeStringType)
GET_CAST_FUNCTION(DICTIONARY_CASES, DictionaryType)
#define CAST_FUNCTION_CASE(InType) \
@@ -1225,6 +1234,8 @@ Status GetCastFunction(const DataType& in_type, std::shared_ptr<DataType> out_ty
CAST_FUNCTION_CASE(TimestampType);
CAST_FUNCTION_CASE(BinaryType);
CAST_FUNCTION_CASE(StringType);
+ CAST_FUNCTION_CASE(LargeBinaryType);
+ CAST_FUNCTION_CASE(LargeStringType);
CAST_FUNCTION_CASE(DictionaryType);
case Type::LIST:
RETURN_NOT_OK(GetListCastFunc(in_type, std::move(out_type), options, kernel));
diff --git a/cpp/src/arrow/compute/kernels/generated/cast-codegen-internal.h b/cpp/src/arrow/compute/kernels/generated/cast-codegen-internal.h
index 77334af36..fb82067bb 100644
--- a/cpp/src/arrow/compute/kernels/generated/cast-codegen-internal.h
+++ b/cpp/src/arrow/compute/kernels/generated/cast-codegen-internal.h
@@ -171,6 +171,9 @@
#define BINARY_CASES(TEMPLATE) \
TEMPLATE(BinaryType, StringType)
+#define LARGEBINARY_CASES(TEMPLATE) \
+ TEMPLATE(LargeBinaryType, LargeStringType)
+
#define STRING_CASES(TEMPLATE) \
TEMPLATE(StringType, BooleanType) \
TEMPLATE(StringType, UInt8Type) \
@@ -185,6 +188,20 @@
TEMPLATE(StringType, DoubleType) \
TEMPLATE(StringType, TimestampType)
+#define LARGESTRING_CASES(TEMPLATE) \
+ TEMPLATE(LargeStringType, BooleanType) \
+ TEMPLATE(LargeStringType, UInt8Type) \
+ TEMPLATE(LargeStringType, Int8Type) \
+ TEMPLATE(LargeStringType, UInt16Type) \
+ TEMPLATE(LargeStringType, Int16Type) \
+ TEMPLATE(LargeStringType, UInt32Type) \
+ TEMPLATE(LargeStringType, Int32Type) \
+ TEMPLATE(LargeStringType, UInt64Type) \
+ TEMPLATE(LargeStringType, Int64Type) \
+ TEMPLATE(LargeStringType, FloatType) \
+ TEMPLATE(LargeStringType, DoubleType) \
+ TEMPLATE(LargeStringType, TimestampType)
+
#define DICTIONARY_CASES(TEMPLATE) \
TEMPLATE(DictionaryType, UInt8Type) \
TEMPLATE(DictionaryType, Int8Type) \
diff --git a/cpp/src/arrow/compute/kernels/generated/codegen.py b/cpp/src/arrow/compute/kernels/generated/codegen.py
index 04fc38618..c9db7eaa0 100644
--- a/cpp/src/arrow/compute/kernels/generated/codegen.py
+++ b/cpp/src/arrow/compute/kernels/generated/codegen.py
@@ -85,7 +85,9 @@ CAST_GENERATORS = [
CastCodeGenerator('Timestamp', ['Date32', 'Date64', 'Timestamp'],
parametric=True),
CastCodeGenerator('Binary', ['String']),
+ CastCodeGenerator('LargeBinary', ['LargeString']),
CastCodeGenerator('String', NUMERIC_TYPES + ['Timestamp']),
+ CastCodeGenerator('LargeString', NUMERIC_TYPES + ['Timestamp']),
CastCodeGenerator('Dictionary',
INTEGER_TYPES + FLOATING_TYPES + DATE_TIME_TYPES +
['Null', 'Binary', 'FixedSizeBinary', 'String',
diff --git a/cpp/src/arrow/csv/converter-test.cc b/cpp/src/arrow/csv/converter-test.cc
index a5e4c0372..53176ff0a 100644
--- a/cpp/src/arrow/csv/converter-test.cc
+++ b/cpp/src/arrow/csv/converter-test.cc
@@ -30,6 +30,7 @@
#include "arrow/status.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/type.h"
+#include "arrow/type_traits.h"
#include "arrow/util/decimal.h"
#include "arrow/util/logging.h"
@@ -118,11 +119,17 @@ void AssertConversionError(const std::shared_ptr<DataType>& type,
//////////////////////////////////////////////////////////////////////////
// Test functions begin here
-TEST(BinaryConversion, Basics) {
- AssertConversion<BinaryType, std::string>(binary(), {"ab,cdé\n", ",\xffgh\n"},
- {{"ab", ""}, {"cdé", "\xffgh"}});
+template <typename T>
+static void TestBinaryConversionBasics() {
+ auto type = TypeTraits<T>::type_singleton();
+ AssertConversion<T, std::string>(type, {"ab,cdé\n", ",\xffgh\n"},
+ {{"ab", ""}, {"cdé", "\xffgh"}});
}
+TEST(BinaryConversion, Basics) { TestBinaryConversionBasics<BinaryType>(); }
+
+TEST(LargeBinaryConversion, Basics) { TestBinaryConversionBasics<LargeBinaryType>(); }
+
TEST(BinaryConversion, Nulls) {
AssertConversion<BinaryType, std::string>(binary(), {"ab,N/A\n", "NULL,\n"},
{{"ab", "NULL"}, {"N/A", ""}},
@@ -135,16 +142,22 @@ TEST(BinaryConversion, Nulls) {
{{true, false}, {false, false}}, options);
}
-TEST(StringConversion, Basics) {
- AssertConversion<StringType, std::string>(utf8(), {"ab,cdé\n", ",gh\n"},
- {{"ab", ""}, {"cdé", "gh"}});
+template <typename T>
+static void TestStringConversionBasics() {
+ auto type = TypeTraits<T>::type_singleton();
+ AssertConversion<T, std::string>(type, {"ab,cdé\n", ",gh\n"},
+ {{"ab", ""}, {"cdé", "gh"}});
auto options = ConvertOptions::Defaults();
options.check_utf8 = false;
- AssertConversion<StringType, std::string>(utf8(), {"ab,cdé\n", ",\xffgh\n"},
- {{"ab", ""}, {"cdé", "\xffgh"}}, options);
+ AssertConversion<T, std::string>(type, {"ab,cdé\n", ",\xffgh\n"},
+ {{"ab", ""}, {"cdé", "\xffgh"}}, options);
}
+TEST(StringConversion, Basics) { TestStringConversionBasics<StringType>(); }
+
+TEST(LargeStringConversion, Basics) { TestStringConversionBasics<LargeStringType>(); }
+
TEST(StringConversion, Nulls) {
AssertConversion<StringType, std::string>(utf8(), {"ab,N/A\n", "NULL,\n"},
{{"ab", "NULL"}, {"N/A", ""}},
@@ -157,11 +170,17 @@ TEST(StringConversion, Nulls) {
{{true, false}, {false, false}}, options);
}
-TEST(StringConversion, Errors) {
+template <typename T>
+static void TestStringConversionErrors() {
+ auto type = TypeTraits<T>::type_singleton();
// Invalid UTF8 in column 0
- AssertConversionError(utf8(), {"ab,cdé\n", "\xff,gh\n"}, {0});
+ AssertConversionError(type, {"ab,cdé\n", "\xff,gh\n"}, {0});
}
+TEST(StringConversion, Errors) { TestStringConversionErrors<StringType>(); }
+
+TEST(LargeStringConversion, Errors) { TestStringConversionErrors<LargeStringType>(); }
+
TEST(FixedSizeBinaryConversion, Basics) {
AssertConversion<FixedSizeBinaryType, std::string>(
fixed_size_binary(2), {"ab,cd\n", "gh,ij\n"}, {{"ab", "gh"}, {"cd", "ij"}});
diff --git a/cpp/src/arrow/csv/converter.cc b/cpp/src/arrow/csv/converter.cc
index 53495cf9b..1c61d3ccb 100644
--- a/cpp/src/arrow/csv/converter.cc
+++ b/cpp/src/arrow/csv/converter.cc
@@ -431,6 +431,7 @@ Status Converter::Make(const std::shared_ptr<DataType>& type,
CONVERTER_CASE(Type::BOOL, BooleanConverter)
CONVERTER_CASE(Type::TIMESTAMP, TimestampConverter)
CONVERTER_CASE(Type::BINARY, (VarSizeBinaryConverter<BinaryType, false>))
+ CONVERTER_CASE(Type::LARGE_BINARY, (VarSizeBinaryConverter<LargeBinaryType, false>))
CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter)
CONVERTER_CASE(Type::DECIMAL, DecimalConverter)
@@ -442,6 +443,14 @@ Status Converter::Make(const std::shared_ptr<DataType>& type,
}
break;
+ case Type::LARGE_STRING:
+ if (options.check_utf8) {
+ result = new VarSizeBinaryConverter<LargeStringType, true>(type, options, pool);
+ } else {
+ result = new VarSizeBinaryConverter<LargeStringType, false>(type, options, pool);
+ }
+ break;
+
default: {
return Status::NotImplemented("CSV conversion to ", type->ToString(),
" is not supported");
diff --git a/cpp/src/arrow/ipc/feather.cc b/cpp/src/arrow/ipc/feather.cc
index 7cd64c8d7..8436bd205 100644
--- a/cpp/src/arrow/ipc/feather.cc
+++ b/cpp/src/arrow/ipc/feather.cc
@@ -367,6 +367,8 @@ class TableReader::TableReaderImpl {
PRIMITIVE_CASE(DOUBLE, float64);
PRIMITIVE_CASE(UTF8, utf8);
PRIMITIVE_CASE(BINARY, binary);
+ PRIMITIVE_CASE(LARGE_UTF8, large_utf8);
+ PRIMITIVE_CASE(LARGE_BINARY, large_binary);
default:
return Status::Invalid("Unrecognized type");
}
@@ -410,6 +412,10 @@ class TableReader::TableReaderImpl {
int64_t offsets_size = GetOutputLength((meta->length() + 1) * sizeof(int32_t));
buffers.push_back(SliceBuffer(buffer, offset, offsets_size));
offset += offsets_size;
+ } else if (is_large_binary_like(type->id())) {
+ int64_t offsets_size = GetOutputLength((meta->length() + 1) * sizeof(int64_t));
+ buffers.push_back(SliceBuffer(buffer, offset, offsets_size));
+ offset += offsets_size;
}
buffers.push_back(SliceBuffer(buffer, offset, buffer->size() - offset));
@@ -585,6 +591,10 @@ fbs::Type ToFlatbufferType(Type::type type) {
return fbs::Type_UTF8;
case Type::BINARY:
return fbs::Type_BINARY;
+ case Type::LARGE_STRING:
+ return fbs::Type_LARGE_UTF8;
+ case Type::LARGE_BINARY:
+ return fbs::Type_LARGE_BINARY;
case Type::DATE32:
return fbs::Type_INT32;
case Type::TIMESTAMP:
@@ -644,7 +654,8 @@ class TableWriter::TableWriterImpl : public ArrayVisitor {
}
Status LoadArrayMetadata(const Array& values, ArrayMetadata* meta) {
- if (!(is_primitive(values.type_id()) || is_binary_like(values.type_id()))) {
+ if (!(is_primitive(values.type_id()) || is_binary_like(values.type_id()) ||
+ is_large_binary_like(values.type_id()))) {
return Status::Invalid("Array is not primitive type: ", values.type()->ToString());
}
@@ -659,6 +670,32 @@ class TableWriter::TableWriterImpl : public ArrayVisitor {
return Status::OK();
}
+ template <typename ArrayType>
+ Status WriteBinaryArray(const ArrayType& values, ArrayMetadata* meta,
+ const uint8_t** values_buffer, int64_t* values_bytes,
+ int64_t* bytes_written) {
+ using offset_type = typename ArrayType::offset_type;
+
+ int64_t offset_bytes = sizeof(offset_type) * (values.length() + 1);
+
+ if (values.value_offsets()) {
+ *values_bytes = values.raw_value_offsets()[values.length()];
+
+ // Write the variable-length offsets
+ RETURN_NOT_OK(WritePadded(
+ stream_.get(), reinterpret_cast<const uint8_t*>(values.raw_value_offsets()),
+ offset_bytes, bytes_written));
+ } else {
+ RETURN_NOT_OK(WritePaddedBlank(stream_.get(), offset_bytes, bytes_written));
+ }
+ meta->total_bytes += *bytes_written;
+
+ if (values.value_data()) {
+ *values_buffer = values.value_data()->data();
+ }
+ return Status::OK();
+ }
+
Status WriteArray(const Array& values, ArrayMetadata* meta) {
RETURN_NOT_OK(CheckStarted());
RETURN_NOT_OK(LoadArrayMetadata(values, meta));
@@ -687,26 +724,11 @@ class TableWriter::TableWriterImpl : public ArrayVisitor {
const uint8_t* values_buffer = nullptr;
if (is_binary_like(values.type_id())) {
- const auto& bin_values = checked_cast<const BinaryArray&>(values);
-
- int64_t offset_bytes = sizeof(int32_t) * (values.length() + 1);
-
- if (bin_values.value_offsets()) {
- values_bytes = bin_values.raw_value_offsets()[values.length()];
-
- // Write the variable-length offsets
- RETURN_NOT_OK(
- WritePadded(stream_.get(),
- reinterpret_cast<const uint8_t*>(bin_values.raw_value_offsets()),
- offset_bytes, &bytes_written));
- } else {
- RETURN_NOT_OK(WritePaddedBlank(stream_.get(), offset_bytes, &bytes_written));
- }
- meta->total_bytes += bytes_written;
-
- if (bin_values.value_data()) {
- values_buffer = bin_values.value_data()->data();
- }
+ RETURN_NOT_OK(WriteBinaryArray(checked_cast<const BinaryArray&>(values), meta,
+ &values_buffer, &values_bytes, &bytes_written));
+ } else if (is_large_binary_like(values.type_id())) {
+ RETURN_NOT_OK(WriteBinaryArray(checked_cast<const LargeBinaryArray&>(values), meta,
+ &values_buffer, &values_bytes, &bytes_written));
} else {
const auto& prim_values = checked_cast<const PrimitiveArray&>(values);
const auto& fw_type = checked_cast<const FixedWidthType&>(*values.type());
@@ -760,6 +782,8 @@ class TableWriter::TableWriterImpl : public ArrayVisitor {
VISIT_PRIMITIVE(DoubleArray)
VISIT_PRIMITIVE(BinaryArray)
VISIT_PRIMITIVE(StringArray)
+ VISIT_PRIMITIVE(LargeBinaryArray)
+ VISIT_PRIMITIVE(LargeStringArray)
#undef VISIT_PRIMITIVE
diff --git a/cpp/src/arrow/ipc/feather.fbs b/cpp/src/arrow/ipc/feather.fbs
index a27d39989..5ec062998 100644
--- a/cpp/src/arrow/ipc/feather.fbs
+++ b/cpp/src/arrow/ipc/feather.fbs
@@ -48,7 +48,10 @@ enum Type : byte {
TIMESTAMP = 14,
DATE = 15,
- TIME = 16
+ TIME = 16,
+
+ LARGE_UTF8 = 17,
+ LARGE_BINARY = 18
}
enum Encoding : byte {
diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc
index 135296551..49a884e1f 100644
--- a/cpp/src/arrow/ipc/json-internal.cc
+++ b/cpp/src/arrow/ipc/json-internal.cc
@@ -312,6 +312,10 @@ class SchemaWriter {
Status Visit(const TimeType& type) { return WritePrimitive("time", type); }
Status Visit(const StringType& type) { return WriteVarBytes("utf8", type); }
Status Visit(const BinaryType& type) { return WriteVarBytes("binary", type); }
+ Status Visit(const LargeStringType& type) { return WriteVarBytes("large_utf8", type); }
+ Status Visit(const LargeBinaryType& type) {
+ return WriteVarBytes("large_binary", type);
+ }
Status Visit(const FixedSizeBinaryType& type) {
return WritePrimitive("fixedsizebinary", type);
}
@@ -430,20 +434,26 @@ class ArrayWriter {
}
}
- // Binary, encode to hexadecimal. UTF8 string write as is
+ // Binary, encode to hexadecimal.
template <typename T>
- typename std::enable_if<std::is_base_of<BinaryArray, T>::value, void>::type
+ typename std::enable_if<std::is_same<BinaryArray, T>::value ||
+ std::is_same<LargeBinaryArray, T>::value,
+ void>::type
WriteDataValues(const T& arr) {
for (int64_t i = 0; i < arr.length(); ++i) {
- int32_t length;
- const uint8_t* buf = arr.GetValue(i, &length);
+ writer_->String(HexEncode(arr.GetView(i)));
+ }
+ }
- if (std::is_base_of<StringArray, T>::value) {
- // Presumed UTF-8
- writer_->String(reinterpret_cast<const char*>(buf), length);
- } else {
- writer_->String(HexEncode(buf, length));
- }
+ // UTF8 string, write as is
+ template <typename T>
+ typename std::enable_if<std::is_same<StringArray, T>::value ||
+ std::is_same<LargeStringArray, T>::value,
+ void>::type
+ WriteDataValues(const T& arr) {
+ for (int64_t i = 0; i < arr.length(); ++i) {
+ auto view = arr.GetView(i);
+ writer_->String(view.data(), static_cast<rj::SizeType>(view.size()));
}
}
@@ -558,8 +568,10 @@ class ArrayWriter {
}
template <typename T>
- typename std::enable_if<std::is_base_of<BinaryArray, T>::value, Status>::type Visit(
- const T& array) {
+ typename std::enable_if<std::is_base_of<BinaryArray, T>::value ||
+ std::is_base_of<LargeBinaryArray, T>::value,
+ Status>::type
+ Visit(const T& array) {
WriteValidityField(array);
WriteIntegerField("OFFSET", array.raw_value_offsets(), array.length() + 1);
WriteDataField(array);
@@ -911,6 +923,10 @@ static Status GetType(const RjObject& json_type,
*type = utf8();
} else if (type_name == "binary") {
*type = binary();
+ } else if (type_name == "large_utf8") {
+ *type = large_utf8();
+ } else if (type_name == "large_binary") {
+ *type = large_binary();
} else if (type_name == "fixedsizebinary") {
return GetFixedSizeBinary(json_type, type);
} else if (type_name == "decimal") {
@@ -1091,9 +1107,10 @@ class ArrayReader {
}
template <typename T>
- typename std::enable_if<std::is_base_of<BinaryType, T>::value, Status>::type Visit(
+ typename std::enable_if<std::is_base_of<BaseBinaryType, T>::value, Status>::type Visit(
const T& type) {
typename TypeTraits<T>::BuilderType builder(pool_);
+ using offset_type = typename T::offset_type;
const auto& json_data = obj_.FindMember(kData);
RETURN_NOT_ARRAY(kData, json_data, obj_);
@@ -1110,23 +1127,27 @@ class ArrayReader {
const rj::Value& val = json_data_arr[i];
DCHECK(val.IsString());
- if (std::is_base_of<StringType, T>::value) {
+
+ if (T::is_utf8) {
RETURN_NOT_OK(builder.Append(val.GetString()));
} else {
std::string hex_string = val.GetString();
- DCHECK(hex_string.size() % 2 == 0) << "Expected base16 hex string";
- int32_t length = static_cast<int>(hex_string.size()) / 2;
+ if (hex_string.size() % 2 != 0) {
+ return Status::Invalid("Expected base16 hex string");
+ }
+ const auto value_len = static_cast<int64_t>(hex_string.size()) / 2;
std::shared_ptr<Buffer> byte_buffer;
- RETURN_NOT_OK(AllocateBuffer(pool_, length, &byte_buffer));
+ RETURN_NOT_OK(AllocateBuffer(pool_, value_len, &byte_buffer));
const char* hex_data = hex_string.c_str();
uint8_t* byte_buffer_data = byte_buffer->mutable_data();
- for (int32_t j = 0; j < length; ++j) {
+ for (int64_t j = 0; j < value_len; ++j) {
RETURN_NOT_OK(ParseHexValue(hex_data + j * 2, &byte_buffer_data[j]));
}
- RETURN_NOT_OK(builder.Append(byte_buffer_data, length));
+ RETURN_NOT_OK(
+ builder.Append(byte_buffer_data, static_cast<offset_type>(value_len)));
}
}
diff --git a/cpp/src/arrow/ipc/json-simple-test.cc b/cpp/src/arrow/ipc/json-simple-test.cc
index ce8b21a84..b5f68e0c7 100644
--- a/cpp/src/arrow/ipc/json-simple-test.cc
+++ b/cpp/src/arrow/ipc/json-simple-test.cc
@@ -322,6 +322,21 @@ TEST(TestString, Basics) {
AssertJSONArray<BinaryType, std::string>(type, "[\"\\u0000\\u001f\"]", {s});
}
+TEST(TestLargeString, Basics) {
+ // Similar to TestString above, only testing the basics.
+ // The same `type` variable is reused for the large_binary case below.
+ std::shared_ptr<DataType> type = large_utf8();
+
+ AssertJSONArray<LargeStringType, std::string>(type, "[\"\", \"foo\"]", {"", "foo"});
+ AssertJSONArray<LargeStringType, std::string>(type, "[\"\", null]", {true, false},
+ {"", ""});
+
+ // Large binary type
+ type = large_binary();
+ AssertJSONArray<LargeBinaryType, std::string>(type, "[\"\", \"foo\", null]",
+ {true, true, false}, {"", "foo", ""});
+}
+
TEST(TestTimestamp, Basics) {
// Timestamp type
auto type = timestamp(TimeUnit::SECOND);
diff --git a/cpp/src/arrow/ipc/json-simple.cc b/cpp/src/arrow/ipc/json-simple.cc
index ae01bcc4b..ce0d2c53c 100644
--- a/cpp/src/arrow/ipc/json-simple.cc
+++ b/cpp/src/arrow/ipc/json-simple.cc
@@ -26,6 +26,7 @@
#include "arrow/ipc/json-internal.h"
#include "arrow/ipc/json-simple.h"
#include "arrow/memory_pool.h"
+#include "arrow/type_traits.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/decimal.h"
#include "arrow/util/logging.h"
@@ -344,11 +345,14 @@ class TimestampConverter final : public ConcreteConverter<TimestampConverter> {
// ------------------------------------------------------------------------
// Converter for binary and string arrays
-class StringConverter final : public ConcreteConverter<StringConverter> {
+template <typename TYPE>
+class StringConverter final : public ConcreteConverter<StringConverter<TYPE>> {
public:
+ using BuilderType = typename TypeTraits<TYPE>::BuilderType;
+
explicit StringConverter(const std::shared_ptr<DataType>& type) {
this->type_ = type;
- builder_ = std::make_shared<BinaryBuilder>(type, default_memory_pool());
+ builder_ = std::make_shared<BuilderType>(type, default_memory_pool());
}
Status AppendNull() override { return builder_->AppendNull(); }
@@ -368,7 +372,7 @@ class StringConverter final : public ConcreteConverter<StringConverter> {
std::shared_ptr<ArrayBuilder> builder() override { return builder_; }
private:
- std::shared_ptr<BinaryBuilder> builder_;
+ std::shared_ptr<BuilderType> builder_;
};
// ------------------------------------------------------------------------
@@ -732,8 +736,10 @@ Status GetConverter(const std::shared_ptr<DataType>& type,
SIMPLE_CONVERTER_CASE(Type::MAP, MapConverter)
SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_LIST, FixedSizeListConverter)
SIMPLE_CONVERTER_CASE(Type::STRUCT, StructConverter)
- SIMPLE_CONVERTER_CASE(Type::STRING, StringConverter)
- SIMPLE_CONVERTER_CASE(Type::BINARY, StringConverter)
+ SIMPLE_CONVERTER_CASE(Type::STRING, StringConverter<StringType>)
+ SIMPLE_CONVERTER_CASE(Type::BINARY, StringConverter<BinaryType>)
+ SIMPLE_CONVERTER_CASE(Type::LARGE_STRING, StringConverter<LargeStringType>)
+ SIMPLE_CONVERTER_CASE(Type::LARGE_BINARY, StringConverter<LargeBinaryType>)
SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter)
SIMPLE_CONVERTER_CASE(Type::DECIMAL, DecimalConverter)
SIMPLE_CONVERTER_CASE(Type::UNION, UnionConverter)
diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc
index e505ddeca..93f859a0a 100644
--- a/cpp/src/arrow/ipc/metadata-internal.cc
+++ b/cpp/src/arrow/ipc/metadata-internal.cc
@@ -232,6 +232,9 @@ Status ConcreteTypeFromFlatbuffer(flatbuf::Type type, const void* type_data,
case flatbuf::Type_Binary:
*out = binary();
return Status::OK();
+ case flatbuf::Type_LargeBinary:
+ *out = large_binary();
+ return Status::OK();
case flatbuf::Type_FixedSizeBinary: {
auto fw_binary = static_cast<const flatbuf::FixedSizeBinary*>(type_data);
*out = fixed_size_binary(fw_binary->byteWidth());
@@ -240,6 +243,9 @@ Status ConcreteTypeFromFlatbuffer(flatbuf::Type type, const void* type_data,
case flatbuf::Type_Utf8:
*out = utf8();
return Status::OK();
+ case flatbuf::Type_LargeUtf8:
+ *out = large_utf8();
+ return Status::OK();
case flatbuf::Type_Bool:
*out = boolean();
return Status::OK();
@@ -541,12 +547,24 @@ class FieldToFlatbufferVisitor {
return Status::OK();
}
+ Status Visit(const LargeBinaryType& type) {
+ fb_type_ = flatbuf::Type_LargeBinary;
+ type_offset_ = flatbuf::CreateLargeBinary(fbb_).Union();
+ return Status::OK();
+ }
+
Status Visit(const StringType& type) {
fb_type_ = flatbuf::Type_Utf8;
type_offset_ = flatbuf::CreateUtf8(fbb_).Union();
return Status::OK();
}
+ Status Visit(const LargeStringType& type) {
+ fb_type_ = flatbuf::Type_LargeUtf8;
+ type_offset_ = flatbuf::CreateLargeUtf8(fbb_).Union();
+ return Status::OK();
+ }
+
Status Visit(const Date32Type& type) {
fb_type_ = flatbuf::Type_Date;
type_offset_ = flatbuf::CreateDate(fbb_, flatbuf::DateUnit_DAY).Union();
diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc
index c39f2d714..b9f29d747 100644
--- a/cpp/src/arrow/ipc/reader.cc
+++ b/cpp/src/arrow/ipc/reader.cc
@@ -249,8 +249,10 @@ class ArrayLoader {
}
template <typename T>
- typename std::enable_if<std::is_base_of<BinaryType, T>::value, Status>::type Visit(
- const T& type) {
+ typename std::enable_if<std::is_base_of<BinaryType, T>::value ||
+ std::is_base_of<LargeBinaryType, T>::value,
+ Status>::type
+ Visit(const T& type) {
return LoadBinary<T>();
}
diff --git a/cpp/src/arrow/ipc/test-common.cc b/cpp/src/arrow/ipc/test-common.cc
index 47c307659..4cf13ecc0 100644
--- a/cpp/src/arrow/ipc/test-common.cc
+++ b/cpp/src/arrow/ipc/test-common.cc
@@ -34,6 +34,7 @@
#include "arrow/testing/random.h"
#include "arrow/testing/util.h"
#include "arrow/type.h"
+#include "arrow/type_traits.h"
#include "arrow/util/bit-util.h"
namespace arrow {
@@ -205,18 +206,16 @@ Status MakeRandomStringArray(int64_t length, bool include_nulls, MemoryPool* poo
return builder.Finish(out);
}
-template <class Builder, class RawType>
+template <class BuilderType>
static Status MakeBinaryArrayWithUniqueValues(int64_t length, bool include_nulls,
MemoryPool* pool,
std::shared_ptr<Array>* out) {
- Builder builder(pool);
+ BuilderType builder(pool);
for (int64_t i = 0; i < length; ++i) {
if (include_nulls && (i % 7 == 0)) {
RETURN_NOT_OK(builder.AppendNull());
} else {
- const std::string value = std::to_string(i);
- RETURN_NOT_OK(builder.Append(reinterpret_cast<const RawType*>(value.data()),
- static_cast<int32_t>(value.size())));
+ RETURN_NOT_OK(builder.Append(std::to_string(i)));
}
}
return builder.Finish(out);
@@ -224,28 +223,37 @@ static Status MakeBinaryArrayWithUniqueValues(int64_t length, bool include_nulls
Status MakeStringTypesRecordBatch(std::shared_ptr<RecordBatch>* out, bool with_nulls) {
const int64_t length = 500;
- auto string_type = utf8();
- auto binary_type = binary();
- auto f0 = field("f0", string_type);
- auto f1 = field("f1", binary_type);
- auto schema = ::arrow::schema({f0, f1});
+ auto f0 = field("strings", utf8());
+ auto f1 = field("binaries", binary());
+ auto f2 = field("large_strings", large_utf8());
+ auto f3 = field("large_binaries", large_binary());
+ auto schema = ::arrow::schema({f0, f1, f2, f3});
- std::shared_ptr<Array> a0, a1;
+ std::shared_ptr<Array> a0, a1, a2, a3;
MemoryPool* pool = default_memory_pool();
// Quirk with RETURN_NOT_OK macro and templated functions
{
- auto s = MakeBinaryArrayWithUniqueValues<StringBuilder, char>(length, with_nulls,
- pool, &a0);
+ auto s =
+ MakeBinaryArrayWithUniqueValues<StringBuilder>(length, with_nulls, pool, &a0);
RETURN_NOT_OK(s);
}
-
{
- auto s = MakeBinaryArrayWithUniqueValues<BinaryBuilder, uint8_t>(length, with_nulls,
- pool, &a1);
+ auto s =
+ MakeBinaryArrayWithUniqueValues<BinaryBuilder>(length, with_nulls, pool, &a1);
RETURN_NOT_OK(s);
}
- *out = RecordBatch::Make(schema, length, {a0, a1});
+ {
+ auto s = MakeBinaryArrayWithUniqueValues<LargeStringBuilder>(length, with_nulls, pool,
+ &a2);
+ RETURN_NOT_OK(s);
+ }
+ {
+ auto s = MakeBinaryArrayWithUniqueValues<LargeBinaryBuilder>(length, with_nulls, pool,
+ &a3);
+ RETURN_NOT_OK(s);
+ }
+ *out = RecordBatch::Make(schema, length, {a0, a1, a2, a3});
return Status::OK();
}
diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc
index e1c2ecacb..ec372074d 100644
--- a/cpp/src/arrow/ipc/writer.cc
+++ b/cpp/src/arrow/ipc/writer.cc
@@ -225,7 +225,8 @@ class RecordBatchSerializer : public ArrayVisitor {
template <typename ArrayType>
Status GetZeroBasedValueOffsets(const ArrayType& array,
std::shared_ptr<Buffer>* value_offsets) {
- // Share slicing logic between ListArray and BinaryArray
+ // Share slicing logic between ListArray, BinaryArray and LargeBinaryArray
+ using offset_type = typename ArrayType::offset_type;
auto offsets = array.value_offsets();
@@ -235,11 +236,12 @@ class RecordBatchSerializer : public ArrayVisitor {
// b) slice the values array accordingly
std::shared_ptr<Buffer> shifted_offsets;
- RETURN_NOT_OK(AllocateBuffer(pool_, sizeof(int32_t) * (array.length() + 1),
+ RETURN_NOT_OK(AllocateBuffer(pool_, sizeof(offset_type) * (array.length() + 1),
&shifted_offsets));
- int32_t* dest_offsets = reinterpret_cast<int32_t*>(shifted_offsets->mutable_data());
- const int32_t start_offset = array.value_offset(0);
+ offset_type* dest_offsets =
+ reinterpret_cast<offset_type*>(shifted_offsets->mutable_data());
+ const offset_type start_offset = array.value_offset(0);
for (int i = 0; i < array.length(); ++i) {
dest_offsets[i] = array.value_offset(i) - start_offset;
@@ -253,9 +255,10 @@ class RecordBatchSerializer : public ArrayVisitor {
return Status::OK();
}
- Status VisitBinary(const BinaryArray& array) {
+ template <typename ArrayType>
+ Status VisitBinary(const ArrayType& array) {
std::shared_ptr<Buffer> value_offsets;
- RETURN_NOT_OK(GetZeroBasedValueOffsets<BinaryArray>(array, &value_offsets));
+ RETURN_NOT_OK(GetZeroBasedValueOffsets<ArrayType>(array, &value_offsets));
auto data = array.value_data();
int64_t total_data_bytes = 0;
@@ -343,6 +346,10 @@ class RecordBatchSerializer : public ArrayVisitor {
Status Visit(const BinaryArray& array) override { return VisitBinary(array); }
+ Status Visit(const LargeStringArray& array) override { return VisitBinary(array); }
+
+ Status Visit(const LargeBinaryArray& array) override { return VisitBinary(array); }
+
Status Visit(const ListArray& array) override { return VisitList(array); }
Status Visit(const MapArray& array) override { return VisitList(array); }
diff --git a/cpp/src/arrow/json/converter-test.cc b/cpp/src/arrow/json/converter-test.cc
index 86e8e8dc8..cf09e617d 100644
--- a/cpp/src/arrow/json/converter-test.cc
+++ b/cpp/src/arrow/json/converter-test.cc
@@ -85,6 +85,11 @@ TEST(ConverterTest, String) {
AssertConvert(utf8(), src, src);
}
+TEST(ConverterTest, LargeString) {
+ std::string src = R"(["a", "b c", null, "d e f", "g"])";
+ AssertConvert(large_utf8(), src, src);
+}
+
TEST(ConverterTest, Timestamp) {
std::string src = R"([null, "1970-01-01", "2018-11-13 17:11:10"])";
AssertConvert(timestamp(TimeUnit::SECOND), src, src);
diff --git a/cpp/src/arrow/json/converter.cc b/cpp/src/arrow/json/converter.cc
index 078e31418..6b7b73086 100644
--- a/cpp/src/arrow/json/converter.cc
+++ b/cpp/src/arrow/json/converter.cc
@@ -264,6 +264,8 @@ Status MakeConverter(const std::shared_ptr<DataType>& out_type, MemoryPool* pool
CONVERTER_CASE(Type::DATE64, DateTimeConverter<Date64Type>);
CONVERTER_CASE(Type::BINARY, BinaryConverter<BinaryType>);
CONVERTER_CASE(Type::STRING, BinaryConverter<StringType>);
+ CONVERTER_CASE(Type::LARGE_BINARY, BinaryConverter<LargeBinaryType>);
+ CONVERTER_CASE(Type::LARGE_STRING, BinaryConverter<LargeStringType>);
default:
return Status::NotImplemented("JSON conversion to ", *out_type,
" is not supported");
diff --git a/cpp/src/arrow/pretty_print-test.cc b/cpp/src/arrow/pretty_print-test.cc
index c77a92b7f..cdb230c6c 100644
--- a/cpp/src/arrow/pretty_print-test.cc
+++ b/cpp/src/arrow/pretty_print-test.cc
@@ -155,6 +155,7 @@ TEST_F(TestPrettyPrint, PrimitiveType) {
null
])expected";
CheckPrimitive<StringType, std::string>({0, 10}, is_valid, values3, ex3);
+ CheckPrimitive<LargeStringType, std::string>({0, 10}, is_valid, values3, ex3);
static const char* ex3_in2 = R"expected( [
"foo",
"bar",
@@ -163,6 +164,7 @@ TEST_F(TestPrettyPrint, PrimitiveType) {
null
])expected";
CheckPrimitive<StringType, std::string>({2, 10}, is_valid, values3, ex3_in2);
+ CheckPrimitive<LargeStringType, std::string>({2, 10}, is_valid, values3, ex3_in2);
}
TEST_F(TestPrettyPrint, Int8) {
@@ -338,9 +340,11 @@ TEST_F(TestPrettyPrint, BinaryType) {
std::vector<std::string> values = {"foo", "bar", "", "baz", "", "\xff"};
static const char* ex = "[\n 666F6F,\n 626172,\n null,\n 62617A,\n ,\n FF\n]";
CheckPrimitive<BinaryType, std::string>({0}, is_valid, values, ex);
+ CheckPrimitive<LargeBinaryType, std::string>({0}, is_valid, values, ex);
static const char* ex_in2 =
" [\n 666F6F,\n 626172,\n null,\n 62617A,\n ,\n FF\n ]";
CheckPrimitive<BinaryType, std::string>({2}, is_valid, values, ex_in2);
+ CheckPrimitive<LargeBinaryType, std::string>({2}, is_valid, values, ex_in2);
}
TEST_F(TestPrettyPrint, ListType) {
diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc
index 6caef1714..5a54e13b8 100644
--- a/cpp/src/arrow/pretty_print.cc
+++ b/cpp/src/arrow/pretty_print.cc
@@ -205,7 +205,9 @@ class ArrayPrinter : public PrettyPrinter {
// String (Utf8)
template <typename T>
- inline typename std::enable_if<std::is_same<StringArray, T>::value, Status>::type
+ inline typename std::enable_if<std::is_same<StringArray, T>::value ||
+ std::is_same<LargeStringArray, T>::value,
+ Status>::type
WriteDataValues(const T& array) {
WriteValues(array, [&](int64_t i) { (*sink_) << "\"" << array.GetView(i) << "\""; });
return Status::OK();
@@ -213,7 +215,9 @@ class ArrayPrinter : public PrettyPrinter {
// Binary
template <typename T>
- inline typename std::enable_if<std::is_same<BinaryArray, T>::value, Status>::type
+ inline typename std::enable_if<std::is_same<BinaryArray, T>::value ||
+ std::is_same<LargeBinaryArray, T>::value,
+ Status>::type
WriteDataValues(const T& array) {
WriteValues(array, [&](int64_t i) { (*sink_) << HexEncode(array.GetView(i)); });
return Status::OK();
@@ -314,6 +318,7 @@ class ArrayPrinter : public PrettyPrinter {
typename std::enable_if<std::is_base_of<PrimitiveArray, T>::value ||
std::is_base_of<FixedSizeBinaryArray, T>::value ||
std::is_base_of<BinaryArray, T>::value ||
+ std::is_base_of<LargeBinaryArray, T>::value ||
std::is_base_of<ListArray, T>::value ||
std::is_base_of<MapArray, T>::value ||
std::is_base_of<FixedSizeListArray, T>::value,
diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h
index 4f0589a2f..76aecd01f 100644
--- a/cpp/src/arrow/scalar.h
+++ b/cpp/src/arrow/scalar.h
@@ -91,20 +91,22 @@ struct NumericScalar : public internal::PrimitiveScalar {
: internal::PrimitiveScalar{type, is_valid}, value(value) {}
};
-struct ARROW_EXPORT BinaryScalar : public Scalar {
+template <typename Type>
+struct BaseBinaryScalar : public Scalar {
std::shared_ptr<Buffer> value;
- explicit BinaryScalar(const std::shared_ptr<Buffer>& value, bool is_valid = true)
- : BinaryScalar(value, binary(), is_valid) {}
protected:
- BinaryScalar(const std::shared_ptr<Buffer>& value,
- const std::shared_ptr<DataType>& type, bool is_valid = true)
+ BaseBinaryScalar(const std::shared_ptr<Buffer>& value,
+ const std::shared_ptr<DataType>& type, bool is_valid = true)
: Scalar{type, is_valid}, value(value) {}
};
-struct ARROW_EXPORT FixedSizeBinaryScalar : public BinaryScalar {
- FixedSizeBinaryScalar(const std::shared_ptr<Buffer>& value,
- const std::shared_ptr<DataType>& type, bool is_valid = true);
+struct ARROW_EXPORT BinaryScalar : public BaseBinaryScalar<BinaryType> {
+ explicit BinaryScalar(const std::shared_ptr<Buffer>& value, bool is_valid = true)
+ : BaseBinaryScalar(value, binary(), is_valid) {}
+
+ protected:
+ using BaseBinaryScalar::BaseBinaryScalar;
};
struct ARROW_EXPORT StringScalar : public BinaryScalar {
@@ -112,6 +114,24 @@ struct ARROW_EXPORT StringScalar : public BinaryScalar {
: BinaryScalar(value, utf8(), is_valid) {}
};
+struct ARROW_EXPORT LargeBinaryScalar : public BaseBinaryScalar<LargeBinaryType> {
+ explicit LargeBinaryScalar(const std::shared_ptr<Buffer>& value, bool is_valid = true)
+ : BaseBinaryScalar(value, large_binary(), is_valid) {}
+
+ protected:
+ using BaseBinaryScalar::BaseBinaryScalar;
+};
+
+// Scalar for the large_utf8 type: a LargeBinaryScalar tagged with large_utf8()
+// (not utf8(), which is the distinct 32-bit-offset string type).
+struct ARROW_EXPORT LargeStringScalar : public LargeBinaryScalar {
+ explicit LargeStringScalar(const std::shared_ptr<Buffer>& value, bool is_valid = true)
+ : LargeBinaryScalar(value, large_utf8(), is_valid) {}
+};
+
+struct ARROW_EXPORT FixedSizeBinaryScalar : public BinaryScalar {
+ FixedSizeBinaryScalar(const std::shared_ptr<Buffer>& value,
+ const std::shared_ptr<DataType>& type, bool is_valid = true);
+};
+
class ARROW_EXPORT Date32Scalar : public NumericScalar<Date32Type> {
public:
using NumericScalar<Date32Type>::NumericScalar;
diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc
index f693a4535..1e26a4420 100644
--- a/cpp/src/arrow/testing/random.cc
+++ b/cpp/src/arrow/testing/random.cc
@@ -145,24 +145,30 @@ PRIMITIVE_RAND_FLOAT_IMPL(Float64, double, DoubleType)
#undef PRIMITIVE_RAND_FLOAT_IMPL
#undef PRIMITIVE_RAND_IMPL
-std::shared_ptr<arrow::Array> RandomArrayGenerator::String(int64_t size,
- int32_t min_length,
- int32_t max_length,
- double null_probability) {
+template <typename TypeClass>
+static std::shared_ptr<arrow::Array> GenerateBinaryArray(RandomArrayGenerator* gen,
+ int64_t size, int32_t min_length,
+ int32_t max_length,
+ double null_probability) {
+ using offset_type = typename TypeClass::offset_type;
+ using BuilderType = typename TypeTraits<TypeClass>::BuilderType;
+ using OffsetArrowType = typename CTypeTraits<offset_type>::ArrowType;
+ using OffsetArrayType = typename TypeTraits<OffsetArrowType>::ArrayType;
+
if (null_probability < 0 || null_probability > 1) {
ABORT_NOT_OK(Status::Invalid("null_probability must be between 0 and 1"));
}
- auto int32_lengths = Int32(size, min_length, max_length, null_probability);
- auto lengths = std::dynamic_pointer_cast<Int32Array>(int32_lengths);
+ auto lengths = std::dynamic_pointer_cast<OffsetArrayType>(
+ gen->Numeric<OffsetArrowType>(size, min_length, max_length, null_probability));
// Visual Studio does not implement uniform_int_distribution for char types.
using GenOpt = GenerateOptions<uint8_t, std::uniform_int_distribution<uint16_t>>;
- GenOpt options(seed(), static_cast<uint8_t>('A'), static_cast<uint8_t>('z'),
+ GenOpt options(gen->seed(), static_cast<uint8_t>('A'), static_cast<uint8_t>('z'),
/*null_probability=*/0);
std::vector<uint8_t> str_buffer(max_length);
- StringBuilder builder;
+ BuilderType builder;
for (int64_t i = 0; i < size; ++i) {
if (lengths->IsValid(i)) {
@@ -178,6 +184,22 @@ std::shared_ptr<arrow::Array> RandomArrayGenerator::String(int64_t size,
return result;
}
+std::shared_ptr<arrow::Array> RandomArrayGenerator::String(int64_t size,
+ int32_t min_length,
+ int32_t max_length,
+ double null_probability) {
+ return GenerateBinaryArray<StringType>(this, size, min_length, max_length,
+ null_probability);
+}
+
+std::shared_ptr<arrow::Array> RandomArrayGenerator::LargeString(int64_t size,
+ int32_t min_length,
+ int32_t max_length,
+ double null_probability) {
+ return GenerateBinaryArray<LargeStringType>(this, size, min_length, max_length,
+ null_probability);
+}
+
std::shared_ptr<arrow::Array> RandomArrayGenerator::StringWithRepeats(
int64_t size, int64_t unique, int32_t min_length, int32_t max_length,
double null_probability) {
diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h
index 6b188fd57..fc8c2d2fb 100644
--- a/cpp/src/arrow/testing/random.h
+++ b/cpp/src/arrow/testing/random.h
@@ -214,6 +214,19 @@ class ARROW_EXPORT RandomArrayGenerator {
std::shared_ptr<arrow::Array> String(int64_t size, int32_t min_length,
int32_t max_length, double null_probability);
+ /// \brief Generates a random LargeStringArray
+ ///
+ /// \param[in] size the size of the array to generate
+ /// \param[in] min_length the lower bound of the string length
+ /// determined by the uniform distribution
+ /// \param[in] max_length the upper bound of the string length
+ /// determined by the uniform distribution
+ /// \param[in] null_probability the probability of a row being null
+ ///
+ /// \return a generated Array
+ std::shared_ptr<arrow::Array> LargeString(int64_t size, int32_t min_length,
+ int32_t max_length, double null_probability);
+
/// \brief Generates a random StringArray with repeated values
///
/// \param[in] size the size of the array to generate
@@ -230,9 +243,9 @@ class ARROW_EXPORT RandomArrayGenerator {
int32_t min_length, int32_t max_length,
double null_probability);
- private:
SeedType seed() { return seed_distribution_(seed_rng_); }
+ private:
std::uniform_int_distribution<SeedType> seed_distribution_;
std::default_random_engine seed_rng_;
};
diff --git a/cpp/src/arrow/type-test.cc b/cpp/src/arrow/type-test.cc
index 7ad1d8ad0..7bfb72001 100644
--- a/cpp/src/arrow/type-test.cc
+++ b/cpp/src/arrow/type-test.cc
@@ -354,6 +354,20 @@ TEST(TestStringType, ToString) {
ASSERT_EQ(str.ToString(), std::string("string"));
}
+TEST(TestLargeBinaryTypes, ToString) {
+ BinaryType bt1;
+ LargeBinaryType t1;
+ LargeBinaryType e1;
+ LargeStringType t2;
+ EXPECT_TRUE(t1.Equals(e1));
+ EXPECT_FALSE(t1.Equals(t2));
+ EXPECT_FALSE(t1.Equals(bt1));
+ ASSERT_EQ(t1.id(), Type::LARGE_BINARY);
+ ASSERT_EQ(t1.ToString(), std::string("large_binary"));
+ ASSERT_EQ(t2.id(), Type::LARGE_STRING);
+ ASSERT_EQ(t2.ToString(), std::string("large_string"));
+}
+
TEST(TestFixedSizeBinaryType, ToString) {
auto t = fixed_size_binary(10);
ASSERT_EQ(t->id(), Type::FIXED_SIZE_BINARY);
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index 54e0103fb..4397bf297 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -143,8 +143,6 @@ FloatingPointType::Precision DoubleType::precision() const {
return FloatingPointType::DOUBLE;
}
-std::string StringType::ToString() const { return std::string("string"); }
-
std::string ListType::ToString() const {
std::stringstream s;
s << "list<" << value_field()->ToString() << ">";
@@ -178,7 +176,13 @@ std::string FixedSizeListType::ToString() const {
return s.str();
}
-std::string BinaryType::ToString() const { return std::string("binary"); }
+std::string BinaryType::ToString() const { return "binary"; }
+
+std::string LargeBinaryType::ToString() const { return "large_binary"; }
+
+std::string StringType::ToString() const { return "string"; }
+
+std::string LargeStringType::ToString() const { return "large_string"; }
int FixedSizeBinaryType::bit_width() const { return CHAR_BIT * byte_width(); }
@@ -667,7 +671,9 @@ TYPE_FACTORY(float16, HalfFloatType)
TYPE_FACTORY(float32, FloatType)
TYPE_FACTORY(float64, DoubleType)
TYPE_FACTORY(utf8, StringType)
+TYPE_FACTORY(large_utf8, LargeStringType)
TYPE_FACTORY(binary, BinaryType)
+TYPE_FACTORY(large_binary, LargeBinaryType)
TYPE_FACTORY(date64, Date64Type)
TYPE_FACTORY(date32, Date32Type)
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index fc235bb2d..572b888df 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -143,7 +143,13 @@ struct Type {
/// Measure of elapsed time in either seconds, milliseconds, microseconds
/// or nanoseconds.
- DURATION
+ DURATION,
+
+ /// Like STRING, but with 64-bit offsets
+ LARGE_STRING,
+
+ /// Like BINARY, but with 64-bit offsets
+ LARGE_BINARY
};
};
@@ -472,6 +478,7 @@ class ARROW_EXPORT DoubleType
class ARROW_EXPORT ListType : public NestedType {
public:
static constexpr Type::type type_id = Type::LIST;
+ using offset_type = int32_t;
// List can contain any other logical value type
explicit ListType(const std::shared_ptr<DataType>& value_type)
@@ -486,7 +493,7 @@ class ARROW_EXPORT ListType : public NestedType {
std::shared_ptr<DataType> value_type() const { return children_[0]->type(); }
DataTypeLayout layout() const override {
- return {{1, CHAR_BIT * sizeof(int32_t)}, false};
+ return {{1, CHAR_BIT * sizeof(offset_type)}, false};
}
std::string ToString() const override;
@@ -550,23 +557,78 @@ class ARROW_EXPORT FixedSizeListType : public NestedType {
int32_t list_size_;
};
+/// \brief Common base class for all variable-size binary data types
+class ARROW_EXPORT BaseBinaryType : public DataType, public NoExtraMeta {
+ public:
+ using DataType::DataType;  // inherit DataType's constructors unchanged
+};
+
/// \brief Concrete type class for variable-size binary data
-class ARROW_EXPORT BinaryType : public DataType, public NoExtraMeta {
+class ARROW_EXPORT BinaryType : public BaseBinaryType {
public:
static constexpr Type::type type_id = Type::BINARY;
+ static constexpr bool is_utf8 = false;
+ using offset_type = int32_t;
BinaryType() : BinaryType(Type::BINARY) {}
DataTypeLayout layout() const override {
- return {{1, CHAR_BIT * sizeof(int32_t), DataTypeLayout::kVariableSizeBuffer}, false};
+ return {{1, CHAR_BIT * sizeof(offset_type), DataTypeLayout::kVariableSizeBuffer},
+ false};
}
std::string ToString() const override;
std::string name() const override { return "binary"; }
protected:
- // Allow subclasses to change the logical type.
- explicit BinaryType(Type::type logical_type) : DataType(logical_type) {}
+ // Allow subclasses like StringType to change the logical type.
+ explicit BinaryType(Type::type logical_type) : BaseBinaryType(logical_type) {}
+};
+
+/// \brief Concrete type class for large variable-size binary data (64-bit offsets)
+class ARROW_EXPORT LargeBinaryType : public BaseBinaryType {
+ public:
+ static constexpr Type::type type_id = Type::LARGE_BINARY;
+ static constexpr bool is_utf8 = false;
+ using offset_type = int64_t;  // offsets are 64-bit signed integers
+
+ LargeBinaryType() : LargeBinaryType(Type::LARGE_BINARY) {}
+
+ DataTypeLayout layout() const override {
+ return {{1, CHAR_BIT * sizeof(offset_type), DataTypeLayout::kVariableSizeBuffer},
+ false};  // the middle entry above is the bit width of offset_type
+ }
+
+ std::string ToString() const override;
+ std::string name() const override { return "large_binary"; }
+
+ protected:
+ // Allow subclasses like LargeStringType to change the logical type id.
+ explicit LargeBinaryType(Type::type logical_type) : BaseBinaryType(logical_type) {}
+};
+
+/// \brief Concrete type class for variable-size string data, utf8-encoded
+class ARROW_EXPORT StringType : public BinaryType {
+ public:
+ static constexpr Type::type type_id = Type::STRING;
+ static constexpr bool is_utf8 = true;
+
+ StringType() : BinaryType(Type::STRING) {}  // forwards the STRING id to BinaryType
+
+ std::string ToString() const override;
+ std::string name() const override { return "utf8"; }
+};
+
+/// \brief Concrete type class for large variable-size string data, utf8-encoded
+class ARROW_EXPORT LargeStringType : public LargeBinaryType {
+ public:
+ static constexpr Type::type type_id = Type::LARGE_STRING;
+ static constexpr bool is_utf8 = true;
+
+ LargeStringType() : LargeBinaryType(Type::LARGE_STRING) {}  // forwards LARGE_STRING id
+
+ std::string ToString() const override;
+ std::string name() const override { return "large_utf8"; }
};
/// \brief Concrete type class for fixed-size binary data
@@ -591,17 +653,6 @@ class ARROW_EXPORT FixedSizeBinaryType : public FixedWidthType, public Parametri
int32_t byte_width_;
};
-/// \brief Concrete type class for variable-size string data, utf8-encoded
-class ARROW_EXPORT StringType : public BinaryType {
- public:
- static constexpr Type::type type_id = Type::STRING;
-
- StringType() : BinaryType(Type::STRING) {}
-
- std::string ToString() const override;
- std::string name() const override { return "utf8"; }
-};
-
/// \brief Concrete type class for struct data
class ARROW_EXPORT StructType : public NestedType {
public:
diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h
index c42d66152..9935af511 100644
--- a/cpp/src/arrow/type_fwd.h
+++ b/cpp/src/arrow/type_fwd.h
@@ -65,6 +65,11 @@ class BinaryArray;
class BinaryBuilder;
struct BinaryScalar;
+class LargeBinaryType;
+class LargeBinaryArray;
+class LargeBinaryBuilder;
+struct LargeBinaryScalar;
+
class FixedSizeBinaryType;
class FixedSizeBinaryArray;
class FixedSizeBinaryBuilder;
@@ -75,6 +80,11 @@ class StringArray;
class StringBuilder;
struct StringScalar;
+class LargeStringType;
+class LargeStringArray;
+class LargeStringBuilder;
+struct LargeStringScalar;
+
class ListType;
class ListArray;
class ListBuilder;
@@ -218,8 +228,12 @@ std::shared_ptr<DataType> ARROW_EXPORT float32();
std::shared_ptr<DataType> ARROW_EXPORT float64();
/// \brief Return a StringType instance
std::shared_ptr<DataType> ARROW_EXPORT utf8();
+/// \brief Return a LargeStringType instance
+std::shared_ptr<DataType> ARROW_EXPORT large_utf8();
/// \brief Return a BinaryType instance
std::shared_ptr<DataType> ARROW_EXPORT binary();
+/// \brief Return a LargeBinaryType instance
+std::shared_ptr<DataType> ARROW_EXPORT large_binary();
/// \brief Return a Date32Type instance
std::shared_ptr<DataType> ARROW_EXPORT date32();
/// \brief Return a Date64Type instance
diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h
index 4902f5c63..50e1e725a 100644
--- a/cpp/src/arrow/type_traits.h
+++ b/cpp/src/arrow/type_traits.h
@@ -243,6 +243,15 @@ struct TypeTraits<BinaryType> {
static inline std::shared_ptr<DataType> type_singleton() { return binary(); }
};
+template <>
+struct TypeTraits<LargeBinaryType> {  // maps LargeBinaryType to its companion classes
+ using ArrayType = LargeBinaryArray;
+ using BuilderType = LargeBinaryBuilder;
+ using ScalarType = LargeBinaryScalar;
+ constexpr static bool is_parameter_free = true;
+ static inline std::shared_ptr<DataType> type_singleton() { return large_binary(); }
+};
+
template <>
struct TypeTraits<FixedSizeBinaryType> {
using ArrayType = FixedSizeBinaryArray;
@@ -260,6 +269,15 @@ struct TypeTraits<StringType> {
static inline std::shared_ptr<DataType> type_singleton() { return utf8(); }
};
+template <>
+struct TypeTraits<LargeStringType> {  // maps LargeStringType to its companion classes
+ using ArrayType = LargeStringArray;
+ using BuilderType = LargeStringBuilder;
+ using ScalarType = LargeStringScalar;
+ constexpr static bool is_parameter_free = true;
+ static inline std::shared_ptr<DataType> type_singleton() { return large_utf8(); }
+};
+
template <>
struct CTypeTraits<std::string> : public TypeTraits<StringType> {
using ArrowType = StringType;
@@ -361,6 +379,12 @@ struct is_8bit_int {
(std::is_same<UInt8Type, T>::value || std::is_same<Int8Type, T>::value);
};
+template <typename T>
+struct is_any_string_type {  // exact match only: T is StringType or LargeStringType
+ static constexpr bool value =
+ std::is_same<StringType, T>::value || std::is_same<LargeStringType, T>::value;
+};
+
template <typename T, typename R = void>
using enable_if_8bit_int = typename std::enable_if<is_8bit_int<T>::value, R>::type;
@@ -412,10 +436,18 @@ using enable_if_has_c_type = typename std::enable_if<has_c_type<T>::value, R>::t
template <typename T, typename R = void>
using enable_if_null = typename std::enable_if<std::is_same<NullType, T>::value, R>::type;
+template <typename T, typename R = void>
+using enable_if_base_binary =  // yields R when T is BaseBinaryType or derives from it
+ typename std::enable_if<std::is_base_of<BaseBinaryType, T>::value, R>::type;
+
template <typename T, typename R = void>
using enable_if_binary =
typename std::enable_if<std::is_base_of<BinaryType, T>::value, R>::type;
+template <typename T, typename R = void>
+using enable_if_large_binary =  // yields R when T is LargeBinaryType or derives from it
+ typename std::enable_if<std::is_base_of<LargeBinaryType, T>::value, R>::type;
+
template <typename T, typename R = void>
using enable_if_boolean =
typename std::enable_if<std::is_same<BooleanType, T>::value, R>::type;
@@ -574,6 +606,17 @@ static inline bool is_binary_like(Type::type type_id) {
return false;
}
+static inline bool is_large_binary_like(Type::type type_id) {
+ switch (type_id) {
+ case Type::LARGE_BINARY:
+ case Type::LARGE_STRING:
+ return true;  // only the two large (64-bit offset) type ids match
+ default:
+ break;
+ }
+ return false;  // every other type id
+}
+
static inline bool is_dictionary(Type::type type_id) {
return type_id == Type::DICTIONARY;
}
diff --git a/cpp/src/arrow/visitor.cc b/cpp/src/arrow/visitor.cc
index 53b341b53..2ec6c6421 100644
--- a/cpp/src/arrow/visitor.cc
+++ b/cpp/src/arrow/visitor.cc
@@ -47,6 +47,8 @@ ARRAY_VISITOR_DEFAULT(FloatArray)
ARRAY_VISITOR_DEFAULT(DoubleArray)
ARRAY_VISITOR_DEFAULT(BinaryArray)
ARRAY_VISITOR_DEFAULT(StringArray)
+ARRAY_VISITOR_DEFAULT(LargeBinaryArray)
+ARRAY_VISITOR_DEFAULT(LargeStringArray)
ARRAY_VISITOR_DEFAULT(FixedSizeBinaryArray)
ARRAY_VISITOR_DEFAULT(Date32Array)
ARRAY_VISITOR_DEFAULT(Date64Array)
@@ -90,6 +92,8 @@ TYPE_VISITOR_DEFAULT(FloatType)
TYPE_VISITOR_DEFAULT(DoubleType)
TYPE_VISITOR_DEFAULT(StringType)
TYPE_VISITOR_DEFAULT(BinaryType)
+TYPE_VISITOR_DEFAULT(LargeStringType)
+TYPE_VISITOR_DEFAULT(LargeBinaryType)
TYPE_VISITOR_DEFAULT(FixedSizeBinaryType)
TYPE_VISITOR_DEFAULT(Date64Type)
TYPE_VISITOR_DEFAULT(Date32Type)
@@ -134,6 +138,8 @@ SCALAR_VISITOR_DEFAULT(FloatScalar)
SCALAR_VISITOR_DEFAULT(DoubleScalar)
SCALAR_VISITOR_DEFAULT(StringScalar)
SCALAR_VISITOR_DEFAULT(BinaryScalar)
+SCALAR_VISITOR_DEFAULT(LargeStringScalar)
+SCALAR_VISITOR_DEFAULT(LargeBinaryScalar)
SCALAR_VISITOR_DEFAULT(FixedSizeBinaryScalar)
SCALAR_VISITOR_DEFAULT(Date64Scalar)
SCALAR_VISITOR_DEFAULT(Date32Scalar)
diff --git a/cpp/src/arrow/visitor.h b/cpp/src/arrow/visitor.h
index a4979e9ce..1c854c478 100644
--- a/cpp/src/arrow/visitor.h
+++ b/cpp/src/arrow/visitor.h
@@ -43,6 +43,8 @@ class ARROW_EXPORT ArrayVisitor {
virtual Status Visit(const DoubleArray& array);
virtual Status Visit(const StringArray& array);
virtual Status Visit(const BinaryArray& array);
+ virtual Status Visit(const LargeStringArray& array);
+ virtual Status Visit(const LargeBinaryArray& array);
virtual Status Visit(const FixedSizeBinaryArray& array);
virtual Status Visit(const Date32Array& array);
virtual Status Visit(const Date64Array& array);
@@ -81,6 +83,8 @@ class ARROW_EXPORT TypeVisitor {
virtual Status Visit(const DoubleType& type);
virtual Status Visit(const StringType& type);
virtual Status Visit(const BinaryType& type);
+ virtual Status Visit(const LargeStringType& type);
+ virtual Status Visit(const LargeBinaryType& type);
virtual Status Visit(const FixedSizeBinaryType& type);
virtual Status Visit(const Date64Type& type);
virtual Status Visit(const Date32Type& type);
@@ -119,6 +123,8 @@ class ARROW_EXPORT ScalarVisitor {
virtual Status Visit(const DoubleScalar& scalar);
virtual Status Visit(const StringScalar& scalar);
virtual Status Visit(const BinaryScalar& scalar);
+ virtual Status Visit(const LargeStringScalar& scalar);
+ virtual Status Visit(const LargeBinaryScalar& scalar);
virtual Status Visit(const FixedSizeBinaryScalar& scalar);
virtual Status Visit(const Date64Scalar& scalar);
virtual Status Visit(const Date32Scalar& scalar);
diff --git a/cpp/src/arrow/visitor_inline.h b/cpp/src/arrow/visitor_inline.h
index 544763a2f..3ed058e64 100644
--- a/cpp/src/arrow/visitor_inline.h
+++ b/cpp/src/arrow/visitor_inline.h
@@ -47,6 +47,8 @@ namespace arrow {
ACTION(Double); \
ACTION(String); \
ACTION(Binary); \
+ ACTION(LargeString); \
+ ACTION(LargeBinary); \
ACTION(FixedSizeBinary); \
ACTION(Duration); \
ACTION(Date32); \
@@ -186,12 +188,13 @@ struct ArrayDataVisitor<T, enable_if_has_c_type<T>> {
};
template <typename T>
-struct ArrayDataVisitor<T, enable_if_binary<T>> {
+struct ArrayDataVisitor<T, enable_if_base_binary<T>> {
template <typename Visitor>
static Status Visit(const ArrayData& arr, Visitor* visitor) {
+ using offset_type = typename T::offset_type;
constexpr uint8_t empty_value = 0;
- const int32_t* offsets = arr.GetValues<int32_t>(1);
+ const offset_type* offsets = arr.GetValues<offset_type>(1);
const uint8_t* data;
if (!arr.buffers[2]) {
data = &empty_value;
diff --git a/format/Schema.fbs b/format/Schema.fbs
index 36127925e..91aa9db48 100644
--- a/format/Schema.fbs
+++ b/format/Schema.fbs
@@ -103,13 +103,22 @@ table FloatingPoint {
precision: Precision;
}
-/// Unicode with UTF-8 encoding
+/// UTF-8 encoded Unicode strings. Items are limited to 32-bit byte lengths.
table Utf8 {
}
+/// Raw binary strings. Items are limited to 32-bit byte lengths.
table Binary {
}
+/// Variants of Utf8 and Binary with 64-bit byte lengths.
+/// These types are optional and may not be supported by all implementations.
+table LargeUtf8 {
+}
+
+table LargeBinary {
+}
+
table FixedSizeBinary {
/// Number of bytes per value
byteWidth: int;
@@ -235,6 +244,8 @@ union Type {
FixedSizeList,
Map,
Duration,
+ LargeBinary,
+ LargeUtf8,
}
/// ----------------------------------------------------------------------
diff --git a/rust/datafusion/Cargo.toml b/rust/datafusion/Cargo.toml
index 6e021e32e..3f16f79e1 100644
--- a/rust/datafusion/Cargo.toml
+++ b/rust/datafusion/Cargo.toml
@@ -58,4 +58,3 @@ criterion = "0.2.0"
[[bench]]
name = "aggregate_query_sql"
harness = false
-
diff --git a/testing b/testing
index a674dac19..d14764eff 160000
--- a/testing
+++ b/testing
@@ -1 +1 @@
-Subproject commit a674dac190c5fc626964c9b611c67552fa2e530d
+Subproject commit d14764eff71c51156bea2a7860f8df811d6c9f11
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment