Created
July 22, 2019 18:28
-
-
Save kszucs/2172802c04c43540068067692febab38 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/cpp/build-support/get_apache_mirror.py b/cpp/build-support/get_apache_mirror.py | |
index 38ea6f4d2..ac55abad4 100755 | |
--- a/cpp/build-support/get_apache_mirror.py | |
+++ b/cpp/build-support/get_apache_mirror.py | |
@@ -20,6 +20,8 @@ | |
# mirror for downloading dependencies, e.g. in CMake | |
import json | |
+import warnings | |
+ | |
try: | |
import requests | |
@@ -35,6 +37,14 @@ except ImportError: | |
def get_url(url): | |
return urlopen(url).read() | |
-suggested_mirror = get_url('https://www.apache.org/dyn/' | |
- 'closer.cgi?as_json=1') | |
-print(json.loads(suggested_mirror.decode('utf-8'))['preferred']) | |
+url = 'https://www.apache.org/dyn/closer.cgi?as_json=1' | |
+ | |
+try: | |
+ suggested_mirror = get_url(url) | |
+except Exception as e: | |
+ warnings.warn("Failed loading {url!r}: {e}".format(**locals()), | |
+ RuntimeWarning) | |
+ # Well-known mirror, in case the URL above fails loading | |
+ print("http://apache.osuosl.org/") | |
+else: | |
+ print(json.loads(suggested_mirror.decode('utf-8'))['preferred']) | |
diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake | |
index aa1e557af..15c8b6e07 100644 | |
--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake | |
+++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake | |
@@ -40,11 +40,10 @@ set(APACHE_MIRROR "") | |
macro(get_apache_mirror) | |
if(APACHE_MIRROR STREQUAL "") | |
- exec_program(${PYTHON_EXECUTABLE} | |
- ARGS | |
- ${CMAKE_SOURCE_DIR}/build-support/get_apache_mirror.py | |
- OUTPUT_VARIABLE | |
- APACHE_MIRROR) | |
+ execute_process(COMMAND ${PYTHON_EXECUTABLE} | |
+ ${CMAKE_SOURCE_DIR}/build-support/get_apache_mirror.py | |
+ OUTPUT_VARIABLE APACHE_MIRROR | |
+ OUTPUT_STRIP_TRAILING_WHITESPACE) | |
endif() | |
endmacro() | |
diff --git a/cpp/src/arrow/array-binary-test.cc b/cpp/src/arrow/array-binary-test.cc | |
index cb8d6d530..71fb81ebb 100644 | |
--- a/cpp/src/arrow/array-binary-test.cc | |
+++ b/cpp/src/arrow/array-binary-test.cc | |
@@ -40,6 +40,9 @@ namespace arrow { | |
using internal::checked_cast; | |
+using StringTypes = | |
+ ::testing::Types<StringType, LargeStringType, BinaryType, LargeBinaryType>; | |
+ | |
// ---------------------------------------------------------------------- | |
// String / Binary tests | |
@@ -67,8 +70,14 @@ void CheckStringArray(const ArrayType& array, const std::vector<std::string>& st | |
} | |
} | |
+template <typename T> | |
class TestStringArray : public ::testing::Test { | |
public: | |
+ using TypeClass = T; | |
+ using offset_type = typename TypeClass::offset_type; | |
+ using ArrayType = typename TypeTraits<TypeClass>::ArrayType; | |
+ using BuilderType = typename TypeTraits<TypeClass>::BuilderType; | |
+ | |
void SetUp() { | |
chars_ = {'a', 'b', 'b', 'c', 'c', 'c'}; | |
offsets_ = {0, 1, 1, 1, 3, 6}; | |
@@ -85,268 +94,132 @@ class TestStringArray : public ::testing::Test { | |
ASSERT_OK(BitUtil::BytesToBits(valid_bytes_, default_memory_pool(), &null_bitmap_)); | |
null_count_ = CountNulls(valid_bytes_); | |
- strings_ = std::make_shared<StringArray>(length_, offsets_buf_, value_buf_, | |
- null_bitmap_, null_count_); | |
- } | |
- | |
- protected: | |
- std::vector<int32_t> offsets_; | |
- std::vector<char> chars_; | |
- std::vector<uint8_t> valid_bytes_; | |
- | |
- std::vector<std::string> expected_; | |
- | |
- std::shared_ptr<Buffer> value_buf_; | |
- std::shared_ptr<Buffer> offsets_buf_; | |
- std::shared_ptr<Buffer> null_bitmap_; | |
- | |
- int64_t null_count_; | |
- int64_t length_; | |
- | |
- std::shared_ptr<StringArray> strings_; | |
-}; | |
- | |
-TEST_F(TestStringArray, TestArrayBasics) { | |
- ASSERT_EQ(length_, strings_->length()); | |
- ASSERT_EQ(1, strings_->null_count()); | |
- ASSERT_OK(ValidateArray(*strings_)); | |
-} | |
- | |
-TEST_F(TestStringArray, TestType) { | |
- std::shared_ptr<DataType> type = strings_->type(); | |
- | |
- ASSERT_EQ(Type::STRING, type->id()); | |
- ASSERT_EQ(Type::STRING, strings_->type_id()); | |
-} | |
- | |
-TEST_F(TestStringArray, TestListFunctions) { | |
- int pos = 0; | |
- for (size_t i = 0; i < expected_.size(); ++i) { | |
- ASSERT_EQ(pos, strings_->value_offset(i)); | |
- ASSERT_EQ(static_cast<int>(expected_[i].size()), strings_->value_length(i)); | |
- pos += static_cast<int>(expected_[i].size()); | |
- } | |
-} | |
- | |
-TEST_F(TestStringArray, TestDestructor) { | |
- auto arr = std::make_shared<StringArray>(length_, offsets_buf_, value_buf_, | |
+ strings_ = std::make_shared<ArrayType>(length_, offsets_buf_, value_buf_, | |
null_bitmap_, null_count_); | |
-} | |
+ } | |
-TEST_F(TestStringArray, TestGetString) { | |
- for (size_t i = 0; i < expected_.size(); ++i) { | |
- if (valid_bytes_[i] == 0) { | |
- ASSERT_TRUE(strings_->IsNull(i)); | |
+ void _TestArrayBasics() { | |
+ ASSERT_EQ(length_, strings_->length()); | |
+ ASSERT_EQ(1, strings_->null_count()); | |
+ ASSERT_OK(ValidateArray(*strings_)); | |
+ TestInitialized(*strings_); | |
+ AssertZeroPadded(*strings_); | |
+ } | |
+ | |
+ void _TestType() { | |
+ std::shared_ptr<DataType> type = this->strings_->type(); | |
+ | |
+ if (std::is_same<TypeClass, StringType>::value) { | |
+ ASSERT_EQ(Type::STRING, type->id()); | |
+ ASSERT_EQ(Type::STRING, this->strings_->type_id()); | |
+ } else if (std::is_same<TypeClass, LargeStringType>::value) { | |
+ ASSERT_EQ(Type::LARGE_STRING, type->id()); | |
+ ASSERT_EQ(Type::LARGE_STRING, this->strings_->type_id()); | |
+ } else if (std::is_same<TypeClass, BinaryType>::value) { | |
+ ASSERT_EQ(Type::BINARY, type->id()); | |
+ ASSERT_EQ(Type::BINARY, this->strings_->type_id()); | |
+ } else if (std::is_same<TypeClass, LargeBinaryType>::value) { | |
+ ASSERT_EQ(Type::LARGE_BINARY, type->id()); | |
+ ASSERT_EQ(Type::LARGE_BINARY, this->strings_->type_id()); | |
} else { | |
- ASSERT_EQ(expected_[i], strings_->GetString(i)); | |
+ FAIL(); | |
} | |
} | |
-} | |
- | |
-TEST_F(TestStringArray, TestEmptyStringComparison) { | |
- offsets_ = {0, 0, 0, 0, 0, 0}; | |
- offsets_buf_ = Buffer::Wrap(offsets_); | |
- length_ = static_cast<int64_t>(offsets_.size() - 1); | |
- | |
- auto strings_a = std::make_shared<StringArray>(length_, offsets_buf_, nullptr, | |
- null_bitmap_, null_count_); | |
- auto strings_b = std::make_shared<StringArray>(length_, offsets_buf_, nullptr, | |
- null_bitmap_, null_count_); | |
- ASSERT_TRUE(strings_a->Equals(strings_b)); | |
-} | |
- | |
-TEST_F(TestStringArray, CompareNullByteSlots) { | |
- StringBuilder builder; | |
- StringBuilder builder2; | |
- StringBuilder builder3; | |
- | |
- ASSERT_OK(builder.Append("foo")); | |
- ASSERT_OK(builder2.Append("foo")); | |
- ASSERT_OK(builder3.Append("foo")); | |
- | |
- ASSERT_OK(builder.Append("bar")); | |
- ASSERT_OK(builder2.AppendNull()); | |
- | |
- // same length, but different | |
- ASSERT_OK(builder3.Append("xyz")); | |
- | |
- ASSERT_OK(builder.Append("baz")); | |
- ASSERT_OK(builder2.Append("baz")); | |
- ASSERT_OK(builder3.Append("baz")); | |
- | |
- std::shared_ptr<Array> array, array2, array3; | |
- FinishAndCheckPadding(&builder, &array); | |
- ASSERT_OK(builder2.Finish(&array2)); | |
- ASSERT_OK(builder3.Finish(&array3)); | |
- | |
- const auto& a1 = checked_cast<const StringArray&>(*array); | |
- const auto& a2 = checked_cast<const StringArray&>(*array2); | |
- const auto& a3 = checked_cast<const StringArray&>(*array3); | |
- | |
- // The validity bitmaps are the same, the data is different, but the unequal | |
- // portion is masked out | |
- StringArray equal_array(3, a1.value_offsets(), a1.value_data(), a2.null_bitmap(), 1); | |
- StringArray equal_array2(3, a3.value_offsets(), a3.value_data(), a2.null_bitmap(), 1); | |
- ASSERT_TRUE(equal_array.Equals(equal_array2)); | |
- ASSERT_TRUE(a2.RangeEquals(equal_array2, 0, 3, 0)); | |
- | |
- ASSERT_TRUE(equal_array.Array::Slice(1)->Equals(equal_array2.Array::Slice(1))); | |
- ASSERT_TRUE( | |
- equal_array.Array::Slice(1)->RangeEquals(0, 2, 0, equal_array2.Array::Slice(1))); | |
-} | |
- | |
-TEST_F(TestStringArray, TestSliceGetString) { | |
- StringBuilder builder; | |
- | |
- ASSERT_OK(builder.Append("a")); | |
- ASSERT_OK(builder.Append("b")); | |
- ASSERT_OK(builder.Append("c")); | |
- | |
- std::shared_ptr<Array> array; | |
- ASSERT_OK(builder.Finish(&array)); | |
- auto s = array->Slice(1, 10); | |
- auto arr = std::dynamic_pointer_cast<StringArray>(s); | |
- ASSERT_EQ(arr->GetString(0), "b"); | |
-} | |
- | |
-// ---------------------------------------------------------------------- | |
-// String builder tests | |
- | |
-class TestStringBuilder : public TestBuilder { | |
- public: | |
- void SetUp() { | |
- TestBuilder::SetUp(); | |
- builder_.reset(new StringBuilder(pool_)); | |
+ void _TestListFunctions() { | |
+ int64_t pos = 0; | |
+ for (size_t i = 0; i < expected_.size(); ++i) { | |
+ ASSERT_EQ(pos, strings_->value_offset(i)); | |
+ ASSERT_EQ(expected_[i].size(), strings_->value_length(i)); | |
+ pos += expected_[i].size(); | |
+ } | |
} | |
- void Done() { | |
- std::shared_ptr<Array> out; | |
- FinishAndCheckPadding(builder_.get(), &out); | |
- | |
- result_ = std::dynamic_pointer_cast<StringArray>(out); | |
- ASSERT_OK(ValidateArray(*result_)); | |
+ void _TestDestructor() { | |
+ auto arr = std::make_shared<ArrayType>(length_, offsets_buf_, value_buf_, | |
+ null_bitmap_, null_count_); | |
} | |
- protected: | |
- std::unique_ptr<StringBuilder> builder_; | |
- std::shared_ptr<StringArray> result_; | |
-}; | |
- | |
-TEST_F(TestStringBuilder, TestScalarAppend) { | |
- std::vector<std::string> strings = {"", "bb", "a", "", "ccc"}; | |
- std::vector<uint8_t> is_valid = {1, 1, 1, 0, 1}; | |
- | |
- int N = static_cast<int>(strings.size()); | |
- int reps = 1000; | |
- | |
- for (int j = 0; j < reps; ++j) { | |
- for (int i = 0; i < N; ++i) { | |
- if (!is_valid[i]) { | |
- ASSERT_OK(builder_->AppendNull()); | |
+ void _TestGetString() { | |
+ for (size_t i = 0; i < expected_.size(); ++i) { | |
+ if (valid_bytes_[i] == 0) { | |
+ ASSERT_TRUE(strings_->IsNull(i)); | |
} else { | |
- ASSERT_OK(builder_->Append(strings[i])); | |
+ ASSERT_FALSE(strings_->IsNull(i)); | |
+ ASSERT_EQ(expected_[i], strings_->GetString(i)); | |
} | |
} | |
} | |
- Done(); | |
- | |
- ASSERT_EQ(reps * N, result_->length()); | |
- ASSERT_EQ(reps, result_->null_count()); | |
- ASSERT_EQ(reps * 6, result_->value_data()->size()); | |
- CheckStringArray(*result_, strings, is_valid, reps); | |
-} | |
- | |
-TEST_F(TestStringBuilder, TestAppendVector) { | |
- std::vector<std::string> strings = {"", "bb", "a", "", "ccc"}; | |
- std::vector<uint8_t> valid_bytes = {1, 1, 1, 0, 1}; | |
- | |
- int N = static_cast<int>(strings.size()); | |
- int reps = 1000; | |
- | |
- for (int j = 0; j < reps; ++j) { | |
- ASSERT_OK(builder_->AppendValues(strings, valid_bytes.data())); | |
- } | |
- Done(); | |
- | |
- ASSERT_EQ(reps * N, result_->length()); | |
- ASSERT_EQ(reps, result_->null_count()); | |
- ASSERT_EQ(reps * 6, result_->value_data()->size()); | |
- | |
- CheckStringArray(*result_, strings, valid_bytes, reps); | |
-} | |
- | |
-TEST_F(TestStringBuilder, TestAppendCStringsWithValidBytes) { | |
- const char* strings[] = {nullptr, "aaa", nullptr, "ignored", ""}; | |
- std::vector<uint8_t> valid_bytes = {1, 1, 1, 0, 1}; | |
- | |
- int N = static_cast<int>(sizeof(strings) / sizeof(strings[0])); | |
- int reps = 1000; | |
+ void _TestEmptyStringComparison() { | |
+ offsets_ = {0, 0, 0, 0, 0, 0}; | |
+ offsets_buf_ = Buffer::Wrap(offsets_); | |
+ length_ = static_cast<int64_t>(offsets_.size() - 1); | |
- for (int j = 0; j < reps; ++j) { | |
- ASSERT_OK(builder_->AppendValues(strings, N, valid_bytes.data())); | |
+ auto strings_a = std::make_shared<ArrayType>(length_, offsets_buf_, nullptr, | |
+ null_bitmap_, null_count_); | |
+ auto strings_b = std::make_shared<ArrayType>(length_, offsets_buf_, nullptr, | |
+ null_bitmap_, null_count_); | |
+ ASSERT_TRUE(strings_a->Equals(strings_b)); | |
} | |
- Done(); | |
- ASSERT_EQ(reps * N, result_->length()); | |
- ASSERT_EQ(reps * 3, result_->null_count()); | |
- ASSERT_EQ(reps * 3, result_->value_data()->size()); | |
+ void _TestCompareNullByteSlots() { | |
+ BuilderType builder; | |
+ BuilderType builder2; | |
+ BuilderType builder3; | |
- CheckStringArray(*result_, {"", "aaa", "", "", ""}, {0, 1, 0, 0, 1}, reps); | |
-} | |
+ ASSERT_OK(builder.Append("foo")); | |
+ ASSERT_OK(builder2.Append("foo")); | |
+ ASSERT_OK(builder3.Append("foo")); | |
-TEST_F(TestStringBuilder, TestAppendCStringsWithoutValidBytes) { | |
- const char* strings[] = {"", "bb", "a", nullptr, "ccc"}; | |
+ ASSERT_OK(builder.Append("bar")); | |
+ ASSERT_OK(builder2.AppendNull()); | |
- int N = static_cast<int>(sizeof(strings) / sizeof(strings[0])); | |
- int reps = 1000; | |
+ // same length, but different | |
+ ASSERT_OK(builder3.Append("xyz")); | |
- for (int j = 0; j < reps; ++j) { | |
- ASSERT_OK(builder_->AppendValues(strings, N)); | |
- } | |
- Done(); | |
+ ASSERT_OK(builder.Append("baz")); | |
+ ASSERT_OK(builder2.Append("baz")); | |
+ ASSERT_OK(builder3.Append("baz")); | |
- ASSERT_EQ(reps * N, result_->length()); | |
- ASSERT_EQ(reps, result_->null_count()); | |
- ASSERT_EQ(reps * 6, result_->value_data()->size()); | |
+ std::shared_ptr<Array> array, array2, array3; | |
+ FinishAndCheckPadding(&builder, &array); | |
+ ASSERT_OK(builder2.Finish(&array2)); | |
+ ASSERT_OK(builder3.Finish(&array3)); | |
- CheckStringArray(*result_, {"", "bb", "a", "", "ccc"}, {1, 1, 1, 0, 1}, reps); | |
-} | |
+ const auto& a1 = checked_cast<const ArrayType&>(*array); | |
+ const auto& a2 = checked_cast<const ArrayType&>(*array2); | |
+ const auto& a3 = checked_cast<const ArrayType&>(*array3); | |
-TEST_F(TestStringBuilder, TestZeroLength) { | |
- // All buffers are null | |
- Done(); | |
-} | |
+ // The validity bitmaps are the same, the data is different, but the unequal | |
+ // portion is masked out | |
+ ArrayType equal_array(3, a1.value_offsets(), a1.value_data(), a2.null_bitmap(), 1); | |
+ ArrayType equal_array2(3, a3.value_offsets(), a3.value_data(), a2.null_bitmap(), 1); | |
-// Binary container type | |
-// TODO(emkornfield) there should be some way to refactor these to avoid code duplicating | |
-// with String | |
-class TestBinaryArray : public ::testing::Test { | |
- public: | |
- void SetUp() { | |
- chars_ = {'a', 'b', 'b', 'c', 'c', 'c'}; | |
- offsets_ = {0, 1, 1, 1, 3, 6}; | |
- valid_bytes_ = {1, 1, 0, 1, 1}; | |
- expected_ = {"a", "", "", "bb", "ccc"}; | |
+ ASSERT_TRUE(equal_array.Equals(equal_array2)); | |
+ ASSERT_TRUE(a2.RangeEquals(equal_array2, 0, 3, 0)); | |
- MakeArray(); | |
+ ASSERT_TRUE(equal_array.Array::Slice(1)->Equals(equal_array2.Array::Slice(1))); | |
+ ASSERT_TRUE( | |
+ equal_array.Array::Slice(1)->RangeEquals(0, 2, 0, equal_array2.Array::Slice(1))); | |
} | |
- void MakeArray() { | |
- length_ = static_cast<int64_t>(offsets_.size() - 1); | |
- value_buf_ = Buffer::Wrap(chars_); | |
- offsets_buf_ = Buffer::Wrap(offsets_); | |
+ void _TestSliceGetString() { | |
+ BuilderType builder; | |
- ASSERT_OK(BitUtil::BytesToBits(valid_bytes_, default_memory_pool(), &null_bitmap_)); | |
- null_count_ = CountNulls(valid_bytes_); | |
+ ASSERT_OK(builder.Append("a")); | |
+ ASSERT_OK(builder.Append("b")); | |
+ ASSERT_OK(builder.Append("c")); | |
- strings_ = std::make_shared<BinaryArray>(length_, offsets_buf_, value_buf_, | |
- null_bitmap_, null_count_); | |
+ std::shared_ptr<Array> array; | |
+ ASSERT_OK(builder.Finish(&array)); | |
+ auto s = array->Slice(1, 10); | |
+ auto arr = std::dynamic_pointer_cast<ArrayType>(s); | |
+ ASSERT_EQ(arr->GetString(0), "b"); | |
} | |
protected: | |
- std::vector<int32_t> offsets_; | |
+ std::vector<offset_type> offsets_; | |
std::vector<char> chars_; | |
std::vector<uint8_t> valid_bytes_; | |
@@ -359,300 +232,161 @@ class TestBinaryArray : public ::testing::Test { | |
int64_t null_count_; | |
int64_t length_; | |
- std::shared_ptr<BinaryArray> strings_; | |
+ std::shared_ptr<ArrayType> strings_; | |
}; | |
-TEST_F(TestBinaryArray, TestArrayBasics) { | |
- ASSERT_EQ(length_, strings_->length()); | |
- ASSERT_EQ(1, strings_->null_count()); | |
- ASSERT_OK(ValidateArray(*strings_)); | |
-} | |
+TYPED_TEST_CASE(TestStringArray, StringTypes); | |
-TEST_F(TestBinaryArray, TestType) { | |
- std::shared_ptr<DataType> type = strings_->type(); | |
+TYPED_TEST(TestStringArray, TestArrayBasics) { this->_TestArrayBasics(); } | |
- ASSERT_EQ(Type::BINARY, type->id()); | |
- ASSERT_EQ(Type::BINARY, strings_->type_id()); | |
-} | |
+TYPED_TEST(TestStringArray, TestType) { this->_TestType(); } | |
-TEST_F(TestBinaryArray, TestListFunctions) { | |
- size_t pos = 0; | |
- for (size_t i = 0; i < expected_.size(); ++i) { | |
- ASSERT_EQ(pos, strings_->value_offset(i)); | |
- ASSERT_EQ(static_cast<int>(expected_[i].size()), strings_->value_length(i)); | |
- pos += expected_[i].size(); | |
- } | |
-} | |
+TYPED_TEST(TestStringArray, TestListFunctions) { this->_TestListFunctions(); } | |
-TEST_F(TestBinaryArray, TestDestructor) { | |
- auto arr = std::make_shared<BinaryArray>(length_, offsets_buf_, value_buf_, | |
- null_bitmap_, null_count_); | |
-} | |
+TYPED_TEST(TestStringArray, TestDestructor) { this->_TestDestructor(); } | |
-TEST_F(TestBinaryArray, TestGetValue) { | |
- for (size_t i = 0; i < expected_.size(); ++i) { | |
- if (valid_bytes_[i] == 0) { | |
- ASSERT_TRUE(strings_->IsNull(i)); | |
- } else { | |
- ASSERT_FALSE(strings_->IsNull(i)); | |
- ASSERT_EQ(strings_->GetString(i), expected_[i]); | |
- } | |
- } | |
-} | |
+TYPED_TEST(TestStringArray, TestGetString) { this->_TestGetString(); } | |
-TEST_F(TestBinaryArray, TestNullValuesInitialized) { | |
- for (size_t i = 0; i < expected_.size(); ++i) { | |
- if (valid_bytes_[i] == 0) { | |
- ASSERT_TRUE(strings_->IsNull(i)); | |
- } else { | |
- ASSERT_FALSE(strings_->IsNull(i)); | |
- ASSERT_EQ(strings_->GetString(i), expected_[i]); | |
- } | |
- } | |
- TestInitialized(*strings_); | |
+TYPED_TEST(TestStringArray, TestEmptyStringComparison) { | |
+ this->_TestEmptyStringComparison(); | |
} | |
-TEST_F(TestBinaryArray, TestPaddingZeroed) { AssertZeroPadded(*strings_); } | |
+TYPED_TEST(TestStringArray, CompareNullByteSlots) { this->_TestCompareNullByteSlots(); } | |
-TEST_F(TestBinaryArray, TestGetString) { | |
- for (size_t i = 0; i < expected_.size(); ++i) { | |
- if (valid_bytes_[i] == 0) { | |
- ASSERT_TRUE(strings_->IsNull(i)); | |
- } else { | |
- std::string val = strings_->GetString(i); | |
- ASSERT_EQ(0, std::memcmp(expected_[i].data(), val.c_str(), val.size())); | |
- } | |
- } | |
-} | |
+TYPED_TEST(TestStringArray, TestSliceGetString) { this->_TestSliceGetString(); } | |
-TEST_F(TestBinaryArray, TestEqualsEmptyStrings) { | |
- BinaryBuilder builder; | |
- | |
- std::string empty_string(""); | |
- for (int i = 0; i < 5; ++i) { | |
- ASSERT_OK(builder.Append(empty_string)); | |
- } | |
- | |
- std::shared_ptr<Array> left_arr; | |
- FinishAndCheckPadding(&builder, &left_arr); | |
- | |
- const BinaryArray& left = checked_cast<const BinaryArray&>(*left_arr); | |
- std::shared_ptr<Array> right = | |
- std::make_shared<BinaryArray>(left.length(), left.value_offsets(), nullptr, | |
- left.null_bitmap(), left.null_count()); | |
- | |
- ASSERT_TRUE(left.Equals(right)); | |
- ASSERT_TRUE(left.RangeEquals(0, left.length(), 0, right)); | |
-} | |
+// ---------------------------------------------------------------------- | |
+// String builder tests | |
-class TestBinaryBuilder : public TestBuilder { | |
+template <typename T> | |
+class TestStringBuilder : public TestBuilder { | |
public: | |
+ using TypeClass = T; | |
+ using offset_type = typename TypeClass::offset_type; | |
+ using ArrayType = typename TypeTraits<TypeClass>::ArrayType; | |
+ using BuilderType = typename TypeTraits<TypeClass>::BuilderType; | |
+ | |
void SetUp() { | |
TestBuilder::SetUp(); | |
- builder_.reset(new BinaryBuilder(pool_)); | |
+ builder_.reset(new BuilderType(pool_)); | |
} | |
void Done() { | |
std::shared_ptr<Array> out; | |
FinishAndCheckPadding(builder_.get(), &out); | |
- result_ = std::dynamic_pointer_cast<BinaryArray>(out); | |
+ result_ = std::dynamic_pointer_cast<ArrayType>(out); | |
ASSERT_OK(ValidateArray(*result_)); | |
} | |
- protected: | |
- std::unique_ptr<BinaryBuilder> builder_; | |
- std::shared_ptr<BinaryArray> result_; | |
-}; | |
- | |
-TEST_F(TestBinaryBuilder, TestScalarAppend) { | |
- std::vector<std::string> strings = {"", "bb", "a", "", "ccc"}; | |
- std::vector<uint8_t> is_valid = {1, 1, 1, 0, 1}; | |
+ void _TestScalarAppend() { | |
+ std::vector<std::string> strings = {"", "bb", "a", "", "ccc"}; | |
+ std::vector<uint8_t> is_valid = {1, 1, 1, 0, 1}; | |
- int N = static_cast<int>(strings.size()); | |
- int reps = 10; | |
+ int N = static_cast<int>(strings.size()); | |
+ int reps = 1000; | |
- for (int j = 0; j < reps; ++j) { | |
- for (int i = 0; i < N; ++i) { | |
- if (!is_valid[i]) { | |
- ASSERT_OK(builder_->AppendNull()); | |
- } else { | |
- ASSERT_OK(builder_->Append(strings[i])); | |
+ for (int j = 0; j < reps; ++j) { | |
+ for (int i = 0; i < N; ++i) { | |
+ if (!is_valid[i]) { | |
+ ASSERT_OK(builder_->AppendNull()); | |
+ } else { | |
+ ASSERT_OK(builder_->Append(strings[i])); | |
+ } | |
} | |
} | |
- } | |
- Done(); | |
- ASSERT_OK(ValidateArray(*result_)); | |
- ASSERT_EQ(reps * N, result_->length()); | |
- ASSERT_EQ(reps, result_->null_count()); | |
- ASSERT_EQ(reps * 6, result_->value_data()->size()); | |
- | |
- CheckStringArray(*result_, strings, is_valid, reps); | |
-} | |
- | |
-TEST_F(TestBinaryBuilder, TestAppendNulls) { | |
- ASSERT_OK(builder_->Append("bow")); | |
- ASSERT_OK(builder_->AppendNulls(3)); | |
- ASSERT_OK(builder_->Append("arrow")); | |
- Done(); | |
- ASSERT_OK(ValidateArray(*result_)); | |
- | |
- ASSERT_EQ(5, result_->length()); | |
- ASSERT_EQ(3, result_->null_count()); | |
- ASSERT_EQ(8, result_->value_data()->size()); | |
- | |
- CheckStringArray(*result_, {"bow", "", "", "", "arrow"}, {1, 0, 0, 0, 1}); | |
-} | |
+ Done(); | |
-TEST_F(TestBinaryBuilder, TestScalarAppendUnsafe) { | |
- std::vector<std::string> strings = {"", "bb", "a", "", "ccc"}; | |
- std::vector<uint8_t> is_valid = {1, 1, 1, 0, 1}; | |
+ ASSERT_EQ(reps * N, result_->length()); | |
+ ASSERT_EQ(reps, result_->null_count()); | |
+ ASSERT_EQ(reps * 6, result_->value_data()->size()); | |
- int N = static_cast<int>(strings.size()); | |
- int reps = 13; | |
- int total_length = 0; | |
- for (auto&& s : strings) total_length += static_cast<int>(s.size()); | |
- | |
- ASSERT_OK(builder_->Reserve(N * reps)); | |
- ASSERT_OK(builder_->ReserveData(total_length * reps)); | |
- | |
- for (int j = 0; j < reps; ++j) { | |
- for (int i = 0; i < N; ++i) { | |
- if (!is_valid[i]) { | |
- builder_->UnsafeAppendNull(); | |
- } else { | |
- builder_->UnsafeAppend(strings[i]); | |
- } | |
- } | |
+ CheckStringArray(*result_, strings, is_valid, reps); | |
} | |
- ASSERT_EQ(builder_->value_data_length(), total_length * reps); | |
- Done(); | |
- ASSERT_OK(ValidateArray(*result_)); | |
- ASSERT_EQ(reps * N, result_->length()); | |
- ASSERT_EQ(reps, result_->null_count()); | |
- ASSERT_EQ(reps * total_length, result_->value_data()->size()); | |
- | |
- CheckStringArray(*result_, strings, is_valid, reps); | |
-} | |
- | |
-TEST_F(TestBinaryBuilder, TestCapacityReserve) { | |
- std::vector<std::string> strings = {"aaaaa", "bbbbbbbbbb", "ccccccccccccccc", | |
- "dddddddddd"}; | |
- int N = static_cast<int>(strings.size()); | |
- int reps = 15; | |
- int64_t length = 0; | |
- int64_t capacity = 1000; | |
- int64_t expected_capacity = BitUtil::RoundUpToMultipleOf64(capacity); | |
- | |
- ASSERT_OK(builder_->ReserveData(capacity)); | |
- ASSERT_EQ(length, builder_->value_data_length()); | |
- ASSERT_EQ(expected_capacity, builder_->value_data_capacity()); | |
+ void _TestVectorAppend() { | |
+ std::vector<std::string> strings = {"", "bb", "a", "", "ccc"}; | |
+ std::vector<uint8_t> valid_bytes = {1, 1, 1, 0, 1}; | |
- for (int j = 0; j < reps; ++j) { | |
- for (int i = 0; i < N; ++i) { | |
- ASSERT_OK(builder_->Append(strings[i])); | |
- length += static_cast<int>(strings[i].size()); | |
+ int N = static_cast<int>(strings.size()); | |
+ int reps = 1000; | |
- ASSERT_EQ(length, builder_->value_data_length()); | |
- ASSERT_EQ(expected_capacity, builder_->value_data_capacity()); | |
+ for (int j = 0; j < reps; ++j) { | |
+ ASSERT_OK(builder_->AppendValues(strings, valid_bytes.data())); | |
} | |
- } | |
- | |
- int extra_capacity = 500; | |
- expected_capacity = BitUtil::RoundUpToMultipleOf64(length + extra_capacity); | |
+ Done(); | |
- ASSERT_OK(builder_->ReserveData(extra_capacity)); | |
+ ASSERT_EQ(reps * N, result_->length()); | |
+ ASSERT_EQ(reps, result_->null_count()); | |
+ ASSERT_EQ(reps * 6, result_->value_data()->size()); | |
- ASSERT_EQ(length, builder_->value_data_length()); | |
- int64_t actual_capacity = builder_->value_data_capacity(); | |
- ASSERT_GE(actual_capacity, expected_capacity); | |
- ASSERT_EQ(actual_capacity & 63, 0); | |
- | |
- Done(); | |
- | |
- ASSERT_EQ(reps * N, result_->length()); | |
- ASSERT_EQ(0, result_->null_count()); | |
- ASSERT_EQ(reps * 40, result_->value_data()->size()); | |
+ CheckStringArray(*result_, strings, valid_bytes, reps); | |
+ } | |
- // Capacity is shrunk after `Finish` | |
- ASSERT_EQ(640, result_->value_data()->capacity()); | |
-} | |
+ void _TestAppendCStringsWithValidBytes() { | |
+ const char* strings[] = {nullptr, "aaa", nullptr, "ignored", ""}; | |
+ std::vector<uint8_t> valid_bytes = {1, 1, 1, 0, 1}; | |
-TEST_F(TestBinaryBuilder, TestZeroLength) { | |
- // All buffers are null | |
- Done(); | |
-} | |
+ int N = static_cast<int>(sizeof(strings) / sizeof(strings[0])); | |
+ int reps = 1000; | |
-// ---------------------------------------------------------------------- | |
-// Slice tests | |
+ for (int j = 0; j < reps; ++j) { | |
+ ASSERT_OK(builder_->AppendValues(strings, N, valid_bytes.data())); | |
+ } | |
+ Done(); | |
-template <typename TYPE> | |
-void CheckSliceEquality() { | |
- using Traits = TypeTraits<TYPE>; | |
- using BuilderType = typename Traits::BuilderType; | |
+ ASSERT_EQ(reps * N, result_->length()); | |
+ ASSERT_EQ(reps * 3, result_->null_count()); | |
+ ASSERT_EQ(reps * 3, result_->value_data()->size()); | |
- BuilderType builder; | |
+ CheckStringArray(*result_, {"", "aaa", "", "", ""}, {0, 1, 0, 0, 1}, reps); | |
+ } | |
- std::vector<std::string> strings = {"foo", "", "bar", "baz", "qux", ""}; | |
- std::vector<uint8_t> is_null = {0, 1, 0, 1, 0, 0}; | |
+ void _TestAppendCStringsWithoutValidBytes() { | |
+ const char* strings[] = {"", "bb", "a", nullptr, "ccc"}; | |
- int N = static_cast<int>(strings.size()); | |
- int reps = 10; | |
+ int N = static_cast<int>(sizeof(strings) / sizeof(strings[0])); | |
+ int reps = 1000; | |
- for (int j = 0; j < reps; ++j) { | |
- for (int i = 0; i < N; ++i) { | |
- if (is_null[i]) { | |
- ASSERT_OK(builder.AppendNull()); | |
- } else { | |
- ASSERT_OK(builder.Append(strings[i])); | |
- } | |
+ for (int j = 0; j < reps; ++j) { | |
+ ASSERT_OK(builder_->AppendValues(strings, N)); | |
} | |
- } | |
+ Done(); | |
- std::shared_ptr<Array> array; | |
- FinishAndCheckPadding(&builder, &array); | |
+ ASSERT_EQ(reps * N, result_->length()); | |
+ ASSERT_EQ(reps, result_->null_count()); | |
+ ASSERT_EQ(reps * 6, result_->value_data()->size()); | |
- std::shared_ptr<Array> slice, slice2; | |
- | |
- slice = array->Slice(5); | |
- slice2 = array->Slice(5); | |
- ASSERT_EQ(N * reps - 5, slice->length()); | |
- | |
- ASSERT_TRUE(slice->Equals(slice2)); | |
- ASSERT_TRUE(array->RangeEquals(5, slice->length(), 0, slice)); | |
+ CheckStringArray(*result_, {"", "bb", "a", "", "ccc"}, {1, 1, 1, 0, 1}, reps); | |
+ } | |
- // Chained slices | |
- slice2 = array->Slice(2)->Slice(3); | |
- ASSERT_TRUE(slice->Equals(slice2)); | |
+ void _TestZeroLength() { | |
+ // All buffers are null | |
+ Done(); | |
+ ASSERT_EQ(result_->length(), 0); | |
+ ASSERT_EQ(result_->null_count(), 0); | |
+ } | |
- slice = array->Slice(5, 20); | |
- slice2 = array->Slice(5, 20); | |
- ASSERT_EQ(20, slice->length()); | |
+ protected: | |
+ std::unique_ptr<BuilderType> builder_; | |
+ std::shared_ptr<ArrayType> result_; | |
+}; | |
- ASSERT_TRUE(slice->Equals(slice2)); | |
- ASSERT_TRUE(array->RangeEquals(5, 25, 0, slice)); | |
+TYPED_TEST_CASE(TestStringBuilder, StringTypes); | |
- ASSERT_OK(builder.Append("a")); | |
- for (int j = 0; j < reps; ++j) { | |
- ASSERT_OK(builder.Append("")); | |
- } | |
- FinishAndCheckPadding(&builder, &array); | |
- slice = array->Slice(1); | |
+TYPED_TEST(TestStringBuilder, TestScalarAppend) { this->_TestScalarAppend(); } | |
- for (int j = 0; j < reps; ++j) { | |
- ASSERT_OK(builder.Append("")); | |
- } | |
- FinishAndCheckPadding(&builder, &array); | |
+TYPED_TEST(TestStringBuilder, TestVectorAppend) { this->_TestVectorAppend(); } | |
- AssertArraysEqual(*slice, *array); | |
+TYPED_TEST(TestStringBuilder, TestAppendCStringsWithValidBytes) { | |
+ this->_TestAppendCStringsWithValidBytes(); | |
} | |
-TEST_F(TestBinaryArray, TestSliceEquality) { CheckSliceEquality<BinaryType>(); } | |
- | |
-TEST_F(TestStringArray, TestSliceEquality) { CheckSliceEquality<BinaryType>(); } | |
+TYPED_TEST(TestStringBuilder, TestAppendCStringsWithoutValidBytes) { | |
+ this->_TestAppendCStringsWithoutValidBytes(); | |
+} | |
-TEST_F(TestBinaryArray, LengthZeroCtor) { BinaryArray array(0, nullptr, nullptr); } | |
+TYPED_TEST(TestStringBuilder, TestZeroLength) { this->_TestZeroLength(); } | |
// ---------------------------------------------------------------------- | |
// ChunkedBinaryBuilder tests | |
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc | |
index 5f76f0839..0b7d8f170 100644 | |
--- a/cpp/src/arrow/array.cc | |
+++ b/cpp/src/arrow/array.cc | |
@@ -386,31 +386,26 @@ BinaryArray::BinaryArray(const std::shared_ptr<ArrayData>& data) { | |
SetData(data); | |
} | |
-void BinaryArray::SetData(const std::shared_ptr<ArrayData>& data) { | |
- ARROW_CHECK_EQ(data->buffers.size(), 3); | |
- auto value_offsets = data->buffers[1]; | |
- auto value_data = data->buffers[2]; | |
- this->Array::SetData(data); | |
- raw_data_ = value_data == nullptr ? nullptr : value_data->data(); | |
- raw_value_offsets_ = value_offsets == nullptr | |
- ? nullptr | |
- : reinterpret_cast<const int32_t*>(value_offsets->data()); | |
-} | |
- | |
BinaryArray::BinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets, | |
- const std::shared_ptr<Buffer>& data, | |
- const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count, | |
- int64_t offset) | |
- : BinaryArray(binary(), length, value_offsets, data, null_bitmap, null_count, | |
- offset) {} | |
- | |
-BinaryArray::BinaryArray(const std::shared_ptr<DataType>& type, int64_t length, | |
- const std::shared_ptr<Buffer>& value_offsets, | |
const std::shared_ptr<Buffer>& data, | |
const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count, | |
int64_t offset) { | |
- SetData(ArrayData::Make(type, length, {null_bitmap, value_offsets, data}, null_count, | |
- offset)); | |
+ SetData(ArrayData::Make(binary(), length, {null_bitmap, value_offsets, data}, | |
+ null_count, offset)); | |
+} | |
+ | |
+LargeBinaryArray::LargeBinaryArray(const std::shared_ptr<ArrayData>& data) { | |
+ ARROW_CHECK_EQ(data->type->id(), Type::LARGE_BINARY); | |
+ SetData(data); | |
+} | |
+ | |
+LargeBinaryArray::LargeBinaryArray(int64_t length, | |
+ const std::shared_ptr<Buffer>& value_offsets, | |
+ const std::shared_ptr<Buffer>& data, | |
+ const std::shared_ptr<Buffer>& null_bitmap, | |
+ int64_t null_count, int64_t offset) { | |
+ SetData(ArrayData::Make(large_binary(), length, {null_bitmap, value_offsets, data}, | |
+ null_count, offset)); | |
} | |
StringArray::StringArray(const std::shared_ptr<ArrayData>& data) { | |
@@ -421,8 +416,24 @@ StringArray::StringArray(const std::shared_ptr<ArrayData>& data) { | |
StringArray::StringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets, | |
const std::shared_ptr<Buffer>& data, | |
const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count, | |
- int64_t offset) | |
- : BinaryArray(utf8(), length, value_offsets, data, null_bitmap, null_count, offset) {} | |
+ int64_t offset) { | |
+ SetData(ArrayData::Make(utf8(), length, {null_bitmap, value_offsets, data}, null_count, | |
+ offset)); | |
+} | |
+ | |
+LargeStringArray::LargeStringArray(const std::shared_ptr<ArrayData>& data) { | |
+ ARROW_CHECK_EQ(data->type->id(), Type::LARGE_STRING); | |
+ SetData(data); | |
+} | |
+ | |
+LargeStringArray::LargeStringArray(int64_t length, | |
+ const std::shared_ptr<Buffer>& value_offsets, | |
+ const std::shared_ptr<Buffer>& data, | |
+ const std::shared_ptr<Buffer>& null_bitmap, | |
+ int64_t null_count, int64_t offset) { | |
+ SetData(ArrayData::Make(large_utf8(), length, {null_bitmap, value_offsets, data}, | |
+ null_count, offset)); | |
+} | |
// ---------------------------------------------------------------------- | |
// Fixed width binary | |
@@ -1148,20 +1159,14 @@ struct ValidateVisitor { | |
return ValidateOffsets(array); | |
} | |
- Status Visit(const ListArray& array) { | |
- if (array.length() < 0) { | |
- return Status::Invalid("Length was negative"); | |
- } | |
- | |
- auto value_offsets = array.value_offsets(); | |
- if (array.length() && !value_offsets) { | |
- return Status::Invalid("value_offsets_ was null"); | |
- } | |
- if (value_offsets->size() / static_cast<int>(sizeof(int32_t)) < array.length()) { | |
- return Status::Invalid("offset buffer size (bytes): ", value_offsets->size(), | |
- " isn't large enough for length: ", array.length()); | |
+ Status Visit(const LargeBinaryArray& array) { | |
+ if (array.data()->buffers.size() != 3) { | |
+ return Status::Invalid("number of buffers was != 3"); | |
} | |
+ return ValidateOffsets(array); | |
+ } | |
+ Status Visit(const ListArray& array) { | |
if (!array.values()) { | |
return Status::Invalid("values was null"); | |
} | |
@@ -1181,19 +1186,6 @@ struct ValidateVisitor { | |
} | |
Status Visit(const MapArray& array) { | |
- if (array.length() < 0) { | |
- return Status::Invalid("Length was negative"); | |
- } | |
- | |
- auto value_offsets = array.value_offsets(); | |
- if (array.length() && !value_offsets) { | |
- return Status::Invalid("value_offsets_ was null"); | |
- } | |
- if (value_offsets->size() / static_cast<int>(sizeof(int32_t)) < array.length()) { | |
- return Status::Invalid("offset buffer size (bytes): ", value_offsets->size(), | |
- " isn't large enough for length: ", array.length()); | |
- } | |
- | |
if (!array.keys()) { | |
return Status::Invalid("keys was null"); | |
} | |
@@ -1224,9 +1216,6 @@ struct ValidateVisitor { | |
} | |
Status Visit(const FixedSizeListArray& array) { | |
- if (array.length() < 0) { | |
- return Status::Invalid("Length was negative"); | |
- } | |
if (!array.values()) { | |
return Status::Invalid("values was null"); | |
} | |
@@ -1240,14 +1229,6 @@ struct ValidateVisitor { | |
} | |
Status Visit(const StructArray& array) { | |
- if (array.length() < 0) { | |
- return Status::Invalid("Length was negative"); | |
- } | |
- | |
- if (array.null_count() > array.length()) { | |
- return Status::Invalid("Null count exceeds the length of this struct"); | |
- } | |
- | |
if (array.num_fields() > 0) { | |
// Validate fields | |
int64_t array_length = array.field(0)->length(); | |
@@ -1274,16 +1255,7 @@ struct ValidateVisitor { | |
return Status::OK(); | |
} | |
- Status Visit(const UnionArray& array) { | |
- if (array.length() < 0) { | |
- return Status::Invalid("Length was negative"); | |
- } | |
- | |
- if (array.null_count() > array.length()) { | |
- return Status::Invalid("Null count exceeds the length of this struct"); | |
- } | |
- return Status::OK(); | |
- } | |
+ Status Visit(const UnionArray& array) { return Status::OK(); } | |
Status Visit(const DictionaryArray& array) { | |
Type::type index_type_id = array.indices()->type()->id(); | |
@@ -1310,12 +1282,23 @@ struct ValidateVisitor { | |
protected: | |
template <typename ArrayType> | |
Status ValidateOffsets(ArrayType& array) { | |
- int32_t prev_offset = array.value_offset(0); | |
+ using offset_type = typename ArrayType::offset_type; | |
+ | |
+ auto value_offsets = array.value_offsets(); | |
+ if (array.length() && !value_offsets) { | |
+ return Status::Invalid("value_offsets_ was null"); | |
+ } | |
+ if (value_offsets->size() / static_cast<int>(sizeof(offset_type)) < array.length()) { | |
+ return Status::Invalid("offset buffer size (bytes): ", value_offsets->size(), | |
+ " isn't large enough for length: ", array.length()); | |
+ } | |
+ | |
+ auto prev_offset = array.value_offset(0); | |
if (array.offset() == 0 && prev_offset != 0) { | |
return Status::Invalid("The first offset wasn't zero"); | |
} | |
for (int64_t i = 1; i <= array.length(); ++i) { | |
- int32_t current_offset = array.value_offset(i); | |
+ auto current_offset = array.value_offset(i); | |
if (array.IsNull(i - 1) && current_offset != prev_offset) { | |
return Status::Invalid("Offset invariant failure at: ", i, | |
" inconsistent value_offsets for null slot", | |
@@ -1340,6 +1323,14 @@ Status ValidateArray(const Array& array) { | |
const auto layout = type.layout(); | |
const ArrayData& data = *array.data(); | |
+ if (array.length() < 0) { | |
+ return Status::Invalid("Array length is negative"); | |
+ } | |
+ | |
+ if (array.null_count() > array.length()) { | |
+ return Status::Invalid("Null count exceeds array length"); | |
+ } | |
+ | |
if (data.buffers.size() != layout.bit_widths.size()) { | |
return Status::Invalid("Expected ", layout.bit_widths.size(), | |
" buffers in array " | |
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h | |
index 599a6ea62..e13088c65 100644 | |
--- a/cpp/src/arrow/array.h | |
+++ b/cpp/src/arrow/array.h | |
@@ -492,6 +492,7 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray { | |
class ARROW_EXPORT ListArray : public Array { | |
public: | |
using TypeClass = ListType; | |
+ using offset_type = ListType::offset_type; | |
explicit ListArray(const std::shared_ptr<ArrayData>& data); | |
@@ -635,24 +636,20 @@ class ARROW_EXPORT FixedSizeListArray : public Array { | |
// ---------------------------------------------------------------------- | |
// Binary and String | |
-/// Concrete Array class for variable-size binary data | |
-class ARROW_EXPORT BinaryArray : public FlatArray { | |
+/// Base class for variable-sized binary arrays, regardless of offset size | |
+/// and logical interpretation. | |
+template <typename TYPE> | |
+class BaseBinaryArray : public FlatArray { | |
public: | |
- using TypeClass = BinaryType; | |
- | |
- explicit BinaryArray(const std::shared_ptr<ArrayData>& data); | |
- | |
- BinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets, | |
- const std::shared_ptr<Buffer>& data, | |
- const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, | |
- int64_t null_count = kUnknownNullCount, int64_t offset = 0); | |
+ using TypeClass = TYPE; | |
+ using offset_type = typename TypeClass::offset_type; | |
/// Return the pointer to the given elements bytes | |
// XXX should GetValue(int64_t i) return a string_view? | |
- const uint8_t* GetValue(int64_t i, int32_t* out_length) const { | |
+ const uint8_t* GetValue(int64_t i, offset_type* out_length) const { | |
// Account for base offset | |
i += data_->offset; | |
- const int32_t pos = raw_value_offsets_[i]; | |
+ const offset_type pos = raw_value_offsets_[i]; | |
*out_length = raw_value_offsets_[i + 1] - pos; | |
return raw_data_ + pos; | |
} | |
@@ -664,7 +661,7 @@ class ARROW_EXPORT BinaryArray : public FlatArray { | |
util::string_view GetView(int64_t i) const { | |
// Account for base offset | |
i += data_->offset; | |
- const int32_t pos = raw_value_offsets_[i]; | |
+ const offset_type pos = raw_value_offsets_[i]; | |
return util::string_view(reinterpret_cast<const char*>(raw_data_ + pos), | |
raw_value_offsets_[i + 1] - pos); | |
} | |
@@ -681,31 +678,52 @@ class ARROW_EXPORT BinaryArray : public FlatArray { | |
/// Note that this buffer does not account for any slice offset | |
std::shared_ptr<Buffer> value_data() const { return data_->buffers[2]; } | |
- const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; } | |
+ const offset_type* raw_value_offsets() const { | |
+ return raw_value_offsets_ + data_->offset; | |
+ } | |
// Neither of these functions will perform boundschecking | |
- int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; } | |
- int32_t value_length(int64_t i) const { | |
+ offset_type value_offset(int64_t i) const { | |
+ return raw_value_offsets_[i + data_->offset]; | |
+ } | |
+ offset_type value_length(int64_t i) const { | |
i += data_->offset; | |
return raw_value_offsets_[i + 1] - raw_value_offsets_[i]; | |
} | |
protected: | |
// For subclasses | |
- BinaryArray() : raw_value_offsets_(NULLPTR), raw_data_(NULLPTR) {} | |
+ BaseBinaryArray() : raw_value_offsets_(NULLPTR), raw_data_(NULLPTR) {} | |
- /// Protected method for constructors | |
- void SetData(const std::shared_ptr<ArrayData>& data); | |
+ // Protected method for constructors | |
+ void SetData(const std::shared_ptr<ArrayData>& data) { | |
+ auto value_offsets = data->buffers[1]; | |
+ auto value_data = data->buffers[2]; | |
+ this->Array::SetData(data); | |
+ raw_data_ = value_data == NULLPTR ? NULLPTR : value_data->data(); | |
+ raw_value_offsets_ = | |
+ value_offsets == NULLPTR | |
+ ? NULLPTR | |
+ : reinterpret_cast<const offset_type*>(value_offsets->data()); | |
+ } | |
- // Constructor to allow sub-classes/builders to substitute their own logical type | |
- BinaryArray(const std::shared_ptr<DataType>& type, int64_t length, | |
- const std::shared_ptr<Buffer>& value_offsets, | |
+ const offset_type* raw_value_offsets_; | |
+ const uint8_t* raw_data_; | |
+}; | |
+ | |
+/// Concrete Array class for variable-size binary data | |
+class ARROW_EXPORT BinaryArray : public BaseBinaryArray<BinaryType> { | |
+ public: | |
+ explicit BinaryArray(const std::shared_ptr<ArrayData>& data); | |
+ | |
+ BinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets, | |
const std::shared_ptr<Buffer>& data, | |
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, | |
int64_t null_count = kUnknownNullCount, int64_t offset = 0); | |
- const int32_t* raw_value_offsets_; | |
- const uint8_t* raw_data_; | |
+ protected: | |
+ // For subclasses such as StringArray | |
+ BinaryArray() : BaseBinaryArray() {} | |
}; | |
/// Concrete Array class for variable-size string (utf-8) data | |
@@ -721,6 +739,34 @@ class ARROW_EXPORT StringArray : public BinaryArray { | |
int64_t null_count = kUnknownNullCount, int64_t offset = 0); | |
}; | |
+/// Concrete Array class for large variable-size binary data | |
+class ARROW_EXPORT LargeBinaryArray : public BaseBinaryArray<LargeBinaryType> { | |
+ public: | |
+ explicit LargeBinaryArray(const std::shared_ptr<ArrayData>& data); | |
+ | |
+ LargeBinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets, | |
+ const std::shared_ptr<Buffer>& data, | |
+ const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, | |
+ int64_t null_count = kUnknownNullCount, int64_t offset = 0); | |
+ | |
+ protected: | |
+ // For subclasses such as LargeStringArray | |
+ LargeBinaryArray() : BaseBinaryArray() {} | |
+}; | |
+ | |
+/// Concrete Array class for large variable-size string (utf-8) data | |
+class ARROW_EXPORT LargeStringArray : public LargeBinaryArray { | |
+ public: | |
+ using TypeClass = LargeStringType; | |
+ | |
+ explicit LargeStringArray(const std::shared_ptr<ArrayData>& data); | |
+ | |
+ LargeStringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets, | |
+ const std::shared_ptr<Buffer>& data, | |
+ const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, | |
+ int64_t null_count = kUnknownNullCount, int64_t offset = 0); | |
+}; | |
+ | |
// ---------------------------------------------------------------------- | |
// Fixed width binary | |
diff --git a/cpp/src/arrow/array/builder_binary.cc b/cpp/src/arrow/array/builder_binary.cc | |
index 818ad1559..b83897d7e 100644 | |
--- a/cpp/src/arrow/array/builder_binary.cc | |
+++ b/cpp/src/arrow/array/builder_binary.cc | |
@@ -43,173 +43,15 @@ using internal::checked_cast; | |
// ---------------------------------------------------------------------- | |
// String and binary | |
-BinaryBuilder::BinaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool) | |
- : ArrayBuilder(type, pool), offsets_builder_(pool), value_data_builder_(pool) {} | |
- | |
-BinaryBuilder::BinaryBuilder(MemoryPool* pool) : BinaryBuilder(binary(), pool) {} | |
- | |
-Status BinaryBuilder::Resize(int64_t capacity) { | |
- if (capacity > kListMaximumElements) { | |
- return Status::CapacityError( | |
- "BinaryBuilder cannot reserve space for more then 2^31 - 1 child elements, got ", | |
- capacity); | |
- } | |
- RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); | |
- | |
- // one more then requested for offsets | |
- RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1)); | |
- return ArrayBuilder::Resize(capacity); | |
-} | |
- | |
-Status BinaryBuilder::ReserveData(int64_t elements) { | |
- const int64_t size = value_data_length() + elements; | |
- ARROW_RETURN_IF( | |
- size > kBinaryMemoryLimit, | |
- Status::CapacityError("Cannot reserve capacity larger than 2^31 - 1 for binary")); | |
- | |
- return (size > value_data_capacity()) ? value_data_builder_.Reserve(elements) | |
- : Status::OK(); | |
-} | |
- | |
-Status BinaryBuilder::AppendOverflow(int64_t num_bytes) { | |
- return Status::CapacityError("BinaryArray cannot contain more than ", | |
- kBinaryMemoryLimit, " bytes, have ", num_bytes); | |
-} | |
- | |
-Status BinaryBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) { | |
- // Write final offset (values length) | |
- RETURN_NOT_OK(AppendNextOffset()); | |
- | |
- // These buffers' padding zeroed by BufferBuilder | |
- std::shared_ptr<Buffer> offsets, value_data, null_bitmap; | |
- RETURN_NOT_OK(offsets_builder_.Finish(&offsets)); | |
- RETURN_NOT_OK(value_data_builder_.Finish(&value_data)); | |
- RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); | |
- | |
- *out = | |
- ArrayData::Make(type_, length_, {null_bitmap, offsets, value_data}, null_count_, 0); | |
- Reset(); | |
- return Status::OK(); | |
-} | |
- | |
-void BinaryBuilder::Reset() { | |
- ArrayBuilder::Reset(); | |
- offsets_builder_.Reset(); | |
- value_data_builder_.Reset(); | |
-} | |
- | |
-const uint8_t* BinaryBuilder::GetValue(int64_t i, int32_t* out_length) const { | |
- const int32_t* offsets = offsets_builder_.data(); | |
- int32_t offset = offsets[i]; | |
- if (i == (length_ - 1)) { | |
- *out_length = static_cast<int32_t>(value_data_builder_.length()) - offset; | |
- } else { | |
- *out_length = offsets[i + 1] - offset; | |
- } | |
- return value_data_builder_.data() + offset; | |
-} | |
- | |
-util::string_view BinaryBuilder::GetView(int64_t i) const { | |
- const int32_t* offsets = offsets_builder_.data(); | |
- int32_t offset = offsets[i]; | |
- int32_t value_length; | |
- if (i == (length_ - 1)) { | |
- value_length = static_cast<int32_t>(value_data_builder_.length()) - offset; | |
- } else { | |
- value_length = offsets[i + 1] - offset; | |
- } | |
- return util::string_view( | |
- reinterpret_cast<const char*>(value_data_builder_.data() + offset), value_length); | |
-} | |
+BinaryBuilder::BinaryBuilder(MemoryPool* pool) : BaseBinaryBuilder(binary(), pool) {} | |
StringBuilder::StringBuilder(MemoryPool* pool) : BinaryBuilder(utf8(), pool) {} | |
-Status StringBuilder::AppendValues(const std::vector<std::string>& values, | |
- const uint8_t* valid_bytes) { | |
- std::size_t total_length = std::accumulate( | |
- values.begin(), values.end(), 0ULL, | |
- [](uint64_t sum, const std::string& str) { return sum + str.size(); }); | |
- RETURN_NOT_OK(Reserve(values.size())); | |
- RETURN_NOT_OK(value_data_builder_.Reserve(total_length)); | |
- RETURN_NOT_OK(offsets_builder_.Reserve(values.size())); | |
- | |
- if (valid_bytes) { | |
- for (std::size_t i = 0; i < values.size(); ++i) { | |
- UnsafeAppendNextOffset(); | |
- if (valid_bytes[i]) { | |
- value_data_builder_.UnsafeAppend( | |
- reinterpret_cast<const uint8_t*>(values[i].data()), values[i].size()); | |
- } | |
- } | |
- } else { | |
- for (std::size_t i = 0; i < values.size(); ++i) { | |
- UnsafeAppendNextOffset(); | |
- value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i].data()), | |
- values[i].size()); | |
- } | |
- } | |
+LargeBinaryBuilder::LargeBinaryBuilder(MemoryPool* pool) | |
+ : BaseBinaryBuilder(large_binary(), pool) {} | |
- UnsafeAppendToBitmap(valid_bytes, values.size()); | |
- return Status::OK(); | |
-} | |
- | |
-Status StringBuilder::AppendValues(const char** values, int64_t length, | |
- const uint8_t* valid_bytes) { | |
- std::size_t total_length = 0; | |
- std::vector<std::size_t> value_lengths(length); | |
- bool have_null_value = false; | |
- for (int64_t i = 0; i < length; ++i) { | |
- if (values[i]) { | |
- auto value_length = strlen(values[i]); | |
- value_lengths[i] = value_length; | |
- total_length += value_length; | |
- } else { | |
- have_null_value = true; | |
- } | |
- } | |
- RETURN_NOT_OK(Reserve(length)); | |
- RETURN_NOT_OK(value_data_builder_.Reserve(total_length)); | |
- RETURN_NOT_OK(offsets_builder_.Reserve(length)); | |
- | |
- if (valid_bytes) { | |
- int64_t valid_bytes_offset = 0; | |
- for (int64_t i = 0; i < length; ++i) { | |
- UnsafeAppendNextOffset(); | |
- if (valid_bytes[i]) { | |
- if (values[i]) { | |
- value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]), | |
- value_lengths[i]); | |
- } else { | |
- UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, i - valid_bytes_offset); | |
- UnsafeAppendToBitmap(false); | |
- valid_bytes_offset = i + 1; | |
- } | |
- } | |
- } | |
- UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, length - valid_bytes_offset); | |
- } else { | |
- if (have_null_value) { | |
- std::vector<uint8_t> valid_vector(length, 0); | |
- for (int64_t i = 0; i < length; ++i) { | |
- UnsafeAppendNextOffset(); | |
- if (values[i]) { | |
- value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]), | |
- value_lengths[i]); | |
- valid_vector[i] = 1; | |
- } | |
- } | |
- UnsafeAppendToBitmap(valid_vector.data(), length); | |
- } else { | |
- for (int64_t i = 0; i < length; ++i) { | |
- UnsafeAppendNextOffset(); | |
- value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]), | |
- value_lengths[i]); | |
- } | |
- UnsafeAppendToBitmap(nullptr, length); | |
- } | |
- } | |
- return Status::OK(); | |
-} | |
+LargeStringBuilder::LargeStringBuilder(MemoryPool* pool) | |
+ : LargeBinaryBuilder(large_utf8(), pool) {} | |
// ---------------------------------------------------------------------- | |
// Fixed width binary | |
diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h | |
index 47d3bae4b..5bf4e747b 100644 | |
--- a/cpp/src/arrow/array/builder_binary.h | |
+++ b/cpp/src/arrow/array/builder_binary.h | |
@@ -17,8 +17,11 @@ | |
#pragma once | |
+#include <algorithm> | |
+#include <cstdint> | |
#include <limits> | |
#include <memory> | |
+#include <numeric> | |
#include <string> | |
#include <vector> | |
@@ -37,15 +40,16 @@ constexpr int64_t kBinaryMemoryLimit = std::numeric_limits<int32_t>::max() - 1; | |
// ---------------------------------------------------------------------- | |
// Binary and String | |
-/// \class BinaryBuilder | |
-/// \brief Builder class for variable-length binary data | |
-class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { | |
+template <typename TYPE> | |
+class BaseBinaryBuilder : public ArrayBuilder { | |
public: | |
- explicit BinaryBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); | |
+ using TypeClass = TYPE; | |
+ using offset_type = typename TypeClass::offset_type; | |
- BinaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool); | |
+ BaseBinaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool) | |
+ : ArrayBuilder(type, pool), offsets_builder_(pool), value_data_builder_(pool) {} | |
- Status Append(const uint8_t* value, int32_t length) { | |
+ Status Append(const uint8_t* value, offset_type length) { | |
ARROW_RETURN_NOT_OK(Reserve(1)); | |
ARROW_RETURN_NOT_OK(AppendNextOffset()); | |
// Safety check for UBSAN. | |
@@ -57,14 +61,22 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { | |
return Status::OK(); | |
} | |
+ Status Append(const char* value, offset_type length) { | |
+ return Append(reinterpret_cast<const uint8_t*>(value), length); | |
+ } | |
+ | |
+ Status Append(util::string_view value) { | |
+ return Append(value.data(), static_cast<offset_type>(value.size())); | |
+ } | |
+ | |
Status AppendNulls(int64_t length) final { | |
const int64_t num_bytes = value_data_builder_.length(); | |
- if (ARROW_PREDICT_FALSE(num_bytes > kBinaryMemoryLimit)) { | |
+ if (ARROW_PREDICT_FALSE(num_bytes > memory_limit())) { | |
return AppendOverflow(num_bytes); | |
} | |
ARROW_RETURN_NOT_OK(Reserve(length)); | |
for (int64_t i = 0; i < length; ++i) { | |
- offsets_builder_.UnsafeAppend(static_cast<int32_t>(num_bytes)); | |
+ offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes)); | |
} | |
UnsafeAppendToBitmap(length, false); | |
return Status::OK(); | |
@@ -77,56 +89,184 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { | |
return Status::OK(); | |
} | |
- Status Append(const char* value, int32_t length) { | |
- return Append(reinterpret_cast<const uint8_t*>(value), length); | |
- } | |
- | |
- Status Append(util::string_view value) { | |
- return Append(value.data(), static_cast<int32_t>(value.size())); | |
- } | |
- | |
/// \brief Append without checking capacity | |
/// | |
/// Offsets and data should have been presized using Reserve() and | |
/// ReserveData(), respectively. | |
- void UnsafeAppend(const uint8_t* value, int32_t length) { | |
+ void UnsafeAppend(const uint8_t* value, offset_type length) { | |
UnsafeAppendNextOffset(); | |
value_data_builder_.UnsafeAppend(value, length); | |
UnsafeAppendToBitmap(true); | |
} | |
- void UnsafeAppend(const char* value, int32_t length) { | |
+ void UnsafeAppend(const char* value, offset_type length) { | |
UnsafeAppend(reinterpret_cast<const uint8_t*>(value), length); | |
} | |
void UnsafeAppend(const std::string& value) { | |
- UnsafeAppend(value.c_str(), static_cast<int32_t>(value.size())); | |
+ UnsafeAppend(value.c_str(), static_cast<offset_type>(value.size())); | |
} | |
void UnsafeAppend(util::string_view value) { | |
- UnsafeAppend(value.data(), static_cast<int32_t>(value.size())); | |
+ UnsafeAppend(value.data(), static_cast<offset_type>(value.size())); | |
} | |
void UnsafeAppendNull() { | |
const int64_t num_bytes = value_data_builder_.length(); | |
- offsets_builder_.UnsafeAppend(static_cast<int32_t>(num_bytes)); | |
+ offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes)); | |
UnsafeAppendToBitmap(false); | |
} | |
- void Reset() override; | |
- Status Resize(int64_t capacity) override; | |
+ /// \brief Append a sequence of strings in one shot. | |
+ /// | |
+ /// \param[in] values a vector of strings | |
+ /// \param[in] valid_bytes an optional sequence of bytes where non-zero | |
+ /// indicates a valid (non-null) value | |
+ /// \return Status | |
+ Status AppendValues(const std::vector<std::string>& values, | |
+ const uint8_t* valid_bytes = NULLPTR) { | |
+ std::size_t total_length = std::accumulate( | |
+ values.begin(), values.end(), 0ULL, | |
+ [](uint64_t sum, const std::string& str) { return sum + str.size(); }); | |
+ ARROW_RETURN_NOT_OK(Reserve(values.size())); | |
+ ARROW_RETURN_NOT_OK(value_data_builder_.Reserve(total_length)); | |
+ ARROW_RETURN_NOT_OK(offsets_builder_.Reserve(values.size())); | |
+ | |
+ if (valid_bytes) { | |
+ for (std::size_t i = 0; i < values.size(); ++i) { | |
+ UnsafeAppendNextOffset(); | |
+ if (valid_bytes[i]) { | |
+ value_data_builder_.UnsafeAppend( | |
+ reinterpret_cast<const uint8_t*>(values[i].data()), values[i].size()); | |
+ } | |
+ } | |
+ } else { | |
+ for (std::size_t i = 0; i < values.size(); ++i) { | |
+ UnsafeAppendNextOffset(); | |
+ value_data_builder_.UnsafeAppend( | |
+ reinterpret_cast<const uint8_t*>(values[i].data()), values[i].size()); | |
+ } | |
+ } | |
+ | |
+ UnsafeAppendToBitmap(valid_bytes, values.size()); | |
+ return Status::OK(); | |
+ } | |
+ | |
+ /// \brief Append a sequence of nul-terminated strings in one shot. | |
+ /// If one of the values is NULL, it is processed as a null | |
+ /// value even if the corresponding valid_bytes entry is 1. | |
+ /// | |
+ /// \param[in] values a contiguous C array of nul-terminated char * | |
+ /// \param[in] length the number of values to append | |
+ /// \param[in] valid_bytes an optional sequence of bytes where non-zero | |
+ /// indicates a valid (non-null) value | |
+ /// \return Status | |
+ Status AppendValues(const char** values, int64_t length, | |
+ const uint8_t* valid_bytes = NULLPTR) { | |
+ std::size_t total_length = 0; | |
+ std::vector<std::size_t> value_lengths(length); | |
+ bool have_null_value = false; | |
+ for (int64_t i = 0; i < length; ++i) { | |
+ if (values[i]) { | |
+ auto value_length = strlen(values[i]); | |
+ value_lengths[i] = value_length; | |
+ total_length += value_length; | |
+ } else { | |
+ have_null_value = true; | |
+ } | |
+ } | |
+ ARROW_RETURN_NOT_OK(Reserve(length)); | |
+ ARROW_RETURN_NOT_OK(value_data_builder_.Reserve(total_length)); | |
+ ARROW_RETURN_NOT_OK(offsets_builder_.Reserve(length)); | |
+ | |
+ if (valid_bytes) { | |
+ int64_t valid_bytes_offset = 0; | |
+ for (int64_t i = 0; i < length; ++i) { | |
+ UnsafeAppendNextOffset(); | |
+ if (valid_bytes[i]) { | |
+ if (values[i]) { | |
+ value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]), | |
+ value_lengths[i]); | |
+ } else { | |
+ UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, | |
+ i - valid_bytes_offset); | |
+ UnsafeAppendToBitmap(false); | |
+ valid_bytes_offset = i + 1; | |
+ } | |
+ } | |
+ } | |
+ UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, length - valid_bytes_offset); | |
+ } else { | |
+ if (have_null_value) { | |
+ std::vector<uint8_t> valid_vector(length, 0); | |
+ for (int64_t i = 0; i < length; ++i) { | |
+ UnsafeAppendNextOffset(); | |
+ if (values[i]) { | |
+ value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]), | |
+ value_lengths[i]); | |
+ valid_vector[i] = 1; | |
+ } | |
+ } | |
+ UnsafeAppendToBitmap(valid_vector.data(), length); | |
+ } else { | |
+ for (int64_t i = 0; i < length; ++i) { | |
+ UnsafeAppendNextOffset(); | |
+ value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]), | |
+ value_lengths[i]); | |
+ } | |
+ UnsafeAppendToBitmap(NULLPTR, length); | |
+ } | |
+ } | |
+ return Status::OK(); | |
+ } | |
+ | |
+ void Reset() override { | |
+ ArrayBuilder::Reset(); | |
+ offsets_builder_.Reset(); | |
+ value_data_builder_.Reset(); | |
+ } | |
+ | |
+ Status Resize(int64_t capacity) override { | |
+ if (capacity > kListMaximumElements) { | |
+ return Status::CapacityError( | |
+ "BinaryBuilder cannot reserve space for more than 2^31 - 1 child elements, " | |
+ "got ", | |
+ capacity); | |
+ } | |
+ ARROW_RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); | |
+ | |
+ // One more than requested for offsets | |
+ ARROW_RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1)); | |
+ return ArrayBuilder::Resize(capacity); | |
+ } | |
/// \brief Ensures there is enough allocated capacity to append the indicated | |
/// number of bytes to the value data buffer without additional allocations | |
- Status ReserveData(int64_t elements); | |
+ Status ReserveData(int64_t elements) { | |
+ const int64_t size = value_data_length() + elements; | |
+ ARROW_RETURN_IF(size > memory_limit(), | |
+ Status::CapacityError("Cannot reserve capacity larger than ", | |
+ memory_limit(), " bytes")); | |
+ | |
+ return (size > value_data_capacity()) ? value_data_builder_.Reserve(elements) | |
+ : Status::OK(); | |
+ } | |
- Status FinishInternal(std::shared_ptr<ArrayData>* out) override; | |
+ Status FinishInternal(std::shared_ptr<ArrayData>* out) override { | |
+ // Write final offset (values length) | |
+ ARROW_RETURN_NOT_OK(AppendNextOffset()); | |
- /// \cond FALSE | |
- using ArrayBuilder::Finish; | |
- /// \endcond | |
+ // These buffers' padding zeroed by BufferBuilder | |
+ std::shared_ptr<Buffer> offsets, value_data, null_bitmap; | |
+ ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets)); | |
+ ARROW_RETURN_NOT_OK(value_data_builder_.Finish(&value_data)); | |
+ ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); | |
- Status Finish(std::shared_ptr<BinaryArray>* out) { return FinishTyped(out); } | |
+ *out = ArrayData::Make(type_, length_, {null_bitmap, offsets, value_data}, | |
+ null_count_, 0); | |
+ Reset(); | |
+ return Status::OK(); | |
+ } | |
/// \return size of values buffer so far | |
int64_t value_data_length() const { return value_data_builder_.length(); } | |
@@ -136,33 +276,77 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { | |
/// Temporary access to a value. | |
/// | |
/// This pointer becomes invalid on the next modifying operation. | |
- const uint8_t* GetValue(int64_t i, int32_t* out_length) const; | |
+ const uint8_t* GetValue(int64_t i, offset_type* out_length) const { | |
+ const offset_type* offsets = offsets_builder_.data(); | |
+ const auto offset = offsets[i]; | |
+ if (i == (length_ - 1)) { | |
+ *out_length = static_cast<offset_type>(value_data_builder_.length()) - offset; | |
+ } else { | |
+ *out_length = offsets[i + 1] - offset; | |
+ } | |
+ return value_data_builder_.data() + offset; | |
+ } | |
/// Temporary access to a value. | |
/// | |
/// This view becomes invalid on the next modifying operation. | |
- util::string_view GetView(int64_t i) const; | |
+ util::string_view GetView(int64_t i) const { | |
+ const offset_type* offsets = offsets_builder_.data(); | |
+ const auto offset = offsets[i]; | |
+ offset_type value_length; | |
+ if (i == (length_ - 1)) { | |
+ value_length = static_cast<offset_type>(value_data_builder_.length()) - offset; | |
+ } else { | |
+ value_length = offsets[i + 1] - offset; | |
+ } | |
+ return util::string_view( | |
+ reinterpret_cast<const char*>(value_data_builder_.data() + offset), value_length); | |
+ } | |
protected: | |
- TypedBufferBuilder<int32_t> offsets_builder_; | |
+ TypedBufferBuilder<offset_type> offsets_builder_; | |
TypedBufferBuilder<uint8_t> value_data_builder_; | |
- Status AppendOverflow(int64_t num_bytes); | |
+ Status AppendOverflow(int64_t num_bytes) { | |
+ return Status::CapacityError("array cannot contain more than ", memory_limit(), | |
+ " bytes, have ", num_bytes); | |
+ } | |
Status AppendNextOffset() { | |
const int64_t num_bytes = value_data_builder_.length(); | |
- if (ARROW_PREDICT_FALSE(num_bytes > kBinaryMemoryLimit)) { | |
+ if (ARROW_PREDICT_FALSE(num_bytes > memory_limit())) { | |
return AppendOverflow(num_bytes); | |
} | |
- return offsets_builder_.Append(static_cast<int32_t>(num_bytes)); | |
+ return offsets_builder_.Append(static_cast<offset_type>(num_bytes)); | |
} | |
void UnsafeAppendNextOffset() { | |
const int64_t num_bytes = value_data_builder_.length(); | |
- offsets_builder_.UnsafeAppend(static_cast<int32_t>(num_bytes)); | |
+ offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes)); | |
+ } | |
+ | |
+ // Cannot make this a static attribute because of linking issues | |
+ static constexpr int64_t memory_limit() { | |
+ return std::numeric_limits<offset_type>::max() - 1; | |
} | |
}; | |
+/// \class BinaryBuilder | |
+/// \brief Builder class for variable-length binary data | |
+class ARROW_EXPORT BinaryBuilder : public BaseBinaryBuilder<BinaryType> { | |
+ public: | |
+ explicit BinaryBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); | |
+ | |
+ /// \cond FALSE | |
+ using ArrayBuilder::Finish; | |
+ /// \endcond | |
+ | |
+ Status Finish(std::shared_ptr<BinaryArray>* out) { return FinishTyped(out); } | |
+ | |
+ protected: | |
+ using BaseBinaryBuilder::BaseBinaryBuilder; | |
+}; | |
+ | |
/// \class StringBuilder | |
/// \brief Builder class for UTF8 strings | |
class ARROW_EXPORT StringBuilder : public BinaryBuilder { | |
@@ -170,36 +354,41 @@ class ARROW_EXPORT StringBuilder : public BinaryBuilder { | |
using BinaryBuilder::BinaryBuilder; | |
explicit StringBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); | |
- using BinaryBuilder::Append; | |
- using BinaryBuilder::Reset; | |
- using BinaryBuilder::UnsafeAppend; | |
+ /// \cond FALSE | |
+ using ArrayBuilder::Finish; | |
+ /// \endcond | |
- /// \brief Append a sequence of strings in one shot. | |
- /// | |
- /// \param[in] values a vector of strings | |
- /// \param[in] valid_bytes an optional sequence of bytes where non-zero | |
- /// indicates a valid (non-null) value | |
- /// \return Status | |
- Status AppendValues(const std::vector<std::string>& values, | |
- const uint8_t* valid_bytes = NULLPTR); | |
+ Status Finish(std::shared_ptr<StringArray>* out) { return FinishTyped(out); } | |
+}; | |
- /// \brief Append a sequence of nul-terminated strings in one shot. | |
- /// If one of the values is NULL, it is processed as a null | |
- /// value even if the corresponding valid_bytes entry is 1. | |
- /// | |
- /// \param[in] values a contiguous C array of nul-terminated char * | |
- /// \param[in] length the number of values to append | |
- /// \param[in] valid_bytes an optional sequence of bytes where non-zero | |
- /// indicates a valid (non-null) value | |
- /// \return Status | |
- Status AppendValues(const char** values, int64_t length, | |
- const uint8_t* valid_bytes = NULLPTR); | |
+/// \class LargeBinaryBuilder | |
+/// \brief Builder class for large variable-length binary data | |
+class ARROW_EXPORT LargeBinaryBuilder : public BaseBinaryBuilder<LargeBinaryType> { | |
+ public: | |
+ explicit LargeBinaryBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); | |
/// \cond FALSE | |
using ArrayBuilder::Finish; | |
/// \endcond | |
- Status Finish(std::shared_ptr<StringArray>* out) { return FinishTyped(out); } | |
+ Status Finish(std::shared_ptr<LargeBinaryArray>* out) { return FinishTyped(out); } | |
+ | |
+ protected: | |
+ using BaseBinaryBuilder::BaseBinaryBuilder; | |
+}; | |
+ | |
+/// \class LargeStringBuilder | |
+/// \brief Builder class for large UTF8 strings | |
+class ARROW_EXPORT LargeStringBuilder : public LargeBinaryBuilder { | |
+ public: | |
+ using LargeBinaryBuilder::LargeBinaryBuilder; | |
+ explicit LargeStringBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); | |
+ | |
+ /// \cond FALSE | |
+ using ArrayBuilder::Finish; | |
+ /// \endcond | |
+ | |
+ Status Finish(std::shared_ptr<LargeStringArray>* out) { return FinishTyped(out); } | |
}; | |
// ---------------------------------------------------------------------- | |
diff --git a/cpp/src/arrow/array/concatenate-test.cc b/cpp/src/arrow/array/concatenate-test.cc | |
index cf105ceb6..730b25ab8 100644 | |
--- a/cpp/src/arrow/array/concatenate-test.cc | |
+++ b/cpp/src/arrow/array/concatenate-test.cc | |
@@ -48,10 +48,11 @@ class ConcatenateTest : public ::testing::Test { | |
sizes_({0, 1, 2, 4, 16, 31, 1234}), | |
null_probabilities_({0.0, 0.1, 0.5, 0.9, 1.0}) {} | |
- std::vector<int32_t> Offsets(int32_t length, int32_t slice_count) { | |
- std::vector<int32_t> offsets(static_cast<std::size_t>(slice_count + 1)); | |
+ template <typename OffsetType> | |
+ std::vector<OffsetType> Offsets(int32_t length, int32_t slice_count) { | |
+ std::vector<OffsetType> offsets(static_cast<std::size_t>(slice_count + 1)); | |
std::default_random_engine gen(seed_); | |
- std::uniform_int_distribution<int32_t> dist(0, length); | |
+ std::uniform_int_distribution<OffsetType> dist(0, length); | |
std::generate(offsets.begin(), offsets.end(), [&] { return dist(gen); }); | |
std::sort(offsets.begin(), offsets.end()); | |
return offsets; | |
@@ -85,7 +86,7 @@ class ConcatenateTest : public ::testing::Test { | |
template <typename ArrayFactory> | |
void Check(ArrayFactory&& factory) { | |
for (auto size : this->sizes_) { | |
- auto offsets = this->Offsets(size, 3); | |
+ auto offsets = this->Offsets<int32_t>(size, 3); | |
for (auto null_probability : this->null_probabilities_) { | |
std::shared_ptr<Array> array; | |
factory(size, null_probability, &array); | |
@@ -146,16 +147,16 @@ TYPED_TEST(PrimitiveConcatenateTest, Primitives) { | |
TEST_F(ConcatenateTest, StringType) { | |
Check([this](int32_t size, double null_probability, std::shared_ptr<Array>* out) { | |
- auto values_size = size * 4; | |
- auto char_array = this->GeneratePrimitive<Int8Type>(values_size, null_probability); | |
- std::shared_ptr<Buffer> offsets; | |
- auto offsets_vector = this->Offsets(values_size, size); | |
- // ensure the first offset is 0, which is expected for StringType | |
- offsets_vector[0] = 0; | |
- ASSERT_OK(CopyBufferFromVector(offsets_vector, default_memory_pool(), &offsets)); | |
- *out = MakeArray(ArrayData::Make( | |
- utf8(), size, | |
- {char_array->data()->buffers[0], offsets, char_array->data()->buffers[1]})); | |
+ *out = rng_.String(size, /*min_length =*/0, /*max_length =*/15, null_probability); | |
+ ASSERT_OK(ValidateArray(**out)); | |
+ }); | |
+} | |
+ | |
+TEST_F(ConcatenateTest, LargeStringType) { | |
+ Check([this](int32_t size, double null_probability, std::shared_ptr<Array>* out) { | |
+ *out = | |
+ rng_.LargeString(size, /*min_length =*/0, /*max_length =*/15, null_probability); | |
+ ASSERT_OK(ValidateArray(**out)); | |
}); | |
} | |
@@ -163,7 +164,7 @@ TEST_F(ConcatenateTest, ListType) { | |
Check([this](int32_t size, double null_probability, std::shared_ptr<Array>* out) { | |
auto values_size = size * 4; | |
auto values = this->GeneratePrimitive<Int8Type>(values_size, null_probability); | |
- auto offsets_vector = this->Offsets(values_size, size); | |
+ auto offsets_vector = this->Offsets<int32_t>(values_size, size); | |
// ensure the first offset is 0, which is expected for ListType | |
offsets_vector[0] = 0; | |
std::shared_ptr<Array> offsets; | |
diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc | |
index 60da0d3f8..a20b157ac 100644 | |
--- a/cpp/src/arrow/array/concatenate.cc | |
+++ b/cpp/src/arrow/array/concatenate.cc | |
@@ -184,14 +184,21 @@ class ConcatenateImpl { | |
Status Visit(const BinaryType&) { | |
std::vector<Range> value_ranges; | |
- RETURN_NOT_OK(ConcatenateOffsets<int32_t>(Buffers(1, *offset_type), pool_, | |
+ RETURN_NOT_OK(ConcatenateOffsets<int32_t>(Buffers(1, sizeof(int32_t)), pool_, | |
+ &out_.buffers[1], &value_ranges)); | |
+ return ConcatenateBuffers(Buffers(2, value_ranges), pool_, &out_.buffers[2]); | |
+ } | |
+ | |
+ Status Visit(const LargeBinaryType&) { | |
+ std::vector<Range> value_ranges; | |
+ RETURN_NOT_OK(ConcatenateOffsets<int64_t>(Buffers(1, sizeof(int64_t)), pool_, | |
&out_.buffers[1], &value_ranges)); | |
return ConcatenateBuffers(Buffers(2, value_ranges), pool_, &out_.buffers[2]); | |
} | |
Status Visit(const ListType&) { | |
std::vector<Range> value_ranges; | |
- RETURN_NOT_OK(ConcatenateOffsets<int32_t>(Buffers(1, *offset_type), pool_, | |
+ RETURN_NOT_OK(ConcatenateOffsets<int32_t>(Buffers(1, sizeof(int32_t)), pool_, | |
&out_.buffers[1], &value_ranges)); | |
return ConcatenateImpl(ChildData(0, value_ranges), pool_) | |
.Concatenate(out_.child_data[0].get()); | |
@@ -277,13 +284,11 @@ class ConcatenateImpl { | |
} | |
// Gather the index-th buffer of each input into a vector. | |
- // Buffers are assumed to contain elements of fixed.bit_width(), | |
+ // Buffers are assumed to contain elements of the given byte_width, | |
// those elements are sliced with that input's offset and length. | |
// Note that BufferVector will not contain the buffer of in_[i] if it's | |
// nullptr. | |
- BufferVector Buffers(size_t index, const FixedWidthType& fixed) { | |
- DCHECK_EQ(fixed.bit_width() % 8, 0); | |
- auto byte_width = fixed.bit_width() / 8; | |
+ BufferVector Buffers(size_t index, int byte_width) { | |
BufferVector buffers; | |
buffers.reserve(in_.size()); | |
for (const ArrayData& array_data : in_) { | |
@@ -296,6 +301,16 @@ class ConcatenateImpl { | |
return buffers; | |
} | |
+ // Gather the index-th buffer of each input into a vector. | |
+ // Buffers are assumed to contain elements of fixed.bit_width(), | |
+ // those elements are sliced with that input's offset and length. | |
+ // Note that BufferVector will not contain the buffer of in_[i] if it's | |
+ // nullptr. | |
+ BufferVector Buffers(size_t index, const FixedWidthType& fixed) { | |
+ DCHECK_EQ(fixed.bit_width() % 8, 0); | |
+ return Buffers(index, fixed.bit_width() / 8); | |
+ } | |
+ | |
// Gather the index-th buffer of each input as a Bitmap | |
// into a vector of Bitmaps. | |
std::vector<Bitmap> Bitmaps(size_t index) { | |
@@ -328,15 +343,11 @@ class ConcatenateImpl { | |
return child_data; | |
} | |
- static const std::shared_ptr<FixedWidthType> offset_type; | |
const std::vector<ArrayData>& in_; | |
MemoryPool* pool_; | |
ArrayData out_; | |
}; | |
-const std::shared_ptr<FixedWidthType> ConcatenateImpl::offset_type = | |
- std::static_pointer_cast<FixedWidthType>(int32()); | |
- | |
Status Concatenate(const ArrayVector& arrays, MemoryPool* pool, | |
std::shared_ptr<Array>* out) { | |
if (arrays.size() == 0) { | |
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc | |
index cee443c48..44b0d041b 100644 | |
--- a/cpp/src/arrow/builder.cc | |
+++ b/cpp/src/arrow/builder.cc | |
@@ -107,6 +107,8 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type, | |
BUILDER_CASE(DOUBLE, DoubleBuilder); | |
BUILDER_CASE(STRING, StringBuilder); | |
BUILDER_CASE(BINARY, BinaryBuilder); | |
+ BUILDER_CASE(LARGE_STRING, LargeStringBuilder); | |
+ BUILDER_CASE(LARGE_BINARY, LargeBinaryBuilder); | |
BUILDER_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryBuilder); | |
BUILDER_CASE(DECIMAL, Decimal128Builder); | |
case Type::DICTIONARY: { | |
diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc | |
index 097bc8f76..590ab6e4a 100644 | |
--- a/cpp/src/arrow/compare.cc | |
+++ b/cpp/src/arrow/compare.cc | |
@@ -144,8 +144,9 @@ class RangeEqualsVisitor { | |
return Status::OK(); | |
} | |
- bool CompareBinaryRange(const BinaryArray& left) const { | |
- const auto& right = checked_cast<const BinaryArray&>(right_); | |
+ template <typename ArrayType> | |
+ bool CompareBinaryRange(const ArrayType& left) const { | |
+ const auto& right = checked_cast<const ArrayType&>(right_); | |
for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_; | |
++i, ++o_i) { | |
@@ -154,10 +155,10 @@ class RangeEqualsVisitor { | |
return false; | |
} | |
if (is_null) continue; | |
- const int32_t begin_offset = left.value_offset(i); | |
- const int32_t end_offset = left.value_offset(i + 1); | |
- const int32_t right_begin_offset = right.value_offset(o_i); | |
- const int32_t right_end_offset = right.value_offset(o_i + 1); | |
+ const auto begin_offset = left.value_offset(i); | |
+ const auto end_offset = left.value_offset(i + 1); | |
+ const auto right_begin_offset = right.value_offset(o_i); | |
+ const auto right_end_offset = right.value_offset(o_i + 1); | |
// Underlying can't be equal if the size isn't equal | |
if (end_offset - begin_offset != right_end_offset - right_begin_offset) { | |
return false; | |
@@ -278,6 +279,11 @@ class RangeEqualsVisitor { | |
return Status::OK(); | |
} | |
+ Status Visit(const LargeBinaryArray& left) { | |
+ result_ = CompareBinaryRange(left); | |
+ return Status::OK(); | |
+ } | |
+ | |
Status Visit(const FixedSizeBinaryArray& left) { | |
const auto& right = checked_cast<const FixedSizeBinaryArray&>(right_); | |
@@ -489,18 +495,21 @@ class ArrayEqualsVisitor : public RangeEqualsVisitor { | |
template <typename ArrayType> | |
bool ValueOffsetsEqual(const ArrayType& left) { | |
+ using offset_type = typename ArrayType::offset_type; | |
+ | |
const auto& right = checked_cast<const ArrayType&>(right_); | |
if (left.offset() == 0 && right.offset() == 0) { | |
return left.value_offsets()->Equals(*right.value_offsets(), | |
- (left.length() + 1) * sizeof(int32_t)); | |
+ (left.length() + 1) * sizeof(offset_type)); | |
} else { | |
// One of the arrays is sliced; logic is more complicated because the | |
// value offsets are not both 0-based | |
auto left_offsets = | |
- reinterpret_cast<const int32_t*>(left.value_offsets()->data()) + left.offset(); | |
+ reinterpret_cast<const offset_type*>(left.value_offsets()->data()) + | |
+ left.offset(); | |
auto right_offsets = | |
- reinterpret_cast<const int32_t*>(right.value_offsets()->data()) + | |
+ reinterpret_cast<const offset_type*>(right.value_offsets()->data()) + | |
right.offset(); | |
for (int64_t i = 0; i < left.length() + 1; ++i) { | |
@@ -512,10 +521,11 @@ class ArrayEqualsVisitor : public RangeEqualsVisitor { | |
} | |
} | |
- bool CompareBinary(const BinaryArray& left) { | |
- const auto& right = checked_cast<const BinaryArray&>(right_); | |
+ template <typename ArrayType> | |
+ bool CompareBinary(const ArrayType& left) { | |
+ const auto& right = checked_cast<const ArrayType&>(right_); | |
- bool equal_offsets = ValueOffsetsEqual<BinaryArray>(left); | |
+ bool equal_offsets = ValueOffsetsEqual<ArrayType>(left); | |
if (!equal_offsets) { | |
return false; | |
} | |
@@ -544,8 +554,8 @@ class ArrayEqualsVisitor : public RangeEqualsVisitor { | |
} | |
} else { | |
// ARROW-537: Only compare data in non-null slots | |
- const int32_t* left_offsets = left.raw_value_offsets(); | |
- const int32_t* right_offsets = right.raw_value_offsets(); | |
+ auto left_offsets = left.raw_value_offsets(); | |
+ auto right_offsets = right.raw_value_offsets(); | |
for (int64_t i = 0; i < left.length(); ++i) { | |
if (left.IsNull(i)) { | |
continue; | |
@@ -564,6 +574,11 @@ class ArrayEqualsVisitor : public RangeEqualsVisitor { | |
return Status::OK(); | |
} | |
+ Status Visit(const LargeBinaryArray& left) { | |
+ result_ = CompareBinary(left); | |
+ return Status::OK(); | |
+ } | |
+ | |
Status Visit(const ListArray& left) { | |
const auto& right = checked_cast<const ListArray&>(right_); | |
bool equal_offsets = ValueOffsetsEqual<ListArray>(left); | |
@@ -822,6 +837,15 @@ class ScalarEqualsVisitor { | |
return Status::OK(); | |
} | |
+ template <typename T> | |
+ typename std::enable_if<std::is_base_of<LargeBinaryScalar, T>::value, Status>::type | |
+ Visit(const T& left_) { | |
+ const auto& left = checked_cast<const LargeBinaryScalar&>(left_); | |
+ const auto& right = checked_cast<const LargeBinaryScalar&>(right_); | |
+ result_ = internal::SharedPtrEquals(left.value, right.value); | |
+ return Status::OK(); | |
+ } | |
+ | |
Status Visit(const Decimal128Scalar& left) { | |
const auto& right = checked_cast<const Decimal128Scalar&>(right_); | |
result_ = left.value == right.value; | |
diff --git a/cpp/src/arrow/compute/kernels/cast-test.cc b/cpp/src/arrow/compute/kernels/cast-test.cc | |
index 6bf4f9417..80538f20e 100644 | |
--- a/cpp/src/arrow/compute/kernels/cast-test.cc | |
+++ b/cpp/src/arrow/compute/kernels/cast-test.cc | |
@@ -52,6 +52,8 @@ namespace compute { | |
using internal::checked_cast; | |
+static constexpr const char* kInvalidUtf8 = "\xa0\xa1"; | |
+ | |
static std::vector<std::shared_ptr<DataType>> kNumericTypes = { | |
uint8(), int8(), uint16(), int16(), uint32(), | |
int32(), uint64(), int64(), float32(), float64()}; | |
@@ -131,6 +133,132 @@ class TestCast : public ComputeFixture, public TestBase { | |
CheckPass(*input->Slice(1), *expected->Slice(1), out_type, options); | |
} | |
} | |
+ | |
+ template <typename SourceType, typename DestType> | |
+ void TestCastBinaryToString() { | |
+ CastOptions options; | |
+ auto src_type = TypeTraits<SourceType>::type_singleton(); | |
+ auto dest_type = TypeTraits<DestType>::type_singleton(); | |
+ | |
+ // All valid except the last one | |
+ std::vector<bool> all = {1, 1, 1, 1, 1}; | |
+ std::vector<bool> valid = {1, 1, 1, 1, 0}; | |
+ std::vector<std::string> strings = {"Hi", "olá mundo", "你好世界", "", kInvalidUtf8}; | |
+ | |
+ std::shared_ptr<Array> array; | |
+ | |
+ // Should accept when invalid but null. | |
+ ArrayFromVector<SourceType, std::string>(src_type, valid, strings, &array); | |
+ CheckZeroCopy(*array, dest_type); | |
+ | |
+ // Should refuse due to invalid utf8 payload | |
+ CheckFails<SourceType, std::string>(src_type, strings, all, dest_type, options); | |
+ | |
+ // Should accept due to option override | |
+ options.allow_invalid_utf8 = true; | |
+ CheckCase<SourceType, std::string, DestType, std::string>( | |
+ src_type, strings, all, dest_type, strings, options); | |
+ } | |
+ | |
+ template <typename SourceType> | |
+ void TestCastStringToNumber() { | |
+ CastOptions options; | |
+ auto src_type = TypeTraits<SourceType>::type_singleton(); | |
+ | |
+ std::vector<bool> is_valid = {true, false, true, true, true}; | |
+ | |
+ // string to int | |
+ std::vector<std::string> v_int = {"0", "1", "127", "-1", "0"}; | |
+ std::vector<int8_t> e_int8 = {0, 1, 127, -1, 0}; | |
+ std::vector<int16_t> e_int16 = {0, 1, 127, -1, 0}; | |
+ std::vector<int32_t> e_int32 = {0, 1, 127, -1, 0}; | |
+ std::vector<int64_t> e_int64 = {0, 1, 127, -1, 0}; | |
+ CheckCase<SourceType, std::string, Int8Type, int8_t>(src_type, v_int, is_valid, | |
+ int8(), e_int8, options); | |
+ CheckCase<SourceType, std::string, Int16Type, int16_t>(src_type, v_int, is_valid, | |
+ int16(), e_int16, options); | |
+ CheckCase<SourceType, std::string, Int32Type, int32_t>(src_type, v_int, is_valid, | |
+ int32(), e_int32, options); | |
+ CheckCase<SourceType, std::string, Int64Type, int64_t>(src_type, v_int, is_valid, | |
+ int64(), e_int64, options); | |
+ | |
+ v_int = {"2147483647", "0", "-2147483648", "0", "0"}; | |
+ e_int32 = {2147483647, 0, -2147483648LL, 0, 0}; | |
+ CheckCase<SourceType, std::string, Int32Type, int32_t>(src_type, v_int, is_valid, | |
+ int32(), e_int32, options); | |
+ v_int = {"9223372036854775807", "0", "-9223372036854775808", "0", "0"}; | |
+ e_int64 = {9223372036854775807LL, 0, (-9223372036854775807LL - 1), 0, 0}; | |
+ CheckCase<SourceType, std::string, Int64Type, int64_t>(src_type, v_int, is_valid, | |
+ int64(), e_int64, options); | |
+ | |
+ // string to uint | |
+ std::vector<std::string> v_uint = {"0", "1", "127", "255", "0"}; | |
+ std::vector<uint8_t> e_uint8 = {0, 1, 127, 255, 0}; | |
+ std::vector<uint16_t> e_uint16 = {0, 1, 127, 255, 0}; | |
+ std::vector<uint32_t> e_uint32 = {0, 1, 127, 255, 0}; | |
+ std::vector<uint64_t> e_uint64 = {0, 1, 127, 255, 0}; | |
+ CheckCase<SourceType, std::string, UInt8Type, uint8_t>(src_type, v_uint, is_valid, | |
+ uint8(), e_uint8, options); | |
+ CheckCase<SourceType, std::string, UInt16Type, uint16_t>(src_type, v_uint, is_valid, | |
+ uint16(), e_uint16, options); | |
+ CheckCase<SourceType, std::string, UInt32Type, uint32_t>(src_type, v_uint, is_valid, | |
+ uint32(), e_uint32, options); | |
+ CheckCase<SourceType, std::string, UInt64Type, uint64_t>(src_type, v_uint, is_valid, | |
+ uint64(), e_uint64, options); | |
+ | |
+ v_uint = {"4294967295", "0", "0", "0", "0"}; | |
+ e_uint32 = {4294967295, 0, 0, 0, 0}; | |
+ CheckCase<SourceType, std::string, UInt32Type, uint32_t>(src_type, v_uint, is_valid, | |
+ uint32(), e_uint32, options); | |
+ v_uint = {"18446744073709551615", "0", "0", "0", "0"}; | |
+ e_uint64 = {18446744073709551615ULL, 0, 0, 0, 0}; | |
+ CheckCase<SourceType, std::string, UInt64Type, uint64_t>(src_type, v_uint, is_valid, | |
+ uint64(), e_uint64, options); | |
+ | |
+ // string to float | |
+ std::vector<std::string> v_float = {"0.1", "1.2", "127.3", "200.4", "0.5"}; | |
+ std::vector<float> e_float = {0.1f, 1.2f, 127.3f, 200.4f, 0.5f}; | |
+ std::vector<double> e_double = {0.1, 1.2, 127.3, 200.4, 0.5}; | |
+ CheckCase<SourceType, std::string, FloatType, float>(src_type, v_float, is_valid, | |
+ float32(), e_float, options); | |
+ CheckCase<SourceType, std::string, DoubleType, double>(src_type, v_float, is_valid, | |
+ float64(), e_double, options); | |
+ | |
+ // Test that casting is locale-independent | |
+ auto global_locale = std::locale(); | |
+ try { | |
+ // French locale uses the comma as decimal point | |
+ std::locale::global(std::locale("fr_FR.UTF-8")); | |
+ } catch (std::runtime_error&) { | |
+ // Locale unavailable, ignore | |
+ } | |
+ CheckCase<SourceType, std::string, FloatType, float>(src_type, v_float, is_valid, | |
+ float32(), e_float, options); | |
+ CheckCase<SourceType, std::string, DoubleType, double>(src_type, v_float, is_valid, | |
+ float64(), e_double, options); | |
+ std::locale::global(global_locale); | |
+ } | |
+ | |
+ template <typename SourceType> | |
+ void TestCastStringToTimestamp() { | |
+ CastOptions options; | |
+ auto src_type = TypeTraits<SourceType>::type_singleton(); | |
+ | |
+ std::vector<bool> is_valid = {true, false, true}; | |
+ std::vector<std::string> strings = {"1970-01-01", "xxx", "2000-02-29"}; | |
+ | |
+ auto type = timestamp(TimeUnit::SECOND); | |
+ std::vector<int64_t> e = {0, 0, 951782400}; | |
+ CheckCase<SourceType, std::string, TimestampType, int64_t>( | |
+ src_type, strings, is_valid, type, e, options); | |
+ | |
+ type = timestamp(TimeUnit::MICRO); | |
+ e = {0, 0, 951782400000000LL}; | |
+ CheckCase<SourceType, std::string, TimestampType, int64_t>( | |
+ src_type, strings, is_valid, type, e, options); | |
+ | |
+ // NOTE: timestamp parsing is tested comprehensively in parsing-util-test.cc | |
+ } | |
}; | |
TEST_F(TestCast, SameTypeZeroCopy) { | |
@@ -922,6 +1050,10 @@ TEST_F(TestCast, StringToBoolean) { | |
e, options); | |
CheckCase<StringType, std::string, BooleanType, bool>(utf8(), v2, is_valid, boolean(), | |
e, options); | |
+ | |
+ // Same with LargeStringType | |
+ CheckCase<LargeStringType, std::string, BooleanType, bool>(large_utf8(), v1, is_valid, | |
+ boolean(), e, options); | |
} | |
TEST_F(TestCast, StringToBooleanErrors) { | |
@@ -931,84 +1063,13 @@ TEST_F(TestCast, StringToBooleanErrors) { | |
CheckFails<StringType, std::string>(utf8(), {"false "}, is_valid, boolean(), options); | |
CheckFails<StringType, std::string>(utf8(), {"T"}, is_valid, boolean(), options); | |
+ CheckFails<LargeStringType, std::string>(large_utf8(), {"T"}, is_valid, boolean(), | |
+ options); | |
} | |
-TEST_F(TestCast, StringToNumber) { | |
- CastOptions options; | |
+TEST_F(TestCast, StringToNumber) { TestCastStringToNumber<StringType>(); } | |
- std::vector<bool> is_valid = {true, false, true, true, true}; | |
- | |
- // string to int | |
- std::vector<std::string> v_int = {"0", "1", "127", "-1", "0"}; | |
- std::vector<int8_t> e_int8 = {0, 1, 127, -1, 0}; | |
- std::vector<int16_t> e_int16 = {0, 1, 127, -1, 0}; | |
- std::vector<int32_t> e_int32 = {0, 1, 127, -1, 0}; | |
- std::vector<int64_t> e_int64 = {0, 1, 127, -1, 0}; | |
- CheckCase<StringType, std::string, Int8Type, int8_t>(utf8(), v_int, is_valid, int8(), | |
- e_int8, options); | |
- CheckCase<StringType, std::string, Int16Type, int16_t>(utf8(), v_int, is_valid, int16(), | |
- e_int16, options); | |
- CheckCase<StringType, std::string, Int32Type, int32_t>(utf8(), v_int, is_valid, int32(), | |
- e_int32, options); | |
- CheckCase<StringType, std::string, Int64Type, int64_t>(utf8(), v_int, is_valid, int64(), | |
- e_int64, options); | |
- | |
- v_int = {"2147483647", "0", "-2147483648", "0", "0"}; | |
- e_int32 = {2147483647, 0, -2147483648LL, 0, 0}; | |
- CheckCase<StringType, std::string, Int32Type, int32_t>(utf8(), v_int, is_valid, int32(), | |
- e_int32, options); | |
- v_int = {"9223372036854775807", "0", "-9223372036854775808", "0", "0"}; | |
- e_int64 = {9223372036854775807LL, 0, (-9223372036854775807LL - 1), 0, 0}; | |
- CheckCase<StringType, std::string, Int64Type, int64_t>(utf8(), v_int, is_valid, int64(), | |
- e_int64, options); | |
- | |
- // string to uint | |
- std::vector<std::string> v_uint = {"0", "1", "127", "255", "0"}; | |
- std::vector<uint8_t> e_uint8 = {0, 1, 127, 255, 0}; | |
- std::vector<uint16_t> e_uint16 = {0, 1, 127, 255, 0}; | |
- std::vector<uint32_t> e_uint32 = {0, 1, 127, 255, 0}; | |
- std::vector<uint64_t> e_uint64 = {0, 1, 127, 255, 0}; | |
- CheckCase<StringType, std::string, UInt8Type, uint8_t>(utf8(), v_uint, is_valid, | |
- uint8(), e_uint8, options); | |
- CheckCase<StringType, std::string, UInt16Type, uint16_t>(utf8(), v_uint, is_valid, | |
- uint16(), e_uint16, options); | |
- CheckCase<StringType, std::string, UInt32Type, uint32_t>(utf8(), v_uint, is_valid, | |
- uint32(), e_uint32, options); | |
- CheckCase<StringType, std::string, UInt64Type, uint64_t>(utf8(), v_uint, is_valid, | |
- uint64(), e_uint64, options); | |
- | |
- v_uint = {"4294967295", "0", "0", "0", "0"}; | |
- e_uint32 = {4294967295, 0, 0, 0, 0}; | |
- CheckCase<StringType, std::string, UInt32Type, uint32_t>(utf8(), v_uint, is_valid, | |
- uint32(), e_uint32, options); | |
- v_uint = {"18446744073709551615", "0", "0", "0", "0"}; | |
- e_uint64 = {18446744073709551615ULL, 0, 0, 0, 0}; | |
- CheckCase<StringType, std::string, UInt64Type, uint64_t>(utf8(), v_uint, is_valid, | |
- uint64(), e_uint64, options); | |
- | |
- // string to float | |
- std::vector<std::string> v_float = {"0.1", "1.2", "127.3", "200.4", "0.5"}; | |
- std::vector<float> e_float = {0.1f, 1.2f, 127.3f, 200.4f, 0.5f}; | |
- std::vector<double> e_double = {0.1, 1.2, 127.3, 200.4, 0.5}; | |
- CheckCase<StringType, std::string, FloatType, float>(utf8(), v_float, is_valid, | |
- float32(), e_float, options); | |
- CheckCase<StringType, std::string, DoubleType, double>(utf8(), v_float, is_valid, | |
- float64(), e_double, options); | |
- | |
- // Test that casting is locale-independent | |
- auto global_locale = std::locale(); | |
- try { | |
- // French locale uses the comma as decimal point | |
- std::locale::global(std::locale("fr_FR.UTF-8")); | |
- } catch (std::runtime_error&) { | |
- // Locale unavailable, ignore | |
- } | |
- CheckCase<StringType, std::string, FloatType, float>(utf8(), v_float, is_valid, | |
- float32(), e_float, options); | |
- CheckCase<StringType, std::string, DoubleType, double>(utf8(), v_float, is_valid, | |
- float64(), e_double, options); | |
- std::locale::global(global_locale); | |
-} | |
+TEST_F(TestCast, LargeStringToNumber) { TestCastStringToNumber<LargeStringType>(); } | |
TEST_F(TestCast, StringToNumberErrors) { | |
CastOptions options; | |
@@ -1027,24 +1088,9 @@ TEST_F(TestCast, StringToNumberErrors) { | |
CheckFails<StringType, std::string>(utf8(), {"z"}, is_valid, float32(), options); | |
} | |
-TEST_F(TestCast, StringToTimestamp) { | |
- CastOptions options; | |
- | |
- std::vector<bool> is_valid = {true, false, true}; | |
- std::vector<std::string> strings = {"1970-01-01", "xxx", "2000-02-29"}; | |
+TEST_F(TestCast, StringToTimestamp) { TestCastStringToTimestamp<StringType>(); } | |
- auto type = timestamp(TimeUnit::SECOND); | |
- std::vector<int64_t> e = {0, 0, 951782400}; | |
- CheckCase<StringType, std::string, TimestampType, int64_t>(utf8(), strings, is_valid, | |
- type, e, options); | |
- | |
- type = timestamp(TimeUnit::MICRO); | |
- e = {0, 0, 951782400000000LL}; | |
- CheckCase<StringType, std::string, TimestampType, int64_t>(utf8(), strings, is_valid, | |
- type, e, options); | |
- | |
- // NOTE: timestamp parsing is tested comprehensively in parsing-util-test.cc | |
-} | |
+TEST_F(TestCast, LargeStringToTimestamp) { TestCastStringToTimestamp<LargeStringType>(); } | |
TEST_F(TestCast, StringToTimestampErrors) { | |
CastOptions options; | |
@@ -1058,29 +1104,10 @@ TEST_F(TestCast, StringToTimestampErrors) { | |
} | |
} | |
-constexpr const char* kInvalidUtf8 = "\xa0\xa1"; | |
- | |
-TEST_F(TestCast, BinaryToString) { | |
- CastOptions options; | |
- | |
- // All valid except the last one | |
- std::vector<bool> all = {1, 1, 1, 1, 1}; | |
- std::vector<bool> valid = {1, 1, 1, 1, 0}; | |
- std::vector<std::string> strings = {"Hi", "olá mundo", "你好世界", "", kInvalidUtf8}; | |
- | |
- std::shared_ptr<Array> array; | |
- | |
- // Should accept when invalid but null. | |
- ArrayFromVector<BinaryType, std::string>(binary(), valid, strings, &array); | |
- CheckZeroCopy(*array, utf8()); | |
- | |
- // Should refuse due to invalid utf8 payload | |
- CheckFails<BinaryType, std::string>(binary(), strings, all, utf8(), options); | |
+TEST_F(TestCast, BinaryToString) { TestCastBinaryToString<BinaryType, StringType>(); } | |
- // Should accept due to option override | |
- options.allow_invalid_utf8 = true; | |
- CheckCase<BinaryType, std::string, StringType, std::string>(binary(), strings, all, | |
- utf8(), strings, options); | |
+TEST_F(TestCast, LargeBinaryToLargeString) { | |
+ TestCastBinaryToString<LargeBinaryType, LargeStringType>(); | |
} | |
TEST_F(TestCast, ListToList) { | |
diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc | |
index 88a4f3087..a8b661599 100644 | |
--- a/cpp/src/arrow/compute/kernels/cast.cc | |
+++ b/cpp/src/arrow/compute/kernels/cast.cc | |
@@ -905,13 +905,15 @@ struct CastFunctor<T, DictionaryType> { | |
// ---------------------------------------------------------------------- | |
// String to Number | |
-template <typename O> | |
-struct CastFunctor<O, StringType, enable_if_number<O>> { | |
+template <typename I, typename O> | |
+struct CastFunctor<O, I, | |
+ typename std::enable_if<is_any_string_type<I>::value && | |
+ is_number_type<O>::value>::type> { | |
void operator()(FunctionContext* ctx, const CastOptions& options, | |
const ArrayData& input, ArrayData* output) { | |
using out_type = typename O::c_type; | |
- StringArray input_array(input.Copy()); | |
+ typename TypeTraits<I>::ArrayType input_array(input.Copy()); | |
auto out_data = output->GetMutableValues<out_type>(1); | |
internal::StringConverter<O> converter; | |
@@ -933,15 +935,15 @@ struct CastFunctor<O, StringType, enable_if_number<O>> { | |
// ---------------------------------------------------------------------- | |
// String to Boolean | |
-template <typename O> | |
-struct CastFunctor<O, StringType, | |
- typename std::enable_if<std::is_same<BooleanType, O>::value>::type> { | |
+template <typename I> | |
+struct CastFunctor<BooleanType, I, | |
+ typename std::enable_if<is_any_string_type<I>::value>::type> { | |
void operator()(FunctionContext* ctx, const CastOptions& options, | |
const ArrayData& input, ArrayData* output) { | |
- StringArray input_array(input.Copy()); | |
+ typename TypeTraits<I>::ArrayType input_array(input.Copy()); | |
internal::FirstTimeBitmapWriter writer(output->buffers[1]->mutable_data(), | |
output->offset, input.length); | |
- internal::StringConverter<O> converter; | |
+ internal::StringConverter<BooleanType> converter; | |
for (int64_t i = 0; i < input.length; ++i) { | |
if (input_array.IsNull(i)) { | |
@@ -972,13 +974,14 @@ struct CastFunctor<O, StringType, | |
// ---------------------------------------------------------------------- | |
// String to Timestamp | |
-template <> | |
-struct CastFunctor<TimestampType, StringType> { | |
+template <typename I> | |
+struct CastFunctor<TimestampType, I, | |
+ typename std::enable_if<is_any_string_type<I>::value>::type> { | |
void operator()(FunctionContext* ctx, const CastOptions& options, | |
const ArrayData& input, ArrayData* output) { | |
using out_type = TimestampType::c_type; | |
- StringArray input_array(input.Copy()); | |
+ typename TypeTraits<I>::ArrayType input_array(input.Copy()); | |
auto out_data = output->GetMutableValues<out_type>(1); | |
internal::StringConverter<TimestampType> converter(output->type); | |
@@ -1001,47 +1004,51 @@ struct CastFunctor<TimestampType, StringType> { | |
// Binary to String | |
// | |
-template <typename I> | |
-struct CastFunctor<StringType, I, | |
- typename std::enable_if<std::is_same<BinaryType, I>::value>::type> { | |
+#if defined(_MSC_VER) | |
+// Silence warning: """'visitor': unreferenced local variable""" | |
+#pragma warning(push) | |
+#pragma warning(disable : 4101) | |
+#endif | |
+ | |
+template <typename I, typename O> | |
+struct BinaryToStringSameWidthCastFunctor { | |
void operator()(FunctionContext* ctx, const CastOptions& options, | |
const ArrayData& input, ArrayData* output) { | |
- BinaryArray binary(input.Copy()); | |
+ if (!options.allow_invalid_utf8) { | |
+ util::InitializeUTF8(); | |
- if (options.allow_invalid_utf8) { | |
- ZeroCopyData(input, output); | |
- return; | |
+ ArrayDataVisitor<I> visitor; | |
+ Status st = visitor.Visit(input, this); | |
+ if (!st.ok()) { | |
+ ctx->SetStatus(st); | |
+ return; | |
+ } | |
} | |
+ ZeroCopyData(input, output); | |
+ } | |
- util::InitializeUTF8(); | |
- | |
- if (binary.null_count() != 0) { | |
- for (int64_t i = 0; i < input.length; i++) { | |
- if (binary.IsNull(i)) { | |
- continue; | |
- } | |
- | |
- const auto str = binary.GetView(i); | |
- if (ARROW_PREDICT_FALSE(!arrow::util::ValidateUTF8(str))) { | |
- ctx->SetStatus(Status::Invalid("Invalid UTF8 payload")); | |
- return; | |
- } | |
- } | |
+ Status VisitNull() { return Status::OK(); } | |
- } else { | |
- for (int64_t i = 0; i < input.length; i++) { | |
- const auto str = binary.GetView(i); | |
- if (ARROW_PREDICT_FALSE(!arrow::util::ValidateUTF8(str))) { | |
- ctx->SetStatus(Status::Invalid("Invalid UTF8 payload")); | |
- return; | |
- } | |
- } | |
+ Status VisitValue(util::string_view str) { | |
+ if (ARROW_PREDICT_FALSE(!arrow::util::ValidateUTF8(str))) { | |
+ return Status::Invalid("Invalid UTF8 payload"); | |
} | |
- | |
- ZeroCopyData(input, output); | |
+ return Status::OK(); | |
} | |
}; | |
+template <> | |
+struct CastFunctor<StringType, BinaryType> | |
+ : public BinaryToStringSameWidthCastFunctor<StringType, BinaryType> {}; | |
+ | |
+template <> | |
+struct CastFunctor<LargeStringType, LargeBinaryType> | |
+ : public BinaryToStringSameWidthCastFunctor<LargeStringType, LargeBinaryType> {}; | |
+ | |
+#if defined(_MSC_VER) | |
+#pragma warning(pop) | |
+#endif | |
+ | |
// ---------------------------------------------------------------------- | |
typedef std::function<void(FunctionContext*, const CastOptions& options, const ArrayData&, | |
@@ -1142,6 +1149,8 @@ GET_CAST_FUNCTION(TIME64_CASES, Time64Type) | |
GET_CAST_FUNCTION(TIMESTAMP_CASES, TimestampType) | |
GET_CAST_FUNCTION(BINARY_CASES, BinaryType) | |
GET_CAST_FUNCTION(STRING_CASES, StringType) | |
+GET_CAST_FUNCTION(LARGEBINARY_CASES, LargeBinaryType) | |
+GET_CAST_FUNCTION(LARGESTRING_CASES, LargeStringType) | |
GET_CAST_FUNCTION(DICTIONARY_CASES, DictionaryType) | |
#define CAST_FUNCTION_CASE(InType) \ | |
@@ -1225,6 +1234,8 @@ Status GetCastFunction(const DataType& in_type, std::shared_ptr<DataType> out_ty | |
CAST_FUNCTION_CASE(TimestampType); | |
CAST_FUNCTION_CASE(BinaryType); | |
CAST_FUNCTION_CASE(StringType); | |
+ CAST_FUNCTION_CASE(LargeBinaryType); | |
+ CAST_FUNCTION_CASE(LargeStringType); | |
CAST_FUNCTION_CASE(DictionaryType); | |
case Type::LIST: | |
RETURN_NOT_OK(GetListCastFunc(in_type, std::move(out_type), options, kernel)); | |
diff --git a/cpp/src/arrow/compute/kernels/generated/cast-codegen-internal.h b/cpp/src/arrow/compute/kernels/generated/cast-codegen-internal.h | |
index 77334af36..fb82067bb 100644 | |
--- a/cpp/src/arrow/compute/kernels/generated/cast-codegen-internal.h | |
+++ b/cpp/src/arrow/compute/kernels/generated/cast-codegen-internal.h | |
@@ -171,6 +171,9 @@ | |
#define BINARY_CASES(TEMPLATE) \ | |
TEMPLATE(BinaryType, StringType) | |
+#define LARGEBINARY_CASES(TEMPLATE) \ | |
+ TEMPLATE(LargeBinaryType, LargeStringType) | |
+ | |
#define STRING_CASES(TEMPLATE) \ | |
TEMPLATE(StringType, BooleanType) \ | |
TEMPLATE(StringType, UInt8Type) \ | |
@@ -185,6 +188,20 @@ | |
TEMPLATE(StringType, DoubleType) \ | |
TEMPLATE(StringType, TimestampType) | |
+#define LARGESTRING_CASES(TEMPLATE) \ | |
+ TEMPLATE(LargeStringType, BooleanType) \ | |
+ TEMPLATE(LargeStringType, UInt8Type) \ | |
+ TEMPLATE(LargeStringType, Int8Type) \ | |
+ TEMPLATE(LargeStringType, UInt16Type) \ | |
+ TEMPLATE(LargeStringType, Int16Type) \ | |
+ TEMPLATE(LargeStringType, UInt32Type) \ | |
+ TEMPLATE(LargeStringType, Int32Type) \ | |
+ TEMPLATE(LargeStringType, UInt64Type) \ | |
+ TEMPLATE(LargeStringType, Int64Type) \ | |
+ TEMPLATE(LargeStringType, FloatType) \ | |
+ TEMPLATE(LargeStringType, DoubleType) \ | |
+ TEMPLATE(LargeStringType, TimestampType) | |
+ | |
#define DICTIONARY_CASES(TEMPLATE) \ | |
TEMPLATE(DictionaryType, UInt8Type) \ | |
TEMPLATE(DictionaryType, Int8Type) \ | |
diff --git a/cpp/src/arrow/compute/kernels/generated/codegen.py b/cpp/src/arrow/compute/kernels/generated/codegen.py | |
index 04fc38618..c9db7eaa0 100644 | |
--- a/cpp/src/arrow/compute/kernels/generated/codegen.py | |
+++ b/cpp/src/arrow/compute/kernels/generated/codegen.py | |
@@ -85,7 +85,9 @@ CAST_GENERATORS = [ | |
CastCodeGenerator('Timestamp', ['Date32', 'Date64', 'Timestamp'], | |
parametric=True), | |
CastCodeGenerator('Binary', ['String']), | |
+ CastCodeGenerator('LargeBinary', ['LargeString']), | |
CastCodeGenerator('String', NUMERIC_TYPES + ['Timestamp']), | |
+ CastCodeGenerator('LargeString', NUMERIC_TYPES + ['Timestamp']), | |
CastCodeGenerator('Dictionary', | |
INTEGER_TYPES + FLOATING_TYPES + DATE_TIME_TYPES + | |
['Null', 'Binary', 'FixedSizeBinary', 'String', | |
diff --git a/cpp/src/arrow/csv/converter-test.cc b/cpp/src/arrow/csv/converter-test.cc | |
index a5e4c0372..53176ff0a 100644 | |
--- a/cpp/src/arrow/csv/converter-test.cc | |
+++ b/cpp/src/arrow/csv/converter-test.cc | |
@@ -30,6 +30,7 @@ | |
#include "arrow/status.h" | |
#include "arrow/testing/gtest_util.h" | |
#include "arrow/type.h" | |
+#include "arrow/type_traits.h" | |
#include "arrow/util/decimal.h" | |
#include "arrow/util/logging.h" | |
@@ -118,11 +119,17 @@ void AssertConversionError(const std::shared_ptr<DataType>& type, | |
////////////////////////////////////////////////////////////////////////// | |
// Test functions begin here | |
-TEST(BinaryConversion, Basics) { | |
- AssertConversion<BinaryType, std::string>(binary(), {"ab,cdé\n", ",\xffgh\n"}, | |
- {{"ab", ""}, {"cdé", "\xffgh"}}); | |
+template <typename T> | |
+static void TestBinaryConversionBasics() { | |
+ auto type = TypeTraits<T>::type_singleton(); | |
+ AssertConversion<T, std::string>(type, {"ab,cdé\n", ",\xffgh\n"}, | |
+ {{"ab", ""}, {"cdé", "\xffgh"}}); | |
} | |
+TEST(BinaryConversion, Basics) { TestBinaryConversionBasics<BinaryType>(); } | |
+ | |
+TEST(LargeBinaryConversion, Basics) { TestBinaryConversionBasics<LargeBinaryType>(); } | |
+ | |
TEST(BinaryConversion, Nulls) { | |
AssertConversion<BinaryType, std::string>(binary(), {"ab,N/A\n", "NULL,\n"}, | |
{{"ab", "NULL"}, {"N/A", ""}}, | |
@@ -135,16 +142,22 @@ TEST(BinaryConversion, Nulls) { | |
{{true, false}, {false, false}}, options); | |
} | |
-TEST(StringConversion, Basics) { | |
- AssertConversion<StringType, std::string>(utf8(), {"ab,cdé\n", ",gh\n"}, | |
- {{"ab", ""}, {"cdé", "gh"}}); | |
+template <typename T> | |
+static void TestStringConversionBasics() { | |
+ auto type = TypeTraits<T>::type_singleton(); | |
+ AssertConversion<T, std::string>(type, {"ab,cdé\n", ",gh\n"}, | |
+ {{"ab", ""}, {"cdé", "gh"}}); | |
auto options = ConvertOptions::Defaults(); | |
options.check_utf8 = false; | |
- AssertConversion<StringType, std::string>(utf8(), {"ab,cdé\n", ",\xffgh\n"}, | |
- {{"ab", ""}, {"cdé", "\xffgh"}}, options); | |
+ AssertConversion<T, std::string>(type, {"ab,cdé\n", ",\xffgh\n"}, | |
+ {{"ab", ""}, {"cdé", "\xffgh"}}, options); | |
} | |
+TEST(StringConversion, Basics) { TestStringConversionBasics<StringType>(); } | |
+ | |
+TEST(LargeStringConversion, Basics) { TestStringConversionBasics<LargeStringType>(); } | |
+ | |
TEST(StringConversion, Nulls) { | |
AssertConversion<StringType, std::string>(utf8(), {"ab,N/A\n", "NULL,\n"}, | |
{{"ab", "NULL"}, {"N/A", ""}}, | |
@@ -157,11 +170,17 @@ TEST(StringConversion, Nulls) { | |
{{true, false}, {false, false}}, options); | |
} | |
-TEST(StringConversion, Errors) { | |
+template <typename T> | |
+static void TestStringConversionErrors() { | |
+ auto type = TypeTraits<T>::type_singleton(); | |
// Invalid UTF8 in column 0 | |
- AssertConversionError(utf8(), {"ab,cdé\n", "\xff,gh\n"}, {0}); | |
+ AssertConversionError(type, {"ab,cdé\n", "\xff,gh\n"}, {0}); | |
} | |
+TEST(StringConversion, Errors) { TestStringConversionErrors<StringType>(); } | |
+ | |
+TEST(LargeStringConversion, Errors) { TestStringConversionErrors<LargeStringType>(); } | |
+ | |
TEST(FixedSizeBinaryConversion, Basics) { | |
AssertConversion<FixedSizeBinaryType, std::string>( | |
fixed_size_binary(2), {"ab,cd\n", "gh,ij\n"}, {{"ab", "gh"}, {"cd", "ij"}}); | |
diff --git a/cpp/src/arrow/csv/converter.cc b/cpp/src/arrow/csv/converter.cc | |
index 53495cf9b..1c61d3ccb 100644 | |
--- a/cpp/src/arrow/csv/converter.cc | |
+++ b/cpp/src/arrow/csv/converter.cc | |
@@ -431,6 +431,7 @@ Status Converter::Make(const std::shared_ptr<DataType>& type, | |
CONVERTER_CASE(Type::BOOL, BooleanConverter) | |
CONVERTER_CASE(Type::TIMESTAMP, TimestampConverter) | |
CONVERTER_CASE(Type::BINARY, (VarSizeBinaryConverter<BinaryType, false>)) | |
+ CONVERTER_CASE(Type::LARGE_BINARY, (VarSizeBinaryConverter<LargeBinaryType, false>)) | |
CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter) | |
CONVERTER_CASE(Type::DECIMAL, DecimalConverter) | |
@@ -442,6 +443,14 @@ Status Converter::Make(const std::shared_ptr<DataType>& type, | |
} | |
break; | |
+ case Type::LARGE_STRING: | |
+ if (options.check_utf8) { | |
+ result = new VarSizeBinaryConverter<LargeStringType, true>(type, options, pool); | |
+ } else { | |
+ result = new VarSizeBinaryConverter<LargeStringType, false>(type, options, pool); | |
+ } | |
+ break; | |
+ | |
default: { | |
return Status::NotImplemented("CSV conversion to ", type->ToString(), | |
" is not supported"); | |
diff --git a/cpp/src/arrow/ipc/feather.cc b/cpp/src/arrow/ipc/feather.cc | |
index 7cd64c8d7..8436bd205 100644 | |
--- a/cpp/src/arrow/ipc/feather.cc | |
+++ b/cpp/src/arrow/ipc/feather.cc | |
@@ -367,6 +367,8 @@ class TableReader::TableReaderImpl { | |
PRIMITIVE_CASE(DOUBLE, float64); | |
PRIMITIVE_CASE(UTF8, utf8); | |
PRIMITIVE_CASE(BINARY, binary); | |
+ PRIMITIVE_CASE(LARGE_UTF8, large_utf8); | |
+ PRIMITIVE_CASE(LARGE_BINARY, large_binary); | |
default: | |
return Status::Invalid("Unrecognized type"); | |
} | |
@@ -410,6 +412,10 @@ class TableReader::TableReaderImpl { | |
int64_t offsets_size = GetOutputLength((meta->length() + 1) * sizeof(int32_t)); | |
buffers.push_back(SliceBuffer(buffer, offset, offsets_size)); | |
offset += offsets_size; | |
+ } else if (is_large_binary_like(type->id())) { | |
+ int64_t offsets_size = GetOutputLength((meta->length() + 1) * sizeof(int64_t)); | |
+ buffers.push_back(SliceBuffer(buffer, offset, offsets_size)); | |
+ offset += offsets_size; | |
} | |
buffers.push_back(SliceBuffer(buffer, offset, buffer->size() - offset)); | |
@@ -585,6 +591,10 @@ fbs::Type ToFlatbufferType(Type::type type) { | |
return fbs::Type_UTF8; | |
case Type::BINARY: | |
return fbs::Type_BINARY; | |
+ case Type::LARGE_STRING: | |
+ return fbs::Type_LARGE_UTF8; | |
+ case Type::LARGE_BINARY: | |
+ return fbs::Type_LARGE_BINARY; | |
case Type::DATE32: | |
return fbs::Type_INT32; | |
case Type::TIMESTAMP: | |
@@ -644,7 +654,8 @@ class TableWriter::TableWriterImpl : public ArrayVisitor { | |
} | |
Status LoadArrayMetadata(const Array& values, ArrayMetadata* meta) { | |
- if (!(is_primitive(values.type_id()) || is_binary_like(values.type_id()))) { | |
+ if (!(is_primitive(values.type_id()) || is_binary_like(values.type_id()) || | |
+ is_large_binary_like(values.type_id()))) { | |
return Status::Invalid("Array is not primitive type: ", values.type()->ToString()); | |
} | |
@@ -659,6 +670,32 @@ class TableWriter::TableWriterImpl : public ArrayVisitor { | |
return Status::OK(); | |
} | |
+ template <typename ArrayType> | |
+ Status WriteBinaryArray(const ArrayType& values, ArrayMetadata* meta, | |
+ const uint8_t** values_buffer, int64_t* values_bytes, | |
+ int64_t* bytes_written) { | |
+ using offset_type = typename ArrayType::offset_type; | |
+ | |
+ int64_t offset_bytes = sizeof(offset_type) * (values.length() + 1); | |
+ | |
+ if (values.value_offsets()) { | |
+ *values_bytes = values.raw_value_offsets()[values.length()]; | |
+ | |
+ // Write the variable-length offsets | |
+ RETURN_NOT_OK(WritePadded( | |
+ stream_.get(), reinterpret_cast<const uint8_t*>(values.raw_value_offsets()), | |
+ offset_bytes, bytes_written)); | |
+ } else { | |
+ RETURN_NOT_OK(WritePaddedBlank(stream_.get(), offset_bytes, bytes_written)); | |
+ } | |
+ meta->total_bytes += *bytes_written; | |
+ | |
+ if (values.value_data()) { | |
+ *values_buffer = values.value_data()->data(); | |
+ } | |
+ return Status::OK(); | |
+ } | |
+ | |
Status WriteArray(const Array& values, ArrayMetadata* meta) { | |
RETURN_NOT_OK(CheckStarted()); | |
RETURN_NOT_OK(LoadArrayMetadata(values, meta)); | |
@@ -687,26 +724,11 @@ class TableWriter::TableWriterImpl : public ArrayVisitor { | |
const uint8_t* values_buffer = nullptr; | |
if (is_binary_like(values.type_id())) { | |
- const auto& bin_values = checked_cast<const BinaryArray&>(values); | |
- | |
- int64_t offset_bytes = sizeof(int32_t) * (values.length() + 1); | |
- | |
- if (bin_values.value_offsets()) { | |
- values_bytes = bin_values.raw_value_offsets()[values.length()]; | |
- | |
- // Write the variable-length offsets | |
- RETURN_NOT_OK( | |
- WritePadded(stream_.get(), | |
- reinterpret_cast<const uint8_t*>(bin_values.raw_value_offsets()), | |
- offset_bytes, &bytes_written)); | |
- } else { | |
- RETURN_NOT_OK(WritePaddedBlank(stream_.get(), offset_bytes, &bytes_written)); | |
- } | |
- meta->total_bytes += bytes_written; | |
- | |
- if (bin_values.value_data()) { | |
- values_buffer = bin_values.value_data()->data(); | |
- } | |
+ RETURN_NOT_OK(WriteBinaryArray(checked_cast<const BinaryArray&>(values), meta, | |
+ &values_buffer, &values_bytes, &bytes_written)); | |
+ } else if (is_large_binary_like(values.type_id())) { | |
+ RETURN_NOT_OK(WriteBinaryArray(checked_cast<const LargeBinaryArray&>(values), meta, | |
+ &values_buffer, &values_bytes, &bytes_written)); | |
} else { | |
const auto& prim_values = checked_cast<const PrimitiveArray&>(values); | |
const auto& fw_type = checked_cast<const FixedWidthType&>(*values.type()); | |
@@ -760,6 +782,8 @@ class TableWriter::TableWriterImpl : public ArrayVisitor { | |
VISIT_PRIMITIVE(DoubleArray) | |
VISIT_PRIMITIVE(BinaryArray) | |
VISIT_PRIMITIVE(StringArray) | |
+ VISIT_PRIMITIVE(LargeBinaryArray) | |
+ VISIT_PRIMITIVE(LargeStringArray) | |
#undef VISIT_PRIMITIVE | |
diff --git a/cpp/src/arrow/ipc/feather.fbs b/cpp/src/arrow/ipc/feather.fbs | |
index a27d39989..5ec062998 100644 | |
--- a/cpp/src/arrow/ipc/feather.fbs | |
+++ b/cpp/src/arrow/ipc/feather.fbs | |
@@ -48,7 +48,10 @@ enum Type : byte { | |
TIMESTAMP = 14, | |
DATE = 15, | |
- TIME = 16 | |
+ TIME = 16, | |
+ | |
+ LARGE_UTF8 = 17, | |
+ LARGE_BINARY = 18 | |
} | |
enum Encoding : byte { | |
diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc | |
index 135296551..49a884e1f 100644 | |
--- a/cpp/src/arrow/ipc/json-internal.cc | |
+++ b/cpp/src/arrow/ipc/json-internal.cc | |
@@ -312,6 +312,10 @@ class SchemaWriter { | |
Status Visit(const TimeType& type) { return WritePrimitive("time", type); } | |
Status Visit(const StringType& type) { return WriteVarBytes("utf8", type); } | |
Status Visit(const BinaryType& type) { return WriteVarBytes("binary", type); } | |
+ Status Visit(const LargeStringType& type) { return WriteVarBytes("large_utf8", type); } | |
+ Status Visit(const LargeBinaryType& type) { | |
+ return WriteVarBytes("large_binary", type); | |
+ } | |
Status Visit(const FixedSizeBinaryType& type) { | |
return WritePrimitive("fixedsizebinary", type); | |
} | |
@@ -430,20 +434,26 @@ class ArrayWriter { | |
} | |
} | |
- // Binary, encode to hexadecimal. UTF8 string write as is | |
+ // Binary, encode to hexadecimal. | |
template <typename T> | |
- typename std::enable_if<std::is_base_of<BinaryArray, T>::value, void>::type | |
+ typename std::enable_if<std::is_same<BinaryArray, T>::value || | |
+ std::is_same<LargeBinaryArray, T>::value, | |
+ void>::type | |
WriteDataValues(const T& arr) { | |
for (int64_t i = 0; i < arr.length(); ++i) { | |
- int32_t length; | |
- const uint8_t* buf = arr.GetValue(i, &length); | |
+ writer_->String(HexEncode(arr.GetView(i))); | |
+ } | |
+ } | |
- if (std::is_base_of<StringArray, T>::value) { | |
- // Presumed UTF-8 | |
- writer_->String(reinterpret_cast<const char*>(buf), length); | |
- } else { | |
- writer_->String(HexEncode(buf, length)); | |
- } | |
+ // UTF8 string, write as is | |
+ template <typename T> | |
+ typename std::enable_if<std::is_same<StringArray, T>::value || | |
+ std::is_same<LargeStringArray, T>::value, | |
+ void>::type | |
+ WriteDataValues(const T& arr) { | |
+ for (int64_t i = 0; i < arr.length(); ++i) { | |
+ auto view = arr.GetView(i); | |
+ writer_->String(view.data(), static_cast<rj::SizeType>(view.size())); | |
} | |
} | |
@@ -558,8 +568,10 @@ class ArrayWriter { | |
} | |
template <typename T> | |
- typename std::enable_if<std::is_base_of<BinaryArray, T>::value, Status>::type Visit( | |
- const T& array) { | |
+ typename std::enable_if<std::is_base_of<BinaryArray, T>::value || | |
+ std::is_base_of<LargeBinaryArray, T>::value, | |
+ Status>::type | |
+ Visit(const T& array) { | |
WriteValidityField(array); | |
WriteIntegerField("OFFSET", array.raw_value_offsets(), array.length() + 1); | |
WriteDataField(array); | |
@@ -911,6 +923,10 @@ static Status GetType(const RjObject& json_type, | |
*type = utf8(); | |
} else if (type_name == "binary") { | |
*type = binary(); | |
+ } else if (type_name == "large_utf8") { | |
+ *type = large_utf8(); | |
+ } else if (type_name == "large_binary") { | |
+ *type = large_binary(); | |
} else if (type_name == "fixedsizebinary") { | |
return GetFixedSizeBinary(json_type, type); | |
} else if (type_name == "decimal") { | |
@@ -1091,9 +1107,10 @@ class ArrayReader { | |
} | |
template <typename T> | |
- typename std::enable_if<std::is_base_of<BinaryType, T>::value, Status>::type Visit( | |
+ typename std::enable_if<std::is_base_of<BaseBinaryType, T>::value, Status>::type Visit( | |
const T& type) { | |
typename TypeTraits<T>::BuilderType builder(pool_); | |
+ using offset_type = typename T::offset_type; | |
const auto& json_data = obj_.FindMember(kData); | |
RETURN_NOT_ARRAY(kData, json_data, obj_); | |
@@ -1110,23 +1127,27 @@ class ArrayReader { | |
const rj::Value& val = json_data_arr[i]; | |
DCHECK(val.IsString()); | |
- if (std::is_base_of<StringType, T>::value) { | |
+ | |
+ if (T::is_utf8) { | |
RETURN_NOT_OK(builder.Append(val.GetString())); | |
} else { | |
std::string hex_string = val.GetString(); | |
- DCHECK(hex_string.size() % 2 == 0) << "Expected base16 hex string"; | |
- int32_t length = static_cast<int>(hex_string.size()) / 2; | |
+ if (hex_string.size() % 2 != 0) { | |
+ return Status::Invalid("Expected base16 hex string"); | |
+ } | |
+ const auto value_len = static_cast<int64_t>(hex_string.size()) / 2; | |
std::shared_ptr<Buffer> byte_buffer; | |
- RETURN_NOT_OK(AllocateBuffer(pool_, length, &byte_buffer)); | |
+ RETURN_NOT_OK(AllocateBuffer(pool_, value_len, &byte_buffer)); | |
const char* hex_data = hex_string.c_str(); | |
uint8_t* byte_buffer_data = byte_buffer->mutable_data(); | |
- for (int32_t j = 0; j < length; ++j) { | |
+ for (int64_t j = 0; j < value_len; ++j) { | |
RETURN_NOT_OK(ParseHexValue(hex_data + j * 2, &byte_buffer_data[j])); | |
} | |
- RETURN_NOT_OK(builder.Append(byte_buffer_data, length)); | |
+ RETURN_NOT_OK( | |
+ builder.Append(byte_buffer_data, static_cast<offset_type>(value_len))); | |
} | |
} | |
diff --git a/cpp/src/arrow/ipc/json-simple-test.cc b/cpp/src/arrow/ipc/json-simple-test.cc | |
index ce8b21a84..b5f68e0c7 100644 | |
--- a/cpp/src/arrow/ipc/json-simple-test.cc | |
+++ b/cpp/src/arrow/ipc/json-simple-test.cc | |
@@ -322,6 +322,21 @@ TEST(TestString, Basics) { | |
AssertJSONArray<BinaryType, std::string>(type, "[\"\\u0000\\u001f\"]", {s}); | |
} | |
+TEST(TestLargeString, Basics) { | |
+ // Similar as TestString above, only testing the basics | |
+ std::shared_ptr<DataType> type = large_utf8(); | |
+ std::shared_ptr<Array> expected, actual; | |
+ | |
+ AssertJSONArray<LargeStringType, std::string>(type, "[\"\", \"foo\"]", {"", "foo"}); | |
+ AssertJSONArray<LargeStringType, std::string>(type, "[\"\", null]", {true, false}, | |
+ {"", ""}); | |
+ | |
+ // Large binary type | |
+ type = large_binary(); | |
+ AssertJSONArray<LargeBinaryType, std::string>(type, "[\"\", \"foo\", null]", | |
+ {true, true, false}, {"", "foo", ""}); | |
+} | |
+ | |
TEST(TestTimestamp, Basics) { | |
// Timestamp type | |
auto type = timestamp(TimeUnit::SECOND); | |
diff --git a/cpp/src/arrow/ipc/json-simple.cc b/cpp/src/arrow/ipc/json-simple.cc | |
index ae01bcc4b..ce0d2c53c 100644 | |
--- a/cpp/src/arrow/ipc/json-simple.cc | |
+++ b/cpp/src/arrow/ipc/json-simple.cc | |
@@ -26,6 +26,7 @@ | |
#include "arrow/ipc/json-internal.h" | |
#include "arrow/ipc/json-simple.h" | |
#include "arrow/memory_pool.h" | |
+#include "arrow/type_traits.h" | |
#include "arrow/util/checked_cast.h" | |
#include "arrow/util/decimal.h" | |
#include "arrow/util/logging.h" | |
@@ -344,11 +345,14 @@ class TimestampConverter final : public ConcreteConverter<TimestampConverter> { | |
// ------------------------------------------------------------------------ | |
// Converter for binary and string arrays | |
-class StringConverter final : public ConcreteConverter<StringConverter> { | |
+template <typename TYPE> | |
+class StringConverter final : public ConcreteConverter<StringConverter<TYPE>> { | |
public: | |
+ using BuilderType = typename TypeTraits<TYPE>::BuilderType; | |
+ | |
explicit StringConverter(const std::shared_ptr<DataType>& type) { | |
this->type_ = type; | |
- builder_ = std::make_shared<BinaryBuilder>(type, default_memory_pool()); | |
+ builder_ = std::make_shared<BuilderType>(type, default_memory_pool()); | |
} | |
Status AppendNull() override { return builder_->AppendNull(); } | |
@@ -368,7 +372,7 @@ class StringConverter final : public ConcreteConverter<StringConverter> { | |
std::shared_ptr<ArrayBuilder> builder() override { return builder_; } | |
private: | |
- std::shared_ptr<BinaryBuilder> builder_; | |
+ std::shared_ptr<BuilderType> builder_; | |
}; | |
// ------------------------------------------------------------------------ | |
@@ -732,8 +736,10 @@ Status GetConverter(const std::shared_ptr<DataType>& type, | |
SIMPLE_CONVERTER_CASE(Type::MAP, MapConverter) | |
SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_LIST, FixedSizeListConverter) | |
SIMPLE_CONVERTER_CASE(Type::STRUCT, StructConverter) | |
- SIMPLE_CONVERTER_CASE(Type::STRING, StringConverter) | |
- SIMPLE_CONVERTER_CASE(Type::BINARY, StringConverter) | |
+ SIMPLE_CONVERTER_CASE(Type::STRING, StringConverter<StringType>) | |
+ SIMPLE_CONVERTER_CASE(Type::BINARY, StringConverter<BinaryType>) | |
+ SIMPLE_CONVERTER_CASE(Type::LARGE_STRING, StringConverter<LargeStringType>) | |
+ SIMPLE_CONVERTER_CASE(Type::LARGE_BINARY, StringConverter<LargeBinaryType>) | |
SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter) | |
SIMPLE_CONVERTER_CASE(Type::DECIMAL, DecimalConverter) | |
SIMPLE_CONVERTER_CASE(Type::UNION, UnionConverter) | |
diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc | |
index e505ddeca..93f859a0a 100644 | |
--- a/cpp/src/arrow/ipc/metadata-internal.cc | |
+++ b/cpp/src/arrow/ipc/metadata-internal.cc | |
@@ -232,6 +232,9 @@ Status ConcreteTypeFromFlatbuffer(flatbuf::Type type, const void* type_data, | |
case flatbuf::Type_Binary: | |
*out = binary(); | |
return Status::OK(); | |
+ case flatbuf::Type_LargeBinary: | |
+ *out = large_binary(); | |
+ return Status::OK(); | |
case flatbuf::Type_FixedSizeBinary: { | |
auto fw_binary = static_cast<const flatbuf::FixedSizeBinary*>(type_data); | |
*out = fixed_size_binary(fw_binary->byteWidth()); | |
@@ -240,6 +243,9 @@ Status ConcreteTypeFromFlatbuffer(flatbuf::Type type, const void* type_data, | |
case flatbuf::Type_Utf8: | |
*out = utf8(); | |
return Status::OK(); | |
+ case flatbuf::Type_LargeUtf8: | |
+ *out = large_utf8(); | |
+ return Status::OK(); | |
case flatbuf::Type_Bool: | |
*out = boolean(); | |
return Status::OK(); | |
@@ -541,12 +547,24 @@ class FieldToFlatbufferVisitor { | |
return Status::OK(); | |
} | |
+ Status Visit(const LargeBinaryType& type) { | |
+ fb_type_ = flatbuf::Type_LargeBinary; | |
+ type_offset_ = flatbuf::CreateLargeBinary(fbb_).Union(); | |
+ return Status::OK(); | |
+ } | |
+ | |
Status Visit(const StringType& type) { | |
fb_type_ = flatbuf::Type_Utf8; | |
type_offset_ = flatbuf::CreateUtf8(fbb_).Union(); | |
return Status::OK(); | |
} | |
+ Status Visit(const LargeStringType& type) { | |
+ fb_type_ = flatbuf::Type_LargeUtf8; | |
+ type_offset_ = flatbuf::CreateLargeUtf8(fbb_).Union(); | |
+ return Status::OK(); | |
+ } | |
+ | |
Status Visit(const Date32Type& type) { | |
fb_type_ = flatbuf::Type_Date; | |
type_offset_ = flatbuf::CreateDate(fbb_, flatbuf::DateUnit_DAY).Union(); | |
diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc | |
index c39f2d714..b9f29d747 100644 | |
--- a/cpp/src/arrow/ipc/reader.cc | |
+++ b/cpp/src/arrow/ipc/reader.cc | |
@@ -249,8 +249,10 @@ class ArrayLoader { | |
} | |
template <typename T> | |
- typename std::enable_if<std::is_base_of<BinaryType, T>::value, Status>::type Visit( | |
- const T& type) { | |
+ typename std::enable_if<std::is_base_of<BinaryType, T>::value || | |
+ std::is_base_of<LargeBinaryType, T>::value, | |
+ Status>::type | |
+ Visit(const T& type) { | |
return LoadBinary<T>(); | |
} | |
diff --git a/cpp/src/arrow/ipc/test-common.cc b/cpp/src/arrow/ipc/test-common.cc | |
index 47c307659..4cf13ecc0 100644 | |
--- a/cpp/src/arrow/ipc/test-common.cc | |
+++ b/cpp/src/arrow/ipc/test-common.cc | |
@@ -34,6 +34,7 @@ | |
#include "arrow/testing/random.h" | |
#include "arrow/testing/util.h" | |
#include "arrow/type.h" | |
+#include "arrow/type_traits.h" | |
#include "arrow/util/bit-util.h" | |
namespace arrow { | |
@@ -205,18 +206,16 @@ Status MakeRandomStringArray(int64_t length, bool include_nulls, MemoryPool* poo | |
return builder.Finish(out); | |
} | |
-template <class Builder, class RawType> | |
+template <class BuilderType> | |
static Status MakeBinaryArrayWithUniqueValues(int64_t length, bool include_nulls, | |
MemoryPool* pool, | |
std::shared_ptr<Array>* out) { | |
- Builder builder(pool); | |
+ BuilderType builder(pool); | |
for (int64_t i = 0; i < length; ++i) { | |
if (include_nulls && (i % 7 == 0)) { | |
RETURN_NOT_OK(builder.AppendNull()); | |
} else { | |
- const std::string value = std::to_string(i); | |
- RETURN_NOT_OK(builder.Append(reinterpret_cast<const RawType*>(value.data()), | |
- static_cast<int32_t>(value.size()))); | |
+ RETURN_NOT_OK(builder.Append(std::to_string(i))); | |
} | |
} | |
return builder.Finish(out); | |
@@ -224,28 +223,37 @@ static Status MakeBinaryArrayWithUniqueValues(int64_t length, bool include_nulls | |
Status MakeStringTypesRecordBatch(std::shared_ptr<RecordBatch>* out, bool with_nulls) { | |
const int64_t length = 500; | |
- auto string_type = utf8(); | |
- auto binary_type = binary(); | |
- auto f0 = field("f0", string_type); | |
- auto f1 = field("f1", binary_type); | |
- auto schema = ::arrow::schema({f0, f1}); | |
+ auto f0 = field("strings", utf8()); | |
+ auto f1 = field("binaries", binary()); | |
+ auto f2 = field("large_strings", large_utf8()); | |
+ auto f3 = field("large_binaries", large_binary()); | |
+ auto schema = ::arrow::schema({f0, f1, f2, f3}); | |
- std::shared_ptr<Array> a0, a1; | |
+ std::shared_ptr<Array> a0, a1, a2, a3; | |
MemoryPool* pool = default_memory_pool(); | |
// Quirk with RETURN_NOT_OK macro and templated functions | |
{ | |
- auto s = MakeBinaryArrayWithUniqueValues<StringBuilder, char>(length, with_nulls, | |
- pool, &a0); | |
+ auto s = | |
+ MakeBinaryArrayWithUniqueValues<StringBuilder>(length, with_nulls, pool, &a0); | |
RETURN_NOT_OK(s); | |
} | |
- | |
{ | |
- auto s = MakeBinaryArrayWithUniqueValues<BinaryBuilder, uint8_t>(length, with_nulls, | |
- pool, &a1); | |
+ auto s = | |
+ MakeBinaryArrayWithUniqueValues<BinaryBuilder>(length, with_nulls, pool, &a1); | |
RETURN_NOT_OK(s); | |
} | |
- *out = RecordBatch::Make(schema, length, {a0, a1}); | |
+ { | |
+ auto s = MakeBinaryArrayWithUniqueValues<LargeStringBuilder>(length, with_nulls, pool, | |
+ &a2); | |
+ RETURN_NOT_OK(s); | |
+ } | |
+ { | |
+ auto s = MakeBinaryArrayWithUniqueValues<LargeBinaryBuilder>(length, with_nulls, pool, | |
+ &a3); | |
+ RETURN_NOT_OK(s); | |
+ } | |
+ *out = RecordBatch::Make(schema, length, {a0, a1, a2, a3}); | |
return Status::OK(); | |
} | |
diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc | |
index e1c2ecacb..ec372074d 100644 | |
--- a/cpp/src/arrow/ipc/writer.cc | |
+++ b/cpp/src/arrow/ipc/writer.cc | |
@@ -225,7 +225,8 @@ class RecordBatchSerializer : public ArrayVisitor { | |
template <typename ArrayType> | |
Status GetZeroBasedValueOffsets(const ArrayType& array, | |
std::shared_ptr<Buffer>* value_offsets) { | |
- // Share slicing logic between ListArray and BinaryArray | |
+ // Share slicing logic between ListArray, BinaryArray and LargeBinaryArray | |
+ using offset_type = typename ArrayType::offset_type; | |
auto offsets = array.value_offsets(); | |
@@ -235,11 +236,12 @@ class RecordBatchSerializer : public ArrayVisitor { | |
// b) slice the values array accordingly | |
std::shared_ptr<Buffer> shifted_offsets; | |
- RETURN_NOT_OK(AllocateBuffer(pool_, sizeof(int32_t) * (array.length() + 1), | |
+ RETURN_NOT_OK(AllocateBuffer(pool_, sizeof(offset_type) * (array.length() + 1), | |
&shifted_offsets)); | |
- int32_t* dest_offsets = reinterpret_cast<int32_t*>(shifted_offsets->mutable_data()); | |
- const int32_t start_offset = array.value_offset(0); | |
+ offset_type* dest_offsets = | |
+ reinterpret_cast<offset_type*>(shifted_offsets->mutable_data()); | |
+ const offset_type start_offset = array.value_offset(0); | |
for (int i = 0; i < array.length(); ++i) { | |
dest_offsets[i] = array.value_offset(i) - start_offset; | |
@@ -253,9 +255,10 @@ class RecordBatchSerializer : public ArrayVisitor { | |
return Status::OK(); | |
} | |
- Status VisitBinary(const BinaryArray& array) { | |
+ template <typename ArrayType> | |
+ Status VisitBinary(const ArrayType& array) { | |
std::shared_ptr<Buffer> value_offsets; | |
- RETURN_NOT_OK(GetZeroBasedValueOffsets<BinaryArray>(array, &value_offsets)); | |
+ RETURN_NOT_OK(GetZeroBasedValueOffsets<ArrayType>(array, &value_offsets)); | |
auto data = array.value_data(); | |
int64_t total_data_bytes = 0; | |
@@ -343,6 +346,10 @@ class RecordBatchSerializer : public ArrayVisitor { | |
Status Visit(const BinaryArray& array) override { return VisitBinary(array); } | |
+ Status Visit(const LargeStringArray& array) override { return VisitBinary(array); } | |
+ | |
+ Status Visit(const LargeBinaryArray& array) override { return VisitBinary(array); } | |
+ | |
Status Visit(const ListArray& array) override { return VisitList(array); } | |
Status Visit(const MapArray& array) override { return VisitList(array); } | |
diff --git a/cpp/src/arrow/json/converter-test.cc b/cpp/src/arrow/json/converter-test.cc | |
index 86e8e8dc8..cf09e617d 100644 | |
--- a/cpp/src/arrow/json/converter-test.cc | |
+++ b/cpp/src/arrow/json/converter-test.cc | |
@@ -85,6 +85,11 @@ TEST(ConverterTest, String) { | |
AssertConvert(utf8(), src, src); | |
} | |
+// Large-string counterpart of the String test above: the same JSON text is | |
+// passed as both the second and third AssertConvert argument. | |
+TEST(ConverterTest, LargeString) { | |
+  std::string json_text = R"(["a", "b c", null, "d e f", "g"])"; | |
+  AssertConvert(large_utf8(), json_text, json_text); | |
+} | |
+ | |
TEST(ConverterTest, Timestamp) { | |
std::string src = R"([null, "1970-01-01", "2018-11-13 17:11:10"])"; | |
AssertConvert(timestamp(TimeUnit::SECOND), src, src); | |
diff --git a/cpp/src/arrow/json/converter.cc b/cpp/src/arrow/json/converter.cc | |
index 078e31418..6b7b73086 100644 | |
--- a/cpp/src/arrow/json/converter.cc | |
+++ b/cpp/src/arrow/json/converter.cc | |
@@ -264,6 +264,8 @@ Status MakeConverter(const std::shared_ptr<DataType>& out_type, MemoryPool* pool | |
CONVERTER_CASE(Type::DATE64, DateTimeConverter<Date64Type>); | |
CONVERTER_CASE(Type::BINARY, BinaryConverter<BinaryType>); | |
CONVERTER_CASE(Type::STRING, BinaryConverter<StringType>); | |
+ CONVERTER_CASE(Type::LARGE_BINARY, BinaryConverter<LargeBinaryType>); | |
+ CONVERTER_CASE(Type::LARGE_STRING, BinaryConverter<LargeStringType>); | |
default: | |
return Status::NotImplemented("JSON conversion to ", *out_type, | |
" is not supported"); | |
diff --git a/cpp/src/arrow/pretty_print-test.cc b/cpp/src/arrow/pretty_print-test.cc | |
index c77a92b7f..cdb230c6c 100644 | |
--- a/cpp/src/arrow/pretty_print-test.cc | |
+++ b/cpp/src/arrow/pretty_print-test.cc | |
@@ -155,6 +155,7 @@ TEST_F(TestPrettyPrint, PrimitiveType) { | |
null | |
])expected"; | |
CheckPrimitive<StringType, std::string>({0, 10}, is_valid, values3, ex3); | |
+ CheckPrimitive<LargeStringType, std::string>({0, 10}, is_valid, values3, ex3); | |
static const char* ex3_in2 = R"expected( [ | |
"foo", | |
"bar", | |
@@ -163,6 +164,7 @@ TEST_F(TestPrettyPrint, PrimitiveType) { | |
null | |
])expected"; | |
CheckPrimitive<StringType, std::string>({2, 10}, is_valid, values3, ex3_in2); | |
+ CheckPrimitive<LargeStringType, std::string>({2, 10}, is_valid, values3, ex3_in2); | |
} | |
TEST_F(TestPrettyPrint, Int8) { | |
@@ -338,9 +340,11 @@ TEST_F(TestPrettyPrint, BinaryType) { | |
std::vector<std::string> values = {"foo", "bar", "", "baz", "", "\xff"}; | |
static const char* ex = "[\n 666F6F,\n 626172,\n null,\n 62617A,\n ,\n FF\n]"; | |
CheckPrimitive<BinaryType, std::string>({0}, is_valid, values, ex); | |
+ CheckPrimitive<LargeBinaryType, std::string>({0}, is_valid, values, ex); | |
static const char* ex_in2 = | |
" [\n 666F6F,\n 626172,\n null,\n 62617A,\n ,\n FF\n ]"; | |
CheckPrimitive<BinaryType, std::string>({2}, is_valid, values, ex_in2); | |
+ CheckPrimitive<LargeBinaryType, std::string>({2}, is_valid, values, ex_in2); | |
} | |
TEST_F(TestPrettyPrint, ListType) { | |
diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc | |
index 6caef1714..5a54e13b8 100644 | |
--- a/cpp/src/arrow/pretty_print.cc | |
+++ b/cpp/src/arrow/pretty_print.cc | |
@@ -205,7 +205,9 @@ class ArrayPrinter : public PrettyPrinter { | |
// String (Utf8) | |
template <typename T> | |
- inline typename std::enable_if<std::is_same<StringArray, T>::value, Status>::type | |
+ inline typename std::enable_if<std::is_same<StringArray, T>::value || | |
+ std::is_same<LargeStringArray, T>::value, | |
+ Status>::type | |
WriteDataValues(const T& array) { | |
WriteValues(array, [&](int64_t i) { (*sink_) << "\"" << array.GetView(i) << "\""; }); | |
return Status::OK(); | |
@@ -213,7 +215,9 @@ class ArrayPrinter : public PrettyPrinter { | |
// Binary | |
template <typename T> | |
- inline typename std::enable_if<std::is_same<BinaryArray, T>::value, Status>::type | |
+ inline typename std::enable_if<std::is_same<BinaryArray, T>::value || | |
+ std::is_same<LargeBinaryArray, T>::value, | |
+ Status>::type | |
WriteDataValues(const T& array) { | |
WriteValues(array, [&](int64_t i) { (*sink_) << HexEncode(array.GetView(i)); }); | |
return Status::OK(); | |
@@ -314,6 +318,7 @@ class ArrayPrinter : public PrettyPrinter { | |
typename std::enable_if<std::is_base_of<PrimitiveArray, T>::value || | |
std::is_base_of<FixedSizeBinaryArray, T>::value || | |
std::is_base_of<BinaryArray, T>::value || | |
+ std::is_base_of<LargeBinaryArray, T>::value || | |
std::is_base_of<ListArray, T>::value || | |
std::is_base_of<MapArray, T>::value || | |
std::is_base_of<FixedSizeListArray, T>::value, | |
diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h | |
index 4f0589a2f..76aecd01f 100644 | |
--- a/cpp/src/arrow/scalar.h | |
+++ b/cpp/src/arrow/scalar.h | |
@@ -91,20 +91,22 @@ struct NumericScalar : public internal::PrimitiveScalar { | |
: internal::PrimitiveScalar{type, is_valid}, value(value) {} | |
}; | |
-struct ARROW_EXPORT BinaryScalar : public Scalar { | |
+template <typename Type> | |
+struct BaseBinaryScalar : public Scalar { | |
std::shared_ptr<Buffer> value; | |
- explicit BinaryScalar(const std::shared_ptr<Buffer>& value, bool is_valid = true) | |
- : BinaryScalar(value, binary(), is_valid) {} | |
protected: | |
- BinaryScalar(const std::shared_ptr<Buffer>& value, | |
- const std::shared_ptr<DataType>& type, bool is_valid = true) | |
+ BaseBinaryScalar(const std::shared_ptr<Buffer>& value, | |
+ const std::shared_ptr<DataType>& type, bool is_valid = true) | |
: Scalar{type, is_valid}, value(value) {} | |
}; | |
-struct ARROW_EXPORT FixedSizeBinaryScalar : public BinaryScalar { | |
- FixedSizeBinaryScalar(const std::shared_ptr<Buffer>& value, | |
- const std::shared_ptr<DataType>& type, bool is_valid = true); | |
+struct ARROW_EXPORT BinaryScalar : public BaseBinaryScalar<BinaryType> { | |
+ explicit BinaryScalar(const std::shared_ptr<Buffer>& value, bool is_valid = true) | |
+ : BaseBinaryScalar(value, binary(), is_valid) {} | |
+ | |
+ protected: | |
+ using BaseBinaryScalar::BaseBinaryScalar; | |
}; | |
struct ARROW_EXPORT StringScalar : public BinaryScalar { | |
@@ -112,6 +114,24 @@ struct ARROW_EXPORT StringScalar : public BinaryScalar { | |
: BinaryScalar(value, utf8(), is_valid) {} | |
}; | |
+/// \brief Scalar of type large_binary; the value is held in a Buffer | |
+/// (see BaseBinaryScalar::value). | |
+struct ARROW_EXPORT LargeBinaryScalar : public BaseBinaryScalar<LargeBinaryType> { | |
+ /// Construct a scalar whose DataType is the one returned by large_binary(). | |
+ explicit LargeBinaryScalar(const std::shared_ptr<Buffer>& value, bool is_valid = true) | |
+ : BaseBinaryScalar(value, large_binary(), is_valid) {} | |
+ | |
+ protected: | |
+ // Re-export the base (value, type, is_valid) constructor so that subclasses | |
+ // such as LargeStringScalar can supply their own DataType. | |
+ using BaseBinaryScalar::BaseBinaryScalar; | |
+}; | |
+ | |
+/// \brief Scalar of type large_utf8 (string data with 64-bit offsets); the | |
+/// value is held in a Buffer. | |
+struct ARROW_EXPORT LargeStringScalar : public LargeBinaryScalar { | |
+  /// \param[in] value buffer holding the string bytes | |
+  /// \param[in] is_valid false to mark the scalar as null | |
+  explicit LargeStringScalar(const std::shared_ptr<Buffer>& value, bool is_valid = true) | |
+      // The type must be large_utf8(): utf8() returns a StringType instance, which | |
+      // would label this scalar as a 32-bit-offset string and disagree with | |
+      // TypeTraits<LargeStringType>::type_singleton(). | |
+      : LargeBinaryScalar(value, large_utf8(), is_valid) {} | |
+}; | |
+ | |
+struct ARROW_EXPORT FixedSizeBinaryScalar : public BinaryScalar { | |
+ FixedSizeBinaryScalar(const std::shared_ptr<Buffer>& value, | |
+ const std::shared_ptr<DataType>& type, bool is_valid = true); | |
+}; | |
+ | |
class ARROW_EXPORT Date32Scalar : public NumericScalar<Date32Type> { | |
public: | |
using NumericScalar<Date32Type>::NumericScalar; | |
diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc | |
index f693a4535..1e26a4420 100644 | |
--- a/cpp/src/arrow/testing/random.cc | |
+++ b/cpp/src/arrow/testing/random.cc | |
@@ -145,24 +145,30 @@ PRIMITIVE_RAND_FLOAT_IMPL(Float64, double, DoubleType) | |
#undef PRIMITIVE_RAND_FLOAT_IMPL | |
#undef PRIMITIVE_RAND_IMPL | |
-std::shared_ptr<arrow::Array> RandomArrayGenerator::String(int64_t size, | |
- int32_t min_length, | |
- int32_t max_length, | |
- double null_probability) { | |
+template <typename TypeClass> | |
+static std::shared_ptr<arrow::Array> GenerateBinaryArray(RandomArrayGenerator* gen, | |
+ int64_t size, int32_t min_length, | |
+ int32_t max_length, | |
+ double null_probability) { | |
+ using offset_type = typename TypeClass::offset_type; | |
+ using BuilderType = typename TypeTraits<TypeClass>::BuilderType; | |
+ using OffsetArrowType = typename CTypeTraits<offset_type>::ArrowType; | |
+ using OffsetArrayType = typename TypeTraits<OffsetArrowType>::ArrayType; | |
+ | |
if (null_probability < 0 || null_probability > 1) { | |
ABORT_NOT_OK(Status::Invalid("null_probability must be between 0 and 1")); | |
} | |
- auto int32_lengths = Int32(size, min_length, max_length, null_probability); | |
- auto lengths = std::dynamic_pointer_cast<Int32Array>(int32_lengths); | |
+ auto lengths = std::dynamic_pointer_cast<OffsetArrayType>( | |
+ gen->Numeric<OffsetArrowType>(size, min_length, max_length, null_probability)); | |
// Visual Studio does not implement uniform_int_distribution for char types. | |
using GenOpt = GenerateOptions<uint8_t, std::uniform_int_distribution<uint16_t>>; | |
- GenOpt options(seed(), static_cast<uint8_t>('A'), static_cast<uint8_t>('z'), | |
+ GenOpt options(gen->seed(), static_cast<uint8_t>('A'), static_cast<uint8_t>('z'), | |
/*null_probability=*/0); | |
std::vector<uint8_t> str_buffer(max_length); | |
- StringBuilder builder; | |
+ BuilderType builder; | |
for (int64_t i = 0; i < size; ++i) { | |
if (lengths->IsValid(i)) { | |
@@ -178,6 +184,22 @@ std::shared_ptr<arrow::Array> RandomArrayGenerator::String(int64_t size, | |
return result; | |
} | |
+// Thin wrapper: forwards to GenerateBinaryArray instantiated for StringType, | |
+// passing this generator along. | |
+std::shared_ptr<arrow::Array> RandomArrayGenerator::String(int64_t size, | |
+    int32_t min_length, int32_t max_length, double null_probability) { | |
+  std::shared_ptr<arrow::Array> generated = GenerateBinaryArray<StringType>( | |
+      this, size, min_length, max_length, null_probability); | |
+  return generated; | |
+} | |
+ | |
+// Thin wrapper: forwards to GenerateBinaryArray instantiated for | |
+// LargeStringType, passing this generator along. | |
+std::shared_ptr<arrow::Array> RandomArrayGenerator::LargeString(int64_t size, | |
+    int32_t min_length, int32_t max_length, double null_probability) { | |
+  std::shared_ptr<arrow::Array> generated = GenerateBinaryArray<LargeStringType>( | |
+      this, size, min_length, max_length, null_probability); | |
+  return generated; | |
+} | |
+ | |
std::shared_ptr<arrow::Array> RandomArrayGenerator::StringWithRepeats( | |
int64_t size, int64_t unique, int32_t min_length, int32_t max_length, | |
double null_probability) { | |
diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h | |
index 6b188fd57..fc8c2d2fb 100644 | |
--- a/cpp/src/arrow/testing/random.h | |
+++ b/cpp/src/arrow/testing/random.h | |
@@ -214,6 +214,19 @@ class ARROW_EXPORT RandomArrayGenerator { | |
std::shared_ptr<arrow::Array> String(int64_t size, int32_t min_length, | |
int32_t max_length, double null_probability); | |
+ /// \brief Generates a random LargeStringArray | |
+ /// | |
+ /// \param[in] size the size of the array to generate | |
+ /// \param[in] min_length the lower bound of the string length | |
+ /// determined by the uniform distribution | |
+ /// \param[in] max_length the upper bound of the string length | |
+ /// determined by the uniform distribution | |
+ /// \param[in] null_probability the probability of a row being null | |
+ /// | |
+ /// \return a generated Array | |
+ std::shared_ptr<arrow::Array> LargeString(int64_t size, int32_t min_length, | |
+ int32_t max_length, double null_probability); | |
+ | |
/// \brief Generates a random StringArray with repeated values | |
/// | |
/// \param[in] size the size of the array to generate | |
@@ -230,9 +243,9 @@ class ARROW_EXPORT RandomArrayGenerator { | |
int32_t min_length, int32_t max_length, | |
double null_probability); | |
- private: | |
SeedType seed() { return seed_distribution_(seed_rng_); } | |
+ private: | |
std::uniform_int_distribution<SeedType> seed_distribution_; | |
std::default_random_engine seed_rng_; | |
}; | |
diff --git a/cpp/src/arrow/type-test.cc b/cpp/src/arrow/type-test.cc | |
index 7ad1d8ad0..7bfb72001 100644 | |
--- a/cpp/src/arrow/type-test.cc | |
+++ b/cpp/src/arrow/type-test.cc | |
@@ -354,6 +354,20 @@ TEST(TestStringType, ToString) { | |
ASSERT_EQ(str.ToString(), std::string("string")); | |
} | |
+// Exercises Equals(), id() and ToString() on the large binary/string types. | |
+TEST(TestLargeBinaryTypes, ToString) { | |
+  BinaryType narrow_binary; | |
+  LargeBinaryType wide_binary; | |
+  LargeBinaryType wide_binary_twin; | |
+  LargeStringType wide_string; | |
+ | |
+  // Equality: equal to another LargeBinaryType, unequal to LargeStringType | |
+  // and to the 32-bit BinaryType. | |
+  EXPECT_TRUE(wide_binary.Equals(wide_binary_twin)); | |
+  EXPECT_FALSE(wide_binary.Equals(wide_string)); | |
+  EXPECT_FALSE(wide_binary.Equals(narrow_binary)); | |
+ | |
+  // Type ids and printable names. | |
+  ASSERT_EQ(wide_binary.id(), Type::LARGE_BINARY); | |
+  ASSERT_EQ(wide_binary.ToString(), std::string("large_binary")); | |
+  ASSERT_EQ(wide_string.id(), Type::LARGE_STRING); | |
+  ASSERT_EQ(wide_string.ToString(), std::string("large_string")); | |
+} | |
+ | |
TEST(TestFixedSizeBinaryType, ToString) { | |
auto t = fixed_size_binary(10); | |
ASSERT_EQ(t->id(), Type::FIXED_SIZE_BINARY); | |
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc | |
index 54e0103fb..4397bf297 100644 | |
--- a/cpp/src/arrow/type.cc | |
+++ b/cpp/src/arrow/type.cc | |
@@ -143,8 +143,6 @@ FloatingPointType::Precision DoubleType::precision() const { | |
return FloatingPointType::DOUBLE; | |
} | |
-std::string StringType::ToString() const { return std::string("string"); } | |
- | |
std::string ListType::ToString() const { | |
std::stringstream s; | |
s << "list<" << value_field()->ToString() << ">"; | |
@@ -178,7 +176,13 @@ std::string FixedSizeListType::ToString() const { | |
return s.str(); | |
} | |
-std::string BinaryType::ToString() const { return std::string("binary"); } | |
+std::string BinaryType::ToString() const { return "binary"; } | |
+ | |
+std::string LargeBinaryType::ToString() const { return "large_binary"; } | |
+ | |
+std::string StringType::ToString() const { return "string"; } | |
+ | |
+std::string LargeStringType::ToString() const { return "large_string"; } | |
int FixedSizeBinaryType::bit_width() const { return CHAR_BIT * byte_width(); } | |
@@ -667,7 +671,9 @@ TYPE_FACTORY(float16, HalfFloatType) | |
TYPE_FACTORY(float32, FloatType) | |
TYPE_FACTORY(float64, DoubleType) | |
TYPE_FACTORY(utf8, StringType) | |
+TYPE_FACTORY(large_utf8, LargeStringType) | |
TYPE_FACTORY(binary, BinaryType) | |
+TYPE_FACTORY(large_binary, LargeBinaryType) | |
TYPE_FACTORY(date64, Date64Type) | |
TYPE_FACTORY(date32, Date32Type) | |
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h | |
index fc235bb2d..572b888df 100644 | |
--- a/cpp/src/arrow/type.h | |
+++ b/cpp/src/arrow/type.h | |
@@ -143,7 +143,13 @@ struct Type { | |
/// Measure of elapsed time in either seconds, milliseconds, microseconds | |
/// or nanoseconds. | |
- DURATION | |
+ DURATION, | |
+ | |
+ /// Like STRING, but with 64-bit offsets | |
+ LARGE_STRING, | |
+ | |
+ /// Like BINARY, but with 64-bit offsets | |
+ LARGE_BINARY | |
}; | |
}; | |
@@ -472,6 +478,7 @@ class ARROW_EXPORT DoubleType | |
class ARROW_EXPORT ListType : public NestedType { | |
public: | |
static constexpr Type::type type_id = Type::LIST; | |
+ using offset_type = int32_t; | |
// List can contain any other logical value type | |
explicit ListType(const std::shared_ptr<DataType>& value_type) | |
@@ -486,7 +493,7 @@ class ARROW_EXPORT ListType : public NestedType { | |
std::shared_ptr<DataType> value_type() const { return children_[0]->type(); } | |
DataTypeLayout layout() const override { | |
- return {{1, CHAR_BIT * sizeof(int32_t)}, false}; | |
+ return {{1, CHAR_BIT * sizeof(offset_type)}, false}; | |
} | |
std::string ToString() const override; | |
@@ -550,23 +557,78 @@ class ARROW_EXPORT FixedSizeListType : public NestedType { | |
int32_t list_size_; | |
}; | |
+/// \brief Base class for all variable-size binary data types | |
+/// | |
+/// Common ancestor of BinaryType and LargeBinaryType (and hence of their string | |
+/// subclasses), so generic code can match any of them with a single | |
+/// std::is_base_of<BaseBinaryType, T> test. | |
+class ARROW_EXPORT BaseBinaryType : public DataType, public NoExtraMeta { | |
+ public: | |
+ // Inherit DataType's constructors unchanged. | |
+ using DataType::DataType; | |
+}; | |
+ | |
/// \brief Concrete type class for variable-size binary data | |
-class ARROW_EXPORT BinaryType : public DataType, public NoExtraMeta { | |
+class ARROW_EXPORT BinaryType : public BaseBinaryType { | |
public: | |
static constexpr Type::type type_id = Type::BINARY; | |
+ static constexpr bool is_utf8 = false; | |
+ using offset_type = int32_t; | |
BinaryType() : BinaryType(Type::BINARY) {} | |
DataTypeLayout layout() const override { | |
- return {{1, CHAR_BIT * sizeof(int32_t), DataTypeLayout::kVariableSizeBuffer}, false}; | |
+ return {{1, CHAR_BIT * sizeof(offset_type), DataTypeLayout::kVariableSizeBuffer}, | |
+ false}; | |
} | |
std::string ToString() const override; | |
std::string name() const override { return "binary"; } | |
protected: | |
- // Allow subclasses to change the logical type. | |
- explicit BinaryType(Type::type logical_type) : DataType(logical_type) {} | |
+ // Allow subclasses like StringType to change the logical type. | |
+ explicit BinaryType(Type::type logical_type) : BaseBinaryType(logical_type) {} | |
+}; | |
+ | |
+/// \brief Concrete type class for large variable-size binary data | |
+/// | |
+/// Same shape as BinaryType, except that offset_type is int64_t instead of | |
+/// int32_t. | |
+class ARROW_EXPORT LargeBinaryType : public BaseBinaryType { | |
+ public: | |
+ static constexpr Type::type type_id = Type::LARGE_BINARY; | |
+ // False here; the LargeStringType subclass declares its own is_utf8 = true. | |
+ static constexpr bool is_utf8 = false; | |
+ // Integer type used for value offsets (64-bit for the "large" variants). | |
+ using offset_type = int64_t; | |
+ | |
+ LargeBinaryType() : LargeBinaryType(Type::LARGE_BINARY) {} | |
+ | |
+ // Bit widths reported: 1, then the offset width, then a variable-size buffer. | |
+ // NOTE(review): the leading 1 is presumably the validity bitmap — confirm | |
+ // against DataTypeLayout. | |
+ DataTypeLayout layout() const override { | |
+ return {{1, CHAR_BIT * sizeof(offset_type), DataTypeLayout::kVariableSizeBuffer}, | |
+ false}; | |
+ } | |
+ | |
+ std::string ToString() const override; | |
+ std::string name() const override { return "large_binary"; } | |
+ | |
+ protected: | |
+ // Allow subclasses like LargeStringType to change the logical type. | |
+ explicit LargeBinaryType(Type::type logical_type) : BaseBinaryType(logical_type) {} | |
+}; | |
+ | |
+/// \brief Concrete type class for variable-size string data, utf8-encoded | |
+/// | |
+/// Inherits offset_type and layout() from BinaryType; only the type id, the | |
+/// is_utf8 flag and the names differ. | |
+class ARROW_EXPORT StringType : public BinaryType { | |
+ public: | |
+ static constexpr Type::type type_id = Type::STRING; | |
+ static constexpr bool is_utf8 = true; | |
+ | |
+ // Uses the protected BinaryType constructor to install the STRING type id. | |
+ StringType() : BinaryType(Type::STRING) {} | |
+ | |
+ std::string ToString() const override; | |
+ // name() is "utf8", matching the utf8() factory function. | |
+ std::string name() const override { return "utf8"; } | |
+}; | |
+ | |
+/// \brief Concrete type class for large variable-size string data, utf8-encoded | |
+/// | |
+/// Inherits the int64_t offset_type and layout() from LargeBinaryType; only the | |
+/// type id, the is_utf8 flag and the names differ. | |
+class ARROW_EXPORT LargeStringType : public LargeBinaryType { | |
+ public: | |
+ static constexpr Type::type type_id = Type::LARGE_STRING; | |
+ static constexpr bool is_utf8 = true; | |
+ | |
+ // Uses the protected LargeBinaryType constructor to install the LARGE_STRING id. | |
+ LargeStringType() : LargeBinaryType(Type::LARGE_STRING) {} | |
+ | |
+ std::string ToString() const override; | |
+ // name() is "large_utf8", matching the large_utf8() factory function. | |
+ std::string name() const override { return "large_utf8"; } | |
+}; | |
/// \brief Concrete type class for fixed-size binary data | |
@@ -591,17 +653,6 @@ class ARROW_EXPORT FixedSizeBinaryType : public FixedWidthType, public Parametri | |
int32_t byte_width_; | |
}; | |
-/// \brief Concrete type class for variable-size string data, utf8-encoded | |
-class ARROW_EXPORT StringType : public BinaryType { | |
- public: | |
- static constexpr Type::type type_id = Type::STRING; | |
- | |
- StringType() : BinaryType(Type::STRING) {} | |
- | |
- std::string ToString() const override; | |
- std::string name() const override { return "utf8"; } | |
-}; | |
- | |
/// \brief Concrete type class for struct data | |
class ARROW_EXPORT StructType : public NestedType { | |
public: | |
diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h | |
index c42d66152..9935af511 100644 | |
--- a/cpp/src/arrow/type_fwd.h | |
+++ b/cpp/src/arrow/type_fwd.h | |
@@ -65,6 +65,11 @@ class BinaryArray; | |
class BinaryBuilder; | |
struct BinaryScalar; | |
+class LargeBinaryType; | |
+class LargeBinaryArray; | |
+class LargeBinaryBuilder; | |
+struct LargeBinaryScalar; | |
+ | |
class FixedSizeBinaryType; | |
class FixedSizeBinaryArray; | |
class FixedSizeBinaryBuilder; | |
@@ -75,6 +80,11 @@ class StringArray; | |
class StringBuilder; | |
struct StringScalar; | |
+class LargeStringType; | |
+class LargeStringArray; | |
+class LargeStringBuilder; | |
+struct LargeStringScalar; | |
+ | |
class ListType; | |
class ListArray; | |
class ListBuilder; | |
@@ -218,8 +228,12 @@ std::shared_ptr<DataType> ARROW_EXPORT float32(); | |
std::shared_ptr<DataType> ARROW_EXPORT float64(); | |
/// \brief Return a StringType instance | |
std::shared_ptr<DataType> ARROW_EXPORT utf8(); | |
+/// \brief Return a LargeStringType instance | |
+std::shared_ptr<DataType> ARROW_EXPORT large_utf8(); | |
/// \brief Return a BinaryType instance | |
std::shared_ptr<DataType> ARROW_EXPORT binary(); | |
+/// \brief Return a LargeBinaryType instance | |
+std::shared_ptr<DataType> ARROW_EXPORT large_binary(); | |
/// \brief Return a Date32Type instance | |
std::shared_ptr<DataType> ARROW_EXPORT date32(); | |
/// \brief Return a Date64Type instance | |
diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h | |
index 4902f5c63..50e1e725a 100644 | |
--- a/cpp/src/arrow/type_traits.h | |
+++ b/cpp/src/arrow/type_traits.h | |
@@ -243,6 +243,15 @@ struct TypeTraits<BinaryType> { | |
static inline std::shared_ptr<DataType> type_singleton() { return binary(); } | |
}; | |
+/// Compile-time mapping from LargeBinaryType to its array, builder and scalar | |
+/// classes, plus a factory for the type instance. | |
+template <> | |
+struct TypeTraits<LargeBinaryType> { | |
+ using ArrayType = LargeBinaryArray; | |
+ using BuilderType = LargeBinaryBuilder; | |
+ using ScalarType = LargeBinaryScalar; | |
+ constexpr static bool is_parameter_free = true; | |
+ // Hands back whatever the large_binary() factory returns. | |
+ static inline std::shared_ptr<DataType> type_singleton() { return large_binary(); } | |
+}; | |
+ | |
template <> | |
struct TypeTraits<FixedSizeBinaryType> { | |
using ArrayType = FixedSizeBinaryArray; | |
@@ -260,6 +269,15 @@ struct TypeTraits<StringType> { | |
static inline std::shared_ptr<DataType> type_singleton() { return utf8(); } | |
}; | |
+/// Compile-time mapping from LargeStringType to its array, builder and scalar | |
+/// classes, plus a factory for the type instance. | |
+template <> | |
+struct TypeTraits<LargeStringType> { | |
+ using ArrayType = LargeStringArray; | |
+ using BuilderType = LargeStringBuilder; | |
+ using ScalarType = LargeStringScalar; | |
+ constexpr static bool is_parameter_free = true; | |
+ // Hands back whatever the large_utf8() factory returns. | |
+ static inline std::shared_ptr<DataType> type_singleton() { return large_utf8(); } | |
+}; | |
+ | |
template <> | |
struct CTypeTraits<std::string> : public TypeTraits<StringType> { | |
using ArrowType = StringType; | |
@@ -361,6 +379,12 @@ struct is_8bit_int { | |
(std::is_same<UInt8Type, T>::value || std::is_same<Int8Type, T>::value); | |
}; | |
+/// Trait whose `value` is true exactly when T is StringType or LargeStringType. | |
+template <typename T> | |
+struct is_any_string_type { | |
+  static constexpr bool value = | |
+      std::is_same<T, StringType>::value || std::is_same<T, LargeStringType>::value; | |
+}; | |
+ | |
template <typename T, typename R = void> | |
using enable_if_8bit_int = typename std::enable_if<is_8bit_int<T>::value, R>::type; | |
@@ -412,10 +436,18 @@ using enable_if_has_c_type = typename std::enable_if<has_c_type<T>::value, R>::t | |
template <typename T, typename R = void> | |
using enable_if_null = typename std::enable_if<std::is_same<NullType, T>::value, R>::type; | |
+/// SFINAE helper: names R only when T is BaseBinaryType or derives from it, | |
+/// i.e. for both the BinaryType and the LargeBinaryType families. | |
+template <typename T, typename R = void> | |
+using enable_if_base_binary = | |
+ typename std::enable_if<std::is_base_of<BaseBinaryType, T>::value, R>::type; | |
+ | |
template <typename T, typename R = void> | |
using enable_if_binary = | |
typename std::enable_if<std::is_base_of<BinaryType, T>::value, R>::type; | |
+/// SFINAE helper: names R only when T is LargeBinaryType or derives from it | |
+/// (e.g. LargeStringType). | |
+template <typename T, typename R = void> | |
+using enable_if_large_binary = | |
+ typename std::enable_if<std::is_base_of<LargeBinaryType, T>::value, R>::type; | |
+ | |
template <typename T, typename R = void> | |
using enable_if_boolean = | |
typename std::enable_if<std::is_same<BooleanType, T>::value, R>::type; | |
@@ -574,6 +606,17 @@ static inline bool is_binary_like(Type::type type_id) { | |
return false; | |
} | |
+/// \brief Return true iff type_id is LARGE_BINARY or LARGE_STRING, the | |
+/// 64-bit-offset counterparts of BINARY and STRING. | |
+static inline bool is_large_binary_like(Type::type type_id) { | |
+  return type_id == Type::LARGE_BINARY || type_id == Type::LARGE_STRING; | |
+} | |
+ | |
static inline bool is_dictionary(Type::type type_id) { | |
return type_id == Type::DICTIONARY; | |
} | |
diff --git a/cpp/src/arrow/visitor.cc b/cpp/src/arrow/visitor.cc | |
index 53b341b53..2ec6c6421 100644 | |
--- a/cpp/src/arrow/visitor.cc | |
+++ b/cpp/src/arrow/visitor.cc | |
@@ -47,6 +47,8 @@ ARRAY_VISITOR_DEFAULT(FloatArray) | |
ARRAY_VISITOR_DEFAULT(DoubleArray) | |
ARRAY_VISITOR_DEFAULT(BinaryArray) | |
ARRAY_VISITOR_DEFAULT(StringArray) | |
+ARRAY_VISITOR_DEFAULT(LargeBinaryArray) | |
+ARRAY_VISITOR_DEFAULT(LargeStringArray) | |
ARRAY_VISITOR_DEFAULT(FixedSizeBinaryArray) | |
ARRAY_VISITOR_DEFAULT(Date32Array) | |
ARRAY_VISITOR_DEFAULT(Date64Array) | |
@@ -90,6 +92,8 @@ TYPE_VISITOR_DEFAULT(FloatType) | |
TYPE_VISITOR_DEFAULT(DoubleType) | |
TYPE_VISITOR_DEFAULT(StringType) | |
TYPE_VISITOR_DEFAULT(BinaryType) | |
+TYPE_VISITOR_DEFAULT(LargeStringType) | |
+TYPE_VISITOR_DEFAULT(LargeBinaryType) | |
TYPE_VISITOR_DEFAULT(FixedSizeBinaryType) | |
TYPE_VISITOR_DEFAULT(Date64Type) | |
TYPE_VISITOR_DEFAULT(Date32Type) | |
@@ -134,6 +138,8 @@ SCALAR_VISITOR_DEFAULT(FloatScalar) | |
SCALAR_VISITOR_DEFAULT(DoubleScalar) | |
SCALAR_VISITOR_DEFAULT(StringScalar) | |
SCALAR_VISITOR_DEFAULT(BinaryScalar) | |
+SCALAR_VISITOR_DEFAULT(LargeStringScalar) | |
+SCALAR_VISITOR_DEFAULT(LargeBinaryScalar) | |
SCALAR_VISITOR_DEFAULT(FixedSizeBinaryScalar) | |
SCALAR_VISITOR_DEFAULT(Date64Scalar) | |
SCALAR_VISITOR_DEFAULT(Date32Scalar) | |
diff --git a/cpp/src/arrow/visitor.h b/cpp/src/arrow/visitor.h | |
index a4979e9ce..1c854c478 100644 | |
--- a/cpp/src/arrow/visitor.h | |
+++ b/cpp/src/arrow/visitor.h | |
@@ -43,6 +43,8 @@ class ARROW_EXPORT ArrayVisitor { | |
virtual Status Visit(const DoubleArray& array); | |
virtual Status Visit(const StringArray& array); | |
virtual Status Visit(const BinaryArray& array); | |
+ virtual Status Visit(const LargeStringArray& array); | |
+ virtual Status Visit(const LargeBinaryArray& array); | |
virtual Status Visit(const FixedSizeBinaryArray& array); | |
virtual Status Visit(const Date32Array& array); | |
virtual Status Visit(const Date64Array& array); | |
@@ -81,6 +83,8 @@ class ARROW_EXPORT TypeVisitor { | |
virtual Status Visit(const DoubleType& type); | |
virtual Status Visit(const StringType& type); | |
virtual Status Visit(const BinaryType& type); | |
+ virtual Status Visit(const LargeStringType& type); | |
+ virtual Status Visit(const LargeBinaryType& type); | |
virtual Status Visit(const FixedSizeBinaryType& type); | |
virtual Status Visit(const Date64Type& type); | |
virtual Status Visit(const Date32Type& type); | |
@@ -119,6 +123,8 @@ class ARROW_EXPORT ScalarVisitor { | |
virtual Status Visit(const DoubleScalar& scalar); | |
virtual Status Visit(const StringScalar& scalar); | |
virtual Status Visit(const BinaryScalar& scalar); | |
+ virtual Status Visit(const LargeStringScalar& scalar); | |
+ virtual Status Visit(const LargeBinaryScalar& scalar); | |
virtual Status Visit(const FixedSizeBinaryScalar& scalar); | |
virtual Status Visit(const Date64Scalar& scalar); | |
virtual Status Visit(const Date32Scalar& scalar); | |
diff --git a/cpp/src/arrow/visitor_inline.h b/cpp/src/arrow/visitor_inline.h | |
index 544763a2f..3ed058e64 100644 | |
--- a/cpp/src/arrow/visitor_inline.h | |
+++ b/cpp/src/arrow/visitor_inline.h | |
@@ -47,6 +47,8 @@ namespace arrow { | |
ACTION(Double); \ | |
ACTION(String); \ | |
ACTION(Binary); \ | |
+ ACTION(LargeString); \ | |
+ ACTION(LargeBinary); \ | |
ACTION(FixedSizeBinary); \ | |
ACTION(Duration); \ | |
ACTION(Date32); \ | |
@@ -186,12 +188,13 @@ struct ArrayDataVisitor<T, enable_if_has_c_type<T>> { | |
}; | |
template <typename T> | |
-struct ArrayDataVisitor<T, enable_if_binary<T>> { | |
+struct ArrayDataVisitor<T, enable_if_base_binary<T>> { | |
template <typename Visitor> | |
static Status Visit(const ArrayData& arr, Visitor* visitor) { | |
+ using offset_type = typename T::offset_type; | |
constexpr uint8_t empty_value = 0; | |
- const int32_t* offsets = arr.GetValues<int32_t>(1); | |
+ const offset_type* offsets = arr.GetValues<offset_type>(1); | |
const uint8_t* data; | |
if (!arr.buffers[2]) { | |
data = &empty_value; | |
diff --git a/format/Schema.fbs b/format/Schema.fbs | |
index 36127925e..91aa9db48 100644 | |
--- a/format/Schema.fbs | |
+++ b/format/Schema.fbs | |
@@ -103,13 +103,22 @@ table FloatingPoint { | |
precision: Precision; | |
} | |
-/// Unicode with UTF-8 encoding | |
+/// UTF-8 encoded Unicode strings. Items are limited to 32-bit byte lengths. | |
table Utf8 { | |
} | |
+/// Raw binary strings. Items are limited to 32-bit byte lengths. | |
table Binary { | |
} | |
+/// Variants of Utf8 and Binary with 64-bit byte lengths. | |
+/// These types are optional and may not be supported by all implementations. | |
+table LargeUtf8 { | |
+} | |
+ | |
+/// Variant of Binary with 64-bit byte lengths; optional, like LargeUtf8. | |
+table LargeBinary { | |
+} | |
+ | |
table FixedSizeBinary { | |
/// Number of bytes per value | |
byteWidth: int; | |
@@ -235,6 +244,8 @@ union Type { | |
FixedSizeList, | |
Map, | |
Duration, | |
+ LargeBinary, | |
+ LargeUtf8, | |
} | |
/// ---------------------------------------------------------------------- | |
diff --git a/rust/datafusion/Cargo.toml b/rust/datafusion/Cargo.toml | |
index 6e021e32e..3f16f79e1 100644 | |
--- a/rust/datafusion/Cargo.toml | |
+++ b/rust/datafusion/Cargo.toml | |
@@ -58,4 +58,3 @@ criterion = "0.2.0" | |
[[bench]] | |
name = "aggregate_query_sql" | |
harness = false | |
- | |
diff --git a/testing b/testing | |
index a674dac19..d14764eff 160000 | |
--- a/testing | |
+++ b/testing | |
@@ -1 +1 @@ | |
-Subproject commit a674dac190c5fc626964c9b611c67552fa2e530d | |
+Subproject commit d14764eff71c51156bea2a7860f8df811d6c9f11 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment