From 2971afc9118ab3f16c63ef9ba6e8311d3200313b Mon Sep 17 00:00:00 2001 From: Hadrian Date: Fri, 26 Jun 2026 00:17:28 -0400 Subject: [PATCH 1/3] first pass --- .../arrow/compute/kernels/scalar_validity.cc | 135 +++++++++--------- .../compute/kernels/scalar_validity_test.cc | 46 ++++++ cpp/src/arrow/util/dict_util.cc | 69 +++++++++ cpp/src/arrow/util/dict_util_internal.h | 2 + cpp/src/arrow/util/ree_util.cc | 30 ++++ cpp/src/arrow/util/ree_util.h | 3 + cpp/src/arrow/util/union_util.cc | 14 ++ cpp/src/arrow/util/union_util.h | 6 + 8 files changed, 235 insertions(+), 70 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index 5913b756f1c0..3167906ef332 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -23,8 +23,11 @@ #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" +#include "arrow/util/dict_util.h" #include "arrow/util/float16.h" #include "arrow/util/logging_internal.h" +#include "arrow/util/ree_util.h" +#include "arrow/util/union_util.h" namespace arrow { @@ -36,29 +39,7 @@ namespace compute { namespace internal { namespace { -Status IsValidExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { - const ArraySpan& arr = batch[0].array; - ArraySpan* out_span = out->array_span_mutable(); - if (arr.type->id() == Type::NA) { - // Input is all nulls => output is entirely false. - bit_util::SetBitsTo(out_span->buffers[1].data, out_span->offset, out_span->length, - false); - return Status::OK(); - } - - DCHECK_EQ(out_span->offset, 0); - DCHECK_LE(out_span->length, arr.length); - if (arr.MayHaveNulls()) { - // We could do a zero-copy optimization, but it isn't worth the added complexity - ::arrow::internal::CopyBitmap(arr.buffers[0].data, arr.offset, arr.length, - out_span->buffers[1].data, out_span->offset); - } else { - // Input has no nulls => output is entirely true. 
- bit_util::SetBitsTo(out_span->buffers[1].data, out_span->offset, out_span->length, - true); - } - return Status::OK(); -} +using NanOptionsState = OptionsWrapper; struct IsFiniteOperator { template @@ -82,8 +63,6 @@ struct IsInfOperator { } }; -using NanOptionsState = OptionsWrapper; - template static void SetNanBits(const ArraySpan& arr, uint8_t* out_bitmap, int64_t out_offset) { const T* data = arr.GetValues(1); @@ -101,45 +80,66 @@ static void SetNanBits(const ArraySpan& arr, uint8_t* out_bitmap, int64_t out_of } } -Status IsNullExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { - const ArraySpan& arr = batch[0].array; - ArraySpan* out_span = out->array_span_mutable(); - if (arr.type->id() == Type::NA) { - bit_util::SetBitsTo(out_span->buffers[1].data, out_span->offset, out_span->length, - true); - return Status::OK(); - } - - const auto& options = NanOptionsState::Get(ctx); - uint8_t* out_bitmap = out_span->buffers[1].data; - if (arr.GetNullCount() > 0) { - // Input has nulls => output is the inverted null (validity) bitmap. - InvertBitmap(arr.buffers[0].data, arr.offset, arr.length, out_bitmap, - out_span->offset); +static Status SetLogicalNullBits(KernelContext* ctx, const ArraySpan& span, + uint8_t* out_bitmap, int64_t out_offset, + bool set_on_null) { + const Type::type t = span.type->id(); + if (t == Type::NA) { + // Input is all nulls, so all output bits are the same. 
+ bit_util::SetBitsTo(out_bitmap, out_offset, span.length, set_on_null); + } else if (t == Type::SPARSE_UNION) { + union_util::SetLogicalNullBitsSparse(span, out_bitmap, out_offset, set_on_null); + } else if (t == Type::DENSE_UNION) { + union_util::SetLogicalNullBitsDense(span, out_bitmap, out_offset, set_on_null); + } else if (t == Type::RUN_END_ENCODED) { + ree_util::SetLogicalNullBits(span, out_bitmap, out_offset, set_on_null); + } else if (t == Type::DICTIONARY) { + dict_util::SetLogicalNullBits(span, out_bitmap, out_offset, set_on_null); } else { - // Input has no nulls => output is entirely false. - bit_util::SetBitsTo(out_bitmap, out_span->offset, out_span->length, false); - } + // Input is a type for which logical and physical nulls are the same, so we can + // use GetNullCount() and the validity bitmap + if (span.GetNullCount() > 0) { + // Input has nulls. The output is either the validity bitmap or the inverse of the + // validity bitmap. + if (set_on_null) { + InvertBitmap(span.buffers[0].data, span.offset, span.length, out_bitmap, + out_offset); + } else { + CopyBitmap(span.buffers[0].data, span.offset, span.length, out_bitmap, + out_offset); + } + } else { + // Input has no nulls, so all output bits are the same. + bit_util::SetBitsTo(out_bitmap, out_offset, span.length, !set_on_null); + } - if (is_floating(arr.type->id()) && options.nan_is_null) { - switch (arr.type->id()) { - case Type::FLOAT: - SetNanBits(arr, out_bitmap, out_span->offset); - break; - case Type::DOUBLE: - SetNanBits(arr, out_bitmap, out_span->offset); - break; - case Type::HALF_FLOAT: - SetNanBits(arr, out_bitmap, out_span->offset); - break; - default: + // If nan_is_null, we must also check for nans. 
+ if (is_floating(t) && NanOptionsState::Get(ctx).nan_is_null) { + if (t == Type::FLOAT) { + SetNanBits(span, out_bitmap, out_offset); + } else if (t == Type::DOUBLE) { + SetNanBits(span, out_bitmap, out_offset); + } else { return Status::NotImplemented("NaN detection not implemented for type ", - arr.type->ToString()); + span.type->ToString()); + } } } return Status::OK(); } +Status IsValidExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + ArraySpan* out_span = out->array_span_mutable(); + return SetLogicalNullBits(ctx, batch[0].array, out_span->buffers[1].data, + out_span->offset, false); +} + +Status IsNullExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + ArraySpan* out_span = out->array_span_mutable(); + return SetLogicalNullBits(ctx, batch[0].array, out_span->buffers[1].data, + out_span->offset, true); +} + struct IsNanOperator { template static constexpr OutType Call(KernelContext*, const InType& value, Status*) { @@ -243,20 +243,14 @@ std::shared_ptr MakeIsNanFunction(std::string name, FunctionDoc } Status TrueUnlessNullExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + // Set all bits in the output's value bitmap to true ArraySpan* out_span = out->array_span_mutable(); - if (out_span->buffers[0].data) { - // If there is a validity bitmap computed above the kernel - // invocation, we copy it to the output buffers - ::arrow::internal::CopyBitmap(out_span->buffers[0].data, out_span->offset, - out_span->length, out_span->buffers[1].data, - out_span->offset); - } else { - // But for all-valid inputs, the engine will skip allocating a - // validity bitmap, so we set everything to true - bit_util::SetBitsTo(out_span->buffers[1].data, out_span->offset, out_span->length, - true); - } - return Status::OK(); + bit_util::SetBitsTo(out_span->buffers[1].data, out_span->offset, out_span->length, + true); + + // Set the output's validity bitmap based on the nullity of the input array + return SetLogicalNullBits(ctx, 
batch[0].array, out_span->buffers[0].data, + out_span->offset, false); } const FunctionDoc is_valid_doc( @@ -302,8 +296,9 @@ void RegisterScalarValidity(FunctionRegistry* registry) { registry, NullHandling::OUTPUT_NOT_NULL, /*can_write_into_slices=*/true, &kNullOptions, NanOptionsState::Init); + // TODO: switch back to NullHandling::INTERSECTION MakeFunction("true_unless_null", true_unless_null_doc, {InputType::Any()}, boolean(), - TrueUnlessNullExec, registry, NullHandling::INTERSECTION, + TrueUnlessNullExec, registry, NullHandling::COMPUTED_PREALLOCATE, /*can_write_into_slices=*/false); DCHECK_OK(registry->AddFunction(MakeIsFiniteFunction("is_finite", is_finite_doc))); diff --git a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc index 4613176b48ca..f4ddf9c5116b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc @@ -59,6 +59,52 @@ TEST_F(TestBooleanValidityKernels, TrueUnlessNull) { type_singleton(), "[null, true, true, null]"); } +TEST_F(TestBooleanValidityKernels, LogicalNulls) { + auto null_dict = + DictArrayFromJSON(dictionary(int8(), int8()), "[0, 2, 1]", "[0, 1, null]"); + CheckScalarUnary("is_valid", null_dict, + ArrayFromJSON(boolean(), "[true, false, true]")); + CheckScalarUnary("is_null", null_dict, + ArrayFromJSON(boolean(), "[false, true, false]")); + CheckScalarUnary("true_unless_null", null_dict, + ArrayFromJSON(boolean(), "[true, null, true]")); + auto null_index = + DictArrayFromJSON(dictionary(int8(), int32()), "[null, 1, 0]", "[8, 2]"); + CheckScalarUnary("true_unless_null", null_index, + ArrayFromJSON(boolean(), "[null, true, true]")); + auto null_dict_and_index = DictArrayFromJSON(dictionary(int8(), boolean()), + "[1, null, 2, 0]", "[true, false, null]"); + CheckScalarUnary("true_unless_null", null_dict_and_index, + ArrayFromJSON(boolean(), "[true, null, null, true]")); + + ASSERT_OK_AND_ASSIGN(auto ree, + 
RunEndEncode(ArrayFromJSON(int64(), "[11, 11, null, null, 12]"))); + CheckScalarUnary("true_unless_null", ree, + ArrayFromJSON(boolean(), "[true, true, null, null, true]")); + + ArrayVector children{ + ArrayFromJSON(int64(), "[1, 23, 45, null, null, -2, null]"), + ArrayFromJSON(float32(), "[null, 1.1, 2.2, null, -4.0, 1.5, 0.1]"), + ArrayFromJSON(utf8(), R"(["alpha", "", "beta", null, "gamma", "delta", null])"), + }; + auto type_ids = ArrayFromJSON(int8(), "[0, 1, 2, 2, 0, 2, 1]"); + auto fields = {field("a", int64()), field("b", float32()), field("c", utf8())}; + SparseUnionArray sparse(sparse_union(fields), 7, children, + type_ids->data()->buffers[1]); + ASSERT_OK(sparse.ValidateFull()); + CheckScalarUnary( + "true_unless_null", sparse, + ArrayFromJSON(boolean(), "[true, true, true, null, null, true, true]")); + + auto offsets = ArrayFromJSON(int32(), "[0, 0, 0, 2, 3, 6, 3]"); + DenseUnionArray dense(dense_union(fields), 7, children, type_ids->data()->buffers[1], + offsets->data()->buffers[1]); + ASSERT_OK(dense.ValidateFull()); + CheckScalarUnary( + "true_unless_null", dense, + ArrayFromJSON(boolean(), "[true, null, true, true, null, null, null]")); +} + TEST_F(TestBooleanValidityKernels, IsValidIsNullNullType) { CheckScalarUnary("is_null", std::make_shared(5), ArrayFromJSON(boolean(), "[true, true, true, true, true]")); diff --git a/cpp/src/arrow/util/dict_util.cc b/cpp/src/arrow/util/dict_util.cc index c93517140ca3..e76a8dcdcc50 100644 --- a/cpp/src/arrow/util/dict_util.cc +++ b/cpp/src/arrow/util/dict_util.cc @@ -19,6 +19,7 @@ #include "arrow/array/array_dict.h" #include "arrow/util/bit_util.h" +#include "arrow/util/bitmap_ops.h" #include "arrow/util/checked_cast.h" namespace arrow { @@ -51,6 +52,31 @@ int64_t LogicalNullCount(const ArraySpan& span) { return null_count; } +template +void SetLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_offset, + bool set_on_null) { + const auto* indices_null_bit_map = span.buffers[0].data; + const 
auto& dictionary_span = span.dictionary(); + // TODO: Is this always non-null? + const auto* dictionary_null_bit_map = dictionary_span.buffers[0].data; + + using CType = typename IndexArrowType::c_type; + const CType* indices_data = span.GetValues(1); + for (int64_t i = 0; i < span.length; i++) { + bool is_null = false; + if (indices_null_bit_map != nullptr && + !bit_util::GetBit(indices_null_bit_map, i + span.offset)) { + is_null = true; + } else { + CType current_index = indices_data[i]; + is_null = !bit_util::GetBit(dictionary_null_bit_map, + current_index + dictionary_span.offset); + } + + bit_util::SetBitTo(out_bitmap, out_offset + i, set_on_null == is_null); + } +} + } // namespace int64_t LogicalNullCount(const ArraySpan& span) { @@ -78,5 +104,48 @@ int64_t LogicalNullCount(const ArraySpan& span) { return LogicalNullCount(span); } } + +void SetLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_offset, + bool set_on_null) { + if (span.dictionary().GetNullCount() == 0 || span.length == 0) { + if (set_on_null) { + internal::InvertBitmap(span.buffers[0].data, span.offset, span.length, out_bitmap, + out_offset); + } else { + internal::CopyBitmap(span.buffers[0].data, span.offset, span.length, out_bitmap, + out_offset); + } + return; + } + + const auto& dict_array_type = internal::checked_cast(*span.type); + switch (dict_array_type.index_type()->id()) { + case Type::UINT8: + SetLogicalNullBits(span, out_bitmap, out_offset, set_on_null); + break; + case Type::INT8: + SetLogicalNullBits(span, out_bitmap, out_offset, set_on_null); + break; + case Type::UINT16: + SetLogicalNullBits(span, out_bitmap, out_offset, set_on_null); + break; + case Type::INT16: + SetLogicalNullBits(span, out_bitmap, out_offset, set_on_null); + break; + case Type::UINT32: + SetLogicalNullBits(span, out_bitmap, out_offset, set_on_null); + break; + case Type::INT32: + SetLogicalNullBits(span, out_bitmap, out_offset, set_on_null); + break; + case Type::UINT64: + 
SetLogicalNullBits(span, out_bitmap, out_offset, set_on_null); + break; + default: + SetLogicalNullBits(span, out_bitmap, out_offset, set_on_null); + break; + } +} + } // namespace dict_util } // namespace arrow diff --git a/cpp/src/arrow/util/dict_util_internal.h b/cpp/src/arrow/util/dict_util_internal.h index a92733ae0f63..6a9c5f7429a1 100644 --- a/cpp/src/arrow/util/dict_util_internal.h +++ b/cpp/src/arrow/util/dict_util_internal.h @@ -23,6 +23,8 @@ namespace arrow { namespace dict_util { int64_t LogicalNullCount(const ArraySpan& span); +void SetLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_offset, + bool set_on_null); } // namespace dict_util } // namespace arrow diff --git a/cpp/src/arrow/util/ree_util.cc b/cpp/src/arrow/util/ree_util.cc index 461d6804b8cb..8685f70e9cee 100644 --- a/cpp/src/arrow/util/ree_util.cc +++ b/cpp/src/arrow/util/ree_util.cc @@ -47,6 +47,23 @@ int64_t LogicalNullCount(const ArraySpan& span) { return null_count; } +template +void SetLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_offset, + bool set_on_null) { + const auto& values = ValuesArray(span); + const auto& values_bitmap = values.buffers[0].data; + + RunEndEncodedArraySpan ree_span(span); + auto end = ree_span.end(); + for (auto it = ree_span.begin(); it != end; ++it) { + const bool is_null = + values_bitmap && + !bit_util::GetBit(values_bitmap, values.offset + it.index_into_array()); + bit_util::SetBitsTo(out_bitmap, out_offset, it.run_length(), set_on_null == is_null); + out_offset += it.run_length(); + } +} + } // namespace int64_t LogicalNullCount(const ArraySpan& span) { @@ -61,6 +78,19 @@ int64_t LogicalNullCount(const ArraySpan& span) { return LogicalNullCount(span); } +void SetLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_offset, + bool set_on_null) { + const auto type_id = RunEndsArray(span).type->id(); + if (type_id == Type::INT16) { + SetLogicalNullBits(span, out_bitmap, out_offset, 
set_on_null); + } else if (type_id == Type::INT32) { + SetLogicalNullBits(span, out_bitmap, out_offset, set_on_null); + } else { + DCHECK_EQ(type_id, Type::INT64); + SetLogicalNullBits(span, out_bitmap, out_offset, set_on_null); + } +} + namespace internal { /// \pre 0 <= i < array_span.length() diff --git a/cpp/src/arrow/util/ree_util.h b/cpp/src/arrow/util/ree_util.h index 5c759f2e80dc..cbc4421bde3c 100644 --- a/cpp/src/arrow/util/ree_util.h +++ b/cpp/src/arrow/util/ree_util.h @@ -57,6 +57,9 @@ Status ValidateRunEndEncodedChildren(const RunEndEncodedType& type, /// \brief Compute the logical null count of an REE array int64_t LogicalNullCount(const ArraySpan& span); +void SetLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_offset, + bool set_on_null); + namespace internal { /// \brief Uses binary-search to find the physical offset given a logical offset diff --git a/cpp/src/arrow/util/union_util.cc b/cpp/src/arrow/util/union_util.cc index 6b4d752d8685..56438f537d73 100644 --- a/cpp/src/arrow/util/union_util.cc +++ b/cpp/src/arrow/util/union_util.cc @@ -55,4 +55,18 @@ int64_t LogicalDenseUnionNullCount(const ArraySpan& span) { return null_count; } +void SetLogicalNullBitsSparse(const ArraySpan& span, uint8_t* out_bitmap, + int64_t out_offset, bool set_on_null) { + for (int64_t i = 0; i < span.length; i++) { + bit_util::SetBitTo(out_bitmap, out_offset + i, set_on_null == span.IsNull(i)); + } +} + +void SetLogicalNullBitsDense(const ArraySpan& span, uint8_t* out_bitmap, + int64_t out_offset, bool set_on_null) { + for (int64_t i = 0; i < span.length; i++) { + bit_util::SetBitTo(out_bitmap, out_offset + i, set_on_null == span.IsNull(i)); + } +} + } // namespace arrow::union_util diff --git a/cpp/src/arrow/util/union_util.h b/cpp/src/arrow/util/union_util.h index 0f30d5a32781..4b292106c73c 100644 --- a/cpp/src/arrow/util/union_util.h +++ b/cpp/src/arrow/util/union_util.h @@ -27,5 +27,11 @@ int64_t LogicalSparseUnionNullCount(const ArraySpan& 
span); /// \brief Compute the number of of logical nulls in a dense union array int64_t LogicalDenseUnionNullCount(const ArraySpan& span); +void SetLogicalNullBitsSparse(const ArraySpan& span, uint8_t* out_bitmap, + int64_t out_offset, bool set_on_null); + +void SetLogicalNullBitsDense(const ArraySpan& span, uint8_t* out_bitmap, + int64_t out_offset, bool set_on_null); + } // namespace union_util } // namespace arrow From 1c7641eb23c8e7f7d083ba614c481ce9212c2c9f Mon Sep 17 00:00:00 2001 From: Hadrian Date: Fri, 26 Jun 2026 01:14:19 -0400 Subject: [PATCH 2/3] rebase --- .../arrow/compute/kernels/scalar_validity.cc | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index 3167906ef332..dfaa679350a3 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -23,7 +23,7 @@ #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" -#include "arrow/util/dict_util.h" +#include "arrow/util/dict_util_internal.h" #include "arrow/util/float16.h" #include "arrow/util/logging_internal.h" #include "arrow/util/ree_util.h" @@ -115,13 +115,19 @@ static Status SetLogicalNullBits(KernelContext* ctx, const ArraySpan& span, // If nan_is_null, we must also check for nans. 
if (is_floating(t) && NanOptionsState::Get(ctx).nan_is_null) { - if (t == Type::FLOAT) { - SetNanBits(span, out_bitmap, out_offset); - } else if (t == Type::DOUBLE) { - SetNanBits(span, out_bitmap, out_offset); - } else { - return Status::NotImplemented("NaN detection not implemented for type ", - span.type->ToString()); + switch (t) { + case Type::FLOAT: + SetNanBits(span, out_bitmap, out_offset); + break; + case Type::DOUBLE: + SetNanBits(span, out_bitmap, out_offset); + break; + case Type::HALF_FLOAT: + SetNanBits(span, out_bitmap, out_offset); + break; + default: + return Status::NotImplemented("NaN detection not implemented for type ", + span.type->ToString()); } } } From 8575bde8d4f29b86993684fdcec067569c9f28b5 Mon Sep 17 00:00:00 2001 From: Hadrian Date: Fri, 26 Jun 2026 07:11:00 -0400 Subject: [PATCH 3/3] clean stuff up --- .../arrow/compute/kernels/scalar_validity.cc | 5 ++- .../compute/kernels/scalar_validity_test.cc | 39 ++++++++++--------- cpp/src/arrow/util/dict_util_internal.h | 6 +++ cpp/src/arrow/util/ree_util.h | 4 ++ cpp/src/arrow/util/union_util.h | 8 ++++ 5 files changed, 43 insertions(+), 19 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index dfaa679350a3..ffcdbd7a0faf 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -255,6 +255,10 @@ Status TrueUnlessNullExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* true); // Set the output's validity bitmap based on the nullity of the input array + // NOTE: alternatively, we could switch this kernel's null handling back to + // NullHandling::INTERSECTION and change the validity checks in exec.cc so that + // they correctly handle logical nulls, but that would involve significant changes + // in exec.cc which might have more side effects return SetLogicalNullBits(ctx, batch[0].array, out_span->buffers[0].data, out_span->offset, false); } const FunctionDoc is_valid_doc( @@ -302,7
+306,6 @@ void RegisterScalarValidity(FunctionRegistry* registry) { registry, NullHandling::OUTPUT_NOT_NULL, /*can_write_into_slices=*/true, &kNullOptions, NanOptionsState::Init); - // TODO: switch back to NullHandling::INTERSECTION MakeFunction("true_unless_null", true_unless_null_doc, {InputType::Any()}, boolean(), TrueUnlessNullExec, registry, NullHandling::COMPUTED_PREALLOCATE, /*can_write_into_slices=*/false); diff --git a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc index f4ddf9c5116b..a73577442706 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc @@ -59,28 +59,33 @@ TEST_F(TestBooleanValidityKernels, TrueUnlessNull) { type_singleton(), "[null, true, true, null]"); } +void CheckValidityKernels(Datum input, Datum is_valid_expected) { + ASSERT_OK_AND_ASSIGN(auto is_null_expected, compute::Invert(is_valid_expected)); + BooleanScalar true_scalar(true); + NullScalar null_scalar; + ASSERT_OK_AND_ASSIGN(auto true_unlesss_null_expected, + compute::IfElse(is_valid_expected, true_scalar, null_scalar)); + + CheckScalarUnary("is_valid", input, is_valid_expected); + CheckScalarUnary("is_null", input, is_null_expected); + CheckScalarUnary("true_unless_null", input, true_unlesss_null_expected); +} + TEST_F(TestBooleanValidityKernels, LogicalNulls) { auto null_dict = DictArrayFromJSON(dictionary(int8(), int8()), "[0, 2, 1]", "[0, 1, null]"); - CheckScalarUnary("is_valid", null_dict, - ArrayFromJSON(boolean(), "[true, false, true]")); - CheckScalarUnary("is_null", null_dict, - ArrayFromJSON(boolean(), "[false, true, false]")); - CheckScalarUnary("true_unless_null", null_dict, - ArrayFromJSON(boolean(), "[true, null, true]")); + CheckValidityKernels(null_dict, ArrayFromJSON(boolean(), "[true, false, true]")); auto null_index = DictArrayFromJSON(dictionary(int8(), int32()), "[null, 1, 0]", "[8, 2]"); - CheckScalarUnary("true_unless_null", 
null_index, - ArrayFromJSON(boolean(), "[null, true, true]")); + CheckValidityKernels(null_index, ArrayFromJSON(boolean(), "[false, true, true]")); auto null_dict_and_index = DictArrayFromJSON(dictionary(int8(), boolean()), "[1, null, 2, 0]", "[true, false, null]"); - CheckScalarUnary("true_unless_null", null_dict_and_index, - ArrayFromJSON(boolean(), "[true, null, null, true]")); + CheckValidityKernels(null_dict_and_index, + ArrayFromJSON(boolean(), "[true, false, false, true]")); ASSERT_OK_AND_ASSIGN(auto ree, RunEndEncode(ArrayFromJSON(int64(), "[11, 11, null, null, 12]"))); - CheckScalarUnary("true_unless_null", ree, - ArrayFromJSON(boolean(), "[true, true, null, null, true]")); + CheckValidityKernels(ree, ArrayFromJSON(boolean(), "[true, true, false, false, true]")); ArrayVector children{ ArrayFromJSON(int64(), "[1, 23, 45, null, null, -2, null]"), @@ -92,17 +97,15 @@ TEST_F(TestBooleanValidityKernels, LogicalNulls) { SparseUnionArray sparse(sparse_union(fields), 7, children, type_ids->data()->buffers[1]); ASSERT_OK(sparse.ValidateFull()); - CheckScalarUnary( - "true_unless_null", sparse, - ArrayFromJSON(boolean(), "[true, true, true, null, null, true, true]")); + CheckValidityKernels( + sparse, ArrayFromJSON(boolean(), "[true, true, true, false, false, true, true]")); auto offsets = ArrayFromJSON(int32(), "[0, 0, 0, 2, 3, 6, 3]"); DenseUnionArray dense(dense_union(fields), 7, children, type_ids->data()->buffers[1], offsets->data()->buffers[1]); ASSERT_OK(dense.ValidateFull()); - CheckScalarUnary( - "true_unless_null", dense, - ArrayFromJSON(boolean(), "[true, null, true, true, null, null, null]")); + CheckValidityKernels( + dense, ArrayFromJSON(boolean(), "[true, false, true, true, false, false, false]")); } TEST_F(TestBooleanValidityKernels, IsValidIsNullNullType) { diff --git a/cpp/src/arrow/util/dict_util_internal.h b/cpp/src/arrow/util/dict_util_internal.h index 6a9c5f7429a1..a946758b685e 100644 --- a/cpp/src/arrow/util/dict_util_internal.h +++ 
b/cpp/src/arrow/util/dict_util_internal.h @@ -22,7 +22,13 @@ namespace arrow { namespace dict_util { +/// \brief Compute the logical null count of a dictionary-encoded array int64_t LogicalNullCount(const ArraySpan& span); + +/// \brief Populate a bitmap based on the logical nulls in a dictionary-encoded array +/// +/// \param set_on_null true if we should set bits corresponding to nulls and false if +/// we should set bits corresponding to non-nulls void SetLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_offset, bool set_on_null); diff --git a/cpp/src/arrow/util/ree_util.h b/cpp/src/arrow/util/ree_util.h index cbc4421bde3c..16a2395acc2e 100644 --- a/cpp/src/arrow/util/ree_util.h +++ b/cpp/src/arrow/util/ree_util.h @@ -57,6 +57,10 @@ Status ValidateRunEndEncodedChildren(const RunEndEncodedType& type, /// \brief Compute the logical null count of an REE array int64_t LogicalNullCount(const ArraySpan& span); +/// \brief Populate a bitmap based on the logical nulls in an REE array +/// +/// \param set_on_null true if we should set bits corresponding to nulls and false if +/// we should set bits corresponding to non-nulls void SetLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_offset, bool set_on_null); diff --git a/cpp/src/arrow/util/union_util.h b/cpp/src/arrow/util/union_util.h index 4b292106c73c..04608d245361 100644 --- a/cpp/src/arrow/util/union_util.h +++ b/cpp/src/arrow/util/union_util.h @@ -27,9 +27,17 @@ int64_t LogicalSparseUnionNullCount(const ArraySpan& span); /// \brief Compute the number of of logical nulls in a dense union array int64_t LogicalDenseUnionNullCount(const ArraySpan& span); +/// \brief Populate a bitmap based on the logical nulls in a sparse union array +/// +/// \param set_on_null true if we should set bits corresponding to nulls and false if +/// we should set bits corresponding to non-nulls void SetLogicalNullBitsSparse(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_offset, bool 
set_on_null); +/// \brief Populate a bitmap based on the logical nulls in a dense union array +/// +/// \param set_on_null true if we should set bits corresponding to nulls and false if +/// we should set bits corresponding to non-nulls void SetLogicalNullBitsDense(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_offset, bool set_on_null);