Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
146 changes: 75 additions & 71 deletions cpp/src/arrow/compute/kernels/scalar_validity.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,11 @@

#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_ops.h"
#include "arrow/util/dict_util_internal.h"
#include "arrow/util/float16.h"
#include "arrow/util/logging_internal.h"
#include "arrow/util/ree_util.h"
#include "arrow/util/union_util.h"

namespace arrow {

Expand All @@ -36,29 +39,7 @@ namespace compute {
namespace internal {
namespace {

// Kernel exec that fills the output boolean value bitmap with the physical
// validity of the single input array: a bit is true where the input slot is
// valid and false where it is null.
Status IsValidExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
  const ArraySpan& input = batch[0].array;
  ArraySpan* result = out->array_span_mutable();
  uint8_t* result_bits = result->buffers[1].data;

  // A null-typed input holds nothing but nulls, so no slot is valid.
  if (input.type->id() == Type::NA) {
    bit_util::SetBitsTo(result_bits, result->offset, result->length, false);
    return Status::OK();
  }

  DCHECK_EQ(result->offset, 0);
  DCHECK_LE(result->length, input.length);

  // Without any possible nulls every slot is valid.
  if (!input.MayHaveNulls()) {
    bit_util::SetBitsTo(result_bits, result->offset, result->length, true);
    return Status::OK();
  }

  // Otherwise the answer is exactly the input's validity bitmap. A zero-copy
  // path would be possible, but it is not worth the added complexity.
  ::arrow::internal::CopyBitmap(input.buffers[0].data, input.offset, input.length,
                                result_bits, result->offset);
  return Status::OK();
}
using NanOptionsState = OptionsWrapper<NullOptions>;

struct IsFiniteOperator {
template <typename OutType, typename InType>
Expand All @@ -82,8 +63,6 @@ struct IsInfOperator {
}
};

using NanOptionsState = OptionsWrapper<NullOptions>;

template <typename T>
static void SetNanBits(const ArraySpan& arr, uint8_t* out_bitmap, int64_t out_offset) {
const T* data = arr.GetValues<T>(1);
Expand All @@ -101,45 +80,72 @@ static void SetNanBits(const ArraySpan& arr, uint8_t* out_bitmap, int64_t out_of
}
}

Status IsNullExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
const ArraySpan& arr = batch[0].array;
ArraySpan* out_span = out->array_span_mutable();
if (arr.type->id() == Type::NA) {
bit_util::SetBitsTo(out_span->buffers[1].data, out_span->offset, out_span->length,
true);
return Status::OK();
}

const auto& options = NanOptionsState::Get(ctx);
uint8_t* out_bitmap = out_span->buffers[1].data;
if (arr.GetNullCount() > 0) {
// Input has nulls => output is the inverted null (validity) bitmap.
InvertBitmap(arr.buffers[0].data, arr.offset, arr.length, out_bitmap,
out_span->offset);
static Status SetLogicalNullBits(KernelContext* ctx, const ArraySpan& span,
uint8_t* out_bitmap, int64_t out_offset,
bool set_on_null) {
const Type::type t = span.type->id();
if (t == Type::NA) {
// Input is all nulls, so all output bits are the same.
bit_util::SetBitsTo(out_bitmap, out_offset, span.length, set_on_null);
} else if (t == Type::SPARSE_UNION) {
union_util::SetLogicalNullBitsSparse(span, out_bitmap, out_offset, set_on_null);
} else if (t == Type::DENSE_UNION) {
union_util::SetLogicalNullBitsDense(span, out_bitmap, out_offset, set_on_null);
} else if (t == Type::RUN_END_ENCODED) {
ree_util::SetLogicalNullBits(span, out_bitmap, out_offset, set_on_null);
} else if (t == Type::DICTIONARY) {
dict_util::SetLogicalNullBits(span, out_bitmap, out_offset, set_on_null);
} else {
// Input has no nulls => output is entirely false.
bit_util::SetBitsTo(out_bitmap, out_span->offset, out_span->length, false);
}
// Input is a type for which logical and physical nulls are the same, so we can
// use GetNullCount() and the validity bitmap
if (span.GetNullCount() > 0) {
// Input has nulls. The output is either the validity bitmap or the inverse of the
// validity bitmap.
if (set_on_null) {
InvertBitmap(span.buffers[0].data, span.offset, span.length, out_bitmap,
out_offset);
} else {
CopyBitmap(span.buffers[0].data, span.offset, span.length, out_bitmap,
out_offset);
}
} else {
// Input has no nulls, so all output bits are the same.
bit_util::SetBitsTo(out_bitmap, out_offset, span.length, !set_on_null);
}

if (is_floating(arr.type->id()) && options.nan_is_null) {
switch (arr.type->id()) {
case Type::FLOAT:
SetNanBits<float>(arr, out_bitmap, out_span->offset);
break;
case Type::DOUBLE:
SetNanBits<double>(arr, out_bitmap, out_span->offset);
break;
case Type::HALF_FLOAT:
SetNanBits<uint16_t>(arr, out_bitmap, out_span->offset);
break;
default:
return Status::NotImplemented("NaN detection not implemented for type ",
arr.type->ToString());
// If nan_is_null, we must also check for nans.
if (is_floating(t) && NanOptionsState::Get(ctx).nan_is_null) {
switch (t) {
case Type::FLOAT:
SetNanBits<float>(span, out_bitmap, out_offset);
break;
case Type::DOUBLE:
SetNanBits<double>(span, out_bitmap, out_offset);
break;
case Type::HALF_FLOAT:
SetNanBits<uint16_t>(span, out_bitmap, out_offset);
break;
default:
return Status::NotImplemented("NaN detection not implemented for type ",
span.type->ToString());
}
}
}
return Status::OK();
}

// Kernel exec that writes, for every input element, whether it is logically
// non-null into the output's value bitmap.
Status IsValidExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
  ArraySpan* result = out->array_span_mutable();
  const ArraySpan& input = batch[0].array;
  uint8_t* value_bits = result->buffers[1].data;
  // Bits are set for non-null elements, hence set_on_null is false.
  constexpr bool kSetOnNull = false;
  return SetLogicalNullBits(ctx, input, value_bits, result->offset, kSetOnNull);
}

// Kernel exec that writes, for every input element, whether it is logically
// null into the output's value bitmap.
Status IsNullExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
  ArraySpan* result = out->array_span_mutable();
  const ArraySpan& input = batch[0].array;
  uint8_t* value_bits = result->buffers[1].data;
  // Bits are set for null elements, hence set_on_null is true.
  constexpr bool kSetOnNull = true;
  return SetLogicalNullBits(ctx, input, value_bits, result->offset, kSetOnNull);
}

struct IsNanOperator {
template <typename OutType, typename InType>
static constexpr OutType Call(KernelContext*, const InType& value, Status*) {
Expand Down Expand Up @@ -243,20 +249,18 @@ std::shared_ptr<ScalarFunction> MakeIsNanFunction(std::string name, FunctionDoc
}

// Exec function registered for the "true_unless_null" kernel.
Status TrueUnlessNullExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
// Set all bits in the output's value bitmap to true
ArraySpan* out_span = out->array_span_mutable();
if (out_span->buffers[0].data) {
// If there is a validity bitmap computed above the kernel
// invocation, we copy it to the output buffers
::arrow::internal::CopyBitmap(out_span->buffers[0].data, out_span->offset,
out_span->length, out_span->buffers[1].data,
out_span->offset);
} else {
// But for all-valid inputs, the engine will skip allocating a
// validity bitmap, so we set everything to true
bit_util::SetBitsTo(out_span->buffers[1].data, out_span->offset, out_span->length,
true);
}
return Status::OK();
bit_util::SetBitsTo(out_span->buffers[1].data, out_span->offset, out_span->length,
true);

// Set the output's validity bitmap based on the nullity of the input array
// NOTE: alternatively, we could switch this kernel's null handling back to
// NullHandling::INTERSECTION and change the validity checks in exec.cc so that
// they correctly handle logical nulls, but that would involve significant changes
// in exec.cc which might have more side effects
return SetLogicalNullBits(ctx, batch[0].array, out_span->buffers[0].data,
out_span->offset, false);
}

const FunctionDoc is_valid_doc(
Expand Down Expand Up @@ -303,7 +307,7 @@ void RegisterScalarValidity(FunctionRegistry* registry) {
/*can_write_into_slices=*/true, &kNullOptions, NanOptionsState::Init);

MakeFunction("true_unless_null", true_unless_null_doc, {InputType::Any()}, boolean(),
TrueUnlessNullExec, registry, NullHandling::INTERSECTION,
TrueUnlessNullExec, registry, NullHandling::COMPUTED_PREALLOCATE,
/*can_write_into_slices=*/false);

DCHECK_OK(registry->AddFunction(MakeIsFiniteFunction("is_finite", is_finite_doc)));
Expand Down
49 changes: 49 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_validity_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,55 @@ TEST_F(TestBooleanValidityKernels, TrueUnlessNull) {
type_singleton(), "[null, true, true, null]");
}

// Runs "is_valid", "is_null" and "true_unless_null" on `input` and checks each
// result. Only the expected "is_valid" output is supplied; the other two
// expectations are derived from it.
void CheckValidityKernels(Datum input, Datum is_valid_expected) {
  // "is_null" is the element-wise negation of "is_valid".
  ASSERT_OK_AND_ASSIGN(auto expected_is_null, compute::Invert(is_valid_expected));

  // "true_unless_null" is true where the input is valid and null elsewhere.
  BooleanScalar scalar_true(true);
  NullScalar scalar_null;
  ASSERT_OK_AND_ASSIGN(auto expected_true_unless_null,
                       compute::IfElse(is_valid_expected, scalar_true, scalar_null));

  CheckScalarUnary("is_valid", input, is_valid_expected);
  CheckScalarUnary("is_null", input, expected_is_null);
  CheckScalarUnary("true_unless_null", input, expected_true_unless_null);
}

// Exercises the validity kernels on types whose nulls are not (only) described
// by a top-level validity bitmap: dictionary, run-end encoded and union arrays.
TEST_F(TestBooleanValidityKernels, LogicalNulls) {
  // Dictionary: the null lives in the dictionary values only.
  auto dict_with_null_value =
      DictArrayFromJSON(dictionary(int8(), int8()), "[0, 2, 1]", "[0, 1, null]");
  CheckValidityKernels(dict_with_null_value,
                       ArrayFromJSON(boolean(), "[true, false, true]"));

  // Dictionary: the null lives in the indices only.
  auto dict_with_null_index =
      DictArrayFromJSON(dictionary(int8(), int32()), "[null, 1, 0]", "[8, 2]");
  CheckValidityKernels(dict_with_null_index,
                       ArrayFromJSON(boolean(), "[false, true, true]"));

  // Dictionary: nulls in both the indices and the dictionary values.
  auto dict_with_both = DictArrayFromJSON(dictionary(int8(), boolean()),
                                          "[1, null, 2, 0]", "[true, false, null]");
  CheckValidityKernels(dict_with_both,
                       ArrayFromJSON(boolean(), "[true, false, false, true]"));

  // Run-end encoded: nulls are carried by the encoded values.
  ASSERT_OK_AND_ASSIGN(auto run_end_encoded,
                       RunEndEncode(ArrayFromJSON(int64(), "[11, 11, null, null, 12]")));
  CheckValidityKernels(run_end_encoded,
                       ArrayFromJSON(boolean(), "[true, true, false, false, true]"));

  // Unions: nullness of a slot is that of the selected child element.
  ArrayVector union_children{
      ArrayFromJSON(int64(), "[1, 23, 45, null, null, -2, null]"),
      ArrayFromJSON(float32(), "[null, 1.1, 2.2, null, -4.0, 1.5, 0.1]"),
      ArrayFromJSON(utf8(), R"(["alpha", "", "beta", null, "gamma", "delta", null])"),
  };
  auto union_type_ids = ArrayFromJSON(int8(), "[0, 1, 2, 2, 0, 2, 1]");
  auto union_fields = {field("a", int64()), field("b", float32()), field("c", utf8())};

  // Sparse union: slot i reads element i of the child chosen by the type id.
  SparseUnionArray sparse_array(sparse_union(union_fields), 7, union_children,
                                union_type_ids->data()->buffers[1]);
  ASSERT_OK(sparse_array.ValidateFull());
  CheckValidityKernels(
      sparse_array,
      ArrayFromJSON(boolean(), "[true, true, true, false, false, true, true]"));

  // Dense union: slot i reads the child element designated by the offsets buffer.
  auto union_offsets = ArrayFromJSON(int32(), "[0, 0, 0, 2, 3, 6, 3]");
  DenseUnionArray dense_array(dense_union(union_fields), 7, union_children,
                              union_type_ids->data()->buffers[1],
                              union_offsets->data()->buffers[1]);
  ASSERT_OK(dense_array.ValidateFull());
  CheckValidityKernels(
      dense_array,
      ArrayFromJSON(boolean(), "[true, false, true, true, false, false, false]"));
}

TEST_F(TestBooleanValidityKernels, IsValidIsNullNullType) {
CheckScalarUnary("is_null", std::make_shared<NullArray>(5),
ArrayFromJSON(boolean(), "[true, true, true, true, true]"));
Expand Down
69 changes: 69 additions & 0 deletions cpp/src/arrow/util/dict_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

#include "arrow/array/array_dict.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_ops.h"
#include "arrow/util/checked_cast.h"

namespace arrow {
Expand Down Expand Up @@ -51,6 +52,31 @@ int64_t LogicalNullCount(const ArraySpan& span) {
return null_count;
}

/// \brief Write one bit per element of a dictionary-encoded span describing
/// whether that element is logically null.
///
/// An element is logically null when its index slot is null, or when the
/// dictionary value the index refers to is null. Every bit in
/// [out_offset, out_offset + span.length) is written.
///
/// \tparam IndexArrowType Arrow type of the dictionary indices
/// \param span the dictionary-encoded array to inspect
/// \param out_bitmap destination bitmap
/// \param out_offset bit position in out_bitmap of the first element's bit
/// \param set_on_null if true a bit is set for nulls and cleared for non-nulls;
///        if false the opposite
template <typename IndexArrowType>
void SetLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_offset,
                        bool set_on_null) {
  using CType = typename IndexArrowType::c_type;

  const auto* indices_validity = span.buffers[0].data;
  const auto& dictionary_span = span.dictionary();
  // Either validity bitmap may be absent; an absent bitmap is treated as
  // "no nulls" rather than being dereferenced.
  const auto* dictionary_validity = dictionary_span.buffers[0].data;
  const CType* indices_data = span.GetValues<CType>(1);

  for (int64_t i = 0; i < span.length; ++i) {
    bool is_null = false;
    if (indices_validity != nullptr &&
        !bit_util::GetBit(indices_validity, i + span.offset)) {
      // The index slot itself is null.
      is_null = true;
    } else if (dictionary_validity != nullptr) {
      // The index is valid: the element is null iff the referenced dictionary
      // value is null.
      const CType current_index = indices_data[i];
      is_null = !bit_util::GetBit(dictionary_validity,
                                  current_index + dictionary_span.offset);
    }

    bit_util::SetBitTo(out_bitmap, out_offset + i, set_on_null == is_null);
  }
}

} // namespace

int64_t LogicalNullCount(const ArraySpan& span) {
Expand Down Expand Up @@ -78,5 +104,48 @@ int64_t LogicalNullCount(const ArraySpan& span) {
return LogicalNullCount<Int64Type>(span);
}
}

void SetLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_offset,
bool set_on_null) {
if (span.dictionary().GetNullCount() == 0 || span.length == 0) {
if (set_on_null) {
internal::InvertBitmap(span.buffers[0].data, span.offset, span.length, out_bitmap,
out_offset);
} else {
internal::CopyBitmap(span.buffers[0].data, span.offset, span.length, out_bitmap,
out_offset);
}
return;
}

const auto& dict_array_type = internal::checked_cast<const DictionaryType&>(*span.type);
switch (dict_array_type.index_type()->id()) {
case Type::UINT8:
SetLogicalNullBits<UInt8Type>(span, out_bitmap, out_offset, set_on_null);
break;
case Type::INT8:
SetLogicalNullBits<Int8Type>(span, out_bitmap, out_offset, set_on_null);
break;
case Type::UINT16:
SetLogicalNullBits<UInt16Type>(span, out_bitmap, out_offset, set_on_null);
break;
case Type::INT16:
SetLogicalNullBits<Int16Type>(span, out_bitmap, out_offset, set_on_null);
break;
case Type::UINT32:
SetLogicalNullBits<UInt32Type>(span, out_bitmap, out_offset, set_on_null);
break;
case Type::INT32:
SetLogicalNullBits<Int32Type>(span, out_bitmap, out_offset, set_on_null);
break;
case Type::UINT64:
SetLogicalNullBits<UInt64Type>(span, out_bitmap, out_offset, set_on_null);
break;
default:
SetLogicalNullBits<Int64Type>(span, out_bitmap, out_offset, set_on_null);
break;
}
}

} // namespace dict_util
} // namespace arrow
8 changes: 8 additions & 0 deletions cpp/src/arrow/util/dict_util_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,15 @@
namespace arrow {
namespace dict_util {

/// \brief Compute the logical null count of a dictionary-encoded array
int64_t LogicalNullCount(const ArraySpan& span);

/// \brief Populate a bitmap based on the logical nulls in a dictionary-encoded array
///
/// An element is treated as null when its index is null or when the dictionary
/// value it refers to is null. One bit is written for each element of the span.
///
/// \param span the dictionary-encoded array to inspect
/// \param out_bitmap the bitmap to write into
/// \param out_offset bit position in out_bitmap at which the bit for the first
/// element is written
/// \param set_on_null true if we should set bits corresponding to nulls and false if
/// we should set bits corresponding to non-nulls
void SetLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_offset,
bool set_on_null);

} // namespace dict_util
} // namespace arrow
Loading
Loading