Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions velox/dwio/parquet/thrift/ParquetThriftTypes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -682,6 +682,11 @@ void Statistics::__set_max_value(const std::string& val) {
__isset.max_value = true;
}

void Statistics::__set_nan_count(const int64_t val) {
this->nan_count = val;
__isset.nan_count = true;
}
Comment on lines +685 to +688

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

parquet.thrift updated with new field.
Read updated

     case 7:
        if (ftype == ::apache::thrift::protocol::T_I64) {
          xfer += iprot->readI64(this->nan_count);
          this->__isset.nan_count = true;
        } else {
          xfer += iprot->skip(ftype);
        }
        break;


void Statistics::__set_min_value(const std::string& val) {
this->min_value = val;
__isset.min_value = true;
Expand Down Expand Up @@ -756,6 +761,14 @@ uint32_t Statistics::read(::apache::thrift::protocol::TProtocol* iprot) {
xfer += iprot->skip(ftype);
}
break;
case 7:
if (ftype == ::apache::thrift::protocol::T_I64) {
xfer += iprot->readI64(this->nan_count);
this->__isset.nan_count = true;
} else {
xfer += iprot->skip(ftype);
}
break;
default:
xfer += iprot->skip(ftype);
break;
Expand Down Expand Up @@ -809,6 +822,12 @@ uint32_t Statistics::write(::apache::thrift::protocol::TProtocol* oprot) const {
xfer += oprot->writeBinary(this->min_value);
xfer += oprot->writeFieldEnd();
}
if (this->__isset.nan_count) {
xfer += oprot->writeFieldBegin(
"nan_count", ::apache::thrift::protocol::T_I64, 7);
xfer += oprot->writeI64(this->nan_count);
xfer += oprot->writeFieldEnd();
}
xfer += oprot->writeFieldStop();
xfer += oprot->writeStructEnd();
return xfer;
Expand All @@ -820,6 +839,7 @@ void swap(Statistics& a, Statistics& b) {
swap(a.min, b.min);
swap(a.null_count, b.null_count);
swap(a.distinct_count, b.distinct_count);
swap(a.nan_count, b.nan_count);
swap(a.max_value, b.max_value);
swap(a.min_value, b.min_value);
swap(a.__isset, b.__isset);
Comment on lines 839 to 845

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Write updated.

  if (this->__isset.nan_count) {
    xfer += oprot->writeFieldBegin(
        "nan_count", ::apache::thrift::protocol::T_I64, 7);
    xfer += oprot->writeI64(this->nan_count);
    xfer += oprot->writeFieldEnd();
  }

Expand All @@ -830,6 +850,7 @@ Statistics::Statistics(const Statistics& other0) {
min = other0.min;
null_count = other0.null_count;
distinct_count = other0.distinct_count;
nan_count = other0.nan_count;
max_value = other0.max_value;
min_value = other0.min_value;
__isset = other0.__isset;
Expand All @@ -839,6 +860,7 @@ Statistics& Statistics::operator=(const Statistics& other1) {
min = other1.min;
null_count = other1.null_count;
distinct_count = other1.distinct_count;
nan_count = other1.nan_count;
max_value = other1.max_value;
Comment on lines 860 to 864

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated

void Statistics::printTo(std::ostream& out) const {
  using ::apache::thrift::to_string;
  out << "Statistics(";
  out << "max=";
  (__isset.max ? (out << to_string(max)) : (out << "<null>"));
  out << ", " << "min=";
  (__isset.min ? (out << to_string(min)) : (out << "<null>"));
  out << ", " << "null_count=";
  (__isset.null_count ? (out << to_string(null_count)) : (out << "<null>"));
  out << ", " << "distinct_count=";
  (__isset.distinct_count ? (out << to_string(distinct_count))
                          : (out << "<null>"));
  out << ", " << "max_value=";
  (__isset.max_value ? (out << to_string(max_value)) : (out << "<null>"));
  out << ", " << "min_value=";
  (__isset.min_value ? (out << to_string(min_value)) : (out << "<null>"));
  out << ", " << "nan_count=";
  (__isset.nan_count ? (out << to_string(nan_count)) : (out << "<null>"));
  out << ")";
}

min_value = other1.min_value;
__isset = other1.__isset;
Expand All @@ -860,6 +882,8 @@ void Statistics::printTo(std::ostream& out) const {
(__isset.max_value ? (out << to_string(max_value)) : (out << "<null>"));
out << ", " << "min_value=";
(__isset.min_value ? (out << to_string(min_value)) : (out << "<null>"));
out << ", " << "nan_count=";
(__isset.nan_count ? (out << to_string(nan_count)) : (out << "<null>"));
out << ")";
}

Expand Down
18 changes: 18 additions & 0 deletions velox/dwio/parquet/thrift/ParquetThriftTypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -466,12 +466,14 @@ typedef struct _Statistics__isset {
min(false),
null_count(false),
distinct_count(false),
nan_count(false),
max_value(false),
min_value(false) {}
bool max : 1;
bool min : 1;
bool null_count : 1;
bool distinct_count : 1;
bool nan_count : 1;
bool max_value : 1;
bool min_value : 1;
} _Statistics__isset;
Expand All @@ -489,6 +491,7 @@ class Statistics : public virtual apache::thrift::TBase {
min(),
null_count(0),
distinct_count(0),
nan_count(0),
max_value(),
min_value() {}

Expand Down Expand Up @@ -516,6 +519,15 @@ class Statistics : public virtual apache::thrift::TBase {
* count of distinct values occurring
*/
int64_t distinct_count;

/**
* count of NaN values occurring.
* Note: This is a Velox extension to the Parquet format. The upstream
* Parquet community is considering adding official support for this field.
* See: https://github.com/apache/parquet-format/pull/514
*/
int64_t nan_count;
Comment thread
mohsaka marked this conversation as resolved.

/**
* Min and max values for the column, determined by its ColumnOrder.
*
Expand All @@ -535,6 +547,8 @@ class Statistics : public virtual apache::thrift::TBase {

void __set_distinct_count(const int64_t val);

void __set_nan_count(const int64_t val);

void __set_max_value(const std::string& val);

void __set_min_value(const std::string& val);
Expand All @@ -556,6 +570,10 @@ class Statistics : public virtual apache::thrift::TBase {
return false;
else if (__isset.distinct_count && !(distinct_count == rhs.distinct_count))
return false;
if (__isset.nan_count != rhs.__isset.nan_count)
return false;
else if (__isset.nan_count && !(nan_count == rhs.nan_count))
return false;
if (__isset.max_value != rhs.__isset.max_value)
return false;
else if (__isset.max_value && !(max_value == rhs.max_value))
Expand Down
7 changes: 7 additions & 0 deletions velox/dwio/parquet/thrift/parquet.thrift
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,13 @@ struct Statistics {
*/
5: optional binary max_value;
6: optional binary min_value;
/**
* count of NaN values occurring.
* Note: This is a Velox extension to the Parquet format. The upstream
* Parquet community is considering adding official support for this field.
* See: https://github.com/apache/parquet-format/pull/514
*/
7: optional i64 nan_count;
}

/** Empty structs to use as logical type annotations */
Expand Down
12 changes: 6 additions & 6 deletions velox/dwio/parquet/writer/arrow/Metadata.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -102,11 +102,11 @@ static std::shared_ptr<Statistics> makeTypedColumnStats(
metadata.num_values - stats.null_count,
stats.null_count,
stats.distinct_count,
stats.__isset.nan_count ? stats.nan_count : 0,
stats.__isset.max_value || stats.__isset.min_value,
stats.__isset.null_count,
stats.__isset.distinct_count,
false,
0);
stats.__isset.nan_count);
}
// Default behavior.
return makeStatistics<DType>(
Expand All @@ -116,11 +116,11 @@ static std::shared_ptr<Statistics> makeTypedColumnStats(
metadata.num_values - stats.null_count,
stats.null_count,
stats.distinct_count,
stats.__isset.nan_count ? stats.nan_count : 0,
stats.__isset.max || stats.__isset.min,
stats.__isset.null_count,
stats.__isset.distinct_count,
false,
0);
stats.__isset.nan_count);
}

std::shared_ptr<Statistics> makeColumnStats(
Expand Down Expand Up @@ -1019,8 +1019,8 @@ class FileMetaData::FileMetaDataImpl {
// Set NaN counts from the builder (called during Finish)
// This stores total NaN counts per field ID across all row groups.
void setNaNCounts(
std::unordered_map<int32_t, std::pair<int64_t, bool>> nan_counts) {
fieldNanCounts_ = std::move(nan_counts);
std::unordered_map<int32_t, std::pair<int64_t, bool>> nanCounts) {
fieldNanCounts_ = std::move(nanCounts);
}

// Get total NaN count for a specific field ID across all row groups.
Expand Down
11 changes: 5 additions & 6 deletions velox/dwio/parquet/writer/arrow/Statistics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
#include "velox/dwio/parquet/writer/arrow/Platform.h"
#include "velox/dwio/parquet/writer/arrow/Schema.h"
#include "velox/dwio/parquet/writer/arrow/StringTruncation.h"

#include "velox/type/DecimalUtil.h"
#include "velox/type/HugeInt.h"

Expand Down Expand Up @@ -607,11 +606,11 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
int64_t numValues,
int64_t nullCount,
int64_t distinctCount,
int64_t nanCount,
bool hasMinMax,
bool hasNullCount,
bool hasDistinctCount,
bool hasNaNCount,
int64_t nanCount,
MemoryPool* pool)
: TypedStatisticsImpl(descr, pool) {
TypedStatisticsImpl::incrementNumValues(numValues);
Expand Down Expand Up @@ -870,7 +869,7 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
s.setDistinctCount(this->distinctCount());
}
if (hasNanCount_) {
s.set_nan_count(nanCount_);
s.setNanCount(nanCount_);
}
return s;
}
Expand Down Expand Up @@ -1261,11 +1260,11 @@ std::shared_ptr<Statistics> Statistics::make(
numValues,
encodedStats->nullCount,
encodedStats->distinctCount,
encodedStats->nanCount,
encodedStats->hasMin && encodedStats->hasMax,
encodedStats->hasNullCount,
encodedStats->hasDistinctCount,
encodedStats->hasNanCount,
encodedStats->nanCount,
pool);
}

Expand All @@ -1276,11 +1275,11 @@ std::shared_ptr<Statistics> Statistics::make(
int64_t numValues,
int64_t nullCount,
int64_t distinctCount,
int64_t nanCount,
bool hasMinMax,
bool hasNullCount,
bool hasDistinctCount,
bool hasNaNCount,
int64_t nanCount,
::arrow::MemoryPool* pool) {
#define MAKE_STATS(CAP_TYPE, KLASS) \
case Type::CAP_TYPE: \
Expand All @@ -1291,11 +1290,11 @@ std::shared_ptr<Statistics> Statistics::make(
numValues, \
nullCount, \
distinctCount, \
nanCount, \
hasMinMax, \
hasNullCount, \
hasDistinctCount, \
hasNaNCount, \
nanCount, \
pool)

switch (descr->physicalType()) {
Expand Down
10 changes: 5 additions & 5 deletions velox/dwio/parquet/writer/arrow/Statistics.h
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ class PARQUET_EXPORT EncodedStatistics {
}

bool isSet() const {
return hasMin || hasMax || hasNullCount || hasDistinctCount;
return hasMin || hasMax || hasNullCount || hasDistinctCount || hasNanCount;
}

bool isSigned() const {
Expand Down Expand Up @@ -205,7 +205,7 @@ class PARQUET_EXPORT EncodedStatistics {
return *this;
}

EncodedStatistics& set_nan_count(int64_t value) {
EncodedStatistics& setNanCount(int64_t value) {
nanCount = value;
hasNanCount = true;
return *this;
Expand Down Expand Up @@ -245,11 +245,11 @@ class PARQUET_EXPORT Statistics {
int64_t numValues,
int64_t nullCount,
int64_t distinctCount,
int64_t nanCount,
bool hasMinMax,
bool hasNullCount,
bool hasDistinctCount,
bool hasNaNCount,
int64_t nanCount,
::arrow::MemoryPool* pool = ::arrow::default_memory_pool());

// Helper function to convert EncodedStatistics to Statistics.
Expand Down Expand Up @@ -476,11 +476,11 @@ std::shared_ptr<TypedStatistics<DType>> makeStatistics(
int64_t numValues,
int64_t nullCount,
int64_t distinctCount,
int64_t nanCount,
bool hasMinMax,
bool hasNullCount,
bool hasDistinctCount,
bool hasNaNCount,
int64_t nanCount,
::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::make(
descr,
Expand All @@ -489,11 +489,11 @@ std::shared_ptr<TypedStatistics<DType>> makeStatistics(
numValues,
nullCount,
distinctCount,
nanCount,
hasMinMax,
hasNullCount,
hasDistinctCount,
hasNaNCount,
nanCount,
pool));
}

Expand Down
3 changes: 3 additions & 0 deletions velox/dwio/parquet/writer/arrow/ThriftInternal.h
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,9 @@ static inline facebook::velox::parquet::thrift::Statistics toThrift(
if (stats.hasDistinctCount) {
Statistics.__set_distinct_count(stats.distinctCount);
}
if (stats.hasNanCount) {
Statistics.__set_nan_count(stats.nanCount);
}

return Statistics;
}
Expand Down
10 changes: 6 additions & 4 deletions velox/dwio/parquet/writer/arrow/tests/StatisticsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -345,11 +345,12 @@ class TestStatistics : public PrimitiveTypedTest<TestType> {
this->values_.size(),
0, // nullCount.
0, // distinctCount.
0, // nanCount.
true, // hasMinMax.
true, // hasNullCount.
true, // hasDistinctCount.
false, // hasNaNCount.
0); // nanCount.
false // hasNaNCount.
);

auto statistics3 = makeStatistics<TestType>(this->schema_.column(0));
std::vector<uint8_t> validBits(
Expand Down Expand Up @@ -615,11 +616,12 @@ void TestStatistics<ByteArrayType>::testMinMaxEncode() {
this->values_.size(),
0, // nullCount
0, // distinctCount
0, // nanCount
true, // hasMinMax
true, // hasNullCount
true, // hasDistinctCount
false, // hasNaNCount
0); // nanCount
false // hasNaNCount
);

ASSERT_EQ(encodedMin, statistics2->encodeMin());
ASSERT_EQ(encodedMax, statistics2->encodeMax());
Expand Down
Loading