diff --git a/velox/dwio/parquet/thrift/ParquetThriftTypes.cpp b/velox/dwio/parquet/thrift/ParquetThriftTypes.cpp index 674c99300f7..6ac6ed538f6 100644 --- a/velox/dwio/parquet/thrift/ParquetThriftTypes.cpp +++ b/velox/dwio/parquet/thrift/ParquetThriftTypes.cpp @@ -682,6 +682,11 @@ void Statistics::__set_max_value(const std::string& val) { __isset.max_value = true; } +void Statistics::__set_nan_count(const int64_t val) { + this->nan_count = val; + __isset.nan_count = true; +} + void Statistics::__set_min_value(const std::string& val) { this->min_value = val; __isset.min_value = true; @@ -756,6 +761,14 @@ uint32_t Statistics::read(::apache::thrift::protocol::TProtocol* iprot) { xfer += iprot->skip(ftype); } break; + case 7: + if (ftype == ::apache::thrift::protocol::T_I64) { + xfer += iprot->readI64(this->nan_count); + this->__isset.nan_count = true; + } else { + xfer += iprot->skip(ftype); + } + break; default: xfer += iprot->skip(ftype); break; @@ -809,6 +822,12 @@ uint32_t Statistics::write(::apache::thrift::protocol::TProtocol* oprot) const { xfer += oprot->writeBinary(this->min_value); xfer += oprot->writeFieldEnd(); } + if (this->__isset.nan_count) { + xfer += oprot->writeFieldBegin( + "nan_count", ::apache::thrift::protocol::T_I64, 7); + xfer += oprot->writeI64(this->nan_count); + xfer += oprot->writeFieldEnd(); + } xfer += oprot->writeFieldStop(); xfer += oprot->writeStructEnd(); return xfer; @@ -820,6 +839,7 @@ void swap(Statistics& a, Statistics& b) { swap(a.min, b.min); swap(a.null_count, b.null_count); swap(a.distinct_count, b.distinct_count); + swap(a.nan_count, b.nan_count); swap(a.max_value, b.max_value); swap(a.min_value, b.min_value); swap(a.__isset, b.__isset); @@ -830,6 +850,7 @@ Statistics::Statistics(const Statistics& other0) { min = other0.min; null_count = other0.null_count; distinct_count = other0.distinct_count; + nan_count = other0.nan_count; max_value = other0.max_value; min_value = other0.min_value; __isset = other0.__isset; @@ -839,6 +860,7 @@ Statistics& Statistics::operator=(const Statistics& other1) { min = other1.min; null_count = other1.null_count; distinct_count = other1.distinct_count; + nan_count = other1.nan_count; max_value = other1.max_value; min_value = other1.min_value; __isset = other1.__isset; @@ -860,6 +882,8 @@ void Statistics::printTo(std::ostream& out) const { (__isset.max_value ? (out << to_string(max_value)) : (out << "")); out << ", " << "min_value="; (__isset.min_value ? (out << to_string(min_value)) : (out << "")); + out << ", " << "nan_count="; + (__isset.nan_count ? (out << to_string(nan_count)) : (out << "")); out << ")"; } diff --git a/velox/dwio/parquet/thrift/ParquetThriftTypes.h b/velox/dwio/parquet/thrift/ParquetThriftTypes.h index 0fd63e83cc0..1e5bd41b98b 100644 --- a/velox/dwio/parquet/thrift/ParquetThriftTypes.h +++ b/velox/dwio/parquet/thrift/ParquetThriftTypes.h @@ -466,12 +466,14 @@ typedef struct _Statistics__isset { min(false), null_count(false), distinct_count(false), + nan_count(false), max_value(false), min_value(false) {} bool max : 1; bool min : 1; bool null_count : 1; bool distinct_count : 1; + bool nan_count : 1; bool max_value : 1; bool min_value : 1; } _Statistics__isset; @@ -489,6 +491,7 @@ class Statistics : public virtual apache::thrift::TBase { min(), null_count(0), distinct_count(0), + nan_count(0), max_value(), min_value() {} @@ -516,6 +519,15 @@ class Statistics : public virtual apache::thrift::TBase { * count of distinct values occurring */ int64_t distinct_count; + + /** + * count of NaN values occurring. + * Note: This is a Velox extension to the Parquet format. The upstream + * Parquet community is considering adding official support for this field. + * See: https://github.com/apache/parquet-format/pull/514 + */ + int64_t nan_count; + /** * Min and max values for the column, determined by its ColumnOrder. * @@ -535,6 +547,8 @@ class Statistics : public virtual apache::thrift::TBase { void __set_distinct_count(const int64_t val); + void __set_nan_count(const int64_t val); + void __set_max_value(const std::string& val); void __set_min_value(const std::string& val); @@ -556,6 +570,10 @@ class Statistics : public virtual apache::thrift::TBase { return false; else if (__isset.distinct_count && !(distinct_count == rhs.distinct_count)) return false; + if (__isset.nan_count != rhs.__isset.nan_count) + return false; + else if (__isset.nan_count && !(nan_count == rhs.nan_count)) + return false; if (__isset.max_value != rhs.__isset.max_value) return false; else if (__isset.max_value && !(max_value == rhs.max_value)) diff --git a/velox/dwio/parquet/thrift/parquet.thrift b/velox/dwio/parquet/thrift/parquet.thrift index c7c962a0c50..961e4986203 100644 --- a/velox/dwio/parquet/thrift/parquet.thrift +++ b/velox/dwio/parquet/thrift/parquet.thrift @@ -238,6 +238,13 @@ struct Statistics { */ 5: optional binary max_value; 6: optional binary min_value; + /** + * count of NaN values occurring. + * Note: This is a Velox extension to the Parquet format. The upstream + * Parquet community is considering adding official support for this field. + * See: https://github.com/apache/parquet-format/pull/514 + */ + 7: optional i64 nan_count; } /** Empty structs to use as logical type annotations */ diff --git a/velox/dwio/parquet/writer/arrow/Metadata.cpp b/velox/dwio/parquet/writer/arrow/Metadata.cpp index b62d9ecea85..e02b12d00a4 100644 --- a/velox/dwio/parquet/writer/arrow/Metadata.cpp +++ b/velox/dwio/parquet/writer/arrow/Metadata.cpp @@ -102,11 +102,11 @@ static std::shared_ptr makeTypedColumnStats( metadata.num_values - stats.null_count, stats.null_count, stats.distinct_count, + stats.__isset.nan_count ? stats.nan_count : 0, stats.__isset.max_value || stats.__isset.min_value, stats.__isset.null_count, stats.__isset.distinct_count, - false, - 0); + stats.__isset.nan_count); } // Default behavior. return makeStatistics( @@ -116,11 +116,11 @@ static std::shared_ptr makeTypedColumnStats( metadata.num_values - stats.null_count, stats.null_count, stats.distinct_count, + stats.__isset.nan_count ? stats.nan_count : 0, stats.__isset.max || stats.__isset.min, stats.__isset.null_count, stats.__isset.distinct_count, - false, - 0); + stats.__isset.nan_count); } std::shared_ptr makeColumnStats( @@ -1019,8 +1019,8 @@ class FileMetaData::FileMetaDataImpl { // Set NaN counts from the builder (called during Finish) // This stores total NaN counts per field ID across all row groups. void setNaNCounts( - std::unordered_map> nan_counts) { - fieldNanCounts_ = std::move(nan_counts); + std::unordered_map> nanCounts) { + fieldNanCounts_ = std::move(nanCounts); } // Get total NaN count for a specific field ID across all row groups. diff --git a/velox/dwio/parquet/writer/arrow/Statistics.cpp b/velox/dwio/parquet/writer/arrow/Statistics.cpp index 3d88fc41d33..2f410a4c750 100644 --- a/velox/dwio/parquet/writer/arrow/Statistics.cpp +++ b/velox/dwio/parquet/writer/arrow/Statistics.cpp @@ -40,7 +40,6 @@ #include "velox/dwio/parquet/writer/arrow/Platform.h" #include "velox/dwio/parquet/writer/arrow/Schema.h" #include "velox/dwio/parquet/writer/arrow/StringTruncation.h" - #include "velox/type/DecimalUtil.h" #include "velox/type/HugeInt.h" @@ -607,11 +606,11 @@ class TypedStatisticsImpl : public TypedStatistics { int64_t numValues, int64_t nullCount, int64_t distinctCount, + int64_t nanCount, bool hasMinMax, bool hasNullCount, bool hasDistinctCount, bool hasNaNCount, - int64_t nanCount, MemoryPool* pool) : TypedStatisticsImpl(descr, pool) { TypedStatisticsImpl::incrementNumValues(numValues); @@ -870,7 +869,7 @@ class TypedStatisticsImpl : public TypedStatistics { s.setDistinctCount(this->distinctCount()); } if (hasNanCount_) { - s.set_nan_count(nanCount_); + s.setNanCount(nanCount_); } return s; } @@ -1261,11 +1260,11 @@ std::shared_ptr Statistics::make( numValues, encodedStats->nullCount, encodedStats->distinctCount, + encodedStats->nanCount, encodedStats->hasMin && encodedStats->hasMax, encodedStats->hasNullCount, encodedStats->hasDistinctCount, encodedStats->hasNanCount, - encodedStats->nanCount, pool); } @@ -1276,11 +1275,11 @@ std::shared_ptr Statistics::make( int64_t numValues, int64_t nullCount, int64_t distinctCount, + int64_t nanCount, bool hasMinMax, bool hasNullCount, bool hasDistinctCount, bool hasNaNCount, - int64_t nanCount, ::arrow::MemoryPool* pool) { #define MAKE_STATS(CAP_TYPE, KLASS) \ case Type::CAP_TYPE: \ @@ -1291,11 +1290,11 @@ std::shared_ptr Statistics::make( numValues, \ nullCount, \ distinctCount, \ + nanCount, \ hasMinMax, \ hasNullCount, \ hasDistinctCount, \ hasNaNCount, \ - nanCount, \ pool) switch (descr->physicalType()) { diff --git a/velox/dwio/parquet/writer/arrow/Statistics.h b/velox/dwio/parquet/writer/arrow/Statistics.h index 64295487359..97fdf6e35c0 100644 --- a/velox/dwio/parquet/writer/arrow/Statistics.h +++ b/velox/dwio/parquet/writer/arrow/Statistics.h @@ -170,7 +170,7 @@ class PARQUET_EXPORT EncodedStatistics { } bool isSet() const { - return hasMin || hasMax || hasNullCount || hasDistinctCount; + return hasMin || hasMax || hasNullCount || hasDistinctCount || hasNanCount; } bool isSigned() const { @@ -205,7 +205,7 @@ class PARQUET_EXPORT EncodedStatistics { return *this; } - EncodedStatistics& set_nan_count(int64_t value) { + EncodedStatistics& setNanCount(int64_t value) { nanCount = value; hasNanCount = true; return *this; @@ -245,11 +245,11 @@ class PARQUET_EXPORT Statistics { int64_t numValues, int64_t nullCount, int64_t distinctCount, + int64_t nanCount, bool hasMinMax, bool hasNullCount, bool hasDistinctCount, bool hasNaNCount, - int64_t nanCount, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); // Helper function to convert EncodedStatistics to Statistics. @@ -476,11 +476,11 @@ std::shared_ptr> makeStatistics( int64_t numValues, int64_t nullCount, int64_t distinctCount, + int64_t nanCount, bool hasMinMax, bool hasNullCount, bool hasDistinctCount, bool hasNaNCount, - int64_t nanCount, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { return std::static_pointer_cast>(Statistics::make( descr, @@ -489,11 +489,11 @@ std::shared_ptr> makeStatistics( numValues, nullCount, distinctCount, + nanCount, hasMinMax, hasNullCount, hasDistinctCount, hasNaNCount, - nanCount, pool)); } diff --git a/velox/dwio/parquet/writer/arrow/ThriftInternal.h b/velox/dwio/parquet/writer/arrow/ThriftInternal.h index 2ce0e151741..a265a51cb45 100644 --- a/velox/dwio/parquet/writer/arrow/ThriftInternal.h +++ b/velox/dwio/parquet/writer/arrow/ThriftInternal.h @@ -385,6 +385,9 @@ static inline facebook::velox::parquet::thrift::Statistics toThrift( if (stats.hasDistinctCount) { Statistics.__set_distinct_count(stats.distinctCount); } + if (stats.hasNanCount) { + Statistics.__set_nan_count(stats.nanCount); + } return Statistics; } diff --git a/velox/dwio/parquet/writer/arrow/tests/StatisticsTest.cpp b/velox/dwio/parquet/writer/arrow/tests/StatisticsTest.cpp index 008d95d3df9..91b096b1ea0 100644 --- a/velox/dwio/parquet/writer/arrow/tests/StatisticsTest.cpp +++ b/velox/dwio/parquet/writer/arrow/tests/StatisticsTest.cpp @@ -345,11 +345,12 @@ class TestStatistics : public PrimitiveTypedTest { this->values_.size(), 0, // nullCount. 0, // distinctCount. + 0, // nanCount. true, // hasMinMax. true, // hasNullCount. true, // hasDistinctCount. - false, // hasNaNCount. - 0); // nanCount. + false // hasNaNCount. + ); auto statistics3 = makeStatistics(this->schema_.column(0)); std::vector validBits( @@ -615,11 +616,12 @@ void TestStatistics::testMinMaxEncode() { this->values_.size(), 0, // nullCount 0, // distinctCount + 0, // nanCount true, // hasMinMax true, // hasNullCount true, // hasDistinctCount - false, // hasNaNCount - 0); // nanCount + false // hasNaNCount + ); ASSERT_EQ(encodedMin, statistics2->encodeMin()); ASSERT_EQ(encodedMax, statistics2->encodeMax());