diff --git a/Cargo.lock b/Cargo.lock index af1c980d45da..54c4f9564e21 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2401,6 +2401,7 @@ dependencies = [ "criterion", "half", "indexmap", + "num-traits", "parquet-variant", "parquet-variant-json", "rand 0.9.4", diff --git a/parquet-variant-compute/Cargo.toml b/parquet-variant-compute/Cargo.toml index bcfb36b8710c..4cf2a3b1804d 100644 --- a/parquet-variant-compute/Cargo.toml +++ b/parquet-variant-compute/Cargo.toml @@ -37,6 +37,7 @@ parquet-variant-json = { workspace = true } chrono = { workspace = true } uuid = { version = "1.18.0", features = ["v4"] } serde_json = "1.0" +num-traits = { version = "0.2", default-features = false } # uuid requires the `js` feature to run on wasm [target.'cfg(target_arch = "wasm32")'.dependencies] diff --git a/parquet-variant-compute/src/shred_variant.rs b/parquet-variant-compute/src/shred_variant.rs index 440f4b716521..e387d773aabf 100644 --- a/parquet-variant-compute/src/shred_variant.rs +++ b/parquet-variant-compute/src/shred_variant.rs @@ -92,6 +92,7 @@ pub(crate) fn shred_variant_with_options( cast_options, array.len(), NullValue::TopLevelVariant, + true, )?; for i in 0..array.len() { if array.is_null(i) { @@ -145,6 +146,7 @@ pub(crate) fn make_variant_to_shredded_variant_arrow_row_builder<'a>( cast_options: &'a CastOptions, capacity: usize, null_value: NullValue, + shred: bool, ) -> Result> { let builder = match data_type { DataType::Struct(fields) => { @@ -153,6 +155,7 @@ pub(crate) fn make_variant_to_shredded_variant_arrow_row_builder<'a>( cast_options, capacity, null_value, + shred, )?; VariantToShreddedVariantRowBuilder::Object(typed_value_builder) } @@ -193,7 +196,7 @@ pub(crate) fn make_variant_to_shredded_variant_arrow_row_builder<'a>( | DataType::FixedSizeBinary(16) // UUID => { let builder = - make_primitive_variant_to_arrow_row_builder(data_type, cast_options, capacity)?; + make_primitive_variant_to_arrow_row_builder(data_type, cast_options, capacity, shred)?; let 
typed_value_builder = VariantToShreddedPrimitiveVariantRowBuilder::new(builder, capacity, null_value); VariantToShreddedVariantRowBuilder::Primitive(typed_value_builder) @@ -369,6 +372,7 @@ impl<'a> VariantToShreddedObjectVariantRowBuilder<'a> { cast_options: &'a CastOptions, capacity: usize, null_value: NullValue, + shred: bool, ) -> Result { let typed_value_builders = fields.iter().map(|field| { let builder = make_variant_to_shredded_variant_arrow_row_builder( @@ -376,6 +380,7 @@ impl<'a> VariantToShreddedObjectVariantRowBuilder<'a> { cast_options, capacity, NullValue::ObjectField, + shred, )?; Ok((field.name().as_str(), builder)) }); @@ -710,9 +715,12 @@ mod tests { use arrow::datatypes::{ ArrowPrimitiveType, DataType, Field, Fields, Int64Type, TimeUnit, UnionFields, UnionMode, }; + use arrow_schema::IntervalUnit; + use chrono::{DateTime, NaiveDate, NaiveTime}; use parquet_variant::{ BuilderSpecificState, EMPTY_VARIANT_METADATA_BYTES, ObjectBuilder, ReadOnlyMetadataBuilder, - Variant, VariantBuilder, VariantPath, VariantPathElement, + ShortString, Variant, VariantBuilder, VariantDecimal4, VariantDecimal8, VariantDecimal16, + VariantPath, VariantPathElement, }; use std::sync::Arc; use uuid::Uuid; @@ -1046,6 +1054,7 @@ mod tests { &cast_options, 1, mode, + true, ) .unwrap(); primitive_builder.append_null().unwrap(); @@ -1076,6 +1085,7 @@ mod tests { &cast_options, 1, mode, + true, ) .unwrap(); array_builder.append_null().unwrap(); @@ -1104,6 +1114,7 @@ mod tests { &cast_options, 1, mode, + true, ) .unwrap(); object_builder.append_null().unwrap(); @@ -1286,11 +1297,14 @@ mod tests { ); assert!(typed_value_field.is_null(4)); - // Row 5: 3i8 -> should shred successfully (int8->int64 conversion) + // Row 5: 3i8 -> should not shred (int8 can't be shredded into int64) assert!(!result.is_null(5)); - assert!(value_field.is_null(5)); // value should be null when shredded - assert!(!typed_value_field.is_null(5)); - assert_eq!(typed_value_field.value(5), 3); + 
assert!(typed_value_field.is_null(5)); // typed_value should be null + assert!(!value_field.is_null(5)); // value should contain original + assert_eq!( + variant_from_arrays_at(metadata_field, value_field, 5).unwrap(), + Variant::from(3i8) + ); } #[test] @@ -1310,7 +1324,7 @@ mod tests { .downcast_ref::() .unwrap(); assert_eq!(typed_value_int32.value(0), 42); - assert_eq!(typed_value_int32.value(1), 3); + assert!(typed_value_int32.is_null(1)); // float doesn't shred to int32 assert!(typed_value_int32.is_null(2)); // string doesn't convert to int32 // Test Float64 target @@ -1321,7 +1335,7 @@ mod tests { .as_any() .downcast_ref::() .unwrap(); - assert_eq!(typed_value_float64.value(0), 42.0); // int converts to float + assert!(typed_value_float64.is_null(0)); // int doesn't shred to float assert_eq!(typed_value_float64.value(1), 3.15); assert!(typed_value_float64.is_null(2)); // string doesn't convert } @@ -2807,4 +2821,179 @@ mod tests { let shredding_type = ShreddedSchemaBuilder::default().build(); assert_eq!(shredding_type, DataType::Null); } + + // This test verifies which variant types can and cannot be shredded into each target data type.
+ #[test] + fn test_variant_type_shredded_correctly() { + // array contains all variant types + let mut array_builder = VariantArrayBuilder::new(30); + array_builder.append_value(Variant::Null); + array_builder.append_value(Variant::Int8(1)); + array_builder.append_value(Variant::Int16(2)); + array_builder.append_value(Variant::Int32(3)); + array_builder.append_value(Variant::Int64(4)); + array_builder.append_value(Variant::Date(NaiveDate::from_epoch_days(12345).unwrap())); + array_builder.append_value(Variant::TimestampMicros( + DateTime::from_timestamp_micros(123456789).unwrap(), + )); + array_builder.append_value(Variant::TimestampNtzMicros( + DateTime::from_timestamp_micros(123456789) + .unwrap() + .naive_utc(), + )); + array_builder.append_value(Variant::TimestampNanos(DateTime::from_timestamp_nanos( + 1234567890123, + ))); + array_builder.append_value(Variant::TimestampNtzNanos( + DateTime::from_timestamp_nanos(1234567890123).naive_utc(), + )); + array_builder.append_value(VariantDecimal4::try_new(123, 2).unwrap()); + array_builder.append_value(VariantDecimal8::try_new(123, 3).unwrap()); + array_builder.append_value(VariantDecimal16::try_new(123, 4).unwrap()); + array_builder.append_value(Variant::Float(5.0)); + array_builder.append_value(Variant::Double(6f64)); + array_builder.append_value(Variant::BooleanTrue); + array_builder.append_value(Variant::BooleanFalse); + array_builder.append_value(Variant::Binary("helow".as_bytes())); + array_builder.append_value(Variant::String("hello")); + array_builder.append_value(Variant::ShortString( + ShortString::try_from("world").unwrap(), + )); + array_builder.append_value(Variant::Time( + NaiveTime::from_num_seconds_from_midnight_opt(12345, 123).unwrap(), + )); + + let array = array_builder.build(); + + fn can_shred_to(v: &Variant, dt: &DataType) -> bool { + matches!( + (v, dt), + (Variant::Int8(_), DataType::Int8) + | (Variant::Int16(_), DataType::Int16) + | (Variant::Int32(_), DataType::Int32) + | (Variant::Int64(_), 
DataType::Int64) + | (Variant::Date(_), DataType::Date32) + | ( + Variant::TimestampMicros(_), + DataType::Timestamp(TimeUnit::Microsecond, Some(_)), + ) + | ( + Variant::TimestampNtzMicros(_), + DataType::Timestamp(TimeUnit::Microsecond, None), + ) + | ( + Variant::TimestampNanos(_), + DataType::Timestamp(TimeUnit::Nanosecond, Some(_)), + ) + | ( + Variant::TimestampNtzNanos(_), + DataType::Timestamp(TimeUnit::Nanosecond, None), + ) + | (Variant::Decimal4(_), DataType::Decimal32(_, _)) + | (Variant::Decimal8(_), DataType::Decimal64(_, _)) + | (Variant::Decimal16(_), DataType::Decimal128(_, _)) + | (Variant::Float(_), DataType::Float32) + | (Variant::Double(_), DataType::Float64) + | (Variant::BooleanFalse, DataType::Boolean) + | (Variant::BooleanTrue, DataType::Boolean) + | (Variant::Binary(_), DataType::Binary) + | (Variant::Binary(_), DataType::BinaryView) + | (Variant::Binary(_), DataType::LargeBinary) + | (Variant::ShortString(_), DataType::Utf8) + | (Variant::ShortString(_), DataType::Utf8View) + | (Variant::ShortString(_), DataType::LargeUtf8) + | (Variant::String(_), DataType::Utf8) + | (Variant::String(_), DataType::Utf8View) + | (Variant::String(_), DataType::LargeUtf8) + | (Variant::Time(_), DataType::Time64(_)) + ) + } + + macro_rules! 
assert_shred_type { + ($shred_type:expr, $expected_value_valid_bits:expr) => { + let shredded_array_result = shred_variant(&array, &$shred_type); + match shredded_array_result { + Ok(shredded_array) => { + let value_column = shredded_array.inner().column_by_name("value").unwrap(); + for (idx, valid) in $expected_value_valid_bits.iter().enumerate() { + match valid { + true => assert!( + value_column.is_null(idx), + "{:?} should be shredded to {}", + array.value(idx), + $shred_type + ), + false => assert!( + value_column.is_valid(idx), + "{:?} should not be shredded to {}", + array.value(idx), + $shred_type + ), + } + } + } + Err(e) => { + let error_msg = format!("is not a valid variant shredding type"); + assert!( + e.to_string().contains(error_msg.as_str()), + "{} => {}", + $shred_type, + e.to_string() + ); + } + } + }; + } + + let types = [ + DataType::Null, + DataType::Boolean, + DataType::Int8, + DataType::Int16, + DataType::Int32, + DataType::Int64, + DataType::UInt8, + DataType::UInt16, + DataType::UInt32, + DataType::UInt64, + DataType::Float32, + DataType::Float64, + DataType::Timestamp(TimeUnit::Second, Some("+00:00".into())), + DataType::Timestamp(TimeUnit::Second, None), + DataType::Timestamp(TimeUnit::Millisecond, Some("-00:00".into())), + DataType::Timestamp(TimeUnit::Millisecond, None), + DataType::Timestamp(TimeUnit::Microsecond, Some("-00:00".into())), + DataType::Timestamp(TimeUnit::Microsecond, None), + DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".into())), + DataType::Timestamp(TimeUnit::Nanosecond, None), + DataType::Date32, + DataType::Date64, + DataType::Time32(TimeUnit::Second), + DataType::Time32(TimeUnit::Millisecond), + DataType::Time64(TimeUnit::Microsecond), + DataType::Time64(TimeUnit::Nanosecond), + DataType::Duration(TimeUnit::Nanosecond), + DataType::Interval(IntervalUnit::DayTime), + DataType::Binary, + DataType::FixedSizeBinary(16), // uuid + DataType::FixedSizeBinary(32), + DataType::LargeBinary, + DataType::BinaryView, 
+ DataType::Utf8, + DataType::LargeUtf8, + DataType::Utf8View, + DataType::Decimal32(4, 2), + DataType::Decimal64(10, 4), + DataType::Decimal128(20, 10), + DataType::Decimal256(30, 10), + ]; + + for data_type in types { + let expected_bits = array + .iter() + .map(|v| can_shred_to(&v.unwrap(), &data_type)) + .collect::>(); + assert_shred_type!(data_type, expected_bits); + } + } } diff --git a/parquet-variant-compute/src/type_conversion.rs b/parquet-variant-compute/src/type_conversion.rs index 2255d4316b25..889dc91f84e4 100644 --- a/parquet-variant-compute/src/type_conversion.rs +++ b/parquet-variant-compute/src/type_conversion.rs @@ -17,28 +17,32 @@ //! Module for transforming a typed arrow `Array` to `VariantArray`. +use arrow::array::ArrowNativeTypeOp; use arrow::compute::{ - CastOptions, DecimalCast, parse_string_to_decimal_native, rescale_decimal, - single_float_to_decimal, + CastOptions, DecimalCast, cast_num_to_bool, cast_single_string_to_boolean_default, num_cast, + parse_string_to_decimal_native, rescale_decimal, single_bool_to_numeric, + single_decimal_to_float_lossy, single_float_to_decimal, }; use arrow::datatypes::{ self, ArrowPrimitiveType, ArrowTimestampType, Decimal32Type, Decimal64Type, Decimal128Type, - DecimalType, + Decimal256Type, DecimalType, }; use arrow::error::{ArrowError, Result}; -use chrono::Timelike; +use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Timelike, Utc}; +use half::f16; +use num_traits::NumCast; use parquet_variant::{Variant, VariantDecimal4, VariantDecimal8, VariantDecimal16}; /// Extension trait for Arrow primitive types that can extract their native value from a Variant pub(crate) trait PrimitiveFromVariant: ArrowPrimitiveType { - fn from_variant(variant: &Variant<'_, '_>) -> Option; + fn from_variant(variant: &Variant<'_, '_>, shred: bool) -> Option; } /// Extension trait for Arrow timestamp types that can extract their native value from a Variant /// We can't use [`PrimitiveFromVariant`] directly because we 
need _two_ implementations for each /// timestamp type -- the `NTZ` param here. pub(crate) trait TimestampFromVariant: ArrowTimestampType { - fn from_variant(variant: &Variant<'_, '_>) -> Option; + fn from_variant(variant: &Variant<'_, '_>, shred: bool) -> Option; } /// Cast a single `Variant` value with safe/strict semantics. @@ -64,10 +68,13 @@ pub(crate) fn variant_cast_with_options<'a, 'm, 'v, T>( /// Macro to generate PrimitiveFromVariant implementations for Arrow primitive types macro_rules! impl_primitive_from_variant { - ($arrow_type:ty, $variant_method:ident $(, $cast_fn:expr)?) => { + ($arrow_type:ty, $shred_fun:expr, $get_method:ident $(, $cast_fn:expr)?) => { impl PrimitiveFromVariant for $arrow_type { - fn from_variant(variant: &Variant<'_, '_>) -> Option { - let value = variant.$variant_method(); + fn from_variant(variant: &Variant<'_, '_>, shred: bool) -> Option { + let value = match shred { + true => ($shred_fun)(variant), + false => $get_method(variant), + }; $( let value = value.and_then($cast_fn); )? value } @@ -76,58 +83,201 @@ macro_rules! impl_primitive_from_variant { } macro_rules! impl_timestamp_from_variant { - ($timestamp_type:ty, $variant_method:ident, ntz=$ntz:ident, $cast_fn:expr $(,)?) => { + ($timestamp_type:ty, $shred_fun:expr, $variant_method:expr, ntz=$ntz:ident, $cast_fn:expr $(,)?) 
=> { impl TimestampFromVariant<{ $ntz }> for $timestamp_type { - fn from_variant(variant: &Variant<'_, '_>) -> Option { - variant.$variant_method().and_then($cast_fn) + fn from_variant(variant: &Variant<'_, '_>, shred: bool) -> Option { + let value = match shred { + true => ($shred_fun)(variant), + false => $variant_method(variant), + }; + + value.and_then($cast_fn) } } }; } -impl_primitive_from_variant!(datatypes::Int32Type, as_int32); -impl_primitive_from_variant!(datatypes::Int16Type, as_int16); -impl_primitive_from_variant!(datatypes::Int8Type, as_int8); -impl_primitive_from_variant!(datatypes::Int64Type, as_int64); -impl_primitive_from_variant!(datatypes::UInt8Type, as_u8); -impl_primitive_from_variant!(datatypes::UInt16Type, as_u16); -impl_primitive_from_variant!(datatypes::UInt32Type, as_u32); -impl_primitive_from_variant!(datatypes::UInt64Type, as_u64); -impl_primitive_from_variant!(datatypes::Float16Type, as_f16); -impl_primitive_from_variant!(datatypes::Float32Type, as_f32); -impl_primitive_from_variant!(datatypes::Float64Type, as_f64); -impl_primitive_from_variant!(datatypes::Date32Type, as_naive_date, |v| { - Some(datatypes::Date32Type::from_naive_date(v)) -}); -impl_primitive_from_variant!(datatypes::Date64Type, as_naive_date, |v| { - Some(datatypes::Date64Type::from_naive_date(v)) -}); -impl_primitive_from_variant!(datatypes::Time32SecondType, as_time_utc, |v| { - // Return None if there are leftover nanoseconds - if v.nanosecond() != 0 { - None - } else { - Some(v.num_seconds_from_midnight() as i32) +fn convert_to_timestamp_nano(value: &Variant) -> Option> { + match *value { + Variant::TimestampNanos(d) | Variant::TimestampMicros(d) => Some(d), + _ => None, } -}); -impl_primitive_from_variant!(datatypes::Time32MillisecondType, as_time_utc, |v| { - // Return None if there are leftover microseconds - if v.nanosecond() % 1_000_000 != 0 { - None - } else { - Some((v.num_seconds_from_midnight() * 1_000) as i32 + (v.nanosecond() / 1_000_000) as i32) +} + 
+fn convert_to_timestamp_ntz_nano(value: &Variant) -> Option { + match *value { + Variant::TimestampNtzNanos(d) | Variant::TimestampNtzMicros(d) => Some(d), + _ => None, + } +} + +enum NumericKind { + Integer, + Float, +} + +trait DecimalCastTarget: NumCast + Default { + const KIND: NumericKind; +} + +macro_rules! impl_decimal_cast_target { + ($raw_type: ident, $target_kind:expr) => { + impl DecimalCastTarget for $raw_type { + const KIND: NumericKind = $target_kind; + } + }; +} + +impl_decimal_cast_target!(i8, NumericKind::Integer); +impl_decimal_cast_target!(i16, NumericKind::Integer); +impl_decimal_cast_target!(i32, NumericKind::Integer); +impl_decimal_cast_target!(i64, NumericKind::Integer); +impl_decimal_cast_target!(u8, NumericKind::Integer); +impl_decimal_cast_target!(u16, NumericKind::Integer); +impl_decimal_cast_target!(u32, NumericKind::Integer); +impl_decimal_cast_target!(u64, NumericKind::Integer); +impl_decimal_cast_target!(f16, NumericKind::Float); +impl_decimal_cast_target!(f32, NumericKind::Float); +impl_decimal_cast_target!(f64, NumericKind::Float); + +/// Converts a boolean or numeric variant (integers, floating-point, and decimals) +/// to the specified numeric type `T`. +/// +/// Uses Arrow's casting logic to perform the conversion. Returns `Some(T)` if +/// the conversion succeeds, `None` if the variant cannot be cast to type `T`.
+fn as_num(variant: &Variant) -> Option +where + T: DecimalCastTarget, +{ + match *variant { + Variant::BooleanFalse => single_bool_to_numeric(false), + Variant::BooleanTrue => single_bool_to_numeric(true), + Variant::Int8(i) => num_cast(i), + Variant::Int16(i) => num_cast(i), + Variant::Int32(i) => num_cast(i), + Variant::Int64(i) => num_cast(i), + Variant::Float(f) => num_cast(f), + Variant::Double(d) => num_cast(d), + Variant::Decimal4(d) => { + cast_decimal_to_num::(d.integer(), d.scale(), |x| x as f64) + } + Variant::Decimal8(d) => { + cast_decimal_to_num::(d.integer(), d.scale(), |x| x as f64) + } + Variant::Decimal16(d) => { + cast_decimal_to_num::(d.integer(), d.scale(), |x| x as f64) + } + _ => None, + } +} + +fn cast_decimal_to_num(raw: D::Native, scale: u8, as_float: F) -> Option +where + D: DecimalType, + D::Native: NumCast + ArrowNativeTypeOp, + T: DecimalCastTarget, + F: Fn(D::Native) -> f64, +{ + let base: D::Native = NumCast::from(10)?; + + let div = base.pow_checked(>::from(scale)).ok()?; + match T::KIND { + NumericKind::Integer => raw + .div_checked(div) + .ok() + .and_then(::from::), + NumericKind::Float => T::from(single_decimal_to_float_lossy::( + &as_float, + raw, + >::from(scale), + )), + } +} + +fn cast_naive_date(value: &Variant<'_, '_>) -> Option { + value.as_naive_date() +} + +fn cast_time_utc(value: &Variant<'_, '_>) -> Option { + value.as_time_utc() +} + +// helper function for the types that would never be the shred target type. 
+fn always_none(_input: &Variant) -> Option { + None +} + +impl_primitive_from_variant!(datatypes::Int32Type, Variant::as_int32, as_num); +impl_primitive_from_variant!(datatypes::Int16Type, Variant::as_int16, as_num); +impl_primitive_from_variant!(datatypes::Int8Type, Variant::as_int8, as_num); +impl_primitive_from_variant!(datatypes::Int64Type, Variant::as_int64, as_num); +impl_primitive_from_variant!(datatypes::UInt8Type, always_none, as_num); +impl_primitive_from_variant!(datatypes::UInt16Type, always_none, as_num); +impl_primitive_from_variant!(datatypes::UInt32Type, always_none, as_num); +impl_primitive_from_variant!(datatypes::UInt64Type, always_none, as_num); +impl_primitive_from_variant!(datatypes::Float16Type, always_none, as_num); +impl_primitive_from_variant!(datatypes::Float32Type, Variant::as_f32, as_num); +impl_primitive_from_variant!(datatypes::Float64Type, Variant::as_f64, as_num); +impl_primitive_from_variant!( + datatypes::Date32Type, + Variant::as_naive_date, + cast_naive_date, + |v| { Some(datatypes::Date32Type::from_naive_date(v)) } +); +impl_primitive_from_variant!( + datatypes::Date64Type, + Variant::as_naive_date, + cast_naive_date, + |v| { Some(datatypes::Date64Type::from_naive_date(v)) } +); +impl_primitive_from_variant!( + datatypes::Time32SecondType, + always_none, // would never shred to Time32SecondType + cast_time_utc, + |v| { + // Return None if there are leftover nanoseconds + if v.nanosecond() != 0 { + None + } else { + Some(v.num_seconds_from_midnight() as i32) + } + } +); +impl_primitive_from_variant!( + datatypes::Time32MillisecondType, + always_none, // would never shred to Time32MillisecondType + cast_time_utc, + |v| { + // Return None if there are leftover microseconds + if v.nanosecond() % 1_000_000 != 0 { + None + } else { + Some( + (v.num_seconds_from_midnight() * 1_000) as i32 + + (v.nanosecond() / 1_000_000) as i32, + ) + } + } +); +impl_primitive_from_variant!( + datatypes::Time64MicrosecondType, + Variant::as_time_utc, 
+ cast_time_utc, + |v| { Some(v.num_seconds_from_midnight() as i64 * 1_000_000 + v.nanosecond() as i64 / 1_000) } +); +impl_primitive_from_variant!( + datatypes::Time64NanosecondType, + always_none, // would never shred to Time64NanosecondType + cast_time_utc, + |v| { + // convert micro to nano seconds + Some(v.num_seconds_from_midnight() as i64 * 1_000_000_000 + v.nanosecond() as i64) } -}); -impl_primitive_from_variant!(datatypes::Time64MicrosecondType, as_time_utc, |v| { - Some(v.num_seconds_from_midnight() as i64 * 1_000_000 + v.nanosecond() as i64 / 1_000) -}); -impl_primitive_from_variant!(datatypes::Time64NanosecondType, as_time_utc, |v| { - // convert micro to nano seconds - Some(v.num_seconds_from_midnight() as i64 * 1_000_000_000 + v.nanosecond() as i64) -}); +); impl_timestamp_from_variant!( datatypes::TimestampSecondType, - as_timestamp_ntz_nanos, + always_none, // would never shred to TimestampSecondType + convert_to_timestamp_ntz_nano, ntz = true, |timestamp| { // Return None if there are leftover nanoseconds @@ -140,7 +290,8 @@ impl_timestamp_from_variant!( ); impl_timestamp_from_variant!( datatypes::TimestampSecondType, - as_timestamp_nanos, + always_none, // would never shred to TimestampSecondType + convert_to_timestamp_nano, ntz = false, |timestamp| { // Return None if there are leftover nanoseconds @@ -153,7 +304,8 @@ impl_timestamp_from_variant!( ); impl_timestamp_from_variant!( datatypes::TimestampMillisecondType, - as_timestamp_ntz_nanos, + always_none, // would never shred to TimestampMillisecondType + convert_to_timestamp_ntz_nano, ntz = true, |timestamp| { // Return None if there are leftover microseconds @@ -166,7 +318,8 @@ impl_timestamp_from_variant!( ); impl_timestamp_from_variant!( datatypes::TimestampMillisecondType, - as_timestamp_nanos, + always_none, // would never shred to TimestampMillisecondType + convert_to_timestamp_nano, ntz = false, |timestamp| { // Return None if there are leftover microseconds @@ -179,25 +332,29 @@ 
impl_timestamp_from_variant!( ); impl_timestamp_from_variant!( datatypes::TimestampMicrosecondType, - as_timestamp_ntz_micros, + Variant::as_timestamp_ntz_micros, + Variant::as_timestamp_ntz_micros, ntz = true, |timestamp| Self::from_naive_datetime(timestamp, None), ); impl_timestamp_from_variant!( datatypes::TimestampMicrosecondType, - as_timestamp_micros, + Variant::as_timestamp_micros, + Variant::as_timestamp_micros, ntz = false, |timestamp| Self::from_naive_datetime(timestamp.naive_utc(), None) ); impl_timestamp_from_variant!( datatypes::TimestampNanosecondType, - as_timestamp_ntz_nanos, + Variant::as_timestamp_ntz_nanos, + convert_to_timestamp_ntz_nano, ntz = true, |timestamp| Self::from_naive_datetime(timestamp, None) ); impl_timestamp_from_variant!( datatypes::TimestampNanosecondType, - as_timestamp_nanos, + Variant::as_timestamp_nanos, + convert_to_timestamp_nano, ntz = false, |timestamp| Self::from_naive_datetime(timestamp.naive_utc(), None) ); @@ -254,7 +411,7 @@ where precision, scale, ), - Variant::Float(f) => single_float_to_decimal::(f64::from(*f), mul), + Variant::Float(f) => single_float_to_decimal::(>::from(*f), mul), Variant::Double(f) => single_float_to_decimal::(*f, mul), // arrow-cast only support cast string to decimal with scale >=0 for now // Please see `cast_string_to_decimal` in arrow-cast/src/cast/decimal.rs for more detail @@ -287,6 +444,79 @@ where } } +/// Return the unscaled integer representation for Arrow decimal type `O` from a `Variant`. +/// +/// Unlike `variant_to_unscaled_decimal`, this function never rescales the decimal value; it +/// only returns the unscaled integer representation for the specific decimal variants.
+pub(crate) fn shred_variant_to_unscaled_decimal(variant: &Variant<'_, '_>) -> Option +where + O: ShredDecimalVariant, + O::Native: DecimalCast, +{ + match variant { + Variant::Decimal4(_) | Variant::Decimal8(_) | Variant::Decimal16(_) => { + O::shred_variant(variant) + } + _ => None, + } +} +pub(crate) trait ShredDecimalVariant: DecimalType { + fn shred_variant(value: &Variant<'_, '_>) -> Option; +} + +impl ShredDecimalVariant for Decimal32Type { + fn shred_variant(value: &Variant<'_, '_>) -> Option { + match *value { + Variant::Decimal4(d) => Some(d.integer()), + _ => None, + } + } +} + +impl ShredDecimalVariant for Decimal64Type { + fn shred_variant(value: &Variant<'_, '_>) -> Option { + match *value { + Variant::Decimal8(d) => Some(d.integer()), + _ => None, + } + } +} + +impl ShredDecimalVariant for Decimal128Type { + fn shred_variant(value: &Variant<'_, '_>) -> Option { + match *value { + Variant::Decimal16(d) => Some(d.integer()), + _ => None, + } + } +} + +impl ShredDecimalVariant for Decimal256Type { + fn shred_variant(_value: &Variant<'_, '_>) -> Option { + None // always return none because we'll never shred to decimal256 + } +} + +pub(crate) fn variant_to_boolean(variant: &Variant<'_, '_>, shred: bool) -> Option { + if shred { + return variant.as_boolean(); + } + + match variant { + Variant::BooleanTrue => Some(true), + Variant::BooleanFalse => Some(false), + Variant::Int8(i) => Some(cast_num_to_bool(*i)), + Variant::Int16(i) => Some(cast_num_to_bool(*i)), + Variant::Int32(i) => Some(cast_num_to_bool(*i)), + Variant::Int64(i) => Some(cast_num_to_bool(*i)), + Variant::Float(f) => Some(cast_num_to_bool(*f)), + Variant::Double(d) => Some(cast_num_to_bool(*d)), + Variant::ShortString(s) => cast_single_string_to_boolean_default(s.as_str()), + Variant::String(s) => cast_single_string_to_boolean_default(s), + _ => None, + } +} + /// Convert the value at a specific index in the given array into a `Variant`. macro_rules! 
non_generic_conversion_single_value { ($array:expr, $cast_fn:expr, $index:expr) => {{ diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index c3e915993533..1b5972162f0a 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -370,6 +370,26 @@ fn try_perfect_shredding(variant_array: &VariantArray, as_field: &Field) -> Opti /// to the specified path. /// 2. `as_type: Some()`: an array of the specified type is returned. /// +/// # Casting Semantics +/// +/// Scalar conversion semantics intentionally follow Arrow cast behavior where applicable. +/// Conversions in this module delegate to Arrow compute cast helpers such as +/// `num_cast`, `cast_num_to_bool`, `single_bool_to_numeric`, and +/// `cast_single_string_to_boolean_default`. +/// +/// - Getting `DataType::Boolean` accepts boolean, numeric, and string variants. +/// Numeric zero maps to `false`; non-zero maps to `true`. String parsing follows +/// Arrow UTF8-to-boolean cast rules. +/// - Getting numeric datatypes such as `DataType::Int8`, `DataType::Int16`, `DataType::Int32`, +/// `DataType::Int64`, `DataType::UInt8`, `DataType::UInt16`, `DataType::UInt32`, `DataType::UInt64`, +/// `DataType::Float16`, `DataType::Float32`, `DataType::Float64` accept +/// boolean and numeric variants (integers, floating-point, and decimals). +/// They return `None` when conversion is not possible. +/// - Getting decimals such as `DataType::Decimal32`, `DataType::Decimal64`, `DataType::Decimal128`, +/// `DataType::Decimal256` accept compatible decimal variants, integer variants, +/// float variants and string variants. +/// They return `None` when conversion is not possible. +/// /// TODO: How would a caller request a struct or list type where the fields/elements can be any /// variant? 
Caller can pass None as the requested type to fetch a specific path, but it would /// quickly become annoying (and inefficient) to call `variant_get` for each leaf value in a struct or @@ -1880,7 +1900,7 @@ mod test { let result_variant = VariantArray::try_new(&result).unwrap(); assert_eq!(result_variant.value(0), Variant::from("drama"), "{case}"); - assert_eq!(result_variant.value(1).as_int64(), Some(123), "{case}"); + assert_eq!(result_variant.value(1).as_int8(), Some(123), "{case}"); } } @@ -1936,7 +1956,7 @@ mod test { let result_variant = VariantArray::try_new(&result).unwrap(); assert_eq!(result_variant.value(0), Variant::from("drama")); - assert_eq!(result_variant.value(1).as_int64(), Some(123)); + assert_eq!(result_variant.value(1).as_int8(), Some(123)); } #[test] @@ -1959,7 +1979,7 @@ mod test { let result = variant_get(&array, GetOptions::new_with_path(path.clone())).unwrap(); let result_variant = VariantArray::try_new(&result).unwrap(); assert_eq!(result_variant.value(0), Variant::from("b")); - assert_eq!(result_variant.value(1).as_int64(), Some(123)); + assert_eq!(result_variant.value(1).as_int8(), Some(123)); let field = Field::new("typed_value", DataType::Int64, true); let casted = variant_get( diff --git a/parquet-variant-compute/src/variant_to_arrow.rs b/parquet-variant-compute/src/variant_to_arrow.rs index 9841da555da0..3a79ae1cc532 100644 --- a/parquet-variant-compute/src/variant_to_arrow.rs +++ b/parquet-variant-compute/src/variant_to_arrow.rs @@ -20,7 +20,8 @@ use crate::shred_variant::{ make_variant_to_shredded_variant_arrow_row_builder, }; use crate::type_conversion::{ - PrimitiveFromVariant, TimestampFromVariant, variant_cast_with_options, + PrimitiveFromVariant, ShredDecimalVariant, TimestampFromVariant, + shred_variant_to_unscaled_decimal, variant_cast_with_options, variant_to_boolean, variant_to_unscaled_decimal, }; use crate::variant_array::ShreddedVariantFieldArray; @@ -97,6 +98,7 @@ fn make_typed_variant_to_arrow_row_builder<'a>( 
data_type: &'a DataType, cast_options: &'a CastOptions, capacity: usize, + shred: bool, ) -> Result> { use VariantToArrowRowBuilder::*; @@ -133,8 +135,12 @@ fn make_typed_variant_to_arrow_row_builder<'a>( Ok(Encoded(builder)) } data_type => { - let builder = - make_primitive_variant_to_arrow_row_builder(data_type, cast_options, capacity)?; + let builder = make_primitive_variant_to_arrow_row_builder( + data_type, + cast_options, + capacity, + shred, + )?; Ok(Primitive(builder)) } } @@ -156,7 +162,7 @@ pub(crate) fn make_variant_to_arrow_row_builder<'a>( capacity, )), Some(data_type) => { - make_typed_variant_to_arrow_row_builder(data_type, cast_options, capacity)? + make_typed_variant_to_arrow_row_builder(data_type, cast_options, capacity, false)? } }; @@ -370,6 +376,7 @@ impl<'a> EncodedVariantToArrowRowBuilder<'a> { value_type, cast_options, capacity, + false, )?); Ok(Self { data_type, @@ -397,169 +404,200 @@ pub(crate) fn make_primitive_variant_to_arrow_row_builder<'a>( data_type: &'a DataType, cast_options: &'a CastOptions, capacity: usize, + shred: bool, ) -> Result> { use PrimitiveVariantToArrowRowBuilder::*; - let builder = - match data_type { - DataType::Null => Null(VariantToNullArrowRowBuilder::new(cast_options, capacity)), - DataType::Boolean => { - Boolean(VariantToBooleanArrowRowBuilder::new(cast_options, capacity)) - } - DataType::Int8 => Int8(VariantToPrimitiveArrowRowBuilder::new( - cast_options, - capacity, - )), - DataType::Int16 => Int16(VariantToPrimitiveArrowRowBuilder::new( - cast_options, - capacity, - )), - DataType::Int32 => Int32(VariantToPrimitiveArrowRowBuilder::new( - cast_options, - capacity, - )), - DataType::Int64 => Int64(VariantToPrimitiveArrowRowBuilder::new( - cast_options, - capacity, - )), - DataType::UInt8 => UInt8(VariantToPrimitiveArrowRowBuilder::new( - cast_options, - capacity, - )), - DataType::UInt16 => UInt16(VariantToPrimitiveArrowRowBuilder::new( - cast_options, - capacity, - )), - DataType::UInt32 => 
UInt32(VariantToPrimitiveArrowRowBuilder::new( - cast_options, - capacity, - )), - DataType::UInt64 => UInt64(VariantToPrimitiveArrowRowBuilder::new( - cast_options, - capacity, - )), - DataType::Float16 => Float16(VariantToPrimitiveArrowRowBuilder::new( - cast_options, - capacity, - )), - DataType::Float32 => Float32(VariantToPrimitiveArrowRowBuilder::new( - cast_options, - capacity, - )), - DataType::Float64 => Float64(VariantToPrimitiveArrowRowBuilder::new( - cast_options, - capacity, - )), - DataType::Decimal32(precision, scale) => Decimal32( - VariantToDecimalArrowRowBuilder::new(cast_options, capacity, *precision, *scale)?, - ), - DataType::Decimal64(precision, scale) => Decimal64( - VariantToDecimalArrowRowBuilder::new(cast_options, capacity, *precision, *scale)?, - ), - DataType::Decimal128(precision, scale) => Decimal128( - VariantToDecimalArrowRowBuilder::new(cast_options, capacity, *precision, *scale)?, - ), - DataType::Decimal256(precision, scale) => Decimal256( - VariantToDecimalArrowRowBuilder::new(cast_options, capacity, *precision, *scale)?, - ), - DataType::Date32 => Date32(VariantToPrimitiveArrowRowBuilder::new( - cast_options, - capacity, - )), - DataType::Date64 => Date64(VariantToPrimitiveArrowRowBuilder::new( - cast_options, - capacity, - )), - DataType::Time32(TimeUnit::Second) => Time32Second( - VariantToPrimitiveArrowRowBuilder::new(cast_options, capacity), - ), - DataType::Time32(TimeUnit::Millisecond) => Time32Milli( - VariantToPrimitiveArrowRowBuilder::new(cast_options, capacity), - ), - DataType::Time32(t) => { - return Err(ArrowError::InvalidArgumentError(format!( - "The unit for Time32 must be second/millisecond, received {t:?}" - ))); - } - DataType::Time64(TimeUnit::Microsecond) => Time64Micro( - VariantToPrimitiveArrowRowBuilder::new(cast_options, capacity), - ), - DataType::Time64(TimeUnit::Nanosecond) => Time64Nano( - VariantToPrimitiveArrowRowBuilder::new(cast_options, capacity), - ), - DataType::Time64(t) => { - return 
Err(ArrowError::InvalidArgumentError(format!( - "The unit for Time64 must be micro/nano seconds, received {t:?}" - ))); - } - DataType::Timestamp(TimeUnit::Second, None) => TimestampSecondNtz( - VariantToTimestampNtzArrowRowBuilder::new(cast_options, capacity), - ), - DataType::Timestamp(TimeUnit::Second, tz) => TimestampSecond( - VariantToTimestampArrowRowBuilder::new(cast_options, capacity, tz.clone()), - ), - DataType::Timestamp(TimeUnit::Millisecond, None) => TimestampMilliNtz( - VariantToTimestampNtzArrowRowBuilder::new(cast_options, capacity), - ), - DataType::Timestamp(TimeUnit::Millisecond, tz) => TimestampMilli( - VariantToTimestampArrowRowBuilder::new(cast_options, capacity, tz.clone()), - ), - DataType::Timestamp(TimeUnit::Microsecond, None) => TimestampMicroNtz( - VariantToTimestampNtzArrowRowBuilder::new(cast_options, capacity), - ), - DataType::Timestamp(TimeUnit::Microsecond, tz) => TimestampMicro( - VariantToTimestampArrowRowBuilder::new(cast_options, capacity, tz.clone()), - ), - DataType::Timestamp(TimeUnit::Nanosecond, None) => TimestampNanoNtz( - VariantToTimestampNtzArrowRowBuilder::new(cast_options, capacity), - ), - DataType::Timestamp(TimeUnit::Nanosecond, tz) => TimestampNano( - VariantToTimestampArrowRowBuilder::new(cast_options, capacity, tz.clone()), - ), - DataType::Duration(_) | DataType::Interval(_) => { - return Err(ArrowError::InvalidArgumentError( - "Casting Variant to duration/interval types is not supported. 
\ + let builder = match data_type { + DataType::Null => Null(VariantToNullArrowRowBuilder::new(cast_options, capacity)), + DataType::Boolean => Boolean(VariantToBooleanArrowRowBuilder::new( + cast_options, + capacity, + shred, + )), + DataType::Int8 => Int8(VariantToPrimitiveArrowRowBuilder::new( + cast_options, + capacity, + shred, + )), + DataType::Int16 => Int16(VariantToPrimitiveArrowRowBuilder::new( + cast_options, + capacity, + shred, + )), + DataType::Int32 => Int32(VariantToPrimitiveArrowRowBuilder::new( + cast_options, + capacity, + shred, + )), + DataType::Int64 => Int64(VariantToPrimitiveArrowRowBuilder::new( + cast_options, + capacity, + shred, + )), + DataType::UInt8 => UInt8(VariantToPrimitiveArrowRowBuilder::new( + cast_options, + capacity, + shred, + )), + DataType::UInt16 => UInt16(VariantToPrimitiveArrowRowBuilder::new( + cast_options, + capacity, + shred, + )), + DataType::UInt32 => UInt32(VariantToPrimitiveArrowRowBuilder::new( + cast_options, + capacity, + shred, + )), + DataType::UInt64 => UInt64(VariantToPrimitiveArrowRowBuilder::new( + cast_options, + capacity, + shred, + )), + DataType::Float16 => Float16(VariantToPrimitiveArrowRowBuilder::new( + cast_options, + capacity, + shred, + )), + DataType::Float32 => Float32(VariantToPrimitiveArrowRowBuilder::new( + cast_options, + capacity, + shred, + )), + DataType::Float64 => Float64(VariantToPrimitiveArrowRowBuilder::new( + cast_options, + capacity, + shred, + )), + DataType::Decimal32(precision, scale) => Decimal32(VariantToDecimalArrowRowBuilder::new( + cast_options, + capacity, + *precision, + *scale, + shred, + )?), + DataType::Decimal64(precision, scale) => Decimal64(VariantToDecimalArrowRowBuilder::new( + cast_options, + capacity, + *precision, + *scale, + shred, + )?), + DataType::Decimal128(precision, scale) => Decimal128(VariantToDecimalArrowRowBuilder::new( + cast_options, + capacity, + *precision, + *scale, + shred, + )?), + DataType::Decimal256(precision, scale) => 
Decimal256(VariantToDecimalArrowRowBuilder::new( + cast_options, + capacity, + *precision, + *scale, + shred, + )?), + DataType::Date32 => Date32(VariantToPrimitiveArrowRowBuilder::new( + cast_options, + capacity, + shred, + )), + DataType::Date64 => Date64(VariantToPrimitiveArrowRowBuilder::new( + cast_options, + capacity, + shred, + )), + DataType::Time32(TimeUnit::Second) => Time32Second(VariantToPrimitiveArrowRowBuilder::new( + cast_options, + capacity, + shred, + )), + DataType::Time32(TimeUnit::Millisecond) => Time32Milli( + VariantToPrimitiveArrowRowBuilder::new(cast_options, capacity, shred), + ), + DataType::Time32(t) => { + return Err(ArrowError::InvalidArgumentError(format!( + "The unit for Time32 must be second/millisecond, received {t:?}" + ))); + } + DataType::Time64(TimeUnit::Microsecond) => Time64Micro( + VariantToPrimitiveArrowRowBuilder::new(cast_options, capacity, shred), + ), + DataType::Time64(TimeUnit::Nanosecond) => Time64Nano( + VariantToPrimitiveArrowRowBuilder::new(cast_options, capacity, shred), + ), + DataType::Time64(t) => { + return Err(ArrowError::InvalidArgumentError(format!( + "The unit for Time64 must be micro/nano seconds, received {t:?}" + ))); + } + DataType::Timestamp(TimeUnit::Second, None) => TimestampSecondNtz( + VariantToTimestampNtzArrowRowBuilder::new(cast_options, capacity, shred), + ), + DataType::Timestamp(TimeUnit::Second, tz) => TimestampSecond( + VariantToTimestampArrowRowBuilder::new(cast_options, capacity, shred, tz.clone()), + ), + DataType::Timestamp(TimeUnit::Millisecond, None) => TimestampMilliNtz( + VariantToTimestampNtzArrowRowBuilder::new(cast_options, capacity, shred), + ), + DataType::Timestamp(TimeUnit::Millisecond, tz) => TimestampMilli( + VariantToTimestampArrowRowBuilder::new(cast_options, capacity, shred, tz.clone()), + ), + DataType::Timestamp(TimeUnit::Microsecond, None) => TimestampMicroNtz( + VariantToTimestampNtzArrowRowBuilder::new(cast_options, capacity, shred), + ), + 
DataType::Timestamp(TimeUnit::Microsecond, tz) => TimestampMicro( + VariantToTimestampArrowRowBuilder::new(cast_options, capacity, shred, tz.clone()), + ), + DataType::Timestamp(TimeUnit::Nanosecond, None) => TimestampNanoNtz( + VariantToTimestampNtzArrowRowBuilder::new(cast_options, capacity, shred), + ), + DataType::Timestamp(TimeUnit::Nanosecond, tz) => TimestampNano( + VariantToTimestampArrowRowBuilder::new(cast_options, capacity, shred, tz.clone()), + ), + DataType::Duration(_) | DataType::Interval(_) => { + return Err(ArrowError::InvalidArgumentError( + "Casting Variant to duration/interval types is not supported. \ The Variant format does not define duration/interval types." - .to_string(), - )); - } - DataType::Binary => Binary(VariantToBinaryArrowRowBuilder::new(cast_options, capacity)), - DataType::LargeBinary => { - LargeBinary(VariantToBinaryArrowRowBuilder::new(cast_options, capacity)) - } - DataType::BinaryView => { - BinaryView(VariantToBinaryArrowRowBuilder::new(cast_options, capacity)) - } - DataType::FixedSizeBinary(16) => { - Uuid(VariantToUuidArrowRowBuilder::new(cast_options, capacity)) - } - DataType::FixedSizeBinary(_) => { - return Err(ArrowError::NotYetImplemented(format!( - "DataType {data_type:?} not yet implemented" - ))); - } - DataType::Utf8 => String(VariantToStringArrowBuilder::new(cast_options, capacity)), - DataType::LargeUtf8 => { - LargeString(VariantToStringArrowBuilder::new(cast_options, capacity)) - } - DataType::Utf8View => { - StringView(VariantToStringArrowBuilder::new(cast_options, capacity)) - } - DataType::List(_) - | DataType::LargeList(_) - | DataType::ListView(_) - | DataType::LargeListView(_) - | DataType::FixedSizeList(..) - | DataType::Struct(_) - | DataType::Map(..) - | DataType::Union(..) - | DataType::Dictionary(..) - | DataType::RunEndEncoded(..) 
=> { - return Err(ArrowError::InvalidArgumentError(format!( - "Casting to {data_type:?} is not applicable for primitive Variant types" - ))); - } - }; + .to_string(), + )); + } + DataType::Binary => Binary(VariantToBinaryArrowRowBuilder::new(cast_options, capacity)), + DataType::LargeBinary => { + LargeBinary(VariantToBinaryArrowRowBuilder::new(cast_options, capacity)) + } + DataType::BinaryView => { + BinaryView(VariantToBinaryArrowRowBuilder::new(cast_options, capacity)) + } + DataType::FixedSizeBinary(16) => { + Uuid(VariantToUuidArrowRowBuilder::new(cast_options, capacity)) + } + DataType::FixedSizeBinary(_) => { + return Err(ArrowError::NotYetImplemented(format!( + "DataType {data_type:?} not yet implemented" + ))); + } + DataType::Utf8 => String(VariantToStringArrowBuilder::new(cast_options, capacity)), + DataType::LargeUtf8 => { + LargeString(VariantToStringArrowBuilder::new(cast_options, capacity)) + } + DataType::Utf8View => StringView(VariantToStringArrowBuilder::new(cast_options, capacity)), + DataType::List(_) + | DataType::LargeList(_) + | DataType::ListView(_) + | DataType::LargeListView(_) + | DataType::FixedSizeList(..) + | DataType::Struct(_) + | DataType::Map(..) + | DataType::Union(..) + | DataType::Dictionary(..) + | DataType::RunEndEncoded(..) => { + return Err(ArrowError::InvalidArgumentError(format!( + "Casting to {data_type:?} is not applicable for primitive Variant types" + ))); + } + }; Ok(builder) } @@ -590,6 +628,7 @@ impl<'a> StructVariantToArrowRowBuilder<'a> { field.data_type(), cast_options, capacity, + false, )?); } Ok(Self { @@ -760,11 +799,12 @@ impl<'a> VariantPathRowBuilder<'a> { macro_rules! define_variant_to_primitive_builder { (struct $name:ident<$lifetime:lifetime $(, $generic:ident: $bound:path )?> |$array_param:ident $(, $field:ident: $field_type:ty)?| -> $builder_name:ident $(< $array_type:ty >)? 
{ $init_expr: expr }, - |$value: ident| $value_transform:expr, + |$value: ident $(, $shred: ident)?| $value_transform:expr, type_name: $type_name:expr) => { pub(crate) struct $name<$lifetime $(, $generic : $bound )?> { builder: $builder_name $(<$array_type>)?, + $($shred: bool,)? cast_options: &$lifetime CastOptions<$lifetime>, } @@ -772,12 +812,14 @@ macro_rules! define_variant_to_primitive_builder { fn new( cast_options: &$lifetime CastOptions<$lifetime>, $array_param: usize, + $($shred: bool,)? // add this so that $init_expr can use it $( $field: $field_type, )? ) -> Self { Self { builder: $init_expr, cast_options, + $($shred)? } } @@ -787,6 +829,7 @@ macro_rules! define_variant_to_primitive_builder { } fn append_value(&mut self, $value: &Variant<'_, '_>) -> Result { + $(let $shred: bool = self.shred;)? match variant_cast_with_options( $value, self.cast_options, @@ -831,21 +874,21 @@ define_variant_to_primitive_builder!( define_variant_to_primitive_builder!( struct VariantToBooleanArrowRowBuilder<'a> |capacity| -> BooleanBuilder { BooleanBuilder::with_capacity(capacity) }, - |value| value.as_boolean(), + |value, shred| variant_to_boolean(value, shred), type_name: datatypes::BooleanType::DATA_TYPE ); define_variant_to_primitive_builder!( struct VariantToPrimitiveArrowRowBuilder<'a, T:PrimitiveFromVariant> |capacity| -> PrimitiveBuilder { PrimitiveBuilder::::with_capacity(capacity) }, - |value| T::from_variant(value), + |value, shred| T::from_variant(value, shred), type_name: T::DATA_TYPE ); define_variant_to_primitive_builder!( struct VariantToTimestampNtzArrowRowBuilder<'a, T:TimestampFromVariant> |capacity| -> PrimitiveBuilder { PrimitiveBuilder::::with_capacity(capacity) }, - |value| T::from_variant(value), + |value, shred| T::from_variant(value, shred), type_name: T::DATA_TYPE ); @@ -854,7 +897,7 @@ define_variant_to_primitive_builder!( |capacity, tz: Option> | -> PrimitiveBuilder { PrimitiveBuilder::::with_capacity(capacity).with_timezone_opt(tz) }, - 
|value| T::from_variant(value), + |value, shred| T::from_variant(value, shred), type_name: T::DATA_TYPE ); @@ -875,11 +918,12 @@ where cast_options: &'a CastOptions<'a>, precision: u8, scale: i8, + shred: bool, } impl<'a, T> VariantToDecimalArrowRowBuilder<'a, T> where - T: DecimalType, + T: ShredDecimalVariant, T::Native: DecimalCast, { fn new( @@ -887,6 +931,7 @@ where capacity: usize, precision: u8, scale: i8, + shred: bool, ) -> Result { let builder = PrimitiveBuilder::::with_capacity(capacity) .with_precision_and_scale(precision, scale)?; @@ -895,6 +940,7 @@ where cast_options, precision, scale, + shred, }) } @@ -904,8 +950,9 @@ where } fn append_value(&mut self, value: &Variant<'_, '_>) -> Result { - match variant_cast_with_options(value, self.cast_options, |value| { - variant_to_unscaled_decimal::(value, self.precision, self.scale) + match variant_cast_with_options(value, self.cast_options, |value| match self.shred { + true => shred_variant_to_unscaled_decimal::(value), + false => variant_to_unscaled_decimal::(value, self.precision, self.scale), }) { Ok(Some(scaled)) => { self.builder.append_value(scaled); @@ -1046,11 +1093,16 @@ where cast_options, capacity, NullValue::ArrayElement, + shredded, )?; ListElementBuilder::Shredded(Box::new(builder)) } else { - let builder = - make_typed_variant_to_arrow_row_builder(element_data_type, cast_options, capacity)?; + let builder = make_typed_variant_to_arrow_row_builder( + element_data_type, + cast_options, + capacity, + shredded, + )?; ListElementBuilder::Typed(Box::new(builder)) }; @@ -1151,11 +1203,16 @@ impl<'a> VariantToFixedSizeListArrowRowBuilder<'a> { cast_options, capacity, NullValue::ArrayElement, + shredded, )?; ListElementBuilder::Shredded(Box::new(builder)) } else { - let builder = - make_typed_variant_to_arrow_row_builder(element_data_type, cast_options, capacity)?; + let builder = make_typed_variant_to_arrow_row_builder( + element_data_type, + cast_options, + capacity, + shredded, + )?; 
ListElementBuilder::Typed(Box::new(builder)) }; Ok(Self { @@ -1336,11 +1393,15 @@ mod tests { ]; for data_type in non_primitive_types { - let err = - match make_primitive_variant_to_arrow_row_builder(&data_type, &cast_options, 1) { - Ok(_) => panic!("non-primitive type {data_type:?} should be rejected"), - Err(err) => err, - }; + let err = match make_primitive_variant_to_arrow_row_builder( + &data_type, + &cast_options, + 1, + false, + ) { + Ok(_) => panic!("non-primitive type {data_type:?} should be rejected"), + Err(err) => err, + }; match err { ArrowError::InvalidArgumentError(msg) => { @@ -1358,7 +1419,7 @@ mod tests { ..Default::default() }; let mut builder = - make_primitive_variant_to_arrow_row_builder(&DataType::Int32, &cast_options, 2) + make_primitive_variant_to_arrow_row_builder(&DataType::Int32, &cast_options, 2, false) .unwrap(); assert!(!builder.append_value(&Variant::Null).unwrap()); @@ -1380,6 +1441,7 @@ mod tests { &DataType::Decimal32(9, 2), &cast_options, 2, + false, ) .unwrap(); let decimal_variant: Variant<'_, '_> = VariantDecimal4::try_new(1234, 2).unwrap().into(); @@ -1403,6 +1465,7 @@ mod tests { &DataType::FixedSizeBinary(16), &cast_options, 2, + false, ) .unwrap(); let uuid = Uuid::nil(); @@ -1428,7 +1491,7 @@ mod tests { let list_type = DataType::List(Arc::new(Field::new("item", DataType::Int64, true))); let mut list_builder = - make_typed_variant_to_arrow_row_builder(&list_type, &cast_options, 1).unwrap(); + make_typed_variant_to_arrow_row_builder(&list_type, &cast_options, 1, false).unwrap(); assert!(!list_builder.append_value(Variant::Null).unwrap()); let list_array = list_builder.finish().unwrap(); let list_array = list_array.as_any().downcast_ref::().unwrap(); @@ -1437,7 +1500,7 @@ mod tests { let struct_type = DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int32, true)])); let mut struct_builder = - make_typed_variant_to_arrow_row_builder(&struct_type, &cast_options, 1).unwrap(); + 
make_typed_variant_to_arrow_row_builder(&struct_type, &cast_options, 1, false).unwrap(); assert!(!struct_builder.append_value(Variant::Null).unwrap()); let struct_array = struct_builder.finish().unwrap(); let struct_array = struct_array.as_any().downcast_ref::().unwrap(); diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index c9f175c3a610..07ca4542bd83 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -29,18 +29,10 @@ use crate::decoder::{ }; use crate::path::{VariantPath, VariantPathElement}; use crate::utils::{first_byte_from_slice, slice_from_slice}; -use arrow::array::ArrowNativeTypeOp; -use arrow::compute::{ - DecimalCast, cast_num_to_bool, cast_single_string_to_boolean_default, num_cast, - parse_string_to_decimal_native, single_bool_to_numeric, single_decimal_to_float_lossy, - single_float_to_decimal, -}; -use arrow::datatypes::{Decimal32Type, Decimal64Type, Decimal128Type, DecimalType}; +use std::ops::Deref; use arrow_schema::ArrowError; use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Timelike, Utc}; -use num_traits::NumCast; -use std::ops::Deref; mod decimal; mod list; @@ -159,25 +151,6 @@ impl Deref for ShortString<'_> { /// [specification]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md /// [Variant Shredding specification]: https://github.com/apache/parquet-format/blob/master/VariantShredding.md /// -/// # Casting Semantics -/// -/// Scalar conversion semantics intentionally follow Arrow cast behavior where applicable. -/// Conversions in this module delegate to Arrow compute cast helpers such as -/// [`num_cast`], [`cast_num_to_bool`], [`single_bool_to_numeric`], and -/// [`cast_single_string_to_boolean_default`]. -/// -/// - [`Self::as_boolean`] accepts boolean, numeric, and string variants. -/// Numeric zero maps to `false`; non-zero maps to `true`. String parsing follows -/// Arrow UTF8-to-boolean cast rules. 
-/// - Numeric accessors such as [`Self::as_int8`], [`Self::as_int64`], [`Self::as_u8`], -/// [`Self::as_u64`], [`Self::as_f16`], [`Self::as_f32`], and [`Self::as_f64`] accept -/// boolean and numeric variants (integers, floating-point, and decimals). -/// They return `None` when conversion is not possible. -/// - Decimal accessors such as [`Self::as_decimal4`], [`Self::as_decimal8`], and -/// [`Self::as_decimal16`] accept compatible decimal variants, integer variants, -/// float variants and string variants. -/// They return `None` when conversion is not possible. -/// /// # Examples: /// /// ## Creating `Variant` from Rust Types @@ -305,35 +278,6 @@ const _: () = crate::utils::expect_size_of::(80); #[cfg(target_pointer_width = "32")] const _: () = crate::utils::expect_size_of::(48); -enum NumericKind { - Integer, - Float, -} - -trait DecimalCastTarget: NumCast + Default { - const KIND: NumericKind; -} - -macro_rules! impl_decimal_cast_target { - ($raw_type: ident, $target_kind:expr) => { - impl DecimalCastTarget for $raw_type { - const KIND: NumericKind = $target_kind; - } - }; -} - -impl_decimal_cast_target!(i8, NumericKind::Integer); -impl_decimal_cast_target!(i16, NumericKind::Integer); -impl_decimal_cast_target!(i32, NumericKind::Integer); -impl_decimal_cast_target!(i64, NumericKind::Integer); -impl_decimal_cast_target!(u8, NumericKind::Integer); -impl_decimal_cast_target!(u16, NumericKind::Integer); -impl_decimal_cast_target!(u32, NumericKind::Integer); -impl_decimal_cast_target!(u64, NumericKind::Integer); -impl_decimal_cast_target!(f16, NumericKind::Float); -impl_decimal_cast_target!(f32, NumericKind::Float); -impl_decimal_cast_target!(f64, NumericKind::Float); - impl<'m, 'v> Variant<'m, 'v> { /// Attempts to interpret a metadata and value buffer pair as a new `Variant`. /// @@ -536,7 +480,7 @@ impl<'m, 'v> Variant<'m, 'v> { /// Converts this variant to a `bool` if possible. 
/// - /// Returns `Some(bool)` for boolean, numeric and string variants, + /// Returns `Some(bool)` for boolean variants, /// `None` for non-boolean variants. /// /// # Examples @@ -552,30 +496,14 @@ impl<'m, 'v> Variant<'m, 'v> { /// let v2 = Variant::from(false); /// assert_eq!(v2.as_boolean(), Some(false)); /// - /// // and a numeric variant - /// let v3 = Variant::from(3); - /// assert_eq!(v3.as_boolean(), Some(true)); - /// - /// // and a string variant - /// let v4 = Variant::from("true"); - /// assert_eq!(v4.as_boolean(), Some(true)); - /// /// // but not from other variants - /// let v5 = Variant::from("hello!"); - /// assert_eq!(v5.as_boolean(), None); + /// let v3 = Variant::from("hello!"); + /// assert_eq!(v3.as_boolean(), None); /// ``` pub fn as_boolean(&self) -> Option { match self { Variant::BooleanTrue => Some(true), Variant::BooleanFalse => Some(false), - Variant::Int8(i) => Some(cast_num_to_bool(*i)), - Variant::Int16(i) => Some(cast_num_to_bool(*i)), - Variant::Int32(i) => Some(cast_num_to_bool(*i)), - Variant::Int64(i) => Some(cast_num_to_bool(*i)), - Variant::Float(f) => Some(cast_num_to_bool(*f)), - Variant::Double(d) => Some(cast_num_to_bool(*d)), - Variant::ShortString(s) => cast_single_string_to_boolean_default(s.as_str()), - Variant::String(s) => cast_single_string_to_boolean_default(s), _ => None, } } @@ -680,7 +608,7 @@ impl<'m, 'v> Variant<'m, 'v> { /// Converts this variant to a `DateTime` if possible. /// - /// Returns `Some(DateTime)` for timestamp variants, + /// Returns `Some(DateTime)` for timestamp nano variants, /// `None` for other variants. 
/// /// # Examples @@ -698,30 +626,20 @@ impl<'m, 'v> Variant<'m, 'v> { /// let v1 = Variant::from(datetime); /// assert_eq!(v1.as_timestamp_nanos(), Some(datetime)); /// - /// // or from UTC-adjusted microsecond-precision variant - /// let datetime_micros = NaiveDate::from_ymd_opt(2025, 8, 14) - /// .unwrap() - /// .and_hms_milli_opt(12, 33, 54, 123) - /// .unwrap() - /// .and_utc(); - /// // this will convert to `Variant::TimestampMicros`. - /// let v2 = Variant::from(datetime_micros); - /// assert_eq!(v2.as_timestamp_nanos(), Some(datetime_micros)); - /// /// // but not for other variants. - /// let v3 = Variant::from("hello!"); - /// assert_eq!(v3.as_timestamp_nanos(), None); + /// let v2 = Variant::from("hello!"); + /// assert_eq!(v2.as_timestamp_nanos(), None); /// ``` pub fn as_timestamp_nanos(&self) -> Option> { match *self { - Variant::TimestampNanos(d) | Variant::TimestampMicros(d) => Some(d), + Variant::TimestampNanos(d) => Some(d), _ => None, } } /// Converts this variant to a `NaiveDateTime` if possible. /// - /// Returns `Some(NaiveDateTime)` for timestamp variants, + /// Returns `Some(NaiveDateTime)` for timestamp nano variants, /// `None` for other variants. /// /// # Examples @@ -738,22 +656,13 @@ impl<'m, 'v> Variant<'m, 'v> { /// let v1 = Variant::from(datetime); /// assert_eq!(v1.as_timestamp_ntz_nanos(), Some(datetime)); /// - /// // or from a microsecond-precision non-UTC-adjusted variant - /// let datetime_micros = NaiveDate::from_ymd_opt(2025, 8, 14) - /// .unwrap() - /// .and_hms_milli_opt(12, 33, 54, 123) - /// .unwrap(); - /// // this will convert to `Variant::TimestampMicros`. - /// let v2 = Variant::from(datetime_micros); - /// assert_eq!(v2.as_timestamp_ntz_nanos(), Some(datetime_micros)); - /// /// // but not for other variants. 
- /// let v3 = Variant::from("hello!"); - /// assert_eq!(v3.as_timestamp_ntz_nanos(), None); + /// let v2 = Variant::from("hello!"); + /// assert_eq!(v2.as_timestamp_ntz_nanos(), None); /// ``` pub fn as_timestamp_ntz_nanos(&self) -> Option { match *self { - Variant::TimestampNtzNanos(d) | Variant::TimestampNtzMicros(d) => Some(d), + Variant::TimestampNtzNanos(d) => Some(d), _ => None, } } @@ -837,164 +746,81 @@ impl<'m, 'v> Variant<'m, 'v> { } } - fn cast_decimal_to_num(raw: D::Native, scale: u8, as_float: F) -> Option - where - D: DecimalType, - D::Native: NumCast + ArrowNativeTypeOp, - T: DecimalCastTarget, - F: Fn(D::Native) -> f64, - { - let base: D::Native = NumCast::from(10)?; - - let div = base.pow_checked(>::from(scale)).ok()?; - match T::KIND { - NumericKind::Integer => raw - .div_checked(div) - .ok() - .and_then(::from::), - NumericKind::Float => T::from(single_decimal_to_float_lossy::( - &as_float, - raw, - >::from(scale), - )), - } - } - - /// Converts a boolean or numeric variant(integers, floating-point, and decimals) - /// to the specified numeric type `T`. + /// Converts this variant to an `i8`. /// - /// Uses Arrow's casting logic to perform the conversion. Returns `Some(T)` if - /// the conversion succeeds, `None` if the variant can't be casted to type `T`. 
- fn as_num(&self) -> Option - where - T: DecimalCastTarget, - { - match *self { - Variant::BooleanFalse => single_bool_to_numeric(false), - Variant::BooleanTrue => single_bool_to_numeric(true), - Variant::Int8(i) => num_cast(i), - Variant::Int16(i) => num_cast(i), - Variant::Int32(i) => num_cast(i), - Variant::Int64(i) => num_cast(i), - Variant::Float(f) => num_cast(f), - Variant::Double(d) => num_cast(d), - Variant::Decimal4(d) => { - Self::cast_decimal_to_num::(d.integer(), d.scale(), |x| { - x as f64 - }) - } - Variant::Decimal8(d) => { - Self::cast_decimal_to_num::(d.integer(), d.scale(), |x| { - x as f64 - }) - } - Variant::Decimal16(d) => { - Self::cast_decimal_to_num::(d.integer(), d.scale(), |x| { - x as f64 - }) - } - _ => None, - } - } - - /// Converts this variant to an `i8` if possible. - /// - /// Returns `Some(i8)` for boolean and numeric variants(integers, floating-point, - /// and decimals with scale 0) that fit in `i8` range, - /// `None` for other variants or values that would overflow. + /// Returns `Some(i8)` for Variant::Int8, `None` for other variants. 
/// /// # Examples /// /// ``` /// use parquet_variant::Variant; /// - /// // you can read an int64 variant into an i8 if it fits - /// let v1 = Variant::from(123i64); + /// // you can read an i8 variant into int8 + /// let v1 = Variant::from(123i8); /// assert_eq!(v1.as_int8(), Some(123i8)); /// - /// // or from boolean variant - /// let v2 = Variant::BooleanFalse; - /// assert_eq!(v2.as_int8(), Some(0)); - /// - /// // but not if it would overflow - /// let v3 = Variant::from(1234i64); - /// assert_eq!(v3.as_int8(), None); - /// - /// // or if the variant cannot be cast into an integer - /// let v4 = Variant::from("hello!"); - /// assert_eq!(v4.as_int8(), None); + /// // but not for other variants + /// let v2 = Variant::from(256i64); + /// assert_eq!(v2.as_int8(), None); /// ``` pub fn as_int8(&self) -> Option { - self.as_num() + match *self { + Variant::Int8(i) => Some(i), + _ => None, + } } - /// Converts this variant to an `i16` if possible. + /// Converts this variant to an `i16`. /// - /// Returns `Some(i16)` for boolean and numeric variants(integers, floating-point, - /// and decimals with scale 0) that fit in `i16` range - /// `None` for other variants or values that would overflow. + /// Returns `Some(i16)` for Variant::Int16, `None` for other variants. 
/// /// # Examples /// /// ``` /// use parquet_variant::Variant; /// - /// // you can read an int64 variant into an i16 if it fits - /// let v1 = Variant::from(123i64); + /// // you can read an i16 variant into int16 + /// let v1 = Variant::from(123i16); /// assert_eq!(v1.as_int16(), Some(123i16)); /// - /// // or from boolean variant - /// let v2 = Variant::BooleanFalse; - /// assert_eq!(v2.as_int16(), Some(0)); - /// - /// // but not if it would overflow - /// let v3 = Variant::from(123456i64); - /// assert_eq!(v3.as_int16(), None); - /// - /// // or if the variant cannot be cast into an integer - /// let v4 = Variant::from("hello!"); - /// assert_eq!(v4.as_int16(), None); + /// // but not for other variants + /// let v2 = Variant::from(1234i64); + /// assert_eq!(v2.as_int16(), None); /// ``` pub fn as_int16(&self) -> Option { - self.as_num() + match *self { + Variant::Int16(i) => Some(i), + _ => None, + } } - /// Converts this variant to an `i32` if possible. + /// Converts this variant to an `i32`. /// - /// Returns `Some(i32)` for boolean and numeric variants(integers, floating-point, - /// and decimals with scale 0) that fit in `i32` range - /// `None` for other variants or values that would overflow. + /// Returns `Some(i32)` for Variant::Int32, `None` for other variants. 
/// /// # Examples /// /// ``` /// use parquet_variant::Variant; /// - /// // you can read an int64 variant into an i32 if it fits - /// let v1 = Variant::from(123i64); + /// // you can read an int32 variant into an i32 + /// let v1 = Variant::from(123i32); /// assert_eq!(v1.as_int32(), Some(123i32)); /// - /// // or from boolean variant - /// let v2 = Variant::BooleanFalse; - /// assert_eq!(v2.as_int32(), Some(0)); - /// - /// // but not if it would overflow - /// let v3 = Variant::from(12345678901i64); - /// assert_eq!(v3.as_int32(), None); - /// - /// // or if the variant cannot be cast into an integer - /// let v4 = Variant::from("hello!"); - /// assert_eq!(v4.as_int32(), None); + /// // but not from other variants + /// let v2 = Variant::from(1231i64); + /// assert_eq!(v2.as_int32(), None); /// ``` pub fn as_int32(&self) -> Option { - self.as_num() + match *self { + Variant::Int32(i) => Some(i), + _ => None, + } } - /// Converts this variant to an `i64` if possible. + /// Converts this variant to an `i64`. /// - /// Returns `Some(i64)` for boolean and numeric variants(integers, floating-point, - /// and decimals with scale 0) that fit in `i64` range - /// `None` for other variants or values that would overflow. + /// Returns `Some(i64)` for Variant::Int64, `None` for other variants. /// /// # Examples /// @@ -1005,104 +831,78 @@ impl<'m, 'v> Variant<'m, 'v> { /// let v1 = Variant::from(123i64); /// assert_eq!(v1.as_int64(), Some(123i64)); /// - /// // or from boolean variant - /// let v2 = Variant::BooleanFalse; - /// assert_eq!(v2.as_int64(), Some(0)); - /// /// // but not a variant that cannot be cast into an integer - /// let v3 = Variant::from("hello!"); - /// assert_eq!(v3.as_int64(), None); + /// let v2 = Variant::from("hello!"); + /// assert_eq!(v2.as_int64(), None); /// ``` pub fn as_int64(&self) -> Option { - self.as_num() + match *self { + Variant::Int64(i) => Some(i), + _ => None, + } } /// Converts this variant to a `u8` if possible. 
/// - /// Returns `Some(u8)` for boolean and numeric variants(integers, floating-point, - /// and decimals with scale 0) that fit in `u8` range + /// Returns `Some(u8)` for int8 variants that fit in `u8`, /// `None` for other variants or values that would overflow. /// /// # Examples /// /// ``` - /// use parquet_variant::{Variant, VariantDecimal4}; + /// use parquet_variant::Variant; /// /// // you can read an int64 variant into an u8 - /// let v1 = Variant::from(123i64); + /// let v1 = Variant::from(123i8); /// assert_eq!(v1.as_u8(), Some(123u8)); /// - /// // or a Decimal4 with scale 0 into u8 - /// let d = VariantDecimal4::try_new(26, 0).unwrap(); - /// let v2 = Variant::from(d); - /// assert_eq!(v2.as_u8(), Some(26u8)); - /// - /// // or a variant that decimal with scale not equal to zero - /// let d = VariantDecimal4::try_new(123, 2).unwrap(); - /// let v3 = Variant::from(d); - /// assert_eq!(v3.as_u8(), Some(1)); - /// - /// // or from boolean variant - /// let v4 = Variant::BooleanFalse; - /// assert_eq!(v4.as_u8(), Some(0)); - /// /// // but not a variant that can't fit into the range - /// let v5 = Variant::from(-1); - /// assert_eq!(v5.as_u8(), None); + /// let v2 = Variant::from(-1); + /// assert_eq!(v2.as_u8(), None); /// - /// // or not a variant that cannot be cast into an integer - /// let v6 = Variant::from("hello!"); - /// assert_eq!(v6.as_u8(), None); + /// // or not an int8 variant + /// let v3 = Variant::from(123i64); + /// assert_eq!(v3.as_u8(), None); /// ``` pub fn as_u8(&self) -> Option { - self.as_num() + match *self { + Variant::Int8(i) => i.try_into().ok(), + _ => None, + } } /// Converts this variant to an `u16` if possible. /// - /// Returns `Some(u16)` for boolean and numeric variants(integers, floating-point, - /// and decimals with scale 0) that fit in `u16` range + /// Returns `Some(u16)` for int16 variants that fit in `u16`, /// `None` for other variants or values that would overflow. 
/// /// # Examples /// /// ``` - /// use parquet_variant::{Variant, VariantDecimal4}; + /// use parquet_variant::Variant; /// /// // you can read an int64 variant into an u16 - /// let v1 = Variant::from(123i64); + /// let v1 = Variant::from(123i16); /// assert_eq!(v1.as_u16(), Some(123u16)); /// - /// // or a Decimal4 with scale 0 into u8 - /// let d = VariantDecimal4::try_new(u16::MAX as i32, 0).unwrap(); - /// let v2 = Variant::from(d); - /// assert_eq!(v2.as_u16(), Some(u16::MAX)); - /// - /// // or a variant that decimal with scale not equal to zero - /// let d = VariantDecimal4::try_new(123, 2).unwrap(); - /// let v3 = Variant::from(d); - /// assert_eq!(v3.as_u16(), Some(1)); - /// - /// // or from boolean variant - /// let v4= Variant::BooleanFalse; - /// assert_eq!(v4.as_u16(), Some(0)); - /// /// // but not a variant that can't fit into the range - /// let v5 = Variant::from(-1); - /// assert_eq!(v5.as_u16(), None); + /// let v2 = Variant::from(-1i16); + /// assert_eq!(v2.as_u16(), None); /// - /// // or not a variant that cannot be cast into an integer - /// let v6 = Variant::from("hello!"); - /// assert_eq!(v6.as_u16(), None); + /// // or not an int16 variant + /// let v3 = Variant::from(123i8); + /// assert_eq!(v3.as_u16(), None); /// ``` pub fn as_u16(&self) -> Option { - self.as_num() + match *self { + Variant::Int16(i) => i.try_into().ok(), + _ => None, + } } /// Converts this variant to an `u32` if possible. /// - /// Returns `Some(u32)` for boolean and numeric variants(integers, floating-point, - /// and decimals with scale 0) that fit in `u32` range + /// Returns `Some(u32)` for int32 variants that fit in `u32`, /// `None` for other variants or values that would overflow. 
/// /// # Examples @@ -1111,40 +911,28 @@ impl<'m, 'v> Variant<'m, 'v> { /// use parquet_variant::{Variant, VariantDecimal8}; /// /// // you can read an int64 variant into an u32 - /// let v1 = Variant::from(123i64); + /// let v1 = Variant::from(123i32); /// assert_eq!(v1.as_u32(), Some(123u32)); /// - /// // or a Decimal4 with scale 0 into u8 - /// let d = VariantDecimal8::try_new(u32::MAX as i64, 0).unwrap(); - /// let v2 = Variant::from(d); - /// assert_eq!(v2.as_u32(), Some(u32::MAX)); - /// - /// // or a variant that decimal with scale not equal to zero - /// let d = VariantDecimal8::try_new(123, 2).unwrap(); - /// let v3 = Variant::from(d); - /// assert_eq!(v3.as_u32(), Some(1)); - /// - /// // or from boolean variant - /// let v4 = Variant::BooleanFalse; - /// assert_eq!(v4.as_u32(), Some(0)); - /// /// // but not a variant that can't fit into the range - /// let v5 = Variant::from(-1); - /// assert_eq!(v5.as_u32(), None); + /// let v2 = Variant::from(-1); + /// assert_eq!(v2.as_u32(), None); /// - /// // or not a variant that cannot be cast into an integer - /// let v6 = Variant::from("hello!"); - /// assert_eq!(v6.as_u32(), None); + /// // or not an int32 variant + /// let v3 = Variant::from(1234i64); + /// assert_eq!(v3.as_u32(), None); /// ``` pub fn as_u32(&self) -> Option { - self.as_num() + match *self { + Variant::Int32(i) => i.try_into().ok(), + _ => None, + } } /// Converts this variant to an `u64` if possible. /// - /// Returns `Some(u64)` for boolean and numeric variants(integers, floating-point, - /// and decimals with scale 0) that fit in `u64` range - /// `None` for other variants or values that would overflow. + /// Returns `Some(u64)` for int64 variants that fit in `u64`, + /// `None` for other variants or values that would overflow. 
/// /// # Examples /// @@ -1155,154 +943,77 @@ impl<'m, 'v> Variant<'m, 'v> { /// let v1 = Variant::from(123i64); /// assert_eq!(v1.as_u64(), Some(123u64)); /// - /// // or a Decimal16 with scale 0 into u8 - /// let d = VariantDecimal16::try_new(u64::MAX as i128, 0).unwrap(); - /// let v2 = Variant::from(d); - /// assert_eq!(v2.as_u64(), Some(u64::MAX)); + /// // but not a variant that can't fit into the range + /// let v2 = Variant::from(-1i64); + /// assert_eq!(v2.as_u64(), None); /// - /// // or a variant that decimal with scale not equal to zero - /// let d = VariantDecimal16::try_new(123, 2).unwrap(); + /// // or not a decimal variant + /// let d = VariantDecimal16::try_new(1, 0).unwrap(); /// let v3 = Variant::from(d); - /// assert_eq!(v3.as_u64(), Some(1)); - /// - /// // or from boolean variant - /// let v4 = Variant::BooleanFalse; - /// assert_eq!(v4.as_u64(), Some(0)); - /// - /// // but not a variant that can't fit into the range - /// let v5 = Variant::from(-1); - /// assert_eq!(v5.as_u64(), None); + /// assert_eq!(v3.as_u64(), None); /// /// // or not a variant that cannot be cast into an integer - /// let v6 = Variant::from("hello!"); - /// assert_eq!(v6.as_u64(), None); + /// let v4 = Variant::from("hello!"); + /// assert_eq!(v4.as_u64(), None); /// ``` pub fn as_u64(&self) -> Option { - self.as_num() - } - - fn convert_string_to_decimal(input: &str) -> Option - where - D: DecimalType, - VD: VariantDecimalType, - D::Native: NumCast + DecimalCast, - { - // find the last '.' - let scale_usize = input.rsplit_once('.').map_or(0, |(_, frac)| frac.len()); - - let scale = u8::try_from(scale_usize).ok()?; - - let raw = parse_string_to_decimal_native::(input, scale_usize).ok()?; - VD::try_new(raw, scale).ok() + match *self { + Variant::Int64(i) => i.try_into().ok(), + _ => None, + } } - /// Converts this variant to tuple with a 4-byte unscaled value if possible. + /// Converts this variant to tuple with a 4-byte unscaled value. 
/// - /// Returns `Some((i32, u8))` for decimal variants where the unscaled value - /// fits in `i32` range, - /// `None` for non-decimal variants or decimal values that would overflow. + /// Returns `Some((i32, u8))` for decimal4 variants, `None` for other variants. /// /// # Examples /// /// ``` /// use parquet_variant::{Variant, VariantDecimal4, VariantDecimal8}; /// - /// // you can extract decimal parts from smaller or equally-sized decimal variants + /// // you can read decimal4 variant into VariantDecimal4 /// let v1 = Variant::from(VariantDecimal4::try_new(1234_i32, 2).unwrap()); /// assert_eq!(v1.as_decimal4(), VariantDecimal4::try_new(1234_i32, 2).ok()); /// - /// // and from larger decimal variants if they fit + /// // but not from other variants /// let v2 = Variant::from(VariantDecimal8::try_new(1234_i64, 2).unwrap()); - /// assert_eq!(v2.as_decimal4(), VariantDecimal4::try_new(1234_i32, 2).ok()); - /// - /// // or from string variants if they can be parsed as decimals - /// let v3 = Variant::from("123.45"); - /// assert_eq!(v3.as_decimal4(), VariantDecimal4::try_new(12345, 2).ok()); - /// - /// // but not if the value would overflow i32 - /// let v4 = Variant::from(VariantDecimal8::try_new(12345678901i64, 2).unwrap()); - /// assert_eq!(v4.as_decimal4(), None); - /// - /// // or if the variant is not a decimal - /// let v5 = Variant::from("hello!"); - /// assert_eq!(v5.as_decimal4(), None); + /// assert_eq!(v2.as_decimal4(), None); /// ``` pub fn as_decimal4(&self) -> Option { match *self { - Variant::Int8(_) | Variant::Int16(_) | Variant::Int32(_) | Variant::Int64(_) => { - self.as_num::().and_then(|x| x.try_into().ok()) - } - Variant::Float(f) => single_float_to_decimal::(f as _, 1f64) - .and_then(|x: i32| x.try_into().ok()), - Variant::Double(f) => single_float_to_decimal::(f, 1f64) - .and_then(|x: i32| x.try_into().ok()), - Variant::String(v) => Self::convert_string_to_decimal::(v), - Variant::ShortString(v) => { - 
Self::convert_string_to_decimal::(v.as_str()) - } Variant::Decimal4(decimal4) => Some(decimal4), - Variant::Decimal8(decimal8) => decimal8.try_into().ok(), - Variant::Decimal16(decimal16) => decimal16.try_into().ok(), _ => None, } } - /// Converts this variant to tuple with an 8-byte unscaled value if possible. + /// Converts this variant to tuple with an 8-byte unscaled value. /// - /// Returns `Some((i64, u8))` for decimal variants where the unscaled value - /// fits in `i64` range, the scale will be 0 if the input is string variants. - /// `None` for non-decimal variants or decimal values that would overflow. + /// Returns `Some((i64, u8))` for decimal8 variants, `None` for other variants. /// /// # Examples /// /// ``` - /// use parquet_variant::{Variant, VariantDecimal4, VariantDecimal8, VariantDecimal16}; + /// use parquet_variant::{Variant, VariantDecimal4, VariantDecimal8}; /// - /// // you can extract decimal parts from smaller or equally-sized decimal variants - /// let v1 = Variant::from(VariantDecimal4::try_new(1234_i32, 2).unwrap()); + /// // you can read decimal8 variant into VariantDecimal8 + /// let v1 = Variant::from(VariantDecimal8::try_new(1234_i64, 2).unwrap()); /// assert_eq!(v1.as_decimal8(), VariantDecimal8::try_new(1234_i64, 2).ok()); /// - /// // and from larger decimal variants if they fit - /// let v2 = Variant::from(VariantDecimal16::try_new(1234_i128, 2).unwrap()); - /// assert_eq!(v2.as_decimal8(), VariantDecimal8::try_new(1234_i64, 2).ok()); - /// - /// // or from string variants if they can be parsed as decimals - /// let v3 = Variant::from("123.45"); - /// assert_eq!(v3.as_decimal8(), VariantDecimal8::try_new(12345, 2).ok()); - /// - /// // but not if the value would overflow i64 - /// let v4 = Variant::from(VariantDecimal16::try_new(2e19 as i128, 2).unwrap()); - /// assert_eq!(v4.as_decimal8(), None); - /// - /// // or if the variant is not a decimal - /// let v5 = Variant::from("hello!"); - /// assert_eq!(v5.as_decimal8(), None); 
+ /// // but not from other variants + /// let v2 = Variant::from(VariantDecimal4::try_new(1234_i32, 2).unwrap()); + /// assert_eq!(v2.as_decimal8(), None); /// ``` pub fn as_decimal8(&self) -> Option { match *self { - Variant::Int8(_) | Variant::Int16(_) | Variant::Int32(_) | Variant::Int64(_) => { - self.as_num::().and_then(|x| x.try_into().ok()) - } - Variant::Float(f) => single_float_to_decimal::(f as _, 1f64) - .and_then(|x: i64| x.try_into().ok()), - Variant::Double(f) => single_float_to_decimal::(f, 1f64) - .and_then(|x: i64| x.try_into().ok()), - Variant::String(v) => Self::convert_string_to_decimal::(v), - Variant::ShortString(v) => { - Self::convert_string_to_decimal::(v.as_str()) - } - Variant::Decimal4(decimal4) => Some(decimal4.into()), Variant::Decimal8(decimal8) => Some(decimal8), - Variant::Decimal16(decimal16) => decimal16.try_into().ok(), _ => None, } } - /// Converts this variant to tuple with a 16-byte unscaled value if possible. + /// Converts this variant to tuple with a 16-byte unscaled value. /// - /// Returns `Some((i128, u8))` for decimal variants where the unscaled value - /// fits in `i128` range, the scale will be 0 if the input is string variants. - /// `None` for non-decimal variants or decimal values that would overflow. + /// Returns `Some((i128, u8))` for decimal16 variants, `None` for other variants. 
/// /// # Examples /// @@ -1310,81 +1021,25 @@ impl<'m, 'v> Variant<'m, 'v> { /// use parquet_variant::{Variant, VariantDecimal16, VariantDecimal4}; /// /// // you can extract decimal parts from smaller or equally-sized decimal variants - /// let v1 = Variant::from(VariantDecimal4::try_new(1234_i32, 2).unwrap()); - /// assert_eq!(v1.as_decimal16(), VariantDecimal16::try_new(1234_i128, 2).ok()); + /// let d = VariantDecimal16::try_new(2e19 as i128, 2).unwrap(); + /// let v1 = Variant::from(d); + /// assert_eq!(v1.as_decimal16(), VariantDecimal16::try_new(2e19 as i128, 2).ok()); /// - /// // or from a string variant if it can be parsed as decimal - /// let v2 = Variant::from("123.45"); - /// assert_eq!(v2.as_decimal16(), VariantDecimal16::try_new(12345, 2).ok()); - /// - /// // but not if the variant is not a decimal - /// let v3 = Variant::from("hello!"); - /// assert_eq!(v3.as_decimal16(), None); + /// // but not for other variants + /// let d = VariantDecimal4::try_new(1234_i32, 2).unwrap(); + /// let v2 = Variant::from(d); + /// assert_eq!(v2.as_decimal16(), None); /// ``` pub fn as_decimal16(&self) -> Option { match *self { - Variant::Int8(_) | Variant::Int16(_) | Variant::Int32(_) | Variant::Int64(_) => { - let x = self.as_num::()?; - >::from(x).try_into().ok() - } - Variant::Float(f) => { - single_float_to_decimal::(>::from(f), 1f64) - .and_then(|x| x.try_into().ok()) - } - Variant::Double(f) => { - single_float_to_decimal::(f, 1f64).and_then(|x| x.try_into().ok()) - } - Variant::String(v) => Self::convert_string_to_decimal::(v), - Variant::ShortString(v) => { - Self::convert_string_to_decimal::(v.as_str()) - } - Variant::Decimal4(decimal4) => Some(decimal4.into()), - Variant::Decimal8(decimal8) => Some(decimal8.into()), Variant::Decimal16(decimal16) => Some(decimal16), _ => None, } } - /// Converts this variant to an `f16` if possible. + /// Converts this variant to an `f32`. 
/// - /// Returns `Some(f16)` for boolean and numeric variants(integers, floating-point, - /// and decimals with scale 0) that fit in `f16` range - /// `None` otherwise. - /// - /// # Example - /// - /// ``` - /// use parquet_variant::Variant; - /// use half::f16; - /// - /// // you can extract an f16 from a float variant - /// let v1 = Variant::from(std::f32::consts::PI); - /// assert_eq!(v1.as_f16(), Some(f16::from_f32(std::f32::consts::PI))); - /// - /// // and from a double variant (with loss of precision to nearest f16) - /// let v2 = Variant::from(std::f64::consts::PI); - /// assert_eq!(v2.as_f16(), Some(f16::from_f64(std::f64::consts::PI))); - /// - /// // and from boolean - /// let v3 = Variant::BooleanTrue; - /// assert_eq!(v3.as_f16(), Some(f16::from_f32(1.0))); - /// - /// // return inf if overflow - /// let v4 = Variant::from(123456); - /// assert_eq!(v4.as_f16(), Some(f16::INFINITY)); - /// - /// // but not from other variants - /// let v5 = Variant::from("hello!"); - /// assert_eq!(v5.as_f16(), None); - pub fn as_f16(&self) -> Option { - self.as_num() - } - - /// Converts this variant to an `f32` if possible. - /// - /// Returns `Some(f32)` for boolean and numeric variants(integers, floating-point, - /// and decimals with scale 0) that fit in `f32` range - /// `None` otherwise. + /// Returns `Some(f32)` for float variant, `None` for other variants. 
/// /// # Examples /// @@ -1395,55 +1050,39 @@ impl<'m, 'v> Variant<'m, 'v> { /// let v1 = Variant::from(std::f32::consts::PI); /// assert_eq!(v1.as_f32(), Some(std::f32::consts::PI)); /// - /// // and from a double variant (with loss of precision to nearest f32) - /// let v2 = Variant::from(std::f64::consts::PI); - /// assert_eq!(v2.as_f32(), Some(std::f32::consts::PI)); - /// - /// // and from boolean variant - /// let v3 = Variant::BooleanTrue; - /// assert_eq!(v3.as_f32(), Some(1.0)); - /// - /// // and return inf if overflow - /// let v4 = Variant::from(f64::MAX); - /// assert_eq!(v4.as_f32(), Some(f32::INFINITY)); - /// /// // but not from other variants - /// let v5 = Variant::from("hello!"); - /// assert_eq!(v5.as_f32(), None); + /// let v2 = Variant::from(std::f64::consts::PI); + /// assert_eq!(v2.as_f32(), None); /// ``` pub fn as_f32(&self) -> Option { - self.as_num() + match *self { + Variant::Float(i) => Some(i), + _ => None, + } } - /// Converts this variant to an `f64` if possible. + /// Converts this variant to an `f64`. /// - /// Returns `Some(f64)` for boolean and numeric variants(integers, floating-point, - /// and decimals with scale 0) that fit in `f64` range - /// `None` for other variants or can't be represented by an f64. + /// Returns `Some(f64)` for double variant, `None` otherwise. 
/// /// # Examples /// /// ``` /// use parquet_variant::Variant; /// - /// // you can extract an f64 from a float variant - /// let v1 = Variant::from(std::f32::consts::PI); - /// assert_eq!(v1.as_f64(), Some(std::f32::consts::PI as f64)); - /// - /// // and from a double variant - /// let v2 = Variant::from(std::f64::consts::PI); - /// assert_eq!(v2.as_f64(), Some(std::f64::consts::PI)); - /// - /// // and from boolean variant - /// let v3 = Variant::BooleanTrue; - /// assert_eq!(v3.as_f64(), Some(1.0f64)); + /// // you can extract an f64 from a double variant + /// let v1 = Variant::from(std::f64::consts::PI); + /// assert_eq!(v1.as_f64(), Some(std::f64::consts::PI)); /// - /// // but not from other variants - /// let v5 = Variant::from("hello!"); - /// assert_eq!(v5.as_f64(), None); + /// // but not for other variant + /// let v2 = Variant::from(std::f32::consts::PI); + /// assert_eq!(v2.as_f64(), None); /// ``` pub fn as_f64(&self) -> Option { - self.as_num() + match *self { + Variant::Double(i) => Some(i), + _ => None, + } } /// Converts this variant to an `Object` if it is an [`VariantObject`]. 
@@ -1690,7 +1329,7 @@ impl From for Variant<'_, '_> { if let Ok(value) = i8::try_from(value) { Variant::Int8(value) } else { - Variant::Int16(num_cast(value).unwrap()) // u8 -> i16 is infallible + Variant::Int16(i16::from(value)) } } } @@ -1701,7 +1340,7 @@ impl From for Variant<'_, '_> { if let Ok(value) = i16::try_from(value) { Variant::Int16(value) } else { - Variant::Int32(num_cast(value).unwrap()) // u16 -> i32 is infallible + Variant::Int32(i32::from(value)) } } } @@ -1711,7 +1350,7 @@ impl From for Variant<'_, '_> { if let Ok(value) = i32::try_from(value) { Variant::Int32(value) } else { - Variant::Int64(num_cast(value).unwrap()) // u32 -> i64 is infallible + Variant::Int64(i64::from(value)) } } } @@ -1723,7 +1362,7 @@ impl From for Variant<'_, '_> { Variant::Int64(value) } else { // u64 max is 18446744073709551615, which fits in i128 - Variant::Decimal16(VariantDecimal16::try_new(num_cast(value).unwrap(), 0).unwrap()) + Variant::Decimal16(VariantDecimal16::try_new(i128::from(value), 0).unwrap()) } } } @@ -1949,21 +1588,6 @@ mod tests { assert!(res.is_err()); } - #[test] - fn test_variant_decimal_conversion() { - let decimal4 = VariantDecimal4::try_new(1234_i32, 2).unwrap(); - let variant = Variant::from(decimal4); - assert_eq!(variant.as_decimal4(), Some(decimal4)); - - let decimal8 = VariantDecimal8::try_new(12345678901_i64, 2).unwrap(); - let variant = Variant::from(decimal8); - assert_eq!(variant.as_decimal8(), Some(decimal8)); - - let decimal16 = VariantDecimal16::try_new(123456789012345678901234567890_i128, 2).unwrap(); - let variant = Variant::from(decimal16); - assert_eq!(variant.as_decimal16(), Some(decimal16)); - } - #[test] fn test_variant_all_subtypes_debug() { use crate::VariantBuilder;