From 2130a96e07533c47967557c7a53ed7f328af1f03 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Mon, 22 Jun 2026 13:32:39 +0800 Subject: [PATCH 1/3] feat: update string functions to use try_append methods and improve testing - Refactored case conversion paths in common.rs to utilize try_append_* methods. - Updated substr_index_general and map_strings in substrindex.rs to use try_append_* methods, and added an all-null regression test. - Enhanced strings.rs by adding a fallible StringViewArrayBuilder::try_append_value method, a try_append_placeholder method, and a corresponding builder test. --- datafusion/functions/src/string/common.rs | 12 +- datafusion/functions/src/strings.rs | 114 ++++++++++++++---- .../functions/src/unicode/substrindex.rs | 32 ++++- 3 files changed, 125 insertions(+), 33 deletions(-) diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs index 6ecd41b0b9a5c..729c151f4dc3e 100644 --- a/datafusion/functions/src/string/common.rs +++ b/datafusion/functions/src/string/common.rs @@ -370,18 +370,18 @@ fn case_conversion( if let Some(ref n) = nulls { for i in 0..item_len { if n.is_null(i) { - builder.append_placeholder(); + builder.try_append_placeholder()?; } else { // SAFETY: `n.is_null(i)` was false in the branch above. let s = unsafe { string_array.value_unchecked(i) }; - builder.append_value(&unicode_case(s, lower)); + builder.try_append_value(&unicode_case(s, lower))?; } } } else { for i in 0..item_len { // SAFETY: no null buffer means every index is valid. let s = unsafe { string_array.value_unchecked(i) }; - builder.append_value(&unicode_case(s, lower)); + builder.try_append_value(&unicode_case(s, lower))?; } } @@ -431,18 +431,18 @@ fn case_conversion_array( if let Some(ref n) = nulls { for i in 0..item_len { if n.is_null(i) { - builder.append_placeholder(); + builder.try_append_placeholder()?; } else { // SAFETY: `n.is_null(i)` was false in the branch above. let s = unsafe { string_array.value_unchecked(i) }; - builder.append_value(&unicode_case(s, lower)); + builder.try_append_value(&unicode_case(s, lower))?; } } } else { for i in 0..item_len { // SAFETY: no null buffer means every index is valid. let s = unsafe { string_array.value_unchecked(i) }; - builder.append_value(&unicode_case(s, lower)); + builder.try_append_value(&unicode_case(s, lower))?; } } Ok(Arc::new(builder.finish(nulls)?)) diff --git a/datafusion/functions/src/strings.rs b/datafusion/functions/src/strings.rs index d032e50153773..a2da47e7bd827 100644 --- a/datafusion/functions/src/strings.rs +++ b/datafusion/functions/src/strings.rs @@ -339,6 +339,10 @@ fn offset_overflow_error() -> DataFusionError { ) } +fn string_view_overflow_error(limit: &str) -> DataFusionError { + exec_datafusion_err!("byte array offset overflow: {limit} exceeds i32::MAX") +} + fn try_offset(len: usize) -> Result { if len > O::MAX_OFFSET { return Err(offset_overflow_error::()); @@ -684,26 +688,31 @@ impl StringViewArrayBuilder { self.block_size } - /// See [`BulkNullStringArrayBuilder::append_value`]. + /// Fallible variant of [`Self::append_value`]. /// - /// # Panics + /// # Errors /// - /// Panics if the value length, the in-progress buffer offset, or the - /// number of completed buffers exceeds `i32::MAX`. The ByteView spec - /// uses signed 32-bit integers for these fields; exceeding `i32::MAX` - /// would produce an array that does not round-trip through Arrow IPC - /// (see ). + /// Returns an error if the value length, in-progress buffer offset, or + /// number of completed buffers exceeds `i32::MAX`. The ByteView spec uses + /// signed 32-bit integers for these fields; exceeding `i32::MAX` would + /// produce an array that does not round-trip through Arrow IPC (see + /// ). #[inline] - pub fn append_value(&mut self, value: &str) { + pub fn try_append_value(&mut self, value: &str) -> Result<()> { let v = value.as_bytes(); - let length: u32 = - i32::try_from(v.len()).expect("value length exceeds i32::MAX") as u32; + let length: u32 = i32::try_from(v.len()) + .map_err(|_| string_view_overflow_error("value length"))? + as u32; if length <= 12 { self.views.push(make_view(v, 0, 0)); - return; + return Ok(()); } - let required_cap = self.in_progress.len() + length as usize; + let required_cap = self + .in_progress + .len() + .checked_add(length as usize) + .ok_or_else(|| string_view_overflow_error("string view block size"))?; if self.in_progress.capacity() < required_cap { self.flush_in_progress(); let to_reserve = (length as usize).max(self.next_block_size() as usize); @@ -715,17 +724,49 @@ impl StringViewArrayBuilder { } let offset: u32 = i32::try_from(self.in_progress.len()) - .expect("offset exceeds i32::MAX") as u32; + .map_err(|_| string_view_overflow_error("offset"))? + as u32; + let buffer_index: u32 = i32::try_from(self.completed.len()) + .map_err(|_| string_view_overflow_error("buffer count"))? + as u32; self.in_progress.extend_from_slice(v); - self.views.push(self.make_long_view(length, offset, v)); + self.views + .push(Self::make_long_view(length, buffer_index, offset, v)); + Ok(()) } - /// See [`BulkNullStringArrayBuilder::append_placeholder`]. + /// See [`BulkNullStringArrayBuilder::append_value`]. + /// + /// Note: new call sites that need recoverable overflow handling should + /// prefer [`Self::try_append_value`]. + /// + /// # Panics + /// + /// Panics under the same conditions that [`Self::try_append_value`] returns + /// an error. #[inline] - pub fn append_placeholder(&mut self) { + pub fn append_value(&mut self, value: &str) { + self.try_append_value(value) + .expect("byte array offset overflow"); + } + + /// Fallible variant of [`Self::append_placeholder`]. + #[inline] + pub fn try_append_placeholder(&mut self) -> Result<()> { // Zero-length inline view — `length` field is 0, no buffer ref. self.views.push(0); self.placeholder_count += 1; + Ok(()) + } + + /// See [`BulkNullStringArrayBuilder::append_placeholder`]. + /// + /// Note: new call sites that need recoverable overflow handling should + /// prefer [`Self::try_append_placeholder`]. + #[inline] + pub fn append_placeholder(&mut self) { + self.try_append_placeholder() + .expect("byte array offset overflow"); } /// Ensure the in-progress block has room for `length` more bytes, @@ -754,10 +795,12 @@ impl StringViewArrayBuilder { /// function is `[inline(never)]` and has to handle short strings, so /// building the view here ourselves is faster. #[inline] - fn make_long_view(&self, length: u32, offset: u32, prefix_bytes: &[u8]) -> u128 { - let buffer_index: u32 = i32::try_from(self.completed.len()) - .expect("buffer count exceeds i32::MAX") - as u32; + fn make_long_view( + length: u32, + buffer_index: u32, + offset: u32, + prefix_bytes: &[u8], + ) -> u128 { ByteView { length, // length > 12, so prefix_bytes has at least 4 bytes. @@ -797,9 +840,16 @@ impl StringViewArrayBuilder { let cursor = self.in_progress.len(); let offset: u32 = i32::try_from(cursor).expect("offset exceeds i32::MAX") as u32; + let buffer_index: u32 = i32::try_from(self.completed.len()) + .expect("buffer count exceeds i32::MAX") + as u32; self.in_progress.extend(src.iter().map(|&b| map(b))); - self.views - .push(self.make_long_view(length, offset, &self.in_progress[cursor..])); + self.views.push(Self::make_long_view( + length, + buffer_index, + offset, + &self.in_progress[cursor..], + )); } /// See [`BulkNullStringArrayBuilder::append_with`]. @@ -843,8 +893,12 @@ impl StringViewArrayBuilder { as u32; let offset: u32 = i32::try_from(start).expect("offset exceeds i32::MAX") as u32; - self.views.push(self.make_long_view( + let buffer_index: u32 = i32::try_from(self.completed.len()) + .expect("buffer count exceeds i32::MAX") + as u32; + self.views.push(Self::make_long_view( length, + buffer_index, offset, &self.in_progress[start..], )); @@ -1601,6 +1655,20 @@ mod tests { ); } + #[test] + fn string_view_builder_try_append_success_path() { + let mut builder = StringViewArrayBuilder::with_capacity(3); + builder.try_append_value("abc").unwrap(); + builder.try_append_placeholder().unwrap(); + builder.try_append_value("a long string value").unwrap(); + + let nulls = Some(NullBuffer::from(vec![true, false, true])); + let array = builder.finish(nulls).unwrap(); + assert_eq!(array.value(0), "abc"); + assert!(array.is_null(1)); + assert_eq!(array.value(2), "a long string value"); + } + #[test] fn generic_string_builder_try_offset_overflow() { let err = try_offset::(i32::MAX as usize + 1) diff --git a/datafusion/functions/src/unicode/substrindex.rs b/datafusion/functions/src/unicode/substrindex.rs index d122a34a9fc38..a5966f6284417 100644 --- a/datafusion/functions/src/unicode/substrindex.rs +++ b/datafusion/functions/src/unicode/substrindex.rs @@ -281,7 +281,7 @@ where for i in 0..num_rows { if nulls.as_ref().is_some_and(|n| n.is_null(i)) { - builder.append_placeholder(); + builder.try_append_placeholder()?; continue; } // SAFETY: `i < num_rows` and the union of input nulls is valid at i, @@ -289,7 +289,7 @@ where let string = unsafe { string_array.value_unchecked(i) }; let delimiter = unsafe { delimiter_array.value_unchecked(i) }; let n = unsafe { count_array.value_unchecked(i) }; - builder.append_value(substr_index_slice(string, delimiter, n)); + builder.try_append_value(substr_index_slice(string, delimiter, n))?; } Ok(Arc::new(builder.finish(nulls)?) as ArrayRef) @@ -487,13 +487,13 @@ where let nulls = string_array.nulls().cloned(); for i in 0..string_array.len() { if nulls.as_ref().is_some_and(|n| n.is_null(i)) { - builder.append_placeholder(); + builder.try_append_placeholder()?; continue; } // SAFETY: `i < string_array.len()` and `nulls` is valid at i, so the // input is also valid at i. let s = unsafe { string_array.value_unchecked(i) }; - builder.append_value(f(s)); + builder.try_append_value(f(s))?; } Ok(Arc::new(builder.finish(nulls)?) as ArrayRef) } @@ -797,6 +797,30 @@ mod tests { Ok(()) } + #[test] + fn test_substr_index_all_nulls() -> Result<()> { + use super::substr_index_general; + use crate::strings::GenericStringArrayBuilder; + + let strings = StringArray::from(vec![None::<&str>, None]); + let delimiters = StringArray::from(vec![None::<&str>, Some(".")]); + let counts = Int64Array::from(vec![None, None]); + + let result = substr_index_general( + &strings, + &delimiters, + &counts, + GenericStringArrayBuilder::::with_capacity(strings.len(), 0), + )?; + let result = result.as_string::(); + + assert_eq!(result.len(), 2); + assert!(result.is_null(0)); + assert!(result.is_null(1)); + + Ok(()) + } + #[test] fn test_substr_index_utf8view_array_sliced() -> Result<()> { use super::substr_index_view; From bd34d604dd2ff3aa8447a08a6428a0aec2183402 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Mon, 22 Jun 2026 14:02:58 +0800 Subject: [PATCH 2/3] feat(string_view): enhance error handling and optimize StringViewArrayBuilder - Changed limit to field in string_view_overflow_error for better clarity. - Removed unreachable expect in StringViewArrayBuilder::append_placeholder to improve code quality. - Introduced make_long_view_checked and restored the receiver make_long_view for enhanced functionality. - Shortened the all-null test case in substr_index for better readability and maintainability. --- datafusion/functions/src/strings.rs | 48 +++++++++---------- .../functions/src/unicode/substrindex.rs | 5 +- 2 files changed, 25 insertions(+), 28 deletions(-) diff --git a/datafusion/functions/src/strings.rs b/datafusion/functions/src/strings.rs index a2da47e7bd827..d3e0a45292119 100644 --- a/datafusion/functions/src/strings.rs +++ b/datafusion/functions/src/strings.rs @@ -339,8 +339,8 @@ fn offset_overflow_error() -> DataFusionError { ) } -fn string_view_overflow_error(limit: &str) -> DataFusionError { - exec_datafusion_err!("byte array offset overflow: {limit} exceeds i32::MAX") +fn string_view_overflow_error(field: &str) -> DataFusionError { + exec_datafusion_err!("byte array offset overflow: {field} exceeds i32::MAX") } fn try_offset(len: usize) -> Result { @@ -730,8 +730,12 @@ impl StringViewArrayBuilder { .map_err(|_| string_view_overflow_error("buffer count"))? as u32; self.in_progress.extend_from_slice(v); - self.views - .push(Self::make_long_view(length, buffer_index, offset, v)); + self.views.push(Self::make_long_view_checked( + length, + buffer_index, + offset, + v, + )); Ok(()) } @@ -753,9 +757,7 @@ impl StringViewArrayBuilder { /// Fallible variant of [`Self::append_placeholder`]. #[inline] pub fn try_append_placeholder(&mut self) -> Result<()> { - // Zero-length inline view — `length` field is 0, no buffer ref. - self.views.push(0); - self.placeholder_count += 1; + self.append_placeholder(); Ok(()) } @@ -765,8 +767,9 @@ impl StringViewArrayBuilder { /// prefer [`Self::try_append_placeholder`]. #[inline] pub fn append_placeholder(&mut self) { - self.try_append_placeholder() - .expect("byte array offset overflow"); + // Zero-length inline view — `length` field is 0, no buffer ref. + self.views.push(0); + self.placeholder_count += 1; } /// Ensure the in-progress block has room for `length` more bytes, @@ -795,7 +798,7 @@ impl StringViewArrayBuilder { /// function is `[inline(never)]` and has to handle short strings, so /// building the view here ourselves is faster. #[inline] - fn make_long_view( + fn make_long_view_checked( length: u32, buffer_index: u32, offset: u32, @@ -811,6 +814,14 @@ impl StringViewArrayBuilder { .into() } + #[inline] + fn make_long_view(&self, length: u32, offset: u32, prefix_bytes: &[u8]) -> u128 { + let buffer_index: u32 = i32::try_from(self.completed.len()) + .expect("buffer count exceeds i32::MAX") + as u32; + Self::make_long_view_checked(length, buffer_index, offset, prefix_bytes) + } + /// See [`BulkNullStringArrayBuilder::append_byte_map`]. /// /// # Safety @@ -840,16 +851,9 @@ impl StringViewArrayBuilder { let cursor = self.in_progress.len(); let offset: u32 = i32::try_from(cursor).expect("offset exceeds i32::MAX") as u32; - let buffer_index: u32 = i32::try_from(self.completed.len()) - .expect("buffer count exceeds i32::MAX") - as u32; self.in_progress.extend(src.iter().map(|&b| map(b))); - self.views.push(Self::make_long_view( - length, - buffer_index, - offset, - &self.in_progress[cursor..], - )); + self.views + .push(self.make_long_view(length, offset, &self.in_progress[cursor..])); } /// See [`BulkNullStringArrayBuilder::append_with`]. @@ -893,12 +897,8 @@ impl StringViewArrayBuilder { as u32; let offset: u32 = i32::try_from(start).expect("offset exceeds i32::MAX") as u32; - let buffer_index: u32 = i32::try_from(self.completed.len()) - .expect("buffer count exceeds i32::MAX") - as u32; - self.views.push(Self::make_long_view( + self.views.push(self.make_long_view( length, - buffer_index, offset, &self.in_progress[start..], )); diff --git a/datafusion/functions/src/unicode/substrindex.rs b/datafusion/functions/src/unicode/substrindex.rs index a5966f6284417..f9f0bafa04309 100644 --- a/datafusion/functions/src/unicode/substrindex.rs +++ b/datafusion/functions/src/unicode/substrindex.rs @@ -813,10 +813,7 @@ mod tests { GenericStringArrayBuilder::::with_capacity(strings.len(), 0), )?; let result = result.as_string::(); - - assert_eq!(result.len(), 2); - assert!(result.is_null(0)); - assert!(result.is_null(1)); + assert_eq!(result, &StringArray::from(vec![None::<&str>, None])); Ok(()) } From dc960c2ed1fd34d7916b6dd9560f59a5b3387356 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Mon, 22 Jun 2026 14:25:56 +0800 Subject: [PATCH 3/3] feat(strings): enhance try_append_value with long capacity and update documentation - Reused new try_ensure_long_capacity in try_append_value. - Updated ensure_long_capacity to delegate to a fallible helper while preserving panic mode. - Added error documentation for try_append_placeholder. --- datafusion/functions/src/strings.rs | 44 ++++++++++++++++------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/datafusion/functions/src/strings.rs b/datafusion/functions/src/strings.rs index d3e0a45292119..c788c6fb1f33f 100644 --- a/datafusion/functions/src/strings.rs +++ b/datafusion/functions/src/strings.rs @@ -708,20 +708,7 @@ impl StringViewArrayBuilder { return Ok(()); } - let required_cap = self - .in_progress - .len() - .checked_add(length as usize) - .ok_or_else(|| string_view_overflow_error("string view block size"))?; - if self.in_progress.capacity() < required_cap { - self.flush_in_progress(); - let to_reserve = (length as usize).max(self.next_block_size() as usize); - #[expect( - clippy::disallowed_methods, - reason = "StringView's block size bounds growth, so reserve cannot overflow capacity arithmetically. This hot loop intentionally avoids the extra `try_reserve` checks. It remains subject to allocator failure/OOM, which must be managed externally." - )] - self.in_progress.reserve(to_reserve); - } + self.try_ensure_long_capacity(length)?; let offset: u32 = i32::try_from(self.in_progress.len()) .map_err(|_| string_view_overflow_error("offset"))? @@ -755,6 +742,11 @@ impl StringViewArrayBuilder { } /// Fallible variant of [`Self::append_placeholder`]. + /// + /// # Errors + /// + /// This currently cannot fail; it returns `Result` for API symmetry with + /// other fallible append methods. #[inline] pub fn try_append_placeholder(&mut self) -> Result<()> { self.append_placeholder(); @@ -772,13 +764,14 @@ impl StringViewArrayBuilder { self.placeholder_count += 1; } - /// Ensure the in-progress block has room for `length` more bytes, - /// flushing the current block and starting a new (doubled) one if not. - /// Caller must invoke this only when no bytes of the current row are - /// yet in `in_progress` — flushing mid-row would orphan partial data. + /// Fallible variant of [`Self::ensure_long_capacity`]. #[inline] - fn ensure_long_capacity(&mut self, length: u32) { - let required_cap = self.in_progress.len() + length as usize; + fn try_ensure_long_capacity(&mut self, length: u32) -> Result<()> { + let required_cap = self + .in_progress + .len() + .checked_add(length as usize) + .ok_or_else(|| string_view_overflow_error("string view block size"))?; if self.in_progress.capacity() < required_cap { self.flush_in_progress(); let to_reserve = (length as usize).max(self.next_block_size() as usize); @@ -788,6 +781,17 @@ impl StringViewArrayBuilder { )] self.in_progress.reserve(to_reserve); } + Ok(()) + } + + /// Ensure the in-progress block has room for `length` more bytes, + /// flushing the current block and starting a new (doubled) one if not. + /// Caller must invoke this only when no bytes of the current row are + /// yet in `in_progress` — flushing mid-row would orphan partial data. + #[inline] + fn ensure_long_capacity(&mut self, length: u32) { + self.try_ensure_long_capacity(length) + .expect("byte array offset overflow"); } /// Encode a long-form view referencing `length` bytes already written