Eventual-Inc · cckellogg · Jun 9, 2026 · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026
diff --git a/daft/expressions/expressions.py b/daft/expressions/expressions.py
@@ -2374,6 +2374,46 @@ def hamming_distance_str(self, other: Expression) -> Expression:
 
         return hamming_distance_str(self, other)
 
+    def levenshtein_distance(self, other: Expression) -> Expression:
+        """Compute the Levenshtein edit distance between two strings.
+
+        Tip: See Also
+            [`daft.functions.levenshtein_distance`](https://docs.daft.ai/en/stable/api/functions/levenshtein_distance/)
+        """
+        from daft.functions import levenshtein_distance
+
+        return levenshtein_distance(self, other)
+
+    def jaro_similarity(self, other: Expression) -> Expression:
+        """Compute the Jaro similarity between two strings.
+
+        Tip: See Also
+            [`daft.functions.jaro_similarity`](https://docs.daft.ai/en/stable/api/functions/jaro_similarity/)
+        """
+        from daft.functions import jaro_similarity
+
+        return jaro_similarity(self, other)
+
+    def jaro_winkler_similarity(self, other: Expression) -> Expression:
+        """Compute the Jaro-Winkler similarity between two strings.
+
+        Tip: See Also
+            [`daft.functions.jaro_winkler_similarity`](https://docs.daft.ai/en/stable/api/functions/jaro_winkler_similarity/)
+        """
+        from daft.functions import jaro_winkler_similarity
+
+        return jaro_winkler_similarity(self, other)
+
+    def damerau_levenshtein_distance(self, other: Expression) -> Expression:
+        """Compute the Damerau-Levenshtein distance between two strings.
+
+        Tip: See Also
+            [`daft.functions.damerau_levenshtein_distance`](https://docs.daft.ai/en/stable/api/functions/damerau_levenshtein_distance/)
+        """
+        from daft.functions import damerau_levenshtein_distance
+
+        return damerau_levenshtein_distance(self, other)
+
     def value_counts(self) -> Expression:
         """Counts the occurrences of each distinct value in the list.
 

diff --git a/daft/functions/__init__.py b/daft/functions/__init__.py
@@ -277,6 +277,10 @@
     regexp_replace,
     find,
     hamming_distance_str,
+    levenshtein_distance,
+    jaro_similarity,
+    jaro_winkler_similarity,
+    damerau_levenshtein_distance,
 )
 from .struct import unnest, to_struct
 from .url import download, upload, parse_url
@@ -350,6 +354,7 @@
     "current_date",
     "current_timestamp",
     "current_timezone",
+    "damerau_levenshtein_distance",
     "date",
     "date_add",
     "date_diff",
@@ -424,6 +429,8 @@
     "is_nan",
     "is_null",
     "jaccard_similarity",
+    "jaro_similarity",
+    "jaro_winkler_similarity",
     "jq",
     "json_array_length",
     "json_object_keys",
@@ -435,6 +442,7 @@
     "left",
     "length",
     "length_bytes",
+    "levenshtein_distance",
     "like",
     "list_agg",
     "list_agg_distinct",

diff --git a/daft/functions/str.py b/daft/functions/str.py
@@ -1612,3 +1612,161 @@ def hamming_distance_str(left: Expression, right: Expression) -> Expression:
         (Showing first 3 of 3 rows)
     """
     return Expression._call_builtin_scalar_fn("hamming_distance_str", left, right)
+
+
+def levenshtein_distance(left: Expression, right: Expression) -> Expression:
+    """Compute the Levenshtein edit distance between two strings.
+
+    The Levenshtein distance is the minimum number of single-character insertions,
+    deletions, or substitutions required to transform one string into the other.
+
+    Args:
+        left: The left string expression to compare.
+        right: The right string expression to compare against.
+
+    Returns:
+        The Levenshtein distance for each pair of strings. Returns null when either
+        input is null.
+
+    Examples:
+        >>> import daft
+        >>> from daft.functions import levenshtein_distance
+        >>> df = daft.from_pydict({"x": ["kitten", "saturday", ""], "y": ["sitting", "sunday", "abc"]})
+        >>> df = df.with_column("distance", levenshtein_distance(df["x"], df["y"]))
+        >>> df.collect()
+        ╭──────────┬─────────┬──────────╮
+        │ x        ┆ y       ┆ distance │
+        │ ---      ┆ ---     ┆ ---      │
+        │ String   ┆ String  ┆ Int64    │
+        ╞══════════╪═════════╪══════════╡
+        │ kitten   ┆ sitting ┆ 3        │
+        ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
+        │ saturday ┆ sunday  ┆ 3        │
+        ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
+        │          ┆ abc     ┆ 3        │
+        ╰──────────┴─────────┴──────────╯
+        <BLANKLINE>
+        (Showing first 3 of 3 rows)
+    """
+    return Expression._call_builtin_scalar_fn("levenshtein_distance", left, right)
+
+
+def jaro_similarity(left: Expression, right: Expression) -> Expression:
+    """Compute the Jaro similarity between two strings.
+
+    The Jaro similarity is a measure of similarity between two strings, based on
+    matching characters and transpositions. Returns a value between 0.0 (no similarity)
+    and 1.0 (identical strings).
+
+    Args:
+        left: The left string expression to compare.
+        right: The right string expression to compare against.
+
+    Returns:
+        The Jaro similarity (0.0 to 1.0) for each pair of strings. Returns null when
+        either input is null.
+
+    Examples:
+        >>> import daft
+        >>> from daft.functions import jaro_similarity
+        >>> df = daft.from_pydict({"x": ["martha", "dwayne", "dixon"], "y": ["marhta", "duane", "dicksonx"]})
+        >>> df = df.with_column("similarity", jaro_similarity(df["x"], df["y"]))
+        >>> df.collect()
+        ╭────────┬──────────┬────────────────────╮
+        │ x      ┆ y        ┆ similarity         │
+        │ ---    ┆ ---      ┆ ---                │
+        │ String ┆ String   ┆ Float64            │
+        ╞════════╪══════════╪════════════════════╡
+        │ martha ┆ marhta   ┆ 0.9444444444444445 │
+        ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+        │ dwayne ┆ duane    ┆ 0.8222222222222223 │
+        ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+        │ dixon  ┆ dicksonx ┆ 0.7666666666666666 │
+        ╰────────┴──────────┴────────────────────╯
+        <BLANKLINE>
+        (Showing first 3 of 3 rows)
+    """
+    return Expression._call_builtin_scalar_fn("jaro_similarity", left, right)
+
+
+def jaro_winkler_similarity(left: Expression, right: Expression) -> Expression:
+    """Compute the Jaro-Winkler similarity between two strings.
+
+    This is the Jaro similarity with a prefix bonus for strings sharing a common
+    prefix (up to 4 characters). Returns a value between 0.0 (no similarity) and
+    1.0 (identical strings).
+
+    Args:
+        left: The left string expression to compare.
+        right: The right string expression to compare against.
+
+    Returns:
+        The Jaro-Winkler similarity (0.0 to 1.0) for each pair of strings. Returns
+        null when either input is null.
+
+    Examples:
+        >>> import daft
+        >>> from daft.functions import jaro_winkler_similarity
+        >>> df = daft.from_pydict({"x": ["martha", "dwayne", "dixon"], "y": ["marhta", "duane", "dicksonx"]})
+        >>> df = df.with_column("similarity", jaro_winkler_similarity(df["x"], df["y"]))
+        >>> df.collect()
+        ╭────────┬──────────┬────────────────────╮
+        │ x      ┆ y        ┆ similarity         │
+        │ ---    ┆ ---      ┆ ---                │
+        │ String ┆ String   ┆ Float64            │
+        ╞════════╪══════════╪════════════════════╡
+        │ martha ┆ marhta   ┆ 0.9611111111111111 │
+        ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+        │ dwayne ┆ duane    ┆ 0.8400000000000001 │
+        ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+        │ dixon  ┆ dicksonx ┆ 0.8133333333333332 │
+        ╰────────┴──────────┴────────────────────╯
+        <BLANKLINE>
+        (Showing first 3 of 3 rows)
+    """
+    return Expression._call_builtin_scalar_fn("jaro_winkler_similarity", left, right)
+
+
+def damerau_levenshtein_distance(left: Expression, right: Expression) -> Expression:
+    """Compute the Damerau-Levenshtein distance between two strings.
+
+    This extends the Levenshtein distance by also counting transpositions of two
+    adjacent characters as a single edit operation (in addition to insertions,
+    deletions, and substitutions).
+
+    Note:
+        This computes the Optimal String Alignment (OSA) variant, which does not
+        allow a substring to be edited more than once. Results may differ from the
+        true Damerau-Levenshtein distance for inputs with overlapping transpositions
+        (e.g., ``"CA"`` to ``"ABC"`` is 3 under OSA but 2 under true
+        Damerau-Levenshtein). OSA does not satisfy the triangle inequality.
+
+    Args:
+        left: The left string expression to compare.
+        right: The right string expression to compare against.
+
+    Returns:
+        The Damerau-Levenshtein (OSA) distance for each pair of strings. Returns null
+        when either input is null.
+
+    Examples:
+        >>> import daft
+        >>> from daft.functions import damerau_levenshtein_distance
+        >>> df = daft.from_pydict({"x": ["abc", "abc", ""], "y": ["bac", "acb", "abc"]})
+        >>> df = df.with_column("distance", damerau_levenshtein_distance(df["x"], df["y"]))
+        >>> df.collect()
+        ╭────────┬────────┬──────────╮
+        │ x      ┆ y      ┆ distance │
+        │ ---    ┆ ---    ┆ ---      │
+        │ String ┆ String ┆ Int64    │
+        ╞════════╪════════╪══════════╡
+        │ abc    ┆ bac    ┆ 1        │
+        ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
+        │ abc    ┆ acb    ┆ 1        │
+        ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
+        │        ┆ abc    ┆ 3        │
+        ╰────────┴────────┴──────────╯
+        <BLANKLINE>
+        (Showing first 3 of 3 rows)
+    """
+    return Expression._call_builtin_scalar_fn("damerau_levenshtein_distance", left, right)
diff --git a/src/daft-functions-utf8/src/damerau_levenshtein.rs b/src/daft-functions-utf8/src/damerau_levenshtein.rs
@@ -0,0 +1,97 @@
+use daft_dsl::functions::{prelude::*, scalar::ScalarFn};
+use serde::{Deserialize, Serialize};
+
+use crate::utils::{binary_str_distance, binary_str_distance_to_field};
+
+/// Compute the Damerau-Levenshtein distance (optimal string alignment variant).
+///
+/// Counts insertions, deletions, substitutions and transpositions of two adjacent
+/// characters, each as one edit. Strings are compared `char` by `char` (Unicode
+/// scalar values), not byte by byte.
+///
+/// A cell of the DP table only reads the current row, the row above it and, for a
+/// transposition, the row above that, so three rolling rows replace the full
+/// `(n + 1) x (m + 1)` matrix and memory stays linear in `right`'s length.
+fn compute_damerau_levenshtein_distance(left: &str, right: &str) -> i64 {
+    let left_chars: Vec<char> = left.chars().collect();
+    let right_chars: Vec<char> = right.chars().collect();
+    let m = right_chars.len();
+
+    if left_chars.is_empty() {
+        return m as i64;
+    }
+    if m == 0 {
+        return left_chars.len() as i64;
+    }
+
+    // `above2` = row i - 2, `above` = row i - 1, `current` = row i being filled.
+    let mut above2 = vec![0i64; m + 1];
+    let mut above: Vec<i64> = (0..=m as i64).collect();
+    let mut current = vec![0i64; m + 1];
+
+    for (i, &lc) in left_chars.iter().enumerate() {
+        current[0] = i as i64 + 1;
+        for (j, &rc) in right_chars.iter().enumerate() {
+            let cost = i64::from(lc != rc);
+            let mut best = (above[j + 1] + 1) // deletion
+                .min(current[j] + 1) // insertion
+                .min(above[j] + cost); // substitution
+
+            // Transposition of the last two characters consumed on each side.
+            if i > 0 && j > 0 && lc == right_chars[j - 1] && left_chars[i - 1] == rc {
+                best = best.min(above2[j - 1] + 1);
+            }
+            current[j + 1] = best;
+        }
+        // Rotate: `above` becomes `above2`, `current` becomes `above`, and the stale
+        // row is recycled as the next `current` (every cell is overwritten).
+        std::mem::swap(&mut above2, &mut above);
+        std::mem::swap(&mut above, &mut current);
+    }
+
+    above[m]
+}
+
+/// Unit-struct scalar function for the OSA Damerau-Levenshtein distance.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
+pub struct DamerauLevenshteinDistance;
+
+#[typetag::serde]
+impl ScalarUDF for DamerauLevenshteinDistance {
+    fn name(&self) -> &'static str {
+        "damerau_levenshtein_distance"
+    }
+
+    fn call(
+        &self,
+        inputs: FunctionArgs<Series>,
+        _ctx: &daft_dsl::functions::scalar::EvalContext,
+    ) -> DaftResult<Series> {
+        // Delegate to the shared binary string-distance helper with the OSA kernel.
+        type Distance = daft_core::datatypes::Int64Type;
+        let kernel = compute_damerau_levenshtein_distance;
+        binary_str_distance::<Distance, _>(inputs, self.name(), DataType::Int64, kernel)
+    }
+
+    fn get_return_field(
+        &self,
+        inputs: FunctionArgs<ExprRef>,
+        schema: &Schema,
+    ) -> DaftResult<Field> {
+        binary_str_distance_to_field(inputs, schema, self.name(), DataType::Int64)
+    }
+
+    fn docstring(&self) -> &'static str {
+        "Compute the Damerau-Levenshtein distance between two strings. This extends the \
+        Levenshtein distance by also counting transpositions of two adjacent characters \
+        as a single edit operation. This computes the Optimal String Alignment (OSA) \
+        variant, which may differ from true Damerau-Levenshtein for inputs with \
+        overlapping transpositions. Returns null when either input is null."
+    }
+}
+
+/// Build an expression applying [`DamerauLevenshteinDistance`] to `left` and `right`.
+#[must_use]
+pub fn damerau_levenshtein_distance(left: ExprRef, right: ExprRef) -> ExprRef {
+    ScalarFn::builtin(DamerauLevenshteinDistance, vec![left, right]).into()
+}