Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions daft/expressions/expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2374,6 +2374,46 @@ def hamming_distance_str(self, other: Expression) -> Expression:

return hamming_distance_str(self, other)

def levenshtein_distance(self, other: Expression) -> Expression:
    """Return the Levenshtein edit distance between this expression and ``other``.

    Method form of the function with the same name in ``daft.functions``;
    ``self`` is passed as the left operand and ``other`` as the right operand.

    Tip: See Also
        [`daft.functions.levenshtein_distance`](https://docs.daft.ai/en/stable/api/functions/levenshtein_distance/)
    """
    # Imported inside the method body (presumably to avoid a circular import
    # with daft.functions - confirm), aliased so it cannot be confused with
    # this method's own name.
    from daft.functions import levenshtein_distance as _levenshtein_distance

    distance = _levenshtein_distance(self, other)
    return distance

def jaro_similarity(self, other: Expression) -> Expression:
    """Return the Jaro similarity between this expression and ``other``.

    Method form of the function with the same name in ``daft.functions``;
    ``self`` is passed as the left operand and ``other`` as the right operand.

    Tip: See Also
        [`daft.functions.jaro_similarity`](https://docs.daft.ai/en/stable/api/functions/jaro_similarity/)
    """
    # Imported inside the method body (presumably to avoid a circular import
    # with daft.functions - confirm), aliased so it cannot be confused with
    # this method's own name.
    from daft.functions import jaro_similarity as _jaro_similarity

    similarity = _jaro_similarity(self, other)
    return similarity

def jaro_winkler_similarity(self, other: Expression) -> Expression:
    """Return the Jaro-Winkler similarity between this expression and ``other``.

    Method form of the function with the same name in ``daft.functions``;
    ``self`` is passed as the left operand and ``other`` as the right operand.

    Tip: See Also
        [`daft.functions.jaro_winkler_similarity`](https://docs.daft.ai/en/stable/api/functions/jaro_winkler_similarity/)
    """
    # Imported inside the method body (presumably to avoid a circular import
    # with daft.functions - confirm), aliased so it cannot be confused with
    # this method's own name.
    from daft.functions import jaro_winkler_similarity as _jaro_winkler_similarity

    similarity = _jaro_winkler_similarity(self, other)
    return similarity

def damerau_levenshtein_distance(self, other: Expression) -> Expression:
    """Return the Damerau-Levenshtein distance between this expression and ``other``.

    Method form of the function with the same name in ``daft.functions``;
    ``self`` is passed as the left operand and ``other`` as the right operand.

    Tip: See Also
        [`daft.functions.damerau_levenshtein_distance`](https://docs.daft.ai/en/stable/api/functions/damerau_levenshtein_distance/)
    """
    # Imported inside the method body (presumably to avoid a circular import
    # with daft.functions - confirm), aliased so it cannot be confused with
    # this method's own name.
    from daft.functions import damerau_levenshtein_distance as _damerau_levenshtein_distance

    distance = _damerau_levenshtein_distance(self, other)
    return distance

def value_counts(self) -> Expression:
"""Counts the occurrences of each distinct value in the list.

Expand Down
8 changes: 8 additions & 0 deletions daft/functions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,10 @@
regexp_replace,
find,
hamming_distance_str,
levenshtein_distance,
jaro_similarity,
jaro_winkler_similarity,
damerau_levenshtein_distance,
)
from .struct import unnest, to_struct
from .url import download, upload, parse_url
Expand Down Expand Up @@ -350,6 +354,7 @@
"current_date",
"current_timestamp",
"current_timezone",
"damerau_levenshtein_distance",
"date",
"date_add",
"date_diff",
Expand Down Expand Up @@ -424,6 +429,8 @@
"is_nan",
"is_null",
"jaccard_similarity",
"jaro_similarity",
"jaro_winkler_similarity",
"jq",
"json_array_length",
"json_object_keys",
Expand All @@ -435,6 +442,7 @@
"left",
"length",
"length_bytes",
"levenshtein_distance",
"like",
"list_agg",
"list_agg_distinct",
Expand Down
158 changes: 158 additions & 0 deletions daft/functions/str.py
Original file line number Diff line number Diff line change
Expand Up @@ -1612,3 +1612,161 @@ def hamming_distance_str(left: Expression, right: Expression) -> Expression:
(Showing first 3 of 3 rows)
"""
return Expression._call_builtin_scalar_fn("hamming_distance_str", left, right)


def levenshtein_distance(left: Expression, right: Expression) -> Expression:
    """Compute the Levenshtein edit distance between two strings.

    The distance is the smallest number of single-character edits (insertions,
    deletions, or substitutions) that turn one string into the other.

    Args:
        left: The left string expression to compare.
        right: The right string expression to compare against.

    Returns:
        The Levenshtein distance for each pair of strings. Returns null when either
        input is null.

    Examples:
        >>> import daft
        >>> from daft.functions import levenshtein_distance
        >>> df = daft.from_pydict({"x": ["kitten", "saturday", ""], "y": ["sitting", "sunday", "abc"]})
        >>> df = df.with_column("distance", levenshtein_distance(df["x"], df["y"]))
        >>> df.collect()
        ╭──────────┬─────────┬──────────╮
        │ x        ┆ y       ┆ distance │
        │ ---      ┆ ---     ┆ ---      │
        │ String   ┆ String  ┆ Int64    │
        ╞══════════╪═════════╪══════════╡
        │ kitten   ┆ sitting ┆ 3        │
        ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
        │ saturday ┆ sunday  ┆ 3        │
        ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
        │          ┆ abc     ┆ 3        │
        ╰──────────┴─────────┴──────────╯
        <BLANKLINE>
        (Showing first 3 of 3 rows)
    """
    # Name under which the builtin scalar function is looked up.
    builtin_name = "levenshtein_distance"
    return Expression._call_builtin_scalar_fn(builtin_name, left, right)


def jaro_similarity(left: Expression, right: Expression) -> Expression:
    """Compute the Jaro similarity between two strings.

    Jaro similarity scores how alike two strings are from their matching
    characters and transpositions. The score ranges from 0.0 (no similarity)
    to 1.0 (identical strings).

    Args:
        left: The left string expression to compare.
        right: The right string expression to compare against.

    Returns:
        The Jaro similarity (0.0 to 1.0) for each pair of strings. Returns null when
        either input is null.

    Examples:
        >>> import daft
        >>> from daft.functions import jaro_similarity
        >>> df = daft.from_pydict({"x": ["martha", "dwayne", "dixon"], "y": ["marhta", "duane", "dicksonx"]})
        >>> df = df.with_column("similarity", jaro_similarity(df["x"], df["y"]))
        >>> df.collect()
        ╭────────┬──────────┬────────────────────╮
        │ x      ┆ y        ┆ similarity         │
        │ ---    ┆ ---      ┆ ---                │
        │ String ┆ String   ┆ Float64            │
        ╞════════╪══════════╪════════════════════╡
        │ martha ┆ marhta   ┆ 0.9444444444444445 │
        ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
        │ dwayne ┆ duane    ┆ 0.8222222222222223 │
        ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
        │ dixon  ┆ dicksonx ┆ 0.7666666666666666 │
        ╰────────┴──────────┴────────────────────╯
        <BLANKLINE>
        (Showing first 3 of 3 rows)
    """
    # Name under which the builtin scalar function is looked up.
    builtin_name = "jaro_similarity"
    return Expression._call_builtin_scalar_fn(builtin_name, left, right)


def jaro_winkler_similarity(left: Expression, right: Expression) -> Expression:
    """Compute the Jaro-Winkler similarity between two strings.

    Jaro-Winkler is the Jaro similarity plus a bonus for strings that share a
    common prefix (up to 4 characters). The score ranges from 0.0 (no
    similarity) to 1.0 (identical strings).

    Args:
        left: The left string expression to compare.
        right: The right string expression to compare against.

    Returns:
        The Jaro-Winkler similarity (0.0 to 1.0) for each pair of strings. Returns
        null when either input is null.

    Examples:
        >>> import daft
        >>> from daft.functions import jaro_winkler_similarity
        >>> df = daft.from_pydict({"x": ["martha", "dwayne", "dixon"], "y": ["marhta", "duane", "dicksonx"]})
        >>> df = df.with_column("similarity", jaro_winkler_similarity(df["x"], df["y"]))
        >>> df.collect()
        ╭────────┬──────────┬────────────────────╮
        │ x      ┆ y        ┆ similarity         │
        │ ---    ┆ ---      ┆ ---                │
        │ String ┆ String   ┆ Float64            │
        ╞════════╪══════════╪════════════════════╡
        │ martha ┆ marhta   ┆ 0.9611111111111111 │
        ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
        │ dwayne ┆ duane    ┆ 0.8400000000000001 │
        ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
        │ dixon  ┆ dicksonx ┆ 0.8133333333333332 │
        ╰────────┴──────────┴────────────────────╯
        <BLANKLINE>
        (Showing first 3 of 3 rows)
    """
    # Name under which the builtin scalar function is looked up.
    builtin_name = "jaro_winkler_similarity"
    return Expression._call_builtin_scalar_fn(builtin_name, left, right)


def damerau_levenshtein_distance(left: Expression, right: Expression) -> Expression:
    """Compute the Damerau-Levenshtein distance between two strings.

    On top of the insertions, deletions, and substitutions counted by the
    Levenshtein distance, swapping two adjacent characters also counts as a
    single edit operation.

    Note:
        This computes the Optimal String Alignment (OSA) variant, which does not
        allow a substring to be edited more than once. Results may differ from the
        true Damerau-Levenshtein distance for inputs with overlapping transpositions
        (e.g., ``"CA"`` to ``"ABC"`` is 3 under OSA but 2 under true
        Damerau-Levenshtein). OSA does not satisfy the triangle inequality.

    Args:
        left: The left string expression to compare.
        right: The right string expression to compare against.

    Returns:
        The Damerau-Levenshtein (OSA) distance for each pair of strings. Returns null
        when either input is null.

    Examples:
        >>> import daft
        >>> from daft.functions import damerau_levenshtein_distance
        >>> df = daft.from_pydict({"x": ["abc", "abc", ""], "y": ["bac", "acb", "abc"]})
        >>> df = df.with_column("distance", damerau_levenshtein_distance(df["x"], df["y"]))
        >>> df.collect()
        ╭────────┬────────┬──────────╮
        │ x      ┆ y      ┆ distance │
        │ ---    ┆ ---    ┆ ---      │
        │ String ┆ String ┆ Int64    │
        ╞════════╪════════╪══════════╡
        │ abc    ┆ bac    ┆ 1        │
        ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
        │ abc    ┆ acb    ┆ 1        │
        ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
        │        ┆ abc    ┆ 3        │
        ╰────────┴────────┴──────────╯
        <BLANKLINE>
        (Showing first 3 of 3 rows)
    """
    # Name under which the builtin scalar function is looked up.
    builtin_name = "damerau_levenshtein_distance"
    return Expression._call_builtin_scalar_fn(builtin_name, left, right)
97 changes: 97 additions & 0 deletions src/daft-functions-utf8/src/damerau_levenshtein.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
use daft_dsl::functions::{prelude::*, scalar::ScalarFn};
use serde::{Deserialize, Serialize};

use crate::utils::{binary_str_distance, binary_str_distance_to_field};

/// Compute the Damerau-Levenshtein distance (optimal string alignment variant).
///
/// This extends Levenshtein by also allowing transposition of two adjacent
/// characters as a single edit operation. Strings are compared per Unicode
/// scalar value (`char`), not per byte.
///
/// Returns the number of edits needed to turn `left` into `right`. If either
/// string is empty the result is the character count of the other one.
///
/// Runs in O(n * m) time and O(m) extra memory, where `n` and `m` are the
/// character counts of `left` and `right`: the recurrence only ever reads the
/// current row and the two rows before it, so three rolling rows replace the
/// full (n + 1) x (m + 1) matrix.
fn compute_damerau_levenshtein_distance(left: &str, right: &str) -> i64 {
    let left_chars: Vec<char> = left.chars().collect();
    let right_chars: Vec<char> = right.chars().collect();

    if left_chars.is_empty() {
        return right_chars.len() as i64;
    }
    if right_chars.is_empty() {
        return left_chars.len() as i64;
    }

    let width = right_chars.len() + 1;

    // Rolling window over the DP table:
    //   two_back = row i - 2 (only read by the transposition case),
    //   previous = row i - 1,
    //   current  = row i (being filled in).
    // Row 0 is the cost of building each prefix of `right` from the empty string.
    let mut two_back: Vec<i64> = vec![0; width];
    let mut previous: Vec<i64> = (0..width as i64).collect();
    let mut current: Vec<i64> = vec![0; width];

    for (i, &left_ch) in left_chars.iter().enumerate() {
        // Column 0: cost of deleting the first i + 1 characters of `left`.
        current[0] = (i + 1) as i64;

        for (j, &right_ch) in right_chars.iter().enumerate() {
            let cost = i64::from(left_ch != right_ch);

            let mut best = (previous[j + 1] + 1) // deletion
                .min(current[j] + 1) // insertion
                .min(previous[j] + cost); // substitution

            // Transposition: the last two characters of both prefixes are swapped.
            if i > 0 && j > 0 && left_ch == right_chars[j - 1] && left_chars[i - 1] == right_ch {
                best = best.min(two_back[j - 1] + 1);
            }

            current[j + 1] = best;
        }

        // Slide the window down one row. The buffer that ends up as `current`
        // holds stale values, but every cell is overwritten before it is read.
        std::mem::swap(&mut two_back, &mut previous);
        std::mem::swap(&mut previous, &mut current);
    }

    // After the final rotation `previous` holds the last computed row.
    previous[width - 1]
}

/// Scalar function exposing the Damerau-Levenshtein (OSA) string distance.
///
/// Unit struct: it carries no state and only serves as the `ScalarUDF`
/// implementation registered under the name `damerau_levenshtein_distance`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub struct DamerauLevenshteinDistance;

#[typetag::serde]
impl ScalarUDF for DamerauLevenshteinDistance {
    /// Name of the function; also passed to the shared helpers below.
    fn name(&self) -> &'static str {
        "damerau_levenshtein_distance"
    }

    /// Evaluate the function by handing the inputs to `binary_str_distance`
    /// with `compute_damerau_levenshtein_distance` as the distance function
    /// and `Int64` as the output type.
    // NOTE(review): null handling and argument validation presumably live in
    // `binary_str_distance` - confirm against `crate::utils`.
    fn call(
        &self,
        inputs: FunctionArgs<Series>,
        _ctx: &daft_dsl::functions::scalar::EvalContext,
    ) -> DaftResult<Series> {
        let fn_name = self.name();
        let output_dtype = DataType::Int64;

        binary_str_distance::<daft_core::datatypes::Int64Type, _>(
            inputs,
            fn_name,
            output_dtype,
            compute_damerau_levenshtein_distance,
        )
    }

    /// Resolve the output field via `binary_str_distance_to_field`, requesting
    /// an `Int64` result type.
    fn get_return_field(
        &self,
        inputs: FunctionArgs<ExprRef>,
        schema: &Schema,
    ) -> DaftResult<Field> {
        let fn_name = self.name();
        let output_dtype = DataType::Int64;

        binary_str_distance_to_field(inputs, schema, fn_name, output_dtype)
    }

    /// Human-readable description of the function.
    fn docstring(&self) -> &'static str {
        "Compute the Damerau-Levenshtein distance between two strings. This extends the \
         Levenshtein distance by also counting transpositions of two adjacent characters \
         as a single edit operation. This computes the Optimal String Alignment (OSA) \
         variant, which may differ from true Damerau-Levenshtein for inputs with \
         overlapping transpositions. Returns null when either input is null."
    }
}

/// Build an expression that applies [`DamerauLevenshteinDistance`] to `left`
/// and `right`, passed in that order as the function's two arguments.
#[must_use]
pub fn damerau_levenshtein_distance(left: ExprRef, right: ExprRef) -> ExprRef {
    let args = vec![left, right];
    let scalar_fn = ScalarFn::builtin(DamerauLevenshteinDistance, args);
    scalar_fn.into()
}
Loading