From 2f2a6e175b4c8af62cbc78b97af24a8648141074 Mon Sep 17 00:00:00 2001 From: Li Jiajia Date: Thu, 11 Jun 2026 14:51:11 +0800 Subject: [PATCH 1/3] fix(iceberg): default https scheme for scheme-less s3.endpoint in IOConfig conversion --- daft/io/iceberg/_iceberg.py | 8 ++++- tests/io/iceberg/test_iceberg_io_config.py | 36 ++++++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/daft/io/iceberg/_iceberg.py b/daft/io/iceberg/_iceberg.py index 5cc6108c7eb..4906078d00b 100644 --- a/daft/io/iceberg/_iceberg.py +++ b/daft/io/iceberg/_iceberg.py @@ -45,9 +45,15 @@ def get_first_property_value(*property_names: str) -> Any | None: is_oss = location is not None and get_protocol_from_path(location) == "oss" + # Some catalogs vend `s3.endpoint` as a bare host. Daft's S3 client requires + # a full URI, so default the scheme to https (PyArrow's default). + endpoint_url = get_first_property_value("s3.endpoint") + if endpoint_url is not None and "://" not in endpoint_url: + endpoint_url = f"https://{endpoint_url}" + io_config = IOConfig( s3=S3Config( - endpoint_url=get_first_property_value("s3.endpoint"), + endpoint_url=endpoint_url, region_name=get_first_property_value("s3.region", "client.region"), key_id=get_first_property_value("s3.access-key-id", "client.access-key-id"), access_key=get_first_property_value("s3.secret-access-key", "client.secret-access-key"), diff --git a/tests/io/iceberg/test_iceberg_io_config.py b/tests/io/iceberg/test_iceberg_io_config.py index 1f9738b111e..22a6598a902 100644 --- a/tests/io/iceberg/test_iceberg_io_config.py +++ b/tests/io/iceberg/test_iceberg_io_config.py @@ -54,3 +54,39 @@ def test_no_location_with_props(): def test_no_location_no_props_returns_none(): """With no location (default) and no properties, no IOConfig is produced.""" assert _convert_iceberg_file_io_properties_to_io_config({}) is None + + +def test_schemeless_endpoint_gets_https_scheme(): + """A vended endpoint without a URI scheme is defaulted to https. + + Some catalogs vend `s3.endpoint` as a bare host; Daft's S3 client requires + a full URI. + """ + props = {"s3.endpoint": "s3.us-west-2.example.com"} + result = _convert_iceberg_file_io_properties_to_io_config(props, "s3://my-bucket/warehouse/db/table") + assert result is not None + assert result.s3.endpoint_url == "https://s3.us-west-2.example.com" + + +def test_schemeless_endpoint_with_trailing_slash(): + """A bare-host endpoint with a trailing slash also gets the https scheme.""" + props = {"s3.endpoint": "s3.us-west-2.example.com/"} + result = _convert_iceberg_file_io_properties_to_io_config(props, "s3://my-bucket/warehouse/db/table") + assert result is not None + assert result.s3.endpoint_url == "https://s3.us-west-2.example.com/" + + +def test_schemeless_endpoint_with_port(): + """A bare host:port endpoint also gets the https scheme.""" + props = {"s3.endpoint": "minio.example.com:9000"} + result = _convert_iceberg_file_io_properties_to_io_config(props, "s3://my-bucket/warehouse/db/table") + assert result is not None + assert result.s3.endpoint_url == "https://minio.example.com:9000" + + +def test_endpoint_with_scheme_unchanged(): + """Endpoints that already carry a scheme are passed through untouched.""" + props = {"s3.endpoint": "http://minio.example.com:9000"} + result = _convert_iceberg_file_io_properties_to_io_config(props, "s3://my-bucket/warehouse/db/table") + assert result is not None + assert result.s3.endpoint_url == "http://minio.example.com:9000" From 228efac366e01277e63d57cfd5375c474d00483f Mon Sep 17 00:00:00 2001 From: Li Jiajia Date: Sun, 14 Jun 2026 23:30:46 +0800 Subject: [PATCH 2/3] fix(io): default https scheme in S3 endpoint normalization --- daft/io/iceberg/_iceberg.py | 8 +- src/daft-io/src/s3_like.rs | 114 ++++++++++++++++----- tests/io/iceberg/test_iceberg_io_config.py | 36 ------- 3 files changed, 91 insertions(+), 67 deletions(-) diff --git a/daft/io/iceberg/_iceberg.py b/daft/io/iceberg/_iceberg.py index 4906078d00b..5cc6108c7eb 100644 --- a/daft/io/iceberg/_iceberg.py +++ b/daft/io/iceberg/_iceberg.py @@ -45,15 +45,9 @@ def get_first_property_value(*property_names: str) -> Any | None: is_oss = location is not None and get_protocol_from_path(location) == "oss" - # Some catalogs vend `s3.endpoint` as a bare host. Daft's S3 client requires - # a full URI, so default the scheme to https (PyArrow's default). - endpoint_url = get_first_property_value("s3.endpoint") - if endpoint_url is not None and "://" not in endpoint_url: - endpoint_url = f"https://{endpoint_url}" - io_config = IOConfig( s3=S3Config( - endpoint_url=endpoint_url, + endpoint_url=get_first_property_value("s3.endpoint"), region_name=get_first_property_value("s3.region", "client.region"), key_id=get_first_property_value("s3.access-key-id", "client.access-key-id"), access_key=get_first_property_value("s3.secret-access-key", "client.secret-access-key"), diff --git a/src/daft-io/src/s3_like.rs b/src/daft-io/src/s3_like.rs index a9f48ce2a8e..73953b36e58 100644 --- a/src/daft-io/src/s3_like.rs +++ b/src/daft-io/src/s3_like.rs @@ -472,6 +472,34 @@ async fn provide_credentials_with_retry( Ok(creds) } +/// Normalize an S3-compatible endpoint URL: default the scheme to `https` when +/// absent (a bare host otherwise fails URI parsing and the AWS SDK rejects it), +/// and ensure a trailing slash for correct path construction. +fn normalize_endpoint_url(url_str: &str) -> String { + // Default to https before parsing so a bare host also gets the trailing slash. + let url_str = if url_str.contains("://") { + url_str.to_string() + } else { + format!("https://{url_str}") + }; + match url::Url::parse(&url_str) { + Ok(mut parsed_url) => { + if !parsed_url.path().ends_with('/') { + parsed_url.set_path(&format!("{}/", parsed_url.path())); + } + parsed_url.to_string() + } + // Fall back to a simple trailing-slash append if parsing still fails. + Err(_) => { + if url_str.ends_with('/') { + url_str + } else { + format!("{url_str}/") + } + } + } +} + async fn build_s3_conf(config: &S3Config) -> super::Result { const DEFAULT_REGION: Region = Region::from_static("us-east-1"); @@ -626,29 +654,12 @@ async fn build_s3_conf(config: &S3Config) -> super::Result { maybe_set_loader_value!(profile_name, &config.profile_name); - // Ensure endpoint URL has trailing slash for proper path construction with S3-compatible services. - // E.g., Supabase provides https://[project_ref].storage.supabase.co/storage/v1/s3 - // but we need to add a trailing slash to the endpoint url so that the path is properly constructed. - let endpoint_url = config.endpoint_url.as_ref().map(|url_str| { - // Parse the URL to properly handle query strings and fragments - match url::Url::parse(url_str) { - Ok(mut parsed_url) => { - // Only add trailing slash if the path doesn't already have one - if !parsed_url.path().ends_with('/') { - parsed_url.set_path(&format!("{}/", parsed_url.path())); - } - parsed_url.to_string() - } - // If parsing fails, fall back to simple string append for backwards compatibility - Err(_) => { - if url_str.ends_with('/') { - url_str.clone() - } else { - format!("{}/", url_str) - } - } - } - }); + // Normalize the endpoint URL (default scheme to https, ensure trailing slash) + // so the AWS SDK receives a valid URI for S3-compatible services. + let endpoint_url = config + .endpoint_url + .as_ref() + .map(|url_str| normalize_endpoint_url(url_str)); maybe_set_loader_value!(endpoint_url, &endpoint_url); maybe_set_loader_value!(identity_cache, identity_cache); @@ -1816,7 +1827,62 @@ mod tests { use common_io_config::S3Config; - use crate::{Result, S3LikeSource, integrations::test_full_get, object_io::ObjectSource}; + use crate::{ + Result, S3LikeSource, integrations::test_full_get, object_io::ObjectSource, + s3_like::normalize_endpoint_url, + }; + + #[test] + fn test_normalize_endpoint_url_defaults_https_scheme() { + // A bare host (e.g. vended by an Iceberg REST catalog) gets an https scheme. + assert_eq!( + normalize_endpoint_url("s3.example.com"), + "https://s3.example.com/" + ); + } + + #[test] + fn test_normalize_endpoint_url_bare_host_with_trailing_slash() { + assert_eq!( + normalize_endpoint_url("s3.example.com/"), + "https://s3.example.com/" + ); + } + + #[test] + fn test_normalize_endpoint_url_bare_host_with_port() { + assert_eq!( + normalize_endpoint_url("minio.example.com:9000"), + "https://minio.example.com:9000/" + ); + } + + #[test] + fn test_normalize_endpoint_url_preserves_scheme() { + // An explicit http scheme is kept (the trailing slash is still added). + assert_eq!( + normalize_endpoint_url("http://minio.example.com:9000"), + "http://minio.example.com:9000/" + ); + } + + #[test] + fn test_normalize_endpoint_url_keeps_existing_trailing_slash() { + // Supabase-style endpoint already ending in a slash is left as-is (#5575). + assert_eq!( + normalize_endpoint_url("https://abc.storage.supabase.co/storage/v1/s3/"), + "https://abc.storage.supabase.co/storage/v1/s3/" + ); + } + + #[test] + fn test_normalize_endpoint_url_adds_trailing_slash_to_path() { + // Supabase-style endpoint without the trailing slash gets one (#5575). + assert_eq!( + normalize_endpoint_url("https://abc.storage.supabase.co/storage/v1/s3"), + "https://abc.storage.supabase.co/storage/v1/s3/" + ); + } #[tokio::test] async fn test_full_get_from_s3() -> Result<()> { diff --git a/tests/io/iceberg/test_iceberg_io_config.py b/tests/io/iceberg/test_iceberg_io_config.py index 22a6598a902..1f9738b111e 100644 --- a/tests/io/iceberg/test_iceberg_io_config.py +++ b/tests/io/iceberg/test_iceberg_io_config.py @@ -54,39 +54,3 @@ def test_no_location_with_props(): def test_no_location_no_props_returns_none(): """With no location (default) and no properties, no IOConfig is produced.""" assert _convert_iceberg_file_io_properties_to_io_config({}) is None - - -def test_schemeless_endpoint_gets_https_scheme(): - """A vended endpoint without a URI scheme is defaulted to https. - - Some catalogs vend `s3.endpoint` as a bare host; Daft's S3 client requires - a full URI. - """ - props = {"s3.endpoint": "s3.us-west-2.example.com"} - result = _convert_iceberg_file_io_properties_to_io_config(props, "s3://my-bucket/warehouse/db/table") - assert result is not None - assert result.s3.endpoint_url == "https://s3.us-west-2.example.com" - - -def test_schemeless_endpoint_with_trailing_slash(): - """A bare-host endpoint with a trailing slash also gets the https scheme.""" - props = {"s3.endpoint": "s3.us-west-2.example.com/"} - result = _convert_iceberg_file_io_properties_to_io_config(props, "s3://my-bucket/warehouse/db/table") - assert result is not None - assert result.s3.endpoint_url == "https://s3.us-west-2.example.com/" - - -def test_schemeless_endpoint_with_port(): - """A bare host:port endpoint also gets the https scheme.""" - props = {"s3.endpoint": "minio.example.com:9000"} - result = _convert_iceberg_file_io_properties_to_io_config(props, "s3://my-bucket/warehouse/db/table") - assert result is not None - assert result.s3.endpoint_url == "https://minio.example.com:9000" - - -def test_endpoint_with_scheme_unchanged(): - """Endpoints that already carry a scheme are passed through untouched.""" - props = {"s3.endpoint": "http://minio.example.com:9000"} - result = _convert_iceberg_file_io_properties_to_io_config(props, "s3://my-bucket/warehouse/db/table") - assert result is not None - assert result.s3.endpoint_url == "http://minio.example.com:9000" From 5e57f02f852060f045ea6227751f6c621da45b54 Mon Sep 17 00:00:00 2001 From: Li Jiajia Date: Wed, 17 Jun 2026 13:19:56 +0800 Subject: [PATCH 3/3] test(io): cover the malformed-endpoint fallback in normalize_endpoint_url --- src/daft-io/src/s3_like.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/daft-io/src/s3_like.rs b/src/daft-io/src/s3_like.rs index 73953b36e58..b64c702fb72 100644 --- a/src/daft-io/src/s3_like.rs +++ b/src/daft-io/src/s3_like.rs @@ -1884,6 +1884,14 @@ mod tests { ); } + #[test] + fn test_normalize_endpoint_url_malformed_falls_back() { + // Even with the scheme prepended, an empty or invalid host still fails + // `Url::parse`, so the `Err(_)` fallback is exercised (and must not panic). + assert_eq!(normalize_endpoint_url(""), "https://"); + assert_eq!(normalize_endpoint_url("a b.com"), "https://a b.com/"); + } + #[tokio::test] async fn test_full_get_from_s3() -> Result<()> { let parquet_file_path = "s3://daft-public-data/test_fixtures/parquet_small/0dad4c3f-da0d-49db-90d8-98684571391b-0.parquet";