Skip to content
Open
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 90 additions & 24 deletions src/daft-io/src/s3_like.rs
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,34 @@ async fn provide_credentials_with_retry(
Ok(creds)
}

/// Normalize an S3-compatible endpoint URL: default the scheme to `https` when
/// absent (a bare host otherwise fails URI parsing and the AWS SDK rejects it),
/// and ensure a trailing slash for correct path construction.
fn normalize_endpoint_url(url_str: &str) -> String {
// Default to https before parsing so a bare host also gets the trailing slash.
let url_str = if url_str.contains("://") {
url_str.to_string()
} else {
format!("https://{url_str}")
};
match url::Url::parse(&url_str) {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now that a prefix is appended beforehand, should we expect URL parsing to always succeed?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@srilman Thanks for catching that — not always: https:// + an empty or invalid host (e.g. "", "a b.com") still fails Url::parse, so the fallback stays. Added a test for exactly those cases.

Ok(mut parsed_url) => {
if !parsed_url.path().ends_with('/') {
parsed_url.set_path(&format!("{}/", parsed_url.path()));
}
parsed_url.to_string()
}
// Fall back to a simple trailing-slash append if parsing still fails.
Err(_) => {
if url_str.ends_with('/') {
url_str
} else {
format!("{url_str}/")
}
}
}
}

async fn build_s3_conf(config: &S3Config) -> super::Result<s3::Config> {
const DEFAULT_REGION: Region = Region::from_static("us-east-1");

Expand Down Expand Up @@ -626,29 +654,12 @@ async fn build_s3_conf(config: &S3Config) -> super::Result<s3::Config> {

maybe_set_loader_value!(profile_name, &config.profile_name);

// Ensure endpoint URL has trailing slash for proper path construction with S3-compatible services.
// E.g., Supabase provides https://[project_ref].storage.supabase.co/storage/v1/s3
// but we need to add a trailing slash to the endpoint url so that the path is properly constructed.
let endpoint_url = config.endpoint_url.as_ref().map(|url_str| {
// Parse the URL to properly handle query strings and fragments
match url::Url::parse(url_str) {
Ok(mut parsed_url) => {
// Only add trailing slash if the path doesn't already have one
if !parsed_url.path().ends_with('/') {
parsed_url.set_path(&format!("{}/", parsed_url.path()));
}
parsed_url.to_string()
}
// If parsing fails, fall back to simple string append for backwards compatibility
Err(_) => {
if url_str.ends_with('/') {
url_str.clone()
} else {
format!("{}/", url_str)
}
}
}
});
// Normalize the endpoint URL (default scheme to https, ensure trailing slash)
// so the AWS SDK receives a valid URI for S3-compatible services.
let endpoint_url = config
.endpoint_url
.as_ref()
.map(|url_str| normalize_endpoint_url(url_str));
maybe_set_loader_value!(endpoint_url, &endpoint_url);

maybe_set_loader_value!(identity_cache, identity_cache);
Expand Down Expand Up @@ -1816,7 +1827,62 @@ mod tests {

use common_io_config::S3Config;

use crate::{Result, S3LikeSource, integrations::test_full_get, object_io::ObjectSource};
use crate::{
Result, S3LikeSource, integrations::test_full_get, object_io::ObjectSource,
s3_like::normalize_endpoint_url,
};

#[test]
fn test_normalize_endpoint_url_defaults_https_scheme() {
// A bare host (e.g. vended by an Iceberg REST catalog) gets an https scheme.
assert_eq!(
normalize_endpoint_url("s3.example.com"),
"https://s3.example.com/"
);
}

#[test]
fn test_normalize_endpoint_url_bare_host_with_trailing_slash() {
assert_eq!(
normalize_endpoint_url("s3.example.com/"),
"https://s3.example.com/"
);
}

#[test]
fn test_normalize_endpoint_url_bare_host_with_port() {
assert_eq!(
normalize_endpoint_url("minio.example.com:9000"),
"https://minio.example.com:9000/"
);
}

#[test]
fn test_normalize_endpoint_url_preserves_scheme() {
// An explicit http scheme is kept (the trailing slash is still added).
assert_eq!(
normalize_endpoint_url("http://minio.example.com:9000"),
"http://minio.example.com:9000/"
);
}

#[test]
fn test_normalize_endpoint_url_keeps_existing_trailing_slash() {
// Supabase-style endpoint already ending in a slash is left as-is (#5575).
assert_eq!(
normalize_endpoint_url("https://abc.storage.supabase.co/storage/v1/s3/"),
"https://abc.storage.supabase.co/storage/v1/s3/"
);
}

#[test]
fn test_normalize_endpoint_url_adds_trailing_slash_to_path() {
// Supabase-style endpoint without the trailing slash gets one (#5575).
assert_eq!(
normalize_endpoint_url("https://abc.storage.supabase.co/storage/v1/s3"),
"https://abc.storage.supabase.co/storage/v1/s3/"
);
}

#[tokio::test]
async fn test_full_get_from_s3() -> Result<()> {
Expand Down
Loading