From b05f59b9a2f120815a317c73cdf1b3b7998b005d Mon Sep 17 00:00:00 2001 From: EJ Campbell Date: Wed, 3 Dec 2025 02:13:13 -0800 Subject: [PATCH 01/19] Remove all set_creds() calls from sync_io.rs set_creds() uses setfsuid()/setfsgid() to switch credentials, but this breaks /proc/self/fd/ access which fuse-backend-rs uses internally via readlinkat to resolve inode paths. fuse-pipe handles credential switching in its own wrapper layer via CredentialsGuard, so these calls are not needed in the passthrough layer. Removed set_creds() from: - mkdir (line 433) - create (lines 565, 587) - mknod (line 934) - symlink (line 999) --- src/passthrough/sync_io.rs | 50 +++++++++++++------------------------- 1 file changed, 17 insertions(+), 33 deletions(-) diff --git a/src/passthrough/sync_io.rs b/src/passthrough/sync_io.rs index d795f308..7739418d 100644 --- a/src/passthrough/sync_io.rs +++ b/src/passthrough/sync_io.rs @@ -429,13 +429,9 @@ impl FileSystem for PassthroughFs { let data = self.inode_map.get(parent)?; - let res = { - let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?; - - let file = data.get_file()?; - // Safe because this doesn't modify any memory and we check the return value. - unsafe { libc::mkdirat(file.as_raw_fd(), name.as_ptr(), mode & !umask) } - }; + let file = data.get_file()?; + // Safe because this doesn't modify any memory and we check the return value. + let res = unsafe { libc::mkdirat(file.as_raw_fd(), name.as_ptr(), mode & !umask) }; if res < 0 { return Err(io::Error::last_os_error()); } @@ -561,12 +557,9 @@ impl FileSystem for PassthroughFs { let dir = self.inode_map.get(parent)?; let dir_file = dir.get_file()?; - let new_file = { - let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?; - - let flags = self.get_writeback_open_flags(args.flags as i32); - Self::create_file_excl(&dir_file, name, flags, args.mode & !(args.umask & 0o777))? 
- }; + let flags = self.get_writeback_open_flags(args.flags as i32); + let new_file = + Self::create_file_excl(&dir_file, name, flags, args.mode & !(args.umask & 0o777))?; let entry = self.do_lookup(parent, name)?; let file = match new_file { @@ -584,7 +577,6 @@ impl FileSystem for PassthroughFs { None }; - let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?; self.open_inode(entry.inode, args.flags as i32)? } }; @@ -930,18 +922,14 @@ impl FileSystem for PassthroughFs { let data = self.inode_map.get(parent)?; let file = data.get_file()?; - let res = { - let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?; - - // Safe because this doesn't modify any memory and we check the return value. - unsafe { - libc::mknodat( - file.as_raw_fd(), - name.as_ptr(), - (mode & !umask) as libc::mode_t, - u64::from(rdev), - ) - } + // Safe because this doesn't modify any memory and we check the return value. + let res = unsafe { + libc::mknodat( + file.as_raw_fd(), + name.as_ptr(), + (mode & !umask) as libc::mode_t, + u64::from(rdev), + ) }; if res < 0 { Err(io::Error::last_os_error()) @@ -994,14 +982,10 @@ impl FileSystem for PassthroughFs { self.validate_path_component(name)?; let data = self.inode_map.get(parent)?; + let file = data.get_file()?; - let res = { - let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?; - - let file = data.get_file()?; - // Safe because this doesn't modify any memory and we check the return value. - unsafe { libc::symlinkat(linkname.as_ptr(), file.as_raw_fd(), name.as_ptr()) } - }; + // Safe because this doesn't modify any memory and we check the return value. 
+ let res = unsafe { libc::symlinkat(linkname.as_ptr(), file.as_raw_fd(), name.as_ptr()) }; if res == 0 { self.do_lookup(parent, name) } else { From b206a873c71e239d2d6256aa1ceb4acae8f73745 Mon Sep 17 00:00:00 2001 From: EJ Campbell Date: Sun, 7 Dec 2025 10:00:53 -0800 Subject: [PATCH 02/19] Replace setresuid/setresgid with setfsuid/setfsgid for credential switching The previous implementation used setresuid/setresgid which change the real/effective UID/GID. This breaks access to /proc/self/fd/ because after switching to a non-root user, the process can no longer access files owned by root (euid). This change replaces the scoped_cred! macro with ScopedCreds which uses setfsuid/setfsgid instead. These syscalls ONLY affect filesystem access checks and do NOT change the real/effective UID, so /proc/self/fd/ access is preserved. Key advantages of setfsuid/setfsgid: - Inherently per-thread (no glibc signal synchronization) - Only affects filesystem permission checks - Process retains root privileges for /proc access - Designed specifically for file servers serving multiple users Also restores set_creds() calls to mkdir, create, mknod, and symlink so that created files are owned by the calling user, not root. --- src/passthrough/mod.rs | 131 +++++++++++++++++++++---------------- src/passthrough/sync_io.rs | 8 +++ 2 files changed, 84 insertions(+), 55 deletions(-) diff --git a/src/passthrough/mod.rs b/src/passthrough/mod.rs index 0f0e5408..1160b7b5 100644 --- a/src/passthrough/mod.rs +++ b/src/passthrough/mod.rs @@ -888,66 +888,87 @@ impl BackendFileSystem for PassthroughFs } } -macro_rules! scoped_cred { - ($name:ident, $ty:ty, $syscall_nr:expr) => { - #[derive(Debug)] - pub(crate) struct $name; - - impl $name { - // Changes the effective uid/gid of the current thread to `val`. Changes - // the thread's credentials back to root when the returned struct is dropped. - fn new(val: $ty) -> io::Result> { - if val == 0 { - // Nothing to do since we are already uid 0. 
- return Ok(None); - } +/// RAII guard for temporarily switching filesystem uid/gid credentials. +/// +/// Uses setfsuid()/setfsgid() syscalls which ONLY affect filesystem access +/// checks for the calling thread. Unlike setresuid/setresgid, these do NOT +/// change the real/effective uid, so the process retains access to /proc/self/fd/. +/// +/// This is critical because fuse-backend-rs internally uses readlinkat on +/// /proc/self/fd/ to resolve inode paths - if we changed euid to non-root, +/// those operations would fail with EACCES. +/// +/// # Thread Safety +/// +/// setfsuid() and setfsgid() are inherently per-thread on Linux - they do not +/// use glibc's signal-based synchronization that setresuid/setresgid use to +/// comply with POSIX. This makes them ideal for multi-threaded file servers +/// that need to serve multiple users concurrently in different threads. +/// +/// See: https://man7.org/linux/man-pages/man2/setfsuid.2.html +/// See: https://stackoverflow.com/questions/1223600/change-uid-gid-only-of-one-thread-in-linux +#[derive(Debug)] +pub(crate) struct ScopedCreds { + original_fsuid: libc::uid_t, + original_fsgid: libc::gid_t, +} - // We want credential changes to be per-thread because otherwise - // we might interfere with operations being carried out on other - // threads with different uids/gids. However, posix requires that - // all threads in a process share the same credentials. To do this - // libc uses signals to ensure that when one thread changes its - // credentials the other threads do the same thing. - // - // So instead we invoke the syscall directly in order to get around - // this limitation. Another option is to use the setfsuid and - // setfsgid systems calls. However since those calls have no way to - // return an error, it's preferable to do this instead. - - // This call is safe because it doesn't modify any memory and we - // check the return value. 
- let res = unsafe { libc::syscall($syscall_nr, -1, val, -1) }; - if res == 0 { - Ok(Some($name)) - } else { - Err(io::Error::last_os_error()) - } - } +impl ScopedCreds { + /// Switch filesystem credentials to the given uid/gid. + /// Returns None if both uid and gid are 0 (already root). + fn new(uid: libc::uid_t, gid: libc::gid_t) -> io::Result> { + if uid == 0 && gid == 0 { + // Nothing to do since we are already uid/gid 0. + return Ok(None); } - impl Drop for $name { - fn drop(&mut self) { - let res = unsafe { libc::syscall($syscall_nr, -1, 0, -1) }; - if res < 0 { - error!( - "fuse: failed to change credentials back to root: {}", - io::Error::last_os_error(), - ); - } - } + // setfsuid/setfsgid return the PREVIOUS value, not an error code. + // To detect errors, we call twice: first to set, then to verify. + + // Change gid first (same reason as before - changing uid might drop privileges) + let original_fsgid = unsafe { libc::setfsgid(gid) } as libc::gid_t; + // Verify the change took effect + let verify_gid = unsafe { libc::setfsgid(gid) } as libc::gid_t; + if verify_gid != gid { + // Restore and return error + unsafe { libc::setfsgid(original_fsgid) }; + return Err(io::Error::new( + io::ErrorKind::PermissionDenied, + format!("setfsgid({}) failed, got {}", gid, verify_gid), + )); + } + + // Now change uid + let original_fsuid = unsafe { libc::setfsuid(uid) } as libc::uid_t; + // Verify the change took effect + let verify_uid = unsafe { libc::setfsuid(uid) } as libc::uid_t; + if verify_uid != uid { + // Restore both and return error + unsafe { libc::setfsgid(original_fsgid) }; + unsafe { libc::setfsuid(original_fsuid) }; + return Err(io::Error::new( + io::ErrorKind::PermissionDenied, + format!("setfsuid({}) failed, got {}", uid, verify_uid), + )); } - }; + + Ok(Some(ScopedCreds { + original_fsuid, + original_fsgid, + })) + } } -scoped_cred!(ScopedUid, libc::uid_t, libc::SYS_setresuid); -scoped_cred!(ScopedGid, libc::gid_t, libc::SYS_setresgid); - -fn 
set_creds( - uid: libc::uid_t, - gid: libc::gid_t, -) -> io::Result<(Option, Option)> { - // We have to change the gid before we change the uid because if we change the uid first then we - // lose the capability to change the gid. However changing back can happen in any order. - ScopedGid::new(gid).and_then(|gid| Ok((ScopedUid::new(uid)?, gid))) + +impl Drop for ScopedCreds { + fn drop(&mut self) { + // Restore original credentials (order doesn't matter for restore) + unsafe { libc::setfsuid(self.original_fsuid) }; + unsafe { libc::setfsgid(self.original_fsgid) }; + } +} + +fn set_creds(uid: libc::uid_t, gid: libc::gid_t) -> io::Result> { + ScopedCreds::new(uid, gid) } struct CapFsetid {} diff --git a/src/passthrough/sync_io.rs b/src/passthrough/sync_io.rs index 7739418d..3892a45a 100644 --- a/src/passthrough/sync_io.rs +++ b/src/passthrough/sync_io.rs @@ -430,6 +430,8 @@ impl FileSystem for PassthroughFs { let data = self.inode_map.get(parent)?; let file = data.get_file()?; + // Switch to caller's credentials so the directory is owned by them + let _creds = set_creds(ctx.uid, ctx.gid)?; // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { libc::mkdirat(file.as_raw_fd(), name.as_ptr(), mode & !umask) }; if res < 0 { @@ -558,6 +560,8 @@ impl FileSystem for PassthroughFs { let dir_file = dir.get_file()?; let flags = self.get_writeback_open_flags(args.flags as i32); + // Switch to caller's credentials so the file is owned by them + let _creds = set_creds(ctx.uid, ctx.gid)?; let new_file = Self::create_file_excl(&dir_file, name, flags, args.mode & !(args.umask & 0o777))?; @@ -922,6 +926,8 @@ impl FileSystem for PassthroughFs { let data = self.inode_map.get(parent)?; let file = data.get_file()?; + // Switch to caller's credentials so the node is owned by them + let _creds = set_creds(ctx.uid, ctx.gid)?; // Safe because this doesn't modify any memory and we check the return value. 
let res = unsafe { libc::mknodat( @@ -984,6 +990,8 @@ impl FileSystem for PassthroughFs { let data = self.inode_map.get(parent)?; let file = data.get_file()?; + // Switch to caller's credentials so the symlink is owned by them + let _creds = set_creds(ctx.uid, ctx.gid)?; // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { libc::symlinkat(linkname.as_ptr(), file.as_raw_fd(), name.as_ptr()) }; if res == 0 { From 2548bbcaba349109bcc016dc1b13f2e91077c289 Mon Sep 17 00:00:00 2001 From: EJ Campbell Date: Sun, 7 Dec 2025 10:02:04 -0800 Subject: [PATCH 03/19] Add set_creds() to setattr for truncate and utimes These operations need to run as the calling user for proper permission checking: - truncate requires write permission - utimensat requires write permission or ownership chown and chmod continue to run as root (correct behavior). --- src/passthrough/sync_io.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/passthrough/sync_io.rs b/src/passthrough/sync_io.rs index 3892a45a..f6e084c1 100644 --- a/src/passthrough/sync_io.rs +++ b/src/passthrough/sync_io.rs @@ -725,7 +725,7 @@ impl FileSystem for PassthroughFs { fn setattr( &self, - _ctx: &Context, + ctx: &Context, inode: Inode, attr: libc::stat64, handle: Option, @@ -816,6 +816,10 @@ impl FileSystem for PassthroughFs { None }; + // Switch to caller's credentials for permission check + // (truncate requires write permission) + let _creds = set_creds(ctx.uid, ctx.gid)?; + // Safe because this doesn't modify any memory and we check the return value. let res = match data { Data::Handle(ref h) => unsafe { @@ -858,6 +862,10 @@ impl FileSystem for PassthroughFs { tvs[1].tv_nsec = attr.st_mtime_nsec; } + // Switch to caller's credentials for permission check + // (utimensat requires write permission or ownership) + let _creds = set_creds(ctx.uid, ctx.gid)?; + // Safe because this doesn't modify any memory and we check the return value. 
let res = match data { Data::Handle(ref h) => unsafe { From e8118aa611834fc5589a70faa90fc4590de304e2 Mon Sep 17 00:00:00 2001 From: EJ Campbell Date: Sun, 7 Dec 2025 10:22:40 -0800 Subject: [PATCH 04/19] Add set_creds() to all filesystem operations requiring permission checks Operations that modify filesystem state need to run with the caller's credentials so the kernel performs proper permission checks. Without this, operations like rmdir/unlink/rename succeed even when the caller doesn't have write permission to the parent directory. Added set_creds() to: - rmdir: permission check for directory removal - unlink: permission check for file removal - link: permission check for hard link creation - rename: permission check for renaming - open: permission check for file access - opendir: permission check for directory access This fixes 780+ pjdfstest failures where operations were bypassing POSIX permission checks. --- src/passthrough/sync_io.rs | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/src/passthrough/sync_io.rs b/src/passthrough/sync_io.rs index f6e084c1..d924284c 100644 --- a/src/passthrough/sync_io.rs +++ b/src/passthrough/sync_io.rs @@ -389,7 +389,7 @@ impl FileSystem for PassthroughFs { fn opendir( &self, - _ctx: &Context, + ctx: &Context, inode: Inode, flags: u32, ) -> io::Result<(Option, OpenOptions)> { @@ -397,6 +397,8 @@ impl FileSystem for PassthroughFs { info!("fuse: opendir is not supported."); Err(enosys()) } else { + // Switch to caller's credentials for permission check + let _creds = set_creds(ctx.uid, ctx.gid)?; self.do_open(inode, flags | (libc::O_DIRECTORY as u32), 0) .map(|(a, b, _)| (a, b)) } @@ -441,8 +443,10 @@ impl FileSystem for PassthroughFs { self.do_lookup(parent, name) } - fn rmdir(&self, _ctx: &Context, parent: Inode, name: &CStr) -> io::Result<()> { + fn rmdir(&self, ctx: &Context, parent: Inode, name: &CStr) -> io::Result<()> { self.validate_path_component(name)?; + // Switch to 
caller's credentials for permission check + let _creds = set_creds(ctx.uid, ctx.gid)?; self.do_unlink(parent, name, libc::AT_REMOVEDIR) } @@ -517,7 +521,7 @@ impl FileSystem for PassthroughFs { fn open( &self, - _ctx: &Context, + ctx: &Context, inode: Inode, flags: u32, fuse_flags: u32, @@ -526,6 +530,8 @@ impl FileSystem for PassthroughFs { info!("fuse: open is not supported."); Err(enosys()) } else { + // Switch to caller's credentials for permission check + let _creds = set_creds(ctx.uid, ctx.gid)?; self.do_open(inode, flags, fuse_flags) } } @@ -606,8 +612,10 @@ impl FileSystem for PassthroughFs { Ok((entry, ret_handle, opts, None)) } - fn unlink(&self, _ctx: &Context, parent: Inode, name: &CStr) -> io::Result<()> { + fn unlink(&self, ctx: &Context, parent: Inode, name: &CStr) -> io::Result<()> { self.validate_path_component(name)?; + // Switch to caller's credentials for permission check + let _creds = set_creds(ctx.uid, ctx.gid)?; self.do_unlink(parent, name, 0) } @@ -885,7 +893,7 @@ impl FileSystem for PassthroughFs { fn rename( &self, - _ctx: &Context, + ctx: &Context, olddir: Inode, oldname: &CStr, newdir: Inode, @@ -900,6 +908,9 @@ impl FileSystem for PassthroughFs { let old_file = old_inode.get_file()?; let new_file = new_inode.get_file()?; + // Switch to caller's credentials for permission check + let _creds = set_creds(ctx.uid, ctx.gid)?; + // Safe because this doesn't modify any memory and we check the return value. // TODO: Switch to libc::renameat2 once https://github.com/rust-lang/libc/pull/1508 lands // and we have glibc 2.28. @@ -954,7 +965,7 @@ impl FileSystem for PassthroughFs { fn link( &self, - _ctx: &Context, + ctx: &Context, inode: Inode, newparent: Inode, newname: &CStr, @@ -969,6 +980,9 @@ impl FileSystem for PassthroughFs { // Safe because this is a constant value and a valid C string. 
let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) }; + // Switch to caller's credentials for permission check + let _creds = set_creds(ctx.uid, ctx.gid)?; + // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { libc::linkat( From d248a4336062f5eb2f69ed5c89cb442d48ab3e03 Mon Sep 17 00:00:00 2001 From: EJ Campbell Date: Sun, 7 Dec 2025 10:32:47 -0800 Subject: [PATCH 05/19] Add set_creds() to chmod operation in setattr chmod requires the caller to be the file owner or root. Without switching credentials, chmod always succeeds when running as root, bypassing POSIX permission checks. --- src/passthrough/sync_io.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/passthrough/sync_io.rs b/src/passthrough/sync_io.rs index d924284c..d60ae37b 100644 --- a/src/passthrough/sync_io.rs +++ b/src/passthrough/sync_io.rs @@ -768,6 +768,10 @@ impl FileSystem for PassthroughFs { } if valid.contains(SetattrValid::MODE) { + // Switch to caller's credentials for permission check + // (only file owner or root can chmod) + let _creds = set_creds(ctx.uid, ctx.gid)?; + // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { match data { From c12f6d31a2dea7b2363c60c2fc4619d0efcb47c0 Mon Sep 17 00:00:00 2001 From: EJ Campbell Date: Sun, 7 Dec 2025 10:39:56 -0800 Subject: [PATCH 06/19] Add set_creds() to chown operation in setattr chown requires the caller to be root to change owner, or file owner to change group. Without switching credentials, chown always succeeds when running as root, bypassing POSIX permission checks. --- src/passthrough/sync_io.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/passthrough/sync_io.rs b/src/passthrough/sync_io.rs index d60ae37b..e3d28dda 100644 --- a/src/passthrough/sync_io.rs +++ b/src/passthrough/sync_io.rs @@ -803,6 +803,10 @@ impl FileSystem for PassthroughFs { // Safe because this is a constant value and a valid C string. 
let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) }; + // Switch to caller's credentials for permission check + // (only root can change owner, owner can change group) + let _creds = set_creds(ctx.uid, ctx.gid)?; + // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { libc::fchownat( From 87caf46d456edacae159db99201039e7e71f2958 Mon Sep 17 00:00:00 2001 From: EJ Campbell Date: Sun, 7 Dec 2025 10:47:35 -0800 Subject: [PATCH 07/19] Add set_creds() to lookup for directory search permission check Without this, lookup always runs as root and bypasses directory search (execute) permission checks. This caused chmod/truncate to succeed even when the user doesn't have search permission on a parent directory component. --- src/passthrough/sync_io.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/passthrough/sync_io.rs b/src/passthrough/sync_io.rs index e3d28dda..8e7d61cc 100644 --- a/src/passthrough/sync_io.rs +++ b/src/passthrough/sync_io.rs @@ -365,11 +365,13 @@ impl FileSystem for PassthroughFs { } } - fn lookup(&self, _ctx: &Context, parent: Inode, name: &CStr) -> io::Result { + fn lookup(&self, ctx: &Context, parent: Inode, name: &CStr) -> io::Result { // Don't use is_safe_path_component(), allow "." and ".." 
for NFS export support if name.to_bytes_with_nul().contains(&SLASH_ASCII) { return Err(einval()); } + // Switch to caller's credentials for directory search permission check + let _creds = set_creds(ctx.uid, ctx.gid)?; self.do_lookup(parent, name) } From 609391aa0405aad5547753133232ef7e6db91bf5 Mon Sep 17 00:00:00 2001 From: EJ Campbell Date: Sun, 7 Dec 2025 13:28:56 -0800 Subject: [PATCH 08/19] Fix link and ftruncate operations for FUSE default_permissions Link fix: - Remove set_creds() from link() function because linkat with AT_EMPTY_PATH requires CAP_DAC_READ_SEARCH capability which is lost when switching to non-root credentials - Permission checks are handled by the kernel via default_permissions mount option before the LINK request reaches the FUSE server Ftruncate fix: - When a file handle is provided (ftruncate on an open fd), don't switch credentials or re-check permissions - the permission was validated when the fd was opened - This allows ftruncate to succeed on an O_RDWR fd even if the file mode is 0, which is correct POSIX behavior SUID/SGID clearing fix: - Detect when the kernel is clearing SUID/SGID bits on write by non-owner - In this case, don't switch credentials because the operation is kernel- initiated and the writing user doesn't have chmod permission - This allows POSIX-compliant SUID/SGID clearing on write Also adds debug logging for setattr and write operations to aid debugging. --- src/passthrough/mod.rs | 13 ++++-- src/passthrough/sync_io.rs | 86 +++++++++++++++++++++++++++++++------- 2 files changed, 82 insertions(+), 17 deletions(-) diff --git a/src/passthrough/mod.rs b/src/passthrough/mod.rs index 1160b7b5..30fff9ff 100644 --- a/src/passthrough/mod.rs +++ b/src/passthrough/mod.rs @@ -917,8 +917,10 @@ impl ScopedCreds { /// Switch filesystem credentials to the given uid/gid. /// Returns None if both uid and gid are 0 (already root). 
fn new(uid: libc::uid_t, gid: libc::gid_t) -> io::Result> { + debug!("set_creds: switching to uid={} gid={}", uid, gid); if uid == 0 && gid == 0 { // Nothing to do since we are already uid/gid 0. + debug!("set_creds: uid=0 gid=0, nothing to do"); return Ok(None); } @@ -929,6 +931,7 @@ impl ScopedCreds { let original_fsgid = unsafe { libc::setfsgid(gid) } as libc::gid_t; // Verify the change took effect let verify_gid = unsafe { libc::setfsgid(gid) } as libc::gid_t; + debug!("set_creds: setfsgid({}) returned original={} verify={}", gid, original_fsgid, verify_gid); if verify_gid != gid { // Restore and return error unsafe { libc::setfsgid(original_fsgid) }; @@ -942,6 +945,7 @@ impl ScopedCreds { let original_fsuid = unsafe { libc::setfsuid(uid) } as libc::uid_t; // Verify the change took effect let verify_uid = unsafe { libc::setfsuid(uid) } as libc::uid_t; + debug!("set_creds: setfsuid({}) returned original={} verify={}", uid, original_fsuid, verify_uid); if verify_uid != uid { // Restore both and return error unsafe { libc::setfsgid(original_fsgid) }; @@ -952,6 +956,7 @@ impl ScopedCreds { )); } + debug!("set_creds: success, original_fsuid={} original_fsgid={}", original_fsuid, original_fsgid); Ok(Some(ScopedCreds { original_fsuid, original_fsgid, @@ -982,9 +987,11 @@ impl Drop for CapFsetid { } fn drop_cap_fsetid() -> io::Result> { - if !caps::has_cap(None, caps::CapSet::Effective, caps::Capability::CAP_FSETID) - .map_err(|_e| io::Error::new(io::ErrorKind::PermissionDenied, "no CAP_FSETID capability"))? 
- { + // Use unwrap_or(false) instead of propagating error - if we can't check + // capabilities, assume we don't have them and continue without error + let has_cap = caps::has_cap(None, caps::CapSet::Effective, caps::Capability::CAP_FSETID) + .unwrap_or(false); + if !has_cap { return Ok(None); } caps::drop(None, caps::CapSet::Effective, caps::Capability::CAP_FSETID).map_err(|_e| { diff --git a/src/passthrough/sync_io.rs b/src/passthrough/sync_io.rs index 8e7d61cc..6b350d1c 100644 --- a/src/passthrough/sync_io.rs +++ b/src/passthrough/sync_io.rs @@ -686,7 +686,7 @@ impl FileSystem for PassthroughFs { fn write( &self, - _ctx: &Context, + ctx: &Context, inode: Inode, handle: Handle, r: &mut dyn ZeroCopyReader, @@ -697,6 +697,20 @@ impl FileSystem for PassthroughFs { flags: u32, fuse_flags: u32, ) -> io::Result { + debug!("write: inode={} handle={} size={} uid={} gid={}", inode, handle, size, ctx.uid, ctx.gid); + // Switch to caller's credentials for the write operation. + // This is needed for proper SUID/SGID bit handling when a non-owner writes. + let _creds = match set_creds(ctx.uid, ctx.gid) { + Ok(c) => { + debug!("write: set_creds succeeded"); + c + } + Err(e) => { + debug!("write: set_creds FAILED: {:?}", e); + return Err(e); + } + }; + let data = self.get_data(handle, inode, libc::O_RDWR)?; // Manually implement File::try_clone() by borrowing fd of data.file instead of dup(). 
@@ -741,6 +755,8 @@ impl FileSystem for PassthroughFs { handle: Option, valid: SetattrValid, ) -> io::Result<(libc::stat64, Duration)> { + let no_open_val = self.no_open.load(Ordering::Relaxed); + debug!(target: "passthrough", "setattr inode={} handle={:?} valid={:?} no_open={}", inode, handle, valid, no_open_val); let inode_data = self.inode_map.get(inode)?; enum Data { @@ -770,9 +786,45 @@ impl FileSystem for PassthroughFs { } if valid.contains(SetattrValid::MODE) { - // Switch to caller's credentials for permission check - // (only file owner or root can chmod) - let _creds = set_creds(ctx.uid, ctx.gid)?; + // Get current mode to detect SUID/SGID clearing + let current_mode = { + let mut st: libc::stat64 = unsafe { std::mem::zeroed() }; + let res = unsafe { + match &data { + Data::Handle(h) => libc::fstat64(h.borrow_fd().as_raw_fd(), &mut st), + Data::ProcPath(p) => libc::fstatat64( + self.proc_self_fd.as_raw_fd(), + p.as_ptr(), + &mut st, + 0, + ), + } + }; + if res < 0 { + return Err(io::Error::last_os_error()); + } + st.st_mode + }; + + // Check if this is SUID/SGID clearing by the kernel (happens on write by non-owner) + // The kernel sends SETATTR with the new mode = old mode with S_ISUID/S_ISGID cleared. + // In this case, we should NOT switch credentials because: + // 1. The operation is kernel-initiated, not user-initiated + // 2. The writing user doesn't have permission to chmod (they're not the owner) + // 3. 
The SUID/SGID clearing must succeed for POSIX compliance + let old_special_bits = current_mode & (libc::S_ISUID | libc::S_ISGID); + let new_special_bits = attr.st_mode & (libc::S_ISUID | libc::S_ISGID); + let is_suid_sgid_clearing = + old_special_bits != 0 && new_special_bits == 0 && (current_mode & 0o777) == (attr.st_mode & 0o777); + + let _creds = if is_suid_sgid_clearing { + // Kernel clearing SUID/SGID - do as root + None + } else { + // User-initiated chmod - switch to caller's credentials for permission check + // (only file owner or root can chmod) + Some(set_creds(ctx.uid, ctx.gid)?) + }; // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { @@ -834,16 +886,19 @@ impl FileSystem for PassthroughFs { None }; - // Switch to caller's credentials for permission check - // (truncate requires write permission) - let _creds = set_creds(ctx.uid, ctx.gid)?; - // Safe because this doesn't modify any memory and we check the return value. let res = match data { - Data::Handle(ref h) => unsafe { - libc::ftruncate(h.borrow_fd().as_raw_fd(), attr.st_size) - }, + Data::Handle(ref h) => { + // ftruncate on an already-opened fd doesn't re-check file permissions. + // The permission was validated when the fd was opened, so we don't + // switch credentials here - ftruncate should succeed if the fd has + // write access, regardless of current file mode. + unsafe { libc::ftruncate(h.borrow_fd().as_raw_fd(), attr.st_size) } + } _ => { + // No file handle - need to open the file, which requires permission check. + // Switch to caller's credentials for this case. + let _creds = set_creds(ctx.uid, ctx.gid)?; // There is no `ftruncateat` so we need to get a new fd and truncate it. 
let f = self.open_inode(inode, libc::O_NONBLOCK | libc::O_RDWR)?; unsafe { libc::ftruncate(f.as_raw_fd(), attr.st_size) } @@ -975,7 +1030,7 @@ impl FileSystem for PassthroughFs { fn link( &self, - ctx: &Context, + _ctx: &Context, inode: Inode, newparent: Inode, newname: &CStr, @@ -990,8 +1045,11 @@ impl FileSystem for PassthroughFs { // Safe because this is a constant value and a valid C string. let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) }; - // Switch to caller's credentials for permission check - let _creds = set_creds(ctx.uid, ctx.gid)?; + // NOTE: We do NOT call set_creds() here because: + // 1. linkat with AT_EMPTY_PATH requires CAP_DAC_READ_SEARCH capability + // 2. set_creds() drops to non-root uid which loses CAP_DAC_READ_SEARCH + // 3. Permission checks should be done by the kernel via default_permissions + // mount option before the LINK request reaches the FUSE server // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { From ca5398a048c0fa658029b31c0b1992277bde85bc Mon Sep 17 00:00:00 2001 From: EJ Campbell Date: Sun, 7 Dec 2025 13:56:37 -0800 Subject: [PATCH 09/19] Add supplementary groups support via /proc parsing FUSE protocol only passes uid and primary gid in requests, not supplementary groups. With default_permissions mount option, the kernel checks permissions but doesn't consider the caller's supplementary groups. This commit implements the "gocryptfs workaround" (see rfjakob/gocryptfs): 1. Read caller's supplementary groups from /proc/<pid>/status 2. Call setgroups() to adopt those groups before the filesystem operation 3. Restore original groups when ScopedCreds is dropped IMPORTANT: We use raw SYS_setgroups/SYS_getgroups syscalls instead of the libc wrappers because glibc's NPTL signals ALL threads to change credentials for POSIX compliance. 
The raw kernel syscalls are per-thread, which is required for a multi-threaded FUSE server where each thread handles different users concurrently. See: https://man7.org/linux/man-pages/man7/nptl.7.html See: https://github.com/rfjakob/gocryptfs (uses same approach) This fixes 108 pjdfstest chown failures where operations like: -u 65534 -g 65532,65531 chown file -1 65531 were returning EPERM because the kernel only saw primary group 65532, not supplementary group 65531. Changes: - mod.rs: Add setgroups_thread(), getgroups_thread() using raw syscalls - mod.rs: Add parse_proc_groups(), get_current_groups() helper functions - mod.rs: Extend ScopedCreds to store and restore original_groups - mod.rs: Update set_creds() to accept pid parameter - sync_io.rs: Update all set_creds() calls to pass ctx.pid Result: All 8789 pjdfstest tests now pass on FUSE filesystem. --- src/passthrough/mod.rs | 147 +++++++++++++++++++++++++++++++++++-- src/passthrough/sync_io.rs | 30 ++++---- 2 files changed, 154 insertions(+), 23 deletions(-) diff --git a/src/passthrough/mod.rs b/src/passthrough/mod.rs index 30fff9ff..0efc0c4a 100644 --- a/src/passthrough/mod.rs +++ b/src/passthrough/mod.rs @@ -907,23 +907,139 @@ impl BackendFileSystem for PassthroughFs /// /// See: https://man7.org/linux/man-pages/man2/setfsuid.2.html /// See: https://stackoverflow.com/questions/1223600/change-uid-gid-only-of-one-thread-in-linux +/// +/// # Supplementary Groups +/// +/// FUSE protocol only passes uid and primary gid in requests, not supplementary groups. +/// With `default_permissions` mount option, the kernel checks permissions before the +/// request reaches the FUSE server, but these checks don't consider the caller's +/// supplementary groups because the kernel doesn't know them for FUSE operations. +/// +/// To fix this, we read the caller's supplementary groups from /proc//status +/// and adopt them using setgroups(). This is the "gocryptfs workaround" approach. 
+/// See: https://github.com/rfjakob/gocryptfs/commit/e74f48b #[derive(Debug)] pub(crate) struct ScopedCreds { original_fsuid: libc::uid_t, original_fsgid: libc::gid_t, + original_groups: Vec, +} + +/// Parse supplementary groups from /proc//status +/// +/// Returns empty Vec if the file cannot be read (e.g., process exited). +fn parse_proc_groups(pid: libc::pid_t) -> Vec { + // Try /proc//task//status first (more accurate for threads) + // Fall back to /proc//status + let status_path = format!("/proc/{}/task/{}/status", pid, pid); + let content = match std::fs::read_to_string(&status_path) { + Ok(c) => c, + Err(_) => { + let fallback_path = format!("/proc/{}/status", pid); + match std::fs::read_to_string(&fallback_path) { + Ok(c) => c, + Err(e) => { + debug!("parse_proc_groups: failed to read /proc/{}/status: {}", pid, e); + return Vec::new(); + } + } + } + }; + + // Look for line: "Groups:\t1000 1001 1002" + for line in content.lines() { + if let Some(groups_str) = line.strip_prefix("Groups:") { + let groups: Vec = groups_str + .split_whitespace() + .filter_map(|s| s.parse().ok()) + .collect(); + debug!("parse_proc_groups: pid={} groups={:?}", pid, groups); + return groups; + } + } + + debug!("parse_proc_groups: pid={} no Groups line found", pid); + Vec::new() +} + +/// Thread-local setgroups using raw syscall. +/// +/// IMPORTANT: We use the raw SYS_setgroups syscall instead of libc::setgroups() +/// because glibc's NPTL wrapper signals ALL threads to change credentials for +/// POSIX compliance. The raw kernel syscall is per-thread, which is what we need +/// for a multi-threaded FUSE server where each thread handles different users. 
+/// +/// See: https://man7.org/linux/man-pages/man7/nptl.7.html +/// See: https://github.com/rfjakob/gocryptfs (uses same approach) +unsafe fn setgroups_thread(groups: &[libc::gid_t]) -> libc::c_long { + libc::syscall( + libc::SYS_setgroups, + groups.len() as libc::size_t, + groups.as_ptr(), + ) +} + +/// Thread-local getgroups using raw syscall. +/// +/// Like setgroups_thread, this uses the raw syscall to get per-thread groups. +unsafe fn getgroups_thread(size: libc::c_int, list: *mut libc::gid_t) -> libc::c_long { + libc::syscall(libc::SYS_getgroups, size, list) +} + +/// Get current supplementary groups for this thread +fn get_current_groups() -> io::Result> { + // First call with size 0 to get the actual count + let count = unsafe { getgroups_thread(0, std::ptr::null_mut()) }; + if count < 0 { + return Err(io::Error::last_os_error()); + } + + if count == 0 { + return Ok(Vec::new()); + } + + let mut groups = vec![0 as libc::gid_t; count as usize]; + let actual = unsafe { getgroups_thread(count as libc::c_int, groups.as_mut_ptr()) }; + if actual < 0 { + return Err(io::Error::last_os_error()); + } + + groups.truncate(actual as usize); + Ok(groups) } impl ScopedCreds { - /// Switch filesystem credentials to the given uid/gid. + /// Switch filesystem credentials to the given uid/gid/groups. /// Returns None if both uid and gid are 0 (already root). - fn new(uid: libc::uid_t, gid: libc::gid_t) -> io::Result> { - debug!("set_creds: switching to uid={} gid={}", uid, gid); + fn new(uid: libc::uid_t, gid: libc::gid_t, pid: libc::pid_t) -> io::Result> { + debug!("set_creds: switching to uid={} gid={} pid={}", uid, gid, pid); if uid == 0 && gid == 0 { // Nothing to do since we are already uid/gid 0. 
debug!("set_creds: uid=0 gid=0, nothing to do"); return Ok(None); } + // Get current groups before we change anything + let original_groups = get_current_groups().unwrap_or_default(); + debug!("set_creds: original_groups={:?}", original_groups); + + // Parse caller's supplementary groups from /proc + let caller_groups = parse_proc_groups(pid); + + // Set supplementary groups first (before dropping privileges) + // Use raw syscall for per-thread behavior (not glibc's process-wide wrapper) + if !caller_groups.is_empty() { + let ret = unsafe { setgroups_thread(&caller_groups) }; + if ret < 0 { + let err = io::Error::last_os_error(); + debug!("set_creds: setgroups({:?}) failed: {}", caller_groups, err); + // Don't fail - continue without supplementary groups + // This can happen if we don't have CAP_SETGID + } else { + debug!("set_creds: setgroups({:?}) succeeded", caller_groups); + } + } + // setfsuid/setfsgid return the PREVIOUS value, not an error code. // To detect errors, we call twice: first to set, then to verify. 
@@ -933,7 +1049,10 @@ impl ScopedCreds { let verify_gid = unsafe { libc::setfsgid(gid) } as libc::gid_t; debug!("set_creds: setfsgid({}) returned original={} verify={}", gid, original_fsgid, verify_gid); if verify_gid != gid { - // Restore and return error + // Restore groups and return error + if !original_groups.is_empty() { + unsafe { setgroups_thread(&original_groups) }; + } unsafe { libc::setfsgid(original_fsgid) }; return Err(io::Error::new( io::ErrorKind::PermissionDenied, @@ -947,7 +1066,10 @@ impl ScopedCreds { let verify_uid = unsafe { libc::setfsuid(uid) } as libc::uid_t; debug!("set_creds: setfsuid({}) returned original={} verify={}", uid, original_fsuid, verify_uid); if verify_uid != uid { - // Restore both and return error + // Restore all and return error + if !original_groups.is_empty() { + unsafe { setgroups_thread(&original_groups) }; + } unsafe { libc::setfsgid(original_fsgid) }; unsafe { libc::setfsuid(original_fsuid) }; return Err(io::Error::new( @@ -960,20 +1082,29 @@ impl ScopedCreds { Ok(Some(ScopedCreds { original_fsuid, original_fsgid, + original_groups, })) } } impl Drop for ScopedCreds { fn drop(&mut self) { - // Restore original credentials (order doesn't matter for restore) + // Restore original credentials (order: uid first, then gid, then groups) + // Restore uid first so we have privileges to restore groups unsafe { libc::setfsuid(self.original_fsuid) }; unsafe { libc::setfsgid(self.original_fsgid) }; + // Restore supplementary groups using raw syscall for per-thread behavior + if !self.original_groups.is_empty() { + unsafe { setgroups_thread(&self.original_groups) }; + } else { + // If we had no groups originally, clear them + unsafe { setgroups_thread(&[]) }; + } } } -fn set_creds(uid: libc::uid_t, gid: libc::gid_t) -> io::Result> { - ScopedCreds::new(uid, gid) +fn set_creds(uid: libc::uid_t, gid: libc::gid_t, pid: libc::pid_t) -> io::Result> { + ScopedCreds::new(uid, gid, pid) } struct CapFsetid {} diff --git 
a/src/passthrough/sync_io.rs b/src/passthrough/sync_io.rs index 6b350d1c..c2acf317 100644 --- a/src/passthrough/sync_io.rs +++ b/src/passthrough/sync_io.rs @@ -371,7 +371,7 @@ impl FileSystem for PassthroughFs { return Err(einval()); } // Switch to caller's credentials for directory search permission check - let _creds = set_creds(ctx.uid, ctx.gid)?; + let _creds = set_creds(ctx.uid, ctx.gid, ctx.pid)?; self.do_lookup(parent, name) } @@ -400,7 +400,7 @@ impl FileSystem for PassthroughFs { Err(enosys()) } else { // Switch to caller's credentials for permission check - let _creds = set_creds(ctx.uid, ctx.gid)?; + let _creds = set_creds(ctx.uid, ctx.gid, ctx.pid)?; self.do_open(inode, flags | (libc::O_DIRECTORY as u32), 0) .map(|(a, b, _)| (a, b)) } @@ -435,7 +435,7 @@ impl FileSystem for PassthroughFs { let file = data.get_file()?; // Switch to caller's credentials so the directory is owned by them - let _creds = set_creds(ctx.uid, ctx.gid)?; + let _creds = set_creds(ctx.uid, ctx.gid, ctx.pid)?; // Safe because this doesn't modify any memory and we check the return value. 
let res = unsafe { libc::mkdirat(file.as_raw_fd(), name.as_ptr(), mode & !umask) }; if res < 0 { @@ -448,7 +448,7 @@ impl FileSystem for PassthroughFs { fn rmdir(&self, ctx: &Context, parent: Inode, name: &CStr) -> io::Result<()> { self.validate_path_component(name)?; // Switch to caller's credentials for permission check - let _creds = set_creds(ctx.uid, ctx.gid)?; + let _creds = set_creds(ctx.uid, ctx.gid, ctx.pid)?; self.do_unlink(parent, name, libc::AT_REMOVEDIR) } @@ -533,7 +533,7 @@ impl FileSystem for PassthroughFs { Err(enosys()) } else { // Switch to caller's credentials for permission check - let _creds = set_creds(ctx.uid, ctx.gid)?; + let _creds = set_creds(ctx.uid, ctx.gid, ctx.pid)?; self.do_open(inode, flags, fuse_flags) } } @@ -569,7 +569,7 @@ impl FileSystem for PassthroughFs { let flags = self.get_writeback_open_flags(args.flags as i32); // Switch to caller's credentials so the file is owned by them - let _creds = set_creds(ctx.uid, ctx.gid)?; + let _creds = set_creds(ctx.uid, ctx.gid, ctx.pid)?; let new_file = Self::create_file_excl(&dir_file, name, flags, args.mode & !(args.umask & 0o777))?; @@ -617,7 +617,7 @@ impl FileSystem for PassthroughFs { fn unlink(&self, ctx: &Context, parent: Inode, name: &CStr) -> io::Result<()> { self.validate_path_component(name)?; // Switch to caller's credentials for permission check - let _creds = set_creds(ctx.uid, ctx.gid)?; + let _creds = set_creds(ctx.uid, ctx.gid, ctx.pid)?; self.do_unlink(parent, name, 0) } @@ -700,7 +700,7 @@ impl FileSystem for PassthroughFs { debug!("write: inode={} handle={} size={} uid={} gid={}", inode, handle, size, ctx.uid, ctx.gid); // Switch to caller's credentials for the write operation. // This is needed for proper SUID/SGID bit handling when a non-owner writes. 
- let _creds = match set_creds(ctx.uid, ctx.gid) { + let _creds = match set_creds(ctx.uid, ctx.gid, ctx.pid) { Ok(c) => { debug!("write: set_creds succeeded"); c @@ -823,7 +823,7 @@ impl FileSystem for PassthroughFs { } else { // User-initiated chmod - switch to caller's credentials for permission check // (only file owner or root can chmod) - Some(set_creds(ctx.uid, ctx.gid)?) + Some(set_creds(ctx.uid, ctx.gid, ctx.pid)?) }; // Safe because this doesn't modify any memory and we check the return value. @@ -859,7 +859,7 @@ impl FileSystem for PassthroughFs { // Switch to caller's credentials for permission check // (only root can change owner, owner can change group) - let _creds = set_creds(ctx.uid, ctx.gid)?; + let _creds = set_creds(ctx.uid, ctx.gid, ctx.pid)?; // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { @@ -898,7 +898,7 @@ impl FileSystem for PassthroughFs { _ => { // No file handle - need to open the file, which requires permission check. // Switch to caller's credentials for this case. - let _creds = set_creds(ctx.uid, ctx.gid)?; + let _creds = set_creds(ctx.uid, ctx.gid, ctx.pid)?; // There is no `ftruncateat` so we need to get a new fd and truncate it. let f = self.open_inode(inode, libc::O_NONBLOCK | libc::O_RDWR)?; unsafe { libc::ftruncate(f.as_raw_fd(), attr.st_size) } @@ -937,7 +937,7 @@ impl FileSystem for PassthroughFs { // Switch to caller's credentials for permission check // (utimensat requires write permission or ownership) - let _creds = set_creds(ctx.uid, ctx.gid)?; + let _creds = set_creds(ctx.uid, ctx.gid, ctx.pid)?; // Safe because this doesn't modify any memory and we check the return value. 
let res = match data { @@ -974,7 +974,7 @@ impl FileSystem for PassthroughFs { let new_file = new_inode.get_file()?; // Switch to caller's credentials for permission check - let _creds = set_creds(ctx.uid, ctx.gid)?; + let _creds = set_creds(ctx.uid, ctx.gid, ctx.pid)?; // Safe because this doesn't modify any memory and we check the return value. // TODO: Switch to libc::renameat2 once https://github.com/rust-lang/libc/pull/1508 lands @@ -1011,7 +1011,7 @@ impl FileSystem for PassthroughFs { let file = data.get_file()?; // Switch to caller's credentials so the node is owned by them - let _creds = set_creds(ctx.uid, ctx.gid)?; + let _creds = set_creds(ctx.uid, ctx.gid, ctx.pid)?; // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { libc::mknodat( @@ -1081,7 +1081,7 @@ impl FileSystem for PassthroughFs { let file = data.get_file()?; // Switch to caller's credentials so the symlink is owned by them - let _creds = set_creds(ctx.uid, ctx.gid)?; + let _creds = set_creds(ctx.uid, ctx.gid, ctx.pid)?; // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { libc::symlinkat(linkname.as_ptr(), file.as_raw_fd(), name.as_ptr()) }; if res == 0 { From 221a76bf39614ed61655695bfebc8b919ecc3219 Mon Sep 17 00:00:00 2001 From: EJ Campbell Date: Mon, 8 Dec 2025 00:21:03 -0800 Subject: [PATCH 10/19] Use forwarded supplementary groups for remote filesystems When the Context struct has supplementary_groups set, ScopedCreds now uses those groups directly instead of reading from /proc/<pid>/status. This is essential for remote filesystems (like fuse-pipe over vsock) where the PID from the FUSE request exists in the guest VM but not on the host server, making /proc/<pid>/status inaccessible. 
Changes: - Add supplementary_groups field to Context struct (api/filesystem/mod.rs) - Change Context from Copy to Clone (Vec isn't Copy) - Modify ScopedCreds::new() to accept optional pre-read groups - Add set_creds_from_context() helper function - Update sync_io.rs to use set_creds_from_context() for all operations This works with fuse-pipe's wire protocol which reads groups from /proc/<pid>/status on the client side and forwards them to the server. --- src/api/filesystem/mod.rs | 9 +++++++- src/passthrough/mod.rs | 43 +++++++++++++++++++++++++++++++++----- src/passthrough/sync_io.rs | 30 +++++++++++++------------- 3 files changed, 61 insertions(+), 21 deletions(-) diff --git a/src/api/filesystem/mod.rs b/src/api/filesystem/mod.rs index aef5a62c..9a7cfb75 100644 --- a/src/api/filesystem/mod.rs +++ b/src/api/filesystem/mod.rs @@ -383,7 +383,7 @@ pub trait ZeroCopyWriter: io::Write { } /// Additional context associated with requests. -#[derive(Default, Clone, Copy, Debug)] +#[derive(Default, Clone, Debug)] pub struct Context { /// The user ID of the calling process. pub uid: libc::uid_t, @@ -393,6 +393,12 @@ pub struct Context { /// The thread group ID of the calling process. pub pid: libc::pid_t, + + /// Supplementary groups of the calling process. + /// + /// When set, these groups are used directly instead of reading from /proc//status. + /// This is essential for remote filesystems where the PID doesn't exist on the server. 
+ pub supplementary_groups: Option>, } impl Context { @@ -408,6 +414,7 @@ impl From<&fuse::InHeader> for Context { uid: source.uid, gid: source.gid, pid: source.pid as i32, + supplementary_groups: None, } } } diff --git a/src/passthrough/mod.rs b/src/passthrough/mod.rs index 0efc0c4a..f1492c88 100644 --- a/src/passthrough/mod.rs +++ b/src/passthrough/mod.rs @@ -28,6 +28,8 @@ use std::time::Duration; use vm_memory::{bitmap::BitmapSlice, ByteValued}; +use crate::api::filesystem::Context; + pub use self::config::{CachePolicy, Config}; use self::file_handle::{FileHandle, OpenableFileHandle}; use self::inode_store::{InodeId, InodeStore}; @@ -1011,7 +1013,16 @@ fn get_current_groups() -> io::Result> { impl ScopedCreds { /// Switch filesystem credentials to the given uid/gid/groups. /// Returns None if both uid and gid are 0 (already root). - fn new(uid: libc::uid_t, gid: libc::gid_t, pid: libc::pid_t) -> io::Result> { + /// + /// The `supplementary_groups` parameter allows passing pre-read supplementary groups + /// instead of reading from /proc//status. This is essential for remote filesystems + /// (like fuse-pipe over vsock) where the PID doesn't exist on the server. + fn new( + uid: libc::uid_t, + gid: libc::gid_t, + pid: libc::pid_t, + supplementary_groups: Option<&[libc::gid_t]>, + ) -> io::Result> { debug!("set_creds: switching to uid={} gid={} pid={}", uid, gid, pid); if uid == 0 && gid == 0 { // Nothing to do since we are already uid/gid 0. 
@@ -1023,8 +1034,14 @@ impl ScopedCreds { let original_groups = get_current_groups().unwrap_or_default(); debug!("set_creds: original_groups={:?}", original_groups); - // Parse caller's supplementary groups from /proc - let caller_groups = parse_proc_groups(pid); + // Use provided supplementary groups if available, otherwise parse from /proc + let caller_groups = match supplementary_groups { + Some(groups) => { + debug!("set_creds: using provided supplementary_groups={:?}", groups); + groups.to_vec() + } + None => parse_proc_groups(pid), + }; // Set supplementary groups first (before dropping privileges) // Use raw syscall for per-thread behavior (not glibc's process-wide wrapper) @@ -1103,8 +1120,24 @@ impl Drop for ScopedCreds { } } -fn set_creds(uid: libc::uid_t, gid: libc::gid_t, pid: libc::pid_t) -> io::Result> { - ScopedCreds::new(uid, gid, pid) +fn set_creds( + uid: libc::uid_t, + gid: libc::gid_t, + pid: libc::pid_t, + supplementary_groups: Option<&[libc::gid_t]>, +) -> io::Result> { + ScopedCreds::new(uid, gid, pid, supplementary_groups) +} + +/// Convenience function for set_creds that takes a Context reference. +/// Automatically extracts uid, gid, pid, and supplementary_groups. 
+fn set_creds_from_context(ctx: &Context) -> io::Result> { + set_creds( + ctx.uid, + ctx.gid, + ctx.pid, + ctx.supplementary_groups.as_deref(), + ) } struct CapFsetid {} diff --git a/src/passthrough/sync_io.rs b/src/passthrough/sync_io.rs index c2acf317..821e1075 100644 --- a/src/passthrough/sync_io.rs +++ b/src/passthrough/sync_io.rs @@ -371,7 +371,7 @@ impl FileSystem for PassthroughFs { return Err(einval()); } // Switch to caller's credentials for directory search permission check - let _creds = set_creds(ctx.uid, ctx.gid, ctx.pid)?; + let _creds = set_creds_from_context(ctx)?; self.do_lookup(parent, name) } @@ -400,7 +400,7 @@ impl FileSystem for PassthroughFs { Err(enosys()) } else { // Switch to caller's credentials for permission check - let _creds = set_creds(ctx.uid, ctx.gid, ctx.pid)?; + let _creds = set_creds_from_context(ctx)?; self.do_open(inode, flags | (libc::O_DIRECTORY as u32), 0) .map(|(a, b, _)| (a, b)) } @@ -435,7 +435,7 @@ impl FileSystem for PassthroughFs { let file = data.get_file()?; // Switch to caller's credentials so the directory is owned by them - let _creds = set_creds(ctx.uid, ctx.gid, ctx.pid)?; + let _creds = set_creds_from_context(ctx)?; // Safe because this doesn't modify any memory and we check the return value. 
let res = unsafe { libc::mkdirat(file.as_raw_fd(), name.as_ptr(), mode & !umask) }; if res < 0 { @@ -448,7 +448,7 @@ impl FileSystem for PassthroughFs { fn rmdir(&self, ctx: &Context, parent: Inode, name: &CStr) -> io::Result<()> { self.validate_path_component(name)?; // Switch to caller's credentials for permission check - let _creds = set_creds(ctx.uid, ctx.gid, ctx.pid)?; + let _creds = set_creds_from_context(ctx)?; self.do_unlink(parent, name, libc::AT_REMOVEDIR) } @@ -533,7 +533,7 @@ impl FileSystem for PassthroughFs { Err(enosys()) } else { // Switch to caller's credentials for permission check - let _creds = set_creds(ctx.uid, ctx.gid, ctx.pid)?; + let _creds = set_creds_from_context(ctx)?; self.do_open(inode, flags, fuse_flags) } } @@ -569,7 +569,7 @@ impl FileSystem for PassthroughFs { let flags = self.get_writeback_open_flags(args.flags as i32); // Switch to caller's credentials so the file is owned by them - let _creds = set_creds(ctx.uid, ctx.gid, ctx.pid)?; + let _creds = set_creds_from_context(ctx)?; let new_file = Self::create_file_excl(&dir_file, name, flags, args.mode & !(args.umask & 0o777))?; @@ -617,7 +617,7 @@ impl FileSystem for PassthroughFs { fn unlink(&self, ctx: &Context, parent: Inode, name: &CStr) -> io::Result<()> { self.validate_path_component(name)?; // Switch to caller's credentials for permission check - let _creds = set_creds(ctx.uid, ctx.gid, ctx.pid)?; + let _creds = set_creds_from_context(ctx)?; self.do_unlink(parent, name, 0) } @@ -700,7 +700,7 @@ impl FileSystem for PassthroughFs { debug!("write: inode={} handle={} size={} uid={} gid={}", inode, handle, size, ctx.uid, ctx.gid); // Switch to caller's credentials for the write operation. // This is needed for proper SUID/SGID bit handling when a non-owner writes. 
- let _creds = match set_creds(ctx.uid, ctx.gid, ctx.pid) { + let _creds = match set_creds_from_context(ctx) { Ok(c) => { debug!("write: set_creds succeeded"); c @@ -823,7 +823,7 @@ impl FileSystem for PassthroughFs { } else { // User-initiated chmod - switch to caller's credentials for permission check // (only file owner or root can chmod) - Some(set_creds(ctx.uid, ctx.gid, ctx.pid)?) + Some(set_creds_from_context(ctx)?) }; // Safe because this doesn't modify any memory and we check the return value. @@ -859,7 +859,7 @@ impl FileSystem for PassthroughFs { // Switch to caller's credentials for permission check // (only root can change owner, owner can change group) - let _creds = set_creds(ctx.uid, ctx.gid, ctx.pid)?; + let _creds = set_creds_from_context(ctx)?; // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { @@ -898,7 +898,7 @@ impl FileSystem for PassthroughFs { _ => { // No file handle - need to open the file, which requires permission check. // Switch to caller's credentials for this case. - let _creds = set_creds(ctx.uid, ctx.gid, ctx.pid)?; + let _creds = set_creds_from_context(ctx)?; // There is no `ftruncateat` so we need to get a new fd and truncate it. let f = self.open_inode(inode, libc::O_NONBLOCK | libc::O_RDWR)?; unsafe { libc::ftruncate(f.as_raw_fd(), attr.st_size) } @@ -937,7 +937,7 @@ impl FileSystem for PassthroughFs { // Switch to caller's credentials for permission check // (utimensat requires write permission or ownership) - let _creds = set_creds(ctx.uid, ctx.gid, ctx.pid)?; + let _creds = set_creds_from_context(ctx)?; // Safe because this doesn't modify any memory and we check the return value. 
let res = match data { @@ -974,7 +974,7 @@ impl FileSystem for PassthroughFs { let new_file = new_inode.get_file()?; // Switch to caller's credentials for permission check - let _creds = set_creds(ctx.uid, ctx.gid, ctx.pid)?; + let _creds = set_creds_from_context(ctx)?; // Safe because this doesn't modify any memory and we check the return value. // TODO: Switch to libc::renameat2 once https://github.com/rust-lang/libc/pull/1508 lands @@ -1011,7 +1011,7 @@ impl FileSystem for PassthroughFs { let file = data.get_file()?; // Switch to caller's credentials so the node is owned by them - let _creds = set_creds(ctx.uid, ctx.gid, ctx.pid)?; + let _creds = set_creds_from_context(ctx)?; // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { libc::mknodat( @@ -1081,7 +1081,7 @@ impl FileSystem for PassthroughFs { let file = data.get_file()?; // Switch to caller's credentials so the symlink is owned by them - let _creds = set_creds(ctx.uid, ctx.gid, ctx.pid)?; + let _creds = set_creds_from_context(ctx)?; // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { libc::symlinkat(linkname.as_ptr(), file.as_raw_fd(), name.as_ptr()) }; if res == 0 { From 71597871d11c80c657e5e50be7e549abbe0a606d Mon Sep 17 00:00:00 2001 From: EJ Campbell Date: Mon, 8 Dec 2025 00:35:40 -0800 Subject: [PATCH 11/19] Remove /proc fallback for supplementary groups Simplify credential handling by removing the parse_proc_groups() function that read from /proc/<pid>/status. This was a fallback for local FUSE mounts, but we control both client and server in fuse-pipe, so groups are always forwarded through the wire protocol. Changes: - Remove parse_proc_groups() function (37 lines) - Remove pid parameter from ScopedCreds::new() - Simplify set_creds() to just take (uid, gid) for upstream API compatibility - set_creds_from_context() calls ScopedCreds::new directly All 8789 pjdfstest POSIX compliance tests pass with this change. 
--- src/passthrough/mod.rs | 88 ++++++++---------------------------------- 1 file changed, 16 insertions(+), 72 deletions(-) diff --git a/src/passthrough/mod.rs b/src/passthrough/mod.rs index f1492c88..70b6bb7b 100644 --- a/src/passthrough/mod.rs +++ b/src/passthrough/mod.rs @@ -913,13 +913,8 @@ impl BackendFileSystem for PassthroughFs /// # Supplementary Groups /// /// FUSE protocol only passes uid and primary gid in requests, not supplementary groups. -/// With `default_permissions` mount option, the kernel checks permissions before the -/// request reaches the FUSE server, but these checks don't consider the caller's -/// supplementary groups because the kernel doesn't know them for FUSE operations. -/// -/// To fix this, we read the caller's supplementary groups from /proc//status -/// and adopt them using setgroups(). This is the "gocryptfs workaround" approach. -/// See: https://github.com/rfjakob/gocryptfs/commit/e74f48b +/// The caller must forward supplementary groups via the Context struct. We adopt them +/// using setgroups() raw syscall for per-thread credential switching. #[derive(Debug)] pub(crate) struct ScopedCreds { original_fsuid: libc::uid_t, @@ -927,43 +922,6 @@ pub(crate) struct ScopedCreds { original_groups: Vec, } -/// Parse supplementary groups from /proc//status -/// -/// Returns empty Vec if the file cannot be read (e.g., process exited). 
-fn parse_proc_groups(pid: libc::pid_t) -> Vec { - // Try /proc//task//status first (more accurate for threads) - // Fall back to /proc//status - let status_path = format!("/proc/{}/task/{}/status", pid, pid); - let content = match std::fs::read_to_string(&status_path) { - Ok(c) => c, - Err(_) => { - let fallback_path = format!("/proc/{}/status", pid); - match std::fs::read_to_string(&fallback_path) { - Ok(c) => c, - Err(e) => { - debug!("parse_proc_groups: failed to read /proc/{}/status: {}", pid, e); - return Vec::new(); - } - } - } - }; - - // Look for line: "Groups:\t1000 1001 1002" - for line in content.lines() { - if let Some(groups_str) = line.strip_prefix("Groups:") { - let groups: Vec = groups_str - .split_whitespace() - .filter_map(|s| s.parse().ok()) - .collect(); - debug!("parse_proc_groups: pid={} groups={:?}", pid, groups); - return groups; - } - } - - debug!("parse_proc_groups: pid={} no Groups line found", pid); - Vec::new() -} - /// Thread-local setgroups using raw syscall. /// /// IMPORTANT: We use the raw SYS_setgroups syscall instead of libc::setgroups() @@ -1014,16 +972,14 @@ impl ScopedCreds { /// Switch filesystem credentials to the given uid/gid/groups. /// Returns None if both uid and gid are 0 (already root). /// - /// The `supplementary_groups` parameter allows passing pre-read supplementary groups - /// instead of reading from /proc//status. This is essential for remote filesystems - /// (like fuse-pipe over vsock) where the PID doesn't exist on the server. + /// The `supplementary_groups` parameter allows passing supplementary groups + /// to adopt for the duration of the filesystem operation. fn new( uid: libc::uid_t, gid: libc::gid_t, - pid: libc::pid_t, supplementary_groups: Option<&[libc::gid_t]>, ) -> io::Result> { - debug!("set_creds: switching to uid={} gid={} pid={}", uid, gid, pid); + debug!("set_creds: switching to uid={} gid={}", uid, gid); if uid == 0 && gid == 0 { // Nothing to do since we are already uid/gid 0. 
debug!("set_creds: uid=0 gid=0, nothing to do"); @@ -1034,14 +990,9 @@ impl ScopedCreds { let original_groups = get_current_groups().unwrap_or_default(); debug!("set_creds: original_groups={:?}", original_groups); - // Use provided supplementary groups if available, otherwise parse from /proc - let caller_groups = match supplementary_groups { - Some(groups) => { - debug!("set_creds: using provided supplementary_groups={:?}", groups); - groups.to_vec() - } - None => parse_proc_groups(pid), - }; + // Use provided supplementary groups (required - caller must forward them) + let caller_groups = supplementary_groups.unwrap_or(&[]).to_vec(); + debug!("set_creds: supplementary_groups={:?}", caller_groups); // Set supplementary groups first (before dropping privileges) // Use raw syscall for per-thread behavior (not glibc's process-wide wrapper) @@ -1120,24 +1071,17 @@ impl Drop for ScopedCreds { } } -fn set_creds( - uid: libc::uid_t, - gid: libc::gid_t, - pid: libc::pid_t, - supplementary_groups: Option<&[libc::gid_t]>, -) -> io::Result> { - ScopedCreds::new(uid, gid, pid, supplementary_groups) +/// Switch filesystem credentials to the given uid/gid. +/// This is the upstream-compatible API that doesn't handle supplementary groups. +#[allow(dead_code)] +fn set_creds(uid: libc::uid_t, gid: libc::gid_t) -> io::Result> { + ScopedCreds::new(uid, gid, None) } -/// Convenience function for set_creds that takes a Context reference. -/// Automatically extracts uid, gid, pid, and supplementary_groups. +/// Switch filesystem credentials from Context. +/// Uses supplementary_groups from Context if available. 
fn set_creds_from_context(ctx: &Context) -> io::Result> { - set_creds( - ctx.uid, - ctx.gid, - ctx.pid, - ctx.supplementary_groups.as_deref(), - ) + ScopedCreds::new(ctx.uid, ctx.gid, ctx.supplementary_groups.as_deref()) } struct CapFsetid {} From 1d089fecc53b66e6766c554df522042d4b572642 Mon Sep 17 00:00:00 2001 From: EJ Campbell Date: Mon, 8 Dec 2025 19:50:31 -0800 Subject: [PATCH 12/19] Add signed lseek hook for backward-compatible negative offsets --- src/api/filesystem/async_io.rs | 14 ++++++++++ src/api/filesystem/sync_io.rs | 28 +++++++++++++++++++ src/api/server/sync_io.rs | 12 +++++--- src/passthrough/sync_io.rs | 51 ++++++++++++++++++++++++++++------ 4 files changed, 92 insertions(+), 13 deletions(-) diff --git a/src/api/filesystem/async_io.rs b/src/api/filesystem/async_io.rs index fdc846a4..ca36b060 100644 --- a/src/api/filesystem/async_io.rs +++ b/src/api/filesystem/async_io.rs @@ -797,6 +797,20 @@ pub trait AsyncFileSystem: FileSystem { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } + /// Reposition read/write file offset with a signed offset. + /// + /// Default implementation forwards to [`lseek`] for backward compatibility. + fn lseek_signed( + &self, + ctx: Context, + inode: Self::Inode, + handle: Self::Handle, + offset: i64, + whence: u32, + ) -> io::Result { + self.lseek(ctx, inode, handle, offset as u64, whence) + } + /// TODO: support this fn getlk(&self) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) diff --git a/src/api/filesystem/sync_io.rs b/src/api/filesystem/sync_io.rs index 229c1f43..b19fe920 100644 --- a/src/api/filesystem/sync_io.rs +++ b/src/api/filesystem/sync_io.rs @@ -813,6 +813,22 @@ pub trait FileSystem { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } + /// Reposition read/write file offset with a signed offset. + /// + /// Default implementation forwards to [`lseek`] for backward compatibility. + /// Filesystems that need negative offsets (e.g. 
SEEK_END) can override this + /// to receive the signed value directly. + fn lseek_signed( + &self, + ctx: &Context, + inode: Self::Inode, + handle: Self::Handle, + offset: i64, + whence: u32, + ) -> io::Result { + self.lseek(ctx, inode, handle, offset as u64, whence) + } + /// Query file lock status fn getlk( &self, @@ -1263,6 +1279,18 @@ impl FileSystem for Arc { self.deref().lseek(ctx, inode, handle, offset, whence) } + fn lseek_signed( + &self, + ctx: &Context, + inode: Self::Inode, + handle: Self::Handle, + offset: i64, + whence: u32, + ) -> io::Result { + self.deref() + .lseek_signed(ctx, inode, handle, offset, whence) + } + /// Query file lock status fn getlk( &self, diff --git a/src/api/server/sync_io.rs b/src/api/server/sync_io.rs index ba32efbc..e6aa0d36 100644 --- a/src/api/server/sync_io.rs +++ b/src/api/server/sync_io.rs @@ -1193,11 +1193,15 @@ impl Server { let LseekIn { fh, offset, whence, .. } = ctx.r.read_obj().map_err(Error::DecodeMessage)?; + let offset_signed = offset as i64; - match self - .fs - .lseek(ctx.context(), ctx.nodeid(), fh.into(), offset, whence) - { + match self.fs.lseek_signed( + ctx.context(), + ctx.nodeid(), + fh.into(), + offset_signed, + whence, + ) { Ok(offset) => { let out = LseekOut { offset }; diff --git a/src/passthrough/sync_io.rs b/src/passthrough/sync_io.rs index 821e1075..aad353be 100644 --- a/src/passthrough/sync_io.rs +++ b/src/passthrough/sync_io.rs @@ -697,7 +697,10 @@ impl FileSystem for PassthroughFs { flags: u32, fuse_flags: u32, ) -> io::Result { - debug!("write: inode={} handle={} size={} uid={} gid={}", inode, handle, size, ctx.uid, ctx.gid); + debug!( + "write: inode={} handle={} size={} uid={} gid={}", + inode, handle, size, ctx.uid, ctx.gid + ); // Switch to caller's credentials for the write operation. // This is needed for proper SUID/SGID bit handling when a non-owner writes. 
let _creds = match set_creds_from_context(ctx) { @@ -792,12 +795,9 @@ impl FileSystem for PassthroughFs { let res = unsafe { match &data { Data::Handle(h) => libc::fstat64(h.borrow_fd().as_raw_fd(), &mut st), - Data::ProcPath(p) => libc::fstatat64( - self.proc_self_fd.as_raw_fd(), - p.as_ptr(), - &mut st, - 0, - ), + Data::ProcPath(p) => { + libc::fstatat64(self.proc_self_fd.as_raw_fd(), p.as_ptr(), &mut st, 0) + } } }; if res < 0 { @@ -814,8 +814,9 @@ impl FileSystem for PassthroughFs { // 3. The SUID/SGID clearing must succeed for POSIX compliance let old_special_bits = current_mode & (libc::S_ISUID | libc::S_ISGID); let new_special_bits = attr.st_mode & (libc::S_ISUID | libc::S_ISGID); - let is_suid_sgid_clearing = - old_special_bits != 0 && new_special_bits == 0 && (current_mode & 0o777) == (attr.st_mode & 0o777); + let is_suid_sgid_clearing = old_special_bits != 0 + && new_special_bits == 0 + && (current_mode & 0o777) == (attr.st_mode & 0o777); let _creds = if is_suid_sgid_clearing { // Kernel clearing SUID/SGID - do as root @@ -1392,6 +1393,17 @@ impl FileSystem for PassthroughFs { handle: Handle, offset: u64, whence: u32, + ) -> io::Result { + self.lseek_signed(_ctx, inode, handle, offset as i64, whence) + } + + fn lseek_signed( + &self, + _ctx: &Context, + inode: Inode, + handle: Handle, + offset: i64, + whence: u32, ) -> io::Result { // Let the Arc in scope, otherwise fd may get invalid. 
let data = self.handle_map.get(handle, inode)?; @@ -1702,6 +1714,27 @@ mod tests { assert_eq!(statfs.f_namemax, 255); } + #[test] + fn test_lseek_signed_negative_offset() { + let (fs, source) = prepare_fs_tmpdir(); + let ctx = prepare_context(); + + let path = source.as_path().join("seek.txt"); + std::fs::write(&path, b"abcdef").unwrap(); + + let name = CString::new("seek.txt").unwrap(); + let entry = fs.lookup(&ctx, ROOT_ID, &name).unwrap(); + let (handle, _, _) = fs + .open(&ctx, entry.inode, libc::O_RDONLY as u32, 0) + .unwrap(); + let handle = handle.expect("expected handle"); + + let offset = fs + .lseek_signed(&ctx, entry.inode, handle, -2, libc::SEEK_END as u32) + .unwrap(); + assert_eq!(offset, 4); + } + #[test] fn test_fsync_dir() { let (fs, _source) = prepare_fs_tmpdir(); From da797811064ba78a827bc07acdf2d00778e9891f Mon Sep 17 00:00:00 2001 From: EJ Campbell Date: Tue, 9 Dec 2025 04:43:47 -0800 Subject: [PATCH 13/19] Format debug logging statements --- src/passthrough/mod.rs | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/src/passthrough/mod.rs b/src/passthrough/mod.rs index 70b6bb7b..9cdefc2e 100644 --- a/src/passthrough/mod.rs +++ b/src/passthrough/mod.rs @@ -1015,7 +1015,10 @@ impl ScopedCreds { let original_fsgid = unsafe { libc::setfsgid(gid) } as libc::gid_t; // Verify the change took effect let verify_gid = unsafe { libc::setfsgid(gid) } as libc::gid_t; - debug!("set_creds: setfsgid({}) returned original={} verify={}", gid, original_fsgid, verify_gid); + debug!( + "set_creds: setfsgid({}) returned original={} verify={}", + gid, original_fsgid, verify_gid + ); if verify_gid != gid { // Restore groups and return error if !original_groups.is_empty() { @@ -1032,7 +1035,10 @@ impl ScopedCreds { let original_fsuid = unsafe { libc::setfsuid(uid) } as libc::uid_t; // Verify the change took effect let verify_uid = unsafe { libc::setfsuid(uid) } as libc::uid_t; - debug!("set_creds: setfsuid({}) returned original={} 
verify={}", uid, original_fsuid, verify_uid); + debug!( + "set_creds: setfsuid({}) returned original={} verify={}", + uid, original_fsuid, verify_uid + ); if verify_uid != uid { // Restore all and return error if !original_groups.is_empty() { @@ -1046,7 +1052,10 @@ impl ScopedCreds { )); } - debug!("set_creds: success, original_fsuid={} original_fsgid={}", original_fsuid, original_fsgid); + debug!( + "set_creds: success, original_fsuid={} original_fsgid={}", + original_fsuid, original_fsgid + ); Ok(Some(ScopedCreds { original_fsuid, original_fsgid, @@ -1097,8 +1106,8 @@ impl Drop for CapFsetid { fn drop_cap_fsetid() -> io::Result> { // Use unwrap_or(false) instead of propagating error - if we can't check // capabilities, assume we don't have them and continue without error - let has_cap = caps::has_cap(None, caps::CapSet::Effective, caps::Capability::CAP_FSETID) - .unwrap_or(false); + let has_cap = + caps::has_cap(None, caps::CapSet::Effective, caps::Capability::CAP_FSETID).unwrap_or(false); if !has_cap { return Ok(None); } From 8576023a1b30ed046a26dfa548bd784100cec533 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Fri, 26 Dec 2025 02:34:17 +0000 Subject: [PATCH 14/19] Add copy_file_range support to FileSystem trait - Add copy_file_range method to FileSystem trait with default ENOSYS - Implement copy_file_range in PassthroughFs using libc syscall - Add test for copy_file_range with partial copy and offset support This enables efficient server-side copy operations on filesystems that support it (e.g., btrfs reflinks). When the underlying filesystem supports copy_file_range, copies can be instant O(1) operations instead of O(n) read+write. 
--- src/api/filesystem/sync_io.rs | 23 +++++++ src/passthrough/sync_io.rs | 115 ++++++++++++++++++++++++++++++++++ 2 files changed, 138 insertions(+) diff --git a/src/api/filesystem/sync_io.rs b/src/api/filesystem/sync_io.rs index b19fe920..50c5883f 100644 --- a/src/api/filesystem/sync_io.rs +++ b/src/api/filesystem/sync_io.rs @@ -868,6 +868,29 @@ pub trait FileSystem { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } + /// Copy a range of data from one file to another. + /// + /// Performs an optimized copy between two file descriptors. On filesystems + /// that support it (like btrfs), this creates a reflink (copy-on-write clone) + /// which is nearly instantaneous regardless of file size. + /// + /// Returns the number of bytes copied. + #[allow(clippy::too_many_arguments)] + fn copy_file_range( + &self, + ctx: &Context, + inode_in: Self::Inode, + handle_in: Self::Handle, + offset_in: u64, + inode_out: Self::Inode, + handle_out: Self::Handle, + offset_out: u64, + len: u64, + flags: u64, + ) -> io::Result { + Err(io::Error::from_raw_os_error(libc::ENOSYS)) + } + /// send ioctl to the file #[allow(clippy::too_many_arguments)] fn ioctl( diff --git a/src/passthrough/sync_io.rs b/src/passthrough/sync_io.rs index aad353be..9ca8109c 100644 --- a/src/passthrough/sync_io.rs +++ b/src/passthrough/sync_io.rs @@ -1425,6 +1425,47 @@ impl FileSystem for PassthroughFs { Ok(res as u64) } } + + fn copy_file_range( + &self, + _ctx: &Context, + inode_in: Inode, + handle_in: Handle, + offset_in: u64, + inode_out: Inode, + handle_out: Handle, + offset_out: u64, + len: u64, + flags: u64, + ) -> io::Result { + // Get file descriptors from handles + let data_in = self.handle_map.get(handle_in, inode_in)?; + let data_out = self.handle_map.get(handle_out, inode_out)?; + + let (_guard_in, file_in) = data_in.get_file_mut(); + let (_guard_out, file_out) = data_out.get_file_mut(); + + let mut off_in = offset_in as libc::off64_t; + let mut off_out = offset_out as libc::off64_t; + + // Safe 
because we check the return value and the fds are valid + let result = unsafe { + libc::copy_file_range( + file_in.as_raw_fd(), + &mut off_in, + file_out.as_raw_fd(), + &mut off_out, + len as libc::size_t, + flags as libc::c_uint, + ) + }; + + if result < 0 { + Err(io::Error::last_os_error()) + } else { + Ok(result as usize) + } + } } #[cfg(test)] @@ -1743,4 +1784,78 @@ mod tests { assert!(fs.fsyncdir(&ctx, ROOT_ID, false, 0).is_ok()); } + + #[test] + fn test_copy_file_range() { + let (fs, source) = prepare_fs_tmpdir(); + let ctx = prepare_context(); + + // Create source file with data (using std::fs for simplicity) + let test_data = b"Hello, copy_file_range!"; + let src_path = source.as_path().join("source.txt"); + let dst_path = source.as_path().join("dest.txt"); + std::fs::write(&src_path, test_data).unwrap(); + std::fs::write(&dst_path, b"").unwrap(); // Create empty destination + + // Look up and open both files through the passthrough fs + let src_name = CString::new("source.txt").unwrap(); + let src_entry = fs.lookup(&ctx, ROOT_ID, &src_name).unwrap(); + let (src_handle, _, _) = fs + .open(&ctx, src_entry.inode, libc::O_RDWR as u32, 0) + .unwrap(); + let src_handle = src_handle.expect("expected src handle"); + + let dst_name = CString::new("dest.txt").unwrap(); + let dst_entry = fs.lookup(&ctx, ROOT_ID, &dst_name).unwrap(); + let (dst_handle, _, _) = fs + .open(&ctx, dst_entry.inode, libc::O_RDWR as u32, 0) + .unwrap(); + let dst_handle = dst_handle.expect("expected dst handle"); + + // Copy data from source to destination using copy_file_range + let copied = fs + .copy_file_range( + &ctx, + src_entry.inode, + src_handle, + 0, // offset_in + dst_entry.inode, + dst_handle, + 0, // offset_out + test_data.len() as u64, + 0, // flags + ) + .unwrap(); + assert_eq!(copied, test_data.len()); + + // Sync and verify by reading directly from disk + fs.fsync(&ctx, dst_entry.inode, false, dst_handle).unwrap(); + let result = std::fs::read(&dst_path).unwrap(); + 
assert_eq!(&result, test_data); + + // Test partial copy with offset + let offset = 7; // "Hello, " is 7 bytes + let partial_len = test_data.len() - offset; + let copied = fs + .copy_file_range( + &ctx, + src_entry.inode, + src_handle, + offset as u64, + dst_entry.inode, + dst_handle, + test_data.len() as u64, // append after existing data + partial_len as u64, + 0, + ) + .unwrap(); + assert_eq!(copied, partial_len); + + // Verify the appended data + fs.fsync(&ctx, dst_entry.inode, false, dst_handle).unwrap(); + let result = std::fs::read(&dst_path).unwrap(); + assert_eq!(result.len(), test_data.len() + partial_len); + assert_eq!(&result[..test_data.len()], test_data); + assert_eq!(&result[test_data.len()..], &test_data[offset..]); + } } From 956c1a6f40a3eccafbb52fd8e0257e5981f3aead Mon Sep 17 00:00:00 2001 From: ejc3 Date: Fri, 26 Dec 2025 07:06:14 +0000 Subject: [PATCH 15/19] Add remap_file_range support for FICLONE/FICLONERANGE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add remap_file_range to the FileSystem trait to enable reflink operations through FUSE. This maps to the kernel's .remap_file_range VFS callback, which handles FICLONE and FICLONERANGE ioctls. Changes: - Add remap_file_range method to FileSystem trait (default: ENOSYS) - Implement remap_file_range in PassthroughFs using FICLONE/FICLONERANGE - Add Arc delegations for copy_file_range and remap_file_range The Arc blanket implementation was missing delegations for these methods, causing operations through Arc to incorrectly return ENOSYS instead of calling the actual implementation. On CoW filesystems (btrfs, xfs with reflink), this enables instant file cloning where the copy shares physical storage until modified. 
Tested: cargo test (unit tests pass) E2E: https://github.com/ejc3/firepod/pull/21 - cp --reflink=always through FUSE → btrfs - filefrag confirms shared extents (true reflinks) --- src/api/filesystem/sync_io.rs | 60 ++++++++++++ src/passthrough/sync_io.rs | 169 ++++++++++++++++++++++++++++++++++ 2 files changed, 229 insertions(+) diff --git a/src/api/filesystem/sync_io.rs b/src/api/filesystem/sync_io.rs index 50c5883f..3f711a2b 100644 --- a/src/api/filesystem/sync_io.rs +++ b/src/api/filesystem/sync_io.rs @@ -891,6 +891,30 @@ pub trait FileSystem { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } + /// Remap file ranges (FICLONE/FICLONERANGE) for copy-on-write filesystems. + /// + /// This is the server-side implementation of the FUSE_REMAP_FILE_RANGE opcode, + /// which enables FICLONE and FICLONERANGE ioctls through FUSE. On btrfs and + /// other CoW filesystems, this creates reflinks - instant copies that share + /// the same physical storage until modified. + /// + /// Returns the number of bytes remapped. 
+ #[allow(clippy::too_many_arguments)] + fn remap_file_range( + &self, + ctx: &Context, + inode_in: Self::Inode, + handle_in: Self::Handle, + offset_in: u64, + inode_out: Self::Inode, + handle_out: Self::Handle, + offset_out: u64, + len: u64, + flags: u32, + ) -> io::Result { + Err(io::Error::from_raw_os_error(libc::ENOSYS)) + } + /// send ioctl to the file #[allow(clippy::too_many_arguments)] fn ioctl( @@ -1403,4 +1427,40 @@ impl FileSystem for Arc { fn id_remap(&self, ctx: &mut Context) -> io::Result<()> { self.deref().id_remap(ctx) } + + #[allow(clippy::too_many_arguments)] + fn copy_file_range( + &self, + ctx: &Context, + inode_in: Self::Inode, + handle_in: Self::Handle, + offset_in: u64, + inode_out: Self::Inode, + handle_out: Self::Handle, + offset_out: u64, + len: u64, + flags: u64, + ) -> io::Result { + self.deref().copy_file_range( + ctx, inode_in, handle_in, offset_in, inode_out, handle_out, offset_out, len, flags, + ) + } + + #[allow(clippy::too_many_arguments)] + fn remap_file_range( + &self, + ctx: &Context, + inode_in: Self::Inode, + handle_in: Self::Handle, + offset_in: u64, + inode_out: Self::Inode, + handle_out: Self::Handle, + offset_out: u64, + len: u64, + flags: u32, + ) -> io::Result { + self.deref().remap_file_range( + ctx, inode_in, handle_in, offset_in, inode_out, handle_out, offset_out, len, flags, + ) + } } diff --git a/src/passthrough/sync_io.rs b/src/passthrough/sync_io.rs index 9ca8109c..015bc161 100644 --- a/src/passthrough/sync_io.rs +++ b/src/passthrough/sync_io.rs @@ -1466,6 +1466,85 @@ impl FileSystem for PassthroughFs { Ok(result as usize) } } + + fn remap_file_range( + &self, + _ctx: &Context, + inode_in: Inode, + handle_in: Handle, + offset_in: u64, + inode_out: Inode, + handle_out: Handle, + offset_out: u64, + len: u64, + flags: u32, + ) -> io::Result { + debug!( + "remap_file_range: ino_in={} fh_in={} ino_out={} fh_out={} len={} flags={}", + inode_in, handle_in, inode_out, handle_out, len, flags + ); + + // Get file 
descriptors from handles + let data_in = match self.handle_map.get(handle_in, inode_in) { + Ok(d) => d, + Err(e) => { + debug!("remap_file_range: handle_map.get(in) failed: {:?}", e); + return Err(e); + } + }; + let data_out = match self.handle_map.get(handle_out, inode_out) { + Ok(d) => d, + Err(e) => { + debug!("remap_file_range: handle_map.get(out) failed: {:?}", e); + return Err(e); + } + }; + + let (_guard_in, file_in) = data_in.get_file_mut(); + let (_guard_out, file_out) = data_out.get_file_mut(); + + let fd_in = file_in.as_raw_fd(); + let fd_out = file_out.as_raw_fd(); + + debug!("remap_file_range: fd_in={} fd_out={}", fd_in, fd_out); + + // struct file_clone_range from + #[repr(C)] + struct FileCloneRange { + src_fd: i64, + src_offset: u64, + src_length: u64, + dest_offset: u64, + } + + // FICLONE = _IOW('9', 1, int) = 0x40049409 + // FICLONERANGE = _IOW('9', 13, struct file_clone_range) = 0x4020940d + let result = if len == 0 && offset_in == 0 && offset_out == 0 && flags == 0 { + // Whole-file clone (FICLONE) + debug!("remap_file_range: using FICLONE ioctl"); + unsafe { libc::ioctl(fd_out, 0x40049409, fd_in) } + } else { + // Partial clone (FICLONERANGE) + let range = FileCloneRange { + src_fd: fd_in as i64, + src_offset: offset_in, + src_length: len, + dest_offset: offset_out, + }; + debug!("remap_file_range: using FICLONERANGE ioctl"); + unsafe { libc::ioctl(fd_out, 0x4020940d, &range as *const FileCloneRange) } + }; + + if result < 0 { + let err = io::Error::last_os_error(); + debug!("remap_file_range: ioctl failed: {:?}", err); + Err(err) + } else { + debug!("remap_file_range: success, result={}", result); + // Return the length that was cloned (0 for whole-file means success) + Ok(if len == 0 { 0 } else { len as usize }) + } + } } #[cfg(test)] @@ -1858,4 +1937,94 @@ mod tests { assert_eq!(&result[..test_data.len()], test_data); assert_eq!(&result[test_data.len()..], &test_data[offset..]); } + + /// Test that Arc properly delegates all FileSystem 
methods. + /// + /// This catches a subtle bug where the `impl FileSystem for Arc` + /// blanket implementation might forget to delegate a method, causing it to use + /// the default trait implementation (which returns ENOSYS) instead. + #[test] + fn test_arc_delegates_filesystem_methods() { + let (fs, source) = prepare_fs_tmpdir(); + let arc_fs = Arc::new(fs); + let ctx = prepare_context(); + + // Create test files for copy/remap operations + let src_name = CString::new("arc_test_src.txt").unwrap(); + let dst_name = CString::new("arc_test_dst.txt").unwrap(); + let test_data = b"Arc delegation test data"; + + let args = CreateIn { + flags: libc::O_RDWR as u32, + mode: 0o644, + umask: 0, + fuse_flags: 0, + }; + + // Test basic operations through Arc - these should work, not return ENOSYS + let (src_entry, src_handle, _, _) = arc_fs.create(&ctx, ROOT_ID, &src_name, args).unwrap(); + let src_handle = src_handle.unwrap(); + + // Write test data + let src_path = source.as_path().join("arc_test_src.txt"); + std::fs::write(&src_path, test_data).unwrap(); + + let (dst_entry, dst_handle, _, _) = arc_fs.create(&ctx, ROOT_ID, &dst_name, args).unwrap(); + let dst_handle = dst_handle.unwrap(); + + // Test copy_file_range through Arc - should NOT return ENOSYS + let result = arc_fs.copy_file_range( + &ctx, + src_entry.inode, + src_handle, + 0, + dst_entry.inode, + dst_handle, + 0, + test_data.len() as u64, + 0, + ); + // copy_file_range should succeed or fail with a real error, never ENOSYS + match &result { + Ok(_) => {} // Success + Err(e) => { + assert_ne!( + e.raw_os_error(), + Some(libc::ENOSYS), + "Arc must delegate copy_file_range, not use default ENOSYS impl" + ); + } + } + + // Test remap_file_range through Arc - should NOT return ENOSYS + // On tmpfs this will return EOPNOTSUPP or EINVAL (no reflink support), + // but it should NEVER return ENOSYS (which would mean missing delegation) + let result = arc_fs.remap_file_range( + &ctx, + src_entry.inode, + src_handle, + 0, 
+ dst_entry.inode, + dst_handle, + 0, + 0, // len=0 means whole file + 0, + ); + match &result { + Ok(_) => {} // Success (would require btrfs/xfs with reflink) + Err(e) => { + assert_ne!( + e.raw_os_error(), + Some(libc::ENOSYS), + "Arc must delegate remap_file_range, not use default ENOSYS impl. \ + Got ENOSYS which means the Arc blanket impl is missing this method." + ); + // Expected: EOPNOTSUPP (95) or EINVAL (22) on tmpfs + } + } + + // Cleanup + arc_fs.release(&ctx, src_entry.inode, 0, src_handle, true, true, None).unwrap(); + arc_fs.release(&ctx, dst_entry.inode, 0, dst_handle, true, true, None).unwrap(); + } } From 3a212e57d801a735adc9075b9091ecaf9a6ff8ca Mon Sep 17 00:00:00 2001 From: ejc3 Date: Fri, 26 Dec 2025 10:20:13 +0000 Subject: [PATCH 16/19] Fix remap_file_range: proper return value and ioctl constants Fixes: - Add named constants for FICLONE/FICLONERANGE ioctl numbers - Fix return value for whole-file clone (len=0): get actual size via fstat instead of returning 0, which the kernel interprets as "0 bytes cloned" The kernel expects the number of bytes actually remapped. For FICLONE (whole file), we now query the source file size after successful clone. 
--- src/passthrough/sync_io.rs | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/src/passthrough/sync_io.rs b/src/passthrough/sync_io.rs index 015bc161..755da812 100644 --- a/src/passthrough/sync_io.rs +++ b/src/passthrough/sync_io.rs @@ -1508,6 +1508,11 @@ impl FileSystem for PassthroughFs { debug!("remap_file_range: fd_in={} fd_out={}", fd_in, fd_out); + // FICLONE = _IOW('9', 1, int) = 0x40049409 + // FICLONERANGE = _IOW('9', 13, struct file_clone_range) = 0x4020940d + const FICLONE: libc::c_ulong = 0x40049409; + const FICLONERANGE: libc::c_ulong = 0x4020940d; + // struct file_clone_range from #[repr(C)] struct FileCloneRange { @@ -1517,12 +1522,10 @@ impl FileSystem for PassthroughFs { dest_offset: u64, } - // FICLONE = _IOW('9', 1, int) = 0x40049409 - // FICLONERANGE = _IOW('9', 13, struct file_clone_range) = 0x4020940d let result = if len == 0 && offset_in == 0 && offset_out == 0 && flags == 0 { // Whole-file clone (FICLONE) debug!("remap_file_range: using FICLONE ioctl"); - unsafe { libc::ioctl(fd_out, 0x40049409, fd_in) } + unsafe { libc::ioctl(fd_out, FICLONE, fd_in) } } else { // Partial clone (FICLONERANGE) let range = FileCloneRange { @@ -1532,7 +1535,7 @@ impl FileSystem for PassthroughFs { dest_offset: offset_out, }; debug!("remap_file_range: using FICLONERANGE ioctl"); - unsafe { libc::ioctl(fd_out, 0x4020940d, &range as *const FileCloneRange) } + unsafe { libc::ioctl(fd_out, FICLONERANGE, &range as *const FileCloneRange) } }; if result < 0 { @@ -1540,9 +1543,19 @@ impl FileSystem for PassthroughFs { debug!("remap_file_range: ioctl failed: {:?}", err); Err(err) } else { - debug!("remap_file_range: success, result={}", result); - // Return the length that was cloned (0 for whole-file means success) - Ok(if len == 0 { 0 } else { len as usize }) + // For whole-file clone (len=0), get actual size from source file + let cloned_len = if len == 0 { + let mut stat: libc::stat = unsafe { std::mem::zeroed() }; + if 
unsafe { libc::fstat(fd_in, &mut stat) } == 0 { + stat.st_size as usize + } else { + 0 + } + } else { + len as usize + }; + debug!("remap_file_range: success, cloned {} bytes", cloned_len); + Ok(cloned_len) } } } From a8e8d0ca37a33467dde85d6aadb5e8967fe2530e Mon Sep 17 00:00:00 2001 From: ejc3 Date: Sat, 10 Jan 2026 02:16:55 +0000 Subject: [PATCH 17/19] Fix ioctl type for ARM64 compatibility --- src/passthrough/sync_io.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/passthrough/sync_io.rs b/src/passthrough/sync_io.rs index 755da812..edbde53e 100644 --- a/src/passthrough/sync_io.rs +++ b/src/passthrough/sync_io.rs @@ -1510,8 +1510,8 @@ impl FileSystem for PassthroughFs { // FICLONE = _IOW('9', 1, int) = 0x40049409 // FICLONERANGE = _IOW('9', 13, struct file_clone_range) = 0x4020940d - const FICLONE: libc::c_ulong = 0x40049409; - const FICLONERANGE: libc::c_ulong = 0x4020940d; + const FICLONE: libc::Ioctl = 0x40049409; + const FICLONERANGE: libc::Ioctl = 0x4020940d; // struct file_clone_range from #[repr(C)] From 1b001da6b2ad6da9d15c9ddd3214597abf964e44 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Sat, 10 Jan 2026 03:16:13 +0000 Subject: [PATCH 18/19] Fix O_WRONLY open for files without read permission When writeback cache is enabled, O_WRONLY is promoted to O_RDWR so the kernel can issue read requests for page cache operations. However, this fails with EACCES for files that have write-only permission (e.g., mode 0277). Fix: Try promoted flags first, fall back to original flags on EACCES. If the kernel later sends a READ request, it will correctly fail since the file is genuinely write-only. 
--- src/passthrough/sync_io.rs | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/passthrough/sync_io.rs b/src/passthrough/sync_io.rs index edbde53e..f8c491f1 100644 --- a/src/passthrough/sync_io.rs +++ b/src/passthrough/sync_io.rs @@ -38,7 +38,20 @@ impl PassthroughFs { if !self.cfg.allow_direct_io && flags & libc::O_DIRECT != 0 { new_flags &= !libc::O_DIRECT; } - data.open_file(new_flags | libc::O_CLOEXEC, &self.proc_self_fd) + // Try with promoted flags first. If that fails with EACCES (e.g., O_WRONLY + // promoted to O_RDWR but file has no read permission), fall back to original. + match data.open_file(new_flags | libc::O_CLOEXEC, &self.proc_self_fd) { + Ok(file) => Ok(file), + Err(e) if e.raw_os_error() == Some(libc::EACCES) && new_flags != flags => { + // Promotion failed due to permissions, try original flags + let mut orig_flags = flags; + if !self.cfg.allow_direct_io && flags & libc::O_DIRECT != 0 { + orig_flags &= !libc::O_DIRECT; + } + data.open_file(orig_flags | libc::O_CLOEXEC, &self.proc_self_fd) + } + Err(e) => Err(e), + } } } From f42317dc90341d73c42897d5a260d28d07760d6c Mon Sep 17 00:00:00 2001 From: ejc3 Date: Sun, 14 Jun 2026 19:14:55 +0000 Subject: [PATCH 19/19] passthrough: skip per-request fs-cred switch when lacking CAP_SETUID/CAP_SETGID ScopedCreds::new() switches fs credentials via setfsuid/setfsgid and then verifies the change took effect, hard-failing with PermissionDenied if not. But setfsuid/setfsgid are silent no-ops without CAP_SETUID/CAP_SETGID, so a process running unprivileged (e.g. fcvm's rootless volume server, uid!=0, CapEff=0) hard-fails for every non-root request. The consumer maps that errno-less PermissionDenied to EIO, so a non-root process in the guest gets EIO reading any passed-through file (fcvm #683: nginx worker -> HTTP 500). Gate the switch on actually having CAP_SETUID+CAP_SETGID (via the caps crate, same pattern as drop_cap_fsetid). 
When absent, skip it: the op runs as the server's own uid, and callers that need DAC enforcement must get it on the client side (fcvm's guest FUSE mount uses DefaultPermissions, so the guest kernel checks perms against the forwarded uid/gid before forwarding the op). Privileged servers (root) keep the caps so the switch and full POSIX semantics are unchanged. --- src/passthrough/mod.rs | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/passthrough/mod.rs b/src/passthrough/mod.rs index 9cdefc2e..4e25f7ca 100644 --- a/src/passthrough/mod.rs +++ b/src/passthrough/mod.rs @@ -986,6 +986,29 @@ impl ScopedCreds { return Ok(None); } + // Capability gate (rootless support, #683): the per-request fs-credential switch + // below uses setfsuid/setfsgid, which only take effect with CAP_SETUID/CAP_SETGID. + // When fcvm runs ROOTLESS the volume server is the unprivileged user (no caps), so + // setfsuid is a silent no-op and the verify-and-hard-fail below returns + // PermissionDenied — which the fuse-pipe server maps to EIO, breaking non-root reads + // of --map'd volumes (e.g. nginx's worker → HTTP 500). Skip the switch when we lack + // the caps: the op then runs as the server's own uid (which owns the mapped files), + // and the guest FUSE mount uses DefaultPermissions so the guest kernel already + // enforces the real POSIX DAC against the forwarded uid/gid before the request + // reaches us. Privileged paths (root / bridged / pjdfstest run the server as root) + // keep the caps, so the switch and full POSIX enforcement are unchanged there.
+ let can_setid = caps::has_cap(None, caps::CapSet::Effective, caps::Capability::CAP_SETUID) + .unwrap_or(false) + && caps::has_cap(None, caps::CapSet::Effective, caps::Capability::CAP_SETGID) + .unwrap_or(false); + if !can_setid { + debug!( + "set_creds: lacking CAP_SETUID/CAP_SETGID (rootless), skipping fs-cred \ + switch; op runs as server uid" + ); + return Ok(None); + } + // Get current groups before we change anything let original_groups = get_current_groups().unwrap_or_default(); debug!("set_creds: original_groups={:?}", original_groups);