diff --git a/src/api/filesystem/async_io.rs b/src/api/filesystem/async_io.rs index fdc846a4..ca36b060 100644 --- a/src/api/filesystem/async_io.rs +++ b/src/api/filesystem/async_io.rs @@ -797,6 +797,20 @@ pub trait AsyncFileSystem: FileSystem { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } + /// Reposition read/write file offset with a signed offset. + /// + /// Default implementation forwards to [`lseek`] for backward compatibility. + fn lseek_signed( + &self, + ctx: Context, + inode: Self::Inode, + handle: Self::Handle, + offset: i64, + whence: u32, + ) -> io::Result { + self.lseek(ctx, inode, handle, offset as u64, whence) + } + /// TODO: support this fn getlk(&self) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) diff --git a/src/api/filesystem/mod.rs b/src/api/filesystem/mod.rs index aef5a62c..9a7cfb75 100644 --- a/src/api/filesystem/mod.rs +++ b/src/api/filesystem/mod.rs @@ -383,7 +383,7 @@ pub trait ZeroCopyWriter: io::Write { } /// Additional context associated with requests. -#[derive(Default, Clone, Copy, Debug)] +#[derive(Default, Clone, Debug)] pub struct Context { /// The user ID of the calling process. pub uid: libc::uid_t, @@ -393,6 +393,12 @@ pub struct Context { /// The thread group ID of the calling process. pub pid: libc::pid_t, + + /// Supplementary groups of the calling process. + /// + /// When set, these groups are used directly instead of reading from /proc//status. + /// This is essential for remote filesystems where the PID doesn't exist on the server. 
+ pub supplementary_groups: Option>, } impl Context { @@ -408,6 +414,7 @@ impl From<&fuse::InHeader> for Context { uid: source.uid, gid: source.gid, pid: source.pid as i32, + supplementary_groups: None, } } } diff --git a/src/api/filesystem/sync_io.rs b/src/api/filesystem/sync_io.rs index 229c1f43..3f711a2b 100644 --- a/src/api/filesystem/sync_io.rs +++ b/src/api/filesystem/sync_io.rs @@ -813,6 +813,22 @@ pub trait FileSystem { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } + /// Reposition read/write file offset with a signed offset. + /// + /// Default implementation forwards to [`lseek`] for backward compatibility. + /// Filesystems that need negative offsets (e.g. SEEK_END) can override this + /// to receive the signed value directly. + fn lseek_signed( + &self, + ctx: &Context, + inode: Self::Inode, + handle: Self::Handle, + offset: i64, + whence: u32, + ) -> io::Result { + self.lseek(ctx, inode, handle, offset as u64, whence) + } + /// Query file lock status fn getlk( &self, @@ -852,6 +868,53 @@ pub trait FileSystem { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } + /// Copy a range of data from one file to another. + /// + /// Performs an optimized copy between two file descriptors. On filesystems + /// that support it (like btrfs), this creates a reflink (copy-on-write clone) + /// which is nearly instantaneous regardless of file size. + /// + /// Returns the number of bytes copied. + #[allow(clippy::too_many_arguments)] + fn copy_file_range( + &self, + ctx: &Context, + inode_in: Self::Inode, + handle_in: Self::Handle, + offset_in: u64, + inode_out: Self::Inode, + handle_out: Self::Handle, + offset_out: u64, + len: u64, + flags: u64, + ) -> io::Result { + Err(io::Error::from_raw_os_error(libc::ENOSYS)) + } + + /// Remap file ranges (FICLONE/FICLONERANGE) for copy-on-write filesystems. + /// + /// This is the server-side implementation of the FUSE_REMAP_FILE_RANGE opcode, + /// which enables FICLONE and FICLONERANGE ioctls through FUSE. 
On btrfs and + /// other CoW filesystems, this creates reflinks - instant copies that share + /// the same physical storage until modified. + /// + /// Returns the number of bytes remapped. + #[allow(clippy::too_many_arguments)] + fn remap_file_range( + &self, + ctx: &Context, + inode_in: Self::Inode, + handle_in: Self::Handle, + offset_in: u64, + inode_out: Self::Inode, + handle_out: Self::Handle, + offset_out: u64, + len: u64, + flags: u32, + ) -> io::Result { + Err(io::Error::from_raw_os_error(libc::ENOSYS)) + } + /// send ioctl to the file #[allow(clippy::too_many_arguments)] fn ioctl( @@ -1263,6 +1326,18 @@ impl FileSystem for Arc { self.deref().lseek(ctx, inode, handle, offset, whence) } + fn lseek_signed( + &self, + ctx: &Context, + inode: Self::Inode, + handle: Self::Handle, + offset: i64, + whence: u32, + ) -> io::Result { + self.deref() + .lseek_signed(ctx, inode, handle, offset, whence) + } + /// Query file lock status fn getlk( &self, @@ -1352,4 +1427,40 @@ impl FileSystem for Arc { fn id_remap(&self, ctx: &mut Context) -> io::Result<()> { self.deref().id_remap(ctx) } + + #[allow(clippy::too_many_arguments)] + fn copy_file_range( + &self, + ctx: &Context, + inode_in: Self::Inode, + handle_in: Self::Handle, + offset_in: u64, + inode_out: Self::Inode, + handle_out: Self::Handle, + offset_out: u64, + len: u64, + flags: u64, + ) -> io::Result { + self.deref().copy_file_range( + ctx, inode_in, handle_in, offset_in, inode_out, handle_out, offset_out, len, flags, + ) + } + + #[allow(clippy::too_many_arguments)] + fn remap_file_range( + &self, + ctx: &Context, + inode_in: Self::Inode, + handle_in: Self::Handle, + offset_in: u64, + inode_out: Self::Inode, + handle_out: Self::Handle, + offset_out: u64, + len: u64, + flags: u32, + ) -> io::Result { + self.deref().remap_file_range( + ctx, inode_in, handle_in, offset_in, inode_out, handle_out, offset_out, len, flags, + ) + } } diff --git a/src/api/server/sync_io.rs b/src/api/server/sync_io.rs index 
ba32efbc..e6aa0d36 100644 --- a/src/api/server/sync_io.rs +++ b/src/api/server/sync_io.rs @@ -1193,11 +1193,15 @@ impl Server { let LseekIn { fh, offset, whence, .. } = ctx.r.read_obj().map_err(Error::DecodeMessage)?; + let offset_signed = offset as i64; - match self - .fs - .lseek(ctx.context(), ctx.nodeid(), fh.into(), offset, whence) - { + match self.fs.lseek_signed( + ctx.context(), + ctx.nodeid(), + fh.into(), + offset_signed, + whence, + ) { Ok(offset) => { let out = LseekOut { offset }; diff --git a/src/passthrough/mod.rs b/src/passthrough/mod.rs index 0f0e5408..4e25f7ca 100644 --- a/src/passthrough/mod.rs +++ b/src/passthrough/mod.rs @@ -28,6 +28,8 @@ use std::time::Duration; use vm_memory::{bitmap::BitmapSlice, ByteValued}; +use crate::api::filesystem::Context; + pub use self::config::{CachePolicy, Config}; use self::file_handle::{FileHandle, OpenableFileHandle}; use self::inode_store::{InodeId, InodeStore}; @@ -888,66 +890,230 @@ impl BackendFileSystem for PassthroughFs } } -macro_rules! scoped_cred { - ($name:ident, $ty:ty, $syscall_nr:expr) => { - #[derive(Debug)] - pub(crate) struct $name; - - impl $name { - // Changes the effective uid/gid of the current thread to `val`. Changes - // the thread's credentials back to root when the returned struct is dropped. - fn new(val: $ty) -> io::Result> { - if val == 0 { - // Nothing to do since we are already uid 0. - return Ok(None); - } +/// RAII guard for temporarily switching filesystem uid/gid credentials. +/// +/// Uses setfsuid()/setfsgid() syscalls which ONLY affect filesystem access +/// checks for the calling thread. Unlike setresuid/setresgid, these do NOT +/// change the real/effective uid, so the process retains access to /proc/self/fd/. +/// +/// This is critical because fuse-backend-rs internally uses readlinkat on +/// /proc/self/fd/ to resolve inode paths - if we changed euid to non-root, +/// those operations would fail with EACCES. 
+/// +/// # Thread Safety +/// +/// setfsuid() and setfsgid() are inherently per-thread on Linux - they do not +/// use glibc's signal-based synchronization that setresuid/setresgid use to +/// comply with POSIX. This makes them ideal for multi-threaded file servers +/// that need to serve multiple users concurrently in different threads. +/// +/// See: https://man7.org/linux/man-pages/man2/setfsuid.2.html +/// See: https://stackoverflow.com/questions/1223600/change-uid-gid-only-of-one-thread-in-linux +/// +/// # Supplementary Groups +/// +/// FUSE protocol only passes uid and primary gid in requests, not supplementary groups. +/// The caller must forward supplementary groups via the Context struct. We adopt them +/// using setgroups() raw syscall for per-thread credential switching. +#[derive(Debug)] +pub(crate) struct ScopedCreds { + original_fsuid: libc::uid_t, + original_fsgid: libc::gid_t, + original_groups: Vec, +} - // We want credential changes to be per-thread because otherwise - // we might interfere with operations being carried out on other - // threads with different uids/gids. However, posix requires that - // all threads in a process share the same credentials. To do this - // libc uses signals to ensure that when one thread changes its - // credentials the other threads do the same thing. - // - // So instead we invoke the syscall directly in order to get around - // this limitation. Another option is to use the setfsuid and - // setfsgid systems calls. However since those calls have no way to - // return an error, it's preferable to do this instead. - - // This call is safe because it doesn't modify any memory and we - // check the return value. - let res = unsafe { libc::syscall($syscall_nr, -1, val, -1) }; - if res == 0 { - Ok(Some($name)) - } else { - Err(io::Error::last_os_error()) - } +/// Thread-local setgroups using raw syscall. 
+/// +/// IMPORTANT: We use the raw SYS_setgroups syscall instead of libc::setgroups() +/// because glibc's NPTL wrapper signals ALL threads to change credentials for +/// POSIX compliance. The raw kernel syscall is per-thread, which is what we need +/// for a multi-threaded FUSE server where each thread handles different users. +/// +/// See: https://man7.org/linux/man-pages/man7/nptl.7.html +/// See: https://github.com/rfjakob/gocryptfs (uses same approach) +unsafe fn setgroups_thread(groups: &[libc::gid_t]) -> libc::c_long { + libc::syscall( + libc::SYS_setgroups, + groups.len() as libc::size_t, + groups.as_ptr(), + ) +} + +/// Thread-local getgroups using raw syscall. +/// +/// Like setgroups_thread, this uses the raw syscall to get per-thread groups. +unsafe fn getgroups_thread(size: libc::c_int, list: *mut libc::gid_t) -> libc::c_long { + libc::syscall(libc::SYS_getgroups, size, list) +} + +/// Get current supplementary groups for this thread +fn get_current_groups() -> io::Result> { + // First call with size 0 to get the actual count + let count = unsafe { getgroups_thread(0, std::ptr::null_mut()) }; + if count < 0 { + return Err(io::Error::last_os_error()); + } + + if count == 0 { + return Ok(Vec::new()); + } + + let mut groups = vec![0 as libc::gid_t; count as usize]; + let actual = unsafe { getgroups_thread(count as libc::c_int, groups.as_mut_ptr()) }; + if actual < 0 { + return Err(io::Error::last_os_error()); + } + + groups.truncate(actual as usize); + Ok(groups) +} + +impl ScopedCreds { + /// Switch filesystem credentials to the given uid/gid/groups. + /// Returns None if both uid and gid are 0 (already root). + /// + /// The `supplementary_groups` parameter allows passing supplementary groups + /// to adopt for the duration of the filesystem operation. 
+ fn new( + uid: libc::uid_t, + gid: libc::gid_t, + supplementary_groups: Option<&[libc::gid_t]>, + ) -> io::Result> { + debug!("set_creds: switching to uid={} gid={}", uid, gid); + if uid == 0 && gid == 0 { + // Nothing to do since we are already uid/gid 0. + debug!("set_creds: uid=0 gid=0, nothing to do"); + return Ok(None); + } + + // Capability gate (rootless support, #683): the per-request fs-credential switch + // below uses setfsuid/setfsgid, which only take effect with CAP_SETUID/CAP_SETGID. + // When fcvm runs ROOTLESS the volume server is the unprivileged user (no caps), so + // setfsuid is a silent no-op and the verify-and-hard-fail below returns + // PermissionDenied — which the fuse-pipe server maps to EIO, breaking non-root reads + // of --map'd volumes (e.g. nginx's worker → HTTP 500). Skip the switch when we lack + // the caps: the op then runs as the server's own uid (which owns the mapped files), + // and the guest FUSE mount uses DefaultPermissions so the guest kernel already + // enforces the real POSIX DAC against the forwarded uid/gid before the request + // reaches us. Privileged paths (root / bridged / pjdfstest run the server as root) + // keep the caps, so the switch and full POSIX enforcement are unchanged there. 
+ let can_setid = caps::has_cap(None, caps::CapSet::Effective, caps::Capability::CAP_SETUID) + .unwrap_or(false) + && caps::has_cap(None, caps::CapSet::Effective, caps::Capability::CAP_SETGID) + .unwrap_or(false); + if !can_setid { + debug!( + "set_creds: lacking CAP_SETUID/CAP_SETGID (rootless), skipping fs-cred \ + switch; op runs as server uid" + ); + return Ok(None); + } + + // Get current groups before we change anything + let original_groups = get_current_groups().unwrap_or_default(); + debug!("set_creds: original_groups={:?}", original_groups); + + // Use provided supplementary groups (required - caller must forward them) + let caller_groups = supplementary_groups.unwrap_or(&[]).to_vec(); + debug!("set_creds: supplementary_groups={:?}", caller_groups); + + // Set supplementary groups first (before dropping privileges) + // Use raw syscall for per-thread behavior (not glibc's process-wide wrapper) + if !caller_groups.is_empty() { + let ret = unsafe { setgroups_thread(&caller_groups) }; + if ret < 0 { + let err = io::Error::last_os_error(); + debug!("set_creds: setgroups({:?}) failed: {}", caller_groups, err); + // Don't fail - continue without supplementary groups + // This can happen if we don't have CAP_SETGID + } else { + debug!("set_creds: setgroups({:?}) succeeded", caller_groups); } } - impl Drop for $name { - fn drop(&mut self) { - let res = unsafe { libc::syscall($syscall_nr, -1, 0, -1) }; - if res < 0 { - error!( - "fuse: failed to change credentials back to root: {}", - io::Error::last_os_error(), - ); - } + // setfsuid/setfsgid return the PREVIOUS value, not an error code. + // To detect errors, we call twice: first to set, then to verify. 
+ + // Change gid first (same reason as before - changing uid might drop privileges) + let original_fsgid = unsafe { libc::setfsgid(gid) } as libc::gid_t; + // Verify the change took effect + let verify_gid = unsafe { libc::setfsgid(gid) } as libc::gid_t; + debug!( + "set_creds: setfsgid({}) returned original={} verify={}", + gid, original_fsgid, verify_gid + ); + if verify_gid != gid { + // Restore groups and return error + if !original_groups.is_empty() { + unsafe { setgroups_thread(&original_groups) }; } + unsafe { libc::setfsgid(original_fsgid) }; + return Err(io::Error::new( + io::ErrorKind::PermissionDenied, + format!("setfsgid({}) failed, got {}", gid, verify_gid), + )); } - }; + + // Now change uid + let original_fsuid = unsafe { libc::setfsuid(uid) } as libc::uid_t; + // Verify the change took effect + let verify_uid = unsafe { libc::setfsuid(uid) } as libc::uid_t; + debug!( + "set_creds: setfsuid({}) returned original={} verify={}", + uid, original_fsuid, verify_uid + ); + if verify_uid != uid { + // Restore all and return error + if !original_groups.is_empty() { + unsafe { setgroups_thread(&original_groups) }; + } + unsafe { libc::setfsgid(original_fsgid) }; + unsafe { libc::setfsuid(original_fsuid) }; + return Err(io::Error::new( + io::ErrorKind::PermissionDenied, + format!("setfsuid({}) failed, got {}", uid, verify_uid), + )); + } + + debug!( + "set_creds: success, original_fsuid={} original_fsgid={}", + original_fsuid, original_fsgid + ); + Ok(Some(ScopedCreds { + original_fsuid, + original_fsgid, + original_groups, + })) + } } -scoped_cred!(ScopedUid, libc::uid_t, libc::SYS_setresuid); -scoped_cred!(ScopedGid, libc::gid_t, libc::SYS_setresgid); - -fn set_creds( - uid: libc::uid_t, - gid: libc::gid_t, -) -> io::Result<(Option, Option)> { - // We have to change the gid before we change the uid because if we change the uid first then we - // lose the capability to change the gid. However changing back can happen in any order. 
- ScopedGid::new(gid).and_then(|gid| Ok((ScopedUid::new(uid)?, gid))) + +impl Drop for ScopedCreds { + fn drop(&mut self) { + // Restore original credentials (order: uid first, then gid, then groups) + // Restore uid first so we have privileges to restore groups + unsafe { libc::setfsuid(self.original_fsuid) }; + unsafe { libc::setfsgid(self.original_fsgid) }; + // Restore supplementary groups using raw syscall for per-thread behavior + if !self.original_groups.is_empty() { + unsafe { setgroups_thread(&self.original_groups) }; + } else { + // If we had no groups originally, clear them + unsafe { setgroups_thread(&[]) }; + } + } +} + +/// Switch filesystem credentials to the given uid/gid. +/// This is the upstream-compatible API that doesn't handle supplementary groups. +#[allow(dead_code)] +fn set_creds(uid: libc::uid_t, gid: libc::gid_t) -> io::Result> { + ScopedCreds::new(uid, gid, None) +} + +/// Switch filesystem credentials from Context. +/// Uses supplementary_groups from Context if available. +fn set_creds_from_context(ctx: &Context) -> io::Result> { + ScopedCreds::new(ctx.uid, ctx.gid, ctx.supplementary_groups.as_deref()) } struct CapFsetid {} @@ -961,9 +1127,11 @@ impl Drop for CapFsetid { } fn drop_cap_fsetid() -> io::Result> { - if !caps::has_cap(None, caps::CapSet::Effective, caps::Capability::CAP_FSETID) - .map_err(|_e| io::Error::new(io::ErrorKind::PermissionDenied, "no CAP_FSETID capability"))? 
- { + // Use unwrap_or(false) instead of propagating error - if we can't check + // capabilities, assume we don't have them and continue without error + let has_cap = + caps::has_cap(None, caps::CapSet::Effective, caps::Capability::CAP_FSETID).unwrap_or(false); + if !has_cap { return Ok(None); } caps::drop(None, caps::CapSet::Effective, caps::Capability::CAP_FSETID).map_err(|_e| { diff --git a/src/passthrough/sync_io.rs b/src/passthrough/sync_io.rs index d795f308..f8c491f1 100644 --- a/src/passthrough/sync_io.rs +++ b/src/passthrough/sync_io.rs @@ -38,7 +38,20 @@ impl PassthroughFs { if !self.cfg.allow_direct_io && flags & libc::O_DIRECT != 0 { new_flags &= !libc::O_DIRECT; } - data.open_file(new_flags | libc::O_CLOEXEC, &self.proc_self_fd) + // Try with promoted flags first. If that fails with EACCES (e.g., O_WRONLY + // promoted to O_RDWR but file has no read permission), fall back to original. + match data.open_file(new_flags | libc::O_CLOEXEC, &self.proc_self_fd) { + Ok(file) => Ok(file), + Err(e) if e.raw_os_error() == Some(libc::EACCES) && new_flags != flags => { + // Promotion failed due to permissions, try original flags + let mut orig_flags = flags; + if !self.cfg.allow_direct_io && flags & libc::O_DIRECT != 0 { + orig_flags &= !libc::O_DIRECT; + } + data.open_file(orig_flags | libc::O_CLOEXEC, &self.proc_self_fd) + } + Err(e) => Err(e), + } } } @@ -365,11 +378,13 @@ impl FileSystem for PassthroughFs { } } - fn lookup(&self, _ctx: &Context, parent: Inode, name: &CStr) -> io::Result { + fn lookup(&self, ctx: &Context, parent: Inode, name: &CStr) -> io::Result { // Don't use is_safe_path_component(), allow "." and ".." 
for NFS export support if name.to_bytes_with_nul().contains(&SLASH_ASCII) { return Err(einval()); } + // Switch to caller's credentials for directory search permission check + let _creds = set_creds_from_context(ctx)?; self.do_lookup(parent, name) } @@ -389,7 +404,7 @@ impl FileSystem for PassthroughFs { fn opendir( &self, - _ctx: &Context, + ctx: &Context, inode: Inode, flags: u32, ) -> io::Result<(Option, OpenOptions)> { @@ -397,6 +412,8 @@ impl FileSystem for PassthroughFs { info!("fuse: opendir is not supported."); Err(enosys()) } else { + // Switch to caller's credentials for permission check + let _creds = set_creds_from_context(ctx)?; self.do_open(inode, flags | (libc::O_DIRECTORY as u32), 0) .map(|(a, b, _)| (a, b)) } @@ -429,13 +446,11 @@ impl FileSystem for PassthroughFs { let data = self.inode_map.get(parent)?; - let res = { - let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?; - - let file = data.get_file()?; - // Safe because this doesn't modify any memory and we check the return value. - unsafe { libc::mkdirat(file.as_raw_fd(), name.as_ptr(), mode & !umask) } - }; + let file = data.get_file()?; + // Switch to caller's credentials so the directory is owned by them + let _creds = set_creds_from_context(ctx)?; + // Safe because this doesn't modify any memory and we check the return value. 
+ let res = unsafe { libc::mkdirat(file.as_raw_fd(), name.as_ptr(), mode & !umask) }; if res < 0 { return Err(io::Error::last_os_error()); } @@ -443,8 +458,10 @@ impl FileSystem for PassthroughFs { self.do_lookup(parent, name) } - fn rmdir(&self, _ctx: &Context, parent: Inode, name: &CStr) -> io::Result<()> { + fn rmdir(&self, ctx: &Context, parent: Inode, name: &CStr) -> io::Result<()> { self.validate_path_component(name)?; + // Switch to caller's credentials for permission check + let _creds = set_creds_from_context(ctx)?; self.do_unlink(parent, name, libc::AT_REMOVEDIR) } @@ -519,7 +536,7 @@ impl FileSystem for PassthroughFs { fn open( &self, - _ctx: &Context, + ctx: &Context, inode: Inode, flags: u32, fuse_flags: u32, @@ -528,6 +545,8 @@ impl FileSystem for PassthroughFs { info!("fuse: open is not supported."); Err(enosys()) } else { + // Switch to caller's credentials for permission check + let _creds = set_creds_from_context(ctx)?; self.do_open(inode, flags, fuse_flags) } } @@ -561,12 +580,11 @@ impl FileSystem for PassthroughFs { let dir = self.inode_map.get(parent)?; let dir_file = dir.get_file()?; - let new_file = { - let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?; - - let flags = self.get_writeback_open_flags(args.flags as i32); - Self::create_file_excl(&dir_file, name, flags, args.mode & !(args.umask & 0o777))? - }; + let flags = self.get_writeback_open_flags(args.flags as i32); + // Switch to caller's credentials so the file is owned by them + let _creds = set_creds_from_context(ctx)?; + let new_file = + Self::create_file_excl(&dir_file, name, flags, args.mode & !(args.umask & 0o777))?; let entry = self.do_lookup(parent, name)?; let file = match new_file { @@ -584,7 +602,6 @@ impl FileSystem for PassthroughFs { None }; - let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?; self.open_inode(entry.inode, args.flags as i32)? 
} }; @@ -610,8 +627,10 @@ impl FileSystem for PassthroughFs { Ok((entry, ret_handle, opts, None)) } - fn unlink(&self, _ctx: &Context, parent: Inode, name: &CStr) -> io::Result<()> { + fn unlink(&self, ctx: &Context, parent: Inode, name: &CStr) -> io::Result<()> { self.validate_path_component(name)?; + // Switch to caller's credentials for permission check + let _creds = set_creds_from_context(ctx)?; self.do_unlink(parent, name, 0) } @@ -680,7 +699,7 @@ impl FileSystem for PassthroughFs { fn write( &self, - _ctx: &Context, + ctx: &Context, inode: Inode, handle: Handle, r: &mut dyn ZeroCopyReader, @@ -691,6 +710,23 @@ impl FileSystem for PassthroughFs { flags: u32, fuse_flags: u32, ) -> io::Result { + debug!( + "write: inode={} handle={} size={} uid={} gid={}", + inode, handle, size, ctx.uid, ctx.gid + ); + // Switch to caller's credentials for the write operation. + // This is needed for proper SUID/SGID bit handling when a non-owner writes. + let _creds = match set_creds_from_context(ctx) { + Ok(c) => { + debug!("write: set_creds succeeded"); + c + } + Err(e) => { + debug!("write: set_creds FAILED: {:?}", e); + return Err(e); + } + }; + let data = self.get_data(handle, inode, libc::O_RDWR)?; // Manually implement File::try_clone() by borrowing fd of data.file instead of dup(). 
@@ -729,12 +765,14 @@ impl FileSystem for PassthroughFs { fn setattr( &self, - _ctx: &Context, + ctx: &Context, inode: Inode, attr: libc::stat64, handle: Option, valid: SetattrValid, ) -> io::Result<(libc::stat64, Duration)> { + let no_open_val = self.no_open.load(Ordering::Relaxed); + debug!(target: "passthrough", "setattr inode={} handle={:?} valid={:?} no_open={}", inode, handle, valid, no_open_val); let inode_data = self.inode_map.get(inode)?; enum Data { @@ -764,6 +802,44 @@ impl FileSystem for PassthroughFs { } if valid.contains(SetattrValid::MODE) { + // Get current mode to detect SUID/SGID clearing + let current_mode = { + let mut st: libc::stat64 = unsafe { std::mem::zeroed() }; + let res = unsafe { + match &data { + Data::Handle(h) => libc::fstat64(h.borrow_fd().as_raw_fd(), &mut st), + Data::ProcPath(p) => { + libc::fstatat64(self.proc_self_fd.as_raw_fd(), p.as_ptr(), &mut st, 0) + } + } + }; + if res < 0 { + return Err(io::Error::last_os_error()); + } + st.st_mode + }; + + // Check if this is SUID/SGID clearing by the kernel (happens on write by non-owner) + // The kernel sends SETATTR with the new mode = old mode with S_ISUID/S_ISGID cleared. + // In this case, we should NOT switch credentials because: + // 1. The operation is kernel-initiated, not user-initiated + // 2. The writing user doesn't have permission to chmod (they're not the owner) + // 3. The SUID/SGID clearing must succeed for POSIX compliance + let old_special_bits = current_mode & (libc::S_ISUID | libc::S_ISGID); + let new_special_bits = attr.st_mode & (libc::S_ISUID | libc::S_ISGID); + let is_suid_sgid_clearing = old_special_bits != 0 + && new_special_bits == 0 + && (current_mode & 0o777) == (attr.st_mode & 0o777); + + let _creds = if is_suid_sgid_clearing { + // Kernel clearing SUID/SGID - do as root + None + } else { + // User-initiated chmod - switch to caller's credentials for permission check + // (only file owner or root can chmod) + Some(set_creds_from_context(ctx)?) 
+ }; + // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { match data { @@ -795,6 +871,10 @@ impl FileSystem for PassthroughFs { // Safe because this is a constant value and a valid C string. let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) }; + // Switch to caller's credentials for permission check + // (only root can change owner, owner can change group) + let _creds = set_creds_from_context(ctx)?; + // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { libc::fchownat( @@ -822,10 +902,17 @@ impl FileSystem for PassthroughFs { // Safe because this doesn't modify any memory and we check the return value. let res = match data { - Data::Handle(ref h) => unsafe { - libc::ftruncate(h.borrow_fd().as_raw_fd(), attr.st_size) - }, + Data::Handle(ref h) => { + // ftruncate on an already-opened fd doesn't re-check file permissions. + // The permission was validated when the fd was opened, so we don't + // switch credentials here - ftruncate should succeed if the fd has + // write access, regardless of current file mode. + unsafe { libc::ftruncate(h.borrow_fd().as_raw_fd(), attr.st_size) } + } _ => { + // No file handle - need to open the file, which requires permission check. + // Switch to caller's credentials for this case. + let _creds = set_creds_from_context(ctx)?; // There is no `ftruncateat` so we need to get a new fd and truncate it. let f = self.open_inode(inode, libc::O_NONBLOCK | libc::O_RDWR)?; unsafe { libc::ftruncate(f.as_raw_fd(), attr.st_size) } @@ -862,6 +949,10 @@ impl FileSystem for PassthroughFs { tvs[1].tv_nsec = attr.st_mtime_nsec; } + // Switch to caller's credentials for permission check + // (utimensat requires write permission or ownership) + let _creds = set_creds_from_context(ctx)?; + // Safe because this doesn't modify any memory and we check the return value. 
let res = match data { Data::Handle(ref h) => unsafe { @@ -881,7 +972,7 @@ impl FileSystem for PassthroughFs { fn rename( &self, - _ctx: &Context, + ctx: &Context, olddir: Inode, oldname: &CStr, newdir: Inode, @@ -896,6 +987,9 @@ impl FileSystem for PassthroughFs { let old_file = old_inode.get_file()?; let new_file = new_inode.get_file()?; + // Switch to caller's credentials for permission check + let _creds = set_creds_from_context(ctx)?; + // Safe because this doesn't modify any memory and we check the return value. // TODO: Switch to libc::renameat2 once https://github.com/rust-lang/libc/pull/1508 lands // and we have glibc 2.28. @@ -930,18 +1024,16 @@ impl FileSystem for PassthroughFs { let data = self.inode_map.get(parent)?; let file = data.get_file()?; - let res = { - let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?; - - // Safe because this doesn't modify any memory and we check the return value. - unsafe { - libc::mknodat( - file.as_raw_fd(), - name.as_ptr(), - (mode & !umask) as libc::mode_t, - u64::from(rdev), - ) - } + // Switch to caller's credentials so the node is owned by them + let _creds = set_creds_from_context(ctx)?; + // Safe because this doesn't modify any memory and we check the return value. + let res = unsafe { + libc::mknodat( + file.as_raw_fd(), + name.as_ptr(), + (mode & !umask) as libc::mode_t, + u64::from(rdev), + ) }; if res < 0 { Err(io::Error::last_os_error()) @@ -967,6 +1059,12 @@ impl FileSystem for PassthroughFs { // Safe because this is a constant value and a valid C string. let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) }; + // NOTE: We do NOT call set_creds() here because: + // 1. linkat with AT_EMPTY_PATH requires CAP_DAC_READ_SEARCH capability + // 2. set_creds() drops to non-root uid which loses CAP_DAC_READ_SEARCH + // 3. 
Permission checks should be done by the kernel via default_permissions + // mount option before the LINK request reaches the FUSE server + // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { libc::linkat( @@ -994,14 +1092,12 @@ impl FileSystem for PassthroughFs { self.validate_path_component(name)?; let data = self.inode_map.get(parent)?; + let file = data.get_file()?; - let res = { - let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?; - - let file = data.get_file()?; - // Safe because this doesn't modify any memory and we check the return value. - unsafe { libc::symlinkat(linkname.as_ptr(), file.as_raw_fd(), name.as_ptr()) } - }; + // Switch to caller's credentials so the symlink is owned by them + let _creds = set_creds_from_context(ctx)?; + // Safe because this doesn't modify any memory and we check the return value. + let res = unsafe { libc::symlinkat(linkname.as_ptr(), file.as_raw_fd(), name.as_ptr()) }; if res == 0 { self.do_lookup(parent, name) } else { @@ -1310,6 +1406,17 @@ impl FileSystem for PassthroughFs { handle: Handle, offset: u64, whence: u32, + ) -> io::Result { + self.lseek_signed(_ctx, inode, handle, offset as i64, whence) + } + + fn lseek_signed( + &self, + _ctx: &Context, + inode: Inode, + handle: Handle, + offset: i64, + whence: u32, ) -> io::Result { // Let the Arc in scope, otherwise fd may get invalid. 
let data = self.handle_map.get(handle, inode)?; @@ -1331,6 +1438,139 @@ impl FileSystem for PassthroughFs { Ok(res as u64) } } + + fn copy_file_range( + &self, + _ctx: &Context, + inode_in: Inode, + handle_in: Handle, + offset_in: u64, + inode_out: Inode, + handle_out: Handle, + offset_out: u64, + len: u64, + flags: u64, + ) -> io::Result<usize> { + // Get file descriptors from handles + let data_in = self.handle_map.get(handle_in, inode_in)?; + let data_out = self.handle_map.get(handle_out, inode_out)?; + + let (_guard_in, file_in) = data_in.get_file_mut(); + let (_guard_out, file_out) = data_out.get_file_mut(); + + let mut off_in = offset_in as libc::off64_t; + let mut off_out = offset_out as libc::off64_t; + + // Safe because we check the return value and the fds are valid + let result = unsafe { + libc::copy_file_range( + file_in.as_raw_fd(), + &mut off_in, + file_out.as_raw_fd(), + &mut off_out, + len as libc::size_t, + flags as libc::c_uint, + ) + }; + + if result < 0 { + Err(io::Error::last_os_error()) + } else { + Ok(result as usize) + } + } + + fn remap_file_range( + &self, + _ctx: &Context, + inode_in: Inode, + handle_in: Handle, + offset_in: u64, + inode_out: Inode, + handle_out: Handle, + offset_out: u64, + len: u64, + flags: u32, + ) -> io::Result<usize> { + debug!( + "remap_file_range: ino_in={} fh_in={} ino_out={} fh_out={} len={} flags={}", + inode_in, handle_in, inode_out, handle_out, len, flags + ); + + // Get file descriptors from handles + let data_in = match self.handle_map.get(handle_in, inode_in) { + Ok(d) => d, + Err(e) => { + debug!("remap_file_range: handle_map.get(in) failed: {:?}", e); + return Err(e); + } + }; + let data_out = match self.handle_map.get(handle_out, inode_out) { + Ok(d) => d, + Err(e) => { + debug!("remap_file_range: handle_map.get(out) failed: {:?}", e); + return Err(e); + } + }; + + let (_guard_in, file_in) = data_in.get_file_mut(); + let (_guard_out, file_out) = data_out.get_file_mut(); + + let fd_in = file_in.as_raw_fd(); + let
fd_out = file_out.as_raw_fd(); + + debug!("remap_file_range: fd_in={} fd_out={}", fd_in, fd_out); + + // FICLONE = _IOW(0x94, 9, int) = 0x40049409 + // FICLONERANGE = _IOW(0x94, 13, struct file_clone_range) = 0x4020940d + const FICLONE: libc::Ioctl = 0x40049409; + const FICLONERANGE: libc::Ioctl = 0x4020940d; + + // struct file_clone_range from <linux/fs.h> + #[repr(C)] + struct FileCloneRange { + src_fd: i64, + src_offset: u64, + src_length: u64, + dest_offset: u64, + } + + let result = if len == 0 && offset_in == 0 && offset_out == 0 && flags == 0 { + // Whole-file clone (FICLONE) + debug!("remap_file_range: using FICLONE ioctl"); + unsafe { libc::ioctl(fd_out, FICLONE, fd_in) } + } else { + // Partial clone (FICLONERANGE) + let range = FileCloneRange { + src_fd: fd_in as i64, + src_offset: offset_in, + src_length: len, + dest_offset: offset_out, + }; + debug!("remap_file_range: using FICLONERANGE ioctl"); + unsafe { libc::ioctl(fd_out, FICLONERANGE, &range as *const FileCloneRange) } + }; + + if result < 0 { + let err = io::Error::last_os_error(); + debug!("remap_file_range: ioctl failed: {:?}", err); + Err(err) + } else { + // For whole-file clone (len=0), get actual size from source file + let cloned_len = if len == 0 { + let mut stat: libc::stat = unsafe { std::mem::zeroed() }; + if unsafe { libc::fstat(fd_in, &mut stat) } == 0 { + stat.st_size as usize + } else { + 0 + } + } else { + len as usize + }; + debug!("remap_file_range: success, cloned {} bytes", cloned_len); + Ok(cloned_len) + } + } } #[cfg(test)] @@ -1620,6 +1860,27 @@ mod tests { assert_eq!(statfs.f_namemax, 255); } + #[test] + fn test_lseek_signed_negative_offset() { + let (fs, source) = prepare_fs_tmpdir(); + let ctx = prepare_context(); + + let path = source.as_path().join("seek.txt"); + std::fs::write(&path, b"abcdef").unwrap(); + + let name = CString::new("seek.txt").unwrap(); + let entry = fs.lookup(&ctx, ROOT_ID, &name).unwrap(); + let (handle, _, _) = fs + .open(&ctx, entry.inode, libc::O_RDONLY as u32,
0) + .unwrap(); + let handle = handle.expect("expected handle"); + + let offset = fs + .lseek_signed(&ctx, entry.inode, handle, -2, libc::SEEK_END as u32) + .unwrap(); + assert_eq!(offset, 4); + } + #[test] fn test_fsync_dir() { let (fs, _source) = prepare_fs_tmpdir(); @@ -1628,4 +1889,168 @@ mod tests { assert!(fs.fsyncdir(&ctx, ROOT_ID, false, 0).is_ok()); } + + #[test] + fn test_copy_file_range() { + let (fs, source) = prepare_fs_tmpdir(); + let ctx = prepare_context(); + + // Create source file with data (using std::fs for simplicity) + let test_data = b"Hello, copy_file_range!"; + let src_path = source.as_path().join("source.txt"); + let dst_path = source.as_path().join("dest.txt"); + std::fs::write(&src_path, test_data).unwrap(); + std::fs::write(&dst_path, b"").unwrap(); // Create empty destination + + // Look up and open both files through the passthrough fs + let src_name = CString::new("source.txt").unwrap(); + let src_entry = fs.lookup(&ctx, ROOT_ID, &src_name).unwrap(); + let (src_handle, _, _) = fs + .open(&ctx, src_entry.inode, libc::O_RDWR as u32, 0) + .unwrap(); + let src_handle = src_handle.expect("expected src handle"); + + let dst_name = CString::new("dest.txt").unwrap(); + let dst_entry = fs.lookup(&ctx, ROOT_ID, &dst_name).unwrap(); + let (dst_handle, _, _) = fs + .open(&ctx, dst_entry.inode, libc::O_RDWR as u32, 0) + .unwrap(); + let dst_handle = dst_handle.expect("expected dst handle"); + + // Copy data from source to destination using copy_file_range + let copied = fs + .copy_file_range( + &ctx, + src_entry.inode, + src_handle, + 0, // offset_in + dst_entry.inode, + dst_handle, + 0, // offset_out + test_data.len() as u64, + 0, // flags + ) + .unwrap(); + assert_eq!(copied, test_data.len()); + + // Sync and verify by reading directly from disk + fs.fsync(&ctx, dst_entry.inode, false, dst_handle).unwrap(); + let result = std::fs::read(&dst_path).unwrap(); + assert_eq!(&result, test_data); + + // Test partial copy with offset + let offset = 7; 
// "Hello, " is 7 bytes + let partial_len = test_data.len() - offset; + let copied = fs + .copy_file_range( + &ctx, + src_entry.inode, + src_handle, + offset as u64, + dst_entry.inode, + dst_handle, + test_data.len() as u64, // append after existing data + partial_len as u64, + 0, + ) + .unwrap(); + assert_eq!(copied, partial_len); + + // Verify the appended data + fs.fsync(&ctx, dst_entry.inode, false, dst_handle).unwrap(); + let result = std::fs::read(&dst_path).unwrap(); + assert_eq!(result.len(), test_data.len() + partial_len); + assert_eq!(&result[..test_data.len()], test_data); + assert_eq!(&result[test_data.len()..], &test_data[offset..]); + } + + /// Test that Arc properly delegates all FileSystem methods. + /// + /// This catches a subtle bug where the `impl FileSystem for Arc` + /// blanket implementation might forget to delegate a method, causing it to use + /// the default trait implementation (which returns ENOSYS) instead. + #[test] + fn test_arc_delegates_filesystem_methods() { + let (fs, source) = prepare_fs_tmpdir(); + let arc_fs = Arc::new(fs); + let ctx = prepare_context(); + + // Create test files for copy/remap operations + let src_name = CString::new("arc_test_src.txt").unwrap(); + let dst_name = CString::new("arc_test_dst.txt").unwrap(); + let test_data = b"Arc delegation test data"; + + let args = CreateIn { + flags: libc::O_RDWR as u32, + mode: 0o644, + umask: 0, + fuse_flags: 0, + }; + + // Test basic operations through Arc - these should work, not return ENOSYS + let (src_entry, src_handle, _, _) = arc_fs.create(&ctx, ROOT_ID, &src_name, args).unwrap(); + let src_handle = src_handle.unwrap(); + + // Write test data + let src_path = source.as_path().join("arc_test_src.txt"); + std::fs::write(&src_path, test_data).unwrap(); + + let (dst_entry, dst_handle, _, _) = arc_fs.create(&ctx, ROOT_ID, &dst_name, args).unwrap(); + let dst_handle = dst_handle.unwrap(); + + // Test copy_file_range through Arc - should NOT return ENOSYS + let result = 
arc_fs.copy_file_range( + &ctx, + src_entry.inode, + src_handle, + 0, + dst_entry.inode, + dst_handle, + 0, + test_data.len() as u64, + 0, + ); + // copy_file_range should succeed or fail with a real error, never ENOSYS + match &result { + Ok(_) => {} // Success + Err(e) => { + assert_ne!( + e.raw_os_error(), + Some(libc::ENOSYS), + "Arc must delegate copy_file_range, not use default ENOSYS impl" + ); + } + } + + // Test remap_file_range through Arc - should NOT return ENOSYS + // On tmpfs this will return EOPNOTSUPP or EINVAL (no reflink support), + // but it should NEVER return ENOSYS (which would mean missing delegation) + let result = arc_fs.remap_file_range( + &ctx, + src_entry.inode, + src_handle, + 0, + dst_entry.inode, + dst_handle, + 0, + 0, // len=0 means whole file + 0, + ); + match &result { + Ok(_) => {} // Success (would require btrfs/xfs with reflink) + Err(e) => { + assert_ne!( + e.raw_os_error(), + Some(libc::ENOSYS), + "Arc must delegate remap_file_range, not use default ENOSYS impl. \ + Got ENOSYS which means the Arc blanket impl is missing this method." + ); + // Expected: EOPNOTSUPP (95) or EINVAL (22) on tmpfs + } + } + + // Cleanup + arc_fs.release(&ctx, src_entry.inode, 0, src_handle, true, true, None).unwrap(); + arc_fs.release(&ctx, dst_entry.inode, 0, dst_handle, true, true, None).unwrap(); + } }