diff --git a/Makefile b/Makefile index 74d7a3f..49726c6 100644 --- a/Makefile +++ b/Makefile @@ -92,6 +92,7 @@ kernel_files-y += kernel/eventlog.c # User-space process support kernel_files-y += kernel/proc/uaccess.c kernel/proc/proc.c \ kernel/proc/syscall.c kernel/proc/loader.c kernel/proc/spawn.c \ + kernel/proc/cap.c \ kernel/proc/pipe.c kernel/proc/signal.c # TCP/IP stack diff --git a/docs/pse51-matrix.md b/docs/pse51-matrix.md index 78a8c4d..3ffcc90 100644 --- a/docs/pse51-matrix.md +++ b/docs/pse51-matrix.md @@ -100,7 +100,7 @@ The following PSE51 services are present and exercised by selftests | `clock_gettime` | `SYS_CLOCK_GETTIME` | implemented | `CLOCK_MONOTONIC`, `CLOCK_REALTIME`, `CLOCK_THREAD_CPUTIME_ID`, `CLOCK_PROCESS_CPUTIME_ID`. | | `clock_getres` | `SYS_CLOCK_GETRES` | implemented | Resolution derives from the timebase frequency; sub-millisecond on QEMU `virt`. | | `clock_settime` | (none) | not-applicable | Realtime clock is anchored to boot ticks; no settable wall clock yet. | -| `clock_nanosleep` | (none) | stubbed | The relative form is covered by `nanosleep`; the absolute form is tracked under PSE51 ABI alignment work in `TODO.md`. | +| `clock_nanosleep` | (none) | stubbed | The relative form is covered by `nanosleep`; the absolute form is tracked under PSE51 ABI alignment work. | | `nanosleep` | `SYS_NANOSLEEP` | implemented-with-mazu-abi | Accepts `struct timespec`. On `EINTR` the kernel writes the unexpired remainder to `*rem` when `rem` is non-NULL (best-effort: a bad `rem` pointer does not mask the `EINTR` return). On normal completion `*rem` is unmodified. `tv_sec` is bounded against u64 overflow to keep the kernel-side ns/ms conversion safe. | ## Synchronization (kernel handles) @@ -124,14 +124,13 @@ sync handle table (`kernel/sync/sync_handle.c`). | Interface (POSIX) | Mazu syscall | Status | Notes | |---|---|---|---| -| `pthread_self` | `SYS_THREAD_SELF` | implemented | Returns `td->id`. 
| -| `pthread_create` | `SYS_THREAD_CREATE` | implemented | PROC_THREAD_MAX = 4. Slot reservation under `proc_table_lock`, per-thread stack VA inside the proc slot. Priority inherits from creator; an explicit priority arg ABI is a future extension. | -| `pthread_join` | `SYS_THREAD_JOIN` | implemented | Blocks on `target->td_join_wq`; atomically claims `EXITED -> REAPED` via cmpxchg before reaping. EDEADLK on self-join, ESRCH on unknown TID, EINVAL on detached/already-reaped, EINTR on cancellation. | +| `pthread_self` | `SYS_THREAD_SELF` | implemented | Returns the caller's `CAP_TYPE_THREAD` small-int handle. | +| `pthread_create` | `SYS_THREAD_CREATE` | implemented | PROC_THREAD_MAX = 4. Slot reservation under `proc_table_lock`, per-thread stack VA inside the proc slot. Returns a fresh `CAP_TYPE_THREAD` handle. Priority inherits from creator; an explicit priority arg ABI is a future extension. | +| `pthread_join` | `SYS_THREAD_JOIN` | implemented | Blocks on `target->td_join_wq`; atomically claims `EXITED -> REAPED` via cmpxchg before reaping. EDEADLK on self-join, ESRCH on unknown thread handle, EINVAL on detached/already-reaped, EINTR on cancellation. | | `pthread_detach` | `SYS_THREAD_DETACH` | implemented | Tries `JOINABLE -> DETACHED` first; if the target already exited, claims `EXITED -> REAPED` and reaps inline. Either claim wakes pending joiners. | | `pthread_exit` | `SYS_THREAD_EXIT` | implemented | Last-thread exit collapses into `proc_exit`; non-last exit unwinds the thread's robust futex list. A user thread that returns from its entry function lands on the per-process unmapped trampoline at `signal_trampoline_pc(p)+4`; the trap handler synthesizes `SYS_THREAD_EXIT(0)`, so an implicit return is equivalent to an explicit pthread_exit. | -| `pthread_setschedparam` / `_getschedparam` | `SYS_THREAD_SETSCHEDPARAM` / `_GETSCHEDPARAM` | implemented-with-mazu-abi | Take a kernel TID (0 = self) and a scalar priority. 
Privilege bound: cannot raise above caller's own base priority. | +| `pthread_setschedparam` / `_getschedparam` | `SYS_THREAD_SETSCHEDPARAM` / `_GETSCHEDPARAM` | implemented-with-mazu-abi | Take a `CAP_TYPE_THREAD` handle (0 = self) and a scalar priority. Privilege bound: cannot raise above caller's own base priority. | | `pthread_attr_*` | (libc) | stubbed | Attribute objects (`setstack`, `setdetachstate`, `setschedpolicy`, `setschedparam`, `setinheritsched`) are user-space libc concerns, but a "PSE51 complete" claim requires them to exist somewhere in the toolchain image. Mazu does not ship a libc with these wrappers today. The kernel ABI accepts the resolved (entry, arg, stack, prio) tuple; once a libc lands, this row flips to `not-applicable`. | -| `pthread_setschedparam` / `_getschedparam` | (none) | stubbed | Today scalar priority is set via `SYS_SCHED_SETPARAM` / `_GETPARAM` on the calling thread only. Per-thread policy/priority change blocked on per-thread sched-parameter state migration. | | `pthread_spin_init` / `_lock` / `_trylock` / `_unlock` / `_destroy` | (none) | stubbed | Mazu has kernel-internal spinlocks, but no userspace-visible busy-wait primitive. The `_POSIX_SPIN_LOCKS` macro is therefore intentionally *not* defined and `_SC_SPIN_LOCKS` returns -1 — advertising it would let an app gate on the macro and call absent APIs. Expect a libc-side implementation backed by a futex once threads land, not a kernel syscall. | | `pthread_cancel` / `pthread_setcancelstate` / `pthread_testcancel` | `SYS_THREAD_CANCEL` / `SYS_THREAD_SETCANCELSTATE` / `SYS_THREAD_TESTCANCEL` | implemented | Deferred cancellation: pthread_cancel sets `td_cancel_pending`; the target observes the bit at the next cancellation point and exits with code -ECANCELED. ASYNC type is treated as DEFERRED because Mazu has no in-kernel cancellation points other than blocking syscalls. | @@ -143,7 +142,7 @@ sync handle table (`kernel/sync/sync_handle.c`). 
| `sigaction` | `SYS_SIGACTION` | implemented | Per-process disposition. `sa_mask` is a `u32` bitmask, not `sigset_t`. | | `sigreturn` | `SYS_SIGRETURN` | implemented | Cookie-validated frame teardown. | | `pthread_sigmask` | `SYS_PTHREAD_SIGMASK` | implemented | Same wire shape as `SYS_SIGPROCMASK`; both operate on the calling thread's `td_sig.blocked`. Distinct syscall numbers so libc can keep `pthread_sigmask` and `sigprocmask` as separate ABI surfaces. | -| `pthread_kill` | `SYS_PTHREAD_KILL` | implemented | Thread-directed signal: bit lands on the named thread's `td_sig.pending` rather than the per-proc `proc_pending` mask. SIGKILL rejected with EINVAL (must be process-wide). | +| `pthread_kill` | `SYS_PTHREAD_KILL` | implemented | Thread-directed signal: bit lands on the named thread's `td_sig.pending` rather than the per-proc `proc_pending` mask. Takes a `CAP_TYPE_THREAD` handle. SIGKILL rejected with EINVAL (must be process-wide). | | `sigsuspend` | `SYS_SIGSUSPEND` | implemented | Replace blocked mask with the supplied set, yield-loop until a deliverable signal arrives, restore prior mask, return EINTR. | | `sigtimedwait` / `sigwait` / `sigwaitinfo` | `SYS_SIGTIMEDWAIT` | implemented-with-mazu-abi | Block until any signal in the supplied set is pending; dequeue without invoking the handler; return signo. Honors `struct timespec *` timeout (NULL = wait forever; expired = EAGAIN). | | `sigqueue` value delivery | (none) | stubbed | Mazu signals are level-style: a single bit per signal in `pending`, no per-signal value queue. The wait API set above advertises `_POSIX_REALTIME_SIGNALS = 1` (subset) but `sigqueue` with a payload value requires an additional bounded queue subsystem. | @@ -154,7 +153,7 @@ sync handle table (`kernel/sync/sync_handle.c`). | Interface (POSIX) | Mazu syscall | Status | Notes | |---|---|---|---| -| `timer_create` | `SYS_TIMER_CREATE` | implemented-with-mazu-abi | Pool-allocated (8 timers per process). 
Signal number is fixed to `SIGALRM`; the per-call target thread (`SIGEV_THREAD_ID`) is supplied via `posix_timer_settime`'s new `target_tid` parameter (a3 of `SYS_TIMER_SETTIME`); pass 0 for process-directed delivery. If the targeted thread has already exited at expiry, the signal is silently dropped (POSIX strict). | +| `timer_create` | `SYS_TIMER_CREATE` | implemented-with-mazu-abi | Pool-allocated (8 timers per process). Signal number is fixed to `SIGALRM`; the per-call target thread (`SIGEV_THREAD_ID`) is supplied via `posix_timer_settime`'s new thread-handle parameter (a3 of `SYS_TIMER_SETTIME`); pass 0 for process-directed delivery. If the targeted thread has already exited at expiry, the signal is silently dropped (POSIX strict). | | `timer_settime` | `SYS_TIMER_SETTIME` | implemented-with-mazu-abi | ABI takes `u64 value_ms, u64 interval_ms` instead of `struct itimerspec`. `value_ms == 0` disarms (POSIX semantics). | | `timer_gettime` | `SYS_TIMER_GETTIME` | implemented-with-mazu-abi | Returns remaining milliseconds as a scalar. | | `timer_getoverrun` | `SYS_TIMER_GETOVERRUN` | implemented | Increments only while the previous `SIGALRM` is still pending (POSIX overrun semantics). | @@ -278,8 +277,7 @@ The bounded multi-threaded process model is in place: per-thread state migration (signal pending/blocked, signal-frame chain, robust futex list, errno TLS) and the user-visible pthread surface (`SYS_THREAD_CREATE` and friends) have both landed, with -`PROC_THREAD_MAX = 4`. The two remaining gaps, tracked as -non-blocking follow-ups in `TODO.md`, are the `sigqueue` payload +`PROC_THREAD_MAX = 4`. The two remaining gaps are the `sigqueue` payload queue (requires a bounded per-signal queue subsystem) and the `pthread_attr_*` libc family (strictly a libc-side concern; the kernel ABI already accepts the resolved (entry, arg, prio) tuple). 
diff --git a/include/mazu/cap.h b/include/mazu/cap.h new file mode 100644 index 0000000..0a6ea29 --- /dev/null +++ b/include/mazu/cap.h @@ -0,0 +1,251 @@ +/* SPDX-License-Identifier: MIT */ +/* Capability-based security: public interface. + * + * Every process owns a fixed cap_space (CAP_SPACE_SLOTS entries). Each + * slot is an unforgeable handle to a typed kernel object. The slot word + * packs object_index, type, rights, type_meta, and a 32-bit generation; + * the cap layer enforces unforgeability, single-hop delegation, lazy + * revocation via generation bump, and an active-use pin that keeps the + * underlying object alive across blocking syscalls. + * + * Userspace sees two handle shapes: + * - posix_fd: the small-int slot_index. This is what sys_open returns + * and what sys_read/sys_write/sys_close consume. The cap layer does + * the slot-bound permission check on every dereference. + * - cap_handle: a 64-bit token (cap_make_handle / cap_get_token) that + * carries the generation/type/rights snapshot taken at mint time. + * Required for cap-management syscalls that need stale-handle + * detection across threads or across spaces. + * + * Rights are a 4-bit lattice (READ, WRITE, EXEC, GRANT). Rights are + * monotonically non-increasing on delegation: cap_transfer strips GRANT, + * while spawn-style cap_inherit_fd preserves the source rights snapshot. + * Plain sys_open does not mint GRANT. + * + * Implementation lives in kernel/proc/cap.c; the threat model, slot bit + * layout, lock ordering, and refcount lifecycle are documented in that + * file's header. + */ + +#ifndef MAZU_CAP_H +#define MAZU_CAP_H + +#include +#include + +struct pipe; +struct posix_timer; +struct proc; + +/* Per-process slot count. Sized from the actual object budget + * (PROC_FD_MAX + thread/timer/IPC + reserve), not a round number. + * No heap growth, no dynamic resize. + */ +#define CAP_SPACE_SLOTS 128 + +/* System-wide pool of delegate_record entries. 
Each cap_transfer + * allocates one; cap_revoke_delegate consumes it. Sized to cover the + * worst-case outstanding-delegation count across all processes. + */ +#define CAP_DELEGATE_RECORD_MAX 1024 + +/* Typed handle kinds. The slot word stores 4 bits, so up to 16 types + * fit; the unused 2 are reserved for future kernel-object surfaces. + * Adding a transferable type also requires extending cap_release_object + * and cap_object_inc_ref dispatches in kernel/proc/cap.c. + */ +enum cap_type { + CAP_TYPE_NONE = 0, /* empty / dropped slot */ + CAP_TYPE_FD = 1, /* POSIX file descriptor (VFS, pipe, console) */ + CAP_TYPE_TIMER = 2, /* POSIX interval timer */ + CAP_TYPE_THREAD = 3, /* pthread handle; reserved-slot range */ + CAP_TYPE_IRQ = 4, /* IRQ control (reserved) */ + CAP_TYPE_ENDPOINT = 5, /* IPC endpoint (reserved) */ + CAP_TYPE_DELEGATE = 6, /* supervisor-side handle on an outstanding grant */ + CAP_TYPE_CAPSPACE = 7, /* meta cap on the cap_space itself (reserved) */ + CAP_TYPE_SCHED = 8, /* scheduling control (reserved) */ + CAP_TYPE_MUTEX = 9, /* pi_mutex pool entry */ + CAP_TYPE_CONDVAR = 10, /* condvar pool entry */ + CAP_TYPE_SEMAPHORE = 11, /* semaphore pool entry */ + CAP_TYPE_BARRIER = 12, /* barrier pool entry */ + CAP_TYPE_RWLOCK = 13, /* rwlock pool entry */ + CAP_TYPE_MQUEUE = 14, /* POSIX message queue */ +}; + +/* 4-bit rights lattice. Rights cannot be amplified after mint: + * - cap_transfer requires GRANT on the source and produces a + * destination without GRANT (single-hop attenuation). + * - cap_inherit_fd clones the source slot into another process for + * spawn-style FD inheritance, preserving the source rights snapshot. + * - Cap lookups verify (slot.rights & required_rights) == required_rights; + * a partial-rights cap is rejected for the operation that exceeds it. 
+ */ +#define CAP_RIGHT_READ BIT(0) /* read-side ops (read, recv, get, query) */ +#define CAP_RIGHT_WRITE \ + BIT(1) /* write-side ops (write, send, post, mutate) \ + */ +#define CAP_RIGHT_EXEC BIT(2) /* reserved for future memory caps */ +#define CAP_RIGHT_GRANT BIT(3) /* may be cap_transfer'd or inherited */ + +/* type_meta bit assignments for CAP_TYPE_FD. Other types reserve their + * own bits in the same 11-bit field but do not use them today. + */ +#define CAP_FD_META_CLOEXEC BIT(0) /* close-on-exec; dup() clears it */ + +/* Backend tag for CAP_TYPE_FD entries. The kind selects the dispose + * hook (vfs_close vs pipe_close vs noop for console) when the last cap + * to the underlying object drops. + */ +enum cap_fd_kind { + CAP_FD_KIND_CONSOLE = 0, + CAP_FD_KIND_VFS = 1, + CAP_FD_KIND_PIPE = 2, +}; + +struct cap_space { + /* Per-slot capability word. The bit layout is documented in + * kernel/proc/cap.c; here it is opaque -- callers go through the + * cap_lookup_* / cap_open_* / cap_drop_* helpers. + */ + u64 slots[CAP_SPACE_SLOTS]; + /* Per-slot grant_epoch snapshot. For slots minted by cap_transfer + * (or by spawn-time inheritance from such a slot), this records the + * originating delegate_record's 64-bit monotonic epoch. + * cap_revoke_delegate's scan matches on (type, object_index, + * delegate_epoch) so that 32-bit slot generation wrapping or two + * unrelated grants of the same object cannot be confused. Zero for + * slots that are not part of any outstanding delegation. + */ + u64 delegate_epoch[CAP_SPACE_SLOTS]; +}; + +/* Object-constructor return shape. Used by cap-system internal mint + * paths that take a fully-resolved object pointer and assign it to a + * slot under fd_lock. + */ +struct cap_ctor_result { + u16 object_index; + u8 rights; + u16 type_meta; +}; + +/* Active-use pin on a kernel object. 
Returned by cap_lookup_fd / + * cap_lookup_timer / cap_lookup_object after the cap is validated and + * the underlying pool entry's refcount has been bumped. The caller MUST + * pair every non-zeroed return with cap_put_ref so the pool entry + * survives concurrent revocation across blocking syscalls. + * + * The empty / dropped state is type == CAP_TYPE_NONE; cap_put_ref + * tests the type field (not ptr) for liveness, since lookup variants + * for sync primitives and mqueue return ptr == NULL and fetch the + * typed pointer via a separate _get helper. + */ +struct cap_ref { + void *ptr; + u16 object_index; + u8 type; +}; + +/* Read-only snapshot of a cap_space slot, returned by cap_slot_read / + * cap_lookup_slot / cap_lookup_token. The slot_index is the array + * position; the other fields mirror the slot word. + */ +struct cap_slot_view { + bool valid; + u8 slot_index; + u16 object_index; + u8 type; + u8 rights; + u16 type_meta; + u32 generation; +}; + +/* Per-FD pool entry. One per open file description; multiple cap_space + * slots may reference the same entry (dup, transfer, inheritance) and + * refcount tracks how many. 
+ */ +struct fd_pool_entry { + bool in_use; + u8 kind; /* enum cap_fd_kind */ + bool pipe_read_end; + bool is_seekable; + u8 console_id; + sz offset; /* POSIX dup'd FDs share this offset */ + u32 refcount; /* cap_space slots + active-use pins */ + struct vfs_file file; + struct pipe *pipe; +}; + +void cap_init(void); +void cap_space_init(struct proc *p); +void cap_space_teardown(struct proc *p); + +u64 cap_make_handle(const struct cap_slot_view *slot); +i64 cap_get_token(struct proc *p, i32 slot_idx, u8 expected_type); +i64 cap_drop_token(struct proc *p, u64 token); +i64 cap_transfer(struct proc *src, u16 dst_pid, u64 token, u8 new_rights); +i64 cap_revoke_delegate(struct proc *src, u64 delegate_token); + +i64 cap_close_fd(struct proc *p, i32 fd); +struct cap_ref cap_lookup_fd(struct proc *p, i32 fd, u8 required_rights); +void cap_put_ref(struct cap_ref *ref); +i32 cap_dup_fd(struct proc *p, i32 oldfd, i32 newfd_hint, bool exact_target); +i32 cap_inherit_fd(struct proc *src, struct proc *dst, i32 src_fd, i32 dst_fd); +i32 cap_open_vfs(struct proc *p, + struct vfs_file file, + u8 rights, + bool is_seekable, + i32 slot_hint, + bool exact_target); +i32 cap_open_pipe(struct proc *p, + struct pipe *pipe, + bool read_end, + u8 rights, + i32 slot_hint, + bool exact_target); +i32 cap_open_console(struct proc *p, + u8 console_id, + u8 rights, + i32 slot_hint, + bool exact_target); +i32 cap_open_handle(struct proc *p, + u16 object_index, + u8 type, + u8 rights, + i32 slot_hint, + bool exact_target); +i32 cap_open_timer(struct proc *p, + u16 object_index, + u8 rights, + i32 slot_hint, + bool exact_target); +bool cap_fd_is_valid(struct proc *p, i32 fd); +bool cap_fd_has_rights(struct proc *p, i32 fd, u8 rights); +bool cap_fd_is_seekable(struct proc *p, i32 fd); +bool cap_fd_is_pipe(struct proc *p, i32 fd); +bool cap_fd_pipe_read_end(struct proc *p, i32 fd); +struct cap_slot_view cap_slot_read(struct proc *p, i32 slot_idx); +i32 cap_find_free_fd(struct proc *p); +bool 
cap_lookup_slot(struct proc *p, + i32 handle, + u8 required_rights, + u8 expected_type, + struct cap_slot_view *out); +bool cap_lookup_token(struct proc *p, + u64 token, + u8 required_rights, + u8 expected_type, + struct cap_slot_view *out); +/* cap_lookup_object: validates a cap slot AND takes an active-use ref on the + * underlying object. The returned cap_ref carries the type and object_index; + * caller MUST pair every non-zeroed return with cap_put_ref so the object + * survives concurrent revocation/destroy across blocking syscalls. + * Returns a zeroed cap_ref on EBADF/EACCES/EINVAL. + */ +struct cap_ref cap_lookup_object(struct proc *p, + i32 handle, + u8 required_rights, + u8 expected_type); +struct cap_ref cap_lookup_timer(struct proc *p, i32 handle, u8 required_rights); + +#endif /* MAZU_CAP_H */ diff --git a/include/mazu/proc.h b/include/mazu/proc.h index 7fca604..1568227 100644 --- a/include/mazu/proc.h +++ b/include/mazu/proc.h @@ -1,9 +1,10 @@ /* SPDX-License-Identifier: MIT */ /* User-space process management. * - * Each process owns a set of user-mapped pages, a file descriptor table, - * and a bounded thread group. The process table is a small static array - * (PROC_MAX entries). + * Each process owns a set of user-mapped pages, a fixed capability table + * (see struct cap_space) that backs POSIX file descriptors and other + * object handles, and a bounded thread group. The process table is a + * small static array (PROC_MAX entries). */ #ifndef MAZU_PROC_H @@ -11,6 +12,7 @@ #include #include +#include #include #include #include @@ -40,7 +42,6 @@ struct signal_state { }; #define PROC_MAX 16 -#define PROC_FD_MAX 32 #define PROC_PAGES_MAX 32 #define PROC_VMA_MAX 8 @@ -52,6 +53,7 @@ struct signal_state { * are bounded. 
*/ #define PROC_THREAD_MAX 4 +#define PROC_FD_MAX (CAP_SPACE_SLOTS - PROC_THREAD_MAX) #define PROC_FD_STDIN 0 #define PROC_FD_STDOUT 1 #define PROC_FD_STDERR 2 @@ -90,19 +92,6 @@ enum proc_state { PROC_STATE_ZOMBIE, /* exited, awaiting parent reap */ }; -struct pipe; /* forward declaration */ - -struct proc_fd { - bool is_open; - bool is_dup; /* true if created via dup/dup2; skip vfs_close */ - bool is_seekable; /* false for console/pipe; true for regular files */ - bool is_pipe; /* true if this FD is a pipe end */ - bool pipe_read_end; /* true: read, false: write (only if is_pipe) */ - sz offset; /* current file position (used by read/write/lseek) */ - struct vfs_file file; - struct pipe *pipe; /* non-NULL if is_pipe */ -}; - /* Per-process VA slot size: divide user address space equally among PROC_MAX * processes. Each slot contains code, data, and stack. Slot i occupies * [USER_CODE_BASE + i*PROC_SLOT_SIZE, USER_CODE_BASE + (i+1)*PROC_SLOT_SIZE). @@ -135,7 +124,7 @@ struct proc { paddr_t paddr; vaddr_t vaddr; } user_pages[PROC_PAGES_MAX]; - struct proc_fd fd_table[PROC_FD_MAX]; + struct cap_space cap_space; u32 magic; u32 generation; u8 vma_cache_read; @@ -164,6 +153,7 @@ struct proc { u16 pid; u16 parent_pid; + u32 parent_generation; char name[32]; char cwd[PROC_PATH_MAX]; }; @@ -229,7 +219,7 @@ bool proc_attach_task_slot(struct proc *p, struct sched_task *td, u8 slot); bool proc_reserve_thread_slot(struct proc *p, u8 *out_slot); void proc_release_thread_slot(struct proc *p, u8 slot); void proc_release_thread_stack(struct proc *p, u8 slot); -void proc_reap_exited_thread_locked(struct proc *p, struct sched_task *td); +i64 proc_reap_exited_thread_locked(struct proc *p, struct sched_task *td); void proc_reap_exited_thread(struct proc *p, struct sched_task *td); /* Detach the given task from the process. The task's ->proc field is cleared. @@ -262,6 +252,21 @@ struct proc *proc_find(u16 pid); /* Look up a process by PID. Caller must hold proc_table_lock. 
*/ struct proc *proc_find_locked(u16 pid); +/* Snapshot the live descendant tree rooted at root/root_generation. + * Includes root itself when still live. Returns the number of entries + * written to out/out_generations (up to max). + */ +sz proc_collect_descendants_locked(struct proc *root, + u32 root_generation, + struct proc **out, + u32 *out_generations, + sz max); +sz proc_collect_descendants(struct proc *root, + u32 root_generation, + struct proc **out, + u32 *out_generations, + sz max); + /* Iterate all active processes, calling cb for each. */ typedef void (*proc_iter_cb_t)(struct proc *p, void *ctx); void proc_for_each(proc_iter_cb_t cb, void *ctx); diff --git a/include/mazu/sched.h b/include/mazu/sched.h index ec5dc0d..cd00927 100644 --- a/include/mazu/sched.h +++ b/include/mazu/sched.h @@ -298,6 +298,7 @@ struct sched_task { i32 td_exit_code; struct wait_queue_head td_join_wq; bool td_exit_started; + i16 td_cap_slot; /* PSE51 cancellation state (pthread_cancel / _setcancelstate / * _setcanceltype / _testcancel). td_cancel_pending is set by diff --git a/include/mazu/spawn.h b/include/mazu/spawn.h index 44907e2..0234990 100644 --- a/include/mazu/spawn.h +++ b/include/mazu/spawn.h @@ -5,9 +5,11 @@ * Backward compatible: file_actions_ptr == 0 && attr_ptr == 0 behaves as the * original sys_spawn(path, pathlen). * - * File actions are executed sequentially in the child's FD table before the - * child is enqueued. The child is not visible to sys_waitpid or sys_kill - * until file actions complete and the task is enqueued. + * The child starts with a spawn-style clone of the parent's inheritable FD + * table. File actions are then executed sequentially in that child-local + * table before the child is enqueued. The child is not visible to + * sys_waitpid or sys_kill until file actions complete and the task is + * enqueued. * * Inspired by Spork (posix_spawn emulation for fork-free systems). 
*/ @@ -55,7 +57,7 @@ struct spawn_attr { struct proc; -/* Apply file actions to a child process's FD table. +/* Apply file actions to a child process's capability-backed FD table. * Called after binary loading, before the child is enqueued. * Returns 0 on success, negative errno on failure. */ diff --git a/include/mazu/syscall.h b/include/mazu/syscall.h index 4c0a246..51d0926 100644 --- a/include/mazu/syscall.h +++ b/include/mazu/syscall.h @@ -132,67 +132,70 @@ */ #define SYS_SIGPROCMASK 84 -/* Per-thread scheduling parameters (pthread_setschedparam / - * pthread_getschedparam). Take a kernel TID (0 = self) and a scalar - * priority. SYS_SCHED_SETPARAM / _GETPARAM continue to operate on - * the calling thread only; these add the by-TID form. +/* Per-thread scheduling parameters + * (pthread_{setschedparam,getschedparam}). Take a CAP_TYPE_THREAD + * handle (0 = self) and a scalar priority. SYS_SCHED_{SETPARAM,GETPARAM} + * continue to operate on the calling thread only; these add the by-handle form. */ #define SYS_THREAD_SETSCHEDPARAM 85 #define SYS_THREAD_GETSCHEDPARAM 86 -/* POSIX scheduler-policy accessors (sched_setscheduler / - * sched_getscheduler). Mazu honors SCHED_FIFO semantics for all - * normal threads; SCHED_OTHER and SCHED_RR are accepted but - * coerced to FIFO. SCHED_DEADLINE is rejected here because it has - * its own ABI via SYS_SCHED_SETATTR. +/* POSIX scheduler-policy accessors (sched_{setscheduler,getscheduler}). Mazu + * honors SCHED_FIFO semantics for all normal threads; SCHED_OTHER and SCHED_RR + * are accepted but coerced to FIFO. SCHED_DEADLINE is rejected here because it + * has its own ABI via SYS_SCHED_SETATTR. */ #define SYS_SCHED_SETSCHEDULER 87 #define SYS_SCHED_GETSCHEDULER 88 /* PSE51 thread-directed signal API. * - * SYS_PTHREAD_KILL: deliver signo directly to the thread with the - * given kernel TID inside the calling proc. Different from SYS_KILL - * which is process-directed. 
+ * SYS_PTHREAD_KILL: deliver signo directly to the thread named by the given + * CAP_TYPE_THREAD handle inside the calling proc. Different from SYS_KILL which + * is process-directed. * - * SYS_PTHREAD_SIGMASK: identical wire shape to SYS_SIGPROCMASK - * (operates on the calling thread's td_sig.blocked) but exposes the - * pthread_sigmask name for ABI clarity. Userspace libc maps - * pthread_sigmask -> this; sigprocmask -> SYS_SIGPROCMASK. + * SYS_PTHREAD_SIGMASK: identical wire shape to SYS_SIGPROCMASK (operates on the + * calling thread's td_sig.blocked) but exposes the pthread_sigmask name for ABI + * clarity. + * Userspace libc maps pthread_sigmask -> this; sigprocmask -> SYS_SIGPROCMASK. */ #define SYS_PTHREAD_KILL 89 #define SYS_PTHREAD_SIGMASK 90 -/* PSE51 wait-for-signal variants. sigsuspend replaces the calling - * thread's blocked mask with the supplied set, blocks until any - * signal arrives that is not in the new mask, then restores the old - * mask. sigtimedwait blocks until a signal in the supplied set - * becomes pending or the timeout expires, then dequeues the signal - * (without invoking its handler) and returns its number. sigwait - * is the no-timeout variant, expressed by passing a NULL timeout. +/* PSE51 wait-for-signal variants. sigsuspend replaces the calling thread's + * blocked mask with the supplied set, blocks until any signal arrives that is + * not in the new mask, then restores the old mask. sigtimedwait blocks until + * a signal in the supplied set becomes pending or the timeout expires, then + * dequeues the signal (without invoking its handler) and returns its number. + * sigwait is the no-timeout variant, expressed by passing a NULL timeout. */ #define SYS_SIGSUSPEND 91 #define SYS_SIGTIMEDWAIT 92 -/* PSE51 thread cancellation (pthread_cancel / _setcancelstate / - * _setcanceltype / _testcancel). 
Cancellation is deferred: a - * thread observes the pending bit at the next cancellation point - * (any blocking syscall checks signal_pending_current / - * thread_cancel_pending before entering the wait) and exits with - * code -ECANCELED. +/* PSE51 thread cancellation + * (pthread_{cancel,setcancelstate,setcanceltype,testcancel}). Cancellation is + * deferred: a thread observes the pending bit at the next cancellation point + * (any blocking syscall checks signal_pending_current / thread_cancel_pending + * before entering the wait) and exits with code -ECANCELED. */ #define SYS_THREAD_CANCEL 93 #define SYS_THREAD_SETCANCELSTATE 94 #define SYS_THREAD_TESTCANCEL 95 -#define SYS_NR 96 /* total number of syscalls */ +/* Capability management. */ +#define SYS_CAP_DROP 96 +#define SYS_CAP_TRANSFER 97 +#define SYS_CAP_REVOKE_DELEGATE 98 +#define SYS_CAP_GET_TOKEN 99 + +#define SYS_NR 100 /* total number of syscalls */ /* pthread_setcancelstate state values. */ #define PTHREAD_CANCEL_ENABLE 0 #define PTHREAD_CANCEL_DISABLE 1 -/* POSIX scheduling policies (subset). SCHED_FIFO is the only policy - * Mazu honors directly; SCHED_OTHER and SCHED_RR map onto it. +/* POSIX scheduling policies (subset). SCHED_FIFO is the only policy Mazu honors + * directly; SCHED_OTHER and SCHED_RR map onto it. */ #define SCHED_OTHER 0 #define SCHED_FIFO 1 diff --git a/include/mazu/vfs.h b/include/mazu/vfs.h index f5dd2ce..61b47b1 100644 --- a/include/mazu/vfs.h +++ b/include/mazu/vfs.h @@ -17,6 +17,11 @@ #define VFS_TYPE_FILE 0 #define VFS_TYPE_DIR 1 #define VFS_FLAG_RDONLY BIT(0) +/* Backend nodes whose read/write semantics are stream-like rather than + * positional. Set on synthetic /dev, /proc, /net entries and on real + * directories. lseek on a NOSEEK descriptor returns ESPIPE. 
+ */ +#define VFS_FLAG_NOSEEK BIT(1) #define VFS_MAX_MOUNTS 8 struct vfs_stat { @@ -124,7 +129,7 @@ static inline struct vfs_stat vfs_rdonly_dir_stat(void) return (struct vfs_stat) { .size = 0, .type = VFS_TYPE_DIR, - .flags = VFS_FLAG_RDONLY, + .flags = VFS_FLAG_RDONLY | VFS_FLAG_NOSEEK, .etag = 0, }; } @@ -134,7 +139,7 @@ static inline struct vfs_stat vfs_rdonly_file_stat(void) return (struct vfs_stat) { .size = 0, .type = VFS_TYPE_FILE, - .flags = VFS_FLAG_RDONLY, + .flags = VFS_FLAG_RDONLY | VFS_FLAG_NOSEEK, .etag = 0, }; } diff --git a/kernel/fs/devfs.c b/kernel/fs/devfs.c index 5237147..c481701 100644 --- a/kernel/fs/devfs.c +++ b/kernel/fs/devfs.c @@ -175,7 +175,7 @@ static struct result_vfs_stat devfs_stat(void *ctx __unused, struct str path) struct vfs_stat st = vfs_rdonly_file_stat(); if (dev_table[idx].write != NULL) - st.flags = 0; + st.flags &= ~(u8) VFS_FLAG_RDONLY; return result_vfs_stat_ok(st); } diff --git a/kernel/fs/ramfs.c b/kernel/fs/ramfs.c index 3703bdb..e670aeb 100644 --- a/kernel/fs/ramfs.c +++ b/kernel/fs/ramfs.c @@ -469,7 +469,7 @@ struct result_sz ram_fs_write(struct ram_fs_node *rfs_node, return result_sz_error(EROFS); /* Files are initialized to contain some data when created. */ - assert(rfs_node->data.dat != NULL && rfs_node->data.cap != 0); + assert(rfs_node->data.dat && rfs_node->data.cap != 0); /* An offset outside of the file doesn't make sense. If the offset is equal * to the file length, the write operation appends to the file. @@ -690,10 +690,11 @@ static struct result_vfs_stat ramfs_vfs_stat(void *ctx, struct str path) return result_vfs_stat_error(res.code); struct ram_fs_node *node = result_ram_fs_node_checked(res); + bool is_file = node->type == RAM_FS_TYPE_FILE; struct vfs_stat st = { - .size = (node->type == RAM_FS_TYPE_FILE) ? node->data.len : 0, - .type = (node->type == RAM_FS_TYPE_FILE) ? VFS_TYPE_FILE : VFS_TYPE_DIR, - .flags = 0, + .size = is_file ? node->data.len : 0, + .type = is_file ? 
VFS_TYPE_FILE : VFS_TYPE_DIR, + .flags = is_file ? 0 : VFS_FLAG_NOSEEK, .etag = node->etag, }; return result_vfs_stat_ok(st); diff --git a/kernel/fs/sfs.c b/kernel/fs/sfs.c index 9ee5c44..b096fe6 100644 --- a/kernel/fs/sfs.c +++ b/kernel/fs/sfs.c @@ -659,10 +659,11 @@ static struct result_vfs_stat sfs_vfs_stat(void *ctx, struct str path) if (!d) return result_vfs_stat_error(EIO); + bool is_dir = d->type == SFS_TYPE_DIR; struct vfs_stat st = { .size = d->size, - .type = (d->type == SFS_TYPE_DIR) ? VFS_TYPE_DIR : VFS_TYPE_FILE, - .flags = 0, + .type = is_dir ? VFS_TYPE_DIR : VFS_TYPE_FILE, + .flags = is_dir ? VFS_FLAG_NOSEEK : 0, .etag = 0, }; bcache_release(db); diff --git a/kernel/ipc/mqueue.c b/kernel/ipc/mqueue.c index 609c18a..a3ca813 100644 --- a/kernel/ipc/mqueue.c +++ b/kernel/ipc/mqueue.c @@ -96,6 +96,7 @@ i32 mqueue_open(struct proc *owner, u32 max_msgs, sz max_msg_size) mq_pool[i].generation = next_generation; mq_pool[i].max_msgs = max_msgs; mq_pool[i].max_msg_size = max_msg_size; + mq_pool[i].refcount = 1; spin_unlock_irqrestore(&mq_global_lock, flags); return i; } @@ -104,25 +105,18 @@ i32 mqueue_open(struct proc *owner, u32 max_msgs, sz max_msg_size) return -(i32) EAGAIN; } -i32 mqueue_close(i32 handle) +/* Tear down the queue's wait state and mark it free. Caller must hold + * mq_global_lock for the in_use transition; this helper acquires the + * per-queue lock to drain waiters. + */ +static void mqueue_destroy_locked(i32 handle) { - if (handle < 0 || handle >= MQ_MAX_QUEUES) - return -(i32) EBADF; - struct mqueue *mq = &mq_pool[handle]; - u64 gflags = spin_lock_irqsave(&mq_global_lock); - if (!mq->in_use) { - spin_unlock_irqrestore(&mq_global_lock, gflags); - return -(i32) EBADF; - } - /* Wake all blocked senders and receivers before closing. - * Waiters will recheck mq state and see in_use == false or an empty - * queue, then return an error. 
- */ lockdep_acquire(LOCK_LEVEL_WAITQ); u64 flags = spin_lock_irqsave(&mq->lock); mq->in_use = false; + mq->refcount = 0; mqueue_reset_storage_locked(handle); while (!list_empty(&mq->recv_waitq)) { @@ -142,10 +136,61 @@ i32 mqueue_close(i32 handle) spin_unlock_irqrestore(&mq->lock, flags); lockdep_release(LOCK_LEVEL_WAITQ); +} + +i32 mqueue_close(i32 handle) +{ + if (handle < 0 || handle >= MQ_MAX_QUEUES) + return -(i32) EBADF; + + struct mqueue *mq = &mq_pool[handle]; + u64 gflags = spin_lock_irqsave(&mq_global_lock); + if (!mq->in_use) { + spin_unlock_irqrestore(&mq_global_lock, gflags); + return -(i32) EBADF; + } + + mqueue_destroy_locked(handle); spin_unlock_irqrestore(&mq_global_lock, gflags); return 0; } +void mqueue_put_idx(i32 handle) +{ + if (handle < 0 || handle >= MQ_MAX_QUEUES) + return; + + struct mqueue *mq = &mq_pool[handle]; + u64 gflags = spin_lock_irqsave(&mq_global_lock); + if (!mq->in_use) { + spin_unlock_irqrestore(&mq_global_lock, gflags); + return; + } + if (mq->refcount > 1) { + mq->refcount--; + spin_unlock_irqrestore(&mq_global_lock, gflags); + return; + } + mqueue_destroy_locked(handle); + spin_unlock_irqrestore(&mq_global_lock, gflags); +} + +bool mqueue_inc_idx(i32 handle) +{ + if (handle < 0 || handle >= MQ_MAX_QUEUES) + return false; + + struct mqueue *mq = &mq_pool[handle]; + u64 gflags = spin_lock_irqsave(&mq_global_lock); + if (!mq->in_use) { + spin_unlock_irqrestore(&mq_global_lock, gflags); + return false; + } + mq->refcount++; + spin_unlock_irqrestore(&mq_global_lock, gflags); + return true; +} + static struct mqueue *mqueue_get(i32 handle) { if (handle < 0 || handle >= MQ_MAX_QUEUES) diff --git a/kernel/ipc/mqueue.h b/kernel/ipc/mqueue.h index 2e41bdd..998b11d 100644 --- a/kernel/ipc/mqueue.h +++ b/kernel/ipc/mqueue.h @@ -40,6 +40,7 @@ struct mqueue { u32 msg_count; u32 max_msgs; sz max_msg_size; + u32 refcount; /* cap_space slots referencing this queue */ bool in_use; }; @@ -49,9 +50,23 @@ i32 mqueue_open(struct proc *owner, 
u32 max_msgs, sz max_msg_size); /* Check if the caller owns this handle. */ bool mqueue_check_owner(i32 handle, struct proc *caller); -/* Close and release a message queue handle. */ +/* Close and release a message queue handle. Always tears the queue down, + * waking any blocked senders/receivers with -EBADF. Used by the + * pre-capability ABI and by sync_handle_teardown_proc. + */ i32 mqueue_close(i32 handle); +/* Refcount-aware release: the underlying queue is destroyed only when the + * last cap_space slot referencing it goes away. Used by the capability + * layer in cap_release_object(). + */ +void mqueue_put_idx(i32 handle); + +/* Take an active-use reference on the queue. Returns true on success, + * false if the slot is already torn down. Pairs with mqueue_put_idx(). + */ +bool mqueue_inc_idx(i32 handle); + /* Send a message. Blocks if queue is full. * Returns 0 on success, -EMSGSIZE if msg too large. */ diff --git a/kernel/proc/cap.c b/kernel/proc/cap.c new file mode 100644 index 0000000..deb9dd5 --- /dev/null +++ b/kernel/proc/cap.c @@ -0,0 +1,1323 @@ +/* SPDX-License-Identifier: MIT */ +/* Capability-based security for kernel objects. + * + * Object-bearing syscalls (FD, timer, sync primitives, message queue) gate + * on per-process unforgeable handles instead of relying solely on the + * coarse syscall allow-list. The least-privilege story matters for the + * embedded AI-assistant workload: a sandboxed task can hold a cap to one + * pipe end without thereby gaining access to every other FD the supervisor + * owns. The cap layer composes with the existing dispatch-layer allow-list + * (the cheap two-load bit test still runs first; cap_lookup_* is the + * second-stage object gate). + * + * Threat model and properties enforced here: + * - Unforgeability. A 64-bit cap_handle token packs the slot index, + * type, rights, and a 32-bit generation snapshot. 
cap_validate_token + * rejects any token whose snapshot diverges from the live slot, so a + * user cannot fabricate a token for an object it never received. + * - Authority confinement. Caps live in a per-process cap_space (a + * fixed direct table). A cap minted in process A is never visible in + * process B unless an explicit cap_transfer or spawn-time inherit + * mints a fresh slot in B. + * - Lazy revocation. cap_drop bumps the slot generation before clearing + * the slot. Slot reuse is normal (a later mint into the same slot + * gets a fresh generation); old tokens for that slot observe EBADF + * on the next validate. No poisoning, no exit-time scan. + * - Single-hop delegation. cap_transfer requires GRANT on the source, + * strips GRANT from the destination, and records the originating + * grant_epoch so the supervisor retains a revocable handle. The + * delegate cannot transitively re-delegate without going through + * another cap_transfer that it does not hold GRANT to perform. + * Spawn-style cap_inherit_fd is distinct from delegation: it clones + * the source FD slot into a newly-created process and preserves the + * source rights/delegate_epoch snapshot. + * - Mass revocation under dup-escape. cap_revoke_delegate scans the + * destination cap_space for slots matching (type, object_index, + * delegate_epoch) and invalidates every match, neutralizing any + * dup() the delegate performed before revocation. + * - Active-use pin. cap_lookup_fd / cap_lookup_timer / + * cap_lookup_object bump a per-object refcount inside the proc's + * fd_lock and return a cap_ref; callers MUST pair every successful + * lookup with cap_put_ref so a concurrent close cannot recycle the + * underlying object slot while another thread is still operating + * on it. + * - Deadlock-free SMP. 
Two-process operations (cap_transfer, + * cap_inherit_fd, cap_revoke_delegate) acquire fd_locks in ascending + * pid order via the cap_lock_pair helpers, so concurrent A->B and + * B->A flows cannot deadlock. + * + * The external POSIX ABI keeps small-integer FDs: a posix_fd is literally + * the slot_index of a CAP_TYPE_FD entry in the caller's cap_space. The + * full 64-bit cap_handle is required only for cap-management syscalls + * (cap_get_token / cap_drop_token / cap_transfer / cap_revoke_delegate), + * which is the only path that needs cross-thread / cross-space stale- + * handle detection. + */ + +#include +#include +#include +#include +#include + +#include "../ipc/mqueue.h" +#include "../sync/sync_handle.h" +#include "../timer/posix_timer.h" +#include "pipe.h" + +#define CAP_FD_POOL_MAX (PROC_MAX * PROC_FD_MAX) +#define CAP_HANDLE_SLOT_BITS 8 +#define CAP_HANDLE_SLOT_MASK ((1U << CAP_HANDLE_SLOT_BITS) - 1U) + +/* Two-process cap operations (cap_{transfer,inherit_fd,revoke_delegate}) must + * acquire per-process fd_locks in a stable order to prevent A->B vs B->A + * deadlock under concurrent supervisor activity. The rule is: acquire the lock + * on the lower pid first. Same-pid callers fall through to a single acquire. + */ +struct cap_lock_pair { + struct proc *first; + struct proc *second; + u64 first_flags; + u64 second_flags; +}; + +static inline struct cap_lock_pair cap_lock_two(struct proc *a, struct proc *b) +{ + struct cap_lock_pair lp = {0}; + if (!a || !b || a == b) { + lp.first = a ? 
a : b; + if (lp.first) + lp.first_flags = proc_fd_lock_irqsave(lp.first); + return lp; + } + if (a->pid <= b->pid) { + lp.first = a; + lp.second = b; + } else { + lp.first = b; + lp.second = a; + } + lp.first_flags = proc_fd_lock_irqsave(lp.first); + lp.second_flags = proc_fd_lock_irqsave(lp.second); + return lp; +} + +static inline void cap_unlock_two(struct cap_lock_pair *lp) +{ + if (lp->second) + proc_fd_unlock_irqrestore(lp->second, lp->second_flags); + if (lp->first) + proc_fd_unlock_irqrestore(lp->first, lp->first_flags); +} + +/* Slot word layout (one u64 per cap_space[i], single naturally-aligned + * load on RV64 -- no retry loop, no seqlock needed): + * bit 0 : valid (1 = live slot, 0 = revoked / never minted) + * bits 1..12 : object_index (12 bits, indexes the per-type pool) + * bits 13..16 : type (CAP_TYPE_*; 14 used of 16 reserved) + * bits 17..20 : rights (READ | WRITE | EXEC | GRANT) + * bits 21..31 : type_meta (per-type union; today only CLOEXEC for FD) + * bits 32..63 : generation (incremented on every cap_drop, lazy revoke) + * cap_drop / cap_revoke clear bits 0..31 and store the bumped generation, so + * a stale token's snapshot-vs-live mismatch is detectable on revisit. 
+ */ +#define CAP_SLOT_VALID_SHIFT 0 +#define CAP_SLOT_OBJECT_SHIFT 1 +#define CAP_SLOT_TYPE_SHIFT 13 +#define CAP_SLOT_RIGHTS_SHIFT 17 +#define CAP_SLOT_META_SHIFT 21 +#define CAP_SLOT_GENERATION_SHIFT 32 + +#define CAP_SLOT_OBJECT_MASK 0xFFFU +#define CAP_SLOT_TYPE_MASK 0xFU +#define CAP_SLOT_RIGHTS_MASK 0xFU +#define CAP_SLOT_META_MASK 0x7FFU + +/* cap_handle (the 64-bit token returned by cap_get_token) layout: + * bits 0..31 : generation snapshot (must match live slot.generation) + * bits 32..35 : type snapshot (must match live slot.type) + * bits 36..39 : rights snapshot (must match live slot.rights) + * bits 40..47 : slot_index (8 bits; entries < CAP_SPACE_SLOTS) + * bits 48..63 : reserved (zero on mint, ignored on validate) + * The snapshot is what makes handles unforgeable: a token holder cannot + * extend its own rights, change the type, or survive a revocation just + * by mutating the bits, because the validate path checks every snapshot + * against the live slot under fd_lock. + */ +#define CAP_HANDLE_GEN_MASK 0xFFFFFFFFULL +#define CAP_HANDLE_TYPE_SHIFT 32 +#define CAP_HANDLE_RIGHTS_SHIFT 36 +#define CAP_HANDLE_SLOT_SHIFT 40 + +struct delegate_record { + bool in_use; + struct proc *dst_proc; + u32 dst_generation; + u16 dst_object_index; + u8 dst_type; + u64 grant_epoch; + u32 refcount; +}; + +static struct fd_pool_entry fd_pool[CAP_FD_POOL_MAX]; +static spinlock_t fd_pool_lock = SPINLOCK_INITIALIZER; +static struct delegate_record delegate_pool[CAP_DELEGATE_RECORD_MAX]; +static spinlock_t delegate_lock = SPINLOCK_INITIALIZER; +static u64 next_grant_epoch = 1; + +static i64 cap_drop_slot_locked(struct proc *p, struct cap_slot_view slot); + +static inline u64 cap_pack_slot(bool valid, + u16 object_index, + u8 type, + u8 rights, + u16 type_meta, + u32 generation) +{ + return ((u64) (valid ? 
1U : 0U) << CAP_SLOT_VALID_SHIFT) | + ((u64) (object_index & CAP_SLOT_OBJECT_MASK) + << CAP_SLOT_OBJECT_SHIFT) | + ((u64) (type & CAP_SLOT_TYPE_MASK) << CAP_SLOT_TYPE_SHIFT) | + ((u64) (rights & CAP_SLOT_RIGHTS_MASK) << CAP_SLOT_RIGHTS_SHIFT) | + ((u64) (type_meta & CAP_SLOT_META_MASK) << CAP_SLOT_META_SHIFT) | + ((u64) generation << CAP_SLOT_GENERATION_SHIFT); +} + +static inline struct cap_slot_view cap_unpack_slot(u64 word, u8 slot_index) +{ + return (struct cap_slot_view) { + .valid = (word & BIT(CAP_SLOT_VALID_SHIFT)) != 0, + .slot_index = slot_index, + .object_index = + (u16) ((word >> CAP_SLOT_OBJECT_SHIFT) & CAP_SLOT_OBJECT_MASK), + .type = (u8) ((word >> CAP_SLOT_TYPE_SHIFT) & CAP_SLOT_TYPE_MASK), + .rights = (u8) ((word >> CAP_SLOT_RIGHTS_SHIFT) & CAP_SLOT_RIGHTS_MASK), + .type_meta = (u16) ((word >> CAP_SLOT_META_SHIFT) & CAP_SLOT_META_MASK), + .generation = (u32) (word >> CAP_SLOT_GENERATION_SHIFT), + }; +} + +static inline u32 cap_next_generation(u32 generation) +{ + return generation + 1; +} + +static inline void cap_slot_publish(struct proc *p, + u8 slot_index, + u16 object_index, + u8 type, + u8 rights, + u16 type_meta, + u32 generation) +{ + __atomic_store_n( + &p->cap_space.slots[slot_index], + cap_pack_slot(true, object_index, type, rights, type_meta, generation), + __ATOMIC_RELEASE); + __atomic_store_n(&p->cap_space.delegate_epoch[slot_index], 0, + __ATOMIC_RELEASE); +} + +static inline u64 cap_slot_delegate_epoch(struct proc *p, u8 slot_index) +{ + return __atomic_load_n(&p->cap_space.delegate_epoch[slot_index], + __ATOMIC_ACQUIRE); +} + +static inline void cap_slot_set_delegate_epoch(struct proc *p, + u8 slot_index, + u64 delegate_epoch) +{ + __atomic_store_n(&p->cap_space.delegate_epoch[slot_index], delegate_epoch, + __ATOMIC_RELEASE); +} + +/* Revoke the slot. The valid bit and payload bits clear, but the + * generation is bumped so any token still naming this slot will fail + * validation (generation mismatch). 
This is the entire revocation + * mechanism: no scan of dangling tokens, no list of grantees, no + * cooperation from the holders. Holders observe EBADF on next use. + */ +static inline void cap_slot_invalidate(struct proc *p, u8 slot_index) +{ + u64 old = + __atomic_load_n(&p->cap_space.slots[slot_index], __ATOMIC_ACQUIRE); + struct cap_slot_view slot = cap_unpack_slot(old, slot_index); + u32 generation = cap_next_generation(slot.generation); + __atomic_store_n(&p->cap_space.slots[slot_index], + cap_pack_slot(false, 0, 0, 0, 0, generation), + __ATOMIC_RELEASE); + __atomic_store_n(&p->cap_space.delegate_epoch[slot_index], 0, + __ATOMIC_RELEASE); +} + +static inline struct fd_pool_entry *fd_pool_entry(u16 object_index) +{ + if (object_index >= CAP_FD_POOL_MAX) + return NULL; + return &fd_pool[object_index]; +} + +static inline struct posix_timer *timer_pool_entry(u16 object_index) +{ + return posix_timer_ptr(object_index); +} + +static void cap_fd_dispose(u16 object_index) +{ + struct fd_pool_entry *entry = fd_pool_entry(object_index); + if (!entry) + return; + + switch (entry->kind) { + case CAP_FD_KIND_VFS: + vfs_close(&entry->file); + break; + case CAP_FD_KIND_PIPE: + if (entry->pipe) { + if (entry->pipe_read_end) + pipe_close_read(entry->pipe); + else + pipe_close_write(entry->pipe); + } + break; + case CAP_FD_KIND_CONSOLE: + default: + break; + } + + memset(entry, 0, sizeof(*entry)); +} + +/* Drop one reference on the typed pool entry backing a capability slot. + * Unknown types are silently ignored: a CAP_TYPE_DELEGATE slot, for example, + * has no pool refcount of its own (the delegate_record carries the lifetime). 
+ */ +static void cap_release_object_once(u8 type, u16 object_index) +{ + switch (type) { + case CAP_TYPE_TIMER: + posix_timer_put_idx(object_index); + break; + case CAP_TYPE_MUTEX: + sync_mutex_put_idx((i32) object_index); + break; + case CAP_TYPE_CONDVAR: + sync_condvar_put_idx((i32) object_index); + break; + case CAP_TYPE_SEMAPHORE: + sync_sem_put_idx((i32) object_index); + break; + case CAP_TYPE_BARRIER: + sync_barrier_put_idx((i32) object_index); + break; + case CAP_TYPE_RWLOCK: + sync_rwlock_put_idx((i32) object_index); + break; + case CAP_TYPE_MQUEUE: + mqueue_put_idx((i32) object_index); + break; + default: + break; + } +} + +static void cap_release_object(u8 type, u16 object_index, u32 count) +{ + if (count == 0) + return; + + /* CAP_TYPE_FD is special: its refcount lives on the pool entry itself + * (atomic) and dispose runs only when the count actually reaches zero, + * so iterate the atomic decrement to keep per-step zero-detection + * (the count > 1 case is exclusive to cap_revoke_delegate's mass scan). + */ + if (type == CAP_TYPE_FD) { + struct fd_pool_entry *entry = fd_pool_entry(object_index); + if (!entry) + return; + for (u32 i = 0; i < count; i++) { + u32 rc = __atomic_sub_fetch(&entry->refcount, 1, __ATOMIC_ACQ_REL); + if (rc == 0) + cap_fd_dispose(object_index); + } + return; + } + + /* Other typed pools (timer, sync, mqueue) take an internal lock per + * decrement and have no batched put; loop the per-type hook. + */ + for (u32 i = 0; i < count; i++) + cap_release_object_once(type, object_index); +} + +void cap_put_ref(struct cap_ref *ref) +{ + /* Test against type, not ptr: cap_lookup_object returns refs with + * ptr=NULL for types whose typed pointer is fetched separately (sync + * primitives, mqueue). The empty/dropped state is type == CAP_TYPE_NONE. 
+ */ + if (!ref || ref->type == CAP_TYPE_NONE) + return; + cap_release_object(ref->type, ref->object_index, 1); + memset(ref, 0, sizeof(*ref)); +} + +static inline bool cap_slot_is_thread_reserved(i32 slot_index) +{ + return slot_index >= CAP_SPACE_SLOTS - PROC_THREAD_MAX && + slot_index < CAP_SPACE_SLOTS; +} + +static i32 cap_find_free_slot_locked(struct proc *p, bool allow_thread_reserved) +{ + for (i32 i = 0; i < CAP_SPACE_SLOTS; i++) { + if (!allow_thread_reserved && cap_slot_is_thread_reserved(i)) + continue; + struct cap_slot_view slot = cap_unpack_slot( + __atomic_load_n(&p->cap_space.slots[i], __ATOMIC_ACQUIRE), (u8) i); + if (!slot.valid) + return i; + } + return -1; +} + +static i32 cap_reserve_slot_locked(struct proc *p, + i32 slot_hint, + bool exact, + bool allow_thread_reserved) +{ + if (slot_hint >= 0) { + if (slot_hint >= CAP_SPACE_SLOTS) + return -(i32) EBADF; + if (!allow_thread_reserved && cap_slot_is_thread_reserved(slot_hint)) + return -(i32) EBADF; + struct cap_slot_view slot = cap_unpack_slot( + __atomic_load_n(&p->cap_space.slots[slot_hint], __ATOMIC_ACQUIRE), + (u8) slot_hint); + if (!slot.valid || exact) + return slot_hint; + } + + i32 free_slot = cap_find_free_slot_locked(p, allow_thread_reserved); + return free_slot >= 0 ? 
free_slot : -(i32) ENOSPC; +} + +static i32 cap_fd_pool_alloc(enum cap_fd_kind kind) +{ + u64 flags = spin_lock_irqsave(&fd_pool_lock); + for (i32 i = 0; i < CAP_FD_POOL_MAX; i++) { + if (!fd_pool[i].in_use) { + memset(&fd_pool[i], 0, sizeof(fd_pool[i])); + fd_pool[i].in_use = true; + fd_pool[i].kind = (u8) kind; + __atomic_store_n(&fd_pool[i].refcount, 1, __ATOMIC_RELEASE); + spin_unlock_irqrestore(&fd_pool_lock, flags); + return i; + } + } + spin_unlock_irqrestore(&fd_pool_lock, flags); + return -(i32) ENFILE; +} + +static void cap_fd_pool_abort_new(u16 object_index) +{ + struct fd_pool_entry *entry = fd_pool_entry(object_index); + if (!entry) + return; + u32 rc = __atomic_sub_fetch(&entry->refcount, 1, __ATOMIC_ACQ_REL); + if (rc == 0) + cap_fd_dispose(object_index); +} + +static i32 cap_mint_fd_locked(struct proc *p, + u16 object_index, + u8 rights, + u16 type_meta, + i32 slot_hint, + bool exact_target, + bool bump_ref) +{ + if (bump_ref) { + struct fd_pool_entry *entry = fd_pool_entry(object_index); + if (!entry || !entry->in_use) + return -(i32) EBADF; + __atomic_fetch_add(&entry->refcount, 1, __ATOMIC_RELAXED); + } + + i32 slot_index = cap_reserve_slot_locked(p, slot_hint, exact_target, false); + if (slot_index < 0) { + if (bump_ref) + cap_release_object(CAP_TYPE_FD, object_index, 1); + return slot_index == -(i32) ENOSPC ? 
-(i32) EMFILE : slot_index; + } + + u64 old = + __atomic_load_n(&p->cap_space.slots[slot_index], __ATOMIC_ACQUIRE); + struct cap_slot_view prev = cap_unpack_slot(old, (u8) slot_index); + if (exact_target && prev.valid) { + i64 drop_rc = cap_drop_slot_locked(p, prev); + if (drop_rc < 0) { + if (bump_ref) + cap_release_object(CAP_TYPE_FD, object_index, 1); + return (i32) drop_rc; + } + old = + __atomic_load_n(&p->cap_space.slots[slot_index], __ATOMIC_ACQUIRE); + prev = cap_unpack_slot(old, (u8) slot_index); + } + cap_slot_publish(p, (u8) slot_index, object_index, CAP_TYPE_FD, rights, + type_meta, prev.generation); + return slot_index; +} + +/* Authenticate a cap_handle token against the live cap_space slot. + * + * This is the unforgeability gate: a token presented by user space + * carries a generation/type/rights snapshot taken at mint time. The + * validate step refuses the token unless every snapshot field still + * matches the slot. Concretely it rejects: + * - slot_index out of range -> EBADF on the caller (token byzantine) + * - !slot.valid -> EBADF (slot revoked or never minted) + * - generation mismatch -> EBADF (slot was drop/re-minted; the + * caller is holding a stale handle) + * - type mismatch -> EBADF (slot index recycled into a + * different object type) + * - rights mismatch -> EBADF (the caller is presenting an + * amplified token; rights cannot be + * upgraded after mint) + * The caller must hold the proc's fd_lock; the atomic load alone is + * sufficient to read a consistent slot word but the lock pairs the + * validate with whatever object-level work follows. 
+ */ +static bool cap_validate_token_locked(struct proc *p, + u64 token, + struct cap_slot_view *out) +{ + u32 generation = (u32) (token & CAP_HANDLE_GEN_MASK); + u8 type = (u8) ((token >> CAP_HANDLE_TYPE_SHIFT) & CAP_SLOT_TYPE_MASK); + u8 rights = + (u8) ((token >> CAP_HANDLE_RIGHTS_SHIFT) & CAP_SLOT_RIGHTS_MASK); + u32 slot_index = + (u32) ((token >> CAP_HANDLE_SLOT_SHIFT) & CAP_HANDLE_SLOT_MASK); + + if (slot_index >= CAP_SPACE_SLOTS) + return false; + + struct cap_slot_view slot = cap_unpack_slot( + __atomic_load_n(&p->cap_space.slots[slot_index], __ATOMIC_ACQUIRE), + (u8) slot_index); + if (!slot.valid || slot.generation != generation || slot.type != type || + slot.rights != rights) + return false; + if (out) + *out = slot; + return true; +} + +u64 cap_make_handle(const struct cap_slot_view *slot) +{ + if (!slot || !slot->valid) + return 0; + return ((u64) slot->generation) | + ((u64) slot->type << CAP_HANDLE_TYPE_SHIFT) | + ((u64) slot->rights << CAP_HANDLE_RIGHTS_SHIFT) | + ((u64) slot->slot_index << CAP_HANDLE_SLOT_SHIFT); +} + +struct cap_slot_view cap_slot_read(struct proc *p, i32 slot_idx) +{ + if (!p || slot_idx < 0 || slot_idx >= CAP_SPACE_SLOTS) + return (struct cap_slot_view) {0}; + u64 flags = proc_fd_lock_irqsave(p); + struct cap_slot_view slot = cap_unpack_slot( + __atomic_load_n(&p->cap_space.slots[slot_idx], __ATOMIC_ACQUIRE), + (u8) slot_idx); + proc_fd_unlock_irqrestore(p, flags); + return slot; +} + +i32 cap_find_free_fd(struct proc *p) +{ + if (!p) + return -1; + u64 flags = proc_fd_lock_irqsave(p); + i32 slot = cap_find_free_slot_locked(p, false); + proc_fd_unlock_irqrestore(p, flags); + return slot; +} + +bool cap_lookup_slot(struct proc *p, + i32 handle, + u8 required_rights, + u8 expected_type, + struct cap_slot_view *out) +{ + if (!p || handle < 0 || handle >= CAP_SPACE_SLOTS) + return false; + + u64 flags = proc_fd_lock_irqsave(p); + struct cap_slot_view slot = cap_unpack_slot( + __atomic_load_n(&p->cap_space.slots[handle], 
__ATOMIC_ACQUIRE), + (u8) handle); + bool ok = slot.valid && slot.type == expected_type && + (slot.rights & required_rights) == required_rights; + proc_fd_unlock_irqrestore(p, flags); + if (!ok) + return false; + if (out) + *out = slot; + return true; +} + +bool cap_lookup_token(struct proc *p, + u64 token, + u8 required_rights, + u8 expected_type, + struct cap_slot_view *out) +{ + if (!p) + return false; + + u64 flags = proc_fd_lock_irqsave(p); + struct cap_slot_view slot; + bool ok = cap_validate_token_locked(p, token, &slot) && + slot.type == expected_type && + (slot.rights & required_rights) == required_rights; + proc_fd_unlock_irqrestore(p, flags); + if (!ok) + return false; + if (out) + *out = slot; + return true; +} + +i64 cap_get_token(struct proc *p, i32 slot_idx, u8 expected_type) +{ + if (!p || slot_idx < 0 || slot_idx >= CAP_SPACE_SLOTS) + return -(i64) EBADF; + + u64 flags = proc_fd_lock_irqsave(p); + struct cap_slot_view slot = cap_unpack_slot( + __atomic_load_n(&p->cap_space.slots[slot_idx], __ATOMIC_ACQUIRE), + (u8) slot_idx); + if (!slot.valid) { + proc_fd_unlock_irqrestore(p, flags); + return -(i64) EBADF; + } + if (slot.type != expected_type) { + proc_fd_unlock_irqrestore(p, flags); + return -(i64) EINVAL; + } + u64 handle = cap_make_handle(&slot); + proc_fd_unlock_irqrestore(p, flags); + return (i64) handle; +} + +static i64 cap_drop_slot_locked(struct proc *p, struct cap_slot_view slot) +{ + if (!slot.valid) + return -(i64) EBADF; + + cap_slot_invalidate(p, slot.slot_index); + if (slot.type == CAP_TYPE_DELEGATE) { + u16 record_index = slot.object_index; + u64 dflags = spin_lock_irqsave(&delegate_lock); + if (record_index < CAP_DELEGATE_RECORD_MAX && + delegate_pool[record_index].in_use) { + u32 rc = __atomic_sub_fetch(&delegate_pool[record_index].refcount, + 1, __ATOMIC_ACQ_REL); + if (rc == 0) + memset(&delegate_pool[record_index], 0, + sizeof(delegate_pool[record_index])); + } + spin_unlock_irqrestore(&delegate_lock, dflags); + return 0; + } 
+ + cap_release_object(slot.type, slot.object_index, 1); + return 0; +} + +i64 cap_drop_token(struct proc *p, u64 token) +{ + if (!p) + return -(i64) EPERM; + + u64 flags = proc_fd_lock_irqsave(p); + struct cap_slot_view slot; + if (!cap_validate_token_locked(p, token, &slot)) { + proc_fd_unlock_irqrestore(p, flags); + return -(i64) EBADF; + } + i64 rc = cap_drop_slot_locked(p, slot); + proc_fd_unlock_irqrestore(p, flags); + return rc; +} + +i64 cap_close_fd(struct proc *p, i32 fd) +{ + if (!p || fd < 0 || fd >= CAP_SPACE_SLOTS) + return -(i64) EBADF; + + u64 flags = proc_fd_lock_irqsave(p); + struct cap_slot_view slot = cap_unpack_slot( + __atomic_load_n(&p->cap_space.slots[fd], __ATOMIC_ACQUIRE), (u8) fd); + if (!slot.valid || slot.type != CAP_TYPE_FD) { + proc_fd_unlock_irqrestore(p, flags); + return -(i64) EBADF; + } + i64 rc = cap_drop_slot_locked(p, slot); + proc_fd_unlock_irqrestore(p, flags); + return rc; +} + +/* Validate a cap slot of type expected_type with required_rights and bump + * the matching pool entry's refcount under the proc's fd_lock. The pool + * entry pointer (NULL on failure) and object_index come back through the + * out parameters so each typed lookup helper can wrap a cap_ref around + * the result with its own struct pointer type. + * + * Returns true on success. On failure (bad slot, wrong type, missing + * rights, torn-down pool entry) returns false with the fd_lock released. 
+ */ +static bool cap_pinned_lookup(struct proc *p, + i32 handle, + u8 required_rights, + u8 expected_type, + u16 *out_object_index, + void **out_entry) +{ + *out_entry = NULL; + *out_object_index = 0; + + if (!p || handle < 0 || handle >= CAP_SPACE_SLOTS) + return false; + + u64 flags = proc_fd_lock_irqsave(p); + struct cap_slot_view slot = cap_unpack_slot( + __atomic_load_n(&p->cap_space.slots[handle], __ATOMIC_ACQUIRE), + (u8) handle); + if (!slot.valid || slot.type != expected_type || + (slot.rights & required_rights) != required_rights) { + proc_fd_unlock_irqrestore(p, flags); + return false; + } + + /* Each pool keeps its in_use / refcount fields at compatible offsets; + * the typed pointer is returned as void * and reinterpreted by the + * caller. CAP_TYPE_FD and CAP_TYPE_TIMER are the two paths that share + * this body today. + */ + bool ok = false; + if (expected_type == CAP_TYPE_FD) { + struct fd_pool_entry *entry = fd_pool_entry(slot.object_index); + if (entry && entry->in_use) { + __atomic_fetch_add(&entry->refcount, 1, __ATOMIC_RELAXED); + *out_entry = entry; + ok = true; + } + } else if (expected_type == CAP_TYPE_TIMER) { + struct posix_timer *timer = timer_pool_entry(slot.object_index); + if (timer && timer->in_use) { + __atomic_fetch_add(&timer->refcount, 1, __ATOMIC_RELAXED); + *out_entry = timer; + ok = true; + } + } + proc_fd_unlock_irqrestore(p, flags); + + if (ok) + *out_object_index = slot.object_index; + return ok; +} + +struct cap_ref cap_lookup_fd(struct proc *p, i32 fd, u8 required_rights) +{ + void *entry = NULL; + u16 object_index = 0; + if (!cap_pinned_lookup(p, fd, required_rights, CAP_TYPE_FD, &object_index, + &entry)) + return (struct cap_ref) {0}; + return (struct cap_ref) { + .ptr = entry, + .object_index = object_index, + .type = CAP_TYPE_FD, + }; +} + +bool cap_fd_is_valid(struct proc *p, i32 fd) +{ + struct cap_slot_view slot = cap_slot_read(p, fd); + return slot.valid && slot.type == CAP_TYPE_FD; +} + +bool 
cap_fd_has_rights(struct proc *p, i32 fd, u8 rights) +{ + struct cap_slot_view slot = cap_slot_read(p, fd); + return slot.valid && slot.type == CAP_TYPE_FD && + (slot.rights & rights) == rights; +} + +bool cap_fd_is_seekable(struct proc *p, i32 fd) +{ + struct cap_ref ref = cap_lookup_fd(p, fd, 0); + if (!ref.ptr) + return false; + bool seekable = ((struct fd_pool_entry *) ref.ptr)->is_seekable; + cap_put_ref(&ref); + return seekable; +} + +bool cap_fd_is_pipe(struct proc *p, i32 fd) +{ + struct cap_ref ref = cap_lookup_fd(p, fd, 0); + if (!ref.ptr) + return false; + bool is_pipe = ((struct fd_pool_entry *) ref.ptr)->kind == CAP_FD_KIND_PIPE; + cap_put_ref(&ref); + return is_pipe; +} + +bool cap_fd_pipe_read_end(struct proc *p, i32 fd) +{ + struct cap_ref ref = cap_lookup_fd(p, fd, 0); + if (!ref.ptr) + return false; + bool read_end = ((struct fd_pool_entry *) ref.ptr)->pipe_read_end; + cap_put_ref(&ref); + return read_end; +} + +/* Type-specific active-use ref bumper. Called under the caller's fd_lock + * after the cap_space slot has been validated. Returns true on success; false + * if the underlying pool entry is already torn down. + * + * Today's transferable / lookup-pinnable types are sync primitives and mqueue. + * FD and timer have their own dedicated lookup helpers (cap_lookup_{fd,timer}) + * that bump per-pool refcounts directly; THREAD caps are pinned by + * proc_table_lock and don't carry a separate refcount today. Adding a new + * transferable object type requires extending this switch AND + * cap_release_object's dispatch. 
+ */ +static bool cap_object_inc_ref(u8 type, u16 object_index) +{ + switch (type) { + case CAP_TYPE_MUTEX: + return sync_mutex_inc_idx((i32) object_index); + case CAP_TYPE_CONDVAR: + return sync_condvar_inc_idx((i32) object_index); + case CAP_TYPE_SEMAPHORE: + return sync_sem_inc_idx((i32) object_index); + case CAP_TYPE_BARRIER: + return sync_barrier_inc_idx((i32) object_index); + case CAP_TYPE_RWLOCK: + return sync_rwlock_inc_idx((i32) object_index); + case CAP_TYPE_MQUEUE: + return mqueue_inc_idx((i32) object_index); + default: + return false; + } +} + +struct cap_ref cap_lookup_object(struct proc *p, + i32 handle, + u8 required_rights, + u8 expected_type) +{ + if (!p || handle < 0 || handle >= CAP_SPACE_SLOTS) + return (struct cap_ref) {0}; + + u64 flags = proc_fd_lock_irqsave(p); + struct cap_slot_view slot = cap_unpack_slot( + __atomic_load_n(&p->cap_space.slots[handle], __ATOMIC_ACQUIRE), + (u8) handle); + if (!slot.valid || slot.type != expected_type || + (slot.rights & required_rights) != required_rights) { + proc_fd_unlock_irqrestore(p, flags); + return (struct cap_ref) {0}; + } + + if (!cap_object_inc_ref(expected_type, slot.object_index)) { + proc_fd_unlock_irqrestore(p, flags); + return (struct cap_ref) {0}; + } + proc_fd_unlock_irqrestore(p, flags); + + return (struct cap_ref) { + .ptr = NULL, + .object_index = slot.object_index, + .type = expected_type, + }; +} + +struct cap_ref cap_lookup_timer(struct proc *p, i32 handle, u8 required_rights) +{ + void *entry = NULL; + u16 object_index = 0; + if (!cap_pinned_lookup(p, handle, required_rights, CAP_TYPE_TIMER, + &object_index, &entry)) + return (struct cap_ref) {0}; + return (struct cap_ref) { + .ptr = entry, + .object_index = object_index, + .type = CAP_TYPE_TIMER, + }; +} + +i32 cap_open_vfs(struct proc *p, + struct vfs_file file, + u8 rights, + bool is_seekable, + i32 slot_hint, + bool exact_target) +{ + i32 object_index = cap_fd_pool_alloc(CAP_FD_KIND_VFS); + if (object_index < 0) { + 
vfs_close(&file); + return object_index; + } + + struct fd_pool_entry *entry = fd_pool_entry((u16) object_index); + entry->file = file; + entry->is_seekable = is_seekable; + + u64 flags = proc_fd_lock_irqsave(p); + i32 fd = cap_mint_fd_locked(p, (u16) object_index, rights, 0, slot_hint, + exact_target, false); + proc_fd_unlock_irqrestore(p, flags); + if (fd < 0) { + cap_fd_pool_abort_new((u16) object_index); + return fd; + } + return fd; +} + +i32 cap_open_pipe(struct proc *p, + struct pipe *pipe, + bool read_end, + u8 rights, + i32 slot_hint, + bool exact_target) +{ + i32 object_index = cap_fd_pool_alloc(CAP_FD_KIND_PIPE); + if (object_index < 0) { + if (read_end) + pipe_close_read(pipe); + else + pipe_close_write(pipe); + return object_index; + } + + struct fd_pool_entry *entry = fd_pool_entry((u16) object_index); + entry->pipe = pipe; + entry->pipe_read_end = read_end; + entry->is_seekable = false; + + u64 flags = proc_fd_lock_irqsave(p); + i32 fd = cap_mint_fd_locked(p, (u16) object_index, rights, 0, slot_hint, + exact_target, false); + proc_fd_unlock_irqrestore(p, flags); + if (fd < 0) { + cap_fd_pool_abort_new((u16) object_index); + return fd; + } + return fd; +} + +i32 cap_open_console(struct proc *p, + u8 console_id, + u8 rights, + i32 slot_hint, + bool exact_target) +{ + i32 object_index = cap_fd_pool_alloc(CAP_FD_KIND_CONSOLE); + if (object_index < 0) + return object_index; + + struct fd_pool_entry *entry = fd_pool_entry((u16) object_index); + entry->console_id = console_id; + entry->is_seekable = false; + + u64 flags = proc_fd_lock_irqsave(p); + i32 fd = cap_mint_fd_locked(p, (u16) object_index, rights, 0, slot_hint, + exact_target, false); + proc_fd_unlock_irqrestore(p, flags); + if (fd < 0) { + cap_fd_pool_abort_new((u16) object_index); + return fd; + } + return fd; +} + +i32 cap_open_handle(struct proc *p, + u16 object_index, + u8 type, + u8 rights, + i32 slot_hint, + bool exact_target) +{ + if (!p) + return -(i32) EINVAL; + + u64 flags = 
proc_fd_lock_irqsave(p); + i32 handle = cap_reserve_slot_locked(p, slot_hint, exact_target, + type == CAP_TYPE_THREAD); + if (handle >= 0) { + u64 old = + __atomic_load_n(&p->cap_space.slots[handle], __ATOMIC_ACQUIRE); + struct cap_slot_view prev = cap_unpack_slot(old, (u8) handle); + if (exact_target && prev.valid) + (void) cap_drop_slot_locked(p, prev); + old = __atomic_load_n(&p->cap_space.slots[handle], __ATOMIC_ACQUIRE); + prev = cap_unpack_slot(old, (u8) handle); + cap_slot_publish(p, (u8) handle, object_index, type, rights, 0, + prev.generation); + } + proc_fd_unlock_irqrestore(p, flags); + return handle < 0 ? (handle == -(i32) ENOSPC ? -(i32) EMFILE : handle) + : handle; +} + +i32 cap_open_timer(struct proc *p, + u16 object_index, + u8 rights, + i32 slot_hint, + bool exact_target) +{ + if (!p || !timer_pool_entry(object_index)) + return -(i32) EINVAL; + return cap_open_handle(p, object_index, CAP_TYPE_TIMER, rights, slot_hint, + exact_target); +} + +i32 cap_dup_fd(struct proc *p, i32 oldfd, i32 newfd_hint, bool exact_target) +{ + if (!p || oldfd < 0 || oldfd >= PROC_FD_MAX) + return -(i32) EBADF; + if (exact_target && (newfd_hint < 0 || newfd_hint >= PROC_FD_MAX)) + return -(i32) EBADF; + + u64 flags = proc_fd_lock_irqsave(p); + struct cap_slot_view slot = cap_unpack_slot( + __atomic_load_n(&p->cap_space.slots[oldfd], __ATOMIC_ACQUIRE), + (u8) oldfd); + if (!slot.valid || slot.type != CAP_TYPE_FD) { + proc_fd_unlock_irqrestore(p, flags); + return -(i32) EBADF; + } + + if (exact_target && newfd_hint == oldfd) { + proc_fd_unlock_irqrestore(p, flags); + return oldfd; + } + i32 fd = cap_mint_fd_locked(p, slot.object_index, slot.rights, + slot.type_meta & ~CAP_FD_META_CLOEXEC, + newfd_hint, exact_target, true); + if (fd >= 0) + cap_slot_set_delegate_epoch(p, (u8) fd, + cap_slot_delegate_epoch(p, (u8) oldfd)); + proc_fd_unlock_irqrestore(p, flags); + return fd; +} + +i32 cap_inherit_fd(struct proc *src, struct proc *dst, i32 src_fd, i32 dst_fd) +{ + if (!src || 
!dst || src_fd < 0 || src_fd >= PROC_FD_MAX) + return -(i32) EBADF; + if (dst_fd < 0 || dst_fd >= PROC_FD_MAX) + return -(i32) EBADF; + if (src == dst) + return cap_dup_fd(src, src_fd, dst_fd, true); + + struct cap_lock_pair lp = cap_lock_two(src, dst); + struct cap_slot_view slot = cap_unpack_slot( + __atomic_load_n(&src->cap_space.slots[src_fd], __ATOMIC_ACQUIRE), + (u8) src_fd); + if (!slot.valid || slot.type != CAP_TYPE_FD) { + cap_unlock_two(&lp); + return -(i32) EBADF; + } + i32 fd = cap_mint_fd_locked(dst, slot.object_index, slot.rights, + slot.type_meta, dst_fd, true, true); + if (fd >= 0) + cap_slot_set_delegate_epoch(dst, (u8) fd, + cap_slot_delegate_epoch(src, (u8) src_fd)); + cap_unlock_two(&lp); + return fd; +} + +static i32 delegate_record_alloc(struct proc *dst, + u16 dst_object_index, + u8 dst_type) +{ + u64 flags = spin_lock_irqsave(&delegate_lock); + for (i32 i = 0; i < CAP_DELEGATE_RECORD_MAX; i++) { + if (!delegate_pool[i].in_use) { + delegate_pool[i].in_use = true; + delegate_pool[i].dst_proc = dst; + delegate_pool[i].dst_generation = dst ? 
dst->generation : 0; + delegate_pool[i].dst_object_index = dst_object_index; + delegate_pool[i].dst_type = dst_type; + delegate_pool[i].grant_epoch = + __atomic_fetch_add(&next_grant_epoch, 1, __ATOMIC_RELAXED); + __atomic_store_n(&delegate_pool[i].refcount, 1, __ATOMIC_RELEASE); + spin_unlock_irqrestore(&delegate_lock, flags); + return i; + } + } + spin_unlock_irqrestore(&delegate_lock, flags); + return -(i32) ENOSPC; +} + +static void delegate_record_put(u16 record_index) +{ + u64 flags = spin_lock_irqsave(&delegate_lock); + if (record_index < CAP_DELEGATE_RECORD_MAX && + delegate_pool[record_index].in_use) { + u32 rc = __atomic_sub_fetch(&delegate_pool[record_index].refcount, 1, + __ATOMIC_ACQ_REL); + if (rc == 0) + memset(&delegate_pool[record_index], 0, sizeof(delegate_pool[0])); + } + spin_unlock_irqrestore(&delegate_lock, flags); +} + +i64 cap_transfer(struct proc *src, u16 dst_pid, u64 token, u8 new_rights) +{ + if (!src) + return -(i64) EPERM; + + if ((new_rights & CAP_RIGHT_GRANT) != 0) + return -(i64) EINVAL; + + struct proc *dst = proc_find(dst_pid); + if (!dst) + return -(i64) ESRCH; + if (dst == src) + return -(i64) EINVAL; + + struct cap_lock_pair lp = cap_lock_two(src, dst); + struct cap_slot_view src_slot; + if (!cap_validate_token_locked(src, token, &src_slot)) { + cap_unlock_two(&lp); + return -(i64) EBADF; + } + if (src_slot.type != CAP_TYPE_FD) { + cap_unlock_two(&lp); + return -(i64) EINVAL; + } + if ((src_slot.rights & CAP_RIGHT_GRANT) == 0) { + cap_unlock_two(&lp); + return -(i64) EACCES; + } + if ((new_rights & src_slot.rights) != new_rights) { + cap_unlock_two(&lp); + return -(i64) EINVAL; + } + + i32 child_fd = cap_mint_fd_locked(dst, src_slot.object_index, + new_rights & ~CAP_RIGHT_GRANT, + src_slot.type_meta, -1, false, true); + if (child_fd < 0) { + cap_unlock_two(&lp); + return child_fd == -(i32) EMFILE ? 
-(i64) ENOSPC : child_fd; + } + + i32 record_index = + delegate_record_alloc(dst, src_slot.object_index, src_slot.type); + if (record_index < 0) { + struct cap_slot_view child_slot = cap_unpack_slot( + __atomic_load_n(&dst->cap_space.slots[child_fd], __ATOMIC_ACQUIRE), + (u8) child_fd); + cap_drop_slot_locked(dst, child_slot); + cap_unlock_two(&lp); + return -(i64) ENOSPC; + } + + i32 delegate_slot = cap_reserve_slot_locked(src, -1, false, false); + if (delegate_slot < 0) { + struct cap_slot_view child_slot = cap_unpack_slot( + __atomic_load_n(&dst->cap_space.slots[child_fd], __ATOMIC_ACQUIRE), + (u8) child_fd); + cap_drop_slot_locked(dst, child_slot); + delegate_record_put((u16) record_index); + cap_unlock_two(&lp); + return -(i64) ENOSPC; + } + + u64 old = + __atomic_load_n(&src->cap_space.slots[delegate_slot], __ATOMIC_ACQUIRE); + struct cap_slot_view prev = cap_unpack_slot(old, (u8) delegate_slot); + cap_slot_set_delegate_epoch(dst, (u8) child_fd, + delegate_pool[record_index].grant_epoch); + cap_slot_publish(src, (u8) delegate_slot, (u16) record_index, + CAP_TYPE_DELEGATE, CAP_RIGHT_READ, 0, prev.generation); + struct cap_slot_view handle_slot = cap_unpack_slot( + __atomic_load_n(&src->cap_space.slots[delegate_slot], __ATOMIC_ACQUIRE), + (u8) delegate_slot); + + cap_unlock_two(&lp); + return (i64) cap_make_handle(&handle_slot); +} + +static i64 cap_revoke_delegate_impl(struct proc *src, + u64 delegate_token, + bool proc_table_locked) +{ + if (!src) + return -(i64) EPERM; + + /* Phase 1: under src lock, validate the DELEGATE handle, snapshot + * the underlying delegate_record, and bump the record refcount so + * it survives the lock dance below. 
+ */ + u64 sflags = proc_fd_lock_irqsave(src); + struct cap_slot_view delegate_slot; + if (!cap_validate_token_locked(src, delegate_token, &delegate_slot) || + delegate_slot.type != CAP_TYPE_DELEGATE) { + proc_fd_unlock_irqrestore(src, sflags); + return -(i64) EBADF; + } + + u16 record_index = delegate_slot.object_index; + if (record_index >= CAP_DELEGATE_RECORD_MAX || + !delegate_pool[record_index].in_use) { + proc_fd_unlock_irqrestore(src, sflags); + return -(i64) EBADF; + } + + __atomic_fetch_add(&delegate_pool[record_index].refcount, 1, + __ATOMIC_RELAXED); + struct proc *dst = delegate_pool[record_index].dst_proc; + u32 dst_generation = delegate_pool[record_index].dst_generation; + u16 object_index = delegate_pool[record_index].dst_object_index; + u8 type = delegate_pool[record_index].dst_type; + u64 grant_epoch = delegate_pool[record_index].grant_epoch; + + if (!dst || dst->generation != dst_generation) { + cap_drop_slot_locked(src, delegate_slot); + proc_fd_unlock_irqrestore(src, sflags); + delegate_record_put(record_index); + return 0; + } + + /* Phase 2: release src lock and re-acquire BOTH locks in ascending + * pid order so cap_revoke_delegate is deadlock-free against a + * concurrent cap_transfer / cap_revoke_delegate / cap_inherit_fd in + * the opposite direction. + */ + proc_fd_unlock_irqrestore(src, sflags); + + struct proc *descendants[PROC_MAX]; + u32 descendant_generations[PROC_MAX]; + u32 revoked = 0; + + for (;;) { + sz ndesc = proc_table_locked ? 
proc_collect_descendants_locked( + dst, dst_generation, descendants, + descendant_generations, PROC_MAX) + : proc_collect_descendants( + dst, dst_generation, descendants, + descendant_generations, PROC_MAX); + u32 pass_revoked = 0; + + for (sz n = 0; n < ndesc; n++) { + struct proc *target = descendants[n]; + if (!target || target->generation != descendant_generations[n]) + continue; + + u64 dflags = proc_fd_lock_irqsave(target); + if (target->state == PROC_STATE_FREE || + target->generation != descendant_generations[n]) { + proc_fd_unlock_irqrestore(target, dflags); + continue; + } + for (i32 i = 0; i < CAP_SPACE_SLOTS; i++) { + struct cap_slot_view slot = + cap_unpack_slot(__atomic_load_n(&target->cap_space.slots[i], + __ATOMIC_ACQUIRE), + (u8) i); + if (!slot.valid || slot.type != type || + slot.object_index != object_index || + cap_slot_delegate_epoch(target, (u8) i) != grant_epoch) + continue; + cap_slot_invalidate(target, (u8) i); + pass_revoked++; + } + proc_fd_unlock_irqrestore(target, dflags); + } + + revoked += pass_revoked; + if (pass_revoked == 0) + break; + } + + sflags = proc_fd_lock_irqsave(src); + if (cap_validate_token_locked(src, delegate_token, &delegate_slot) && + delegate_slot.type == CAP_TYPE_DELEGATE) + cap_drop_slot_locked(src, delegate_slot); + proc_fd_unlock_irqrestore(src, sflags); + + if (revoked > 0) + cap_release_object(type, object_index, revoked); + delegate_record_put(record_index); + return 0; +} + +i64 cap_revoke_delegate(struct proc *src, u64 delegate_token) +{ + return cap_revoke_delegate_impl(src, delegate_token, false); +} + +void cap_init(void) +{ + memset(fd_pool, 0, sizeof(fd_pool)); + memset(delegate_pool, 0, sizeof(delegate_pool)); + fd_pool_lock = (spinlock_t) SPINLOCK_INITIALIZER; + delegate_lock = (spinlock_t) SPINLOCK_INITIALIZER; + __atomic_store_n(&next_grant_epoch, 1, __ATOMIC_RELAXED); +} + +void cap_space_init(struct proc *p) +{ + assert(p); + memset(&p->cap_space, 0, sizeof(p->cap_space)); + + /* Fixed stdio 
slots preserve the existing ABI expectations. */ + (void) cap_open_console(p, PROC_FD_STDIN, CAP_RIGHT_READ | CAP_RIGHT_GRANT, + PROC_FD_STDIN, true); + (void) cap_open_console(p, PROC_FD_STDOUT, + CAP_RIGHT_WRITE | CAP_RIGHT_GRANT, PROC_FD_STDOUT, + true); + (void) cap_open_console(p, PROC_FD_STDERR, + CAP_RIGHT_WRITE | CAP_RIGHT_GRANT, PROC_FD_STDERR, + true); +} + +void cap_space_teardown(struct proc *p) +{ + assert(p); + + /* Revoke delegates first so children lose access before we tear down + * our own handles. + */ + for (i32 i = 0; i < CAP_SPACE_SLOTS; i++) { + struct cap_slot_view slot = cap_slot_read(p, i); + if (!slot.valid || slot.type != CAP_TYPE_DELEGATE) + continue; + (void) cap_revoke_delegate_impl(p, cap_make_handle(&slot), true); + } + + u64 flags = proc_fd_lock_irqsave(p); + for (i32 i = 0; i < CAP_SPACE_SLOTS; i++) { + struct cap_slot_view slot = cap_unpack_slot( + __atomic_load_n(&p->cap_space.slots[i], __ATOMIC_ACQUIRE), (u8) i); + if (!slot.valid) + continue; + (void) cap_drop_slot_locked(p, slot); + } + proc_fd_unlock_irqrestore(p, flags); +} + +#include __INC_TEST(cap) diff --git a/kernel/proc/proc.c b/kernel/proc/proc.c index 606e942..2dcf170 100644 --- a/kernel/proc/proc.c +++ b/kernel/proc/proc.c @@ -7,6 +7,7 @@ */ #include +#include #include #include #include @@ -22,8 +23,6 @@ #include "../sync/futex.h" #include "../sync/sync_handle.h" #include "../timer/posix_timer.h" -#include "pipe.h" - static struct proc proc_table[PROC_MAX]; spinlock_t proc_table_lock = SPINLOCK_INITIALIZER; static u16 next_pid = 1; @@ -39,12 +38,30 @@ static sz proc_slot(struct proc *p) return (sz) (p - proc_table); } +static bool proc_child_matches_parent(const struct proc *child, + const struct proc *parent) +{ + return child && parent && child != parent && + child->parent_pid == parent->pid && + child->parent_generation == parent->generation; +} + static vaddr_t proc_thread_stack_top(const struct proc *p, u8 slot) { return (vaddr_t) (p->va_stack_top - (u64) slot 
* (USER_STACK_SIZE + PAGE_SIZE)); } +static i32 proc_detach_thread_cap_slot_locked(struct sched_task *td) +{ + if (!td || td->td_cap_slot < 0) + return -1; + + i32 slot = td->td_cap_slot; + td->td_cap_slot = -1; + return slot; +} + u64 proc_table_lock_irqsave(void) { lockdep_acquire(LOCK_LEVEL_PROC); @@ -105,30 +122,6 @@ static u16 proc_alloc_pid_locked(void) return 0; } -static void proc_close_fd_impl(struct proc *p, i32 fd) -{ - struct proc_fd *f = &p->fd_table[fd]; - if (!f->is_open) - return; - - /* Release the underlying resource before clearing the slot. - * The is_seekable check (instead of fd > PROC_FD_STDERR) ensures - * vfs_close runs for VFS files redirected onto stdio via spawn - * file actions. Console FDs (is_seekable == false) are skipped. - */ - if (f->is_pipe) { - if (f->pipe_read_end) - pipe_close_read(f->pipe); - else - pipe_close_write(f->pipe); - } else if (f->is_seekable && !f->is_dup) { - vfs_close(&f->file); - } - - /* Zero the entire slot to reset all fields at once. 
*/ - memset(f, 0, sizeof(*f)); -} - static void proc_free_locked(struct proc *p) { MAGIC_CHECK(p, PROC_MAGIC); @@ -137,10 +130,7 @@ static void proc_free_locked(struct proc *p) proc_free_user_pages(p); - u64 fd_flags = proc_fd_lock_irqsave(p); - for (i32 i = 0; i < PROC_FD_MAX; i++) - proc_close_fd_impl(p, i); - proc_fd_unlock_irqrestore(p, fd_flags); + cap_space_teardown(p); p->n_vmas = 0; p->generation++; @@ -153,6 +143,7 @@ void proc_init(void) memset(proc_table, 0, sizeof(proc_table)); proc_table_lock = (spinlock_t) SPINLOCK_INITIALIZER; next_pid = 1; + cap_init(); for (sz i = 0; i < PROC_MAX; i++) { init_waitqueue_head(&child_wqs[i]); init_waitqueue_head(&proc_table[i].thread_event_wq); @@ -204,9 +195,6 @@ struct proc *proc_alloc(void) p->fd_lock = (spinlock_t) SPINLOCK_INITIALIZER; p->sig_lock = (spinlock_t) SPINLOCK_INITIALIZER; init_waitqueue_head(&p->thread_event_wq); - p->fd_table[PROC_FD_STDIN].is_open = true; - p->fd_table[PROC_FD_STDOUT].is_open = true; - p->fd_table[PROC_FD_STDERR].is_open = true; p->cwd[0] = '/'; p->cwd_len = 1; /* Assign per-process VA window based on table slot index. @@ -220,6 +208,7 @@ struct proc *proc_alloc(void) * Ensures a clean list head regardless of prior state. 
*/ init_waitqueue_head(&child_wqs[i]); + cap_space_init(p); proc_table_unlock_irqrestore(flags); return p; } @@ -343,18 +332,35 @@ void proc_release_thread_stack(struct proc *p, u8 slot) proc_remove_vma(p, stack_bottom, USER_STACK_SIZE); } -void proc_reap_exited_thread_locked(struct proc *p, struct sched_task *td) +static void proc_drop_thread_token(struct proc *p, i64 token) { assert(p); - if (!td) + if (token < 0) return; + (void) cap_drop_token(p, (u64) token); +} + +i64 proc_reap_exited_thread_locked(struct proc *p, struct sched_task *td) +{ + assert(p); + if (!td) + return -1; + + i64 token = -1; u8 slot = proc_task_slot(p, td); if (slot < PROC_THREAD_MAX) { + i32 cap_slot = proc_detach_thread_cap_slot_locked(td); + if (cap_slot >= 0) { + struct cap_slot_view cap_slot_view = cap_slot_read(p, cap_slot); + if (cap_slot_view.valid && cap_slot_view.type == CAP_TYPE_THREAD) + token = (i64) cap_make_handle(&cap_slot_view); + } p->exited_cpu_time_us += td->cpu_time_us; proc_release_thread_stack(p, slot); proc_detach_task(p, td); } + return token; } void proc_reap_exited_thread(struct proc *p, struct sched_task *td) @@ -364,22 +370,21 @@ void proc_reap_exited_thread(struct proc *p, struct sched_task *td) return; u64 pflags = proc_table_lock_irqsave(); - proc_reap_exited_thread_locked(p, td); + i64 token = proc_reap_exited_thread_locked(p, td); proc_table_unlock_irqrestore(pflags); + proc_drop_thread_token(p, token); } void proc_close_fd_locked(struct proc *p, i32 fd) { assert(p); assert(fd >= 0 && fd < PROC_FD_MAX); - proc_close_fd_impl(p, fd); + (void) cap_close_fd(p, fd); } void proc_close_fd(struct proc *p, i32 fd) { - u64 flags = proc_fd_lock_irqsave(p); - proc_close_fd_impl(p, fd); - proc_fd_unlock_irqrestore(p, flags); + (void) cap_close_fd(p, fd); } void proc_free(struct proc *p) @@ -407,6 +412,73 @@ struct proc *proc_find(u16 pid) return p; } +sz proc_collect_descendants_locked(struct proc *root, + u32 root_generation, + struct proc **out, + u32 
*out_generations, + sz max) +{ + if (!root || !out || !out_generations || max == 0) + return 0; + + if (root->state == PROC_STATE_FREE || root->generation != root_generation) + return 0; + + sz count = 0; + out[count] = root; + out_generations[count] = root->generation; + count++; + + bool added = true; + while (added && count < max) { + added = false; + for (sz i = 0; i < PROC_MAX && count < max; i++) { + struct proc *candidate = &proc_table[i]; + if (candidate->state == PROC_STATE_FREE) + continue; + + bool is_descendant = false; + for (sz j = 0; j < count; j++) { + if (proc_child_matches_parent(candidate, out[j])) { + is_descendant = true; + break; + } + } + if (!is_descendant) + continue; + + bool already_listed = false; + for (sz j = 0; j < count; j++) { + if (out[j] == candidate) { + already_listed = true; + break; + } + } + if (already_listed) + continue; + + out[count] = candidate; + out_generations[count] = candidate->generation; + count++; + added = true; + } + } + return count; +} + +sz proc_collect_descendants(struct proc *root, + u32 root_generation, + struct proc **out, + u32 *out_generations, + sz max) +{ + u64 flags = proc_table_lock_irqsave(); + sz count = proc_collect_descendants_locked(root, root_generation, out, + out_generations, max); + proc_table_unlock_irqrestore(flags); + return count; +} + void proc_for_each(proc_iter_cb_t cb, void *ctx) { u64 flags = proc_table_lock_irqsave(); @@ -421,7 +493,7 @@ static bool has_zombie_child_locked(struct proc *parent) { for (sz i = 0; i < PROC_MAX; i++) { if (proc_table[i].state == PROC_STATE_ZOMBIE && - proc_table[i].parent_pid == parent->pid) + proc_child_matches_parent(&proc_table[i], parent)) return true; } return false; @@ -434,7 +506,7 @@ static bool has_any_child_locked(struct proc *parent) if (i == proc_slot(parent)) continue; if (proc_table[i].state != PROC_STATE_FREE && - proc_table[i].parent_pid == parent->pid) + proc_child_matches_parent(&proc_table[i], parent)) return true; } return false; 
@@ -456,7 +528,8 @@ void proc_notify_parent(struct proc *child) u64 flags = proc_table_lock_irqsave(); for (sz i = 0; i < PROC_MAX; i++) { if (proc_table[i].state != PROC_STATE_FREE && - proc_table[i].pid == child->parent_pid) { + proc_table[i].pid == child->parent_pid && + proc_table[i].generation == child->parent_generation) { wake_slot = i; break; } @@ -695,21 +768,24 @@ void proc_reparent_children(struct proc *parent) for (sz i = 0; i < PROC_MAX; i++) { if (proc_table[i].state == PROC_STATE_FREE) continue; - if (proc_table[i].parent_pid != parent->pid) + if (!proc_child_matches_parent(&proc_table[i], parent)) continue; if (&proc_table[i] == parent) continue; if (init_alive) { proc_table[i].parent_pid = 1; + proc_table[i].parent_generation = init->generation; if (proc_table[i].state == PROC_STATE_ZOMBIE) need_wake_init = true; } else { /* No init: auto-reap zombie children. */ if (proc_table[i].state == PROC_STATE_ZOMBIE) proc_free_locked(&proc_table[i]); - else + else { proc_table[i].parent_pid = 0; /* true orphan */ + proc_table[i].parent_generation = 0; + } } } proc_table_unlock_irqrestore(flags); @@ -795,7 +871,8 @@ void proc_exit(struct proc *p, i32 exit_code) for (sz i = 0; i < PROC_MAX; i++) { if (proc_table[i].state != PROC_STATE_FREE && proc_table[i].state != PROC_STATE_ZOMBIE && - proc_table[i].pid == p->parent_pid) { + proc_table[i].pid == p->parent_pid && + proc_table[i].generation == p->parent_generation) { parent_slot = i; break; } diff --git a/kernel/proc/spawn.c b/kernel/proc/spawn.c index 83247f2..0d8c5e7 100644 --- a/kernel/proc/spawn.c +++ b/kernel/proc/spawn.c @@ -2,89 +2,72 @@ /* posix_spawn file actions and spawn attributes. * * Executes file actions (open/close/dup2) in the child's FD table before - * the child is enqueued. All mutations hold the child's fd_lock. - * The child is in EMBRYO state during this phase - not yet visible to + * the child is enqueued. 
The child already holds its inherited parent FD + * snapshot at this point, so ordered mutations are resolved entirely against + * the child-local table. All mutations hold the child's fd_lock. The child + * is in EMBRYO state during this phase - not yet visible to * other tasks or harts. */ +#include #include #include #include #include #include -#include "pipe.h" - -/* Open a file and install it at a specific FD in the child's table. - * Caller must hold child->fd_lock. - */ -static i32 fa_open_locked(struct proc *child, i32 fd, char *kpath, sz pathlen) +static i32 fa_open(struct proc *child, i32 fd, char *kpath, sz pathlen) { if (fd < 0 || fd >= PROC_FD_MAX) return -(i32) EBADF; struct str path = str_new(kpath, pathlen); + struct result_vfs_stat st = vfs_stat(path); + if (st.is_error) + return -(i32) st.code; + struct vfs_stat vstat = result_vfs_stat_checked(st); + /* Permit GRANT on supervisor-opened FDs explicitly flagged for delegation. + * SPAWN_FA_OPEN is exactly that path: the parent is configuring the child's + * FD table at spawn time, so the minted cap carries GRANT and preserves the + * normal spawn inheritance semantics. Plain sys_open still mints a + * non-delegable cap. + */ + u8 rights = CAP_RIGHT_READ | CAP_RIGHT_GRANT; + if ((vstat.flags & VFS_FLAG_RDONLY) == 0) + rights |= CAP_RIGHT_WRITE; + bool is_seekable = (vstat.flags & VFS_FLAG_NOSEEK) == 0; + struct result_vfs_file fres = vfs_open(path); if (fres.is_error) return -(i32) fres.code; - - /* Close existing FD if occupied. */ - if (child->fd_table[fd].is_open) - proc_close_fd_locked(child, fd); - - child->fd_table[fd].is_open = true; - child->fd_table[fd].is_seekable = true; - child->fd_table[fd].is_dup = false; - child->fd_table[fd].offset = 0; - child->fd_table[fd].file = result_vfs_file_checked(fres); - return 0; + i32 rc = cap_open_vfs(child, result_vfs_file_checked(fres), rights, + is_seekable, fd, true); + return rc < 0 ? rc : 0; } -/* Close a specific FD in the child's table. 
- * Caller must hold child->fd_lock. - */ -static i32 fa_close_locked(struct proc *child, i32 fd) +static i32 fa_close(struct proc *child, i32 fd) { if (fd < 0 || fd >= PROC_FD_MAX) return -(i32) EBADF; - - if (child->fd_table[fd].is_open) - proc_close_fd_locked(child, fd); - return 0; + /* POSIX posix_spawn_file_actions_addclose ignores already-closed FDs. */ + i64 rc = cap_close_fd(child, fd); + return rc == -(i64) EBADF ? 0 : (i32) rc; } -/* Duplicate oldfd to newfd in the child's table. - * Caller must hold child->fd_lock. - */ -static i32 fa_dup2_locked(struct proc *child, i32 oldfd, i32 newfd) +static i32 fa_dup2(struct proc *child, i32 oldfd, i32 newfd) { if (oldfd < 0 || oldfd >= PROC_FD_MAX) return -(i32) EBADF; if (newfd < 0 || newfd >= PROC_FD_MAX) return -(i32) EBADF; - if (!child->fd_table[oldfd].is_open) - return -(i32) EBADF; - - if (oldfd == newfd) - return 0; - - if (child->fd_table[newfd].is_open) - proc_close_fd_locked(child, newfd); - child->fd_table[newfd] = child->fd_table[oldfd]; - child->fd_table[newfd].is_dup = true; - - if (child->fd_table[newfd].is_pipe) { - u64 pf = spin_lock_irqsave(&child->fd_table[newfd].pipe->lock); - if (child->fd_table[newfd].pipe_read_end) - child->fd_table[newfd].pipe->readers++; - else - child->fd_table[newfd].pipe->writers++; - spin_unlock_irqrestore(&child->fd_table[newfd].pipe->lock, pf); - } - return 0; + i32 rc = cap_dup_fd(child, oldfd, newfd, true); + return rc < 0 ? rc : 0; } +/* Actions execute in caller order, matching posix_spawn semantics, against + * the child's already-inherited FD table. + */ i32 spawn_apply_file_actions(struct proc *child, const struct spawn_file_action *actions, sz count) @@ -94,38 +77,49 @@ i32 spawn_apply_file_actions(struct proc *child, if (count > SPAWN_FA_MAX) return -(i32) EINVAL; - i32 rc = 0; - u64 fd_flags = proc_fd_lock_irqsave(child); - + /* Validate all entries up front so a malformed action late in the + * list fails before any side effect lands on the child. 
+ */ for (sz i = 0; i < count; i++) { const struct spawn_file_action *fa = &actions[i]; - switch (fa->type) { case SPAWN_FA_OPEN: - if (fa->pathlen == 0 || fa->pathlen > SPAWN_FA_PATH_MAX) { - rc = -(i32) EINVAL; - goto out; - } - rc = fa_open_locked(child, fa->fd, (char *) fa->path, fa->pathlen); + if (fa->pathlen == 0 || fa->pathlen > SPAWN_FA_PATH_MAX) + return -(i32) EINVAL; break; case SPAWN_FA_CLOSE: - rc = fa_close_locked(child, fa->fd); - break; case SPAWN_FA_DUP2: - rc = fa_dup2_locked(child, fa->fd, fa->newfd); break; default: - rc = -(i32) EINVAL; - goto out; + return -(i32) EINVAL; } + } - if (rc < 0) - goto out; + for (sz i = 0; i < count; i++) { + const struct spawn_file_action *fa = &actions[i]; + i32 rc = 0; + switch (fa->type) { + case SPAWN_FA_CLOSE: + rc = fa_close(child, fa->fd); + if (rc < 0) + return rc; + break; + case SPAWN_FA_OPEN: + rc = fa_open(child, fa->fd, (char *) fa->path, fa->pathlen); + if (rc < 0) + return rc; + break; + case SPAWN_FA_DUP2: + rc = fa_dup2(child, fa->fd, fa->newfd); + if (rc < 0) + return rc; + break; + default: + return -(i32) EINVAL; + } } -out: - proc_fd_unlock_irqrestore(child, fd_flags); - return rc; + return 0; } i32 spawn_apply_attr(struct proc *child __unused, diff --git a/kernel/proc/syscall.c b/kernel/proc/syscall.c index 65fecf6..101ce2a 100644 --- a/kernel/proc/syscall.c +++ b/kernel/proc/syscall.c @@ -12,6 +12,7 @@ */ #include +#include #include #include #include @@ -76,16 +77,6 @@ static inline bool validate_fd_number(i32 fd) return fd >= 0 && fd < PROC_FD_MAX; } -static i32 find_free_fd_locked(struct proc *p) -{ - assert(p); - for (i32 i = 0; i < PROC_FD_MAX; i++) { - if (!p->fd_table[i].is_open) - return i; - } - return -1; -} - static i64 sys_exit(struct trap_frame *tf, struct sched_task *td) { if (!td) @@ -110,31 +101,22 @@ static i64 sys_write(struct trap_frame *tf, struct sched_task *td) if (!p || !validate_fd_number(fd)) return -(i64) EBADF; - u64 fd_flags = proc_fd_lock_irqsave(p); - if 
(!p->fd_table[fd].is_open) { - proc_fd_unlock_irqrestore(p, fd_flags); - return -(i64) EBADF; - } + struct cap_ref ref = cap_lookup_fd(p, fd, CAP_RIGHT_WRITE); + if (!ref.ptr) + return cap_fd_is_valid(p, fd) ? -(i64) EACCES : -(i64) EBADF; + struct fd_pool_entry *entry = ref.ptr; + i64 rc; - /* Pipe dispatch: release fd_lock before blocking I/O. - * Bump the pipe's writer refcount first so concurrent close() on - * the last FD cannot free the pipe during the operation. - */ - if (p->fd_table[fd].is_pipe) { - struct pipe *pipe = p->fd_table[fd].pipe; - bool is_write = !p->fd_table[fd].pipe_read_end; - if (!is_write) { - proc_fd_unlock_irqrestore(p, fd_flags); - return -(i64) EBADF; + if (entry->kind == CAP_FD_KIND_PIPE) { + struct pipe *pipe = entry->pipe; + if (entry->pipe_read_end) { + rc = -(i64) EBADF; + goto out; } - u64 pf = spin_lock_irqsave(&pipe->lock); - pipe->writers++; - spin_unlock_irqrestore(&pipe->lock, pf); - proc_fd_unlock_irqrestore(p, fd_flags); if (len <= 0) { - pipe_close_write(pipe); - return 0; + rc = 0; + goto out; } if (len > PIPE_BUF_SIZE) len = PIPE_BUF_SIZE; @@ -145,70 +127,68 @@ static i64 sys_write(struct trap_frame *tf, struct sched_task *td) * interleave their data, breaking POSIX PIPE_BUF atomicity. */ char kbuf[PIPE_BUF_SIZE]; - i64 rc = copy_from_user(kbuf, ubuf, len); - if (rc < 0) { - pipe_close_write(pipe); - return rc; - } - i64 written = pipe_write(pipe, kbuf, len); - pipe_close_write(pipe); - return written; + rc = copy_from_user(kbuf, ubuf, len); + if (rc < 0) + goto out; + rc = pipe_write(pipe, kbuf, len); + goto out; } if (len <= 0) { - proc_fd_unlock_irqrestore(p, fd_flags); - return 0; + rc = 0; + goto out; } if (len > 4096) len = 4096; - /* fd 1/2 = stdout/stderr -> console output. 
*/ - if (fd == PROC_FD_STDOUT || fd == PROC_FD_STDERR) { + if (entry->kind == CAP_FD_KIND_CONSOLE) { + if (entry->console_id != PROC_FD_STDOUT && + entry->console_id != PROC_FD_STDERR) { + rc = -(i64) EBADF; + goto out; + } char kbuf[256]; sz total = 0; while (total < len) { sz chunk = len - total; if (chunk > (sz) sizeof(kbuf)) chunk = (sz) sizeof(kbuf); - i64 rc = copy_from_user(kbuf, ubuf + total, chunk); - if (rc < 0) { - proc_fd_unlock_irqrestore(p, fd_flags); - return rc; - } + rc = copy_from_user(kbuf, ubuf + total, chunk); + if (rc < 0) + goto out; print_str((struct str) {.dat = kbuf, .len = chunk}); total += chunk; } - proc_fd_unlock_irqrestore(p, fd_flags); - return total; + rc = (i64) total; + goto out; } char kbuf[256]; sz total = 0; - sz base_off = p->fd_table[fd].offset; + sz base_off = entry->offset; while (total < len) { sz chunk = len - total; if (chunk > (sz) sizeof(kbuf)) chunk = (sz) sizeof(kbuf); - i64 rc = copy_from_user(kbuf, ubuf + total, chunk); - if (rc < 0) { - proc_fd_unlock_irqrestore(p, fd_flags); - return rc; - } + rc = copy_from_user(kbuf, ubuf + total, chunk); + if (rc < 0) + goto out; struct byte_view bv = byte_view_new(kbuf, chunk); - struct result_sz wres = - vfs_write(&p->fd_table[fd].file, bv, base_off + total); + struct result_sz wres = vfs_write(&entry->file, bv, base_off + total); if (wres.is_error) { - proc_fd_unlock_irqrestore(p, fd_flags); - return -(i64) wres.code; + rc = -(i64) wres.code; + goto out; } sz written = result_sz_checked(wres); if (written == 0) break; total += written; } - p->fd_table[fd].offset = base_off + total; - proc_fd_unlock_irqrestore(p, fd_flags); - return total; + entry->offset = base_off + total; + rc = (i64) total; +out: + cap_put_ref(&ref); + return rc; } static i64 sys_read(struct trap_frame *tf, struct sched_task *td) @@ -221,31 +201,22 @@ static i64 sys_read(struct trap_frame *tf, struct sched_task *td) if (!p || !validate_fd_number(fd)) return -(i64) EBADF; - u64 fd_flags = 
proc_fd_lock_irqsave(p); - if (!p->fd_table[fd].is_open) { - proc_fd_unlock_irqrestore(p, fd_flags); - return -(i64) EBADF; - } + struct cap_ref ref = cap_lookup_fd(p, fd, CAP_RIGHT_READ); + if (!ref.ptr) + return cap_fd_is_valid(p, fd) ? -(i64) EACCES : -(i64) EBADF; + struct fd_pool_entry *entry = ref.ptr; + i64 rc; - /* Pipe dispatch: release fd_lock before blocking I/O. - * Bump the pipe's reader refcount first so concurrent close() on - * the last FD cannot free the pipe during the operation. - */ - if (p->fd_table[fd].is_pipe) { - struct pipe *pipe = p->fd_table[fd].pipe; - bool is_read = p->fd_table[fd].pipe_read_end; - if (!is_read) { - proc_fd_unlock_irqrestore(p, fd_flags); - return -(i64) EBADF; + if (entry->kind == CAP_FD_KIND_PIPE) { + struct pipe *pipe = entry->pipe; + if (!entry->pipe_read_end) { + rc = -(i64) EBADF; + goto out; } - u64 pf = spin_lock_irqsave(&pipe->lock); - pipe->readers++; - spin_unlock_irqrestore(&pipe->lock, pf); - proc_fd_unlock_irqrestore(p, fd_flags); if (len <= 0) { - pipe_close_read(pipe); - return 0; + rc = 0; + goto out; } if (len > 4096) len = 4096; @@ -253,12 +224,12 @@ static i64 sys_read(struct trap_frame *tf, struct sched_task *td) /* Validate the entire user buffer before consuming pipe bytes. * pipe_read advances the head irreversibly, so a late EFAULT * from copy_to_user would lose data from a non-seekable FD. - * Must check writability (PTE_W), not just accessibility - + * Must check writability (PTE_W), not just accessibility: * a read-only mapping passes user_addr_valid but faults on write. */ if (!user_addr_writable(ubuf, len)) { - pipe_close_read(pipe); - return -(i64) EFAULT; + rc = -(i64) EFAULT; + goto out; } char kbuf[256]; @@ -269,63 +240,62 @@ static i64 sys_read(struct trap_frame *tf, struct sched_task *td) chunk = (sz) sizeof(kbuf); i64 got = pipe_read(pipe, kbuf, chunk); if (got < 0) { - pipe_close_read(pipe); - return total > 0 ? (i64) total : got; + rc = total > 0 ? 
(i64) total : got; + goto out; } if (got == 0) break; - i64 rc = copy_to_user(ubuf + total, kbuf, (sz) got); + rc = copy_to_user(ubuf + total, kbuf, (sz) got); if (rc < 0) { - pipe_close_read(pipe); - return total > 0 ? (i64) total : rc; + rc = total > 0 ? (i64) total : rc; + goto out; } total += (sz) got; if ((sz) got < chunk) break; /* short read: don't block again */ } - pipe_close_read(pipe); - return (i64) total; + rc = (i64) total; + goto out; } - if (fd == PROC_FD_STDIN) { - proc_fd_unlock_irqrestore(p, fd_flags); - return 0; + if (entry->kind == CAP_FD_KIND_CONSOLE) { + rc = 0; + goto out; } if (len <= 0) { - proc_fd_unlock_irqrestore(p, fd_flags); - return 0; + rc = 0; + goto out; } if (len > 4096) len = 4096; char kbuf[256]; sz total = 0; - sz base_off = p->fd_table[fd].offset; + sz base_off = entry->offset; while (total < len) { sz chunk = len - total; if (chunk > (sz) sizeof(kbuf)) chunk = (sz) sizeof(kbuf); struct byte_buf bb = byte_buf_new(kbuf, 0, chunk); - struct result_sz rres = - vfs_read(&p->fd_table[fd].file, &bb, base_off + total); + struct result_sz rres = vfs_read(&entry->file, &bb, base_off + total); if (rres.is_error) { - proc_fd_unlock_irqrestore(p, fd_flags); - return -(i64) rres.code; + rc = -(i64) rres.code; + goto out; } sz got = result_sz_checked(rres); if (got == 0) break; - i64 rc = copy_to_user(ubuf + total, kbuf, got); - if (rc < 0) { - proc_fd_unlock_irqrestore(p, fd_flags); - return rc; - } + rc = copy_to_user(ubuf + total, kbuf, got); + if (rc < 0) + goto out; total += got; } - p->fd_table[fd].offset = base_off + total; - proc_fd_unlock_irqrestore(p, fd_flags); - return total; + entry->offset = base_off + total; + rc = (i64) total; +out: + cap_put_ref(&ref); + return rc; } static i64 sys_open(struct trap_frame *tf, struct sched_task *td) @@ -337,11 +307,7 @@ static i64 sys_open(struct trap_frame *tf, struct sched_task *td) if (!p) return -(i64) EPERM; - /* Preserve EMFILE semantics before touching user memory. 
*/ - u64 fd_flags = proc_fd_lock_irqsave(p); - i32 fd = find_free_fd_locked(p); - proc_fd_unlock_irqrestore(p, fd_flags); - if (fd < 0) + if (cap_find_free_fd(p) < 0) return -(i64) EMFILE; i64 perr; @@ -350,26 +316,24 @@ static i64 sys_open(struct trap_frame *tf, struct sched_task *td) if (perr < 0) return perr; + struct result_vfs_stat st = vfs_stat(path); + if (st.is_error) + return -(i64) st.code; + struct vfs_stat vstat = result_vfs_stat_checked(st); + /* GRANT is set only on system-minted FDs and on supervisor-opened FDs + * explicitly flagged for delegation (e.g., SPAWN_FA_OPEN). Plain sys_open + * mints a non-delegable cap. + */ + u8 rights = CAP_RIGHT_READ; + if ((vstat.flags & VFS_FLAG_RDONLY) == 0) + rights |= CAP_RIGHT_WRITE; + bool is_seekable = (vstat.flags & VFS_FLAG_NOSEEK) == 0; + struct result_vfs_file fres = vfs_open(path); if (fres.is_error) return -(i64) fres.code; - - fd_flags = proc_fd_lock_irqsave(p); - fd = find_free_fd_locked(p); - if (fd < 0) { - struct vfs_file file = result_vfs_file_checked(fres); - proc_fd_unlock_irqrestore(p, fd_flags); - vfs_close(&file); - return -(i64) EMFILE; - } - - p->fd_table[fd] = (struct proc_fd) { - .is_open = true, - .is_seekable = true, - .file = result_vfs_file_checked(fres), - }; - proc_fd_unlock_irqrestore(p, fd_flags); - return fd; + return (i64) cap_open_vfs(p, result_vfs_file_checked(fres), rights, + is_seekable, -1, false); } static i64 sys_close(struct trap_frame *tf, struct sched_task *td) @@ -380,15 +344,7 @@ static i64 sys_close(struct trap_frame *tf, struct sched_task *td) if (!p || !validate_fd_number(fd)) return -(i64) EBADF; - u64 fd_flags = proc_fd_lock_irqsave(p); - if (!p->fd_table[fd].is_open) { - proc_fd_unlock_irqrestore(p, fd_flags); - return -(i64) EBADF; - } - - proc_close_fd_locked(p, fd); - proc_fd_unlock_irqrestore(p, fd_flags); - return 0; + return cap_close_fd(p, fd); } static i64 sys_stat(struct trap_frame *tf, struct sched_task *td) @@ -559,6 +515,7 @@ static i64 
sys_spawn(struct trap_frame *tf, struct sched_task *td) return -(i64) ENOMEM; } child->parent_pid = parent->pid; + child->parent_generation = parent->generation; /* Copy binary name into child proc. */ sz namelen = pathlen < 31 ? pathlen : 31; @@ -594,50 +551,16 @@ static i64 sys_spawn(struct trap_frame *tf, struct sched_task *td) kvalloc_free(buf_ba); - /* Inherit parent's FD table so file actions (especially DUP2) can - * reference descriptors that are open in the parent. - * - * Pipe FDs: inherit with refcount bump (pipes have proper lifecycle). - * Console FDs (stdin/stdout/stderr without VFS backing): inherit as-is. - * VFS file FDs: do NOT inherit - the VFS layer has no refcounting, - * so sharing a struct vfs_file across processes leads to use-after-free - * when either side closes the descriptor. Callers that need file - * redirection must use SPAWN_FA_OPEN file actions instead. - * - * Lock ordering: parent fd_lock first (read), child fd_lock second - * (write, uncontended - child is EMBRYO, invisible to other harts). - */ - { - u64 pf = proc_fd_lock_irqsave(parent); - u64 cf = proc_fd_lock_irqsave(child); - for (i32 i = 0; i < PROC_FD_MAX; i++) { - if (!parent->fd_table[i].is_open) - continue; - if (parent->fd_table[i].is_pipe) { - child->fd_table[i] = parent->fd_table[i]; - struct pipe *pipe = child->fd_table[i].pipe; - /* IRQs already disabled by parent fd_lock; plain - * spin_lock avoids redundant irqsave/restore. - */ - spin_lock(&pipe->lock); - if (child->fd_table[i].pipe_read_end) - pipe->readers++; - else - pipe->writers++; - spin_unlock(&pipe->lock); - } else if (!parent->fd_table[i].is_seekable) { - /* Console FD (no VFS backing): safe to copy by value. */ - child->fd_table[i] = parent->fd_table[i]; - } - /* VFS file FDs (is_seekable && !is_pipe): skip - no refcount, - * sharing leads to use-after-free on close. 
- */ + for (i32 fd = 0; fd < PROC_FD_MAX; fd++) { + if (!cap_fd_is_valid(parent, fd)) + continue; + i32 inherit_rc = cap_inherit_fd(parent, child, fd, fd); + if (inherit_rc < 0) { + proc_free(child); + return (i64) inherit_rc; } - proc_fd_unlock_irqrestore(child, cf); - proc_fd_unlock_irqrestore(parent, pf); } - /* Apply file actions to child's FD table (child still in EMBRYO). */ if (fa_count > 0) { i32 fa_rc = spawn_apply_file_actions(child, kfa, fa_count); if (fa_rc < 0) { @@ -698,43 +621,13 @@ static i64 sys_getppid(struct trap_frame *tf, struct sched_task *td) return p ? (i64) p->parent_pid : 0; } -/* Bump pipe refcount after duplicating a pipe FD. Caller holds fd_lock. */ -static void pipe_bump_refcount(struct proc_fd *f) -{ - if (!f->is_pipe) - return; - u64 pf = spin_lock_irqsave(&f->pipe->lock); - if (f->pipe_read_end) - f->pipe->readers++; - else - f->pipe->writers++; - spin_unlock_irqrestore(&f->pipe->lock, pf); -} - static i64 sys_dup(struct trap_frame *tf, struct sched_task *td) { i32 oldfd = (i32) tf->a0; struct proc *p = td->proc; if (!p || !validate_fd_number(oldfd)) return -(i64) EBADF; - - u64 fd_flags = proc_fd_lock_irqsave(p); - if (!p->fd_table[oldfd].is_open) { - proc_fd_unlock_irqrestore(p, fd_flags); - return -(i64) EBADF; - } - - i32 newfd = find_free_fd_locked(p); - if (newfd < 0) { - proc_fd_unlock_irqrestore(p, fd_flags); - return -(i64) EMFILE; - } - - p->fd_table[newfd] = p->fd_table[oldfd]; - p->fd_table[newfd].is_dup = true; - pipe_bump_refcount(&p->fd_table[newfd]); - proc_fd_unlock_irqrestore(p, fd_flags); - return newfd; + return (i64) cap_dup_fd(p, oldfd, -1, false); } static i64 sys_dup2(struct trap_frame *tf, struct sched_task *td) @@ -744,25 +637,9 @@ static i64 sys_dup2(struct trap_frame *tf, struct sched_task *td) struct proc *p = td->proc; if (!p || !validate_fd_number(oldfd)) return -(i64) EBADF; - if (newfd < 0 || newfd >= PROC_FD_MAX) - return -(i64) EBADF; - - u64 fd_flags = proc_fd_lock_irqsave(p); - if 
(!p->fd_table[oldfd].is_open) { - proc_fd_unlock_irqrestore(p, fd_flags); + if (!validate_fd_number(newfd)) return -(i64) EBADF; - } - if (oldfd == newfd) { - proc_fd_unlock_irqrestore(p, fd_flags); - return newfd; - } - if (p->fd_table[newfd].is_open) - proc_close_fd_locked(p, newfd); - p->fd_table[newfd] = p->fd_table[oldfd]; - p->fd_table[newfd].is_dup = true; - pipe_bump_refcount(&p->fd_table[newfd]); - proc_fd_unlock_irqrestore(p, fd_flags); - return newfd; + return (i64) cap_dup_fd(p, oldfd, newfd, true); } static i64 sys_lseek(struct trap_frame *tf, struct sched_task *td) @@ -775,58 +652,60 @@ static i64 sys_lseek(struct trap_frame *tf, struct sched_task *td) if (!p || !validate_fd_number(fd)) return -(i64) EBADF; - u64 fd_flags = proc_fd_lock_irqsave(p); - if (!p->fd_table[fd].is_open) { - proc_fd_unlock_irqrestore(p, fd_flags); + struct cap_ref ref = cap_lookup_fd(p, fd, 0); + if (!ref.ptr) return -(i64) EBADF; - } - if (!p->fd_table[fd].is_seekable) { - proc_fd_unlock_irqrestore(p, fd_flags); - return -(i64) ESPIPE; + struct fd_pool_entry *entry = ref.ptr; + i64 rc; + if (!entry->is_seekable) { + rc = -(i64) ESPIPE; + goto out; } - sz cur = p->fd_table[fd].offset; + sz cur = entry->offset; sz new_off; switch (whence) { case SEEK_SET: if (offset < 0) { - proc_fd_unlock_irqrestore(p, fd_flags); - return -(i64) EINVAL; + rc = -(i64) EINVAL; + goto out; } new_off = (sz) offset; break; case SEEK_CUR: { u64 delta = (offset < 0) ? (u64) (-(offset + 1)) + 1 : (u64) offset; if (offset < 0 && delta > (u64) cur) { - proc_fd_unlock_irqrestore(p, fd_flags); - return -(i64) EINVAL; + rc = -(i64) EINVAL; + goto out; } if (offset > 0 && (u64) cur > (u64) (I64_MAX - offset)) { - proc_fd_unlock_irqrestore(p, fd_flags); - return -(i64) EINVAL; + rc = -(i64) EINVAL; + goto out; } new_off = cur + (sz) offset; break; } case SEEK_END: /* Needs vfs_file_size() - not yet available. 
*/ - proc_fd_unlock_irqrestore(p, fd_flags); - return -(i64) ENOSYS; + rc = -(i64) ENOSYS; + goto out; default: - proc_fd_unlock_irqrestore(p, fd_flags); - return -(i64) EINVAL; + rc = -(i64) EINVAL; + goto out; } /* Guard: new_off must fit in i64 for the return value. */ if (new_off > (sz) I64_MAX) { - proc_fd_unlock_irqrestore(p, fd_flags); - return -(i64) EINVAL; + rc = -(i64) EINVAL; + goto out; } - p->fd_table[fd].offset = new_off; - proc_fd_unlock_irqrestore(p, fd_flags); - return (i64) new_off; + entry->offset = new_off; + rc = (i64) new_off; +out: + cap_put_ref(&ref); + return rc; } static i64 sys_chdir(struct trap_frame *tf, struct sched_task *td) @@ -934,36 +813,20 @@ static i64 sys_pipe(struct trap_frame *tf, struct sched_task *td) if (!pipe) return -(i64) ENOMEM; - u64 fd_flags = proc_fd_lock_irqsave(p); - - /* Find two free FDs. */ - i32 rfd = find_free_fd_locked(p); - i32 wfd = -1; - if (rfd >= 0) { - p->fd_table[rfd].is_open = true; - wfd = find_free_fd_locked(p); - p->fd_table[rfd].is_open = false; - } - if (rfd < 0 || wfd < 0) { - proc_fd_unlock_irqrestore(p, fd_flags); + i32 rfd = cap_open_pipe(p, pipe, true, CAP_RIGHT_READ | CAP_RIGHT_GRANT, -1, + false); + if (rfd < 0) { pipe_close_read(pipe); pipe_close_write(pipe); - return -(i64) EMFILE; + return rfd; + } + i32 wfd = cap_open_pipe(p, pipe, false, CAP_RIGHT_WRITE | CAP_RIGHT_GRANT, + -1, false); + if (wfd < 0) { + proc_close_fd(p, rfd); + pipe_close_write(pipe); + return wfd; } - - p->fd_table[rfd] = (struct proc_fd) { - .is_open = true, - .is_pipe = true, - .pipe_read_end = true, - .pipe = pipe, - }; - p->fd_table[wfd] = (struct proc_fd) { - .is_open = true, - .is_pipe = true, - .pipe = pipe, - }; - - proc_fd_unlock_irqrestore(p, fd_flags); /* Copy FD pair to user-space: fds[0] = read, fds[1] = write. */ i32 kfds[2] = {rfd, wfd}; @@ -1511,13 +1374,9 @@ static i64 sys_fsync_common(struct sched_task *td, i32 fd) struct proc *p = td ? 
td->proc : NULL; if (!p || !validate_fd_number(fd)) return -(i64) EBADF; - u64 flags = proc_fd_lock_irqsave(p); - bool open = p->fd_table[fd].is_open; - bool is_pipe = p->fd_table[fd].is_pipe; - proc_fd_unlock_irqrestore(p, flags); - if (!open) + if (!cap_fd_is_valid(p, fd)) return -(i64) EBADF; - if (is_pipe) + if (cap_fd_is_pipe(p, fd)) return -(i64) EINVAL; return 0; } @@ -1537,49 +1396,88 @@ static i64 sys_fdatasync(struct trap_frame *tf, struct sched_task *td) static i64 sys_mutex_init_h(struct trap_frame *tf __unused, struct sched_task *td __unused) { - i32 h = sync_mutex_alloc(td->proc); - return (i64) h; + i32 object_index = sync_mutex_alloc(td->proc); + if (object_index < 0) + return (i64) object_index; + i32 handle = cap_open_handle(td->proc, (u16) object_index, CAP_TYPE_MUTEX, + CAP_RIGHT_WRITE, -1, false); + if (handle < 0) { + sync_mutex_put_idx(object_index); + return (i64) handle; + } + return (i64) handle; } static i64 sys_mutex_lock_h(struct trap_frame *tf, struct sched_task *td __unused) { i32 handle = (i32) tf->a0; - struct pi_mutex *mtx = sync_mutex_get(handle, td->proc); - if (!mtx) + struct cap_ref ref = + cap_lookup_object(td->proc, handle, CAP_RIGHT_WRITE, CAP_TYPE_MUTEX); + if (!ref.type) return -(i64) EINVAL; - if (thread_cancel_enabled_pending(td)) - return cancel_thread_now(tf, td); - return maybe_cancel_at_cancellation_point( + i64 rc; + struct pi_mutex *mtx = sync_mutex_get((i32) ref.object_index); + if (!mtx) { + rc = -(i64) EINVAL; + goto out; + } + if (thread_cancel_enabled_pending(td)) { + rc = cancel_thread_now(tf, td); + goto out; + } + rc = maybe_cancel_at_cancellation_point( tf, td, (i64) pi_mutex_lock_interruptible(mtx)); +out: + cap_put_ref(&ref); + return rc; } static i64 sys_mutex_trylock_h(struct trap_frame *tf, struct sched_task *td __unused) { i32 handle = (i32) tf->a0; - struct pi_mutex *mtx = sync_mutex_get(handle, td->proc); - if (!mtx) + struct cap_ref ref = + cap_lookup_object(td->proc, handle, CAP_RIGHT_WRITE, 
CAP_TYPE_MUTEX); + if (!ref.type) return -(i64) EINVAL; - return (i64) pi_mutex_trylock(mtx); + struct pi_mutex *mtx = sync_mutex_get((i32) ref.object_index); + i64 rc = mtx ? (i64) pi_mutex_trylock(mtx) : -(i64) EINVAL; + cap_put_ref(&ref); + return rc; } static i64 sys_mutex_unlock_h(struct trap_frame *tf, struct sched_task *td __unused) { i32 handle = (i32) tf->a0; - struct pi_mutex *mtx = sync_mutex_get(handle, td->proc); - if (!mtx) + struct cap_ref ref = + cap_lookup_object(td->proc, handle, CAP_RIGHT_WRITE, CAP_TYPE_MUTEX); + if (!ref.type) return -(i64) EINVAL; + struct pi_mutex *mtx = sync_mutex_get((i32) ref.object_index); + if (!mtx) { + cap_put_ref(&ref); + return -(i64) EINVAL; + } pi_mutex_unlock(mtx); + cap_put_ref(&ref); return 0; } static i64 sys_cond_init_h(struct trap_frame *tf __unused, struct sched_task *td __unused) { - i32 h = sync_condvar_alloc(td->proc); - return (i64) h; + i32 object_index = sync_condvar_alloc(td->proc); + if (object_index < 0) + return (i64) object_index; + i32 handle = cap_open_handle(td->proc, (u16) object_index, CAP_TYPE_CONDVAR, + CAP_RIGHT_WRITE, -1, false); + if (handle < 0) { + sync_condvar_put_idx(object_index); + return (i64) handle; + } + return (i64) handle; } static i64 sys_cond_wait_h(struct trap_frame *tf, @@ -1587,14 +1485,33 @@ static i64 sys_cond_wait_h(struct trap_frame *tf, { i32 cv_h = (i32) tf->a0; i32 mtx_h = (i32) tf->a1; - struct condvar *cv = sync_condvar_get(cv_h, td->proc); - struct pi_mutex *mtx = sync_mutex_get(mtx_h, td->proc); - if (!cv || !mtx) + struct cap_ref cv_ref = + cap_lookup_object(td->proc, cv_h, CAP_RIGHT_WRITE, CAP_TYPE_CONDVAR); + if (!cv_ref.type) return -(i64) EINVAL; - if (thread_cancel_enabled_pending(td)) - return cancel_thread_now(tf, td); - return maybe_cancel_at_cancellation_point(tf, td, - (i64) condvar_wait(cv, mtx)); + struct cap_ref mtx_ref = + cap_lookup_object(td->proc, mtx_h, CAP_RIGHT_WRITE, CAP_TYPE_MUTEX); + if (!mtx_ref.type) { + cap_put_ref(&cv_ref); + return 
-(i64) EINVAL; + } + i64 rc; + struct condvar *cv = sync_condvar_get((i32) cv_ref.object_index); + struct pi_mutex *mtx = sync_mutex_get((i32) mtx_ref.object_index); + if (!cv || !mtx) { + rc = -(i64) EINVAL; + goto out; + } + if (thread_cancel_enabled_pending(td)) { + rc = cancel_thread_now(tf, td); + goto out; + } + rc = + maybe_cancel_at_cancellation_point(tf, td, (i64) condvar_wait(cv, mtx)); +out: + cap_put_ref(&mtx_ref); + cap_put_ref(&cv_ref); + return rc; } /* Convert an absolute CLOCK_MONOTONIC timespec (in user space) to a @@ -1686,75 +1603,130 @@ static i64 sys_cond_timedwait_h(struct trap_frame *tf, i64 rc = timed_wait_abs_to_rel(CLOCK_REALTIME, u_abs_ts, &timeout); if (rc < 0) return rc; - struct condvar *cv = sync_condvar_get(cv_h, td->proc); - struct pi_mutex *mtx = sync_mutex_get(mtx_h, td->proc); - if (!cv || !mtx) + struct cap_ref cv_ref = + cap_lookup_object(td->proc, cv_h, CAP_RIGHT_WRITE, CAP_TYPE_CONDVAR); + if (!cv_ref.type) return -(i64) EINVAL; - if (thread_cancel_enabled_pending(td)) - return cancel_thread_now(tf, td); - return maybe_cancel_at_cancellation_point( + struct cap_ref mtx_ref = + cap_lookup_object(td->proc, mtx_h, CAP_RIGHT_WRITE, CAP_TYPE_MUTEX); + if (!mtx_ref.type) { + cap_put_ref(&cv_ref); + return -(i64) EINVAL; + } + struct condvar *cv = sync_condvar_get((i32) cv_ref.object_index); + struct pi_mutex *mtx = sync_mutex_get((i32) mtx_ref.object_index); + if (!cv || !mtx) { + rc = -(i64) EINVAL; + goto out; + } + if (thread_cancel_enabled_pending(td)) { + rc = cancel_thread_now(tf, td); + goto out; + } + rc = maybe_cancel_at_cancellation_point( tf, td, (i64) condvar_wait_timeout(cv, mtx, timeout)); +out: + cap_put_ref(&mtx_ref); + cap_put_ref(&cv_ref); + return rc; } static i64 sys_cond_signal_h(struct trap_frame *tf, struct sched_task *td __unused) { i32 handle = (i32) tf->a0; - struct condvar *cv = sync_condvar_get(handle, td->proc); - if (!cv) + struct cap_ref ref = + cap_lookup_object(td->proc, handle, CAP_RIGHT_WRITE, 
CAP_TYPE_CONDVAR); + if (!ref.type) return -(i64) EINVAL; - condvar_signal(cv); - return 0; + struct condvar *cv = sync_condvar_get((i32) ref.object_index); + if (cv) + condvar_signal(cv); + cap_put_ref(&ref); + return cv ? 0 : -(i64) EINVAL; } static i64 sys_cond_broadcast_h(struct trap_frame *tf, struct sched_task *td __unused) { i32 handle = (i32) tf->a0; - struct condvar *cv = sync_condvar_get(handle, td->proc); - if (!cv) + struct cap_ref ref = + cap_lookup_object(td->proc, handle, CAP_RIGHT_WRITE, CAP_TYPE_CONDVAR); + if (!ref.type) return -(i64) EINVAL; - condvar_broadcast(cv); - return 0; + struct condvar *cv = sync_condvar_get((i32) ref.object_index); + if (cv) + condvar_broadcast(cv); + cap_put_ref(&ref); + return cv ? 0 : -(i64) EINVAL; } static i64 sys_sem_init_h(struct trap_frame *tf, struct sched_task *td __unused) { i32 initial = (i32) tf->a0; - i32 h = sync_sem_alloc(td->proc, initial); - return (i64) h; + i32 object_index = sync_sem_alloc(td->proc, initial); + if (object_index < 0) + return (i64) object_index; + i32 handle = + cap_open_handle(td->proc, (u16) object_index, CAP_TYPE_SEMAPHORE, + CAP_RIGHT_WRITE, -1, false); + if (handle < 0) { + sync_sem_put_idx(object_index); + return (i64) handle; + } + return (i64) handle; } static i64 sys_sem_wait_h(struct trap_frame *tf, struct sched_task *td __unused) { i32 handle = (i32) tf->a0; - struct semaphore *s = sync_sem_get(handle, td->proc); - if (!s) + struct cap_ref ref = cap_lookup_object(td->proc, handle, CAP_RIGHT_WRITE, + CAP_TYPE_SEMAPHORE); + if (!ref.type) return -(i64) EINVAL; - if (thread_cancel_enabled_pending(td)) - return cancel_thread_now(tf, td); - return maybe_cancel_at_cancellation_point(tf, td, - (i64) sem_wait_interruptible(s)); + i64 rc; + struct semaphore *s = sync_sem_get((i32) ref.object_index); + if (!s) { + rc = -(i64) EINVAL; + goto out; + } + if (thread_cancel_enabled_pending(td)) { + rc = cancel_thread_now(tf, td); + goto out; + } + rc = 
maybe_cancel_at_cancellation_point(tf, td, + (i64) sem_wait_interruptible(s)); +out: + cap_put_ref(&ref); + return rc; } static i64 sys_sem_trywait_h(struct trap_frame *tf, struct sched_task *td __unused) { i32 handle = (i32) tf->a0; - struct semaphore *s = sync_sem_get(handle, td->proc); - if (!s) + struct cap_ref ref = cap_lookup_object(td->proc, handle, CAP_RIGHT_WRITE, + CAP_TYPE_SEMAPHORE); + if (!ref.type) return -(i64) EINVAL; - return (i64) sem_trywait(s); + struct semaphore *s = sync_sem_get((i32) ref.object_index); + i64 rc = s ? (i64) sem_trywait(s) : -(i64) EINVAL; + cap_put_ref(&ref); + return rc; } static i64 sys_sem_post_h(struct trap_frame *tf, struct sched_task *td __unused) { i32 handle = (i32) tf->a0; - struct semaphore *s = sync_sem_get(handle, td->proc); - if (!s) + struct cap_ref ref = cap_lookup_object(td->proc, handle, CAP_RIGHT_WRITE, + CAP_TYPE_SEMAPHORE); + if (!ref.type) return -(i64) EINVAL; - sem_post(s); - return 0; + struct semaphore *s = sync_sem_get((i32) ref.object_index); + if (s) + sem_post(s); + cap_put_ref(&ref); + return s ? 
0 : -(i64) EINVAL; } static i64 sys_sem_timedwait_h(struct trap_frame *tf, @@ -1766,13 +1738,24 @@ static i64 sys_sem_timedwait_h(struct trap_frame *tf, i64 rc = timed_wait_abs_to_rel(CLOCK_REALTIME, u_abs_ts, &timeout); if (rc < 0) return rc; - struct semaphore *s = sync_sem_get(handle, td->proc); - if (!s) + struct cap_ref ref = cap_lookup_object(td->proc, handle, CAP_RIGHT_WRITE, + CAP_TYPE_SEMAPHORE); + if (!ref.type) return -(i64) EINVAL; - if (thread_cancel_enabled_pending(td)) - return cancel_thread_now(tf, td); - return maybe_cancel_at_cancellation_point(tf, td, - (i64) sem_timedwait(s, timeout)); + struct semaphore *s = sync_sem_get((i32) ref.object_index); + if (!s) { + rc = -(i64) EINVAL; + goto out; + } + if (thread_cancel_enabled_pending(td)) { + rc = cancel_thread_now(tf, td); + goto out; + } + rc = maybe_cancel_at_cancellation_point(tf, td, + (i64) sem_timedwait(s, timeout)); +out: + cap_put_ref(&ref); + return rc; } /* --- POSIX barriers (item 15i) --- */ @@ -1781,33 +1764,64 @@ static i64 sys_barrier_init_h(struct trap_frame *tf, struct sched_task *td __unused) { u32 count = (u32) tf->a0; - i32 h = sync_barrier_alloc(td->proc, count); - return (i64) h; + i32 object_index = sync_barrier_alloc(td->proc, count); + if (object_index < 0) + return (i64) object_index; + i32 handle = cap_open_handle(td->proc, (u16) object_index, CAP_TYPE_BARRIER, + CAP_RIGHT_WRITE, -1, false); + if (handle < 0) { + sync_barrier_put_idx(object_index); + return (i64) handle; + } + return (i64) handle; } static i64 sys_barrier_wait_h(struct trap_frame *tf, struct sched_task *td __unused) { i32 handle = (i32) tf->a0; - struct barrier *b = sync_barrier_get(handle, td->proc); - if (!b) + struct cap_ref ref = + cap_lookup_object(td->proc, handle, CAP_RIGHT_WRITE, CAP_TYPE_BARRIER); + if (!ref.type) return -(i64) EINVAL; - if (thread_cancel_enabled_pending(td)) - return cancel_thread_now(tf, td); - return maybe_cancel_at_cancellation_point( + i64 rc; + struct barrier *b = 
sync_barrier_get((i32) ref.object_index); + if (!b) { + rc = -(i64) EINVAL; + goto out; + } + if (thread_cancel_enabled_pending(td)) { + rc = cancel_thread_now(tf, td); + goto out; + } + rc = maybe_cancel_at_cancellation_point( tf, td, (i64) barrier_wait_interruptible(b)); +out: + cap_put_ref(&ref); + return rc; } static i64 sys_barrier_destroy_h(struct trap_frame *tf, struct sched_task *td __unused) { i32 handle = (i32) tf->a0; - struct barrier *b = sync_barrier_get(handle, td->proc); - if (!b) + /* Pin across barrier_destroy so a concurrent destroy cannot free the + * pool entry while we still operate on the primitive. + */ + struct cap_ref ref = + cap_lookup_object(td->proc, handle, CAP_RIGHT_WRITE, CAP_TYPE_BARRIER); + if (!ref.type) + return -(i64) EINVAL; + struct cap_slot_view slot = cap_slot_read(td->proc, handle); + struct barrier *b = sync_barrier_get((i32) ref.object_index); + if (!b || !slot.valid) { + cap_put_ref(&ref); return -(i64) EINVAL; + } i32 rc = barrier_destroy(b); + cap_put_ref(&ref); if (rc == 0) - sync_barrier_free(handle, td->proc); + return cap_drop_token(td->proc, cap_make_handle(&slot)); return (i64) rc; } @@ -1816,65 +1830,109 @@ static i64 sys_barrier_destroy_h(struct trap_frame *tf, static i64 sys_rwlock_init_h(struct trap_frame *tf __unused, struct sched_task *td __unused) { - i32 h = sync_rwlock_alloc(td->proc); - return (i64) h; + i32 object_index = sync_rwlock_alloc(td->proc); + if (object_index < 0) + return (i64) object_index; + i32 handle = cap_open_handle(td->proc, (u16) object_index, CAP_TYPE_RWLOCK, + CAP_RIGHT_WRITE, -1, false); + if (handle < 0) { + sync_rwlock_put_idx(object_index); + return (i64) handle; + } + return (i64) handle; } static i64 sys_rwlock_rdlock_h(struct trap_frame *tf, struct sched_task *td __unused) { i32 handle = (i32) tf->a0; - struct rwlock *rw = sync_rwlock_get(handle, td->proc); - if (!rw) + struct cap_ref ref = + cap_lookup_object(td->proc, handle, CAP_RIGHT_WRITE, CAP_TYPE_RWLOCK); + if 
(!ref.type) return -(i64) EINVAL; - if (thread_cancel_enabled_pending(td)) - return cancel_thread_now(tf, td); - return maybe_cancel_at_cancellation_point( + i64 rc; + struct rwlock *rw = sync_rwlock_get((i32) ref.object_index); + if (!rw) { + rc = -(i64) EINVAL; + goto out; + } + if (thread_cancel_enabled_pending(td)) { + rc = cancel_thread_now(tf, td); + goto out; + } + rc = maybe_cancel_at_cancellation_point( tf, td, (i64) rwlock_rdlock_interruptible(rw)); +out: + cap_put_ref(&ref); + return rc; } static i64 sys_rwlock_wrlock_h(struct trap_frame *tf, struct sched_task *td __unused) { i32 handle = (i32) tf->a0; - struct rwlock *rw = sync_rwlock_get(handle, td->proc); - if (!rw) + struct cap_ref ref = + cap_lookup_object(td->proc, handle, CAP_RIGHT_WRITE, CAP_TYPE_RWLOCK); + if (!ref.type) return -(i64) EINVAL; - if (thread_cancel_enabled_pending(td)) - return cancel_thread_now(tf, td); - return maybe_cancel_at_cancellation_point( + i64 rc; + struct rwlock *rw = sync_rwlock_get((i32) ref.object_index); + if (!rw) { + rc = -(i64) EINVAL; + goto out; + } + if (thread_cancel_enabled_pending(td)) { + rc = cancel_thread_now(tf, td); + goto out; + } + rc = maybe_cancel_at_cancellation_point( tf, td, (i64) rwlock_wrlock_interruptible(rw)); +out: + cap_put_ref(&ref); + return rc; } static i64 sys_rwlock_tryrdlock_h(struct trap_frame *tf, struct sched_task *td __unused) { i32 handle = (i32) tf->a0; - struct rwlock *rw = sync_rwlock_get(handle, td->proc); - if (!rw) + struct cap_ref ref = + cap_lookup_object(td->proc, handle, CAP_RIGHT_WRITE, CAP_TYPE_RWLOCK); + if (!ref.type) return -(i64) EINVAL; - return (i64) rwlock_tryrdlock(rw); + struct rwlock *rw = sync_rwlock_get((i32) ref.object_index); + i64 rc = rw ? 
(i64) rwlock_tryrdlock(rw) : -(i64) EINVAL; + cap_put_ref(&ref); + return rc; } static i64 sys_rwlock_trywrlock_h(struct trap_frame *tf, struct sched_task *td __unused) { i32 handle = (i32) tf->a0; - struct rwlock *rw = sync_rwlock_get(handle, td->proc); - if (!rw) + struct cap_ref ref = + cap_lookup_object(td->proc, handle, CAP_RIGHT_WRITE, CAP_TYPE_RWLOCK); + if (!ref.type) return -(i64) EINVAL; - return (i64) rwlock_trywrlock(rw); + struct rwlock *rw = sync_rwlock_get((i32) ref.object_index); + i64 rc = rw ? (i64) rwlock_trywrlock(rw) : -(i64) EINVAL; + cap_put_ref(&ref); + return rc; } static i64 sys_rwlock_unlock_h(struct trap_frame *tf, struct sched_task *td __unused) { i32 handle = (i32) tf->a0; - struct rwlock *rw = sync_rwlock_get(handle, td->proc); - if (!rw) + struct cap_ref ref = + cap_lookup_object(td->proc, handle, CAP_RIGHT_WRITE, CAP_TYPE_RWLOCK); + if (!ref.type) return -(i64) EINVAL; - rwlock_unlock(rw); - return 0; + struct rwlock *rw = sync_rwlock_get((i32) ref.object_index); + if (rw) + rwlock_unlock(rw); + cap_put_ref(&ref); + return rw ? 
0 : -(i64) EINVAL; } static i64 sys_rwlock_timedrdlock_h(struct trap_frame *tf, @@ -1886,13 +1944,24 @@ static i64 sys_rwlock_timedrdlock_h(struct trap_frame *tf, i64 rc = timed_wait_abs_to_rel(CLOCK_REALTIME, u_abs_ts, &timeout); if (rc < 0) return rc; - struct rwlock *rw = sync_rwlock_get(handle, td->proc); - if (!rw) + struct cap_ref ref = + cap_lookup_object(td->proc, handle, CAP_RIGHT_WRITE, CAP_TYPE_RWLOCK); + if (!ref.type) return -(i64) EINVAL; - if (thread_cancel_enabled_pending(td)) - return cancel_thread_now(tf, td); - return maybe_cancel_at_cancellation_point( + struct rwlock *rw = sync_rwlock_get((i32) ref.object_index); + if (!rw) { + rc = -(i64) EINVAL; + goto out; + } + if (thread_cancel_enabled_pending(td)) { + rc = cancel_thread_now(tf, td); + goto out; + } + rc = maybe_cancel_at_cancellation_point( tf, td, (i64) rwlock_timedrdlock(rw, timeout)); +out: + cap_put_ref(&ref); + return rc; } static i64 sys_rwlock_timedwrlock_h(struct trap_frame *tf, @@ -1904,25 +1973,47 @@ static i64 sys_rwlock_timedwrlock_h(struct trap_frame *tf, i64 rc = timed_wait_abs_to_rel(CLOCK_REALTIME, u_abs_ts, &timeout); if (rc < 0) return rc; - struct rwlock *rw = sync_rwlock_get(handle, td->proc); - if (!rw) + struct cap_ref ref = + cap_lookup_object(td->proc, handle, CAP_RIGHT_WRITE, CAP_TYPE_RWLOCK); + if (!ref.type) return -(i64) EINVAL; - if (thread_cancel_enabled_pending(td)) - return cancel_thread_now(tf, td); - return maybe_cancel_at_cancellation_point( + struct rwlock *rw = sync_rwlock_get((i32) ref.object_index); + if (!rw) { + rc = -(i64) EINVAL; + goto out; + } + if (thread_cancel_enabled_pending(td)) { + rc = cancel_thread_now(tf, td); + goto out; + } + rc = maybe_cancel_at_cancellation_point( tf, td, (i64) rwlock_timedwrlock(rw, timeout)); +out: + cap_put_ref(&ref); + return rc; } static i64 sys_rwlock_destroy_h(struct trap_frame *tf, struct sched_task *td __unused) { i32 handle = (i32) tf->a0; - struct rwlock *rw = sync_rwlock_get(handle, td->proc); - if 
(!rw) + /* Pin across rwlock_destroy: a concurrent destroy must not free the + * pool entry under us. + */ + struct cap_ref ref = + cap_lookup_object(td->proc, handle, CAP_RIGHT_WRITE, CAP_TYPE_RWLOCK); + if (!ref.type) + return -(i64) EINVAL; + struct cap_slot_view slot = cap_slot_read(td->proc, handle); + struct rwlock *rw = sync_rwlock_get((i32) ref.object_index); + if (!rw || !slot.valid) { + cap_put_ref(&ref); return -(i64) EINVAL; + } i32 rc = rwlock_destroy(rw); + cap_put_ref(&ref); if (rc == 0) - sync_rwlock_free(handle, td->proc); + return cap_drop_token(td->proc, cap_make_handle(&slot)); return (i64) rc; } @@ -1933,52 +2024,74 @@ static i64 sys_mq_open(struct trap_frame *tf, struct sched_task *td) u32 max_msgs = (u32) tf->a0; sz max_msg_size = (sz) tf->a1; struct proc *p = td ? td->proc : NULL; - return (i64) mqueue_open(p, max_msgs, max_msg_size); + i32 object_index = mqueue_open(p, max_msgs, max_msg_size); + if (object_index < 0) + return (i64) object_index; + i32 handle = cap_open_handle(p, (u16) object_index, CAP_TYPE_MQUEUE, + CAP_RIGHT_READ | CAP_RIGHT_WRITE, -1, false); + if (handle < 0) { + (void) mqueue_close(object_index); + return (i64) handle; + } + return (i64) handle; } static i64 sys_mq_close(struct trap_frame *tf, struct sched_task *td) { i32 handle = (i32) tf->a0; - struct proc *p = td ? td->proc : NULL; - if (!mqueue_check_owner(handle, p)) - return -(i64) EPERM; - return (i64) mqueue_close(handle); + struct cap_slot_view slot; + if (!cap_lookup_slot(td->proc, handle, 0, CAP_TYPE_MQUEUE, &slot)) + return -(i64) EINVAL; + return cap_drop_token(td->proc, cap_make_handle(&slot)); } static i64 sys_mq_send(struct trap_frame *tf, struct sched_task *td) { i32 handle = (i32) tf->a0; - struct proc *p = td ? 
td->proc : NULL; - if (!mqueue_check_owner(handle, p)) - return -(i64) EPERM; + struct cap_ref ref = + cap_lookup_object(td->proc, handle, CAP_RIGHT_WRITE, CAP_TYPE_MQUEUE); + if (!ref.type) + return -(i64) EINVAL; ptr u_msg = (ptr) tf->a1; sz len = (sz) tf->a2; u32 priority = (u32) tf->a3; + i64 rc; - if (len <= 0 || len > MQ_MAX_MSG_SIZE) - return -(i64) EMSGSIZE; + if (len <= 0 || len > MQ_MAX_MSG_SIZE) { + rc = -(i64) EMSGSIZE; + goto out; + } u8 kbuf[MQ_MAX_MSG_SIZE]; - i64 rc = copy_from_user(kbuf, u_msg, len); + rc = copy_from_user(kbuf, u_msg, len); if (rc < 0) - return rc; + goto out; - if (thread_cancel_enabled_pending(td)) - return cancel_thread_now(tf, td); - return maybe_cancel_at_cancellation_point( - tf, td, (i64) mqueue_send(handle, kbuf, len, priority)); + if (thread_cancel_enabled_pending(td)) { + rc = cancel_thread_now(tf, td); + goto out; + } + rc = maybe_cancel_at_cancellation_point( + tf, td, (i64) mqueue_send((i32) ref.object_index, kbuf, len, priority)); +out: + cap_put_ref(&ref); + return rc; } -static i64 sys_mq_receive(struct trap_frame *tf, struct sched_task *td) +/* Shared receive body for sys_mq_{receive,timedreceive}. timed selects + * mqueue_timedreceive (which honors timeout); plain selects mqueue_receive + * (blocks indefinitely). On entry, ref is the pinned mqueue cap; the + * caller releases it on return. + */ +static i64 mq_receive_common(struct trap_frame *tf, + struct sched_task *td, + struct cap_ref *ref, + ptr u_buf, + sz buf_size, + ptr u_prio, + bool timed, + struct time_ms timeout) { - i32 handle = (i32) tf->a0; - struct proc *p = td ? 
td->proc : NULL; - if (!mqueue_check_owner(handle, p)) - return -(i64) EPERM; - ptr u_buf = (ptr) tf->a1; - sz buf_size = (sz) tf->a2; - ptr u_prio = (ptr) tf->a3; - if (buf_size <= 0 || buf_size > MQ_MAX_MSG_SIZE) return -(i64) EINVAL; @@ -1987,7 +2100,10 @@ static i64 sys_mq_receive(struct trap_frame *tf, struct sched_task *td) u8 kbuf[MQ_MAX_MSG_SIZE]; u32 prio = 0; - i32 ret = mqueue_receive(handle, kbuf, buf_size, &prio); + i32 ret = + timed ? mqueue_timedreceive((i32) ref->object_index, kbuf, buf_size, + &prio, timeout) + : mqueue_receive((i32) ref->object_index, kbuf, buf_size, &prio); if (ret < 0) return maybe_cancel_at_cancellation_point(tf, td, (i64) ret); @@ -2002,43 +2118,36 @@ static i64 sys_mq_receive(struct trap_frame *tf, struct sched_task *td) return (i64) ret; } -static i64 sys_mq_timedreceive(struct trap_frame *tf, struct sched_task *td) +static i64 sys_mq_receive(struct trap_frame *tf, struct sched_task *td) { i32 handle = (i32) tf->a0; - struct proc *p = td ? td->proc : NULL; - if (!mqueue_check_owner(handle, p)) - return -(i64) EPERM; - ptr u_buf = (ptr) tf->a1; - sz buf_size = (sz) tf->a2; - ptr u_prio = (ptr) tf->a3; - ptr u_abs_ts = (ptr) tf->a4; - - if (buf_size <= 0 || buf_size > MQ_MAX_MSG_SIZE) + struct cap_ref ref = + cap_lookup_object(td->proc, handle, CAP_RIGHT_READ, CAP_TYPE_MQUEUE); + if (!ref.type) return -(i64) EINVAL; + i64 rc = mq_receive_common(tf, td, &ref, (ptr) tf->a1, (sz) tf->a2, + (ptr) tf->a3, false, (struct time_ms) {0}); + cap_put_ref(&ref); + return rc; +} +static i64 sys_mq_timedreceive(struct trap_frame *tf, struct sched_task *td) +{ + i32 handle = (i32) tf->a0; + struct cap_ref ref = + cap_lookup_object(td->proc, handle, CAP_RIGHT_READ, CAP_TYPE_MQUEUE); + if (!ref.type) + return -(i64) EINVAL; struct time_ms timeout; - i64 trc = timed_wait_abs_to_rel(CLOCK_REALTIME, u_abs_ts, &timeout); - if (trc < 0) - return trc; - - if (thread_cancel_enabled_pending(td)) - return cancel_thread_now(tf, td); - - u8 
kbuf[MQ_MAX_MSG_SIZE]; - u32 prio = 0; - i32 ret = mqueue_timedreceive(handle, kbuf, buf_size, &prio, timeout); - if (ret < 0) - return maybe_cancel_at_cancellation_point(tf, td, (i64) ret); - - i64 rc = copy_to_user(u_buf, kbuf, (sz) ret); - if (rc < 0) + i64 rc = timed_wait_abs_to_rel(CLOCK_REALTIME, (ptr) tf->a4, &timeout); + if (rc < 0) { + cap_put_ref(&ref); return rc; - if (u_prio) { - rc = copy_to_user(u_prio, &prio, sizeof(prio)); - if (rc < 0) - return rc; } - return (i64) ret; + rc = mq_receive_common(tf, td, &ref, (ptr) tf->a1, (sz) tf->a2, + (ptr) tf->a3, true, timeout); + cap_put_ref(&ref); + return rc; } /* --- PSE51 scheduling syscalls (item 17) --- */ @@ -2110,14 +2219,32 @@ static i64 sys_kill_h(struct trap_frame *tf, struct sched_task *td __unused) /* Forward declaration; defined alongside the other thread syscalls * later in the file. */ -static struct sched_task *thread_find_by_tid_locked(struct proc *p, u16 tid); +static bool thread_lookup_cap(struct proc *p, + u64 handle, + u8 required_rights, + struct cap_slot_view *out) +{ + return cap_lookup_token(p, handle, required_rights, CAP_TYPE_THREAD, out); +} + +static struct sched_task *thread_from_cap_locked( + struct proc *p, + const struct cap_slot_view *slot) +{ + if (!p || !slot || slot->object_index >= PROC_THREAD_MAX) + return NULL; + struct sched_task *target = p->tasks[slot->object_index]; + if (!target || target->td_cap_slot != (i16) slot->slot_index) + return NULL; + return target; +} static bool thread_target_is_live(const struct sched_task *target); /* pthread_kill: thread-directed signal delivery within the calling * proc. Differs from kill(): the bit lands on a specific thread's * td_sig.pending rather than the per-proc proc_pending mask, so the - * signal targets exactly that thread. Unknown TID -> ESRCH; signo==0 + * signal targets exactly that thread. Unknown thread handle -> ESRCH; signo==0 * is an existence check. 
SIGKILL is process-wide by definition and * is rejected with EINVAL since pthread_kill targeting a single * thread cannot meaningfully forward it. @@ -2126,7 +2253,7 @@ static i64 sys_pthread_kill_h(struct trap_frame *tf, struct sched_task *td) { if (!td || !td->proc) return -(i64) EPERM; - u16 tid = (u16) tf->a0; + u64 handle = tf->a0; i32 signo = (i32) tf->a1; struct proc *p = td->proc; @@ -2135,10 +2262,12 @@ static i64 sys_pthread_kill_h(struct trap_frame *tf, struct sched_task *td) if (signo == SIGKILL) return -(i64) EINVAL; - struct sched_task *target = NULL; + struct cap_slot_view cap_slot; + if (!thread_lookup_cap(p, handle, CAP_RIGHT_WRITE, &cap_slot)) + return -(i64) ESRCH; u64 tflags = proc_table_lock_irqsave(); - target = thread_find_by_tid_locked(p, tid); + struct sched_task *target = thread_from_cap_locked(p, &cap_slot); if (!thread_target_is_live(target)) { proc_table_unlock_irqrestore(tflags); return -(i64) ESRCH; @@ -2479,7 +2608,7 @@ static i64 sys_sigprocmask_h(struct trap_frame *tf, struct sched_task *td) /* PSE51 user threads. PROC_THREAD_MAX bounds the per-process task * list; sched_create_user_thread allocates a new task in the calling * proc, runs it on its own per-thread stack inside the proc VA - * window, and returns the kernel TID via td->id. Lifecycle: + * window, and returns the CAP_TYPE_THREAD slot index. Lifecycle: * * create -> JOINABLE * detach -> DETACHED (no one will join; auto-cleans on exit) @@ -2513,23 +2642,7 @@ static i64 sys_thread_create_h(struct trap_frame *tf, struct sched_task *td) inherited_sigmask, &new_td); if (rc < 0) return (i64) rc; - return (i64) new_td->id; -} - -/* Find the proc-attached thread with the given kernel TID. Caller - * must hold proc_table_lock. Returns NULL if no such thread is - * attached (it may have been detached and reaped already). 
- */ -static struct sched_task *thread_find_by_tid_locked(struct proc *p, u16 tid) -{ - if (!p) - return NULL; - for (u8 i = 0; i < PROC_THREAD_MAX; i++) { - struct sched_task *t = p->tasks[i]; - if (t && t->id == tid) - return t; - } - return NULL; + return cap_get_token(td->proc, new_td->td_cap_slot, CAP_TYPE_THREAD); } static bool thread_target_is_live(const struct sched_task *target) @@ -2560,17 +2673,19 @@ static bool thread_claim_reap(struct sched_task *target) __ATOMIC_RELAXED); } -static bool thread_join_wait_done_locked(struct proc *p, u16 tid) +static bool thread_join_wait_done_locked(struct proc *p, u16 task_slot) { - struct sched_task *target = thread_find_by_tid_locked(p, tid); + if (!p || task_slot >= PROC_THREAD_MAX) + return true; + struct sched_task *target = p->tasks[task_slot]; return !target || target->td_join_state != TD_JOIN_JOINABLE; } -static bool thread_join_wait_done(struct proc *p, u16 tid) +static bool thread_join_wait_done(struct proc *p, u16 task_slot) { u64 flags = proc_table_lock_irqsave(); - bool done = thread_join_wait_done_locked(p, tid); + bool done = thread_join_wait_done_locked(p, task_slot); proc_table_unlock_irqrestore(flags); return done; } @@ -2579,13 +2694,16 @@ static i64 sys_thread_join_h(struct trap_frame *tf, struct sched_task *td) { if (!td || !td->proc) return -(i64) EPERM; - u16 tid = (u16) tf->a0; + u64 handle = tf->a0; ptr u_exit_code = (ptr) tf->a1; struct proc *p = td->proc; - /* pthread_join(self) -> EDEADLK per POSIX. 
*/ - if (tid == td->id) + struct cap_slot_view thread_slot; + if (!thread_lookup_cap(p, handle, CAP_RIGHT_READ, &thread_slot)) + return -(i64) ESRCH; + if (thread_slot.slot_index == (u8) td->td_cap_slot) return -(i64) EDEADLK; + u16 task_slot = thread_slot.object_index; for (;;) { struct sched_task *target = NULL; @@ -2593,7 +2711,7 @@ static i64 sys_thread_join_h(struct trap_frame *tf, struct sched_task *td) i32 exit_code = 0; u64 pflags = proc_table_lock_irqsave(); - target = thread_find_by_tid_locked(p, tid); + target = thread_from_cap_locked(p, &thread_slot); if (target) { join_state = (i32) __atomic_load_n((u8 *) &target->td_join_state, __ATOMIC_ACQUIRE); @@ -2605,8 +2723,11 @@ static i64 sys_thread_join_h(struct trap_frame *tf, struct sched_task *td) } exit_code = target->td_exit_code; if (thread_claim_reap(target)) { - proc_reap_exited_thread_locked(p, target); + i64 thread_token = + proc_reap_exited_thread_locked(p, target); proc_table_unlock_irqrestore(pflags); + if (thread_token >= 0) + (void) cap_drop_token(p, (u64) thread_token); wake_up(&p->thread_event_wq, I32_MAX); sched_reap_user_thread(target); if (u_exit_code) { @@ -2636,8 +2757,8 @@ static i64 sys_thread_join_h(struct trap_frame *tf, struct sched_task *td) * would let a concurrent free race this dereference. */ enum wait_unblock_reason reason; - wait_event_reason(p->thread_event_wq, thread_join_wait_done(p, tid), - reason); + wait_event_reason(p->thread_event_wq, + thread_join_wait_done(p, task_slot), reason); if (wait_unblock_is_terminal(reason)) return -(i64) EINTR; /* Loop back: re-take the locks and observe the new state. 
*/ @@ -2648,11 +2769,15 @@ static i64 sys_thread_detach_h(struct trap_frame *tf, struct sched_task *td) { if (!td || !td->proc) return -(i64) EPERM; - u16 tid = (u16) tf->a0; + u64 handle = tf->a0; struct proc *p = td->proc; + struct cap_slot_view thread_slot; + if (!thread_lookup_cap(p, handle, CAP_RIGHT_WRITE, &thread_slot)) + return -(i64) ESRCH; + u64 pflags = proc_table_lock_irqsave(); - struct sched_task *target = thread_find_by_tid_locked(p, tid); + struct sched_task *target = thread_from_cap_locked(p, &thread_slot); if (!target) { proc_table_unlock_irqrestore(pflags); return -(i64) ESRCH; @@ -2668,8 +2793,9 @@ static i64 sys_thread_detach_h(struct trap_frame *tf, struct sched_task *td) bool claimed_reap = false; if (!claimed_join) claimed_reap = thread_claim_reap(target); + i64 thread_token = -(i64) EBADF; if (claimed_reap) - proc_reap_exited_thread_locked(p, target); + thread_token = proc_reap_exited_thread_locked(p, target); proc_table_unlock_irqrestore(pflags); if (claimed_join) { @@ -2681,6 +2807,8 @@ static i64 sys_thread_detach_h(struct trap_frame *tf, struct sched_task *td) return 0; } if (claimed_reap) { + if (thread_token >= 0) + (void) cap_drop_token(p, (u64) thread_token); /* Target already exited; this caller wins the reap. */ wake_up(&p->thread_event_wq, I32_MAX); sched_reap_user_thread(target); @@ -2732,7 +2860,7 @@ static i64 sys_thread_self_h(struct trap_frame *tf __unused, { if (!td) return -(i64) EPERM; - return (i64) td->id; + return cap_get_token(td->proc, td->td_cap_slot, CAP_TYPE_THREAD); } /* pthread_cancel(tid): mark the target thread cancellation-pending. 
@@ -2744,11 +2872,15 @@ static i64 sys_thread_cancel_h(struct trap_frame *tf, struct sched_task *td) { if (!td || !td->proc) return -(i64) EPERM; - u16 tid = (u16) tf->a0; + u64 handle = tf->a0; struct proc *p = td->proc; + struct cap_slot_view thread_slot; + if (!thread_lookup_cap(p, handle, CAP_RIGHT_WRITE, &thread_slot)) + return -(i64) ESRCH; + u64 pflags = proc_table_lock_irqsave(); - struct sched_task *target = thread_find_by_tid_locked(p, tid); + struct sched_task *target = thread_from_cap_locked(p, &thread_slot); bool target_live = thread_target_is_live(target); if (target_live) __atomic_store_n(&target->td_cancel_pending, true, __ATOMIC_RELAXED); @@ -2813,7 +2945,7 @@ static i64 sys_thread_setschedparam_h(struct trap_frame *tf, { if (!td || !td->proc) return -(i64) EPERM; - u16 tid = (u16) tf->a0; + u64 handle = tf->a0; i32 new_prio = (i32) tf->a1; if (new_prio < SCHED_PRIO_IDLE || new_prio >= CONFIG_SCHED_NPRIO) @@ -2822,9 +2954,17 @@ static i64 sys_thread_setschedparam_h(struct trap_frame *tf, return -(i64) EPERM; struct sched_task *target = td; - if (tid != 0 && tid != td->id) { + if (handle != 0) { + struct cap_slot_view thread_slot; + if (!thread_lookup_cap(td->proc, handle, CAP_RIGHT_WRITE, &thread_slot)) + return -(i64) ESRCH; + if (thread_slot.slot_index == (u8) td->td_cap_slot) { + td->td_base_prio = (u8) new_prio; + pi_mutex_refresh_prio(td); + return 0; + } u64 pflags = proc_table_lock_irqsave(); - target = thread_find_by_tid_locked(td->proc, tid); + target = thread_from_cap_locked(td->proc, &thread_slot); if (target) { target->td_base_prio = (u8) new_prio; pi_mutex_refresh_prio(target); @@ -2844,12 +2984,17 @@ static i64 sys_thread_getschedparam_h(struct trap_frame *tf, { if (!td || !td->proc) return -(i64) EPERM; - u16 tid = (u16) tf->a0; - if (tid == 0 || tid == td->id) + u64 handle = tf->a0; + if (handle == 0) return (i64) td->td_base_prio; + struct cap_slot_view thread_slot; + if (!thread_lookup_cap(td->proc, handle, CAP_RIGHT_READ, 
&thread_slot)) + return -(i64) ESRCH; + if (thread_slot.slot_index == (u8) td->td_cap_slot) + return (i64) td->td_base_prio; u64 pflags = proc_table_lock_irqsave(); - struct sched_task *target = thread_find_by_tid_locked(td->proc, tid); + struct sched_task *target = thread_from_cap_locked(td->proc, &thread_slot); i64 result = target ? (i64) target->td_base_prio : -(i64) ESRCH; proc_table_unlock_irqrestore(pflags); return result; @@ -2931,7 +3076,16 @@ static i64 sys_timer_create_h(struct trap_frame *tf __unused, { if (!td || !td->proc) return -(i64) EPERM; - return (i64) posix_timer_create(td->proc); + i32 object_index = posix_timer_alloc(td->proc); + if (object_index < 0) + return (i64) object_index; + i32 handle = cap_open_timer(td->proc, (u16) object_index, + CAP_RIGHT_READ | CAP_RIGHT_WRITE, -1, false); + if (handle < 0) { + posix_timer_put_idx((u16) object_index); + return (i64) handle; + } + return (i64) handle; } static i64 sys_timer_settime_h(struct trap_frame *tf, struct sched_task *td) @@ -2947,30 +3101,90 @@ static i64 sys_timer_settime_h(struct trap_frame *tf, struct sched_task *td) * proc up front so misconfigured timers fail at settime, not at * silent expiry. 
*/ - u16 target_tid = (u16) tf->a3; - return (i64) posix_timer_settime(handle, td->proc, value_ms, interval_ms, - target_tid); + u16 target_tid = 0; + if (tf->a3 != 0) { + struct cap_slot_view target_slot; + if (!thread_lookup_cap(td->proc, tf->a3, CAP_RIGHT_READ, &target_slot)) + return -(i64) ESRCH; + + u64 pflags = proc_table_lock_irqsave(); + struct sched_task *target = + thread_from_cap_locked(td->proc, &target_slot); + bool live = thread_target_is_live(target); + if (live) + target_tid = target->id; + proc_table_unlock_irqrestore(pflags); + if (!live) + return -(i64) ESRCH; + } + struct cap_ref ref = cap_lookup_timer(td->proc, handle, CAP_RIGHT_WRITE); + if (!ref.ptr) + return -(i64) EINVAL; + i64 rc = (i64) posix_timer_settime_idx(ref.object_index, value_ms, + interval_ms, target_tid); + cap_put_ref(&ref); + return rc; } static i64 sys_timer_delete_h(struct trap_frame *tf, struct sched_task *td) { if (!td || !td->proc) return -(i64) EPERM; - return (i64) posix_timer_delete((i32) tf->a0, td->proc); + i32 handle = (i32) tf->a0; + struct cap_slot_view slot = cap_slot_read(td->proc, handle); + if (!slot.valid || slot.type != CAP_TYPE_TIMER) + return -(i64) EINVAL; + return cap_drop_token(td->proc, cap_make_handle(&slot)); } static i64 sys_timer_gettime_h(struct trap_frame *tf, struct sched_task *td) { if (!td || !td->proc) return -(i64) EPERM; - return (i64) posix_timer_gettime((i32) tf->a0, td->proc); + struct cap_ref ref = + cap_lookup_timer(td->proc, (i32) tf->a0, CAP_RIGHT_READ); + if (!ref.ptr) + return -(i64) EINVAL; + i64 rc = posix_timer_gettime_idx(ref.object_index); + cap_put_ref(&ref); + return rc; } static i64 sys_timer_getoverrun_h(struct trap_frame *tf, struct sched_task *td) { if (!td || !td->proc) return -(i64) EPERM; - return (i64) posix_timer_getoverrun((i32) tf->a0, td->proc); + struct cap_ref ref = + cap_lookup_timer(td->proc, (i32) tf->a0, CAP_RIGHT_READ); + if (!ref.ptr) + return -(i64) EINVAL; + i64 rc = 
posix_timer_getoverrun_idx(ref.object_index); + cap_put_ref(&ref); + return rc; +} + +static i64 sys_cap_drop_h(struct trap_frame *tf, struct sched_task *td) +{ + return cap_drop_token(td->proc, tf->a0); +} + +static i64 sys_cap_transfer_h(struct trap_frame *tf, struct sched_task *td) +{ + i64 raw_pid = (i64) tf->a0; + if (raw_pid <= 0 || raw_pid > (i64) U16_MAX) + return -(i64) EINVAL; + return cap_transfer(td->proc, (u16) raw_pid, tf->a1, (u8) tf->a2); +} + +static i64 sys_cap_revoke_delegate_h(struct trap_frame *tf, + struct sched_task *td) +{ + return cap_revoke_delegate(td->proc, tf->a0); +} + +static i64 sys_cap_get_token_h(struct trap_frame *tf, struct sched_task *td) +{ + return cap_get_token(td->proc, (i32) tf->a0, (u8) tf->a1); } typedef i64 (*syscall_fn_t)(struct trap_frame *tf, struct sched_task *td); @@ -3106,6 +3320,11 @@ static const struct syscall_entry syscall_table[SYS_NR] = { [SYS_TIMER_DELETE] = {sys_timer_delete_h, SYSCALL_F_NEEDS_PROC}, [SYS_TIMER_GETTIME] = {sys_timer_gettime_h, SYSCALL_F_NEEDS_PROC}, [SYS_TIMER_GETOVERRUN] = {sys_timer_getoverrun_h, SYSCALL_F_NEEDS_PROC}, + [SYS_CAP_DROP] = {sys_cap_drop_h, SYSCALL_F_NEEDS_PROC}, + [SYS_CAP_TRANSFER] = {sys_cap_transfer_h, SYSCALL_F_NEEDS_PROC}, + [SYS_CAP_REVOKE_DELEGATE] = {sys_cap_revoke_delegate_h, + SYSCALL_F_NEEDS_PROC}, + [SYS_CAP_GET_TOKEN] = {sys_cap_get_token_h, SYSCALL_F_NEEDS_PROC}, }; /* Security counters, global and irq-safe via atomics. 
*/ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c3253f1..419d581 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -1132,6 +1133,7 @@ static struct sched_task *sched_task_alloc_init(void) task->td_exit_code = 0; init_waitqueue_head(&task->td_join_wq); task->td_exit_started = false; + task->td_cap_slot = -1; #if CONFIG_SCHED_DEADLINE sched_dl_task_init(task); @@ -1242,6 +1244,11 @@ static vaddr_t user_thread_stack_top(struct proc *p, u8 idx) (u64) idx * (USER_STACK_SIZE + PAGE_SIZE)); } +static inline i32 user_thread_cap_slot(u8 task_slot) +{ + return CAP_SPACE_SLOTS - PROC_THREAD_MAX + (i32) task_slot; +} + static void rollback_user_thread_stack(struct proc *p, ptr stack_bottom, vaddr_t stack_top_va) @@ -1322,6 +1329,23 @@ struct result sched_create_user_task(struct proc *p, ptr entry, u8 prio) } } + u8 thread_slot = proc_task_slot(p, task); + i32 thread_handle = cap_open_handle( + p, thread_slot, CAP_TYPE_THREAD, CAP_RIGHT_READ | CAP_RIGHT_WRITE, + user_thread_cap_slot(thread_slot), true); + if (thread_handle < 0) { + u64 pflags = proc_table_lock_irqsave(); + i64 token = proc_reap_exited_thread_locked(p, task); + proc_table_unlock_irqrestore(pflags); + if (token >= 0) + (void) cap_drop_token(p, (u64) token); + paging_map_page((vaddr_t) task->guard, (paddr_t) task->guard, + PT_FLAG_RW); + kvalloc_free(byte_array_new((void *) task, sizeof(*task))); + return result_error((u16) (-thread_handle)); + } + task->td_cap_slot = (i16) thread_handle; + sched_task_enqueue(task, prio); return result_ok(); @@ -1433,6 +1457,22 @@ i32 sched_create_user_thread(struct proc *p, } } + i32 thread_handle = cap_open_handle(p, slot, CAP_TYPE_THREAD, + CAP_RIGHT_READ | CAP_RIGHT_WRITE, + user_thread_cap_slot(slot), true); + if (thread_handle < 0) { + u64 pflags = proc_table_lock_irqsave(); + i64 token = proc_reap_exited_thread_locked(p, task); + proc_table_unlock_irqrestore(pflags); 
+ if (token >= 0) + (void) cap_drop_token(p, (u64) token); + paging_map_page((vaddr_t) task->guard, (paddr_t) task->guard, + PT_FLAG_RW); + kvalloc_free(byte_array_new((void *) task, sizeof(*task))); + return thread_handle; + } + task->td_cap_slot = (i16) thread_handle; + sched_task_enqueue(task, prio); *out_td = task; return 0; diff --git a/kernel/sync/sync_handle.c b/kernel/sync/sync_handle.c index 213dcad..bd3b95b 100644 --- a/kernel/sync/sync_handle.c +++ b/kernel/sync/sync_handle.c @@ -30,6 +30,116 @@ static inline bool sync_owner_ok(struct proc *owner, return owner == caller && caller && gen == caller->generation; } +/* Generate the five near-identical pool helper functions + * (_inc_idx, _put_idx, _free_idx, _get, _free) for one sync type. + * + * - prefix: function-name prefix (e.g. sync_mutex) + * - prim_type: primitive struct exposed via _get (e.g. struct pi_mutex) + * - prim_field: field name of the primitive inside the slot (e.g. mtx) + * - pool_var: pool array (e.g. mutex_pool) + * - pool_size: pool capacity macro (e.g. SYNC_MAX_MUTEXES) + * + * The slot struct type is inferred via typeof((pool_var)[0]); this avoids + * a redundant type parameter and the clang-tidy bugprone-macro-parentheses + * false positive that triggers on bare type names inside macros. + * + * _alloc and _init differ in arity (sem takes initial_count, barrier takes + * count, the others take none), so each type still defines _alloc inline. 
+ */ +#define SYNC_POOL_DEFINE(prefix, prim_type, prim_field, pool_var, pool_size) \ + prim_type *prefix##_get(i32 handle) \ + { \ + if (handle < 0 || handle >= (pool_size)) \ + return NULL; \ + typeof(&(pool_var)[0]) s = &(pool_var)[handle]; \ + if (!s->in_use) \ + return NULL; \ + return &s->prim_field; \ + } \ + \ + bool prefix##_inc_idx(i32 handle) \ + { \ + if (handle < 0 || handle >= (pool_size)) \ + return false; \ + u64 flags = spin_lock_irqsave(&sync_lock); \ + typeof(&(pool_var)[0]) s = &(pool_var)[handle]; \ + if (!s->in_use) { \ + spin_unlock_irqrestore(&sync_lock, flags); \ + return false; \ + } \ + s->refcount++; \ + spin_unlock_irqrestore(&sync_lock, flags); \ + return true; \ + } \ + \ + void prefix##_free(i32 handle, struct proc *caller) \ + { \ + if (handle < 0 || handle >= (pool_size)) \ + return; \ + u64 flags = spin_lock_irqsave(&sync_lock); \ + typeof(&(pool_var)[0]) s = &(pool_var)[handle]; \ + if (s->in_use && sync_owner_ok(s->owner, s->owner_gen, caller)) { \ + s->in_use = false; \ + s->owner = NULL; \ + } \ + spin_unlock_irqrestore(&sync_lock, flags); \ + } \ + \ + void prefix##_free_idx(i32 handle) \ + { \ + if (handle < 0 || handle >= (pool_size)) \ + return; \ + u64 flags = spin_lock_irqsave(&sync_lock); \ + (pool_var)[handle].in_use = false; \ + (pool_var)[handle].owner = NULL; \ + (pool_var)[handle].refcount = 0; \ + spin_unlock_irqrestore(&sync_lock, flags); \ + } \ + \ + void prefix##_put_idx(i32 handle) \ + { \ + if (handle < 0 || handle >= (pool_size)) \ + return; \ + u64 flags = spin_lock_irqsave(&sync_lock); \ + typeof(&(pool_var)[0]) s = &(pool_var)[handle]; \ + if (!s->in_use) { \ + spin_unlock_irqrestore(&sync_lock, flags); \ + return; \ + } \ + if (s->refcount <= 1) { \ + s->refcount = 0; \ + s->in_use = false; \ + s->owner = NULL; \ + } else { \ + s->refcount--; \ + } \ + spin_unlock_irqrestore(&sync_lock, flags); \ + } + +/* Walk pool_var looking for the first slot with !in_use; stamp ownership + * and refcount=1 into it, 
leave *slot_ptr pointing at it, and break with + * idx set to its index. If no slot is free, idx remains -1. + * + * The caller wires up the primitive after the reserve (the primitive's + * init() signature differs per type, so it cannot be folded in here). + */ +#define SYNC_POOL_RESERVE(pool_var, pool_size, owner_arg, idx_out, slot_out) \ + do { \ + (idx_out) = -1; \ + for (i32 _i = 0; _i < (pool_size); _i++) { \ + if (!(pool_var)[_i].in_use) { \ + (pool_var)[_i].in_use = true; \ + (pool_var)[_i].owner = (owner_arg); \ + (pool_var)[_i].owner_gen = \ + (owner_arg) ? (owner_arg)->generation : 0; \ + (pool_var)[_i].refcount = 1; \ + (slot_out) = &(pool_var)[_i]; \ + (idx_out) = _i; \ + break; \ + } \ + } \ + } while (0) + void sync_handle_init(void) { for (i32 i = 0; i < SYNC_MAX_MUTEXES; i++) @@ -44,258 +154,113 @@ void sync_handle_init(void) rwlock_pool[i].in_use = false; } -/* --- Mutex --- */ +/* sync_*_get returns the kernel-internal primitive pointer for a live + * pool slot. It assumes the capability layer has already authorized the + * caller (and pinned an active-use refcount via sync_*_inc_idx); the + * historical owner check is therefore retired: the cap is the + * authority, not the pool's owner pointer. + */ +SYNC_POOL_DEFINE(sync_mutex, struct pi_mutex, mtx, mutex_pool, SYNC_MAX_MUTEXES) +SYNC_POOL_DEFINE(sync_condvar, + struct condvar, + cv, + condvar_pool, + SYNC_MAX_CONDVARS) +SYNC_POOL_DEFINE(sync_sem, struct semaphore, sem, sem_pool, SYNC_MAX_SEMAPHORES) +SYNC_POOL_DEFINE(sync_barrier, + struct barrier, + bar, + barrier_pool, + SYNC_MAX_BARRIERS) +SYNC_POOL_DEFINE(sync_rwlock, struct rwlock, rw, rwlock_pool, SYNC_MAX_RWLOCKS) i32 sync_mutex_alloc(struct proc *owner) { + struct sync_mutex_slot *s = NULL; + i32 idx; u64 flags = spin_lock_irqsave(&sync_lock); - for (i32 i = 0; i < SYNC_MAX_MUTEXES; i++) { - if (!mutex_pool[i].in_use) { - mutex_pool[i].in_use = true; - mutex_pool[i].owner = owner; - mutex_pool[i].owner_gen = owner ? 
owner->generation : 0; - pi_mutex_init(&mutex_pool[i].mtx); - spin_unlock_irqrestore(&sync_lock, flags); - return i; - } - } + SYNC_POOL_RESERVE(mutex_pool, SYNC_MAX_MUTEXES, owner, idx, s); + if (idx >= 0) + pi_mutex_init(&s->mtx); spin_unlock_irqrestore(&sync_lock, flags); - return -(i32) EAGAIN; -} - -struct pi_mutex *sync_mutex_get(i32 handle, struct proc *caller) -{ - if (handle < 0 || handle >= SYNC_MAX_MUTEXES) - return NULL; - struct sync_mutex_slot *s = &mutex_pool[handle]; - if (!s->in_use || !sync_owner_ok(s->owner, s->owner_gen, caller)) - return NULL; - return &s->mtx; + return idx < 0 ? -(i32) EAGAIN : idx; } -void sync_mutex_free(i32 handle, struct proc *caller) -{ - if (handle < 0 || handle >= SYNC_MAX_MUTEXES) - return; - u64 flags = spin_lock_irqsave(&sync_lock); - struct sync_mutex_slot *s = &mutex_pool[handle]; - if (s->in_use && sync_owner_ok(s->owner, s->owner_gen, caller)) { - s->in_use = false; - s->owner = NULL; - } - spin_unlock_irqrestore(&sync_lock, flags); -} - -/* --- Condvar --- */ - i32 sync_condvar_alloc(struct proc *owner) { + struct sync_condvar_slot *s = NULL; + i32 idx; u64 flags = spin_lock_irqsave(&sync_lock); - for (i32 i = 0; i < SYNC_MAX_CONDVARS; i++) { - if (!condvar_pool[i].in_use) { - condvar_pool[i].in_use = true; - condvar_pool[i].owner = owner; - condvar_pool[i].owner_gen = owner ? owner->generation : 0; - condvar_init(&condvar_pool[i].cv); - spin_unlock_irqrestore(&sync_lock, flags); - return i; - } - } + SYNC_POOL_RESERVE(condvar_pool, SYNC_MAX_CONDVARS, owner, idx, s); + if (idx >= 0) + condvar_init(&s->cv); spin_unlock_irqrestore(&sync_lock, flags); - return -(i32) EAGAIN; + return idx < 0 ? 
-(i32) EAGAIN : idx; } -struct condvar *sync_condvar_get(i32 handle, struct proc *caller) -{ - if (handle < 0 || handle >= SYNC_MAX_CONDVARS) - return NULL; - struct sync_condvar_slot *s = &condvar_pool[handle]; - if (!s->in_use || !sync_owner_ok(s->owner, s->owner_gen, caller)) - return NULL; - return &s->cv; -} - -void sync_condvar_free(i32 handle, struct proc *caller) -{ - if (handle < 0 || handle >= SYNC_MAX_CONDVARS) - return; - u64 flags = spin_lock_irqsave(&sync_lock); - struct sync_condvar_slot *s = &condvar_pool[handle]; - if (s->in_use && sync_owner_ok(s->owner, s->owner_gen, caller)) { - s->in_use = false; - s->owner = NULL; - } - spin_unlock_irqrestore(&sync_lock, flags); -} - -/* --- Semaphore --- */ - i32 sync_sem_alloc(struct proc *owner, i32 initial_count) { if (initial_count < 0) return -(i32) EINVAL; + struct sync_sem_slot *s = NULL; + i32 idx; u64 flags = spin_lock_irqsave(&sync_lock); - for (i32 i = 0; i < SYNC_MAX_SEMAPHORES; i++) { - if (!sem_pool[i].in_use) { - sem_pool[i].in_use = true; - sem_pool[i].owner = owner; - sem_pool[i].owner_gen = owner ? owner->generation : 0; - sem_init(&sem_pool[i].sem, initial_count); - spin_unlock_irqrestore(&sync_lock, flags); - return i; - } - } + SYNC_POOL_RESERVE(sem_pool, SYNC_MAX_SEMAPHORES, owner, idx, s); + if (idx >= 0) + sem_init(&s->sem, initial_count); spin_unlock_irqrestore(&sync_lock, flags); - return -(i32) EAGAIN; + return idx < 0 ? 
-(i32) EAGAIN : idx; } -struct semaphore *sync_sem_get(i32 handle, struct proc *caller) -{ - if (handle < 0 || handle >= SYNC_MAX_SEMAPHORES) - return NULL; - struct sync_sem_slot *s = &sem_pool[handle]; - if (!s->in_use || !sync_owner_ok(s->owner, s->owner_gen, caller)) - return NULL; - return &s->sem; -} - -void sync_sem_free(i32 handle, struct proc *caller) -{ - if (handle < 0 || handle >= SYNC_MAX_SEMAPHORES) - return; - u64 flags = spin_lock_irqsave(&sync_lock); - struct sync_sem_slot *s = &sem_pool[handle]; - if (s->in_use && sync_owner_ok(s->owner, s->owner_gen, caller)) { - s->in_use = false; - s->owner = NULL; - } - spin_unlock_irqrestore(&sync_lock, flags); -} - -/* --- Barrier --- */ - i32 sync_barrier_alloc(struct proc *owner, u32 count) { if (count == 0) return -(i32) EINVAL; + struct sync_barrier_slot *s = NULL; + i32 idx; u64 flags = spin_lock_irqsave(&sync_lock); - for (i32 i = 0; i < SYNC_MAX_BARRIERS; i++) { - if (!barrier_pool[i].in_use) { - barrier_pool[i].in_use = true; - barrier_pool[i].owner = owner; - barrier_pool[i].owner_gen = owner ? 
owner->generation : 0; - barrier_init(&barrier_pool[i].bar, count); - spin_unlock_irqrestore(&sync_lock, flags); - return i; - } - } - spin_unlock_irqrestore(&sync_lock, flags); - return -(i32) EAGAIN; -} - -struct barrier *sync_barrier_get(i32 handle, struct proc *caller) -{ - if (handle < 0 || handle >= SYNC_MAX_BARRIERS) - return NULL; - struct sync_barrier_slot *s = &barrier_pool[handle]; - if (!s->in_use || !sync_owner_ok(s->owner, s->owner_gen, caller)) - return NULL; - return &s->bar; -} - -void sync_barrier_free(i32 handle, struct proc *caller) -{ - if (handle < 0 || handle >= SYNC_MAX_BARRIERS) - return; - u64 flags = spin_lock_irqsave(&sync_lock); - struct sync_barrier_slot *s = &barrier_pool[handle]; - if (s->in_use && sync_owner_ok(s->owner, s->owner_gen, caller)) { - s->in_use = false; - s->owner = NULL; - } + SYNC_POOL_RESERVE(barrier_pool, SYNC_MAX_BARRIERS, owner, idx, s); + if (idx >= 0) + barrier_init(&s->bar, count); spin_unlock_irqrestore(&sync_lock, flags); + return idx < 0 ? -(i32) EAGAIN : idx; } -/* --- Rwlock --- */ - i32 sync_rwlock_alloc(struct proc *owner) { + struct sync_rwlock_slot *s = NULL; + i32 idx; u64 flags = spin_lock_irqsave(&sync_lock); - for (i32 i = 0; i < SYNC_MAX_RWLOCKS; i++) { - if (!rwlock_pool[i].in_use) { - rwlock_pool[i].in_use = true; - rwlock_pool[i].owner = owner; - rwlock_pool[i].owner_gen = owner ? owner->generation : 0; - rwlock_init(&rwlock_pool[i].rw); - spin_unlock_irqrestore(&sync_lock, flags); - return i; - } - } + SYNC_POOL_RESERVE(rwlock_pool, SYNC_MAX_RWLOCKS, owner, idx, s); + if (idx >= 0) + rwlock_init(&s->rw); spin_unlock_irqrestore(&sync_lock, flags); - return -(i32) EAGAIN; + return idx < 0 ? 
-(i32) EAGAIN : idx; } -struct rwlock *sync_rwlock_get(i32 handle, struct proc *caller) -{ - if (handle < 0 || handle >= SYNC_MAX_RWLOCKS) - return NULL; - struct sync_rwlock_slot *s = &rwlock_pool[handle]; - if (!s->in_use || !sync_owner_ok(s->owner, s->owner_gen, caller)) - return NULL; - return &s->rw; -} - -void sync_rwlock_free(i32 handle, struct proc *caller) -{ - if (handle < 0 || handle >= SYNC_MAX_RWLOCKS) - return; - u64 flags = spin_lock_irqsave(&sync_lock); - struct sync_rwlock_slot *s = &rwlock_pool[handle]; - if (s->in_use && sync_owner_ok(s->owner, s->owner_gen, caller)) { - s->in_use = false; - s->owner = NULL; - } - spin_unlock_irqrestore(&sync_lock, flags); -} +/* Per-type tear-down walker. Called once per pool from + * sync_handle_teardown_proc under sync_lock. Releases every slot owned by p. + */ +#define SYNC_TEARDOWN_POOL(pool_var, pool_size, proc_ptr) \ + do { \ + for (i32 _i = 0; _i < (pool_size); _i++) { \ + typeof((pool_var)[0]) *_s = &(pool_var)[_i]; \ + if (_s->in_use && \ + sync_owner_ok(_s->owner, _s->owner_gen, (proc_ptr))) { \ + _s->in_use = false; \ + _s->owner = NULL; \ + } \ + } \ + } while (0) void sync_handle_teardown_proc(struct proc *p) { u64 flags = spin_lock_irqsave(&sync_lock); - for (i32 i = 0; i < SYNC_MAX_MUTEXES; i++) { - struct sync_mutex_slot *s = &mutex_pool[i]; - if (s->in_use && sync_owner_ok(s->owner, s->owner_gen, p)) { - s->in_use = false; - s->owner = NULL; - } - } - for (i32 i = 0; i < SYNC_MAX_CONDVARS; i++) { - struct sync_condvar_slot *s = &condvar_pool[i]; - if (s->in_use && sync_owner_ok(s->owner, s->owner_gen, p)) { - s->in_use = false; - s->owner = NULL; - } - } - for (i32 i = 0; i < SYNC_MAX_SEMAPHORES; i++) { - struct sync_sem_slot *s = &sem_pool[i]; - if (s->in_use && sync_owner_ok(s->owner, s->owner_gen, p)) { - s->in_use = false; - s->owner = NULL; - } - } - for (i32 i = 0; i < SYNC_MAX_BARRIERS; i++) { - struct sync_barrier_slot *s = &barrier_pool[i]; - if (s->in_use && sync_owner_ok(s->owner, 
s->owner_gen, p)) { - s->in_use = false; - s->owner = NULL; - } - } - for (i32 i = 0; i < SYNC_MAX_RWLOCKS; i++) { - struct sync_rwlock_slot *s = &rwlock_pool[i]; - if (s->in_use && sync_owner_ok(s->owner, s->owner_gen, p)) { - s->in_use = false; - s->owner = NULL; - } - } + SYNC_TEARDOWN_POOL(mutex_pool, SYNC_MAX_MUTEXES, p); + SYNC_TEARDOWN_POOL(condvar_pool, SYNC_MAX_CONDVARS, p); + SYNC_TEARDOWN_POOL(sem_pool, SYNC_MAX_SEMAPHORES, p); + SYNC_TEARDOWN_POOL(barrier_pool, SYNC_MAX_BARRIERS, p); + SYNC_TEARDOWN_POOL(rwlock_pool, SYNC_MAX_RWLOCKS, p); spin_unlock_irqrestore(&sync_lock, flags); } diff --git a/kernel/sync/sync_handle.h b/kernel/sync/sync_handle.h index 03967c1..c3ef516 100644 --- a/kernel/sync/sync_handle.h +++ b/kernel/sync/sync_handle.h @@ -24,10 +24,18 @@ #define SYNC_MAX_BARRIERS 8 #define SYNC_MAX_RWLOCKS 16 +/* refcount tracks how many cap_space slots reference this pool entry. The + * underlying primitive is destroyed exactly once at the decrement that + * returns zero. _free_idx is the brutal teardown path used when a process + * is dying and its ownership is being repossessed wholesale; _put_idx is + * the refcount-aware path the capability layer uses on cap_drop / + * cap_revoke_delegate. + */ struct sync_mutex_slot { struct pi_mutex mtx; struct proc *owner; u32 owner_gen; + u32 refcount; bool in_use; }; @@ -35,6 +43,7 @@ struct sync_condvar_slot { struct condvar cv; struct proc *owner; u32 owner_gen; + u32 refcount; bool in_use; }; @@ -42,6 +51,7 @@ struct sync_sem_slot { struct semaphore sem; struct proc *owner; u32 owner_gen; + u32 refcount; bool in_use; }; @@ -49,6 +59,7 @@ struct sync_barrier_slot { struct barrier bar; struct proc *owner; u32 owner_gen; + u32 refcount; bool in_use; }; @@ -56,6 +67,7 @@ struct sync_rwlock_slot { struct rwlock rw; struct proc *owner; u32 owner_gen; + u32 refcount; bool in_use; }; @@ -65,25 +77,46 @@ void sync_handle_init(void); /* All alloc functions take the owning process for isolation. 
* All get functions validate the caller matches the owner. */ +/* The _inc_idx helpers atomically take an active-use reference on the + * underlying primitive. Returns true on success, false if the slot was + * already torn down. Callers MUST pair every successful increment with + * the matching _put_idx so the underlying object outlives the in-flight + * operation (the cap_lookup_object pin contract). + */ i32 sync_mutex_alloc(struct proc *owner); -struct pi_mutex *sync_mutex_get(i32 handle, struct proc *caller); +struct pi_mutex *sync_mutex_get(i32 handle); void sync_mutex_free(i32 handle, struct proc *caller); +void sync_mutex_free_idx(i32 handle); +void sync_mutex_put_idx(i32 handle); +bool sync_mutex_inc_idx(i32 handle); i32 sync_condvar_alloc(struct proc *owner); -struct condvar *sync_condvar_get(i32 handle, struct proc *caller); +struct condvar *sync_condvar_get(i32 handle); void sync_condvar_free(i32 handle, struct proc *caller); +void sync_condvar_free_idx(i32 handle); +void sync_condvar_put_idx(i32 handle); +bool sync_condvar_inc_idx(i32 handle); i32 sync_sem_alloc(struct proc *owner, i32 initial_count); -struct semaphore *sync_sem_get(i32 handle, struct proc *caller); +struct semaphore *sync_sem_get(i32 handle); void sync_sem_free(i32 handle, struct proc *caller); +void sync_sem_free_idx(i32 handle); +void sync_sem_put_idx(i32 handle); +bool sync_sem_inc_idx(i32 handle); i32 sync_barrier_alloc(struct proc *owner, u32 count); -struct barrier *sync_barrier_get(i32 handle, struct proc *caller); +struct barrier *sync_barrier_get(i32 handle); void sync_barrier_free(i32 handle, struct proc *caller); +void sync_barrier_free_idx(i32 handle); +void sync_barrier_put_idx(i32 handle); +bool sync_barrier_inc_idx(i32 handle); i32 sync_rwlock_alloc(struct proc *owner); -struct rwlock *sync_rwlock_get(i32 handle, struct proc *caller); +struct rwlock *sync_rwlock_get(i32 handle); void sync_rwlock_free(i32 handle, struct proc *caller); +void sync_rwlock_free_idx(i32 
handle); +void sync_rwlock_put_idx(i32 handle); +bool sync_rwlock_inc_idx(i32 handle); /* Release all sync handles owned by a process. Called during proc_exit. */ void sync_handle_teardown_proc(struct proc *p); diff --git a/kernel/timer/posix_timer.c b/kernel/timer/posix_timer.c index 476d276..9c8764f 100644 --- a/kernel/timer/posix_timer.c +++ b/kernel/timer/posix_timer.c @@ -15,13 +15,6 @@ static struct posix_timer timer_pool[POSIX_TIMER_MAX]; static spinlock_t timer_lock = SPINLOCK_INITIALIZER; -static inline bool posix_timer_owner_matches(const struct posix_timer *t, - const struct proc *p) -{ - return t->in_use && t->owner == p && p && - t->owner_generation == p->generation; -} - static inline bool posix_timer_owner_alive(const struct posix_timer *t) { struct proc *owner = t->owner; @@ -57,6 +50,64 @@ static bool posix_timer_target_is_live(const struct sched_task *td) td->state != TD_STATE_TERMINATING; } +i32 posix_timer_alloc(struct proc *p) +{ + if (!p) + return -(i32) EINVAL; + + u64 flags = spin_lock_irqsave(&timer_lock); + for (i32 i = 0; i < POSIX_TIMER_MAX; i++) { + if (!timer_pool[i].in_use) { + timer_pool[i].in_use = true; + timer_pool[i].owner = p; + timer_pool[i].owner_generation = p->generation; + timer_pool[i].armed = false; + timer_pool[i].overrun = 0; + timer_pool[i].interval_ticks = 0; + timer_pool[i].target_tid = 0; + timer_pool[i].refcount = 1; + callout_init(&timer_pool[i].co); + spin_unlock_irqrestore(&timer_lock, flags); + return i; + } + } + spin_unlock_irqrestore(&timer_lock, flags); + return -(i32) EAGAIN; +} + +struct posix_timer *posix_timer_ptr(u16 object_index) +{ + if (object_index >= POSIX_TIMER_MAX) + return NULL; + return &timer_pool[object_index]; +} + +static void posix_timer_free(struct posix_timer *t) +{ + t->armed = false; + callout_cancel_sync(&t->co); + + u64 flags = spin_lock_irqsave(&timer_lock); + t->in_use = false; + t->owner = NULL; + t->owner_generation = 0; + t->interval_ticks = 0; + t->overrun = 0; + t->target_tid 
= 0; + t->refcount = 0; + spin_unlock_irqrestore(&timer_lock, flags); +} + +void posix_timer_put_idx(u16 object_index) +{ + struct posix_timer *t = posix_timer_ptr(object_index); + if (!t || !t->in_use) + return; + u32 rc = __atomic_sub_fetch(&t->refcount, 1, __ATOMIC_ACQ_REL); + if (rc == 0) + posix_timer_free(t); +} + static void timer_expiry_fn(void *arg) { struct posix_timer *t = arg; @@ -71,11 +122,6 @@ static void timer_expiry_fn(void *arg) t->armed = false; return; } - /* SIGEV_THREAD_ID: deliver to the specified thread directly. If - * the target thread has exited since posix_timer_settime, the - * signal is silently dropped: a thread-directed signal must not - * spray onto sibling threads that did not opt in. - */ if (t->target_tid != 0) { u64 tflags = proc_table_lock_irqsave(); struct sched_task *target = NULL; @@ -101,72 +147,34 @@ static void timer_expiry_fn(void *arg) } if (t->interval_ticks > 0) { - /* Periodic: re-arm only if still armed. posix_timer_delete - * clears armed before callout_cancel_sync, so re-checking here - * prevents re-arming a deleted timer. - */ if (!t->armed) return; - /* POSIX overrun: count expirations that occur while the previous - * SIGALRM is still pending (not yet delivered/handled) on any - * thread in the group. 
- */ - { - u64 tflags = proc_table_lock_irqsave(); - bool alrm_pending = - posix_timer_signal_pending_locked(owner, SIGALRM); - proc_table_unlock_irqrestore(tflags); - if (alrm_pending) - t->overrun++; - } + u64 tflags = proc_table_lock_irqsave(); + bool alrm_pending = posix_timer_signal_pending_locked(owner, SIGALRM); + proc_table_unlock_irqrestore(tflags); + if (alrm_pending) + t->overrun++; callout_set_ticks(&t->co, t->interval_ticks, timer_expiry_fn, t); } else { t->armed = false; } } -i32 posix_timer_create(struct proc *p) +i32 posix_timer_settime_idx(u16 object_index, + u64 value_ms, + u64 interval_ms, + u16 target_tid) { - if (!p) + if (object_index >= POSIX_TIMER_MAX) return -(i32) EINVAL; - u64 flags = spin_lock_irqsave(&timer_lock); - for (i32 i = 0; i < POSIX_TIMER_MAX; i++) { - if (!timer_pool[i].in_use) { - timer_pool[i].in_use = true; - timer_pool[i].owner = p; - timer_pool[i].owner_generation = p->generation; - timer_pool[i].armed = false; - timer_pool[i].overrun = 0; - timer_pool[i].interval_ticks = 0; - callout_init(&timer_pool[i].co); - spin_unlock_irqrestore(&timer_lock, flags); - return i; - } - } - spin_unlock_irqrestore(&timer_lock, flags); - return -(i32) EAGAIN; -} - -i32 posix_timer_settime(i32 handle, - struct proc *caller, - u64 value_ms, - u64 interval_ms, - u16 target_tid) -{ - if (handle < 0 || handle >= POSIX_TIMER_MAX) - return -(i32) EINVAL; - - struct posix_timer *t = &timer_pool[handle]; + struct posix_timer *t = &timer_pool[object_index]; if (!t->in_use) return -(i32) EINVAL; - if (!posix_timer_owner_matches(t, caller)) - return -(i32) EPERM; + struct proc *caller = t->owner; + if (!caller || t->owner_generation != caller->generation) + return -(i32) EINVAL; - /* If a specific TID is requested, validate it is currently a live - * thread of the owning proc; reject up front so user space cannot - * set up a timer that silently fails to deliver later. 
- */ if (target_tid != 0) { u64 tflags = proc_table_lock_irqsave(); bool found = false; @@ -182,14 +190,9 @@ i32 posix_timer_settime(i32 handle, return -(i32) ESRCH; } - /* Disarm any existing timer before reconfiguration. Use synchronous - * cancel so an in-flight expiry callback cannot race with the new state. - */ callout_cancel_sync(&t->co); - t->target_tid = target_tid; - /* POSIX: value_ms == 0 means disarm the timer. */ if (value_ms == 0) { t->armed = false; t->interval_ticks = 0; @@ -197,9 +200,6 @@ i32 posix_timer_settime(i32 handle, } t->interval_ticks = (interval_ms > 0) ? time_ms_to_ticks(interval_ms) : 0; - /* Clamp sub-tick intervals to 1 tick to prevent silent one-shot - * degradation when time_ms_to_ticks rounds down to 0. - */ if (interval_ms > 0 && t->interval_ticks == 0) t->interval_ticks = 1; t->overrun = 0; @@ -213,48 +213,15 @@ i32 posix_timer_settime(i32 handle, return 0; } -i32 posix_timer_delete(i32 handle, struct proc *caller) -{ - if (handle < 0 || handle >= POSIX_TIMER_MAX) - return -(i32) EINVAL; - - struct posix_timer *t = &timer_pool[handle]; - if (!t->in_use) - return -(i32) EINVAL; - if (!posix_timer_owner_matches(t, caller)) - return -(i32) EPERM; - - /* Mark unarmed first so an in-flight callback bails out. */ - t->armed = false; - - /* Synchronous cancel: wait for any in-flight callback to complete - * before freeing the handle. Plain callout_cancel is not sufficient - * because a periodic timer_expiry_fn could be mid-execution on another - * hart and would re-arm the callout after an async cancel returns. 
- */ - callout_cancel_sync(&t->co); - - u64 flags = spin_lock_irqsave(&timer_lock); - t->in_use = false; - t->owner = NULL; - t->owner_generation = 0; - spin_unlock_irqrestore(&timer_lock, flags); - - return 0; -} - -i64 posix_timer_gettime(i32 handle, struct proc *caller) +i64 posix_timer_gettime_idx(u16 object_index) { - if (handle < 0 || handle >= POSIX_TIMER_MAX) + if (object_index >= POSIX_TIMER_MAX) return -(i64) EINVAL; - struct posix_timer *t = &timer_pool[handle]; + struct posix_timer *t = &timer_pool[object_index]; if (!t->in_use) return -(i64) EINVAL; - if (!posix_timer_owner_matches(t, caller)) - return -(i64) EPERM; if (!t->armed) return 0; - /* Compute actual remaining time from the callout's absolute deadline. */ u64 now = time_rdtime(); u64 deadline = t->co.deadline; if (deadline <= now) @@ -262,15 +229,13 @@ i64 posix_timer_gettime(i32 handle, struct proc *caller) return (i64) time_ticks_to_ms(deadline - now); } -i64 posix_timer_getoverrun(i32 handle, struct proc *caller) +i64 posix_timer_getoverrun_idx(u16 object_index) { - if (handle < 0 || handle >= POSIX_TIMER_MAX) + if (object_index >= POSIX_TIMER_MAX) return -(i64) EINVAL; - struct posix_timer *t = &timer_pool[handle]; + struct posix_timer *t = &timer_pool[object_index]; if (!t->in_use) return -(i64) EINVAL; - if (!posix_timer_owner_matches(t, caller)) - return -(i64) EPERM; return (i64) t->overrun; } @@ -278,15 +243,8 @@ void posix_timer_teardown_proc(struct proc *p) { for (i32 i = 0; i < POSIX_TIMER_MAX; i++) { struct posix_timer *t = &timer_pool[i]; - if (posix_timer_owner_matches(t, p)) { - t->armed = false; - callout_cancel_sync(&t->co); - u64 flags = spin_lock_irqsave(&timer_lock); - t->in_use = false; - t->owner = NULL; - t->owner_generation = 0; - spin_unlock_irqrestore(&timer_lock, flags); - } + if (t->in_use && t->owner == p && t->owner_generation == p->generation) + posix_timer_free(t); } } @@ -294,6 +252,7 @@ static void posix_timer_boot_init(u32 flag __unused) { for (i32 i = 0; i 
< POSIX_TIMER_MAX; i++) { timer_pool[i].in_use = false; + timer_pool[i].refcount = 0; callout_init(&timer_pool[i].co); } } diff --git a/kernel/timer/posix_timer.h b/kernel/timer/posix_timer.h index a38747a..aa55ebc 100644 --- a/kernel/timer/posix_timer.h +++ b/kernel/timer/posix_timer.h @@ -23,6 +23,7 @@ struct posix_timer { u32 overrun; bool armed; bool in_use; + u32 refcount; /* SIGEV_THREAD_ID target TID. 0 selects process-directed delivery * (the kernel's signal_send picks an eligible thread). Non-zero * directs SIGALRM at the matching thread of the owning proc; if @@ -32,31 +33,32 @@ struct posix_timer { u16 target_tid; }; -/* Allocate a timer for the given process. Returns handle >= 0 or -EAGAIN. */ -i32 posix_timer_create(struct proc *p); +/* Allocate a timer object for the given process. Returns pool index >= 0. */ +i32 posix_timer_alloc(struct proc *p); -/* Arm a timer. Caller must be the owner. +/* Arm a timer by pool index. * value_ms = initial expiry (0 = disarm), interval_ms = repeat (0 = one-shot). * target_tid = SIGEV_THREAD_ID target (0 = process-directed). */ -i32 posix_timer_settime(i32 handle, - struct proc *caller, - u64 value_ms, - u64 interval_ms, - u16 target_tid); +i32 posix_timer_settime_idx(u16 object_index, + u64 value_ms, + u64 interval_ms, + u16 target_tid); -/* Disarm and delete a timer. Caller must be the owner. */ -i32 posix_timer_delete(i32 handle, struct proc *caller); +/* Drop a timer object reference. Refcount 0 tears the timer down. */ +void posix_timer_put_idx(u16 object_index); -/* Get remaining time in ms. Caller must be the owner. - * Returns ms >= 0 or -EINVAL/-EPERM. +/* Get remaining time in ms by pool index. + * Returns ms >= 0 or -EINVAL. */ -i64 posix_timer_gettime(i32 handle, struct proc *caller); +i64 posix_timer_gettime_idx(u16 object_index); -/* Get overrun count. Caller must be the owner. - * Returns count >= 0 or -EINVAL/-EPERM. +/* Get overrun count by pool index. + * Returns count >= 0 or -EINVAL. 
*/ -i64 posix_timer_getoverrun(i32 handle, struct proc *caller); +i64 posix_timer_getoverrun_idx(u16 object_index); + +struct posix_timer *posix_timer_ptr(u16 object_index); /* Delete all timers owned by a process. Called during process teardown. */ void posix_timer_teardown_proc(struct proc *p); diff --git a/tests/tests-cap.c b/tests/tests-cap.c new file mode 100644 index 0000000..a9691ef --- /dev/null +++ b/tests/tests-cap.c @@ -0,0 +1,401 @@ +/* SPDX-License-Identifier: MIT */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static i32 selftest_cap_drop_invalidates_token(void) +{ + struct proc *p = proc_alloc(); + assert(p); + + i64 token = cap_get_token(p, PROC_FD_STDOUT, CAP_TYPE_FD); + assert(token > 0); + assert(cap_drop_token(p, (u64) token) == 0); + assert(!cap_fd_is_valid(p, PROC_FD_STDOUT)); + assert(cap_drop_token(p, (u64) token) == -(i64) EBADF); + + proc_free(p); + return 0; +} +DEFINE_SELFTEST(cap_drop_invalidates_token, + selftest_cap_drop_invalidates_token); + +static i32 selftest_cap_timer_token_drop(void) +{ + struct proc *p = proc_alloc(); + assert(p); + + i32 object_index = posix_timer_alloc(p); + assert(object_index >= 0); + i32 handle = cap_open_timer(p, (u16) object_index, + CAP_RIGHT_READ | CAP_RIGHT_WRITE, 3, true); + assert(handle == 3); + + i64 token = cap_get_token(p, handle, CAP_TYPE_TIMER); + assert(token > 0); + assert(cap_drop_token(p, (u64) token) == 0); + assert(!cap_slot_read(p, handle).valid); + struct posix_timer *timer = posix_timer_ptr((u16) object_index); + assert(timer); + assert(!timer->in_use); + + proc_free(p); + return 0; +} +DEFINE_SELFTEST(cap_timer_token_drop, selftest_cap_timer_token_drop); + +static i32 selftest_cap_transfer_revokes_child_dups(void) +{ + struct proc *parent = proc_alloc(); + struct proc *child = proc_alloc(); + assert(parent); + assert(child); + + struct pipe *pipe = pipe_alloc(); + assert(pipe); + assert(cap_open_pipe(parent, pipe, true, CAP_RIGHT_READ 
| CAP_RIGHT_GRANT, + 3, true) == 3); + + i64 src_token = cap_get_token(parent, 3, CAP_TYPE_FD); + assert(src_token > 0); + i64 delegate = + cap_transfer(parent, child->pid, (u64) src_token, CAP_RIGHT_READ); + assert(delegate > 0); + + assert(cap_fd_is_valid(child, 3)); + i64 child_token = cap_get_token(child, 3, CAP_TYPE_FD); + assert(child_token > 0); + assert(cap_transfer(child, parent->pid, (u64) child_token, + CAP_RIGHT_READ) == -(i64) EACCES); + + assert(cap_dup_fd(child, 3, 4, true) == 4); + assert(cap_fd_is_valid(child, 4)); + + assert(cap_revoke_delegate(parent, (u64) delegate) == 0); + assert(!cap_fd_is_valid(child, 3)); + assert(!cap_fd_is_valid(child, 4)); + + proc_free(parent); + proc_free(child); + return 0; +} +DEFINE_SELFTEST(cap_transfer_revokes_child_dups, + selftest_cap_transfer_revokes_child_dups); + +static i32 selftest_cap_inherit_preserves_rights(void) +{ + struct proc *parent = proc_alloc(); + struct proc *child = proc_alloc(); + struct proc *grandchild = proc_alloc(); + assert(parent); + assert(child); + assert(grandchild); + + assert(cap_inherit_fd(parent, child, PROC_FD_STDOUT, 5) == 5); + assert(cap_fd_is_valid(child, 5)); + assert(cap_fd_has_rights(child, 5, CAP_RIGHT_GRANT)); + assert(cap_fd_has_rights(child, 5, CAP_RIGHT_WRITE)); + + assert(cap_inherit_fd(child, grandchild, 5, 6) == 6); + assert(cap_fd_is_valid(grandchild, 6)); + assert(cap_fd_has_rights(grandchild, 6, CAP_RIGHT_GRANT)); + + proc_free(parent); + proc_free(child); + proc_free(grandchild); + return 0; +} +DEFINE_SELFTEST(cap_inherit_preserves_rights, + selftest_cap_inherit_preserves_rights); + +static i32 selftest_cap_revoke_preserves_unrelated_aliases(void) +{ + struct proc *parent = proc_alloc(); + struct proc *child = proc_alloc(); + assert(parent); + assert(child); + + struct pipe *pipe = pipe_alloc(); + assert(pipe); + assert(cap_open_pipe(parent, pipe, true, CAP_RIGHT_READ | CAP_RIGHT_GRANT, + 3, true) == 3); + + i64 src_token = cap_get_token(parent, 3, 
CAP_TYPE_FD); + assert(src_token > 0); + + i64 delegate_a = + cap_transfer(parent, child->pid, (u64) src_token, CAP_RIGHT_READ); + assert(delegate_a > 0); + assert(cap_fd_is_valid(child, 3)); + + assert(cap_dup_fd(child, 3, 4, true) == 4); + assert(cap_fd_is_valid(child, 4)); + + i64 delegate_b = + cap_transfer(parent, child->pid, (u64) src_token, CAP_RIGHT_READ); + assert(delegate_b > 0); + assert(cap_fd_is_valid(child, 5)); + + assert(cap_revoke_delegate(parent, (u64) delegate_a) == 0); + assert(!cap_fd_is_valid(child, 3)); + assert(!cap_fd_is_valid(child, 4)); + assert(cap_fd_is_valid(child, 5)); + + proc_free(parent); + proc_free(child); + return 0; +} +DEFINE_SELFTEST(cap_revoke_preserves_unrelated_aliases, + selftest_cap_revoke_preserves_unrelated_aliases); + +static i32 selftest_cap_inherit_clones_non_grant_fd(void) +{ + struct proc *parent = proc_alloc(); + struct proc *child = proc_alloc(); + assert(parent); + assert(child); + + struct pipe *pipe = pipe_alloc(); + assert(pipe); + assert(cap_open_pipe(parent, pipe, true, CAP_RIGHT_READ, 3, true) == 3); + assert(!cap_fd_has_rights(parent, 3, CAP_RIGHT_GRANT)); + + assert(cap_inherit_fd(parent, child, 3, 3) == 3); + assert(cap_fd_is_valid(child, 3)); + assert(cap_fd_has_rights(child, 3, CAP_RIGHT_READ)); + assert(!cap_fd_has_rights(child, 3, CAP_RIGHT_GRANT)); + + proc_free(parent); + proc_free(child); + return 0; +} +DEFINE_SELFTEST(cap_inherit_clones_non_grant_fd, + selftest_cap_inherit_clones_non_grant_fd); + +static i32 selftest_cap_revoke_delegate_revokes_spawned_child(void) +{ + struct proc *donor = proc_alloc(); + struct proc *parent = proc_alloc(); + struct proc *child = proc_alloc(); + assert(donor); + assert(parent); + assert(child); + + struct pipe *pipe = pipe_alloc(); + assert(pipe); + assert(cap_open_pipe(donor, pipe, true, CAP_RIGHT_READ | CAP_RIGHT_GRANT, 3, + true) == 3); + + i64 src_token = cap_get_token(donor, 3, CAP_TYPE_FD); + assert(src_token > 0); + i64 delegate = + cap_transfer(donor, 
parent->pid, (u64) src_token, CAP_RIGHT_READ); + assert(delegate > 0); + assert(cap_fd_is_valid(parent, 3)); + assert(!cap_fd_has_rights(parent, 3, CAP_RIGHT_GRANT)); + child->parent_pid = parent->pid; + child->parent_generation = parent->generation; + assert(cap_inherit_fd(parent, child, 3, 3) == 3); + assert(cap_fd_is_valid(child, 3)); + + assert(cap_revoke_delegate(donor, (u64) delegate) == 0); + assert(!cap_fd_is_valid(parent, 3)); + assert(!cap_fd_is_valid(child, 3)); + + proc_free(donor); + proc_free(parent); + proc_free(child); + return 0; +} +DEFINE_SELFTEST(cap_revoke_delegate_revokes_spawned_child, + selftest_cap_revoke_delegate_revokes_spawned_child); + +static i32 selftest_cap_mutex_token_drop(void) +{ + struct proc *p = proc_alloc(); + assert(p); + + i32 object_index = sync_mutex_alloc(p); + assert(object_index >= 0); + i32 handle = cap_open_handle(p, (u16) object_index, CAP_TYPE_MUTEX, + CAP_RIGHT_WRITE, 5, true); + assert(handle == 5); + + i64 token = cap_get_token(p, handle, CAP_TYPE_MUTEX); + assert(token > 0); + assert(cap_drop_token(p, (u64) token) == 0); + assert(!cap_slot_read(p, handle).valid); + assert(!sync_mutex_get(object_index)); + + proc_free(p); + return 0; +} +DEFINE_SELFTEST(cap_mutex_token_drop, selftest_cap_mutex_token_drop); + +/* Validate the cap_lookup_object active-use pin lifecycle: + * (1) lookup bumps refcount above 1 so an intervening drop cannot free + * the underlying primitive, + * (2) cap_put_ref decrements; the next drop frees, + * (3) without cap_put_ref the slot stays alive (refcount remains > 0). + */ +static i32 selftest_cap_lookup_object_pins_sync(void) +{ + struct proc *p = proc_alloc(); + assert(p); + + i32 object_index = sync_mutex_alloc(p); + assert(object_index >= 0); + i32 handle = cap_open_handle(p, (u16) object_index, CAP_TYPE_MUTEX, + CAP_RIGHT_WRITE, 5, true); + assert(handle == 5); + + /* Pin via cap_lookup_object, then concurrently drop the token. 
The + * underlying mutex must stay live until cap_put_ref runs. + */ + struct cap_ref ref = + cap_lookup_object(p, handle, CAP_RIGHT_WRITE, CAP_TYPE_MUTEX); + assert(ref.type == CAP_TYPE_MUTEX); + + i64 token = cap_get_token(p, handle, CAP_TYPE_MUTEX); + assert(token > 0); + assert(cap_drop_token(p, (u64) token) == 0); + /* The cap slot is gone, but the pinned object survives. */ + assert(!cap_slot_read(p, handle).valid); + assert(sync_mutex_get(object_index)); + + /* Release the pin -- now the underlying primitive is destroyed. */ + cap_put_ref(&ref); + assert(sync_mutex_get(object_index) == NULL); + assert(ref.type == CAP_TYPE_NONE); + + proc_free(p); + return 0; +} +DEFINE_SELFTEST(cap_lookup_object_pins_sync, + selftest_cap_lookup_object_pins_sync); + +static i32 selftest_cap_thread_slots_are_reserved(void) +{ + struct proc *p = proc_alloc(); + assert(p); + + i32 object_index = posix_timer_alloc(p); + assert(object_index >= 0); + + i32 reserved_slot = CAP_SPACE_SLOTS - 1; + assert(cap_open_timer(p, (u16) object_index, + CAP_RIGHT_READ | CAP_RIGHT_WRITE, reserved_slot, + true) == -(i32) EBADF); + assert(posix_timer_ptr((u16) object_index)->in_use); + posix_timer_put_idx((u16) object_index); + + proc_free(p); + return 0; +} +DEFINE_SELFTEST(cap_thread_slots_are_reserved, + selftest_cap_thread_slots_are_reserved); + +static i32 selftest_cap_dup_reserved_slot_preserves_thread_handle(void) +{ + struct proc *p = proc_alloc(); + assert(p); + + /* sched_task carries an embedded kernel stack and guard page; allocate it + * on the kvalloc heap rather than the selftest task's own stack to avoid + * overflowing into the guard. 
+ */ + struct option_byte_array td_mem = + kvalloc_alloc(sizeof(struct sched_task), alignof(struct sched_task)); + assert(!td_mem.is_none); + struct byte_array td_ba = option_byte_array_checked(td_mem); + struct sched_task *td = byte_array_ptr(td_ba); + memset(td, 0, sizeof(*td)); + + td->proc = p; + { + u64 flags = proc_table_lock_irqsave(); + assert(proc_attach_task(p, td)); + proc_table_unlock_irqrestore(flags); + } + + u8 thread_slot = proc_task_slot(p, td); + i32 reserved_slot = CAP_SPACE_SLOTS - PROC_THREAD_MAX + (i32) thread_slot; + i32 thread_handle = + cap_open_handle(p, thread_slot, CAP_TYPE_THREAD, + CAP_RIGHT_READ | CAP_RIGHT_WRITE, reserved_slot, true); + assert(thread_handle == reserved_slot); + td->td_cap_slot = (i16) thread_handle; + + i64 thread_token = cap_get_token(p, thread_handle, CAP_TYPE_THREAD); + assert(thread_token > 0); + assert(cap_dup_fd(p, PROC_FD_STDOUT, thread_handle, true) == -(i32) EBADF); + assert(cap_get_token(p, thread_handle, CAP_TYPE_THREAD) == thread_token); + + /* Detach the dummy task so proc_free sees an empty task list, then release + * the heap-allocated task storage. + */ + { + u64 flags = proc_table_lock_irqsave(); + proc_detach_task(p, td); + proc_table_unlock_irqrestore(flags); + } + kvalloc_free(td_ba); + proc_free(p); + return 0; +} +DEFINE_SELFTEST(cap_dup_reserved_slot_preserves_thread_handle, + selftest_cap_dup_reserved_slot_preserves_thread_handle); + +/* Token slot_index spans bits 40..47 (8 bits). Setting bit 47 in an otherwise + * valid token must yield an out-of-range slot and reject the lookup; the + * earlier mask collapsed bit 47 to a live alias. + */ +static i32 selftest_cap_token_slot_bit47_rejected(void) +{ + struct proc *p = proc_alloc(); + assert(p); + + i64 token = cap_get_token(p, PROC_FD_STDOUT, CAP_TYPE_FD); + assert(token > 0); + + u64 aliased = (u64) token | ((u64) 1 << 47); + assert(aliased != (u64) token); + /* The aliased token must NOT validate; cap_drop_token returns EBADF. 
*/ + assert(cap_drop_token(p, aliased) == -(i64) EBADF); + /* Original cap is still live. */ + assert(cap_fd_is_valid(p, PROC_FD_STDOUT)); + + proc_free(p); + return 0; +} +DEFINE_SELFTEST(cap_token_slot_bit47_rejected, + selftest_cap_token_slot_bit47_rejected); + +static i32 selftest_cap_mqueue_token_drop(void) +{ + struct proc *p = proc_alloc(); + assert(p); + + i32 object_index = mqueue_open(p, 4, 16); + assert(object_index >= 0); + i32 handle = cap_open_handle(p, (u16) object_index, CAP_TYPE_MQUEUE, + CAP_RIGHT_READ | CAP_RIGHT_WRITE, 6, true); + assert(handle == 6); + + i64 token = cap_get_token(p, handle, CAP_TYPE_MQUEUE); + assert(token > 0); + assert(cap_drop_token(p, (u64) token) == 0); + assert(!cap_slot_read(p, handle).valid); + assert(mqueue_close(object_index) == -(i32) EBADF); + + proc_free(p); + return 0; +} +DEFINE_SELFTEST(cap_mqueue_token_drop, selftest_cap_mqueue_token_drop); diff --git a/tests/tests-loader.c b/tests/tests-loader.c index 461ea75..577d5bf 100644 --- a/tests/tests-loader.c +++ b/tests/tests-loader.c @@ -8,7 +8,7 @@ static i32 selftest_elf_load(void) u8 garbage[64] = {0}; struct byte_view bv = byte_view_new(garbage, sizeof(garbage)); struct proc *p = proc_alloc(); - assert(p != NULL); + assert(p); struct result r = proc_load_elf(p, bv); assert(r.is_error); /* should fail: bad magic */ proc_free(p); diff --git a/tests/tests-pipe.c b/tests/tests-pipe.c index defbd95..60356bb 100644 --- a/tests/tests-pipe.c +++ b/tests/tests-pipe.c @@ -18,7 +18,7 @@ static bool pipe_mem_eq(const void *a, const void *b, sz n) static i32 selftest_pipe_alloc(void) { struct pipe *p = pipe_alloc(); - assert(p != NULL); + assert(p); assert(p->readers == 1); assert(p->writers == 1); assert(pipe_used(p) == 0); @@ -33,7 +33,7 @@ DEFINE_SELFTEST(pipe_alloc, selftest_pipe_alloc); static i32 selftest_pipe_write_read(void) { struct pipe *p = pipe_alloc(); - assert(p != NULL); + assert(p); const char msg[] = "hello pipe"; i64 written = pipe_write(p, msg, 
sizeof(msg)); @@ -54,7 +54,7 @@ DEFINE_SELFTEST(pipe_write_read, selftest_pipe_write_read); static i32 selftest_pipe_eof(void) { struct pipe *p = pipe_alloc(); - assert(p != NULL); + assert(p); /* Write some data first. */ const char msg[] = "data"; @@ -83,7 +83,7 @@ DEFINE_SELFTEST(pipe_eof, selftest_pipe_eof); static i32 selftest_pipe_epipe(void) { struct pipe *p = pipe_alloc(); - assert(p != NULL); + assert(p); pipe_close_read(p); @@ -100,7 +100,7 @@ DEFINE_SELFTEST(pipe_epipe, selftest_pipe_epipe); static i32 selftest_pipe_full(void) { struct pipe *p = pipe_alloc(); - assert(p != NULL); + assert(p); /* Fill the pipe completely. */ u8 wbuf[256]; @@ -141,7 +141,7 @@ DEFINE_SELFTEST(pipe_full, selftest_pipe_full); static i32 selftest_pipe_multi_write(void) { struct pipe *p = pipe_alloc(); - assert(p != NULL); + assert(p); /* Three writes. */ i64 w1 = pipe_write(p, "AAA", 3); @@ -167,7 +167,7 @@ DEFINE_SELFTEST(pipe_multi_write, selftest_pipe_multi_write); static i32 selftest_pipe_zero_len(void) { struct pipe *p = pipe_alloc(); - assert(p != NULL); + assert(p); char buf[4]; i64 r = pipe_read(p, buf, 0); @@ -183,22 +183,16 @@ DEFINE_SELFTEST(pipe_zero_len, selftest_pipe_zero_len); static i32 selftest_pipe_fd_install(void) { struct proc *pr = proc_alloc(); - assert(pr != NULL); + assert(pr); struct pipe *p = pipe_alloc(); - assert(p != NULL); + assert(p); /* Install read end at FD 3, write end at FD 4. */ - u64 fd_flags = proc_fd_lock_irqsave(pr); - pr->fd_table[3].is_open = true; - pr->fd_table[3].is_pipe = true; - pr->fd_table[3].pipe_read_end = true; - pr->fd_table[3].pipe = p; - pr->fd_table[4].is_open = true; - pr->fd_table[4].is_pipe = true; - pr->fd_table[4].pipe_read_end = false; - pr->fd_table[4].pipe = p; - proc_fd_unlock_irqrestore(pr, fd_flags); + assert(cap_open_pipe(pr, p, true, CAP_RIGHT_READ | CAP_RIGHT_GRANT, 3, + true) == 3); + assert(cap_open_pipe(pr, p, false, CAP_RIGHT_WRITE | CAP_RIGHT_GRANT, 4, + true) == 4); /* Write via pipe, read via pipe. 
*/ const char msg[] = "fd-test"; @@ -223,7 +217,7 @@ DEFINE_SELFTEST(pipe_fd_install, selftest_pipe_fd_install); static i32 selftest_pipe_wraparound(void) { struct pipe *p = pipe_alloc(); - assert(p != NULL); + assert(p); /* Fill 3/4 of the buffer, read it all, then write again to wrap. */ u8 fill[256]; @@ -286,7 +280,7 @@ DEFINE_SELFTEST(pipe_wraparound, selftest_pipe_wraparound); static i32 selftest_pipe_dup_refcount(void) { struct pipe *p = pipe_alloc(); - assert(p != NULL); + assert(p); assert(p->readers == 1); assert(p->writers == 1); diff --git a/tests/tests-posix_timer.c b/tests/tests-posix_timer.c index e471e81..ca88acd 100644 --- a/tests/tests-posix_timer.c +++ b/tests/tests-posix_timer.c @@ -33,17 +33,22 @@ static i32 test_posix_timer_owner_isolation(void) { struct proc *owner = proc_alloc(); struct proc *other = proc_alloc(); - SELFTEST_ASSERT(owner != NULL, 1); - SELFTEST_ASSERT(other != NULL, 2); + SELFTEST_ASSERT(owner, 1); + SELFTEST_ASSERT(other, 2); - i32 h = posix_timer_create(owner); + i32 object_index = posix_timer_alloc(owner); + SELFTEST_ASSERT(object_index >= 0, 3); + i32 h = cap_open_timer(owner, (u16) object_index, + CAP_RIGHT_READ | CAP_RIGHT_WRITE, 3, true); SELFTEST_ASSERT(h >= 0, 3); - SELFTEST_ASSERT(posix_timer_settime(h, other, 10, 0, 0) == -(i32) EPERM, 4); - SELFTEST_ASSERT(posix_timer_gettime(h, other) == -(i64) EPERM, 5); - SELFTEST_ASSERT(posix_timer_getoverrun(h, other) == -(i64) EPERM, 6); - SELFTEST_ASSERT(posix_timer_delete(h, other) == -(i32) EPERM, 7); - SELFTEST_ASSERT(posix_timer_delete(h, owner) == 0, 8); + SELFTEST_ASSERT(cap_lookup_timer(other, h, CAP_RIGHT_WRITE).ptr == NULL, 4); + SELFTEST_ASSERT(cap_lookup_timer(other, h, CAP_RIGHT_READ).ptr == NULL, 5); + + struct cap_slot_view slot = cap_slot_read(owner, h); + SELFTEST_ASSERT(slot.valid, 6); + SELFTEST_ASSERT(slot.type == CAP_TYPE_TIMER, 7); + SELFTEST_ASSERT(cap_drop_token(owner, cap_make_handle(&slot)) == 0, 8); proc_free(owner); proc_free(other); @@ -54,26 
+59,31 @@ DEFINE_SELFTEST(posix_timer_owner_isolation, test_posix_timer_owner_isolation); static i32 test_posix_timer_teardown_on_proc_exit(void) { struct proc *owner = proc_alloc(); - SELFTEST_ASSERT(owner != NULL, 1); + SELFTEST_ASSERT(owner, 1); proc_set_state(owner, PROC_STATE_RUNNING); - i32 h = posix_timer_create(owner); - SELFTEST_ASSERT(h >= 0, 2); - SELFTEST_ASSERT(posix_timer_settime(h, owner, 50, 0, 0) == 0, 3); - SELFTEST_ASSERT(timer_pool[h].in_use, 4); - SELFTEST_ASSERT(timer_pool[h].owner == owner, 5); + i32 object_index = posix_timer_alloc(owner); + SELFTEST_ASSERT(object_index >= 0, 2); + i32 h = cap_open_timer(owner, (u16) object_index, + CAP_RIGHT_READ | CAP_RIGHT_WRITE, 3, true); + SELFTEST_ASSERT(h >= 0, 3); + SELFTEST_ASSERT(posix_timer_settime_idx((u16) object_index, 50, 0, 0) == 0, + 4); + SELFTEST_ASSERT(timer_pool[object_index].in_use, 5); + SELFTEST_ASSERT(timer_pool[object_index].owner == owner, 6); proc_exit(owner, 0); - SELFTEST_ASSERT(!timer_pool[h].in_use, 6); - SELFTEST_ASSERT(!timer_pool[h].armed, 7); - SELFTEST_ASSERT(timer_pool[h].owner == NULL, 8); - SELFTEST_ASSERT(timer_pool[h].owner_generation == 0, 9); + SELFTEST_ASSERT(!timer_pool[object_index].in_use, 7); + SELFTEST_ASSERT(!timer_pool[object_index].armed, 8); + SELFTEST_ASSERT(!timer_pool[object_index].owner, 9); + SELFTEST_ASSERT(timer_pool[object_index].owner_generation == 0, 10); struct proc *reused = proc_alloc(); - SELFTEST_ASSERT(reused != NULL, 10); - SELFTEST_ASSERT(posix_timer_settime(h, reused, 10, 0, 0) == -(i32) EINVAL, - 11); + SELFTEST_ASSERT(reused, 11); + SELFTEST_ASSERT( + posix_timer_settime_idx((u16) object_index, 10, 0, 0) == -(i32) EINVAL, + 12); proc_free(reused); return 0; } @@ -83,9 +93,9 @@ DEFINE_SELFTEST(posix_timer_teardown_on_proc_exit, static i32 test_posix_timer_stale_owner_generation(void) { struct proc *owner = proc_alloc(); - SELFTEST_ASSERT(owner != NULL, 1); + SELFTEST_ASSERT(owner, 1); - i32 h = posix_timer_create(owner); + i32 h = 
posix_timer_alloc(owner); SELFTEST_ASSERT(h >= 0, 2); struct posix_timer *t = &timer_pool[h]; @@ -95,7 +105,7 @@ static i32 test_posix_timer_stale_owner_generation(void) proc_free(owner); struct proc *reused = proc_alloc(); - SELFTEST_ASSERT(reused != NULL, 4); + SELFTEST_ASSERT(reused, 4); SELFTEST_ASSERT(reused == owner, 5); proc_set_state(reused, PROC_STATE_RUNNING); /* No task is attached to the reused proc here, so the per-thread @@ -125,22 +135,23 @@ DEFINE_SELFTEST(posix_timer_stale_owner_generation, static i32 test_posix_timer_rejects_exited_thread_target(void) { struct proc *owner = proc_alloc(); - SELFTEST_ASSERT(owner != NULL, 1); + SELFTEST_ASSERT(owner, 1); proc_set_state(owner, PROC_STATE_RUNNING); struct sched_task *target = alloc_mock_task(); - SELFTEST_ASSERT(target != NULL, 2); + SELFTEST_ASSERT(target, 2); target->proc = owner; target->id = 7; target->td_join_state = TD_JOIN_EXITED; SELFTEST_ASSERT(attach_mock_task(owner, target), 3); - i32 h = posix_timer_create(owner); + i32 h = posix_timer_alloc(owner); SELFTEST_ASSERT(h >= 0, 4); SELFTEST_ASSERT( - posix_timer_settime(h, owner, 10, 0, target->id) == -(i32) ESRCH, 5); + posix_timer_settime_idx((u16) h, 10, 0, target->id) == -(i32) ESRCH, 5); SELFTEST_ASSERT(timer_pool[h].target_tid == 0, 6); - SELFTEST_ASSERT(posix_timer_delete(h, owner) == 0, 7); + posix_timer_put_idx((u16) h); + SELFTEST_ASSERT(!timer_pool[h].in_use, 7); u64 flags = proc_table_lock_irqsave(); proc_detach_task(owner, target); diff --git a/tests/tests-proc.c b/tests/tests-proc.c index 54e49d5..da5da4e 100644 --- a/tests/tests-proc.c +++ b/tests/tests-proc.c @@ -6,7 +6,7 @@ static i32 selftest_proc_alloc_free(void) { struct proc *p = proc_alloc(); - assert(p != NULL); + assert(p); assert(p->magic == PROC_MAGIC); assert(p->state == PROC_STATE_EMBRYO); assert(p->pid > 0); @@ -22,19 +22,19 @@ static i32 selftest_proc_alloc_free(void) } DEFINE_SELFTEST(proc_alloc_free, selftest_proc_alloc_free); -static i32 
selftest_proc_fd_table(void) +static i32 selftest_proc_cap_fd_slots(void) { struct proc *p = proc_alloc(); - assert(p != NULL); - assert(p->fd_table[PROC_FD_STDIN].is_open); - assert(p->fd_table[PROC_FD_STDOUT].is_open); - assert(p->fd_table[PROC_FD_STDERR].is_open); + assert(p); + assert(cap_fd_is_valid(p, PROC_FD_STDIN)); + assert(cap_fd_is_valid(p, PROC_FD_STDOUT)); + assert(cap_fd_is_valid(p, PROC_FD_STDERR)); for (sz i = PROC_FD_STDERR + 1; i < PROC_FD_MAX; i++) - assert(!p->fd_table[i].is_open); + assert(!cap_fd_is_valid(p, (i32) i)); proc_free(p); return 0; } -DEFINE_SELFTEST(proc_fd_table, selftest_proc_fd_table); +DEFINE_SELFTEST(proc_cap_fd_slots, selftest_proc_cap_fd_slots); static i32 selftest_proc_pid_wrap_collision(void) { @@ -43,8 +43,8 @@ static i32 selftest_proc_pid_wrap_collision(void) struct proc *p1 = proc_alloc(); struct proc *p2 = proc_alloc(); - assert(p1 != NULL); - assert(p2 != NULL); + assert(p1); + assert(p2); assert(p1->pid == U16_MAX); assert(p2->pid == 1); assert(p1->pid != p2->pid); @@ -59,7 +59,7 @@ DEFINE_SELFTEST(proc_pid_wrap_collision, selftest_proc_pid_wrap_collision); static i32 selftest_proc_vma_tracking(void) { struct proc *p = proc_alloc(); - assert(p != NULL); + assert(p); assert(p->n_vmas == 0); /* Register a code VMA at USER_CODE_BASE, 2 pages. 
*/ @@ -97,7 +97,7 @@ DEFINE_SELFTEST(proc_vma_tracking, selftest_proc_vma_tracking); static i32 selftest_proc_state_machine(void) { struct proc *p = proc_alloc(); - assert(p != NULL); + assert(p); assert(p->state == PROC_STATE_EMBRYO); /* EMBRYO -> RUNNING */ @@ -119,7 +119,7 @@ DEFINE_SELFTEST(proc_state_machine, selftest_proc_state_machine); static i32 selftest_proc_sleeping_transition(void) { struct proc *p = proc_alloc(); - assert(p != NULL); + assert(p); proc_set_state(p, PROC_STATE_RUNNING); proc_set_state(p, PROC_STATE_SLEEPING); @@ -141,23 +141,25 @@ static i32 selftest_proc_reparent_orphan(void) u16 save_next = next_pid; next_pid = 1; struct proc *init = proc_alloc(); - assert(init != NULL); + assert(init); assert(init->pid == 1); proc_set_state(init, PROC_STATE_RUNNING); /* Allocate parent and two children. */ struct proc *parent = proc_alloc(); - assert(parent != NULL); + assert(parent); proc_set_state(parent, PROC_STATE_RUNNING); struct proc *c1 = proc_alloc(); - assert(c1 != NULL); + assert(c1); c1->parent_pid = parent->pid; + c1->parent_generation = parent->generation; proc_set_state(c1, PROC_STATE_RUNNING); struct proc *c2 = proc_alloc(); - assert(c2 != NULL); + assert(c2); c2->parent_pid = parent->pid; + c2->parent_generation = parent->generation; proc_set_state(c2, PROC_STATE_RUNNING); assert(proc_count_children(parent->pid) == 2); @@ -189,16 +191,17 @@ static i32 selftest_proc_reparent_zombie(void) u16 save_next = next_pid; next_pid = 1; struct proc *init = proc_alloc(); - assert(init != NULL && init->pid == 1); + assert(init && init->pid == 1); proc_set_state(init, PROC_STATE_RUNNING); struct proc *parent = proc_alloc(); - assert(parent != NULL); + assert(parent); proc_set_state(parent, PROC_STATE_RUNNING); struct proc *child = proc_alloc(); - assert(child != NULL); + assert(child); child->parent_pid = parent->pid; + child->parent_generation = parent->generation; proc_set_state(child, PROC_STATE_RUNNING); proc_set_state(child, PROC_STATE_ZOMBIE); 
/* child died first */ u16 child_pid = child->pid; @@ -228,14 +231,15 @@ DEFINE_SELFTEST(proc_reparent_zombie, selftest_proc_reparent_zombie); static i32 selftest_proc_multi_child_wait(void) { struct proc *parent = proc_alloc(); - assert(parent != NULL); + assert(parent); proc_set_state(parent, PROC_STATE_RUNNING); struct proc *children[3]; for (sz i = 0; i < 3; i++) { children[i] = proc_alloc(); - assert(children[i] != NULL); + assert(children[i]); children[i]->parent_pid = parent->pid; + children[i]->parent_generation = parent->generation; proc_set_state(children[i], PROC_STATE_RUNNING); } @@ -269,12 +273,13 @@ DEFINE_SELFTEST(proc_multi_child_wait, selftest_proc_multi_child_wait); static i32 selftest_proc_exit_lifecycle(void) { struct proc *parent = proc_alloc(); - assert(parent != NULL); + assert(parent); proc_set_state(parent, PROC_STATE_RUNNING); struct proc *child = proc_alloc(); - assert(child != NULL); + assert(child); child->parent_pid = parent->pid; + child->parent_generation = parent->generation; proc_set_state(child, PROC_STATE_RUNNING); u16 child_pid = child->pid; @@ -309,12 +314,13 @@ DEFINE_SELFTEST(proc_exit_lifecycle, selftest_proc_exit_lifecycle); static i32 selftest_proc_slot_reuse(void) { struct proc *parent = proc_alloc(); - assert(parent != NULL); + assert(parent); proc_set_state(parent, PROC_STATE_RUNNING); struct proc *child = proc_alloc(); - assert(child != NULL); + assert(child); child->parent_pid = parent->pid; + child->parent_generation = parent->generation; proc_set_state(child, PROC_STATE_RUNNING); u32 gen = child->generation; @@ -331,7 +337,7 @@ static i32 selftest_proc_slot_reuse(void) /* Can allocate again in that slot. */ struct proc *reused = proc_alloc(); - assert(reused != NULL); + assert(reused); /* The allocator may or may not give us the same slot, but the table * should have capacity. 
*/ @@ -347,20 +353,22 @@ DEFINE_SELFTEST(proc_slot_reuse, selftest_proc_slot_reuse); static i32 selftest_proc_count_children(void) { struct proc *parent = proc_alloc(); - assert(parent != NULL); + assert(parent); proc_set_state(parent, PROC_STATE_RUNNING); assert(proc_count_children(parent->pid) == 0); struct proc *c1 = proc_alloc(); - assert(c1 != NULL); + assert(c1); c1->parent_pid = parent->pid; + c1->parent_generation = parent->generation; proc_set_state(c1, PROC_STATE_RUNNING); assert(proc_count_children(parent->pid) == 1); struct proc *c2 = proc_alloc(); - assert(c2 != NULL); + assert(c2); c2->parent_pid = parent->pid; + c2->parent_generation = parent->generation; proc_set_state(c2, PROC_STATE_RUNNING); assert(proc_count_children(parent->pid) == 2); @@ -395,7 +403,7 @@ DEFINE_SELFTEST(proc_count_children, selftest_proc_count_children); static i32 selftest_proc_task_list(void) { struct proc *p = proc_alloc(); - assert(p != NULL); + assert(p); assert(p->n_tasks == 0); assert(proc_thread_group_leader(p) == NULL); /* Transition to RUNNING so proc_reserve_thread_slot accepts the diff --git a/tests/tests-pse51.c b/tests/tests-pse51.c index 282fec6..97cee36 100644 --- a/tests/tests-pse51.c +++ b/tests/tests-pse51.c @@ -57,7 +57,7 @@ static i32 test_pse51_mutex(void) i32 h = sync_mutex_alloc(NULL); SELFTEST_ASSERT(h >= 0, 1); - struct pi_mutex *m = sync_mutex_get(h, NULL); + struct pi_mutex *m = sync_mutex_get(h); SELFTEST_ASSERT(m != NULL, 2); pi_mutex_lock(m); @@ -77,7 +77,7 @@ static i32 test_pse51_sem(void) i32 h = sync_sem_alloc(NULL, 1); SELFTEST_ASSERT(h >= 0, 1); - struct semaphore *s = sync_sem_get(h, NULL); + struct semaphore *s = sync_sem_get(h); SELFTEST_ASSERT(s != NULL, 2); sem_wait(s); diff --git a/tests/tests-spawn.c b/tests/tests-spawn.c index cc0b40b..7fde27a 100644 --- a/tests/tests-spawn.c +++ b/tests/tests-spawn.c @@ -1,4 +1,5 @@ /* SPDX-License-Identifier: MIT */ +#include #include #include #include @@ -18,7 +19,7 @@ static bool 
spawn_test_vfs_available(void) static i32 selftest_spawn_fa_close(void) { struct proc *p = proc_alloc(); - assert(p != NULL); + assert(p); /* FDs 0/1/2 are open by default. Close FD 1 (stdout). */ struct spawn_file_action fa = { @@ -27,11 +28,11 @@ static i32 selftest_spawn_fa_close(void) }; i32 rc = spawn_apply_file_actions(p, &fa, 1); assert(rc == 0); - assert(!p->fd_table[PROC_FD_STDOUT].is_open); + assert(!cap_fd_is_valid(p, PROC_FD_STDOUT)); /* FDs 0 and 2 should still be open. */ - assert(p->fd_table[PROC_FD_STDIN].is_open); - assert(p->fd_table[PROC_FD_STDERR].is_open); + assert(cap_fd_is_valid(p, PROC_FD_STDIN)); + assert(cap_fd_is_valid(p, PROC_FD_STDERR)); proc_free(p); return 0; @@ -42,7 +43,7 @@ DEFINE_SELFTEST(spawn_fa_close, selftest_spawn_fa_close); static i32 selftest_spawn_fa_dup2(void) { struct proc *p = proc_alloc(); - assert(p != NULL); + assert(p); /* Dup FD 0 (stdin) to FD 5. */ struct spawn_file_action fa = { @@ -52,8 +53,8 @@ static i32 selftest_spawn_fa_dup2(void) }; i32 rc = spawn_apply_file_actions(p, &fa, 1); assert(rc == 0); - assert(p->fd_table[5].is_open); - assert(p->fd_table[5].is_dup); + assert(cap_fd_is_valid(p, 5)); + assert(cap_fd_is_valid(p, PROC_FD_STDIN)); proc_free(p); return 0; @@ -67,7 +68,7 @@ static i32 selftest_spawn_fa_open(void) return 0; /* skip gracefully */ struct proc *p = proc_alloc(); - assert(p != NULL); + assert(p); char path[] = "/hello.txt"; struct spawn_file_action fa = { @@ -78,9 +79,8 @@ static i32 selftest_spawn_fa_open(void) }; i32 rc = spawn_apply_file_actions(p, &fa, 1); assert(rc == 0); - assert(p->fd_table[5].is_open); - assert(p->fd_table[5].is_seekable); - assert(!p->fd_table[5].is_dup); + assert(cap_fd_is_valid(p, 5)); + assert(cap_fd_is_seekable(p, 5)); proc_free(p); return 0; @@ -94,7 +94,7 @@ static i32 selftest_spawn_fa_open_replace(void) return 0; struct proc *p = proc_alloc(); - assert(p != NULL); + assert(p); char path[] = "/hello.txt"; struct spawn_file_action fa = { @@ -105,8 +105,8 @@ 
static i32 selftest_spawn_fa_open_replace(void) }; i32 rc = spawn_apply_file_actions(p, &fa, 1); assert(rc == 0); - assert(p->fd_table[PROC_FD_STDIN].is_open); - assert(p->fd_table[PROC_FD_STDIN].is_seekable); + assert(cap_fd_is_valid(p, PROC_FD_STDIN)); + assert(cap_fd_is_seekable(p, PROC_FD_STDIN)); proc_free(p); return 0; @@ -120,7 +120,7 @@ static i32 selftest_spawn_fa_multi(void) return 0; struct proc *p = proc_alloc(); - assert(p != NULL); + assert(p); char path[] = "/hello.txt"; struct spawn_file_action fas[3] = { @@ -137,10 +137,9 @@ static i32 selftest_spawn_fa_multi(void) i32 rc = spawn_apply_file_actions(p, fas, 3); assert(rc == 0); - assert(!p->fd_table[PROC_FD_STDERR].is_open); - assert(p->fd_table[3].is_open); - assert(p->fd_table[4].is_open); - assert(p->fd_table[4].is_dup); + assert(!cap_fd_is_valid(p, PROC_FD_STDERR)); + assert(cap_fd_is_valid(p, 3)); + assert(cap_fd_is_valid(p, 4)); proc_free(p); return 0; @@ -151,7 +150,7 @@ DEFINE_SELFTEST(spawn_fa_multi, selftest_spawn_fa_multi); static i32 selftest_spawn_fa_invalid_type(void) { struct proc *p = proc_alloc(); - assert(p != NULL); + assert(p); struct spawn_file_action fa = { .type = 99, @@ -169,7 +168,7 @@ DEFINE_SELFTEST(spawn_fa_invalid_type, selftest_spawn_fa_invalid_type); static i32 selftest_spawn_fa_bad_fd(void) { struct proc *p = proc_alloc(); - assert(p != NULL); + assert(p); struct spawn_file_action fa = { .type = SPAWN_FA_CLOSE, @@ -187,7 +186,7 @@ DEFINE_SELFTEST(spawn_fa_bad_fd, selftest_spawn_fa_bad_fd); static i32 selftest_spawn_fa_too_many(void) { struct proc *p = proc_alloc(); - assert(p != NULL); + assert(p); struct spawn_file_action fas[1]; /* dummy, count matters */ i32 rc = spawn_apply_file_actions(p, fas, SPAWN_FA_MAX + 1); @@ -252,21 +251,91 @@ static i32 selftest_spawn_attr_noop(void) } DEFINE_SELFTEST(spawn_attr_noop, selftest_spawn_attr_noop); +/* File actions run in caller order. dup2(stdin,4) followed by close(stdin) + * must leave fd 4 open while closing only the source slot.
+ */ +static i32 selftest_spawn_fa_dup2_then_close_order(void) +{ + struct proc *p = proc_alloc(); + assert(p); + + struct spawn_file_action fas[2] = { + {.type = SPAWN_FA_DUP2, .fd = PROC_FD_STDIN, .newfd = 4}, + {.type = SPAWN_FA_CLOSE, .fd = PROC_FD_STDIN}, + }; + + i32 rc = spawn_apply_file_actions(p, fas, 2); + assert(rc == 0); + assert(cap_fd_is_valid(p, 4)); + assert(!cap_fd_is_valid(p, PROC_FD_STDIN)); + + proc_free(p); + return 0; +} +DEFINE_SELFTEST(spawn_fa_dup2_then_close_order, + selftest_spawn_fa_dup2_then_close_order); + +/* File actions must observe earlier child-side dup2 mutations. */ +static i32 selftest_spawn_fa_dup2_chain_order(void) +{ + struct proc *p = proc_alloc(); + assert(p); + + struct spawn_file_action fas[2] = { + {.type = SPAWN_FA_DUP2, .fd = PROC_FD_STDIN, .newfd = 3}, + {.type = SPAWN_FA_DUP2, .fd = 3, .newfd = 4}, + }; + + i32 rc = spawn_apply_file_actions(p, fas, 2); + assert(rc == 0); + assert(cap_fd_is_valid(p, 3)); + assert(cap_fd_is_valid(p, 4)); + + proc_free(p); + return 0; +} +DEFINE_SELFTEST(spawn_fa_dup2_chain_order, selftest_spawn_fa_dup2_chain_order); + /* Test zero file actions is a no-op. */ static i32 selftest_spawn_fa_zero(void) { struct proc *p = proc_alloc(); - assert(p != NULL); + assert(p); i32 rc = spawn_apply_file_actions(p, NULL, 0); assert(rc == 0); /* All default FDs should still be open. 
*/ - assert(p->fd_table[PROC_FD_STDIN].is_open); - assert(p->fd_table[PROC_FD_STDOUT].is_open); - assert(p->fd_table[PROC_FD_STDERR].is_open); + assert(cap_fd_is_valid(p, PROC_FD_STDIN)); + assert(cap_fd_is_valid(p, PROC_FD_STDOUT)); + assert(cap_fd_is_valid(p, PROC_FD_STDERR)); proc_free(p); return 0; } DEFINE_SELFTEST(spawn_fa_zero, selftest_spawn_fa_zero); + +static i32 selftest_spawn_fa_open_grants_inherit(void) +{ + if (!spawn_test_vfs_available()) + return 0; + + struct proc *p = proc_alloc(); + assert(p); + + char path[] = "/hello.txt"; + struct spawn_file_action fa = { + .type = SPAWN_FA_OPEN, + .fd = 5, + .pathlen = 10, + .path = (u64) (uptr) path, + }; + i32 rc = spawn_apply_file_actions(p, &fa, 1); + assert(rc == 0); + assert(cap_fd_has_rights(p, 5, CAP_RIGHT_GRANT)); + + proc_free(p); + return 0; +} +DEFINE_SELFTEST(spawn_fa_open_grants_inherit, + selftest_spawn_fa_open_grants_inherit); diff --git a/tests/tests-syscall.c b/tests/tests-syscall.c index b39ec61..b7aab3d 100644 --- a/tests/tests-syscall.c +++ b/tests/tests-syscall.c @@ -1,7 +1,11 @@ /* SPDX-License-Identifier: MIT */ +#include +#include +#include #include #include #include +#include #include "../kernel/sync/futex.h" static struct proc *alloc_running_proc(void) @@ -20,6 +24,7 @@ static struct sched_task *alloc_mock_task(void) return NULL; struct sched_task *td = byte_array_ptr(option_byte_array_checked(td_mem)); memset(td, 0, sizeof(*td)); + td->td_cap_slot = -1; return td; } @@ -28,6 +33,29 @@ static void free_mock_task(struct sched_task *td) kvalloc_free(byte_array_new((byte *) td, sizeof(*td))); } +static bool syscall_test_vfs_available(void) +{ + struct result_vfs_file f = vfs_open(STR("/hello.txt")); + if (f.is_error) + return false; + struct vfs_file vf = result_vfs_file_checked(f); + vfs_close(&vf); + return true; +} + +static inline i32 syscall_test_thread_cap_slot(u8 task_slot) +{ + return CAP_SPACE_SLOTS - PROC_THREAD_MAX + (i32) task_slot; +} + +static i64 
syscall_test_thread_token(struct proc *p, struct sched_task *td) +{ + assert(p); + assert(td); + assert(td->td_cap_slot >= 0); + return cap_get_token(p, td->td_cap_slot, CAP_TYPE_THREAD); +} + /* Allocate a RUNNING proc + mock task, linked together. */ static bool alloc_proc_and_task(struct proc **out_p, struct sched_task **out_td) { @@ -52,6 +80,20 @@ static bool alloc_proc_and_task(struct proc **out_p, struct sched_task **out_td) return false; } } + u8 thread_slot = proc_task_slot(p, td); + i32 thread_handle = cap_open_handle( + p, thread_slot, CAP_TYPE_THREAD, CAP_RIGHT_READ | CAP_RIGHT_WRITE, + syscall_test_thread_cap_slot(thread_slot), true); + if (thread_handle < 0) { + u64 pflags = proc_table_lock_irqsave(); + (void) proc_reap_exited_thread_locked(p, td); + proc_table_unlock_irqrestore(pflags); + free_mock_task(td); + proc_set_state(p, PROC_STATE_ZOMBIE); + proc_free(p); + return false; + } + td->td_cap_slot = (i16) thread_handle; *out_p = p; *out_td = td; return true; @@ -76,23 +118,19 @@ static bool attach_mock_thread(struct proc *p, struct sched_task *target) flags = proc_table_lock_irqsave(); ok = proc_attach_task_slot(p, target, slot); proc_table_unlock_irqrestore(flags); - return ok; -} - -static bool proc_has_thread_tid(struct proc *p, u16 tid) -{ - bool found = false; - - u64 flags = proc_table_lock_irqsave(); - for (u8 i = 0; i < PROC_THREAD_MAX; i++) { - struct sched_task *task = p->tasks[i]; - if (task && task->id == tid) { - found = true; - break; - } + if (!ok) + return false; + i32 thread_handle = cap_open_handle( + p, slot, CAP_TYPE_THREAD, CAP_RIGHT_READ | CAP_RIGHT_WRITE, + syscall_test_thread_cap_slot(slot), true); + if (thread_handle < 0) { + flags = proc_table_lock_irqsave(); + (void) proc_reap_exited_thread_locked(p, target); + proc_table_unlock_irqrestore(flags); + return false; } - proc_table_unlock_irqrestore(flags); - return found; + target->td_cap_slot = (i16) thread_handle; + return ok; } static i32 selftest_sys_open_emfile(void) 
@@ -102,21 +140,106 @@ static i32 selftest_sys_open_emfile(void) assert(alloc_proc_and_task(&p, &td)); /* Fill all FD slots so sys_open returns EMFILE. */ - for (sz i = PROC_FD_STDERR + 1; i < PROC_FD_MAX; i++) - p->fd_table[i].is_open = true; + for (sz i = PROC_FD_STDERR + 1; i < CAP_SPACE_SLOTS - PROC_THREAD_MAX; i++) + assert(cap_dup_fd(p, PROC_FD_STDOUT, (i32) i, true) == (i32) i); struct trap_frame tf = {0}; tf.a0 = USER_CODE_BASE; tf.a1 = 4; assert(sys_open(&tf, td) == -(i64) EMFILE); - for (sz i = PROC_FD_STDERR + 1; i < PROC_FD_MAX; i++) - p->fd_table[i].is_open = false; free_proc_and_task(p, td); return 0; } DEFINE_SELFTEST(sys_open_emfile, selftest_sys_open_emfile); +static i32 selftest_sys_open_mints_non_grant_fd(void) +{ + if (!syscall_test_vfs_available()) + return 0; + + struct proc *p; + struct sched_task *td; + assert(alloc_proc_and_task(&p, &td)); + + const char path[] = "/hello.txt"; + const vaddr_t va = USER_DATA_BASE + (137UL * PAGE_SIZE); + assert(proc_map_user_page(p, va, PT_FLAG_RW | PT_FLAG_USER).is_error == + false); + assert(copy_to_user(va, path, sizeof(path)) == 0); + + struct trap_frame tf = {0}; + tf.a0 = va; + tf.a1 = sizeof(path) - 1; + i64 fd = sys_open(&tf, td); + assert(fd >= 0); + assert(!cap_fd_has_rights(p, (i32) fd, CAP_RIGHT_GRANT)); + + free_proc_and_task(p, td); + return 0; +} +DEFINE_SELFTEST(sys_open_mints_non_grant_fd, + selftest_sys_open_mints_non_grant_fd); + +static i32 selftest_sys_mq_open_emfile_rollback(void) +{ + struct proc *p; + struct sched_task *td; + assert(alloc_proc_and_task(&p, &td)); + + for (i32 fd = PROC_FD_STDERR + 1; fd < CAP_SPACE_SLOTS - PROC_THREAD_MAX; + fd++) + assert(cap_dup_fd(p, PROC_FD_STDOUT, fd, true) == fd); + + struct trap_frame tf = {0}; + tf.a0 = 4; + tf.a1 = 8; + assert(sys_mq_open(&tf, td) == -(i64) EMFILE); + + i32 handles[MQ_MAX_QUEUES]; + for (i32 i = 0; i < MQ_MAX_QUEUES; i++) { + handles[i] = mqueue_open(NULL, 4, 8); + assert(handles[i] >= 0); + } + assert(mqueue_open(NULL, 4, 8) 
== -(i32) EAGAIN); + for (i32 i = 0; i < MQ_MAX_QUEUES; i++) + assert(mqueue_close(handles[i]) == 0); + + free_proc_and_task(p, td); + return 0; +} +DEFINE_SELFTEST(sys_mq_open_emfile_rollback, + selftest_sys_mq_open_emfile_rollback); + +static i32 selftest_sys_mutex_init_emfile_rollback(void) +{ + struct proc *p; + struct sched_task *td; + assert(alloc_proc_and_task(&p, &td)); + + for (i32 fd = PROC_FD_STDERR + 1; fd < CAP_SPACE_SLOTS - PROC_THREAD_MAX; + fd++) + assert(cap_dup_fd(p, PROC_FD_STDOUT, fd, true) == fd); + + struct trap_frame tf = {0}; + tf.a7 = SYS_MUTEX_INIT; + assert(syscall_dispatch(&tf, td) == -(i64) EMFILE); + + i32 handles[SYNC_MAX_MUTEXES]; + for (i32 i = 0; i < SYNC_MAX_MUTEXES; i++) { + handles[i] = sync_mutex_alloc(NULL); + assert(handles[i] >= 0); + } + assert(sync_mutex_alloc(NULL) == -(i32) EAGAIN); + for (i32 i = 0; i < SYNC_MAX_MUTEXES; i++) + sync_mutex_put_idx(handles[i]); + + free_proc_and_task(p, td); + return 0; +} +DEFINE_SELFTEST(sys_mutex_init_emfile_rollback, + selftest_sys_mutex_init_emfile_rollback); + static i32 selftest_sys_exit_frees_proc_slot(void) { struct proc *p; @@ -149,7 +272,7 @@ DEFINE_SELFTEST(syscall_enosys, selftest_syscall_enosys); static i32 selftest_syscall_needs_proc(void) { struct sched_task *td = alloc_mock_task(); - assert(td != NULL); + assert(td); td->proc = NULL; /* no process */ struct trap_frame tf = {0}; @@ -164,7 +287,7 @@ DEFINE_SELFTEST(syscall_needs_proc, selftest_syscall_needs_proc); static i32 selftest_syscall_needs_proc_new_handlers(void) { struct sched_task *td = alloc_mock_task(); - assert(td != NULL); + assert(td); td->proc = NULL; struct trap_frame tf = {0}; @@ -417,7 +540,7 @@ static i32 selftest_sys_dup(void) tf.a0 = PROC_FD_STDOUT; i64 newfd = syscall_dispatch(&tf, td); assert(newfd == 3); - assert(p->fd_table[3].is_open); + assert(cap_fd_is_valid(p, 3)); /* Dup again; should get FD 4. 
*/ tf.a0 = PROC_FD_STDOUT; @@ -458,7 +581,7 @@ static i32 selftest_sys_dup2(void) tf.a1 = 5; i64 rc = syscall_dispatch(&tf, td); assert(rc == 5); - assert(p->fd_table[5].is_open); + assert(cap_fd_is_valid(p, 5)); /* dup2(stdout, stdout) is a no-op, returns stdout. */ tf.a0 = PROC_FD_STDOUT; @@ -466,6 +589,17 @@ static i32 selftest_sys_dup2(void) rc = syscall_dispatch(&tf, td); assert(rc == PROC_FD_STDOUT); + /* dup2 to a reserved CAP_TYPE_THREAD slot must fail without + * destroying the live thread handle stored there. + */ + i64 thread_token = syscall_test_thread_token(p, td); + assert(thread_token > 0); + tf.a0 = PROC_FD_STDOUT; + tf.a1 = td->td_cap_slot; + rc = syscall_dispatch(&tf, td); + assert(rc == -(i64) EBADF); + assert(syscall_test_thread_token(p, td) == thread_token); + free_proc_and_task(p, td); return 0; } @@ -477,13 +611,17 @@ static i32 selftest_sys_lseek(void) struct sched_task *td; assert(alloc_proc_and_task(&p, &td)); - /* Set up a fake seekable FD without opening a real file. - * sys_lseek only touches fd_table[fd].offset / is_seekable. - */ + if (!syscall_test_vfs_available()) { + free_proc_and_task(p, td); + return 0; + } + i32 fd = 3; - p->fd_table[fd].is_open = true; - p->fd_table[fd].is_seekable = true; - p->fd_table[fd].offset = 0; + struct result_vfs_file fres = vfs_open(STR("/hello.txt")); + assert(!fres.is_error); + assert(cap_open_vfs(p, result_vfs_file_checked(fres), + CAP_RIGHT_READ | CAP_RIGHT_WRITE, true, fd, + true) == fd); /* SEEK_SET to position 10. */ struct trap_frame tf = {0}; @@ -492,7 +630,10 @@ static i32 selftest_sys_lseek(void) tf.a2 = SEEK_SET; i64 pos = sys_lseek(&tf, td); assert(pos == 10); - assert(p->fd_table[fd].offset == 10); + struct cap_ref ref = cap_lookup_fd(p, fd, 0); + assert(ref.ptr); + assert(((struct fd_pool_entry *) ref.ptr)->offset == 10); + cap_put_ref(&ref); /* SEEK_CUR +5 -> position 15. 
*/ tf.a1 = 5; @@ -511,8 +652,6 @@ static i32 selftest_sys_lseek(void) tf.a2 = SEEK_SET; assert(sys_lseek(&tf, td) == -(i64) ESPIPE); - /* Cleanup. */ - p->fd_table[fd].is_open = false; free_proc_and_task(p, td); return 0; } @@ -783,7 +922,7 @@ static i32 selftest_fsync_fdatasync(void) /* Closed but in-range FD -> EBADF. */ tf.a7 = SYS_FSYNC; tf.a0 = PROC_FD_STDERR + 1; - assert(!p->fd_table[PROC_FD_STDERR + 1].is_open); + assert(!cap_fd_is_valid(p, PROC_FD_STDERR + 1)); assert(syscall_dispatch(&tf, td) == -(i64) EBADF); /* Open FD (stdout) -> success. */ @@ -794,13 +933,15 @@ static i32 selftest_fsync_fdatasync(void) assert(syscall_dispatch(&tf, td) == 0); /* Pipe FD -> EINVAL (POSIX: fsync on a non-syncable file type). */ - p->fd_table[PROC_FD_STDIN].is_pipe = true; + struct pipe *pipe = pipe_alloc(); + assert(pipe); + assert(cap_open_pipe(p, pipe, true, CAP_RIGHT_READ | CAP_RIGHT_GRANT, + PROC_FD_STDIN, true) == PROC_FD_STDIN); tf.a7 = SYS_FSYNC; tf.a0 = PROC_FD_STDIN; assert(syscall_dispatch(&tf, td) == -(i64) EINVAL); tf.a7 = SYS_FDATASYNC; assert(syscall_dispatch(&tf, td) == -(i64) EINVAL); - p->fd_table[PROC_FD_STDIN].is_pipe = false; free_proc_and_task(p, td); return 0; @@ -1033,9 +1174,9 @@ static i32 selftest_thread_schedparam(void) tf.a1 = CONFIG_SCHED_NPRIO + 5; assert(syscall_dispatch(&tf, td) == -(i64) EINVAL); - /* Unknown TID -> ESRCH. */ + /* Unknown thread handle -> ESRCH. 
*/ tf.a7 = SYS_THREAD_GETSCHEDPARAM; - tf.a0 = (u64) U16_MAX; + tf.a0 = (u64) -1; assert(syscall_dispatch(&tf, td) == -(i64) ESRCH); /* sched_setscheduler / sched_getscheduler return SCHED_FIFO for @@ -1079,7 +1220,6 @@ static i32 selftest_thread_detach_states(void) */ struct sched_task *target = alloc_mock_task(); assert(target); - target->id = td->id + 1; target->td_join_state = TD_JOIN_JOINABLE; init_waitqueue_head(&target->td_join_wq); assert(attach_mock_thread(p, target)); @@ -1087,7 +1227,7 @@ static i32 selftest_thread_detach_states(void) /* Detach the JOINABLE target -> succeeds, state becomes DETACHED. */ struct trap_frame tf = {0}; tf.a7 = SYS_THREAD_DETACH; - tf.a0 = target->id; + tf.a0 = (u64) syscall_test_thread_token(p, target); assert(syscall_dispatch(&tf, td) == 0); assert(target->td_join_state == TD_JOIN_DETACHED); @@ -1096,16 +1236,16 @@ static i32 selftest_thread_detach_states(void) /* Join on a DETACHED thread -> EINVAL. */ tf.a7 = SYS_THREAD_JOIN; - tf.a0 = target->id; + tf.a0 = (u64) syscall_test_thread_token(p, target); tf.a1 = 0; assert(syscall_dispatch(&tf, td) == -(i64) EINVAL); /* Self-join -> EDEADLK. */ - tf.a0 = td->id; + tf.a0 = (u64) syscall_test_thread_token(p, td); assert(syscall_dispatch(&tf, td) == -(i64) EDEADLK); - /* Unknown TID -> ESRCH. */ - tf.a0 = (u64) U16_MAX - 1; + /* Unknown thread handle -> ESRCH. */ + tf.a0 = (u64) -1; assert(syscall_dispatch(&tf, td) == -(i64) ESRCH); /* Cleanup: detach the target before freeing the proc. 
*/ @@ -1127,7 +1267,6 @@ static i32 selftest_thread_join_efault_preserves_exited_target(void) struct sched_task *target = alloc_mock_task(); assert(target); target->proc = p; - target->id = td->id + 1; target->td_join_state = TD_JOIN_EXITED; target->td_exit_code = 42; init_waitqueue_head(&target->td_join_wq); @@ -1135,11 +1274,11 @@ static i32 selftest_thread_join_efault_preserves_exited_target(void) struct trap_frame tf = {0}; tf.a7 = SYS_THREAD_JOIN; - tf.a0 = target->id; + tf.a0 = (u64) syscall_test_thread_token(p, target); tf.a1 = USER_CODE_BASE - sizeof(i32); assert(syscall_dispatch(&tf, td) == -(i64) EFAULT); assert(target->td_join_state == TD_JOIN_EXITED); - assert(proc_has_thread_tid(p, target->id)); + assert(cap_slot_read(p, target->td_cap_slot).valid); const vaddr_t va = USER_DATA_BASE + (136UL * PAGE_SIZE); assert(proc_map_user_page(p, va, PT_FLAG_RW | PT_FLAG_USER).is_error == @@ -1149,7 +1288,7 @@ static i32 selftest_thread_join_efault_preserves_exited_target(void) i32 exit_code = 0; assert(copy_from_user(&exit_code, va, sizeof(exit_code)) == 0); assert(exit_code == 42); - assert(!proc_has_thread_tid(p, target->id)); + assert(!cap_slot_read(p, target->td_cap_slot).valid); free_proc_and_task(p, td); return 0; @@ -1157,6 +1296,53 @@ static i32 selftest_thread_join_efault_preserves_exited_target(void) DEFINE_SELFTEST(thread_join_efault_preserves_exited_target, selftest_thread_join_efault_preserves_exited_target); +static i32 selftest_thread_handle_stale_rejected_after_reuse(void) +{ + struct proc *p; + struct sched_task *td; + assert(alloc_proc_and_task(&p, &td)); + + struct sched_task *old_target = alloc_mock_task(); + assert(old_target); + old_target->proc = p; + old_target->td_join_state = TD_JOIN_JOINABLE; + init_waitqueue_head(&old_target->td_join_wq); + assert(attach_mock_thread(p, old_target)); + u64 stale_token = (u64) syscall_test_thread_token(p, old_target); + + u64 flags = proc_table_lock_irqsave(); + proc_detach_task(p, old_target); + 
proc_table_unlock_irqrestore(flags); + free_mock_task(old_target); + + struct sched_task *new_target = alloc_mock_task(); + assert(new_target); + new_target->proc = p; + new_target->td_join_state = TD_JOIN_JOINABLE; + init_waitqueue_head(&new_target->td_join_wq); + assert(attach_mock_thread(p, new_target)); + u64 fresh_token = (u64) syscall_test_thread_token(p, new_target); + assert(fresh_token != stale_token); + + struct trap_frame tf = {0}; + tf.a7 = SYS_PTHREAD_KILL; + tf.a0 = stale_token; + tf.a1 = 0; + assert(syscall_dispatch(&tf, td) == -(i64) ESRCH); + + tf.a0 = fresh_token; + assert(syscall_dispatch(&tf, td) == 0); + + flags = proc_table_lock_irqsave(); + proc_detach_task(p, new_target); + proc_table_unlock_irqrestore(flags); + free_mock_task(new_target); + free_proc_and_task(p, td); + return 0; +} +DEFINE_SELFTEST(thread_handle_stale_rejected_after_reuse, + selftest_thread_handle_stale_rejected_after_reuse); + /* Verify clock_gettime on the per-thread and per-process CPU-time * clocks returns a valid timespec. Mock task cpu_time_us is zero by * default; the values are still validated for shape (no fault, no @@ -1210,7 +1396,7 @@ static i32 selftest_pthread_kill(void) /* Self-target: signo lands on td->td_sig.pending. */ tf.a7 = SYS_PTHREAD_KILL; - tf.a0 = td->id; + tf.a0 = (u64) syscall_test_thread_token(p, td); tf.a1 = SIGUSR1; assert(syscall_dispatch(&tf, td) == 0); assert((td->td_sig.pending & sig_bit(SIGUSR1)) != 0); @@ -1226,8 +1412,8 @@ static i32 selftest_pthread_kill(void) tf.a1 = SIGKILL; assert(syscall_dispatch(&tf, td) == -(i64) EINVAL); - /* Unknown TID -> ESRCH. */ - tf.a0 = (u64) U16_MAX - 5; + /* Unknown thread handle -> ESRCH. 
*/ + tf.a0 = (u64) -1; tf.a1 = SIGUSR1; assert(syscall_dispatch(&tf, td) == -(i64) ESRCH); @@ -1245,14 +1431,13 @@ static i32 selftest_pthread_kill_exited_thread_esrch(void) struct sched_task *target = alloc_mock_task(); assert(target); target->proc = p; - target->id = td->id + 1; target->td_join_state = TD_JOIN_EXITED; init_waitqueue_head(&target->td_join_wq); assert(attach_mock_thread(p, target)); struct trap_frame tf = {0}; tf.a7 = SYS_PTHREAD_KILL; - tf.a0 = target->id; + tf.a0 = (u64) syscall_test_thread_token(p, target); tf.a1 = SIGUSR1; assert(syscall_dispatch(&tf, td) == -(i64) ESRCH); assert(target->td_sig.pending == 0); @@ -1300,13 +1485,13 @@ static i32 selftest_thread_cancel_state(void) tf.a0 = 99; assert(syscall_dispatch(&tf, td) == -(i64) EINVAL); - /* SYS_THREAD_CANCEL on unknown TID -> ESRCH. */ + /* SYS_THREAD_CANCEL on unknown thread handle -> ESRCH. */ tf.a7 = SYS_THREAD_CANCEL; - tf.a0 = (u64) U16_MAX - 7; + tf.a0 = (u64) -1; assert(syscall_dispatch(&tf, td) == -(i64) ESRCH); - /* SYS_THREAD_CANCEL on self-tid sets td_cancel_pending. */ - tf.a0 = td->id; + /* SYS_THREAD_CANCEL on self handle sets td_cancel_pending. 
*/ + tf.a0 = (u64) syscall_test_thread_token(p, td); assert(syscall_dispatch(&tf, td) == 0); assert(td->td_cancel_pending == true); @@ -1331,14 +1516,13 @@ static i32 selftest_thread_cancel_exited_thread_esrch(void) struct sched_task *target = alloc_mock_task(); assert(target); target->proc = p; - target->id = td->id + 1; target->td_join_state = TD_JOIN_EXITED; init_waitqueue_head(&target->td_join_wq); assert(attach_mock_thread(p, target)); struct trap_frame tf = {0}; tf.a7 = SYS_THREAD_CANCEL; - tf.a0 = target->id; + tf.a0 = (u64) syscall_test_thread_token(p, target); assert(syscall_dispatch(&tf, td) == -(i64) ESRCH); assert(target->td_cancel_pending == false); diff --git a/tests/tests-uaccess.c b/tests/tests-uaccess.c index 0bb1e5a..d5fd1db 100644 --- a/tests/tests-uaccess.c +++ b/tests/tests-uaccess.c @@ -9,7 +9,7 @@ static i32 selftest_uaccess_validation(void) char dst[16] = {0}; const vaddr_t test_page = USER_DATA_BASE + (128UL * PAGE_SIZE); struct proc *p = proc_alloc(); - assert(p != NULL); + assert(p); assert( proc_map_user_page(p, test_page, PT_FLAG_RW | PT_FLAG_USER).is_error == false);