$(eval $(call import_lib,$(CONFIG_UK_BASE)/lib/ukdebug))
$(eval $(call import_lib,$(CONFIG_UK_BASE)/lib/ukfalloc))
$(eval $(call import_lib,$(CONFIG_UK_BASE)/lib/ukfallocbuddy))
+$(eval $(call import_lib,$(CONFIG_UK_BASE)/lib/ukfile))
$(eval $(call import_lib,$(CONFIG_UK_BASE)/lib/uklibid))
$(eval $(call import_lib,$(CONFIG_UK_BASE)/lib/ukintctlr))
$(eval $(call import_lib,$(CONFIG_UK_BASE)/lib/uklibparam))
--- /dev/null
+config LIBUKFILE
+ bool "ukfile: Common support for files"
+ select LIBUKLOCK
+ select LIBUKLOCK_MUTEX
+ select LIBUKLOCK_RWLOCK
+ select LIBUKSCHED
+
+# Hidden, selected by core components when required
+config LIBUKFILE_CHAINUPDATE
+ bool
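+
+# Example (sketch): a component that needs update chaining would add
+#   select LIBUKFILE_CHAINUPDATE
+# to its own Config.uk entry.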
--- /dev/null
+$(eval $(call addlib_s,libukfile,$(CONFIG_LIBUKFILE)))
+
+CINCLUDES-$(CONFIG_LIBUKFILE) += -I$(LIBUKFILE_BASE)/include
+CXXINCLUDES-$(CONFIG_LIBUKFILE) += -I$(LIBUKFILE_BASE)/include
+
+LIBUKFILE_SRCS-y += $(LIBUKFILE_BASE)/pollqueue.c
+LIBUKFILE_SRCS-y += $(LIBUKFILE_BASE)/file-nops.c
--- /dev/null
+# `ukfile`: files for Unikraft
+
+This core library contains the Unikraft abstractions of a "file" as well as an "open file" (a.k.a. an "open file description").
+These are low-level internal abstractions that do not have direct correspondents in any userspace-facing API.
+Not to be confused with "file descriptors" or other similar POSIX-y concepts; please see `posix-fd*` for those.
+
+This README discusses higher-level design considerations for (open) files.
+Consult the headers `uk/file.h` and `uk/ofile.h` for specifics on implementation.
+
+## What is a _file_?
+
+To overuse a classical *NIX idiom, "everything is a file".
+More concretely, however, a file is an abstraction for any resource that offers a combination of input, output, and/or control operations.
+A file in Unikraft is a combination of an _immutable identity_ coupled with _mutable state_.
+
+Files are represented in Unikraft by the `struct uk_file` type, referenced in APIs as `const struct uk_file *` to enforce immutability.
+Identity consists of:
+- A volume identifier: a driver-specific field used to identify the file type as well as its originating driver instance
+- A file node: reference to driver-specific data associated with the file
+- Table of file operations: implementations of well-defined file operations (see below)
+
+File state is used for bookkeeping purposes and includes:
+- Reference counting (strong & weak references)
+- Locks for synchronization
+- Event set & queue for polling operations
+
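+As an illustration only (names prefixed with `mydrv_` are hypothetical), a driver could assemble these pieces into a statically allocated file that falls back to the stub operations from `uk/file/nops.h`, using the initializers provided by `uk/file.h`:
+
+```c
+/* Illustrative sketch of a driver-defined file; not part of the library API */
+#include <uk/essentials.h>
+#include <uk/file.h>
+#include <uk/file/nops.h>
+
+static const char mydrv_volid[] = "mydrv";	/* identity: volume id */
+
+static uk_file_refcnt mydrv_ref = UK_FILE_REFCNT_INITIALIZER;
+static struct uk_file_state mydrv_state =
+	UK_FILE_STATE_INITIALIZER(mydrv_state);
+
+static void mydrv_release(const struct uk_file *f __unused, int what __unused)
+{
+	/* Statically allocated; nothing to free */
+}
+
+static const struct uk_file mydrv_file = {
+	.vol = mydrv_volid,	/* identity: originating volume/driver */
+	.node = NULL,		/* identity: driver-specific data */
+	.ops = &uk_file_nops,	/* identity: operations table (stubs here) */
+	.refcnt = &mydrv_ref,	/* state: reference counts */
+	.state = &mydrv_state,	/* state: locks & poll queue */
+	._release = mydrv_release
+};
+```
+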
+### File Operations
+
+Files allow for a defined set of operations, some of which are driver-implemented, while others are common across all files.
+Driver-specific operations have a well-defined interface and are implemented by file drivers.
+These are:
+- I/O: manipulating an array of unstructured bytes
+ - `read`: retrieve a specific contiguous block of bytes from this array
+ - `write`: ensure a specific contiguous block in this array has specific bytes
+- Metadata: manipulating a defined structure of metadata related to the file
+ - `getstat`: get file metadata fields
+ - `setstat`: set file metadata fields
+- Control: requests for special operations to be performed by the file
+ - `ctl`
+- (internal) cleanup/destructor: what happens, if anything, when we no longer need the file
+
+Common operations are implemented centrally for all file objects:
+- Reference counting: acquire/release of regular (strong) or weak references
+ - Strong references allow the full functionality of files
+ - Weak references allow only common operations (polling, locking)
+- Event polling & notification:
+ - Driver API:
+ - Set & clear what event flags are active on the file
+ - User API:
+ - Check whether specific events are set on a file
+ - Wait and be awoken when an event becomes set on a file
+- Voluntary kernel-space synchronization mechanisms:
+ - Driver operations provide no synchronization or atomicity guarantees themselves in the general case
+ - Drivers are free to implement these operations as efficiently as their internal data model allows
+ - Higher-level APIs that want to provide atomicity guarantees (e.g. POSIX read vs. write serialization) can and should use these mechanisms to achieve their goal
+
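+For illustration, a consumer could combine these common operations with driver I/O as sketched below; `EVENT_IN` stands for whichever event bit the driver uses to signal readability (event bit values are not defined by this library):
+
+```c
+#include <errno.h>
+#include <uk/file.h>
+
+/* Sketch: read up to `len` bytes from offset 0, blocking until readable */
+static ssize_t read_blocking(const struct uk_file *f, void *buf, size_t len)
+{
+	struct iovec iov = { .iov_base = buf, .iov_len = len };
+	ssize_t r;
+
+	uk_file_acquire(f);	/* strong reference: full functionality */
+	for (;;) {
+		r = uk_file_read(f, &iov, 1, 0, 0);
+		if (r != -EAGAIN)
+			break;
+		/* Block until the driver signals the event, then retry */
+		uk_file_poll(f, EVENT_IN);
+	}
+	uk_file_release(f);
+	return r;
+}
+```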
+
+## What is an _open file_?
+
+Open files are stateful and mutable references to a file that is "in use".
+The precise definition of "in use" is intentionally left vague and up to client code.
+Open file state consists of:
+- Reference count, allowing multiple independent shared references
+- Open "mode", a client-defined bitmask of open file options
+- Current position for I/O (i.e. what one sets with `lseek()`)
+- Lock for synchronizing changes to the above
+
+Open files are represented in Unikraft by the `struct uk_ofile` type.
+A single `struct uk_file` may be referenced by an arbitrary number of ofiles, each of which acts independently from the others.
+
+Open files do not expose any operations themselves, instead providing only a base data structure for higher-level abstractions, such as file descriptor tables and POSIX I/O syscalls.
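+
+As a rough sketch (the real bookkeeping lives in the `posix-fd*` libraries, and `my_open` is hypothetical), a higher layer could wire an open file to a file as shown below, leaving reference-count management of the ofile itself to its owner (e.g. a file descriptor table):
+
+```c
+#include <uk/ofile.h>
+
+static void my_open(struct uk_ofile *of, const struct uk_file *f,
+		    unsigned int mode)
+{
+	uk_file_acquire(f);	/* the ofile keeps the file alive */
+	uk_ofile_init(of);
+	of->file = f;
+	of->mode = mode;
+	of->pos = 0;
+}
+```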
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright (c) 2023, Unikraft GmbH and The Unikraft Authors.
+ * Licensed under the BSD-3-Clause License (the "License").
+ * You may not use this file except in compliance with the License.
+ */
+
+#include <errno.h>
+
+#include <uk/file/nops.h>
+
+
+ssize_t uk_file_nop_read(const struct uk_file *f __unused,
+ const struct iovec *iov __unused, int iovcnt __unused,
+ off_t off __unused, long flags __unused)
+{
+ return -ENOSYS;
+}
+
+ssize_t uk_file_nop_write(const struct uk_file *f __unused,
+ const struct iovec *iov __unused, int iovcnt __unused,
+ off_t off __unused, long flags __unused)
+{
+ return -ENOSYS;
+}
+
+int uk_file_nop_getstat(const struct uk_file *f __unused,
+			unsigned int mask __unused, struct uk_statx *arg __unused)
+{
+ return -ENOSYS;
+}
+
+int uk_file_nop_setstat(const struct uk_file *f __unused,
+			unsigned int mask __unused,
+ const struct uk_statx *arg __unused)
+{
+ return -ENOSYS;
+}
+
+int uk_file_nop_ctl(const struct uk_file *f __unused, int fam __unused,
+ int req __unused, uintptr_t arg1 __unused,
+ uintptr_t arg2 __unused, uintptr_t arg3 __unused)
+{
+ return -ENOSYS;
+}
+
+const struct uk_file_ops uk_file_nops = {
+ .read = uk_file_nop_read,
+ .write = uk_file_nop_write,
+ .getstat = uk_file_nop_getstat,
+ .setstat = uk_file_nop_setstat,
+ .ctl = uk_file_nop_ctl
+};
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright (c) 2023, Unikraft GmbH and The Unikraft Authors.
+ * Licensed under the BSD-3-Clause License (the "License").
+ * You may not use this file except in compliance with the License.
+ */
+
+/* Fundamental abstraction for files in Unikraft. */
+
+#ifndef __UKFILE_FILE_H__
+#define __UKFILE_FILE_H__
+
+#include <fcntl.h>
+#include <sys/uio.h>
+
+#include <uk/rwlock.h>
+#include <uk/weak_refcount.h>
+#include <uk/file/pollqueue.h>
+#include <uk/file/statx.h>
+
+
+struct uk_file;
+
+/* File operations, to be provided by drivers */
+
+/* I/O functions are non-blocking & return -EAGAIN when unable to perform the
+ * requested operation.
+ * The behavior of concurrent calls to these functions is driver-dependent and
+ * no general assumptions can be made about their ordering and/or interleaving.
+ * Callers should themselves use the state->iolock (and/or other locks)
+ * as appropriate in order to provide the desired concurrency guarantees.
+ */
+
+/* I/O */
+typedef ssize_t (*uk_file_io_func)(const struct uk_file *f,
+ const struct iovec *iov, int iovcnt,
+ off_t off, long flags);
+
+/* Info (stat-like & chXXX-like) */
+typedef int (*uk_file_getstat_func)(const struct uk_file *f,
+ unsigned int mask,
+ struct uk_statx *arg);
+typedef int (*uk_file_setstat_func)(const struct uk_file *f,
+ unsigned int mask,
+ const struct uk_statx *arg);
+
+/* Control */
+/* Values for the `fam` argument of file_ctl */
+#define UKFILE_CTL_FILE 0 /* File controls (sync, allocation, etc.) */
+#define UKFILE_CTL_IOCTL 1 /* Linux-compatible ioctl() requests */
+
+/*
+ * SYNC((int)all, void, void)
+ * Flush modified file data & metadata to storage.
+ * If `all` is 0, flush only the minimum of metadata; if 1, flush all metadata.
+ */
+#define UKFILE_CTL_FILE_SYNC 0
+
+/*
+ * TRUNC((off_t)len, void, void)
+ * Truncate file to `len` bytes.
+ */
+#define UKFILE_CTL_FILE_TRUNC 1
+
+/*
+ * FALLOC((int)mode, (off_t)offset, (off_t)len)
+ * Linux-compatible fallocate operation.
+ */
+#define UKFILE_CTL_FILE_FALLOC 2
+
+/*
+ * FADVISE((off_t)offset, (off_t)len, (int)advice)
+ * Linux-compatible fadvise operation.
+ */
+#define UKFILE_CTL_FILE_FADVISE 3
+
+typedef int (*uk_file_ctl_func)(const struct uk_file *f, int fam, int req,
+ uintptr_t arg1, uintptr_t arg2, uintptr_t arg3);
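+
+/*
+ * Example (sketch): truncating a file to `len` bytes through the generic
+ * control entry point (uk_file_ctl(), defined below); `f` and `len` are
+ * assumed to be in scope:
+ *
+ *	int r = uk_file_ctl(f, UKFILE_CTL_FILE, UKFILE_CTL_FILE_TRUNC,
+ *			    (uintptr_t)len, 0, 0);
+ */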
+
+/* Destructor */
+/* what - bitwise OR of what to release:
+ * UK_FILE_RELEASE_RES - file resources
+ * UK_FILE_RELEASE_OBJ - file object
+ */
+#define UK_FILE_RELEASE_RES UK_SWREFCOUNT_LAST_STRONG
+#define UK_FILE_RELEASE_OBJ UK_SWREFCOUNT_LAST_REF
+
+typedef void (*uk_file_release_func)(const struct uk_file *f, int what);
+
+struct uk_file_ops {
+ uk_file_io_func read;
+ uk_file_io_func write;
+ uk_file_getstat_func getstat;
+ uk_file_setstat_func setstat;
+ uk_file_ctl_func ctl;
+};
+
+/* File struct */
+
+struct uk_file_state {
+ /* Synchronization for higher-level operations */
+ struct uk_rwlock iolock;
+ /* Polling & events */
+ struct uk_pollq pollq;
+ /* Voluntary locks (flock) */
+ /* TODO */
+};
+
+#define UK_FILE_STATE_INITIALIZER(name) ((struct uk_file_state){ \
+ .iolock = UK_RWLOCK_INITIALIZER((name).iolock, 0), \
+ .pollq = UK_POLLQ_INITIALIZER((name).pollq) \
+})
+
+/*
+ * Reference count type used by uk_file.
+ *
+ * The exact reference count is an implementation detail that we do not wish to
+ * expose to consumers. Drivers may, however, need to allocate and initialize
+ * this structure; we therefore provide a typedef and initializer.
+ */
+typedef struct uk_swrefcount uk_file_refcnt;
+
+struct uk_file {
+ /* Identity */
+ const void *vol; /* Volume instance; needed to check file kind */
+ void *node; /* Driver-specific inode data */
+ /* Ops table */
+ const struct uk_file_ops *ops;
+ /* Mutable state (refcounting, poll events & locks) */
+ uk_file_refcnt *refcnt;
+ struct uk_file_state *state;
+ /* Destructor, never call directly */
+ uk_file_release_func _release;
+};
+
+/* Files always get created with one strong reference held */
+#define UK_FILE_REFCNT_INITIALIZER UK_SWREFCOUNT_INITIALIZER(1, 1)
+
+/* Operations inlines */
+static inline
+ssize_t uk_file_read(const struct uk_file *f,
+ const struct iovec *iov, int iovcnt,
+ off_t off, long flags)
+{
+ return f->ops->read(f, iov, iovcnt, off, flags);
+}
+
+static inline
+ssize_t uk_file_write(const struct uk_file *f,
+ const struct iovec *iov, int iovcnt,
+ off_t off, long flags)
+{
+ return f->ops->write(f, iov, iovcnt, off, flags);
+}
+
+static inline
+int uk_file_getstat(const struct uk_file *f,
+ unsigned int mask, struct uk_statx *arg)
+{
+ return f->ops->getstat(f, mask, arg);
+}
+
+static inline
+int uk_file_setstat(const struct uk_file *f,
+ unsigned int mask, const struct uk_statx *arg)
+{
+ return f->ops->setstat(f, mask, arg);
+}
+
+static inline
+int uk_file_ctl(const struct uk_file *f, int fam, int req,
+ uintptr_t arg1, uintptr_t arg2, uintptr_t arg3)
+{
+ return f->ops->ctl(f, fam, req, arg1, arg2, arg3);
+}
+
+/* Refcounting & destruction */
+
+static inline
+void uk_file_acquire(const struct uk_file *f)
+{
+ uk_swrefcount_acquire(f->refcnt);
+}
+
+static inline
+void uk_file_acquire_weak(const struct uk_file *f)
+{
+ uk_swrefcount_acquire_weak(f->refcnt);
+}
+
+static inline
+void uk_file_release(const struct uk_file *f)
+{
+ int r = uk_swrefcount_release(f->refcnt);
+
+ if (r)
+ f->_release(f, r);
+}
+
+static inline
+void uk_file_release_weak(const struct uk_file *f)
+{
+ int r = uk_swrefcount_release_weak(f->refcnt);
+
+ if (r)
+ f->_release(f, r);
+}
+
+/* High-level I/O locking */
+
+static inline void uk_file_rlock(const struct uk_file *f)
+{
+ uk_rwlock_rlock(&f->state->iolock);
+}
+
+static inline void uk_file_runlock(const struct uk_file *f)
+{
+ uk_rwlock_runlock(&f->state->iolock);
+}
+
+static inline void uk_file_wlock(const struct uk_file *f)
+{
+ uk_rwlock_wlock(&f->state->iolock);
+}
+
+static inline void uk_file_wunlock(const struct uk_file *f)
+{
+ uk_rwlock_wunlock(&f->state->iolock);
+}
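+
+/*
+ * Example (sketch): a higher layer that wants POSIX-like serialization of
+ * writes (see README) can bracket the driver call with the I/O lock; `f`,
+ * `iov`, `iovcnt`, `off` and `flags` are assumed to be in scope:
+ *
+ *	uk_file_wlock(f);
+ *	ret = uk_file_write(f, iov, iovcnt, off, flags);
+ *	uk_file_wunlock(f);
+ */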
+
+/* Events & polling */
+
+static inline
+uk_pollevent uk_file_poll_immediate(const struct uk_file *f, uk_pollevent req)
+{
+ return uk_pollq_poll_immediate(&f->state->pollq, req);
+}
+
+static inline
+uk_pollevent uk_file_poll_until(const struct uk_file *f, uk_pollevent req,
+ __nsec deadline)
+{
+ return uk_pollq_poll_until(&f->state->pollq, req, deadline);
+}
+
+static inline
+uk_pollevent uk_file_poll(const struct uk_file *f, uk_pollevent req)
+{
+ return uk_file_poll_until(f, req, 0);
+}
+
+static inline
+uk_pollevent uk_file_event_clear(const struct uk_file *f, uk_pollevent clr)
+{
+ return uk_pollq_clear(&f->state->pollq, clr);
+}
+
+static inline
+uk_pollevent uk_file_event_set(const struct uk_file *f, uk_pollevent set)
+{
+ return uk_pollq_set(&f->state->pollq, set);
+}
+
+static inline
+uk_pollevent uk_file_event_assign(const struct uk_file *f, uk_pollevent set)
+{
+ return uk_pollq_assign(&f->state->pollq, set);
+}
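+
+/*
+ * Example (sketch): signalling and waiting for an event; MYDRV_POLLIN is a
+ * hypothetical event bit agreed upon between driver and consumer, since this
+ * library does not define event bit values itself.
+ *
+ * Driver, when data becomes available:
+ *	uk_file_event_set(f, MYDRV_POLLIN);
+ * Consumer, blocking until data is available:
+ *	uk_file_poll(f, MYDRV_POLLIN);
+ */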
+
+#endif /* __UKFILE_FILE_H__ */
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright (c) 2023, Unikraft GmbH and The Unikraft Authors.
+ * Licensed under the BSD-3-Clause License (the "License").
+ * You may not use this file except in compliance with the License.
+ */
+
+/* Convenience uk_file stub operations */
+
+#ifndef __UKFILE_FILE_NOPS_H__
+#define __UKFILE_FILE_NOPS_H__
+
+#include <uk/file.h>
+
+extern const struct uk_file_ops uk_file_nops;
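+
+/*
+ * Example (sketch): a hypothetical read-only driver can reuse the stubs for
+ * the operations it does not implement:
+ *
+ *	const struct uk_file_ops mydrv_ops = {
+ *		.read = mydrv_read,
+ *		.write = uk_file_nop_write,
+ *		.getstat = uk_file_nop_getstat,
+ *		.setstat = uk_file_nop_setstat,
+ *		.ctl = uk_file_nop_ctl
+ *	};
+ */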
+
+ssize_t uk_file_nop_read(const struct uk_file *f,
+ const struct iovec *iov, int iovcnt,
+ off_t off, long flags);
+
+ssize_t uk_file_nop_write(const struct uk_file *f,
+ const struct iovec *iov, int iovcnt,
+ off_t off, long flags);
+
+int uk_file_nop_getstat(const struct uk_file *f,
+ unsigned int mask, struct uk_statx *arg);
+
+int uk_file_nop_setstat(const struct uk_file *f,
+ unsigned int mask, const struct uk_statx *arg);
+
+int uk_file_nop_ctl(const struct uk_file *f, int fam, int req,
+ uintptr_t arg1, uintptr_t arg2, uintptr_t arg3);
+
+#endif /* __UKFILE_FILE_NOPS_H__ */
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright (c) 2023, Unikraft GmbH and The Unikraft Authors.
+ * Licensed under the BSD-3-Clause License (the "License").
+ * You may not use this file except in compliance with the License.
+ */
+
+/* Multi-event poll/wait queue with update chaining support */
+
+#ifndef __UKFILE_POLLQUEUE_H__
+#define __UKFILE_POLLQUEUE_H__
+
+#include <uk/config.h>
+
+#include <uk/assert.h>
+#include <uk/arch/atomic.h>
+#include <uk/rwlock.h>
+#include <uk/plat/time.h>
+#include <uk/thread.h>
+
+/*
+ * Bitmask of event flags.
+ *
+ * Should be large enough to accommodate what userspace will use as event flags
+ * in the least significant bits, along with Unikraft-internal flags (if any)
+ * in the more significant bits.
+ */
+typedef unsigned int uk_pollevent;
+
+/**
+ * Ticket for registering on the poll waiting list.
+ *
+ * If the newly set events overlap with those in `mask`, wake up `thread`.
+ * Tickets are atomically released from the wait queue when waking.
+ */
+struct uk_poll_ticket {
+ struct uk_poll_ticket *next;
+ struct uk_thread *thread; /* Thread to wake up */
+ uk_pollevent mask; /* Events to register for */
+};
+
+#if CONFIG_LIBUKFILE_CHAINUPDATE
+
+/* Update chaining */
+
+enum uk_poll_chain_type {
+ UK_POLL_CHAINTYPE_UPDATE,
+ UK_POLL_CHAINTYPE_CALLBACK
+};
+
+enum uk_poll_chain_op {
+ UK_POLL_CHAINOP_CLEAR,
+ UK_POLL_CHAINOP_SET
+};
+
+struct uk_poll_chain;
+
+/**
+ * Update chaining callback function; called on event propagations.
+ *
+ * @param ev The events that triggered this update.
+ * @param op Whether the events in `ev` are being set or cleared.
+ * @param tick The update chaining ticket this callback is registered with.
+ */
+typedef void (*uk_poll_chain_callback_fn)(uk_pollevent ev,
+ enum uk_poll_chain_op op,
+ struct uk_poll_chain *tick);
+
+/**
+ * Ticket for registering on the update chaining list.
+ *
+ * If newly modified events overlap with those in `mask`, perform a chain update
+ * of these overlapping bits according to `type`:
+ * - UK_POLL_CHAINTYPE_UPDATE: propagate events to `queue`.
+ *   If `set` != 0, set/clear the events in `set` instead of the original ones
+ * - UK_POLL_CHAINTYPE_CALLBACK: call `callback`
+ */
+struct uk_poll_chain {
+ struct uk_poll_chain *next;
+ uk_pollevent mask; /* Events to register for */
+ enum uk_poll_chain_type type;
+ union {
+ struct {
+ struct uk_pollq *queue; /* Where to propagate updates */
+ uk_pollevent set; /* Events to set */
+ };
+ struct {
+ uk_poll_chain_callback_fn callback;
+ void *arg;
+ };
+ };
+};
+
+/* Initializer for a chain ticket that propagates events to another queue */
+#define UK_POLL_CHAIN_UPDATE(msk, to, ev) ((struct uk_poll_chain){ \
+ .next = NULL, \
+ .mask = (msk), \
+ .type = UK_POLL_CHAINTYPE_UPDATE, \
+ .queue = (to), \
+ .set = (ev) \
+})
+
+/* Initializer for a chain ticket that calls a custom callback */
+#define UK_POLL_CHAIN_CALLBACK(msk, cb, dat) ((struct uk_poll_chain){ \
+ .next = NULL, \
+ .mask = (msk), \
+ .type = UK_POLL_CHAINTYPE_CALLBACK, \
+ .callback = (cb), \
+ .arg = (dat) \
+})
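+
+/*
+ * Example (sketch): mirroring events from one queue into another, e.g. for an
+ * epoll-like aggregator; EV_IN is a hypothetical event bit and `src_q`/`agg_q`
+ * are assumed queues. Registration uses uk_pollq_register(), declared below:
+ *
+ *	struct uk_poll_chain tick = UK_POLL_CHAIN_UPDATE(EV_IN, &agg_q, 0);
+ *
+ *	uk_pollq_register(&src_q, &tick);
+ *
+ * From then on, setting or clearing EV_IN on `src_q` propagates onto `agg_q`
+ * (with `set` == 0 the original event bits are forwarded).
+ */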
+
+#endif /* CONFIG_LIBUKFILE_CHAINUPDATE */
+
+/* Main queue */
+struct uk_pollq {
+ /* Notification lists */
+ struct uk_poll_ticket *wait; /* Polling threads */
+ struct uk_poll_ticket **waitend;
+#if CONFIG_LIBUKFILE_CHAINUPDATE
+ struct uk_poll_chain *prop; /* Registrations for chained updates */
+ struct uk_poll_chain **propend;
+#endif /* CONFIG_LIBUKFILE_CHAINUPDATE */
+
+ /* Events */
+ volatile uk_pollevent events; /* Instantaneous event levels */
+ uk_pollevent waitmask; /* Events waited on by threads */
+#if CONFIG_LIBUKFILE_CHAINUPDATE
+ uk_pollevent propmask; /* Events registered for chaining */
+#endif /* CONFIG_LIBUKFILE_CHAINUPDATE */
+ /* Locks & sundry */
+#if CONFIG_LIBUKFILE_CHAINUPDATE
+ void *_tag; /* Internal use */
+ struct uk_rwlock proplock; /* Chained updates list lock */
+#endif /* CONFIG_LIBUKFILE_CHAINUPDATE */
+ struct uk_rwlock waitlock; /* Wait list lock */
+};
+
+#if CONFIG_LIBUKFILE_CHAINUPDATE
+#define UK_POLLQ_INITIALIZER(q) \
+ ((struct uk_pollq){ \
+ .wait = NULL, \
+ .waitend = &(q).wait, \
+ .prop = NULL, \
+ .propend = &(q).prop, \
+ .events = 0, \
+ .waitmask = 0, \
+ .propmask = 0, \
+ .proplock = UK_RWLOCK_INITIALIZER((q).proplock, 0), \
+ .waitlock = UK_RWLOCK_INITIALIZER((q).waitlock, 0), \
+ })
+#else /* !CONFIG_LIBUKFILE_CHAINUPDATE */
+#define UK_POLLQ_INITIALIZER(q) \
+ ((struct uk_pollq){ \
+ .wait = NULL, \
+ .waitend = &(q).wait, \
+ .events = 0, \
+ .waitmask = 0, \
+ .waitlock = UK_RWLOCK_INITIALIZER((q).waitlock, 0), \
+ })
+#endif /* !CONFIG_LIBUKFILE_CHAINUPDATE */
+
+/**
+ * Initialize the fields of `q` to a valid empty state.
+ */
+static inline
+void uk_pollq_init(struct uk_pollq *q)
+{
+ q->wait = NULL;
+ q->waitend = &q->wait;
+ q->events = 0;
+ q->waitmask = 0;
+ uk_rwlock_init(&q->waitlock);
+#if CONFIG_LIBUKFILE_CHAINUPDATE
+ q->prop = NULL;
+ q->propend = &q->prop;
+ q->propmask = 0;
+ uk_rwlock_init(&q->proplock);
+#endif /* CONFIG_LIBUKFILE_CHAINUPDATE */
+}
+
+/* Polling cancellation */
+
+/**
+ * Remove a specific `ticket` from the wait list.
+ */
+static inline
+void uk_pollq_cancel_ticket(struct uk_pollq *q, struct uk_poll_ticket *ticket)
+{
+ uk_rwlock_wlock(&q->waitlock);
+ for (struct uk_poll_ticket **p = &q->wait; *p; p = &(*p)->next)
+ if (*p == ticket) {
+ *p = ticket->next;
+ ticket->next = NULL;
+ if (!*p)
+ q->waitend = p;
+ break;
+ }
+ uk_rwlock_wunlock(&q->waitlock);
+}
+
+/**
+ * Remove the ticket of a specific `thread` from the wait list.
+ */
+static inline
+void uk_pollq_cancel_thread(struct uk_pollq *q, struct uk_thread *thread)
+{
+ uk_rwlock_wlock(&q->waitlock);
+ for (struct uk_poll_ticket **p = &q->wait; *p; p = &(*p)->next) {
+ struct uk_poll_ticket *t = *p;
+
+ if (t->thread == thread) {
+ *p = t->next;
+ t->next = NULL;
+ if (!*p)
+ q->waitend = p;
+ break;
+ }
+ }
+ uk_rwlock_wunlock(&q->waitlock);
+}
+
+/**
+ * Remove the ticket of the current thread from the wait list.
+ */
+#define uk_pollq_cancel(q) uk_pollq_cancel_thread((q), uk_thread_current())
+
+/* Polling */
+
+/**
+ * Poll for the events in `req`; never block, always return immediately.
+ *
+ * @return
+ * Bitwise AND between `req` and the events set in `q`.
+ */
+static inline
+uk_pollevent uk_pollq_poll_immediate(struct uk_pollq *q, uk_pollevent req)
+{
+ return q->events & req;
+}
+
+/**
+ * INTERNAL. Atomically poll & lock if required.
+ *
+ * @param q Target queue.
+ * @param req Events to poll for.
+ * @param exp Events expected to be already set.
+ *
+ * @return
+ *  A non-zero event mask, with the lock released, if events appeared;
+ *  0, with the lock held, otherwise.
+ */
+static inline
+uk_pollevent _pollq_lock(struct uk_pollq *q, uk_pollevent req,
+ uk_pollevent exp)
+{
+ uk_pollevent ev;
+
+ uk_rwlock_rlock(&q->waitlock);
+ /* Check if events were set while acquiring the lock */
+ if ((ev = uk_pollq_poll_immediate(q, req) & ~exp))
+ uk_rwlock_runlock(&q->waitlock);
+ return ev;
+}
+
+/**
+ * INTERNAL. Wait for events until a timeout.
+ *
+ * Must be called only after `_pollq_lock` returns 0.
+ *
+ * @param q Target queue.
+ * @param req Events to poll for.
+ * @param deadline Monotonic deadline in nanoseconds, or 0 to wait forever
+ *
+ * @return
+ * 0 on timeout
+ * non-zero if awoken
+ */
+static inline
+int _pollq_wait(struct uk_pollq *q, uk_pollevent req, __nsec deadline)
+{
+ struct uk_poll_ticket **tail;
+ struct uk_thread *__current;
+ struct uk_poll_ticket tick;
+ int timeout;
+
+ /* Mark request in waitmask */
+ (void)ukarch_or(&q->waitmask, req);
+ /* Compete to register */
+
+ __current = uk_thread_current();
+ tick = (struct uk_poll_ticket){
+ .next = NULL,
+ .thread = __current,
+ .mask = req,
+ };
+ tail = ukarch_exchange_n(&q->waitend, &tick.next);
+ /* tail is ours alone, safe to link in */
+ UK_ASSERT(!*tail); /* Should be a genuine list tail */
+ *tail = &tick;
+
+ /* Block until awoken */
+ uk_thread_block_until(__current, deadline);
+ uk_rwlock_runlock(&q->waitlock);
+ uk_sched_yield();
+ /* Back, wake up, check if timed out & try again */
+ timeout = deadline && ukplat_monotonic_clock() >= deadline;
+ if (timeout)
+ uk_pollq_cancel_ticket(q, &tick);
+ return !timeout;
+}
+
+/**
+ * Poll for the events in `req`, blocking until `deadline` or an event is set.
+ *
+ * @param q Target queue.
+ * @param req Events to poll for.
+ * @param deadline Monotonic deadline in nanoseconds, or 0 to wait forever
+ *
+ * @return
+ * Bitwise AND between `req` and the events set in `q`, or 0 if timed out
+ */
+static inline
+uk_pollevent uk_pollq_poll_until(struct uk_pollq *q, uk_pollevent req,
+ __nsec deadline)
+{
+ uk_pollevent ev;
+
+ do {
+ if ((ev = uk_pollq_poll_immediate(q, req)))
+ return ev;
+ if ((ev = _pollq_lock(q, req, 0)))
+ return ev;
+ } while (_pollq_wait(q, req, deadline));
+ return ev;
+}
+
+/**
+ * Poll for the events in `req`, blocking until an event is set.
+ *
+ * @param q Target queue.
+ * @param req Events to poll for.
+ *
+ * @return
+ * Bitwise AND between `req` and the events set in `q`
+ */
+#define uk_pollq_poll(q, req) uk_pollq_poll_until(q, req, 0)
+
+/**
+ * Poll for event rising edges in `req`, blocking until `deadline` or an edge.
+ *
+ * In contrast to a normal poll, this does not return immediately if events
+ * are already set, nor does it report which events were detected.
+ * Use `uk_pollq_poll_immediate` to check the currently set events; note that
+ * events may have been modified in the meantime, potentially leading to lost
+ * edges. To correctly handle such missed edges, use update chaining.
+ *
+ * @param q Target queue.
+ * @param req Events to poll for.
+ * @param deadline Monotonic deadline in nanoseconds, or 0 to wait forever
+ *
+ * @return
+ * 1 if a rising edge was detected,
+ * 0 if timed out
+ */
+static inline
+int uk_pollq_edge_poll_until(struct uk_pollq *q, uk_pollevent req,
+ __nsec deadline)
+{
+ uk_pollevent level = uk_pollq_poll_immediate(q, req);
+
+ /* Acquire lock & check for new events */
+ if (_pollq_lock(q, req, level))
+ return 1;
+ /* Wait for notification */
+ return _pollq_wait(q, req, deadline);
+}
+
+/**
+ * Poll for event rising edges in `req`, blocking until a rising edge.
+ *
+ * In contrast to a normal poll, this does not return immediately if events
+ * are already set, nor does it report which events were detected.
+ * Use `uk_pollq_poll_immediate` to check the currently set events.
+ * To correctly handle missed edges, use update chaining.
+ *
+ * @param q Target queue.
+ * @param req Events to poll for.
+ *
+ * @return
+ * 1 if a rising edge was detected,
+ * 0 if timed out
+ */
+#define uk_pollq_edge_poll(q, req) uk_pollq_edge_poll_until(q, req, 0)
+
+
+#if CONFIG_LIBUKFILE_CHAINUPDATE
+/* Propagation */
+
+/**
+ * INTERNAL. Register update chaining ticket.
+ *
+ * Must be called with appropriate locks held
+ *
+ * @param q Target queue.
+ * @param tick Update chaining ticket to register.
+ */
+static inline
+void _pollq_register(struct uk_pollq *q, struct uk_poll_chain *tick)
+{
+ struct uk_poll_chain **tail;
+
+ (void)ukarch_or(&q->propmask, tick->mask);
+ tail = ukarch_exchange_n(&q->propend, &tick->next);
+ UK_ASSERT(!*tail); /* Should be genuine list tail */
+ *tail = tick;
+}
+
+/**
+ * Register ticket `tick` for event propagations on `q`.
+ *
+ * @param q Target queue.
+ * @param tick Update chaining ticket to register.
+ */
+static inline
+void uk_pollq_register(struct uk_pollq *q, struct uk_poll_chain *tick)
+{
+ uk_rwlock_rlock(&q->proplock);
+ _pollq_register(q, tick);
+ uk_rwlock_runlock(&q->proplock);
+}
+
+/**
+ * Unregister ticket `tick` from event propagations on `q`.
+ *
+ * @param q Target queue.
+ * @param tick Update chaining ticket to unregister.
+ */
+static inline
+void uk_pollq_unregister(struct uk_pollq *q, struct uk_poll_chain *tick)
+{
+ uk_rwlock_wlock(&q->proplock);
+ for (struct uk_poll_chain **p = &q->prop; *p; p = &(*p)->next)
+ if (*p == tick) {
+ *p = tick->next;
+ tick->next = NULL;
+ if (!*p) /* We unlinked last node */
+ q->propend = p;
+ break;
+ }
+ uk_rwlock_wunlock(&q->proplock);
+}
+
+/**
+ * Update the registration ticket `tick` with values from `ntick` atomically.
+ *
+ * `ntick` should first be initialized from `tick`, then have values updated.
+ * Supplying a `tick` that is not registered with `q` or `ntick` with a `next`
+ * field different from the one in `tick` is undefined behavior.
+ *
+ * @param q Target queue.
+ * @param tick Update chaining ticket to update.
+ * @param ntick New values for fields in `tick`.
+ */
+static inline
+void uk_pollq_reregister(struct uk_pollq *q, struct uk_poll_chain *tick,
+ const struct uk_poll_chain *ntick)
+{
+ UK_ASSERT(tick->next == ntick->next);
+ uk_rwlock_rlock(&q->proplock);
+ ukarch_or(&q->propmask, ntick->mask);
+ *tick = *ntick;
+ uk_rwlock_runlock(&q->proplock);
+}
+
+/**
+ * Poll for events and/or register for propagation on `q`.
+ *
+ * @param q Target queue.
+ * @param tick Update chaining ticket to register, if needed.
+ * @param force If 0, will immediately return without registering if any of the
+ * requested events are set. If non-zero, always register.
+ *
+ * @return
+ * Requested events that are currently active.
+ */
+static inline
+uk_pollevent uk_pollq_poll_register(struct uk_pollq *q,
+ struct uk_poll_chain *tick, int force)
+{
+ uk_pollevent ev;
+ uk_pollevent req = tick->mask;
+
+ if (!force && (ev = uk_pollq_poll_immediate(q, req)))
+ return ev;
+ /* Might need to register */
+ uk_rwlock_rlock(&q->proplock);
+ if ((ev = uk_pollq_poll_immediate(q, req)) && !force)
+ goto out;
+ _pollq_register(q, tick);
+out:
+ uk_rwlock_runlock(&q->proplock);
+ return ev;
+}
+
+/**
+ * Poll for event rising edges and/or register for propagation on `q`.
+ *
+ * @param q Target queue.
+ * @param tick Update chaining ticket to register, if needed.
+ * @param force If 0, will immediately return without registering if any of the
+ * requested event rising edges are detected. If non-zero, always register.
+ *
+ * @return
+ * Detected rising edges of requested events.
+ */
+static inline
+uk_pollevent uk_pollq_edge_poll_register(struct uk_pollq *q,
+ struct uk_poll_chain *tick,
+ int force)
+{
+ uk_pollevent ev;
+ uk_pollevent req = tick->mask;
+ uk_pollevent level = uk_pollq_poll_immediate(q, req);
+
+ uk_rwlock_rlock(&q->proplock);
+ if ((ev = uk_pollq_poll_immediate(q, req) & ~level) && !force)
+ goto out;
+ _pollq_register(q, tick);
+out:
+ uk_rwlock_runlock(&q->proplock);
+ return ev;
+}
+#endif /* CONFIG_LIBUKFILE_CHAINUPDATE */
+
+/* Updating */
+
+/**
+ * Update events, clearing those in `clr`.
+ *
+ * @param q Target queue.
+ * @param clr Events to clear.
+ *
+ * @return
+ * The previous event set.
+ */
+uk_pollevent uk_pollq_clear(struct uk_pollq *q, uk_pollevent clr);
+
+/**
+ * Update events, setting those in `set` and handling notifications.
+ *
+ * @param q Target queue.
+ * @param set Events to set.
+ * @param n Maximum number of threads to wake up. If < 0 wake up all threads.
+ * Chained updates have their own defined notification semantics and may
+ * notify more threads than specified in `n`.
+ *
+ * @return
+ * The previous event set.
+ */
+uk_pollevent uk_pollq_set_n(struct uk_pollq *q, uk_pollevent set, int n);
+
+/**
+ * Replace the events in `q` with `val` and handle notifications.
+ *
+ * @param q Target queue.
+ * @param val New event set.
+ * @param n Maximum number of threads to wake up. If < 0 wake up all threads.
+ * Chained updates have their own defined notification semantics and may
+ * notify more threads than specified in `n`
+ *
+ * @return
+ * The previous event set.
+ */
+uk_pollevent uk_pollq_assign_n(struct uk_pollq *q, uk_pollevent val, int n);
+
+#define UK_POLLQ_NOTIFY_ALL -1
+
+/**
+ * Update events, setting those in `set` and handling notifications.
+ *
+ * @param q Target queue.
+ * @param set Events to set.
+ *
+ * @return
+ * The previous event set.
+ */
+#define uk_pollq_set(q, s) uk_pollq_set_n(q, s, UK_POLLQ_NOTIFY_ALL)
+
+/**
+ * Replace the events in `q` with `val` and handle notifications.
+ *
+ * @param q Target queue.
+ * @param val New event set.
+ *
+ * @return
+ * The previous event set.
+ */
+#define uk_pollq_assign(q, s) uk_pollq_assign_n(q, s, UK_POLLQ_NOTIFY_ALL)
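+
+/*
+ * Example (sketch): a producer/consumer pair on a shared queue, with EV_DATA
+ * as a hypothetical event bit:
+ *
+ *	Producer (new data available):	uk_pollq_set(&q, EV_DATA);
+ *	Consumer (block until data):	uk_pollq_poll(&q, EV_DATA);
+ *	Consumer (queue drained):	uk_pollq_clear(&q, EV_DATA);
+ */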
+
+#endif /* __UKFILE_POLLQUEUE_H__ */
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright (c) 2023, Unikraft GmbH and The Unikraft Authors.
+ * Licensed under the BSD-3-Clause License (the "License").
+ * You may not use this file except in compliance with the License.
+ */
+
+#ifndef __UKFILE_STATX_H__
+#define __UKFILE_STATX_H__
+
+#include <stdint.h>
+
+/*
+ * Linux-compatible `statx` structure for use in syscalls, along with
+ * definitions of bit flags.
+ *
+ * Layout taken from the statx(2) man page, with padding from musl v1.2.3.
+ * Flag values taken from Linux headers v6.5.6 (include/uapi/linux/stat.h).
+ */
+
+struct uk_statx_timestamp {
+ int64_t tv_sec;
+ uint32_t tv_nsec;
+ int32_t _reserved;
+};
+
+struct uk_statx {
+ uint32_t stx_mask;
+ uint32_t stx_blksize;
+ uint64_t stx_attributes;
+ uint32_t stx_nlink;
+ uint32_t stx_uid;
+ uint32_t stx_gid;
+ uint16_t stx_mode;
+ uint16_t _mode_reserved;
+ uint64_t stx_ino;
+ uint64_t stx_size;
+ uint64_t stx_blocks;
+ uint64_t stx_attributes_mask;
+ struct uk_statx_timestamp stx_atime, stx_btime, stx_ctime, stx_mtime;
+ uint32_t stx_rdev_major;
+ uint32_t stx_rdev_minor;
+ uint32_t stx_dev_major;
+ uint32_t stx_dev_minor;
+ uint64_t stx_mnt_id;
+ uint32_t stx_dio_mem_align;
+ uint32_t stx_dio_offset_align;
+ uint64_t _spare[12];
+};
+
+/* Bits used in stx_mask */
+#define UK_STATX_TYPE 0x00000001U /* File type in stx_mode */
+#define UK_STATX_MODE 0x00000002U /* File mode (perms) in stx_mode */
+#define UK_STATX_NLINK 0x00000004U /* stx_nlink */
+#define UK_STATX_UID 0x00000008U /* stx_uid */
+#define UK_STATX_GID 0x00000010U /* stx_gid */
+#define UK_STATX_ATIME 0x00000020U /* stx_atime */
+#define UK_STATX_MTIME 0x00000040U /* stx_mtime */
+#define UK_STATX_CTIME 0x00000080U /* stx_ctime */
+#define UK_STATX_BTIME 0x00000800U /* stx_btime */
+#define UK_STATX_INO 0x00000100U /* stx_ino */
+#define UK_STATX_SIZE 0x00000200U /* stx_size */
+#define UK_STATX_BLOCKS 0x00000400U /* stx_blocks */
+#define UK_STATX_MNT_ID 0x00001000U /* stx_mnt_id */
+#define UK_STATX_DIOALIGN 0x00002000U /* stx_dio_*_align */
+
+#define UK_STATX_BASIC_STATS 0x000007ffU /* Fields common with `struct stat` */
+#define UK_STATX__RESERVED 0x80000000U /* Reserved bit */
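+
+/*
+ * Example (sketch): after a successful getstat request, only fields whose bit
+ * is present in the returned stx_mask were filled in and may be read:
+ *
+ *	if (sx.stx_mask & UK_STATX_SIZE)
+ *		size = sx.stx_size;
+ */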
+
+/* Bits used in stx_attributes and stx_attributes_mask */
+#define UK_STATX_ATTR_COMPRESSED 0x00000004 /* File is compressed by the fs */
+#define UK_STATX_ATTR_IMMUTABLE 0x00000010 /* File is marked immutable */
+#define UK_STATX_ATTR_APPEND 0x00000020 /* File is append-only */
+#define UK_STATX_ATTR_NODUMP 0x00000040 /* File is not to be dumped */
+#define UK_STATX_ATTR_ENCRYPTED 0x00000800 /* Requires key to decrypt in fs */
+#define UK_STATX_ATTR_AUTOMOUNT 0x00001000 /* Dir: Automount trigger */
+#define UK_STATX_ATTR_MOUNT_ROOT 0x00002000 /* Root of a mount */
+#define UK_STATX_ATTR_VERITY 0x00100000 /* Verity protected file */
+#define UK_STATX_ATTR_DAX 0x00200000 /* File is currently in DAX state */
+
+#endif /* __UKFILE_STATX_H__ */
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright (c) 2023, Unikraft GmbH and The Unikraft Authors.
+ * Licensed under the BSD-3-Clause License (the "License").
+ * You may not use this file except in compliance with the License.
+ */
+
+/* Open file description */
+
+#ifndef __UKFILE_OFILE_H__
+#define __UKFILE_OFILE_H__
+
+#include <uk/essentials.h>
+#include <uk/file.h>
+#include <uk/mutex.h>
+#include <uk/refcount.h>
+
+struct uk_ofile {
+ const struct uk_file *file;
+ unsigned int mode;
+ __atomic refcnt;
+ off_t pos;
+ struct uk_mutex lock; /* Lock for modifying open file state */
+};
+
+static inline
+void uk_ofile_init(struct uk_ofile *of)
+{
+ uk_refcount_init(&of->refcnt, 0);
+ uk_mutex_init(&of->lock);
+}
+
+#endif /* __UKFILE_OFILE_H__ */
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright (c) 2023, Unikraft GmbH and The Unikraft Authors.
+ * Licensed under the BSD-3-Clause License (the "License").
+ * You may not use this file except in compliance with the License.
+ */
+
+#include <uk/file/pollqueue.h>
+
+#include <uk/assert.h>
+
+static void pollq_notify_n(struct uk_pollq *q, uk_pollevent set, int n)
+{
+ uk_rwlock_wlock(&q->waitlock);
+ if (q->waitmask & set) {
+ /* Walk wait list, wake up & collect */
+		uk_pollevent seen = 0;
+		struct uk_poll_ticket **p = &q->wait;
+
+		while (*p) {
+			struct uk_poll_ticket *t = *p;
+
+			if (!n)
+				goto done;
+			if (t->mask & set) {
+				/* Unlink & wake; *p already points at the
+				 * next ticket, so do not advance p
+				 */
+				*p = t->next;
+				t->next = NULL;
+				uk_thread_wake(t->thread);
+				n--;
+				if (!*p) {
+					/* We just unlinked the last node */
+					q->waitend = p;
+					break;
+				}
+			} else {
+				seen |= t->mask;
+				p = &t->next;
+			}
+		}
+ /* Reached end of list, can prune waitmask */
+ q->waitmask = seen;
+ }
+done:
+ uk_rwlock_wunlock(&q->waitlock);
+}
+
+#if CONFIG_LIBUKFILE_CHAINUPDATE
+static void pollq_propagate(struct uk_pollq *q,
+ enum uk_poll_chain_op op, uk_pollevent set)
+{
+ uk_rwlock_wlock(&q->proplock);
+ if (q->propmask & set) {
+ uk_pollevent seen;
+
+ /* Tag this queue in case of chaining loops */
+ UK_ASSERT(!q->_tag);
+ q->_tag = uk_thread_current();
+ /* Walk chain list & propagate updates */
+ seen = 0;
+ for (struct uk_poll_chain **p = &q->prop; *p; p = &(*p)->next) {
+ struct uk_poll_chain *t = *p;
+ uk_pollevent req = set & t->mask;
+
+ if (req) {
+ switch (t->type) {
+ case UK_POLL_CHAINTYPE_UPDATE:
+ {
+ uk_pollevent ev = t->set ? t->set : req;
+
+ switch (op) {
+ case UK_POLL_CHAINOP_CLEAR:
+ uk_pollq_clear(t->queue, ev);
+ break;
+ case UK_POLL_CHAINOP_SET:
+ uk_pollq_set(t->queue, ev);
+ break;
+ }
+ }
+ break;
+ case UK_POLL_CHAINTYPE_CALLBACK:
+ t->callback(req, op, t);
+ break;
+ }
+ }
+ seen |= t->mask;
+ }
+ q->propmask = seen; /* Prune propmask */
+ q->_tag = NULL; /* Clear tag */
+ }
+ uk_rwlock_wunlock(&q->proplock);
+}
+#endif /* CONFIG_LIBUKFILE_CHAINUPDATE */
+
+uk_pollevent uk_pollq_clear(struct uk_pollq *q, uk_pollevent clr)
+{
+ uk_pollevent prev = ukarch_and(&q->events, ~clr);
+
+#if CONFIG_LIBUKFILE_CHAINUPDATE
+ pollq_propagate(q, UK_POLL_CHAINOP_CLEAR, clr);
+#endif /* CONFIG_LIBUKFILE_CHAINUPDATE */
+ return prev;
+}
+
+uk_pollevent uk_pollq_set_n(struct uk_pollq *q, uk_pollevent set, int n)
+{
+ uk_pollevent prev;
+
+ if (!set)
+ return 0;
+
+#if CONFIG_LIBUKFILE_CHAINUPDATE
+ if (q->_tag == uk_thread_current()) /* Chaining update loop, return */
+ return 0;
+#endif /* CONFIG_LIBUKFILE_CHAINUPDATE */
+
+ prev = ukarch_or(&q->events, set);
+ pollq_notify_n(q, set, n);
+#if CONFIG_LIBUKFILE_CHAINUPDATE
+ pollq_propagate(q, UK_POLL_CHAINOP_SET, set);
+#endif /* CONFIG_LIBUKFILE_CHAINUPDATE */
+ return prev;
+}
+
+uk_pollevent uk_pollq_assign_n(struct uk_pollq *q, uk_pollevent val, int n)
+{
+ uk_pollevent prev = ukarch_exchange_n(&q->events, val);
+ uk_pollevent set = val & ~prev;
+
+ if (set) {
+ pollq_notify_n(q, set, n);
+#if CONFIG_LIBUKFILE_CHAINUPDATE
+ pollq_propagate(q, UK_POLL_CHAINOP_SET, set);
+#endif /* CONFIG_LIBUKFILE_CHAINUPDATE */
+ }
+ return prev;
+}