From: Andrei Tatar
Date: Mon, 20 Nov 2023 15:20:25 +0000 (+0100)
Subject: lib/ukfile: Introduce the ukfile library
X-Git-Tag: RELEASE-0.16.0~191
X-Git-Url: http://xenbits.xensource.com/gitweb?a=commitdiff_plain;h=9b327f8efa996359e456958a2f6e43b5f2b6fdee;p=unikraft%2Funikraft.git

lib/ukfile: Introduce the ukfile library

This change introduces the ukfile library to provide a new fundamental
abstraction for "files" in Unikraft. This forms the base for a larger
rearchitecting of vfscore.

Checkpatch-Ignore: ENOSYS
Checkpatch-Ignore: VOLATILE
Signed-off-by: Andrei Tatar
Reviewed-by: Simon Kuenzer
Approved-by: Simon Kuenzer
GitHub-Closes: #1165
---

diff --git a/lib/Makefile.uk b/lib/Makefile.uk
index 319a572a2..383e22b3c 100644
--- a/lib/Makefile.uk
+++ b/lib/Makefile.uk
@@ -35,6 +35,7 @@ $(eval $(call import_lib,$(CONFIG_UK_BASE)/lib/ukcpio))
$(eval $(call import_lib,$(CONFIG_UK_BASE)/lib/ukdebug))
$(eval $(call import_lib,$(CONFIG_UK_BASE)/lib/ukfalloc))
$(eval $(call import_lib,$(CONFIG_UK_BASE)/lib/ukfallocbuddy))
+$(eval $(call import_lib,$(CONFIG_UK_BASE)/lib/ukfile))
$(eval $(call import_lib,$(CONFIG_UK_BASE)/lib/uklibid))
$(eval $(call import_lib,$(CONFIG_UK_BASE)/lib/ukintctlr))
$(eval $(call import_lib,$(CONFIG_UK_BASE)/lib/uklibparam))
diff --git a/lib/ukfile/Config.uk b/lib/ukfile/Config.uk
new file mode 100644
index 000000000..ae294d476
--- /dev/null
+++ b/lib/ukfile/Config.uk
@@ -0,0 +1,10 @@
config LIBUKFILE
	bool "ukfile: Common support for files"
	select LIBUKLOCK
	select LIBUKLOCK_MUTEX
	select LIBUKLOCK_RWLOCK
	select LIBUKSCHED

# Hidden, selected by core components when required
config LIBUKFILE_CHAINUPDATE
	bool
diff --git a/lib/ukfile/Makefile.uk b/lib/ukfile/Makefile.uk
new file mode 100644
index 000000000..f045e9b33
--- /dev/null
+++ b/lib/ukfile/Makefile.uk
@@ -0,0 +1,7 @@
$(eval $(call addlib_s,libukfile,$(CONFIG_LIBUKFILE)))

CINCLUDES-$(CONFIG_LIBUKFILE) += -I$(LIBUKFILE_BASE)/include
CXXINCLUDES-$(CONFIG_LIBUKFILE) += -I$(LIBUKFILE_BASE)/include

LIBUKFILE_SRCS-y += $(LIBUKFILE_BASE)/pollqueue.c
LIBUKFILE_SRCS-y += $(LIBUKFILE_BASE)/file-nops.c
diff --git a/lib/ukfile/README.md b/lib/ukfile/README.md
new file mode 100644
index 000000000..819482960
--- /dev/null
+++ b/lib/ukfile/README.md
@@ -0,0 +1,71 @@
# `ukfile`: files for Unikraft

This core library contains the Unikraft abstractions of a "file" as well as an "open file" (a.k.a. an "open file description").
These are low-level internal abstractions that do not have direct counterparts in any userspace-facing API.
They are not to be confused with "file descriptors" or other similar POSIX-y concepts; please see `posix-fd*` for those.

This README discusses higher-level design considerations for (open) files.
Consult the headers `uk/file.h` and `uk/ofile.h` for specifics on the implementation.

## What is a _file_?

To overuse a classical *NIX idiom, "everything is a file".
More concretely, however, a file is an abstraction for any resource that offers a combination of input, output, and/or control operations.
A file in Unikraft is the combination of an _immutable identity_ and _mutable state_.

Files are represented in Unikraft by the `struct uk_file` type, referenced in APIs as `const struct uk_file *` to enforce immutability.
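
As a rough driver-side sketch of assembling such a file (the `my_*` names are hypothetical; the types, fields, and initializers are those declared in `uk/file.h`, with the stub operations from `uk/file/nops.h`):

```c
#include <uk/file.h>
#include <uk/file/nops.h>

/* Hypothetical driver data: the address of my_volume doubles as the
 * volume identifier; my_node_data stands in for real inode state. */
static int my_volume;
static int my_node_data;

static void my_release(const struct uk_file *f, int what)
{
	/* Free node data and/or the file object, depending on `what` */
	(void)f;
	(void)what;
}

static struct uk_file_state my_state = UK_FILE_STATE_INITIALIZER(my_state);
static uk_file_refcnt my_ref = UK_FILE_REFCNT_INITIALIZER;

static const struct uk_file my_file = {
	.vol = &my_volume,
	.node = &my_node_data,
	.ops = &uk_file_nops, /* stubs; a real driver supplies its own */
	.refcnt = &my_ref,
	.state = &my_state,
	._release = my_release,
};
```

The identity and state bundled above break down as follows.
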
Identity consists of:
- A volume identifier: driver-specific field, used to identify the file type as well as its originating driver instance
- A file node: reference to driver-specific data associated with the file
- A table of file operations: implementations of well-defined file operations (see below)

File state is used for bookkeeping purposes and includes:
- Reference counting (strong & weak references)
- Locks for synchronization
- Event set & queue for polling operations

### File Operations

Files allow for a defined set of operations, some of which are driver-implemented, while others are common across all files.
Driver-specific operations have a well-defined interface and are implemented by file drivers.
These are:
- I/O: manipulating an array of unstructured bytes
  - `read`: retrieve a specific contiguous block of bytes from this array
  - `write`: ensure a specific contiguous block in this array has specific bytes
- Metadata: manipulating a defined structure of metadata related to the file
  - `getstat`: get file metadata fields
  - `setstat`: set file metadata fields
- Control: requests for special operations to be performed by the file
  - `ctl`
- (internal) cleanup/destructor: what happens, if anything, when we no longer need the file

Common operations are implemented centrally for all file objects:
- Reference counting: acquire/release of regular (strong) or weak references
  - Strong references allow the full functionality of files
  - Weak references allow only common operations (polling, locking)
- Event polling & notification:
  - Driver API:
    - Set & clear what event flags are active on the file
  - User API:
    - Check whether specific events are set on a file
    - Wait and be awoken when an event becomes set on a file
- Voluntary kernel-space synchronization mechanisms:
  - Driver operations provide no synchronization or atomicity guarantees themselves in the general case
  - Drivers are free to implement these operations as efficiently as their internal data model allows
  - Higher-level APIs that want to provide atomicity guarantees (e.g. POSIX read vs. write serialization) can and should use these mechanisms to achieve their goal


## What is an _open file_?

Open files are stateful and mutable references to a file that is "in use".
The precise definition of "in use" is intentionally left vague and up to client code.
Open file state consists of:
- Reference count, allowing multiple independent shared references
- Open "mode", a client-defined bitmask of open file options
- Current position for I/O (i.e. what one sets with `lseek()`)
- Lock for synchronizing changes to the above

Open files are represented in Unikraft by the `struct uk_ofile` type.
A single `struct uk_file` may be referenced by an arbitrary number of ofiles, each of which acts independently of the others.

Open files do not expose any operations themselves, instead providing only a base data structure for higher-level abstractions, such as file descriptor tables and POSIX I/O syscalls.
diff --git a/lib/ukfile/file-nops.c b/lib/ukfile/file-nops.c
new file mode 100644
index 000000000..16da21631
--- /dev/null
+++ b/lib/ukfile/file-nops.c
@@ -0,0 +1,52 @@
/* SPDX-License-Identifier: BSD-3-Clause */
/* Copyright (c) 2023, Unikraft GmbH and The Unikraft Authors.
 * Licensed under the BSD-3-Clause License (the "License").
 * You may not use this file except in compliance with the License.
 */

#include <errno.h>

#include <uk/file/nops.h>


ssize_t uk_file_nop_read(const struct uk_file *f __unused,
			 const struct iovec *iov __unused, int iovcnt __unused,
			 off_t off __unused, long flags __unused)
{
	return -ENOSYS;
}

ssize_t uk_file_nop_write(const struct uk_file *f __unused,
			  const struct iovec *iov __unused, int iovcnt __unused,
			  off_t off __unused, long flags __unused)
{
	return -ENOSYS;
}

int uk_file_nop_getstat(const struct uk_file *f __unused,
			unsigned int mask __unused,
			struct uk_statx *arg __unused)
{
	return -ENOSYS;
}

int uk_file_nop_setstat(const struct uk_file *f __unused,
			unsigned int mask __unused,
			const struct uk_statx *arg __unused)
{
	return -ENOSYS;
}

int uk_file_nop_ctl(const struct uk_file *f __unused, int fam __unused,
		    int req __unused, uintptr_t arg1 __unused,
		    uintptr_t arg2 __unused, uintptr_t arg3 __unused)
{
	return -ENOSYS;
}

const struct uk_file_ops uk_file_nops = {
	.read = uk_file_nop_read,
	.write = uk_file_nop_write,
	.getstat = uk_file_nop_getstat,
	.setstat = uk_file_nop_setstat,
	.ctl = uk_file_nop_ctl
};
diff --git a/lib/ukfile/include/uk/file.h b/lib/ukfile/include/uk/file.h
new file mode 100644
index 000000000..064f4f0b3
--- /dev/null
+++ b/lib/ukfile/include/uk/file.h
@@ -0,0 +1,268 @@
/* SPDX-License-Identifier: BSD-3-Clause */
/* Copyright (c) 2023, Unikraft GmbH and The Unikraft Authors.
 * Licensed under the BSD-3-Clause License (the "License").
 * You may not use this file except in compliance with the License.
 */

/* Fundamental abstraction for files in Unikraft. */

#ifndef __UKFILE_FILE_H__
#define __UKFILE_FILE_H__

#include <sys/types.h>
#include <sys/uio.h>

#include <uk/file/pollqueue.h>
#include <uk/file/statx.h>
#include <uk/refcount.h>
#include <uk/rwlock.h>


struct uk_file;

/* File operations, to be provided by drivers */

/* I/O functions are non-blocking & return -EAGAIN when unable to perform.
 * The behavior of concurrent calls to these functions is driver-dependent and
 * no general assumptions can be made about their ordering and/or interleaving.
 * Callers should themselves use the state->iolock (and/or other locks)
 * as appropriate in order to provide the desired concurrency guarantees.
 */

/* I/O */
typedef ssize_t (*uk_file_io_func)(const struct uk_file *f,
				   const struct iovec *iov, int iovcnt,
				   off_t off, long flags);

/* Info (stat-like & chXXX-like) */
typedef int (*uk_file_getstat_func)(const struct uk_file *f,
				    unsigned int mask,
				    struct uk_statx *arg);
typedef int (*uk_file_setstat_func)(const struct uk_file *f,
				    unsigned int mask,
				    const struct uk_statx *arg);

/* Control */
/* Values for the `fam` argument of uk_file_ctl */
#define UKFILE_CTL_FILE 0 /* File controls (sync, allocation, etc.) */
#define UKFILE_CTL_IOCTL 1 /* Linux-compatible ioctl() requests */

/*
 * SYNC((int)all, void, void)
 * Flush modified file data & metadata to storage.
 * If `all` is 0, flush only the minimum required metadata; if 1, flush all
 * metadata.
 */
#define UKFILE_CTL_FILE_SYNC 0

/*
 * TRUNC((off_t)len, void, void)
 * Truncate file to `len` bytes.
 */
#define UKFILE_CTL_FILE_TRUNC 1

/*
 * FALLOC((int)mode, (off_t)offset, (off_t)len)
 * Linux-compatible fallocate operation.
 */
#define UKFILE_CTL_FILE_FALLOC 2

/*
 * FADVISE((off_t)offset, (off_t)len, (int)advice)
 * Linux-compatible fadvise operation.
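 *
 * For example, a caller holding a file `f` whose driver implements this
 * request might issue (sketch; `offset`, `len` and `advice` are
 * caller-supplied values, cast as in the argument list above):
 *
 *   uk_file_ctl(f, UKFILE_CTL_FILE, UKFILE_CTL_FILE_FADVISE,
 *               (uintptr_t)offset, (uintptr_t)len, (uintptr_t)advice);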
 */
#define UKFILE_CTL_FILE_FADVISE 3

typedef int (*uk_file_ctl_func)(const struct uk_file *f, int fam, int req,
				uintptr_t arg1, uintptr_t arg2,
				uintptr_t arg3);

/* Destructor */
/* what - bitwise OR of what to release:
 * UK_FILE_RELEASE_RES - file resources
 * UK_FILE_RELEASE_OBJ - file object
 */
#define UK_FILE_RELEASE_RES UK_SWREFCOUNT_LAST_STRONG
#define UK_FILE_RELEASE_OBJ UK_SWREFCOUNT_LAST_REF

typedef void (*uk_file_release_func)(const struct uk_file *f, int what);

struct uk_file_ops {
	uk_file_io_func read;
	uk_file_io_func write;
	uk_file_getstat_func getstat;
	uk_file_setstat_func setstat;
	uk_file_ctl_func ctl;
};

/* File struct */

struct uk_file_state {
	/* Synchronization for higher-level operations */
	struct uk_rwlock iolock;
	/* Polling & events */
	struct uk_pollq pollq;
	/* Voluntary locks (flock) */
	/* TODO */
};

#define UK_FILE_STATE_INITIALIZER(name) ((struct uk_file_state){ \
	.iolock = UK_RWLOCK_INITIALIZER((name).iolock, 0), \
	.pollq = UK_POLLQ_INITIALIZER((name).pollq) \
})

/*
 * Reference count type used by uk_file.
 *
 * The exact reference count is an implementation detail that we do not wish to
 * expose to consumers. Drivers may, however, need to allocate and initialize
 * this structure; we therefore provide a typedef and initializer.
 */
typedef struct uk_swrefcount uk_file_refcnt;

struct uk_file {
	/* Identity */
	const void *vol; /* Volume instance; needed to check file kind */
	void *node; /* Driver-specific inode data */
	/* Ops table */
	const struct uk_file_ops *ops;
	/* Mutable state (refcounting, poll events & locks) */
	uk_file_refcnt *refcnt;
	struct uk_file_state *state;
	/* Destructor, never call directly */
	uk_file_release_func _release;
};

/* Files always get created with one strong reference held */
#define UK_FILE_REFCNT_INITIALIZER UK_SWREFCOUNT_INITIALIZER(1, 1)

/* Operations inlines */
static inline
ssize_t uk_file_read(const struct uk_file *f,
		     const struct iovec *iov, int iovcnt,
		     off_t off, long flags)
{
	return f->ops->read(f, iov, iovcnt, off, flags);
}

static inline
ssize_t uk_file_write(const struct uk_file *f,
		      const struct iovec *iov, int iovcnt,
		      off_t off, long flags)
{
	return f->ops->write(f, iov, iovcnt, off, flags);
}

static inline
int uk_file_getstat(const struct uk_file *f,
		    unsigned int mask, struct uk_statx *arg)
{
	return f->ops->getstat(f, mask, arg);
}

static inline
int uk_file_setstat(const struct uk_file *f,
		    unsigned int mask, const struct uk_statx *arg)
{
	return f->ops->setstat(f, mask, arg);
}

static inline
int uk_file_ctl(const struct uk_file *f, int fam, int req,
		uintptr_t arg1, uintptr_t arg2, uintptr_t arg3)
{
	return f->ops->ctl(f, fam, req, arg1, arg2, arg3);
}

/* Refcounting & destruction */

static inline
void uk_file_acquire(const struct uk_file *f)
{
	uk_swrefcount_acquire(f->refcnt);
}

static inline
void uk_file_acquire_weak(const struct uk_file *f)
{
	uk_swrefcount_acquire_weak(f->refcnt);
}

static inline
void uk_file_release(const struct uk_file *f)
{
	int r = uk_swrefcount_release(f->refcnt);

	if (r)
		f->_release(f, r);
}

static inline
void uk_file_release_weak(const struct uk_file *f)
{
	int r = uk_swrefcount_release_weak(f->refcnt);

	if (r)
		f->_release(f, r);
}

/* High-level I/O locking */

static inline void uk_file_rlock(const struct uk_file *f)
{
	uk_rwlock_rlock(&f->state->iolock);
}

static inline void
uk_file_runlock(const struct uk_file *f)
{
	uk_rwlock_runlock(&f->state->iolock);
}

static inline void uk_file_wlock(const struct uk_file *f)
{
	uk_rwlock_wlock(&f->state->iolock);
}

static inline void uk_file_wunlock(const struct uk_file *f)
{
	uk_rwlock_wunlock(&f->state->iolock);
}

/* Events & polling */

static inline
uk_pollevent uk_file_poll_immediate(const struct uk_file *f, uk_pollevent req)
{
	return uk_pollq_poll_immediate(&f->state->pollq, req);
}

static inline
uk_pollevent uk_file_poll_until(const struct uk_file *f, uk_pollevent req,
				__nsec deadline)
{
	return uk_pollq_poll_until(&f->state->pollq, req, deadline);
}

static inline
uk_pollevent uk_file_poll(const struct uk_file *f, uk_pollevent req)
{
	return uk_file_poll_until(f, req, 0);
}

static inline
uk_pollevent uk_file_event_clear(const struct uk_file *f, uk_pollevent clr)
{
	return uk_pollq_clear(&f->state->pollq, clr);
}

static inline
uk_pollevent uk_file_event_set(const struct uk_file *f, uk_pollevent set)
{
	return uk_pollq_set(&f->state->pollq, set);
}

static inline
uk_pollevent uk_file_event_assign(const struct uk_file *f, uk_pollevent set)
{
	return uk_pollq_assign(&f->state->pollq, set);
}

#endif /* __UKFILE_FILE_H__ */
diff --git a/lib/ukfile/include/uk/file/nops.h b/lib/ukfile/include/uk/file/nops.h
new file mode 100644
index 000000000..62c50afc3
--- /dev/null
+++ b/lib/ukfile/include/uk/file/nops.h
@@ -0,0 +1,33 @@
/* SPDX-License-Identifier: BSD-3-Clause */
/* Copyright (c) 2023, Unikraft GmbH and The Unikraft Authors.
 * Licensed under the BSD-3-Clause License (the "License").
 * You may not use this file except in compliance with the License.
 */

/* Convenience uk_file stub operations */

#ifndef __UKFILE_FILE_NOPS_H__
#define __UKFILE_FILE_NOPS_H__

#include <uk/file.h>

extern const struct uk_file_ops uk_file_nops;

ssize_t uk_file_nop_read(const struct uk_file *f,
			 const struct iovec *iov, int iovcnt,
			 off_t off, long flags);

ssize_t uk_file_nop_write(const struct uk_file *f,
			  const struct iovec *iov, int iovcnt,
			  off_t off, long flags);

int uk_file_nop_getstat(const struct uk_file *f,
			unsigned int mask, struct uk_statx *arg);

int uk_file_nop_setstat(const struct uk_file *f,
			unsigned int mask, const struct uk_statx *arg);

int uk_file_nop_ctl(const struct uk_file *f, int fam, int req,
		    uintptr_t arg1, uintptr_t arg2, uintptr_t arg3);

#endif /* __UKFILE_FILE_NOPS_H__ */
diff --git a/lib/ukfile/include/uk/file/pollqueue.h b/lib/ukfile/include/uk/file/pollqueue.h
new file mode 100644
index 000000000..1790869b9
--- /dev/null
+++ b/lib/ukfile/include/uk/file/pollqueue.h
@@ -0,0 +1,599 @@
/* SPDX-License-Identifier: BSD-3-Clause */
/* Copyright (c) 2023, Unikraft GmbH and The Unikraft Authors.
 * Licensed under the BSD-3-Clause License (the "License").
 * You may not use this file except in compliance with the License.
 */

/* Multi-event poll/wait queue with update chaining support */

#ifndef __UKFILE_POLLQUEUE_H__
#define __UKFILE_POLLQUEUE_H__

#include <uk/config.h>

#include <uk/arch/atomic.h>
#include <uk/assert.h>
#include <uk/plat/time.h>
#include <uk/rwlock.h>
#include <uk/sched.h>

/*
 * Bitmask of event flags.
 *
 * Should be large enough to accommodate what userspace will use as event flags
 * in the least significant bits, along with Unikraft-internal flags (if any)
 * in the more significant bits.
 */
typedef unsigned int uk_pollevent;

/**
 * Ticket for registering on the poll waiting list.
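 * Tickets are typically allocated on the waiting thread's own stack and
 * linked in for the duration of the wait; see `_pollq_wait` below.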
 *
 * If the newly set events overlap with those in `mask`, wake up `thread`.
 * Tickets are atomically released from the wait queue when waking.
 */
struct uk_poll_ticket {
	struct uk_poll_ticket *next;
	struct uk_thread *thread; /* Thread to wake up */
	uk_pollevent mask; /* Events to register for */
};

#if CONFIG_LIBUKFILE_CHAINUPDATE

/* Update chaining */

enum uk_poll_chain_type {
	UK_POLL_CHAINTYPE_UPDATE,
	UK_POLL_CHAINTYPE_CALLBACK
};

enum uk_poll_chain_op {
	UK_POLL_CHAINOP_CLEAR,
	UK_POLL_CHAINOP_SET
};

struct uk_poll_chain;

/**
 * Update chaining callback function; called on event propagations.
 *
 * @param ev The events that triggered this update.
 * @param op Whether `ev` is being set or cleared.
 * @param tick The update chaining ticket this callback is registered with.
 */
typedef void (*uk_poll_chain_callback_fn)(uk_pollevent ev,
					  enum uk_poll_chain_op op,
					  struct uk_poll_chain *tick);

/**
 * Ticket for registering on the update chaining list.
 *
 * If newly modified events overlap with those in `mask`, perform a chain
 * update of the overlapping bits according to `type`:
 * - UK_POLL_CHAINTYPE_UPDATE: propagate events to `queue`;
 *   if `set` != 0, set/clear the events in `set` instead of the original ones
 * - UK_POLL_CHAINTYPE_CALLBACK: call `callback`
 */
struct uk_poll_chain {
	struct uk_poll_chain *next;
	uk_pollevent mask; /* Events to register for */
	enum uk_poll_chain_type type;
	union {
		struct {
			struct uk_pollq *queue; /* Where to propagate updates */
			uk_pollevent set; /* Events to set */
		};
		struct {
			uk_poll_chain_callback_fn callback;
			void *arg;
		};
	};
};

/* Initializer for a chain ticket that propagates events to another queue */
#define UK_POLL_CHAIN_UPDATE(msk, to, ev) ((struct uk_poll_chain){ \
	.next = NULL, \
	.mask = (msk), \
	.type = UK_POLL_CHAINTYPE_UPDATE, \
	.queue = (to), \
	.set = (ev) \
})

/* Initializer for a chain ticket that calls a custom callback */
#define UK_POLL_CHAIN_CALLBACK(msk, cb, dat) ((struct uk_poll_chain){ \
	.next = NULL, \
	.mask = (msk), \
	.type = UK_POLL_CHAINTYPE_CALLBACK, \
	.callback = (cb), \
	.arg = (dat) \
})

#endif /* CONFIG_LIBUKFILE_CHAINUPDATE */

/* Main queue */
struct uk_pollq {
	/* Notification lists */
	struct uk_poll_ticket *wait; /* Polling threads */
	struct uk_poll_ticket **waitend;
#if CONFIG_LIBUKFILE_CHAINUPDATE
	struct uk_poll_chain *prop; /* Registrations for chained updates */
	struct uk_poll_chain **propend;
#endif /* CONFIG_LIBUKFILE_CHAINUPDATE */

	/* Events */
	volatile uk_pollevent events; /* Instantaneous event levels */
	uk_pollevent waitmask; /* Events waited on by threads */
#if CONFIG_LIBUKFILE_CHAINUPDATE
	uk_pollevent propmask; /* Events registered for chaining */
#endif /* CONFIG_LIBUKFILE_CHAINUPDATE */
	/* Locks & sundry */
#if CONFIG_LIBUKFILE_CHAINUPDATE
	void *_tag; /* Internal use */
	struct uk_rwlock proplock; /* Chained updates list lock */
#endif /* CONFIG_LIBUKFILE_CHAINUPDATE */
	struct uk_rwlock waitlock; /* Wait list lock */
};

#if CONFIG_LIBUKFILE_CHAINUPDATE
#define UK_POLLQ_INITIALIZER(q) \
	((struct uk_pollq){ \
		.wait = NULL, \
		.waitend = &(q).wait, \
		.prop = NULL, \
		.propend = &(q).prop, \
		.events = 0, \
		.waitmask = 0, \
		.propmask = 0, \
		.proplock = UK_RWLOCK_INITIALIZER((q).proplock, 0), \
		.waitlock = UK_RWLOCK_INITIALIZER((q).waitlock, 0), \
	})
#else /* !CONFIG_LIBUKFILE_CHAINUPDATE */
#define UK_POLLQ_INITIALIZER(q) \
	((struct uk_pollq){ \
		.wait = NULL, \
		.waitend = &(q).wait, \
		.events = 0, \
		.waitmask = 0, \
		.waitlock = UK_RWLOCK_INITIALIZER((q).waitlock, 0), \
	})
#endif /* !CONFIG_LIBUKFILE_CHAINUPDATE */

/**
 * Initialize the fields of `q` to a valid empty state.
 */
static inline
void uk_pollq_init(struct uk_pollq *q)
{
	q->wait = NULL;
	q->waitend = &q->wait;
	q->events = 0;
	q->waitmask = 0;
	uk_rwlock_init(&q->waitlock);
#if CONFIG_LIBUKFILE_CHAINUPDATE
	q->prop = NULL;
	q->propend = &q->prop;
	q->propmask = 0;
	uk_rwlock_init(&q->proplock);
#endif /* CONFIG_LIBUKFILE_CHAINUPDATE */
}

/* Polling cancellation */

/**
 * Remove a specific `ticket` from the wait list.
 */
static inline
void uk_pollq_cancel_ticket(struct uk_pollq *q, struct uk_poll_ticket *ticket)
{
	uk_rwlock_wlock(&q->waitlock);
	for (struct uk_poll_ticket **p = &q->wait; *p; p = &(*p)->next)
		if (*p == ticket) {
			*p = ticket->next;
			ticket->next = NULL;
			if (!*p)
				q->waitend = p;
			break;
		}
	uk_rwlock_wunlock(&q->waitlock);
}

/**
 * Remove the ticket of a specific `thread` from the wait list.
 */
static inline
void uk_pollq_cancel_thread(struct uk_pollq *q, struct uk_thread *thread)
{
	uk_rwlock_wlock(&q->waitlock);
	for (struct uk_poll_ticket **p = &q->wait; *p; p = &(*p)->next) {
		struct uk_poll_ticket *t = *p;

		if (t->thread == thread) {
			*p = t->next;
			t->next = NULL;
			if (!*p)
				q->waitend = p;
			break;
		}
	}
	uk_rwlock_wunlock(&q->waitlock);
}

/**
 * Remove the ticket of the current thread from the wait list.
 */
#define uk_pollq_cancel(q) uk_pollq_cancel_thread((q), uk_thread_current())

/* Polling */

/**
 * Poll for the events in `req`; never block, always return immediately.
 *
 * @return
 *	Bitwise AND between `req` and the events set in `q`.
 */
static inline
uk_pollevent uk_pollq_poll_immediate(struct uk_pollq *q, uk_pollevent req)
{
	return q->events & req;
}

/**
 * INTERNAL. Atomically poll & lock if required.
 *
 * @param q Target queue.
 * @param req Events to poll for.
 * @param exp Events expected to be already set.
 *
 * @return
 *	non-zero evmask, with the lock released, if events appeared;
 *	0, with the lock held, otherwise.
 */
static inline
uk_pollevent _pollq_lock(struct uk_pollq *q, uk_pollevent req,
			 uk_pollevent exp)
{
	uk_pollevent ev;

	uk_rwlock_rlock(&q->waitlock);
	/* Check if events were set while acquiring the lock */
	if ((ev = uk_pollq_poll_immediate(q, req) & ~exp))
		uk_rwlock_runlock(&q->waitlock);
	return ev;
}

/**
 * INTERNAL. Wait for events until a timeout.
 *
 * Must be called only after `_pollq_lock` returns 0.
 *
 * @param q Target queue.
 * @param req Events to poll for.
 * @param deadline Deadline as absolute time in nanoseconds, or 0 to wait
 *	forever
 *
 * @return
 *	0 on timeout
 *	non-zero if awoken
 */
static inline
int _pollq_wait(struct uk_pollq *q, uk_pollevent req, __nsec deadline)
{
	struct uk_poll_ticket **tail;
	struct uk_thread *__current;
	struct uk_poll_ticket tick;
	int timeout;

	/* Mark request in waitmask */
	(void)ukarch_or(&q->waitmask, req);

	/* Compete to register */
	__current = uk_thread_current();
	tick = (struct uk_poll_ticket){
		.next = NULL,
		.thread = __current,
		.mask = req,
	};
	tail = ukarch_exchange_n(&q->waitend, &tick.next);
	/* tail is ours alone, safe to link in */
	UK_ASSERT(!*tail); /* Should be a genuine list tail */
	*tail = &tick;

	/* Block until awoken */
	uk_thread_block_until(__current, deadline);
	uk_rwlock_runlock(&q->waitlock);
	uk_sched_yield();
	/* Back awake; check whether we timed out & should try again */
	timeout = deadline && ukplat_monotonic_clock() >= deadline;
	if (timeout)
		uk_pollq_cancel_ticket(q, &tick);
	return !timeout;
}

/**
 * Poll for the events in `req`, blocking until `deadline` or an event is set.
 *
 * @param q Target queue.
 * @param req Events to poll for.
 * @param deadline Deadline as absolute time in nanoseconds, or 0 to wait
 *	forever
 *
 * @return
 *	Bitwise AND between `req` and the events set in `q`, or 0 if timed out
 */
static inline
uk_pollevent uk_pollq_poll_until(struct uk_pollq *q, uk_pollevent req,
				 __nsec deadline)
{
	uk_pollevent ev;

	do {
		if ((ev = uk_pollq_poll_immediate(q, req)))
			return ev;
		if ((ev = _pollq_lock(q, req, 0)))
			return ev;
	} while (_pollq_wait(q, req, deadline));
	return ev;
}

/**
 * Poll for the events in `req`, blocking until an event is set.
 *
 * @param q Target queue.
 * @param req Events to poll for.
 *
 * @return
 *	Bitwise AND between `req` and the events set in `q`
 */
#define uk_pollq_poll(q, req) uk_pollq_poll_until(q, req, 0)

/**
 * Poll for event rising edges in `req`, blocking until `deadline` or an edge.
 *
 * In contrast to a normal poll, this will neither return immediately if
 * events are set, nor report which events were detected.
 * Use `uk_pollq_poll_immediate` to check the currently set events; note,
 * however, that events may have been modified in the meantime, potentially
 * leading to lost edges.
 * To correctly handle these missed edges, use update chaining.
 *
 * @param q Target queue.
 * @param req Events to poll for.
 * @param deadline Deadline as absolute time in nanoseconds, or 0 to wait
 *	forever
 *
 * @return
 *	1 if a rising edge was detected,
 *	0 if timed out
 */
static inline
int uk_pollq_edge_poll_until(struct uk_pollq *q, uk_pollevent req,
			     __nsec deadline)
{
	uk_pollevent level = uk_pollq_poll_immediate(q, req);

	/* Acquire lock & check for new events */
	if (_pollq_lock(q, req, level))
		return 1;
	/* Wait for notification */
	return _pollq_wait(q, req, deadline);
}

/**
 * Poll for event rising edges in `req`, blocking until a rising edge.
 *
 * In contrast to a normal poll, this will neither return immediately if
 * events are set, nor report which events were detected.
 * Use `uk_pollq_poll_immediate` to check the currently set events.
 * To correctly handle missed edges, use update chaining.
 *
 * @param q Target queue.
 * @param req Events to poll for.
 *
 * @return
 *	1 if a rising edge was detected,
 *	0 if timed out
 */
#define uk_pollq_edge_poll(q, req) uk_pollq_edge_poll_until(q, req, 0)


#if CONFIG_LIBUKFILE_CHAINUPDATE
/* Propagation */

/**
 * INTERNAL. Register update chaining ticket.
 *
 * Must be called with appropriate locks held.
 *
 * @param q Target queue.
 * @param tick Update chaining ticket to register.
 */
static inline
void _pollq_register(struct uk_pollq *q, struct uk_poll_chain *tick)
{
	struct uk_poll_chain **tail;

	(void)ukarch_or(&q->propmask, tick->mask);
	tail = ukarch_exchange_n(&q->propend, &tick->next);
	UK_ASSERT(!*tail); /* Should be a genuine list tail */
	*tail = tick;
}

/**
 * Register ticket `tick` for event propagations on `q`.
 *
 * @param q Target queue.
 * @param tick Update chaining ticket to register.
 */
static inline
void uk_pollq_register(struct uk_pollq *q, struct uk_poll_chain *tick)
{
	uk_rwlock_rlock(&q->proplock);
	_pollq_register(q, tick);
	uk_rwlock_runlock(&q->proplock);
}

/**
 * Unregister ticket `tick` from event propagations on `q`.
 *
 * @param q Target queue.
 * @param tick Update chaining ticket to unregister.
 */
static inline
void uk_pollq_unregister(struct uk_pollq *q, struct uk_poll_chain *tick)
{
	uk_rwlock_wlock(&q->proplock);
	for (struct uk_poll_chain **p = &q->prop; *p; p = &(*p)->next)
		if (*p == tick) {
			*p = tick->next;
			tick->next = NULL;
			if (!*p) /* We unlinked the last node */
				q->propend = p;
			break;
		}
	uk_rwlock_wunlock(&q->proplock);
}

/**
 * Update the registration ticket `tick` with values from `ntick` atomically.
 *
 * `ntick` should first be initialized from `tick`, then have its values
 * updated. Supplying a `tick` that is not registered with `q`, or an `ntick`
 * with a `next` field different from the one in `tick`, is undefined behavior.
 *
 * @param q Target queue.
 * @param tick Update chaining ticket to update.
 * @param ntick New values for fields in `tick`.
 */
static inline
void uk_pollq_reregister(struct uk_pollq *q, struct uk_poll_chain *tick,
			 const struct uk_poll_chain *ntick)
{
	UK_ASSERT(tick->next == ntick->next);
	uk_rwlock_rlock(&q->proplock);
	ukarch_or(&q->propmask, ntick->mask);
	*tick = *ntick;
	uk_rwlock_runlock(&q->proplock);
}

/**
 * Poll for events and/or register for propagation on `q`.
 *
 * @param q Target queue.
 * @param tick Update chaining ticket to register, if needed.
 * @param force If 0, will immediately return without registering if any of the
 *	requested events are set. If non-zero, always register.
 *
 * @return
 *	Requested events that are currently active.
 */
static inline
uk_pollevent uk_pollq_poll_register(struct uk_pollq *q,
				    struct uk_poll_chain *tick, int force)
{
	uk_pollevent ev;
	uk_pollevent req = tick->mask;

	if (!force && (ev = uk_pollq_poll_immediate(q, req)))
		return ev;
	/* Might need to register */
	uk_rwlock_rlock(&q->proplock);
	if ((ev = uk_pollq_poll_immediate(q, req)) && !force)
		goto out;
	_pollq_register(q, tick);
out:
	uk_rwlock_runlock(&q->proplock);
	return ev;
}

/**
 * Poll for event rising edges and/or register for propagation on `q`.
 *
 * @param q Target queue.
 * @param tick Update chaining ticket to register, if needed.
 * @param force If 0, will immediately return without registering if any of the
 *	requested event rising edges are detected. If non-zero, always register.
 *
 * @return
 *	Detected rising edges of the requested events.
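 *
 * A sketch of chained-update registration (the mask value is hypothetical);
 * the ticket below propagates matching event updates from `q` into a second
 * queue `otherq` as they happen:
 *
 *   struct uk_poll_chain tick = UK_POLL_CHAIN_UPDATE(0x1, &otherq, 0);
 *   uk_pollevent edges = uk_pollq_edge_poll_register(q, &tick, 0);
 *   /* edges == 0 means no new edges were seen and `tick` is registered */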
 */
static inline
uk_pollevent uk_pollq_edge_poll_register(struct uk_pollq *q,
					 struct uk_poll_chain *tick,
					 int force)
{
	uk_pollevent ev;
	uk_pollevent req = tick->mask;
	uk_pollevent level = uk_pollq_poll_immediate(q, req);

	uk_rwlock_rlock(&q->proplock);
	if ((ev = uk_pollq_poll_immediate(q, req) & ~level) && !force)
		goto out;
	_pollq_register(q, tick);
out:
	uk_rwlock_runlock(&q->proplock);
	return ev;
}
#endif /* CONFIG_LIBUKFILE_CHAINUPDATE */

/* Updating */

/**
 * Update events, clearing those in `clr`.
 *
 * @param q Target queue.
 * @param clr Events to clear.
 *
 * @return
 *	The previous event set.
 */
uk_pollevent uk_pollq_clear(struct uk_pollq *q, uk_pollevent clr);

/**
 * Update events, setting those in `set` and handling notifications.
 *
 * @param q Target queue.
 * @param set Events to set.
 * @param n Maximum number of threads to wake up. If < 0, wake up all threads.
 *	Chained updates have their own defined notification semantics and may
 *	notify more threads than specified in `n`.
 *
 * @return
 *	The previous event set.
 */
uk_pollevent uk_pollq_set_n(struct uk_pollq *q, uk_pollevent set, int n);

/**
 * Replace the events in `q` with `val` and handle notifications.
 *
 * @param q Target queue.
 * @param val New event set.
 * @param n Maximum number of threads to wake up. If < 0, wake up all threads.
 *	Chained updates have their own defined notification semantics and may
 *	notify more threads than specified in `n`.
 *
 * @return
 *	The previous event set.
 */
uk_pollevent uk_pollq_assign_n(struct uk_pollq *q, uk_pollevent val, int n);

#define UK_POLLQ_NOTIFY_ALL -1

/**
 * Update events, setting those in `set` and handling notifications.
 *
 * @param q Target queue.
 * @param set Events to set.
 *
 * @return
 *	The previous event set.
 */
#define uk_pollq_set(q, set) uk_pollq_set_n(q, set, UK_POLLQ_NOTIFY_ALL)

/**
 * Replace the events in `q` with `val` and handle notifications.
 *
 * @param q Target queue.
 * @param val New event set.
 *
 * @return
 *	The previous event set.
 */
#define uk_pollq_assign(q, val) uk_pollq_assign_n(q, val, UK_POLLQ_NOTIFY_ALL)

#endif /* __UKFILE_POLLQUEUE_H__ */
diff --git a/lib/ukfile/include/uk/file/statx.h b/lib/ukfile/include/uk/file/statx.h
new file mode 100644
index 000000000..b8b5fc906
--- /dev/null
+++ b/lib/ukfile/include/uk/file/statx.h
@@ -0,0 +1,80 @@
/* SPDX-License-Identifier: BSD-3-Clause */
/* Copyright (c) 2023, Unikraft GmbH and The Unikraft Authors.
 * Licensed under the BSD-3-Clause License (the "License").
 * You may not use this file except in compliance with the License.
 */

#ifndef __UKFILE_STATX_H__
#define __UKFILE_STATX_H__

#include <stdint.h>

/*
 * Linux-compatible `statx` structure for use in syscalls, along with
 * definitions of bit flags.
 *
 * Layout taken from the statx(2) man page, with padding from musl v1.2.3.
 * Flag values taken from Linux headers v6.5.6 (include/uapi/linux/stat.h).
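 *
 * `stx_mask` declares which fields are valid: callers request fields through
 * the `mask` argument of the getstat/setstat file operations, and drivers
 * report the fields they actually filled in, using the UK_STATX_* bits below.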
 */

struct uk_statx_timestamp {
	int64_t tv_sec;
	uint32_t tv_nsec;
	int32_t _reserved;
};

struct uk_statx {
	uint32_t stx_mask;
	uint32_t stx_blksize;
	uint64_t stx_attributes;
	uint32_t stx_nlink;
	uint32_t stx_uid;
	uint32_t stx_gid;
	uint16_t stx_mode;
	uint16_t _mode_reserved;
	uint64_t stx_ino;
	uint64_t stx_size;
	uint64_t stx_blocks;
	uint64_t stx_attributes_mask;
	struct uk_statx_timestamp stx_atime, stx_btime, stx_ctime, stx_mtime;
	uint32_t stx_rdev_major;
	uint32_t stx_rdev_minor;
	uint32_t stx_dev_major;
	uint32_t stx_dev_minor;
	uint64_t stx_mnt_id;
	uint32_t stx_dio_mem_align;
	uint32_t stx_dio_offset_align;
	uint64_t _spare[12];
};

/* Bits used in stx_mask */
#define UK_STATX_TYPE 0x00000001U /* File type in stx_mode */
#define UK_STATX_MODE 0x00000002U /* File mode (perms) in stx_mode */
#define UK_STATX_NLINK 0x00000004U /* stx_nlink */
#define UK_STATX_UID 0x00000008U /* stx_uid */
#define UK_STATX_GID 0x00000010U /* stx_gid */
#define UK_STATX_ATIME 0x00000020U /* stx_atime */
#define UK_STATX_MTIME 0x00000040U /* stx_mtime */
#define UK_STATX_CTIME 0x00000080U /* stx_ctime */
#define UK_STATX_BTIME 0x00000800U /* stx_btime */
#define UK_STATX_INO 0x00000100U /* stx_ino */
#define UK_STATX_SIZE 0x00000200U /* stx_size */
#define UK_STATX_BLOCKS 0x00000400U /* stx_blocks */
#define UK_STATX_MNT_ID 0x00001000U /* stx_mnt_id */
#define UK_STATX_DIOALIGN 0x00002000U /* stx_dio_*_align */

#define UK_STATX_BASIC_STATS 0x000007ffU /* Fields common with `struct stat` */
#define UK_STATX__RESERVED 0x80000000U /* Reserved bit */

/* Bits used in stx_attributes and stx_attributes_mask */
#define UK_STATX_ATTR_COMPRESSED 0x00000004 /* File is compressed by the fs */
#define UK_STATX_ATTR_IMMUTABLE 0x00000010 /* File is marked immutable */
#define UK_STATX_ATTR_APPEND 0x00000020 /* File is append-only */
#define UK_STATX_ATTR_NODUMP 0x00000040 /* File is not to be dumped */
#define UK_STATX_ATTR_ENCRYPTED 0x00000800 /* Requires key to decrypt in fs */
#define UK_STATX_ATTR_AUTOMOUNT 0x00001000 /* Dir: Automount trigger */
#define UK_STATX_ATTR_MOUNT_ROOT 0x00002000 /* Root of a mount */
#define UK_STATX_ATTR_VERITY 0x00100000 /* Verity protected file */
#define UK_STATX_ATTR_DAX 0x00200000 /* File is currently in DAX state */

#endif /* __UKFILE_STATX_H__ */
diff --git a/lib/ukfile/include/uk/ofile.h b/lib/ukfile/include/uk/ofile.h
new file mode 100644
index 000000000..d552ef699
--- /dev/null
+++ b/lib/ukfile/include/uk/ofile.h
@@ -0,0 +1,31 @@
/* SPDX-License-Identifier: BSD-3-Clause */
/* Copyright (c) 2023, Unikraft GmbH and The Unikraft Authors.
 * Licensed under the BSD-3-Clause License (the "License").
 * You may not use this file except in compliance with the License.
 */

/* Open file description */

#ifndef __UKFILE_OFILE_H__
#define __UKFILE_OFILE_H__

#include <uk/file.h>
#include <uk/mutex.h>
#include <uk/refcount.h>

struct uk_ofile {
	const struct uk_file *file;
	unsigned int mode;
	__atomic refcnt;
	off_t pos;
	struct uk_mutex lock; /* Lock for modifying open file state */
};

static inline
void uk_ofile_init(struct uk_ofile *of)
{
	uk_refcount_init(&of->refcnt, 0);
	uk_mutex_init(&of->lock);
}

#endif /* __UKFILE_OFILE_H__ */
diff --git a/lib/ukfile/pollqueue.c b/lib/ukfile/pollqueue.c
new file mode 100644
index 000000000..5bf28cbe8
--- /dev/null
+++ b/lib/ukfile/pollqueue.c
@@ -0,0 +1,133 @@
/* SPDX-License-Identifier: BSD-3-Clause */
/* Copyright (c) 2023, Unikraft GmbH and The Unikraft Authors.
 * Licensed under the BSD-3-Clause License (the "License").
 * You may not use this file except in compliance with the License.
 */

#include <uk/file/pollqueue.h>

#include <uk/sched.h>

static void pollq_notify_n(struct uk_pollq *q, uk_pollevent set, int n)
{
	uk_rwlock_wlock(&q->waitlock);
	if (q->waitmask & set) {
		/* Walk wait list, wake up & collect */
		uk_pollevent seen = 0;

		for (struct uk_poll_ticket **p = &q->wait; *p; p = &(*p)->next) {
			struct uk_poll_ticket *t = *p;

			if (!n)
				goto done;
			if (t->mask & set) {
				*p = t->next;
				t->next = NULL;
				uk_thread_wake(t->thread);
				n--;
			} else {
				seen |= t->mask;
			}
			if (!*p) {
				/* We just unlinked the last node */
				q->waitend = p;
				break;
			}
		}
		/* Reached end of list, can prune waitmask */
		q->waitmask = seen;
	}
done:
	uk_rwlock_wunlock(&q->waitlock);
}

#if CONFIG_LIBUKFILE_CHAINUPDATE
static void pollq_propagate(struct uk_pollq *q,
			    enum uk_poll_chain_op op, uk_pollevent set)
{
	uk_rwlock_wlock(&q->proplock);
	if (q->propmask & set) {
		uk_pollevent seen;

		/* Tag this queue in case of chaining loops */
		UK_ASSERT(!q->_tag);
		q->_tag = uk_thread_current();
		/* Walk chain list & propagate updates */
		seen = 0;
		for (struct uk_poll_chain **p = &q->prop; *p; p = &(*p)->next) {
			struct uk_poll_chain *t = *p;
			uk_pollevent req = set & t->mask;

			if (req) {
				switch (t->type) {
				case UK_POLL_CHAINTYPE_UPDATE:
				{
					uk_pollevent ev = t->set ? t->set : req;

					switch (op) {
					case UK_POLL_CHAINOP_CLEAR:
						uk_pollq_clear(t->queue, ev);
						break;
					case UK_POLL_CHAINOP_SET:
						uk_pollq_set(t->queue, ev);
						break;
					}
				}
					break;
				case UK_POLL_CHAINTYPE_CALLBACK:
					t->callback(req, op, t);
					break;
				}
			}
			seen |= t->mask;
		}
		q->propmask = seen; /* Prune propmask */
		q->_tag = NULL; /* Clear tag */
	}
	uk_rwlock_wunlock(&q->proplock);
}
#endif /* CONFIG_LIBUKFILE_CHAINUPDATE */

uk_pollevent uk_pollq_clear(struct uk_pollq *q, uk_pollevent clr)
{
	uk_pollevent prev = ukarch_and(&q->events, ~clr);

#if CONFIG_LIBUKFILE_CHAINUPDATE
	pollq_propagate(q, UK_POLL_CHAINOP_CLEAR, clr);
#endif /* CONFIG_LIBUKFILE_CHAINUPDATE */
	return prev;
}

uk_pollevent uk_pollq_set_n(struct uk_pollq *q, uk_pollevent set, int n)
{
	uk_pollevent prev;

	if (!set)
		return 0;

#if CONFIG_LIBUKFILE_CHAINUPDATE
	if (q->_tag == uk_thread_current()) /* Chaining update loop, return */
		return 0;
#endif /* CONFIG_LIBUKFILE_CHAINUPDATE */

	prev = ukarch_or(&q->events, set);
	pollq_notify_n(q, set, n);
#if CONFIG_LIBUKFILE_CHAINUPDATE
	pollq_propagate(q, UK_POLL_CHAINOP_SET, set);
#endif /* CONFIG_LIBUKFILE_CHAINUPDATE */
	return prev;
}

uk_pollevent uk_pollq_assign_n(struct uk_pollq *q, uk_pollevent val, int n)
{
	uk_pollevent prev = ukarch_exchange_n(&q->events, val);
	uk_pollevent set = val & ~prev;

	if (set) {
		pollq_notify_n(q, set, n);
#if CONFIG_LIBUKFILE_CHAINUPDATE
		pollq_propagate(q, UK_POLL_CHAINOP_SET, set);
#endif /* CONFIG_LIBUKFILE_CHAINUPDATE */
	}
	return prev;
}