From: Andrei Tatar Date: Tue, 21 Nov 2023 19:22:23 +0000 (+0100) Subject: lib/posix-fdtab: Introduce fdtab library X-Git-Tag: RELEASE-0.16.0~190 X-Git-Url: http://xenbits.xensource.com/gitweb?a=commitdiff_plain;h=9009bacf0d6312850a29de6a2e560667bc52fc92;p=unikraft%2Funikraft.git lib/posix-fdtab: Introduce fdtab library This change introduces the posix-fdtab library, tasked with managing the file descriptor table, mapping integers to open file descriptions. posix-fdtab exposes functionality through Unikraft-internal APIs and is independent from the file descriptor table implementation in vfscore, which it aims to replace. Checkpatch-Ignore: VOLATILE Checkpatch-Ignore: LINE_SPACING Signed-off-by: Andrei Tatar Reviewed-by: Simon Kuenzer Approved-by: Simon Kuenzer GitHub-Closes: #1168 --- diff --git a/lib/Makefile.uk b/lib/Makefile.uk index 383e22b3c..693cca485 100644 --- a/lib/Makefile.uk +++ b/lib/Makefile.uk @@ -11,6 +11,7 @@ $(eval $(call import_lib,$(CONFIG_UK_BASE)/lib/isrlib)) $(eval $(call import_lib,$(CONFIG_UK_BASE)/lib/nolibc)) $(eval $(call import_lib,$(CONFIG_UK_BASE)/lib/posix-environ)) $(eval $(call import_lib,$(CONFIG_UK_BASE)/lib/posix-event)) +$(eval $(call import_lib,$(CONFIG_UK_BASE)/lib/posix-fdtab)) $(eval $(call import_lib,$(CONFIG_UK_BASE)/lib/posix-libdl)) $(eval $(call import_lib,$(CONFIG_UK_BASE)/lib/posix-mmap)) $(eval $(call import_lib,$(CONFIG_UK_BASE)/lib/posix-process)) diff --git a/lib/posix-fdtab/Config.uk b/lib/posix-fdtab/Config.uk new file mode 100644 index 000000000..5e7a37279 --- /dev/null +++ b/lib/posix-fdtab/Config.uk @@ -0,0 +1,10 @@ +menuconfig LIBPOSIX_FDTAB + bool "posix-fdtab: File descriptor table" + select LIBUKFILE + +if LIBPOSIX_FDTAB + config LIBPOSIX_FDTAB_MAXFDS + int "Maximum number of file descriptors" + default 1024 + +endif diff --git a/lib/posix-fdtab/Makefile.uk b/lib/posix-fdtab/Makefile.uk new file mode 100644 index 000000000..80ef19bc9 --- /dev/null +++ b/lib/posix-fdtab/Makefile.uk @@ -0,0 +1,6 @@ +$(eval 
$(call addlib_s,libposix_fdtab,$(CONFIG_LIBPOSIX_FDTAB))) + +CINCLUDES-$(CONFIG_LIBPOSIX_FDTAB) += -I$(LIBPOSIX_FDTAB_BASE)/include +CXXINCLUDES-$(CONFIG_LIBPOSIX_FDTAB) += -I$(LIBPOSIX_FDTAB_BASE)/include + +LIBPOSIX_FDTAB_SRCS-y += $(LIBPOSIX_FDTAB_BASE)/fdtab.c diff --git a/lib/posix-fdtab/fdtab.c b/lib/posix-fdtab/fdtab.c new file mode 100644 index 000000000..8a2c15b3a --- /dev/null +++ b/lib/posix-fdtab/fdtab.c @@ -0,0 +1,347 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright (c) 2023, Unikraft GmbH and The Unikraft Authors. + * Licensed under the BSD-3-Clause License (the "License"). + * You may not use this file except in compliance with the License. + */ + +#include +#include + +#include +#include +#include +#include + +#include + +#include "fmap.h" + + +#define UK_FDTAB_SIZE CONFIG_LIBPOSIX_FDTAB_MAXFDS +UK_CTASSERT(UK_FDTAB_SIZE <= UK_FD_MAX); + +/* Static init fdtab; the bitmap is declared as unsigned long words so that it + * is suitably aligned for the word accesses done through struct uk_bmap + */ + +static unsigned long init_bmap[UK_BITS_TO_LONGS(UK_FDTAB_SIZE)]; +static void *init_fdmap[UK_FDTAB_SIZE]; + +/* A file descriptor table: the fd -> entry map and the allocator used for its open file descriptions */ +struct uk_fdtab { + struct uk_alloc *alloc; + struct uk_fmap fmap; +}; + +static struct uk_fdtab init_fdtab = { + .fmap = { + .bmap = { + .size = UK_FDTAB_SIZE, + .bitmap = init_bmap + }, + .map = init_fdmap + } +}; + +static int init_posix_fdtab(void) +{ + init_fdtab.alloc = uk_alloc_get_default(); + /* Consider skipping init for .map (static vars are inited to 0) */ + uk_fmap_init(&init_fdtab.fmap); + return 0; +} + +/* Init fdtab as early as possible, to enable functions that rely on fds */ +uk_early_initcall_prio(init_posix_fdtab, UK_PRIO_EARLIEST); + +/* TODO: Adapt when multiple processes are supported */ +static inline struct uk_fdtab *_active_tab(void) +{ + return &init_fdtab; +} + +/* Encode flags in entry pointer using the least significant bits */ +/* made available by the open file structure's alignment */ +struct fdval { + void *p; + int flags; +}; + +#define UK_FDTAB_CLOEXEC 1 + +#define _MAX_FLAG 1 + +#define _FLAG_MASK (((uintptr_t)_MAX_FLAG << 1) - 
1) + +UK_CTASSERT(__alignof__(struct uk_ofile) > _MAX_FLAG); + +static inline const void *fdtab_encode(const void *f, int flags) +{ + UK_ASSERT(!((uintptr_t)f & _FLAG_MASK)); + return (const void *)((uintptr_t)f | flags); +} + +static inline struct fdval fdtab_decode(void *p) +{ + uintptr_t v = (uintptr_t)p; + + return (struct fdval) { + .p = (void *)(v & ~_FLAG_MASK), + .flags = v & _FLAG_MASK + }; +} + +/* struct uk_ofile allocation & refcounting: ofile_new/ofile_del use the table's allocator; ofile_rel releases the file and frees the ofile once uk_refcount_release() returns non-zero (presumably on the last reference) */ +static inline struct uk_ofile *ofile_new(struct uk_fdtab *tab) +{ + struct uk_ofile *of = uk_malloc(tab->alloc, sizeof(*of)); + + if (of) + uk_ofile_init(of); + return of; +} +static inline void ofile_del(struct uk_fdtab *tab, struct uk_ofile *of) +{ + uk_free(tab->alloc, of); +} + +static inline void ofile_acq(struct uk_ofile *of) +{ + uk_refcount_acquire(&of->refcnt); +} +static inline void ofile_rel(struct uk_fdtab *tab, struct uk_ofile *of) +{ + if (uk_refcount_release(&of->refcnt)) { + uk_file_release(of->file); + ofile_del(tab, of); + } +} + +#define file_acq(p, f) ofile_acq((struct uk_ofile *)(p)) +#define file_rel(t, p, f) ofile_rel((t), (struct uk_ofile *)(p)) + +/* Ops */ + +int uk_fdtab_open(const struct uk_file *f, unsigned int mode) +{ + struct uk_fdtab *tab; + struct uk_ofile *of; + int flags; + const void *entry; + int fd; + + UK_ASSERT(f); + + tab = _active_tab(); + of = ofile_new(tab); + if (!of) + return -ENOMEM; + /* Take refs on file & ofile */ + uk_file_acquire(f); + ofile_acq(of); + /* Prepare open file; O_CLOEXEC is kept out of of->mode and stored in the table entry instead */ + of->file = f; + of->pos = 0; + of->mode = mode & ~O_CLOEXEC; + /* Place the file in fdtab */ + flags = (mode & O_CLOEXEC) ? 
UK_FDTAB_CLOEXEC : 0; + entry = fdtab_encode(of, flags); + fd = uk_fmap_put(&tab->fmap, entry, 0); + if (fd >= UK_FDTAB_SIZE) + goto err_out; + return fd; +err_out: + /* Table full: release open file & file ref */ + ofile_rel(tab, of); + return -ENFILE; +} + +int uk_fdtab_setflags(int fd, int flags) +{ + struct uk_fdtab *tab; + struct uk_fmap *fmap; + void *p; + struct fdval v; + const void *newp; + + if (flags & ~O_CLOEXEC) /* O_CLOEXEC is the only supported flag */ + return -EINVAL; + + tab = _active_tab(); + fmap = &tab->fmap; + + p = uk_fmap_critical_take(fmap, fd); + if (!p) + return -EBADF; + v = fdtab_decode(p); + v.flags &= ~UK_FDTAB_CLOEXEC; + v.flags |= flags ? UK_FDTAB_CLOEXEC : 0; + + newp = fdtab_encode(v.p, v.flags); + uk_fmap_critical_put(fmap, fd, newp); + return 0; +} + +int uk_fdtab_getflags(int fd) +{ + struct uk_fdtab *tab = _active_tab(); + void *p = uk_fmap_lookup(&tab->fmap, fd); + struct fdval v; + int ret; + + if (!p) + return -EBADF; + + v = fdtab_decode(p); + ret = 0; + if (v.flags & UK_FDTAB_CLOEXEC) + ret |= O_CLOEXEC; + return ret; +} + + +static struct fdval _fdtab_get(struct uk_fdtab *tab, int fd) +{ + struct fdval ret = { NULL, 0 }; + + if (fd >= 0) { + /* Need to refcount atomically => critical take & put: while the entry is taken out, a concurrent uk_fmap_take() retries instead of returning it */ + struct uk_fmap *fmap = &tab->fmap; + void *p = uk_fmap_critical_take(fmap, fd); + + if (p) { + ret = fdtab_decode(p); + file_acq(ret.p, ret.flags); + uk_fmap_critical_put(fmap, fd, p); + } + } + return ret; +} + +struct uk_ofile *uk_fdtab_get(int fd) +{ + struct uk_fdtab *tab = _active_tab(); + struct fdval v = _fdtab_get(tab, fd); + + return (struct uk_ofile *)v.p; +} + +void uk_fdtab_ret(struct uk_ofile *of) +{ + UK_ASSERT(of); + ofile_rel(_active_tab(), of); +} + +void uk_fdtab_cloexec(void) +{ + struct uk_fdtab *tab = _active_tab(); + struct uk_fmap *fmap = &tab->fmap; + + for (int i = 0; i < UK_FDTAB_SIZE; i++) { + void *p = uk_fmap_lookup(fmap, i); + + if (p) { + struct fdval v = fdtab_decode(p); + + if (v.flags & UK_FDTAB_CLOEXEC) { + void *pp = uk_fmap_take(fmap, i); + + 
UK_ASSERT(p == pp); + file_rel(tab, v.p, v.flags); + } + } + } +} + +/* Internal Syscalls */ + +/* Close `fd`: take its entry out of the table and drop the table's reference */ +int uk_sys_close(int fd) +{ + struct uk_fdtab *tab; + void *p; + struct fdval v; + + tab = _active_tab(); + p = uk_fmap_take(&tab->fmap, fd); + if (!p) + return -EBADF; + v = fdtab_decode(p); + file_rel(tab, v.p, v.flags); + return 0; +} + +/* Make `newfd` refer to the open file of `oldfd`, releasing any entry previously at `newfd` */ +int uk_sys_dup3(int oldfd, int newfd, int flags) +{ + int r __maybe_unused; + struct uk_fdtab *tab; + struct fdval dup; + void *prevp; + const void *newent; + + if (oldfd == newfd) + return -EINVAL; + if (oldfd < 0 || oldfd >= UK_FDTAB_SIZE || + newfd < 0 || newfd >= UK_FDTAB_SIZE) + return -EBADF; + if (flags & ~O_CLOEXEC) + return -EINVAL; + + tab = _active_tab(); + dup = _fdtab_get(tab, oldfd); + if (!dup.p) + return -EBADF; /* oldfd not open */ + dup.flags &= ~UK_FDTAB_CLOEXEC; + dup.flags |= flags ? UK_FDTAB_CLOEXEC : 0; + + prevp = NULL; + newent = fdtab_encode(dup.p, dup.flags); + r = uk_fmap_xchg(&tab->fmap, newfd, newent, &prevp); + UK_ASSERT(!r); /* newfd should be in range */ + if (prevp) { + struct fdval prevv = fdtab_decode(prevp); + + file_rel(tab, prevv.p, prevv.flags); + } + return newfd; +} + +/* Like dup3 without flags, except that oldfd == newfd only checks that oldfd is open */ +int uk_sys_dup2(int oldfd, int newfd) +{ + if (oldfd != newfd) + return uk_sys_dup3(oldfd, newfd, 0); + + if (!uk_fmap_lookup(&(_active_tab())->fmap, oldfd)) + return -EBADF; + return newfd; +} + +int uk_sys_dup_min(int oldfd, int min, int flags) +{ + struct uk_fdtab *tab; + struct fdval dup; + const void *newent; + int fd; + + if (oldfd < 0) + return -EBADF; + if (flags & ~O_CLOEXEC) + return -EINVAL; + + tab = _active_tab(); + dup = _fdtab_get(tab, oldfd); + if (!dup.p) + return -EBADF; + dup.flags &= ~UK_FDTAB_CLOEXEC; + dup.flags |= flags ? 
UK_FDTAB_CLOEXEC : 0; + + newent = fdtab_encode(dup.p, dup.flags); + fd = uk_fmap_put(&tab->fmap, newent, min); + if (fd >= UK_FDTAB_SIZE) { /* no free fd at or above min: drop the ref taken by _fdtab_get */ + file_rel(tab, dup.p, dup.flags); + return -ENFILE; + } + return fd; +} + +int uk_sys_dup(int oldfd) +{ + return uk_sys_dup_min(oldfd, 0, 0); +} diff --git a/lib/posix-fdtab/fmap.h b/lib/posix-fdtab/fmap.h new file mode 100644 index 000000000..147ef196b --- /dev/null +++ b/lib/posix-fdtab/fmap.h @@ -0,0 +1,304 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright (c) 2023, Unikraft GmbH and The Unikraft Authors. + * Licensed under the BSD-3-Clause License (the "License"). + * You may not use this file except in compliance with the License. + */ + +/* Lock-free data structure specialized in mapping integers to pointers */ + +#ifndef __UK_FDTAB_FMAP_H__ +#define __UK_FDTAB_FMAP_H__ + +#include + +#include +#include +#include +#include +#include +#include + +/** + * Lock-free bitmap, with ones representing a free index. + */ +struct uk_bmap { + volatile unsigned long *bitmap; + size_t size; +}; + +#define UK_BMAP_SZ(s) (UK_BITS_TO_LONGS(s) * sizeof(unsigned long)) + + +/** + * Initialize the bitmap; must not be called concurrently with other functions. + */ +static inline void uk_bmap_init(const struct uk_bmap *bm) +{ + uk_bitmap_fill((void *)bm->bitmap, bm->size); +} + +/** + * Mark `idx` as used, and return whether we were the ones to do so. + * + * @return + * 0 if we marked `idx` as used, non-zero if it was already used or out of range + */ +static inline int uk_bmap_reserve(const struct uk_bmap *bm, int idx) +{ + unsigned long mask; + unsigned long v; + + if (!IN_RANGE(idx, 0, bm->size)) + return -1; + mask = UK_BIT_MASK(idx); + v = ukarch_and(&bm->bitmap[UK_BIT_WORD(idx)], ~mask); + return !(v & mask); +} + +/** + * Mark `idx` as free, and return whether we were the ones to do so. 
+ * + * @return + * 0 if we freed `idx`, non-zero if it was already free or out of range + */ +static inline int uk_bmap_free(const struct uk_bmap *bm, int idx) +{ + unsigned long mask; + unsigned long v; + + if (!IN_RANGE(idx, 0, bm->size)) + return -1; + mask = UK_BIT_MASK(idx); + v = ukarch_or(&bm->bitmap[UK_BIT_WORD(idx)], mask); + return !!(v & mask); +} + +static inline int uk_bmap_isfree(const struct uk_bmap *bm, int idx) +{ + if (!IN_RANGE(idx, 0, bm->size)) + return -1; /* out of range; note this is also non-zero */ + return uk_test_bit(idx, bm->bitmap); +} + +/** + * Allocate and return the smallest free index not smaller than `min`. + * + * @return + * The allocated index, or `bm->size` if no index at or above `min` is free. + */ +static inline int uk_bmap_request(const struct uk_bmap *bm, int min) +{ + int pos; + + do { + /* Seems safe to cast away volatility, revisit if problem */ + pos = uk_find_next_bit((unsigned long *)bm->bitmap, + bm->size, min); + if (pos >= bm->size) + return bm->size; + /* If bit was already cleared, we lost the race, retry */ + } while (uk_bmap_reserve(bm, pos)); + + return pos; +} + +/** + * Mapping of integers to pointers. + */ +struct uk_fmap { + struct uk_bmap bmap; + void *volatile *map; +}; + +#define UK_FMAP_SZ(s) ((s) * sizeof(void *)) + + +#define _FMAP_INRANGE(m, i) IN_RANGE(i, 0, (m)->bmap.size) + +/** + * Initialize the memory for a uk_fmap. + * + * The `size` field must be correctly set and the map buffers allocated. + */ +static inline void uk_fmap_init(const struct uk_fmap *m) +{ + memset((void *)m->map, 0, m->bmap.size * sizeof(void *)); + uk_bmap_init(&m->bmap); +} + +/** + * Lookup the entry at `idx`. + * + * WARNING: Use of this function is vulnerable to use-after-free race conditions + * for entry objects whose lifetime depends on their membership in this data + * structure (e.g., refcounting on put, take, and after lookups). + * For that case please use the `uk_fmap_critical_*` functions. + * + * @return + * The entry at `idx`, or NULL if `idx` is free or out of range. 
+ */ +static inline void *uk_fmap_lookup(const struct uk_fmap *m, int idx) +{ + void *got; + + if (!_FMAP_INRANGE(m, idx)) + return NULL; + + do { + got = m->map[idx]; + if (!got) { + if (uk_bmap_isfree(&m->bmap, idx)) + break; /* Entry is actually free */ + uk_sched_yield(); /* Lost race, retry */ + } + } while (!got); + return got; +} + +/** + * Put an entry in the map at the smallest free index not below `min` and + * return its index. + * + * @return + * newly allocated index, or out of range if map full + */ +static inline +int uk_fmap_put(const struct uk_fmap *m, const void *p, int min) +{ + void *got; + int pos; + + pos = uk_bmap_request(&m->bmap, min); + if (!_FMAP_INRANGE(m, pos)) + return pos; /* Map full */ + + got = ukarch_exchange_n(&m->map[pos], (void *)p); + UK_ASSERT(got == NULL); /* There can't be stuff in there, abort */ + + return pos; +} + +/** + * Take the entry at `idx` out of the map and return it. + * + * @return + * Previous entry, or NULL if empty or out of range + */ +static inline void *uk_fmap_take(const struct uk_fmap *m, int idx) +{ + int v __maybe_unused; + void *got; + + if (!_FMAP_INRANGE(m, idx)) + return NULL; + + do { + if (uk_bmap_isfree(&m->bmap, idx)) + return NULL; /* Already free */ + + /* At most one take thread gets the previous non-NULL value */ + got = ukarch_exchange_n(&m->map[idx], NULL); + if (!got) + /* We lost the race with a (critical) take, retry */ + uk_sched_yield(); + } while (!got); + + /* We are that one thread; nobody else can set the bitmap */ + v = uk_bmap_free(&m->bmap, idx); + UK_ASSERT(!v); + return got; +} + +/** + * Take out an entry, without marking its `idx` as free. + * + * Calling this function is akin to taking a lock on `idx` which should be + * released as soon as practical by a matching call to `uk_fmap_critical_put`. + * Take care to not block in between these two calls. + * + * @return + * The entry at `idx`, or NULL if not present or out of range. 
+ */ +static inline +void *uk_fmap_critical_take(const struct uk_fmap *m, int idx) +{ + void *got; + + if (!_FMAP_INRANGE(m, idx)) + return NULL; + do { + got = ukarch_exchange_n(&m->map[idx], NULL); + if (!got) { + if (uk_bmap_isfree(&m->bmap, idx)) + /* idx is actually empty */ + break; + /* Lost race with (critical) take, retry */ + uk_sched_yield(); + } + } while (!got); + return got; +} + +/** + * Place an entry at `idx`, following a call to `uk_fmap_critical_take`. + * + * Calling on an `idx` without a matching call to `uk_fmap_critical_take` + * is undefined. The value of `p` need not match the value previously taken out. + * + * @return + * 0 on success, non-zero if `idx` out of range + */ +static inline +int uk_fmap_critical_put(const struct uk_fmap *m, int idx, const void *p) +{ + void *got; + + if (!_FMAP_INRANGE(m, idx)) + return -1; + + (void)uk_bmap_reserve(&m->bmap, idx); + got = ukarch_exchange_n(&m->map[idx], p); + UK_ASSERT(got == NULL); + return 0; +} + +/** + * Atomically exchange the entry at `idx` with `p`, returning previous in `prev`. + * + * If `idx` is free, it is marked as used and `*prev` is set to NULL. 
+ * + * @return + * 0 on success, non-zero if `idx` out of range + */ +static inline +int uk_fmap_xchg(const struct uk_fmap *m, int idx, + const void *p, void **prev) +{ + void *got; + + if (!_FMAP_INRANGE(m, idx)) + return -1; + + /* Exchanging entries directly is problematic, must use take & put */ + for (;;) { + int r = uk_bmap_reserve(&m->bmap, idx); + + if (r) { + /* There was already something there */ + got = uk_fmap_critical_take(m, idx); + if (got) { + (void)uk_fmap_critical_put(m, idx, p); + *prev = got; + return 0; + } + /* idx was freed in the meantime, retry */ + uk_sched_yield(); + } else { + /* idx was free, we're basically a put now */ + got = ukarch_exchange_n(&m->map[idx], p); + UK_ASSERT(got == NULL); + /* Honor the documented contract: no previous entry */ + *prev = NULL; + return 0; + } + } +} + +#endif /* __UK_FDTAB_FMAP_H__ */ diff --git a/lib/posix-fdtab/include/uk/posix-fdtab.h b/lib/posix-fdtab/include/uk/posix-fdtab.h new file mode 100644 index 000000000..cc9e329ea --- /dev/null +++ b/lib/posix-fdtab/include/uk/posix-fdtab.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright (c) 2023, Unikraft GmbH and The Unikraft Authors. + * Licensed under the BSD-3-Clause License (the "License"). + * You may not use this file except in compliance with the License. + */ + +/* POSIX compatible file descriptor table */ + +#ifndef __UK_POSIX_FDTAB_H__ +#define __UK_POSIX_FDTAB_H__ + +#include +#include + +#define UK_FD_MAX INT_MAX + +/** + * Open the file `f` with `mode` and associate it with a file descriptor. + * + * The lifetime of `f` must cover the entirety of this function call. + * + * @return + * The newly allocated file descriptor on success, -ENOMEM if the open file + * description cannot be allocated, or -ENFILE if the table is full. + */ +int uk_fdtab_open(const struct uk_file *f, unsigned int mode); + +/** + * Get the open file description associated with descriptor `fd`. + * + * Users should call uk_fdtab_ret when done with the open file reference. + * + * @return + * Open file reference or NULL if `fd` is not an open file descriptor. 
+ */ +struct uk_ofile *uk_fdtab_get(int fd); + +/** + * Return a reference to an open file when done using it. + */ +void uk_fdtab_ret(struct uk_ofile *of); + +/** + * Set flags on file descriptor. Currently only supports O_CLOEXEC. + * + * @return + * 0 if successful, < 0 if `fd` not mapped or `flags` contains anything + * other than O_CLOEXEC + */ +int uk_fdtab_setflags(int fd, int flags); + +/** + * Get the flags set on `fd`. Currently only supports O_CLOEXEC. + * + * @return + * >= 0, flags set on `fd` + * < 0, if `fd` not mapped + */ +int uk_fdtab_getflags(int fd); + +/** + * Close all files that have the O_CLOEXEC flag set. + * + * Assumes no other threads are touching the active fdtab during the call. + */ +void uk_fdtab_cloexec(void); + +/* Internal syscalls; close returns 0 and the dup variants return the new fd on success, all return a negative errno value on failure */ +int uk_sys_close(int fd); +int uk_sys_dup(int oldfd); +int uk_sys_dup_min(int oldfd, int min, int flags); +int uk_sys_dup2(int oldfd, int newfd); +int uk_sys_dup3(int oldfd, int newfd, int flags); + +#endif /* __UK_POSIX_FDTAB_H__ */