$(eval $(call import_lib,$(CONFIG_UK_BASE)/lib/nolibc))
$(eval $(call import_lib,$(CONFIG_UK_BASE)/lib/posix-environ))
$(eval $(call import_lib,$(CONFIG_UK_BASE)/lib/posix-event))
+$(eval $(call import_lib,$(CONFIG_UK_BASE)/lib/posix-fdtab))
$(eval $(call import_lib,$(CONFIG_UK_BASE)/lib/posix-libdl))
$(eval $(call import_lib,$(CONFIG_UK_BASE)/lib/posix-mmap))
$(eval $(call import_lib,$(CONFIG_UK_BASE)/lib/posix-process))
--- /dev/null
+menuconfig LIBPOSIX_FDTAB
+	bool "posix-fdtab: File descriptor table"
+	select LIBUKFILE
+	help
+	  Provide a POSIX-compatible file descriptor table that maps
+	  integer descriptors to open file descriptions.
+
+if LIBPOSIX_FDTAB
+	config LIBPOSIX_FDTAB_MAXFDS
+		int "Maximum number of file descriptors"
+		default 1024
+		help
+		  Static size of the file descriptor table; an upper bound on
+		  the number of simultaneously open descriptors.
+
+endif
--- /dev/null
+# Register the library with the Unikraft build system,
+# gated on CONFIG_LIBPOSIX_FDTAB
+$(eval $(call addlib_s,libposix_fdtab,$(CONFIG_LIBPOSIX_FDTAB)))
+
+# Export the public headers to both C and C++ include paths
+CINCLUDES-$(CONFIG_LIBPOSIX_FDTAB) += -I$(LIBPOSIX_FDTAB_BASE)/include
+CXXINCLUDES-$(CONFIG_LIBPOSIX_FDTAB) += -I$(LIBPOSIX_FDTAB_BASE)/include
+
+# Library sources
+LIBPOSIX_FDTAB_SRCS-y += $(LIBPOSIX_FDTAB_BASE)/fdtab.c
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright (c) 2023, Unikraft GmbH and The Unikraft Authors.
+ * Licensed under the BSD-3-Clause License (the "License").
+ * You may not use this file except in compliance with the License.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+
+#include <uk/alloc.h>
+#include <uk/assert.h>
+#include <uk/config.h>
+#include <uk/init.h>
+
+#include <uk/posix-fdtab.h>
+
+#include "fmap.h"
+
+
+#define UK_FDTAB_SIZE CONFIG_LIBPOSIX_FDTAB_MAXFDS
+UK_CTASSERT(UK_FDTAB_SIZE <= UK_FD_MAX);
+
+/* Static init fdtab */
+
+/* Backing storage for the boot-time fdtab: free-index bitmap + entry array,
+ * statically sized by CONFIG_LIBPOSIX_FDTAB_MAXFDS */
+static char init_bmap[UK_BMAP_SZ(UK_FDTAB_SIZE)];
+static void *init_fdmap[UK_FDTAB_SIZE];
+
+/* A file descriptor table: allocator for open-file structs + fd->entry map */
+struct uk_fdtab {
+	struct uk_alloc *alloc; /* allocator used for struct uk_ofile */
+	struct uk_fmap fmap;    /* lock-free fd -> encoded entry map */
+};
+
+static struct uk_fdtab init_fdtab = {
+	.fmap = {
+		.bmap = {
+			.size = UK_FDTAB_SIZE,
+			.bitmap = (unsigned long *)init_bmap
+		},
+		.map = init_fdmap
+	}
+};
+
+/* One-time setup of the static boot fdtab: pick the default allocator and
+ * reset the fd map. Always returns 0 (initcall convention). */
+static int init_posix_fdtab(void)
+{
+	init_fdtab.alloc = uk_alloc_get_default();
+	/* Consider skipping init for .map (static vars are inited to 0) */
+	uk_fmap_init(&init_fdtab.fmap);
+	return 0;
+}
+
+/* Init fdtab as early as possible, to enable functions that rely on fds */
+uk_early_initcall_prio(init_posix_fdtab, UK_PRIO_EARLIEST);
+
+/* TODO: Adapt when multiple processes are supported */
+static inline struct uk_fdtab *_active_tab(void)
+{
+	/* Single-process for now: the static boot table is always active */
+	return &init_fdtab;
+}
+
+/* Encode flags in entry pointer using the least significant bits */
+/* made available by the open file structure's alignment */
+
+/* Decoded form of a table entry: ofile pointer + per-fd flag bits */
+struct fdval {
+	void *p;
+	int flags;
+};
+
+/* Per-fd flag: close this descriptor on exec() */
+#define UK_FDTAB_CLOEXEC 1
+
+/* Highest flag bit in use; all flags must fit below pointer alignment */
+#define _MAX_FLAG 1
+
+#define _FLAG_MASK (((uintptr_t)_MAX_FLAG << 1) - 1)
+
+/* Ensure ofile alignment leaves enough low bits free for the flags */
+UK_CTASSERT(__alignof__(struct uk_ofile) > _MAX_FLAG);
+
+/* Pack `flags` into the low bits of `f`; `f` must be suitably aligned */
+static inline const void *fdtab_encode(const void *f, int flags)
+{
+	UK_ASSERT(!((uintptr_t)f & _FLAG_MASK));
+	return (const void *)((uintptr_t)f | flags);
+}
+
+/* Split an encoded entry back into pointer and flag bits */
+static inline struct fdval fdtab_decode(void *p)
+{
+	uintptr_t v = (uintptr_t)p;
+
+	return (struct fdval) {
+		.p = (void *)(v & ~_FLAG_MASK),
+		.flags = v & _FLAG_MASK
+	};
+}
+
+/* struct uk_ofile allocation & refcounting */
+
+/* Allocate a new open-file struct from the table's allocator; NULL on OOM */
+static inline struct uk_ofile *ofile_new(struct uk_fdtab *tab)
+{
+	struct uk_ofile *of = uk_malloc(tab->alloc, sizeof(*of));
+
+	if (of)
+		uk_ofile_init(of);
+	return of;
+}
+/* Free an open-file struct previously allocated with ofile_new() */
+static inline void ofile_del(struct uk_fdtab *tab, struct uk_ofile *of)
+{
+	uk_free(tab->alloc, of);
+}
+
+/* Take a reference on an open file */
+static inline void ofile_acq(struct uk_ofile *of)
+{
+	uk_refcount_acquire(&of->refcnt);
+}
+/* Drop a reference; the last ref releases the file and frees the ofile */
+static inline void ofile_rel(struct uk_fdtab *tab, struct uk_ofile *of)
+{
+	if (uk_refcount_release(&of->refcnt)) {
+		uk_file_release(of->file);
+		ofile_del(tab, of);
+	}
+}
+
+/* Entry-level acquire/release helpers; the flags argument is unused here */
+#define file_acq(p, f) ofile_acq((struct uk_ofile *)(p))
+#define file_rel(t, p, f) ofile_rel((t), (struct uk_ofile *)(p))
+
+/* Ops */
+
+/* Open `f` with `mode` and install it at the lowest free fd.
+ * Returns the new fd, -ENOMEM on allocation failure, -ENFILE if table full. */
+int uk_fdtab_open(const struct uk_file *f, unsigned int mode)
+{
+	struct uk_fdtab *tab;
+	struct uk_ofile *of;
+	int flags;
+	const void *entry;
+	int fd;
+
+	UK_ASSERT(f);
+
+	tab = _active_tab();
+	of = ofile_new(tab);
+	if (!of)
+		return -ENOMEM;
+	/* Take refs on file & ofile */
+	uk_file_acquire(f);
+	ofile_acq(of);
+	/* Prepare open file */
+	of->file = f;
+	of->pos = 0;
+	of->mode = mode & ~O_CLOEXEC; /* O_CLOEXEC lives in the fd entry */
+	/* Place the file in fdtab */
+	flags = (mode & O_CLOEXEC) ? UK_FDTAB_CLOEXEC : 0;
+	entry = fdtab_encode(of, flags);
+	fd = uk_fmap_put(&tab->fmap, entry, 0);
+	if (fd >= UK_FDTAB_SIZE)
+		goto err_out;
+	return fd;
+err_out:
+	/* Release open file & file ref */
+	ofile_rel(tab, of);
+	return -ENFILE;
+}
+
+/* Set fd-level flags (only O_CLOEXEC supported).
+ * Uses critical take/put so the update is atomic w.r.t. other fd ops. */
+int uk_fdtab_setflags(int fd, int flags)
+{
+	struct uk_fdtab *tab;
+	struct uk_fmap *fmap;
+	void *p;
+	struct fdval v;
+	const void *newp;
+
+	if (flags & ~O_CLOEXEC)
+		return -EINVAL;
+
+	tab = _active_tab();
+	fmap = &tab->fmap;
+
+	p = uk_fmap_critical_take(fmap, fd);
+	if (!p)
+		return -EBADF;
+	v = fdtab_decode(p);
+	v.flags &= ~UK_FDTAB_CLOEXEC;
+	v.flags |= flags ? UK_FDTAB_CLOEXEC : 0;
+
+	newp = fdtab_encode(v.p, v.flags);
+	uk_fmap_critical_put(fmap, fd, newp);
+	return 0;
+}
+
+/* Report the fd-level flags set on `fd` (only O_CLOEXEC is tracked).
+ * Returns >= 0 flags on success, -EBADF if `fd` is not open. */
+int uk_fdtab_getflags(int fd)
+{
+	struct uk_fdtab *tab = _active_tab();
+	void *entry = uk_fmap_lookup(&tab->fmap, fd);
+
+	if (!entry)
+		return -EBADF;
+
+	return (fdtab_decode(entry).flags & UK_FDTAB_CLOEXEC) ? O_CLOEXEC : 0;
+}
+
+
+/* Look up `fd` and take a reference on its entry.
+ * Returns { NULL, 0 } if `fd` is negative or not open. */
+static struct fdval _fdtab_get(struct uk_fdtab *tab, int fd)
+{
+	struct fdval ret = { NULL, 0 };
+
+	if (fd >= 0) {
+		/* Need to refcount atomically => critical take & put */
+		struct uk_fmap *fmap = &tab->fmap;
+		void *p = uk_fmap_critical_take(fmap, fd);
+
+		if (p) {
+			ret = fdtab_decode(p);
+			file_acq(ret.p, ret.flags);
+			uk_fmap_critical_put(fmap, fd, p);
+		}
+	}
+	return ret;
+}
+
+/* Get the open file at `fd` with a fresh reference; NULL if not open.
+ * Callers return the reference with uk_fdtab_ret(). */
+struct uk_ofile *uk_fdtab_get(int fd)
+{
+	struct uk_fdtab *tab = _active_tab();
+	struct fdval v = _fdtab_get(tab, fd);
+
+	return (struct uk_ofile *)v.p;
+}
+
+/* Return a reference previously obtained via uk_fdtab_get() */
+void uk_fdtab_ret(struct uk_ofile *of)
+{
+	UK_ASSERT(of);
+	ofile_rel(_active_tab(), of);
+}
+
+/* Close every fd marked with UK_FDTAB_CLOEXEC.
+ * Assumes no other threads touch the fdtab during the call (exec path). */
+void uk_fdtab_cloexec(void)
+{
+	struct uk_fdtab *tab = _active_tab();
+	struct uk_fmap *fmap = &tab->fmap;
+
+	for (int i = 0; i < UK_FDTAB_SIZE; i++) {
+		void *p = uk_fmap_lookup(fmap, i);
+
+		if (p) {
+			struct fdval v = fdtab_decode(p);
+
+			if (v.flags & UK_FDTAB_CLOEXEC) {
+				void *pp = uk_fmap_take(fmap, i);
+
+				/* No concurrency: lookup and take must agree */
+				UK_ASSERT(p == pp);
+				file_rel(tab, v.p, v.flags);
+			}
+		}
+	}
+}
+
+/* Internal Syscalls */
+
+/* Close `fd`: remove it from the table and drop the table's reference.
+ * Returns 0 on success, -EBADF if `fd` is not open. */
+int uk_sys_close(int fd)
+{
+	struct uk_fdtab *tab;
+	void *p;
+	struct fdval v;
+
+	tab = _active_tab();
+	p = uk_fmap_take(&tab->fmap, fd);
+	if (!p)
+		return -EBADF;
+	v = fdtab_decode(p);
+	file_rel(tab, v.p, v.flags);
+	return 0;
+}
+
+/* Duplicate `oldfd` onto `newfd`, closing `newfd` first if it was open.
+ * `flags` may only contain O_CLOEXEC; oldfd == newfd yields -EINVAL. */
+int uk_sys_dup3(int oldfd, int newfd, int flags)
+{
+	int r __maybe_unused;
+	struct uk_fdtab *tab;
+	struct fdval dup;
+	void *prevp;
+	const void *newent;
+
+	if (oldfd == newfd)
+		return -EINVAL;
+	if (oldfd < 0 || oldfd >= UK_FDTAB_SIZE ||
+	    newfd < 0 || newfd >= UK_FDTAB_SIZE)
+		return -EBADF;
+	if (flags & ~O_CLOEXEC)
+		return -EINVAL;
+
+	tab = _active_tab();
+	dup = _fdtab_get(tab, oldfd); /* takes the ref that newfd will own */
+	if (!dup.p)
+		return -EBADF; /* oldfd not open */
+	dup.flags &= ~UK_FDTAB_CLOEXEC;
+	dup.flags |= flags ? UK_FDTAB_CLOEXEC : 0;
+
+	prevp = NULL;
+	newent = fdtab_encode(dup.p, dup.flags);
+	r = uk_fmap_xchg(&tab->fmap, newfd, newent, &prevp);
+	UK_ASSERT(!r); /* newfd should be in range */
+	if (prevp) {
+		/* newfd was open; release the entry we displaced */
+		struct fdval prevv = fdtab_decode(prevp);
+
+		file_rel(tab, prevv.p, prevv.flags);
+	}
+	return newfd;
+}
+
+/* POSIX dup2(): like dup3 without flags, except that oldfd == newfd is not
+ * an error; it returns newfd if oldfd is open and -EBADF otherwise. */
+int uk_sys_dup2(int oldfd, int newfd)
+{
+	/* Brace the nested conditional; relying on dangling-else binding
+	 * here is fragile under future edits */
+	if (oldfd == newfd) {
+		if (uk_fmap_lookup(&(_active_tab())->fmap, oldfd))
+			return newfd;
+		else
+			return -EBADF;
+	}
+	return uk_sys_dup3(oldfd, newfd, 0);
+}
+
+/* Duplicate `oldfd` onto the lowest free fd >= `min` (fcntl F_DUPFD style).
+ * `flags` may only contain O_CLOEXEC. Returns the new fd or -errno. */
+int uk_sys_dup_min(int oldfd, int min, int flags)
+{
+	struct uk_fdtab *tab;
+	struct fdval dup;
+	const void *newent;
+	int fd;
+
+	if (oldfd < 0)
+		return -EBADF;
+	if (flags & ~O_CLOEXEC)
+		return -EINVAL;
+
+	tab = _active_tab();
+	dup = _fdtab_get(tab, oldfd); /* takes the ref for the new entry */
+	if (!dup.p)
+		return -EBADF;
+	dup.flags &= ~UK_FDTAB_CLOEXEC;
+	dup.flags |= flags ? UK_FDTAB_CLOEXEC : 0;
+
+	newent = fdtab_encode(dup.p, dup.flags);
+	fd = uk_fmap_put(&tab->fmap, newent, min);
+	if (fd >= UK_FDTAB_SIZE) {
+		/* Table full; drop the ref we just took */
+		file_rel(tab, dup.p, dup.flags);
+		return -ENFILE;
+	}
+	return fd;
+}
+
+/* POSIX dup(): duplicate onto the lowest free fd, with no flags */
+int uk_sys_dup(int oldfd)
+{
+	return uk_sys_dup_min(oldfd, 0, 0);
+}
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright (c) 2023, Unikraft GmbH and The Unikraft Authors.
+ * Licensed under the BSD-3-Clause License (the "License").
+ * You may not use this file except in compliance with the License.
+ */
+
+/* Lock-free data structure specialized in mapping integers to pointers */
+
+#ifndef __UK_FDTAB_FMAP_H__
+#define __UK_FDTAB_FMAP_H__
+
+#include <string.h>
+
+#include <uk/arch/atomic.h>
+#include <uk/assert.h>
+#include <uk/bitops.h>
+#include <uk/bitmap.h>
+#include <uk/essentials.h>
+#include <uk/thread.h>
+
+/**
+ * Lock-free bitmap, with ones representing a free index.
+ */
+struct uk_bmap {
+	volatile unsigned long *bitmap; /* one bit per index; 1 == free */
+	size_t size;                    /* number of indices tracked */
+};
+
+/* Bytes needed to back a bitmap of `s` bits, rounded up to whole longs */
+#define UK_BMAP_SZ(s) (UK_BITS_TO_LONGS(s) * sizeof(unsigned long))
+
+
+/**
+ * Initialize the bitmap; must not be called concurrently with other functions.
+ *
+ * All indices start out free (all bits set).
+ */
+static inline void uk_bmap_init(const struct uk_bmap *bm)
+{
+	uk_bitmap_fill((void *)bm->bitmap, bm->size);
+}
+
+/**
+ * Mark `idx` as used, and return whether we were the ones to do so.
+ *
+ * @return
+ *   0 if we marked `idx` as used, non-zero otherwise
+ */
+static inline int uk_bmap_reserve(const struct uk_bmap *bm, int idx)
+{
+	unsigned long mask;
+	unsigned long v;
+
+	if (!IN_RANGE(idx, 0, bm->size))
+		return -1;
+	mask = UK_BIT_MASK(idx);
+	/* Atomically clear the bit; `v` is the word's previous value */
+	v = ukarch_and(&bm->bitmap[UK_BIT_WORD(idx)], ~mask);
+	/* We won iff the bit was still set when we cleared it */
+	return !(v & mask);
+}
+
+/**
+ * Mark `idx` as free, and return whether we were the ones to do so.
+ *
+ * @return
+ *   0 if we freed `idx`, non-zero if it was already free or out of range
+ */
+static inline int uk_bmap_free(const struct uk_bmap *bm, int idx)
+{
+	unsigned long mask;
+	unsigned long v;
+
+	if (!IN_RANGE(idx, 0, bm->size))
+		return -1;
+	mask = UK_BIT_MASK(idx);
+	/* Atomically set the bit; `v` is the word's previous value */
+	v = ukarch_or(&bm->bitmap[UK_BIT_WORD(idx)], mask);
+	/* Non-zero if the bit was already set (idx already free) */
+	return !!(v & mask);
+}
+
+/**
+ * Check whether `idx` is free.
+ *
+ * @return
+ *   non-zero if free, 0 if used, -1 if out of range
+ */
+static inline int uk_bmap_isfree(const struct uk_bmap *bm, int idx)
+{
+	if (!IN_RANGE(idx, 0, bm->size))
+		return -1;
+	return uk_test_bit(idx, bm->bitmap);
+}
+
+/**
+ * Allocate and return the smallest free index larger than `min`.
+ *
+ * @return
+ *   The allocated index or `>= bm->size` if map full.
+ */
+static inline int uk_bmap_request(const struct uk_bmap *bm, int min)
+{
+	int pos;
+
+	do {
+		/* Seems safe to cast away volatility, revisit if problem */
+		pos = uk_find_next_bit((unsigned long *)bm->bitmap,
+				       bm->size, min);
+		if (pos >= bm->size)
+			return bm->size;
+		/* If bit was already cleared, we lost the race, retry */
+	} while (uk_bmap_reserve(bm, pos));
+
+	return pos;
+}
+
+/**
+ * Mapping of integers to pointers.
+ */
+struct uk_fmap {
+	struct uk_bmap bmap; /* tracks which indices are allocated */
+	void *volatile *map; /* entry array; NULL == empty or in transit */
+};
+
+/* Bytes needed to back the entry array for `s` indices */
+#define UK_FMAP_SZ(s) ((s) * sizeof(void *))
+
+
+/* Range check against the size shared with the bitmap */
+#define _FMAP_INRANGE(m, i) IN_RANGE(i, 0, (m)->bmap.size)
+
+/**
+ * Initialize the memory for a uk_fmap.
+ *
+ * The `size` field must be correctly set and the map buffers allocated.
+ * Must not be called concurrently with other uk_fmap operations.
+ */
+static inline void uk_fmap_init(const struct uk_fmap *m)
+{
+	memset((void *)m->map, 0, m->bmap.size * sizeof(void *));
+	uk_bmap_init(&m->bmap);
+}
+
+/**
+ * Lookup the entry at `idx`.
+ *
+ * WARNING: Use of this function is vulnerable to use-after-free race conditions
+ * for entry objects whose lifetime depends on their membership in this data
+ * structure (e.g., refcounting on put, take, and after lookups).
+ * For that case please use the `uk_fmap_critical_*` functions.
+ *
+ * @return
+ *   The entry at `idx`, or NULL if out of range.
+ */
+static inline void *uk_fmap_lookup(const struct uk_fmap *m, int idx)
+{
+	void *got;
+
+	if (!_FMAP_INRANGE(m, idx))
+		return NULL;
+
+	do {
+		got = m->map[idx];
+		if (!got) {
+			/* NULL + used bit means a critical section holds it */
+			if (uk_bmap_isfree(&m->bmap, idx))
+				break; /* Entry is actually free */
+			uk_sched_yield(); /* Lost race, retry */
+		}
+	} while (!got);
+	return got;
+}
+
+/**
+ * Put an entry in the map and return its index.
+ *
+ * @return
+ *   newly allocated index, or out of range if map full
+ */
+static inline
+int uk_fmap_put(const struct uk_fmap *m, const void *p, int min)
+{
+	void *got;
+	int pos;
+
+	/* Claim a free index first; other threads now see it as used */
+	pos = uk_bmap_request(&m->bmap, min);
+	if (!_FMAP_INRANGE(m, pos))
+		return pos; /* Map full */
+
+	/* We own `pos`, so its slot must still be empty */
+	got = ukarch_exchange_n(&m->map[pos], (void *)p);
+	UK_ASSERT(got == NULL); /* There can't be stuff in there, abort */
+
+	return pos;
+}
+
+/**
+ * Take the entry at `idx` out of the map and return it.
+ *
+ * @return
+ *   Previous entry, or NULL if empty or out of range
+ */
+static inline void *uk_fmap_take(const struct uk_fmap *m, int idx)
+{
+	int v __maybe_unused;
+	void *got;
+
+	if (!_FMAP_INRANGE(m, idx))
+		return NULL;
+
+	do {
+		if (uk_bmap_isfree(&m->bmap, idx))
+			return NULL; /* Already free */
+
+		/* At most one take thread gets the previous non-NULL value */
+		got = ukarch_exchange_n(&m->map[idx], NULL);
+		if (!got)
+			/* We lost the race with a (critical) take, retry */
+			uk_sched_yield();
+	} while (!got);
+
+	/* We are that one thread; nobody else can set the bitmap */
+	v = uk_bmap_free(&m->bmap, idx);
+	UK_ASSERT(!v);
+	return got;
+}
+
+/**
+ * Take out an entry, without marking its `idx` as free.
+ *
+ * Calling this function is akin to taking a lock on `idx` which should be
+ * released as soon as practical by a matching call to `uk_fmap_critical_put`.
+ * Take care to not block in between these two calls.
+ *
+ * @return
+ *   The entry at `idx`, or NULL if not present or out of range.
+ */
+static inline
+void *uk_fmap_critical_take(const struct uk_fmap *m, int idx)
+{
+	void *got;
+
+	if (!_FMAP_INRANGE(m, idx))
+		return NULL;
+	do {
+		/* NULL slot with used bit set == entry held by another taker */
+		got = ukarch_exchange_n(&m->map[idx], NULL);
+		if (!got) {
+			if (uk_bmap_isfree(&m->bmap, idx))
+				/* idx is actually empty */
+				break;
+			/* Lost race with (critical) take, retry */
+			uk_sched_yield();
+		}
+	} while (!got);
+	return got;
+}
+
+/**
+ * Place an entry at `idx`, following a call to `uk_fmap_critical_take`.
+ *
+ * Calling on an `idx` without a matching call to `uk_fmap_critical_take`
+ * is undefined. The value of `p` need not match the value previously taken out.
+ *
+ * @return
+ *   0 on success, non-zero if `idx` out of range
+ */
+static inline
+int uk_fmap_critical_put(const struct uk_fmap *m, int idx, const void *p)
+{
+	void *got;
+
+	if (!_FMAP_INRANGE(m, idx))
+		return -1;
+
+	/* Re-mark idx as used; it may still be marked from before the take */
+	(void)uk_bmap_reserve(&m->bmap, idx);
+	/* Cast away const for the exchange, consistent with uk_fmap_put */
+	got = ukarch_exchange_n(&m->map[idx], (void *)p);
+	UK_ASSERT(got == NULL);
+	return 0;
+}
+
+/**
+ * Atomically exchange the entry at `idx` with `p`, returning previous in `prev`.
+ *
+ * If `idx` is free, it is marked as used and `*prev` is set to NULL.
+ *
+ * @return
+ *   0 on success, non-zero if `idx` out of range
+ */
+static inline
+int uk_fmap_xchg(const struct uk_fmap *m, int idx,
+		 const void *p, void **prev)
+{
+	void *got;
+
+	if (!_FMAP_INRANGE(m, idx))
+		return -1;
+
+	/* Exchanging entries directly is problematic, must use take & put */
+	for (;;) {
+		int r = uk_bmap_reserve(&m->bmap, idx);
+
+		if (r) {
+			/* There was already something there */
+			got = uk_fmap_critical_take(m, idx);
+			if (got) {
+				(void)uk_fmap_critical_put(m, idx, p);
+				*prev = got;
+				return 0;
+			}
+			/* Lost race with (critical) take, retry */
+			uk_sched_yield();
+		} else {
+			/* idx was free, we're basically a put now */
+			/* Cast away const, consistent with uk_fmap_put */
+			got = ukarch_exchange_n(&m->map[idx], (void *)p);
+			UK_ASSERT(got == NULL);
+			/* Honor the documented contract: report no previous
+			 * entry instead of leaving *prev unwritten */
+			*prev = NULL;
+			return 0;
+		}
+	}
+}
+
+#endif /* __UK_FDTAB_FMAP_H__ */
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright (c) 2023, Unikraft GmbH and The Unikraft Authors.
+ * Licensed under the BSD-3-Clause License (the "License").
+ * You may not use this file except in compliance with the License.
+ */
+
+/* POSIX compatible file descriptor table */
+
+#ifndef __UK_POSIX_FDTAB_H__
+#define __UK_POSIX_FDTAB_H__
+
+/* <limits.h> is required for INT_MAX below; include it explicitly so the
+ * header stays self-contained */
+#include <limits.h>
+
+#include <uk/config.h>
+#include <uk/ofile.h>
+
+/* Largest value usable as a file descriptor */
+#define UK_FD_MAX INT_MAX
+
+/**
+ * Open the file `f` with `mode` and associate it with a file descriptor.
+ *
+ * The lifetime of `f` must cover the entirety of this function call.
+ *
+ * @return
+ * The newly allocated file descriptor.
+ */
+int uk_fdtab_open(const struct uk_file *f, unsigned int mode);
+
+/**
+ * Get the open file description associated with descriptor `fd`.
+ *
+ * Users should call uk_fdtab_ret when done with the open file reference.
+ *
+ * @return
+ * Open file reference or NULL if `fd` is not an open file descriptor.
+ */
+struct uk_ofile *uk_fdtab_get(int fd);
+
+/**
+ * Return a reference to an open file when done using it.
+ */
+void uk_fdtab_ret(struct uk_ofile *of);
+
+/**
+ * Set flags on file descriptor. Currently only supports O_CLOEXEC.
+ *
+ * @return
+ * 0 if successful, < 0 if `fd` not mapped
+ */
+int uk_fdtab_setflags(int fd, int flags);
+
+/**
+ * Get the flags set on `fd`. Currently only supports O_CLOEXEC.
+ *
+ * @return
+ * >= 0, flags set on `fd`
+ * < 0, if `fd` not mapped
+ */
+int uk_fdtab_getflags(int fd);
+
+/**
+ * Close all files that have the O_CLOEXEC flag set.
+ *
+ * Assumes no other threads are touching the active fdtab during the call.
+ */
+void uk_fdtab_cloexec(void);
+
+/* Internal syscalls */
+int uk_sys_close(int fd);
+int uk_sys_dup(int oldfd);
+int uk_sys_dup_min(int oldfd, int min, int flags);
+int uk_sys_dup2(int oldfd, int newfd);
+int uk_sys_dup3(int oldfd, int newfd, int flags);
+
+#endif /* __UK_POSIX_FDTAB_H__ */