Define two routine tables: syscall_entertab and syscall_exittab.
These tables are iterated over on system call entry and exit,
respectively.
By registering into one of these tables, one can have a custom
function called on entry to, or exit from, a system call.
The order these routines are executed in is dictated by their
priority: lower priority means earlier.
Note that system calls may be nested, e.g. a system call handler
invoked through a binary system call may end up calling a system
call of its own. To deal with such cases, introduce a TLS variable for
keeping track of when we enter/exit a syscall, binary or native. This way
we are able to let registered handlers know whether they are in a nested
context or not.
Very important is that we must remember to reset this TLS variable to 0
in the context of exiting execve since the process is born anew with a
fresh counter.
Signed-off-by: Sergiu Moga <sergiu@unikraft.io>
Approved-by: Michalis Pappas <michalis@unikraft.io>
Reviewed-by: Michalis Pappas <michalis@unikraft.io>
GitHub-Closes: #1277
LIBSYSCALL_SHIM_SRCS-$(CONFIG_LIBSYSCALL_SHIM_HANDLER) += $(LIBSYSCALL_SHIM_BASE)/uk_syscall_binary.c|isr
LIBSYSCALL_SHIM_SRCS-y += $(LIBSYSCALL_SHIM_BASE)/uk_prsyscall.c
+LIBSYSCALL_SHIM_SRCS-$(CONFIG_LIBPOSIX_PROCESS_EXECVE) += $(LIBSYSCALL_SHIM_BASE)/execve.c
LIBSYSCALL_SHIM_SRCS-y += $(LIBSYSCALL_SHIM_BASE)/vars.c
+LIBSYSCALL_SHIM_SRCS-y += $(LIBSYSCALL_SHIM_BASE)/syscall_exittab.lds.S
+LIBSYSCALL_SHIM_SRCS-y += $(LIBSYSCALL_SHIM_BASE)/syscall_entertab.lds.S
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright (c) 2025, Unikraft GmbH and The Unikraft Authors.
+ * Licensed under the BSD-3-Clause License (the "License").
+ * You may not use this file except in compliance with the License.
+ */
+
+#include <uk/event.h>
+#include <uk/prio.h>
+#include <uk/process.h>
+#include <uk/syscall.h>
+#include <uk/thread.h>
+
+/*
+ * POSIX_PROCESS_EXECVE_EVENT handler: zero the nested system call depth
+ * stored in the uktls of the thread carried by the event.
+ *
+ * @param data
+ *   Event payload; must be a non-NULL pointer to a
+ *   struct posix_process_execve_event_data with a non-NULL thread member
+ * @return
+ *   UK_EVENT_HANDLED_CONT
+ */
+static int syscall_nested_depth_reset(void *data)
+{
+	struct posix_process_execve_event_data *event_data = data;
+
+	UK_ASSERT(data);
+	UK_ASSERT(event_data->thread);
+
+	/* execve() always enters ukplat_syscall_handler() with a sane
+	 * depth value:
+	 *
+	 * - when preceded by a vfork() we start off with a new uktls,
+	 *   and hence enter ukplat_syscall_handler() with zero depth.
+	 * - when NOT preceded by a vfork(), the previous syscall left
+	 *   the depth balanced, so we also enter
+	 *   ukplat_syscall_handler() with a depth of zero.
+	 *
+	 * However, execve() does not go through the syscall return path,
+	 * so the increment done on entry is never undone. Reset the
+	 * counter here so that the next syscall starts off at zero.
+	 */
+	uk_thread_uktls_var(event_data->thread, uk_syscall_nested_depth) = 0;
+
+	return UK_EVENT_HANDLED_CONT;
+}
+
+/* Hook syscall_nested_depth_reset() into POSIX_PROCESS_EXECVE_EVENT with
+ * priority UK_PRIO_EARLIEST.
+ */
+UK_EVENT_HANDLER_PRIO(POSIX_PROCESS_EXECVE_EVENT,
+ syscall_nested_depth_reset,
+ UK_PRIO_EARLIEST);
#include <uk/errptr.h>
#include <errno.h>
#include <stdarg.h>
+#include <stddef.h>
#include <uk/print.h>
#include <uk/legacy_syscall.h>
#include <uk/bits/syscall_linuxabi.h>
+#include <uk/syscall_exittab.h>
+#include <uk/syscall_entertab.h>
#ifdef __cplusplus
extern "C" {
#endif
+#if CONFIG_LIBSYSCALL_SHIM
+/*
+ * Entry-side hook of the native system call wrappers: account for the new
+ * system call in the per-thread nesting depth, then run the system call
+ * enter table with a context that has no flags set.
+ *
+ * @param execenv
+ *   Execution environment forwarded to the enter handlers; may be NULL
+ */
+static inline
+void _uk_syscall_wrapper_do_entertab(struct ukarch_execenv *execenv)
+{
+	struct uk_syscall_enter_ctx ctx;
+
+	/* The depth is bumped before it is handed to the handlers, so the
+	 * value they observe already includes this system call.
+	 */
+	uk_syscall_enter_ctx_init(&ctx, execenv, ++uk_syscall_nested_depth, 0);
+	uk_syscall_entertab_run(&ctx);
+}
+#else /* !CONFIG_LIBSYSCALL_SHIM */
+/* No-op variant used when CONFIG_LIBSYSCALL_SHIM is disabled. */
+static inline
+void _uk_syscall_wrapper_do_entertab(struct ukarch_execenv *execenv __unused)
+{
+}
+#endif /* !CONFIG_LIBSYSCALL_SHIM */
+
+#if CONFIG_LIBSYSCALL_SHIM
+/*
+ * Exit-side hook of the native system call wrappers: run the system call
+ * exit table with a context that has no flags set, then drop this system
+ * call from the per-thread nesting depth.
+ *
+ * @param execenv
+ *   Execution environment forwarded to the exit handlers; may be NULL
+ */
+static inline
+void _uk_syscall_wrapper_do_exittab(struct ukarch_execenv *execenv)
+{
+	struct uk_syscall_exit_ctx ctx;
+
+	/* Handlers still see the depth that includes this system call;
+	 * it is only decremented once they have all run.
+	 */
+	uk_syscall_exit_ctx_init(&ctx, execenv, uk_syscall_nested_depth, 0);
+	uk_syscall_exittab_run(&ctx);
+
+	--uk_syscall_nested_depth;
+}
+#else /* !CONFIG_LIBSYSCALL_SHIM */
+/* No-op variant used when CONFIG_LIBSYSCALL_SHIM is disabled. */
+static inline
+void _uk_syscall_wrapper_do_exittab(struct ukarch_execenv *execenv __unused)
+{
+}
+#endif /* !CONFIG_LIBSYSCALL_SHIM */
+
/*
* Whenever the hidden Config.uk option LIBSYSCALL_SHIM_NOWRAPPER
* is set, the creation of libc-style wrappers are disable by the
{ \
long ret; \
\
+ _uk_syscall_wrapper_do_entertab(NULL); \
+ \
__UK_SYSCALL_PRINTD(x, rtype, ename, __VA_ARGS__); \
+ \
ret = (long) __##ename( \
UK_ARG_MAPx(x, UK_S_ARG_CAST_ACTUAL, __VA_ARGS__)); \
+ \
+ _uk_syscall_wrapper_do_exittab(NULL); \
+ \
return ret; \
} \
static inline rtype __##ename(UK_ARG_MAPx(x, \
long ret; \
\
__UK_SYSCALL_PRINTD(x, rtype, rname, __VA_ARGS__); \
+ \
+ _uk_syscall_wrapper_do_entertab(NULL); \
+ \
ret = (long) __##rname( \
UK_ARG_MAPx(x, UK_S_ARG_CAST_ACTUAL, __VA_ARGS__)); \
+ \
+ _uk_syscall_wrapper_do_exittab(NULL); \
+ \
return ret; \
} \
static inline rtype __##rname(UK_ARG_MAPx(x, \
static inline rtype __##rname(UK_EXECENV_DECLMAPx(UK_S_EXECENV_ARG_ACTUAL,\
x, UK_S_ARG_ACTUAL,\
__VA_ARGS__)); \
- long __used rname(long _execenv) \
+ long __used rname(long _execenv) \
{ \
struct ukarch_execenv *execenv; \
long ret; \
execenv = (struct ukarch_execenv *)_execenv; \
__UK_SYSCALL_EXECENV_PRINTD(execenv, x, rtype, rname, \
__VA_ARGS__); \
+ \
+ _uk_syscall_wrapper_do_entertab(execenv); \
+ \
ret = (long) __##rname(UK_EXECENV_CALLMAPx(x, \
UK_S_ARG_ACTUAL, \
__VA_ARGS__)); \
+ \
+ _uk_syscall_wrapper_do_exittab(execenv); \
+ \
return ret; \
} \
- static inline rtype __used __##rname(UK_EXECENV_DECLMAPx( \
+ static inline rtype __used __##rname(UK_EXECENV_DECLMAPx( \
UK_S_EXECENV_ARG_ACTUAL_MAYBE_UNUSED,\
x, UK_S_ARG_ACTUAL_MAYBE_UNUSED,\
__VA_ARGS__))
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright (c) 2024, Unikraft GmbH and The Unikraft Authors.
+ * Licensed under the BSD-3-Clause License (the "License").
+ * You may not use this file except in compliance with the License.
+ */
+
+#ifndef __UK_SYSCALL_ENTERTAB_H__
+#define __UK_SYSCALL_ENTERTAB_H__
+
+#include <uk/assert.h>
+#include <uk/arch/ctx.h>
+
+/* Context handed to every system call enter table handler. */
+struct uk_syscall_enter_ctx {
+ /* Execution environment of the system call; may be NULL */
+ struct ukarch_execenv *execenv;
+ /* Nesting depth including this system call: 1 for an outermost
+ * system call, the caller's depth + 1 for a nested one
+ */
+ unsigned long nested_depth;
+/* Flag: the handler runs in a binary system call's context */
+#define UK_SYSCALL_ENTER_CTX_BINARY_SYSCALL (1 << 0)
+ /* Bitmask of UK_SYSCALL_ENTER_CTX_* flags */
+ __u32 flags;
+};
+
+#if CONFIG_LIBSYSCALL_SHIM
+/**
+ * Fill in all fields of a system call enter context.
+ *
+ * @param enter_ctx
+ *   Enter context to initialize; must not be NULL
+ * @param execenv
+ *   Execution environment to store in the context
+ * @param nested_depth
+ *   Number of system calls (native or binary) already in progress on the
+ *   current context, plus one for the system call whose enter table is
+ *   about to run. Lets handlers tell whether they are in a nested system
+ *   call: 1 for the first system call, the calling system call's
+ *   nested_depth + 1 for a system call called by another system call.
+ * @param flags
+ *   System call enter context flags:
+ *   UK_SYSCALL_ENTER_CTX_BINARY_SYSCALL	We are in a binary system
+ *						call's context
+ */
+static inline
+void uk_syscall_enter_ctx_init(struct uk_syscall_enter_ctx *enter_ctx,
+			       struct ukarch_execenv *execenv,
+			       unsigned long nested_depth,
+			       __u32 flags)
+{
+	UK_ASSERT(enter_ctx);
+
+	enter_ctx->flags = flags;
+	enter_ctx->nested_depth = nested_depth;
+	enter_ctx->execenv = execenv;
+}
+#else /* !CONFIG_LIBSYSCALL_SHIM */
+/* No-op variant used when CONFIG_LIBSYSCALL_SHIM is disabled. */
+static inline
+void uk_syscall_enter_ctx_init(struct uk_syscall_enter_ctx *enter_ctx __unused,
+			       struct ukarch_execenv *execenv __unused,
+			       unsigned long nested_depth __unused,
+			       __u32 flags __unused)
+{
+}
+#endif /* !CONFIG_LIBSYSCALL_SHIM */
+
+/* Signature of a system call enter handler; it receives the enter context. */
+typedef void (*uk_syscall_entertab_func_t)(struct uk_syscall_enter_ctx *);
+
+/* One slot of the system call enter table; func may be NULL. */
+struct uk_syscall_entertab_entry {
+ uk_syscall_entertab_func_t func;
+};
+
+/* Bounds of the system call enter table: its first entry and the
+ * one-past-the-end marker.
+ */
+extern const struct uk_syscall_entertab_entry uk_syscall_entertab_start[];
+extern const struct uk_syscall_entertab_entry uk_syscall_entertab_end;
+
+/* Per-thread system call nesting depth */
+extern __thread unsigned long uk_syscall_nested_depth;
+
+/**
+ * Helper macro for iterating over system call enter functions.
+ * Please note that the table may contain NULL pointer entries.
+ * The const qualifier of the table is cast away for the iterator, and
+ * `itr` is evaluated several times, so it must be free of side effects.
+ *
+ * @param itr
+ * Iterator variable (struct uk_syscall_entertab_entry *) which points to the
+ * individual table entries during iteration
+ * @param tab_start
+ * Start address of table (type: const struct uk_syscall_entertab_entry[])
+ * @param tab_end
+ * End address of table (type: const struct uk_syscall_entertab_entry)
+ */
+#define uk_syscall_entertab_foreach(itr, tab_start, tab_end) \
+ for ((itr) = DECONST(struct uk_syscall_entertab_entry *, tab_start);\
+ (itr) < &(tab_end); \
+ (itr)++)
+
+/**
+ * Register a Unikraft system call enter function.
+ *
+ * Use uk_syscall_entertab_prio() for registering. The inner helper macros
+ * exist so that `prio` is macro-expanded before it gets stringified into
+ * the section name and pasted into the identifier of the table entry.
+ *
+ * @param fn
+ * System call enter function to be called
+ * @param prio
+ * Priority level: (`UK_PRIO_EARLIEST` to `UK_PRIO_LATEST`).
+ * Use the UK_PRIO_AFTER()/UK_PRIO_BEFORE() helper macro for computing
+ * priority dependencies.
+ * Note: Any other value for prio will be ignored
+ */
+#define __UK_SYSCALL_ENTERTAB_ENTRY(fn, prio) \
+ static const struct uk_syscall_entertab_entry \
+ __used __section(".uk_syscall_entertab" #prio) __align(8) \
+ __uk_syscall_entertab ## prio ## _ ## fn = { \
+ .func = (fn), \
+ }
+
+#define _UK_SYSCALL_ENTERTAB(fn, prio) \
+ __UK_SYSCALL_ENTERTAB_ENTRY(fn, prio)
+
+#define uk_syscall_entertab_prio(fn, prio) \
+ _UK_SYSCALL_ENTERTAB(fn, prio)
+
+#if CONFIG_LIBSYSCALL_SHIM
+/**
+ * Call, in table order, every routine registered in the system call enter
+ * table. Empty (NULL) slots are skipped.
+ *
+ * Handlers return void, so a failure cannot be reported back to the system
+ * call caller.
+ * NOTE(review): a handler that cannot proceed is presumably expected to
+ * crash the system itself - confirm.
+ *
+ * @param enter_ctx
+ *   Pointer to the system call enter context handed to each routine;
+ *   must not be NULL
+ */
+static inline
+void uk_syscall_entertab_run(struct uk_syscall_enter_ctx *enter_ctx)
+{
+	struct uk_syscall_entertab_entry *entry;
+
+	UK_ASSERT(enter_ctx);
+
+	uk_pr_debug("Syscall enter table @ %p - %p\n",
+		    &uk_syscall_entertab_start[0], &uk_syscall_entertab_end);
+
+	uk_syscall_entertab_foreach(entry,
+				    uk_syscall_entertab_start,
+				    uk_syscall_entertab_end) {
+		UK_ASSERT(entry);
+
+		if (entry->func) {
+			uk_pr_debug("Call syscall enter table entry: %p(%p)...\n",
+				    entry->func, enter_ctx);
+			entry->func(enter_ctx);
+		}
+	}
+}
+#else /* !CONFIG_LIBSYSCALL_SHIM */
+/* No-op variant used when CONFIG_LIBSYSCALL_SHIM is disabled. */
+static inline
+void uk_syscall_entertab_run(struct uk_syscall_enter_ctx *enter_ctx __unused)
+{
+}
+#endif /* !CONFIG_LIBSYSCALL_SHIM */
+#endif /* __UK_SYSCALL_ENTERTAB_H__ */
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright (c) 2024, Unikraft GmbH and The Unikraft Authors.
+ * Licensed under the BSD-3-Clause License (the "License").
+ * You may not use this file except in compliance with the License.
+ */
+
+#ifndef __UK_SYSCALL_EXITTAB_H__
+#define __UK_SYSCALL_EXITTAB_H__
+
+#include <uk/assert.h>
+#include <uk/arch/ctx.h>
+
+/* Context handed to every system call exit table handler. */
+struct uk_syscall_exit_ctx {
+ /* Execution environment of the system call; may be NULL */
+ struct ukarch_execenv *execenv;
+ /* Nesting depth including this system call: 1 for an outermost
+ * system call, the caller's depth + 1 for a nested one
+ */
+ unsigned long nested_depth;
+/* Flag: the handler runs in a binary system call's context */
+#define UK_SYSCALL_EXIT_CTX_BINARY_SYSCALL (1 << 0)
+ /* Bitmask of UK_SYSCALL_EXIT_CTX_* flags */
+ __u32 flags;
+};
+
+#if CONFIG_LIBSYSCALL_SHIM
+/**
+ * Fill in all fields of a system call exit context.
+ *
+ * @param exit_ctx
+ *   Exit context to initialize; must not be NULL
+ * @param execenv
+ *   Execution environment to store in the context
+ * @param nested_depth
+ *   Number of system calls (native or binary) already in progress on the
+ *   current context, plus one for the system call whose exit table is
+ *   about to run. Lets handlers tell whether they are in a nested system
+ *   call: 1 for the first system call, the calling system call's
+ *   nested_depth + 1 for a system call called by another system call.
+ * @param flags
+ *   System call exit context flags:
+ *   UK_SYSCALL_EXIT_CTX_BINARY_SYSCALL	We are in a binary system
+ *						call's context
+ */
+static inline
+void uk_syscall_exit_ctx_init(struct uk_syscall_exit_ctx *exit_ctx,
+			      struct ukarch_execenv *execenv,
+			      unsigned long nested_depth,
+			      __u32 flags)
+{
+	UK_ASSERT(exit_ctx);
+
+	exit_ctx->flags = flags;
+	exit_ctx->nested_depth = nested_depth;
+	exit_ctx->execenv = execenv;
+}
+#else /* !CONFIG_LIBSYSCALL_SHIM */
+/* No-op variant used when CONFIG_LIBSYSCALL_SHIM is disabled. */
+static inline
+void uk_syscall_exit_ctx_init(struct uk_syscall_exit_ctx *exit_ctx __unused,
+			      struct ukarch_execenv *execenv __unused,
+			      unsigned long nested_depth __unused,
+			      __u32 flags __unused)
+{
+}
+#endif /* !CONFIG_LIBSYSCALL_SHIM */
+
+/* Signature of a system call exit handler; it receives the exit context. */
+typedef void (*uk_syscall_exittab_func_t)(struct uk_syscall_exit_ctx *);
+
+/* One slot of the system call exit table; func may be NULL. */
+struct uk_syscall_exittab_entry {
+ uk_syscall_exittab_func_t func;
+};
+
+/* Bounds of the system call exit table: its first entry and the
+ * one-past-the-end marker.
+ */
+extern const struct uk_syscall_exittab_entry uk_syscall_exittab_start[];
+extern const struct uk_syscall_exittab_entry uk_syscall_exittab_end;
+
+/* Per-thread system call nesting depth */
+extern __thread unsigned long uk_syscall_nested_depth;
+
+/**
+ * Helper macro for iterating over system call exit functions.
+ * Please note that the table may contain NULL pointer entries.
+ * The const qualifier of the table is cast away for the iterator, and
+ * `itr` is evaluated several times, so it must be free of side effects.
+ *
+ * @param itr
+ * Iterator variable (struct uk_syscall_exittab_entry *) which points to the
+ * individual table entries during iteration
+ * @param tab_start
+ * Start address of table (type: const struct uk_syscall_exittab_entry[])
+ * @param tab_end
+ * End address of table (type: const struct uk_syscall_exittab_entry)
+ */
+#define uk_syscall_exittab_foreach(itr, tab_start, tab_end) \
+ for ((itr) = DECONST(struct uk_syscall_exittab_entry *, tab_start);\
+ (itr) < &(tab_end); \
+ (itr)++)
+
+/**
+ * Register a Unikraft system call exit function.
+ *
+ * Use uk_syscall_exittab_prio() for registering. The inner helper macros
+ * exist so that `prio` is macro-expanded before it gets stringified into
+ * the section name and pasted into the identifier of the table entry.
+ *
+ * @param fn
+ * System call exit function to be called
+ * @param prio
+ * Priority level: (`UK_PRIO_EARLIEST` to `UK_PRIO_LATEST`).
+ * Use the UK_PRIO_AFTER()/UK_PRIO_BEFORE() helper macro for computing
+ * priority dependencies.
+ * Note: Any other value for prio will be ignored
+ */
+#define __UK_SYSCALL_EXITTAB_ENTRY(fn, prio) \
+ static const struct uk_syscall_exittab_entry \
+ __used __section(".uk_syscall_exittab" #prio) __align(8) \
+ __uk_syscall_exittab ## prio ## _ ## fn = { \
+ .func = (fn), \
+ }
+
+#define _UK_SYSCALL_EXITTAB(fn, prio) \
+ __UK_SYSCALL_EXITTAB_ENTRY(fn, prio)
+
+#define uk_syscall_exittab_prio(fn, prio) \
+ _UK_SYSCALL_EXITTAB(fn, prio)
+
+#if CONFIG_LIBSYSCALL_SHIM
+/**
+ * Call, in table order, every routine registered in the system call exit
+ * table. Empty (NULL) slots are skipped.
+ *
+ * Handlers return void, so a failure cannot be reported back to the system
+ * call caller.
+ * NOTE(review): a handler that cannot proceed is presumably expected to
+ * crash the system itself - confirm.
+ *
+ * @param exit_ctx
+ *   Pointer to the system call exit context handed to each routine;
+ *   must not be NULL
+ */
+static inline
+void uk_syscall_exittab_run(struct uk_syscall_exit_ctx *exit_ctx)
+{
+	struct uk_syscall_exittab_entry *entry;
+
+	UK_ASSERT(exit_ctx);
+
+	uk_pr_debug("Syscall exit table @ %p - %p\n",
+		    &uk_syscall_exittab_start[0], &uk_syscall_exittab_end);
+
+	uk_syscall_exittab_foreach(entry,
+				   uk_syscall_exittab_start,
+				   uk_syscall_exittab_end) {
+		UK_ASSERT(entry);
+
+		if (entry->func) {
+			uk_pr_debug("Call syscall exit table entry: %p(%p)...\n",
+				    entry->func, exit_ctx);
+			entry->func(exit_ctx);
+		}
+	}
+}
+#else /* !CONFIG_LIBSYSCALL_SHIM */
+/* No-op variant used when CONFIG_LIBSYSCALL_SHIM is disabled. */
+static inline
+void uk_syscall_exittab_run(struct uk_syscall_exit_ctx *exit_ctx __unused)
+{
+}
+#endif /* !CONFIG_LIBSYSCALL_SHIM */
+#endif /* __UK_SYSCALL_EXITTAB_H__ */
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright (c) 2024, Unikraft GmbH and The Unikraft Authors.
+ * Licensed under the BSD-3-Clause License (the "License").
+ * You may not use this file except in compliance with the License.
+ */
+
+/* Gather the .uk_syscall_entertab<digit> input sections into a single output
+ * section placed after .rodata and bracket it with start/end symbols.
+ */
+SECTIONS
+{
+ . = ALIGN(8);
+
+ .uk_syscall_entertab :
+ {
+ PROVIDE(uk_syscall_entertab_start = .);
+ /* KEEP protects the entries from section garbage collection;
+ * SORT_BY_NAME orders them by their single-digit section suffix
+ */
+ KEEP(*(SORT_BY_NAME(.uk_syscall_entertab[0-9])))
+ PROVIDE(uk_syscall_entertab_end = .);
+ }
+}
+INSERT AFTER .rodata;
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright (c) 2024, Unikraft GmbH and The Unikraft Authors.
+ * Licensed under the BSD-3-Clause License (the "License").
+ * You may not use this file except in compliance with the License.
+ */
+
+/* Gather the .uk_syscall_exittab<digit> input sections into a single output
+ * section placed after .rodata and bracket it with start/end symbols.
+ */
+SECTIONS
+{
+ . = ALIGN(8);
+
+ .uk_syscall_exittab :
+ {
+ PROVIDE(uk_syscall_exittab_start = .);
+ /* KEEP protects the entries from section garbage collection;
+ * SORT_BY_NAME orders them by their single-digit section suffix
+ */
+ KEEP(*(SORT_BY_NAME(.uk_syscall_exittab[0-9])))
+ PROVIDE(uk_syscall_exittab_end = .);
+ }
+}
+INSERT AFTER .rodata;
#endif /* !CONFIG_LIBSYSCALL_SHIM_STRACE_ANSI_COLOR */
int prsyscalllen __maybe_unused;
#endif /* CONFIG_LIBSYSCALL_SHIM_STRACE */
+ struct uk_syscall_enter_ctx enter_ctx;
+ struct uk_syscall_exit_ctx exit_ctx;
struct ukarch_auxspcb *auxspcb;
struct ukarch_execenv *execenv;
#if CONFIG_LIBSYSCALL_SHIM_HANDLER_ULTLS
execenv->regs.__syscall_rarg1);
#endif /* CONFIG_LIBSYSCALL_SHIM_DEBUG_HANDLER */
+ uk_syscall_nested_depth++;
+ uk_syscall_enter_ctx_init(&enter_ctx,
+ execenv, uk_syscall_nested_depth,
+ UK_SYSCALL_ENTER_CTX_BINARY_SYSCALL);
+ uk_syscall_entertab_run(&enter_ctx);
+
execenv->regs.__syscall_rret0 = uk_syscall6_r_e(execenv);
+ uk_syscall_exit_ctx_init(&exit_ctx,
+ execenv, uk_syscall_nested_depth,
+ UK_SYSCALL_EXIT_CTX_BINARY_SYSCALL);
+ uk_syscall_exittab_run(&exit_ctx);
+ uk_syscall_nested_depth--;
+
#if CONFIG_LIBSYSCALL_SHIM_STRACE
prsyscalllen = uk_snprsyscall(prsyscallbuf, ARRAY_SIZE(prsyscallbuf),
#if CONFIG_LIBSYSCALL_SHIM_STRACE_ANSI_COLOR
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright (c) 2024, Unikraft GmbH and The Unikraft Authors.
+ * Licensed under the BSD-3-Clause License (the "License").
+ * You may not use this file except in compliance with the License.
+ */
+
#include <uk/syscall.h>
+/* Per-thread count of system calls (native or binary) currently in progress:
+ * incremented before the syscall enter table runs and decremented after the
+ * syscall exit table has run.
+ */
+__thread unsigned long uk_syscall_nested_depth;