From: Roger Pau Monne Date: Tue, 9 Jul 2024 14:27:40 +0000 (+0200) Subject: x86/mm: switch to a per-CPU mapped stack when using ASI X-Git-Url: http://xenbits.xensource.com/gitweb?a=commitdiff_plain;h=5f082c52dd16b4bcb0163857997f8f0d30d8f5cc;p=people%2Froyger%2Fxen.git x86/mm: switch to a per-CPU mapped stack when using ASI When using ASI the CPU stack is mapped using a range of fixmap entries in the per-CPU region. This ensures the stack is only accessible by the current CPU. Note however there's further work required in order to allocate the stack from domheap instead of xenheap, and ensure the stack is not part of the direct map. For domains not running with ASI enabled all the CPU stacks are mapped in the per-domain L3, so that the stack is always at the same linear address, regardless of whether ASI is enabled or not for the domain. When calling UEFI runtime methods the current per-domain slot needs to be added to the EFI L4, so that the stack is available in UEFI. Finally, some users of callfunc IPIs pass parameters from the stack, so when handling a callfunc IPI the stack of the caller CPU is mapped into the address space of the CPU handling the IPI. This needs further work to use a bounce buffer in order to avoid having to map remote CPU stacks. Signed-off-by: Roger Pau Monné --- There's also further work required in order to avoid mapping remote stack when handling callfunc IPIs. --- diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index 6e1f622f73..fbb1b232d0 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -563,6 +563,26 @@ int arch_vcpu_create(struct vcpu *v) if ( rc ) return rc; + if ( opt_asi_hvm || opt_asi_pv ) + { + if ( is_idle_vcpu(v) || d->arch.asi ) + create_perdomain_mapping(v, PCPU_STACK_VIRT(0), + nr_cpu_ids << STACK_ORDER, false); + else if ( !v->vcpu_id ) + { + l3_pgentry_t *idle_perdomain = + __map_domain_page(idle_vcpu[0]->domain->arch.perdomain_l3_pg); + l3_pgentry_t *guest_perdomain = + __map_domain_page(d->arch.perdomain_l3_pg); + + l3e_write(&guest_perdomain[PCPU_STACK_SLOT], + idle_perdomain[PCPU_STACK_SLOT]); + + unmap_domain_page(guest_perdomain); + unmap_domain_page(idle_perdomain); + } + } + rc = mapcache_vcpu_init(v); if ( rc ) return rc; @@ -2031,6 +2051,16 @@ static void __context_switch(struct vcpu *n) } vcpu_restore_fpu_nonlazy(n, false); nd->arch.ctxt_switch->to(n); + if ( nd->arch.asi ) + { + /* + * Tear down previous stack mappings and map current pCPU stack. + * This is safe because not yet running on 'n' page-tables. + */ + destroy_perdomain_mapping(n, PCPU_STACK_VIRT(0), + nr_cpu_ids << STACK_ORDER); + vcpu_set_stack_mappings(n, cpu, true); + } } psr_ctxt_switch_to(nd); diff --git a/xen/arch/x86/include/asm/config.h b/xen/arch/x86/include/asm/config.h index af3ff3cb87..016d6c8b21 100644 --- a/xen/arch/x86/include/asm/config.h +++ b/xen/arch/x86/include/asm/config.h @@ -168,7 +168,7 @@ /* Slot 260: per-domain mappings (including map cache). */ #define PERDOMAIN_VIRT_START (PML4_ADDR(260)) #define PERDOMAIN_SLOT_MBYTES (PML4_ENTRY_BYTES >> (20 + PAGETABLE_ORDER)) -#define PERDOMAIN_SLOTS 3 +#define PERDOMAIN_SLOTS 4 #define PERDOMAIN_VIRT_SLOT(s) (PERDOMAIN_VIRT_START + (s) * \ (PERDOMAIN_SLOT_MBYTES << 20)) /* Slot 4: mirror of per-domain mappings (for compat xlat area accesses). */ @@ -288,6 +288,14 @@ extern unsigned long xen_phys_start; #define ARG_XLAT_START(v) \ (ARG_XLAT_VIRT_START + ((v)->vcpu_id << ARG_XLAT_VA_SHIFT)) +/* Per-CPU stacks area when using ASI. 
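+ *
+ * Each CPU gets a dedicated (1 << STACK_ORDER)-page window inside this slot;
+ * PCPU_STACK_VIRT(cpu) below is the base linear address of that window for
+ * the given CPU.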
*/ +#define PCPU_STACK_SLOT 3 +#define PCPU_STACK_VIRT_START PERDOMAIN_VIRT_SLOT(PCPU_STACK_SLOT) +#define PCPU_STACK_VIRT_END (PCPU_STACK_VIRT_START + \ + (PERDOMAIN_SLOT_MBYTES << 20)) +#define PCPU_STACK_VIRT(cpu) (PCPU_STACK_VIRT_START + \ + (cpu << STACK_ORDER) * PAGE_SIZE) + #define ELFSIZE 64 #define ARCH_CRASH_SAVE_VMCOREINFO diff --git a/xen/arch/x86/include/asm/current.h b/xen/arch/x86/include/asm/current.h index bcec328c98..4a9776f87a 100644 --- a/xen/arch/x86/include/asm/current.h +++ b/xen/arch/x86/include/asm/current.h @@ -24,6 +24,11 @@ * 0 - IST Shadow Stacks (4x 1k, read-only) */ +static inline bool is_shstk_slot(unsigned int i) +{ + return (i == 0 || i == PRIMARY_SHSTK_SLOT); +} + /* * Identify which stack page the stack pointer is on. Returns an index * as per the comment above. diff --git a/xen/arch/x86/include/asm/mm.h b/xen/arch/x86/include/asm/mm.h index f79d1594fd..77f31685fd 100644 --- a/xen/arch/x86/include/asm/mm.h +++ b/xen/arch/x86/include/asm/mm.h @@ -519,7 +519,7 @@ extern struct rangeset *mmio_ro_ranges; #define compat_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20)) #define compat_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20)) -void memguard_guard_stack(void *p); +void memguard_guard_stack(void *p, unsigned int cpu); void memguard_unguard_stack(void *p); /* diff --git a/xen/arch/x86/include/asm/smp.h b/xen/arch/x86/include/asm/smp.h index c8c7960134..e1d3d368e1 100644 --- a/xen/arch/x86/include/asm/smp.h +++ b/xen/arch/x86/include/asm/smp.h @@ -79,6 +79,20 @@ extern bool unaccounted_cpus; void *cpu_alloc_stack(unsigned int cpu); +/* + * Setup the per-CPU area stack mappings. + * + * @v: vCPU where the mappings are to appear. + * @stack_cpu: CPU whose stacks should be mapped. + * @map_shstk: create mappings for shadow stack regions. + */ +void vcpu_set_stack_mappings(const struct vcpu *v, unsigned int stack_cpu, + bool map_shstk); + +#define HAS_ARCH_SMP_CALLFUNC +void arch_smp_pre_callfunc(unsigned int cpu); +void arch_smp_post_callfunc(unsigned int cpu); + #endif /* !__ASSEMBLY__ */ #endif diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c index 864466c0fe..a7c59ad62b 100644 --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -87,6 +87,7 @@ * doing the final put_page(), and remove it from the iommu if so. 
*/ +#include #include #include #include @@ -6424,8 +6425,10 @@ int create_perdomain_mapping(struct vcpu *v, unsigned long va, return rc; } -void populate_perdomain_mapping(const struct vcpu *v, unsigned long va, - mfn_t *mfn, unsigned long nr) +static void populate_perdomain_mapping_flags(const struct vcpu *v, + unsigned long va, mfn_t *mfn, + unsigned long nr, + unsigned int flags) { l1_pgentry_t *l1tab = NULL, *pl1e; const l3_pgentry_t *l3tab; @@ -6454,7 +6457,7 @@ void populate_perdomain_mapping(const struct vcpu *v, unsigned long va, ASSERT_UNREACHABLE(); free_domheap_page(l1e_get_page(*pl1e)); } - l1e_write(pl1e, l1e_from_mfn(mfn[i], __PAGE_HYPERVISOR_RW)); + l1e_write(pl1e, l1e_from_mfn(mfn[i], flags)); } return; @@ -6505,7 +6508,7 @@ void populate_perdomain_mapping(const struct vcpu *v, unsigned long va, free_domheap_page(l1e_get_page(*pl1e)); } - l1e_write(pl1e, l1e_from_mfn(*mfn, __PAGE_HYPERVISOR_RW)); + l1e_write(pl1e, l1e_from_mfn(*mfn, flags)); } unmap_domain_page(l1tab); @@ -6513,6 +6516,31 @@ void populate_perdomain_mapping(const struct vcpu *v, unsigned long va, unmap_domain_page(l3tab); } +void populate_perdomain_mapping(const struct vcpu *v, unsigned long va, + mfn_t *mfn, unsigned long nr) +{ + populate_perdomain_mapping_flags(v, va, mfn, nr, __PAGE_HYPERVISOR_RW); +} + +void vcpu_set_stack_mappings(const struct vcpu *v, unsigned int stack_cpu, + bool map_shstk) +{ + unsigned int i; + + for ( i = 0; i < (1U << STACK_ORDER); i++ ) + { + unsigned int flags = is_shstk_slot(i) ? __PAGE_HYPERVISOR_SHSTK + : __PAGE_HYPERVISOR_RW; + mfn_t mfn = virt_to_mfn(stack_base[stack_cpu] + i * PAGE_SIZE); + + if ( is_shstk_slot(i) && !map_shstk ) + continue; + + populate_perdomain_mapping_flags(v, + PCPU_STACK_VIRT(stack_cpu) + i * PAGE_SIZE, &mfn, 1, flags); + } +} + void destroy_perdomain_mapping(const struct vcpu *v, unsigned long va, unsigned int nr) { @@ -6597,7 +6625,12 @@ void free_perdomain_mappings(struct vcpu *v) l3tab = __map_domain_page(d->arch.asi ? v->arch.pervcpu_l3_pg : d->arch.perdomain_l3_pg); - for ( i = 0; i < PERDOMAIN_SLOTS; ++i) + for ( i = 0; i < PERDOMAIN_SLOTS; ++i ) + { + if ( i == PCPU_STACK_SLOT && !d->arch.asi ) + /* Without ASI the stack L3e is shared with the idle page-tables. */ + continue; + if ( l3e_get_flags(l3tab[i]) & _PAGE_PRESENT ) { struct page_info *l2pg = l3e_get_page(l3tab[i]); @@ -6627,6 +6660,7 @@ void free_perdomain_mappings(struct vcpu *v) unmap_domain_page(l2tab); free_domheap_page(l2pg); } + } unmap_domain_page(l3tab); free_domheap_page(d->arch.asi ? v->arch.pervcpu_l3_pg @@ -6635,31 +6669,39 @@ void free_perdomain_mappings(struct vcpu *v) v->arch.pervcpu_l3_pg = NULL; } -static void write_sss_token(unsigned long *ptr) +static void write_sss_token(unsigned long *ptr, unsigned long va) { /* * A supervisor shadow stack token is its own linear address, with the * busy bit (0) clear. */ - *ptr = (unsigned long)ptr; + *ptr = va; } -void memguard_guard_stack(void *p) +void memguard_guard_stack(void *p, unsigned int cpu) { + unsigned long va = (opt_asi_hvm || opt_asi_pv) ? PCPU_STACK_VIRT(cpu) + : (unsigned long)p; + /* IST Shadow stacks. 4x 1k in stack page 0. 
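 * With per-CPU mapped stacks the tokens must hold the linear address the
 * stack is accessed through (va above, i.e. PCPU_STACK_VIRT(cpu)), not the
 * xenheap address of the backing pages, hence write_sss_token() now takes
 * the target linear address explicitly.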
*/ if ( IS_ENABLED(CONFIG_XEN_SHSTK) ) { - write_sss_token(p + (IST_MCE * IST_SHSTK_SIZE) - 8); - write_sss_token(p + (IST_NMI * IST_SHSTK_SIZE) - 8); - write_sss_token(p + (IST_DB * IST_SHSTK_SIZE) - 8); - write_sss_token(p + (IST_DF * IST_SHSTK_SIZE) - 8); + write_sss_token(p + (IST_MCE * IST_SHSTK_SIZE) - 8, + va + (IST_MCE * IST_SHSTK_SIZE) - 8); + write_sss_token(p + (IST_NMI * IST_SHSTK_SIZE) - 8, + va + (IST_NMI * IST_SHSTK_SIZE) - 8); + write_sss_token(p + (IST_DB * IST_SHSTK_SIZE) - 8, + va + (IST_DB * IST_SHSTK_SIZE) - 8); + write_sss_token(p + (IST_DF * IST_SHSTK_SIZE) - 8, + va + (IST_DF * IST_SHSTK_SIZE) - 8); } map_pages_to_xen((unsigned long)p, virt_to_mfn(p), 1, PAGE_HYPERVISOR_SHSTK); /* Primary Shadow Stack. 1x 4k in stack page 5. */ p += PRIMARY_SHSTK_SLOT * PAGE_SIZE; + va += PRIMARY_SHSTK_SLOT * PAGE_SIZE; if ( IS_ENABLED(CONFIG_XEN_SHSTK) ) - write_sss_token(p + PAGE_SIZE - 8); + write_sss_token(p + PAGE_SIZE - 8, va + PAGE_SIZE - 8); map_pages_to_xen((unsigned long)p, virt_to_mfn(p), 1, PAGE_HYPERVISOR_SHSTK); } diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c index 0a748e2c14..d489dc7a93 100644 --- a/xen/arch/x86/setup.c +++ b/xen/arch/x86/setup.c @@ -390,6 +390,11 @@ static void __init init_idle_domain(void) scheduler_init(); set_current(idle_vcpu[0]); this_cpu(curr_vcpu) = current; + if ( opt_asi_hvm || opt_asi_pv ) + /* Set per-domain slot in the idle page-tables to access stack mappings. */ + l4e_write(&idle_pg_table[l4_table_offset(PERDOMAIN_VIRT_START)], + l4e_from_page(idle_vcpu[0]->domain->arch.perdomain_l3_pg, + __PAGE_HYPERVISOR_RW)); } void srat_detect_node(int cpu) @@ -884,8 +889,6 @@ static void __init noreturn reinit_bsp_stack(void) /* Update SYSCALL trampolines */ percpu_traps_init(); - stack_base[0] = stack; - rc = setup_cpu_root_pgt(0); if ( rc ) panic("Error %d setting up PV root page table\n", rc); @@ -1856,10 +1859,6 @@ void asmlinkage __init noreturn __start_xen(void) system_state = SYS_STATE_boot; - bsp_stack = cpu_alloc_stack(0); - if ( !bsp_stack ) - panic("No memory for BSP stack\n"); - console_init_ring(); vesa_init(); @@ -2042,6 +2041,16 @@ void asmlinkage __init noreturn __start_xen(void) alternative_branches(); + /* + * Alloc the BSP stack closer to the point where the AP ones also get + * allocated - and after the speculation mitigations have been initialized. + * In order to set up the shadow stack token correctly Xen needs to know + * whether per-CPU mapped stacks are being used. + */ + bsp_stack = cpu_alloc_stack(0); + if ( !bsp_stack ) + panic("No memory for BSP stack\n"); + /* * NB: when running as a PV shim VCPUOP_up/down is wired to the shim * physical cpu_add/remove functions, so launch the guest with only @@ -2147,8 +2156,17 @@ void asmlinkage __init noreturn __start_xen(void) info->last_spec_ctrl = default_xen_spec_ctrl; } + stack_base[0] = bsp_stack; + /* Copy the cpu info block, and move onto the BSP stack. 
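 * With ASI enabled the per-CPU stack mappings are established first and the
 * switch is made onto the PCPU_STACK_VIRT(0) alias rather than onto the
 * xenheap address of the BSP stack.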
*/ - bsp_info = get_cpu_info_from_stack((unsigned long)bsp_stack); + if ( opt_asi_hvm || opt_asi_pv ) + { + vcpu_set_stack_mappings(idle_vcpu[0], 0, true); + bsp_info = get_cpu_info_from_stack(PCPU_STACK_VIRT(0)); + } + else + bsp_info = get_cpu_info_from_stack((unsigned long)bsp_stack); + *bsp_info = *info; asm volatile ("mov %[stk], %%rsp; jmp %c[fn]" :: diff --git a/xen/arch/x86/smp.c b/xen/arch/x86/smp.c index 02a6ed7593..ce3deefb20 100644 --- a/xen/arch/x86/smp.c +++ b/xen/arch/x86/smp.c @@ -9,6 +9,7 @@ */ #include +#include #include #include #include @@ -27,6 +28,8 @@ #include #include +#include + /* Helper functions to prepare APIC register values. */ static unsigned int prepare_ICR(unsigned int shortcut, int vector) { @@ -435,3 +438,39 @@ long cf_check cpu_down_helper(void *data) ret = cpu_down(cpu); return ret; } + +void arch_smp_pre_callfunc(unsigned int cpu) +{ + if ( !opt_asi_hvm && !opt_asi_pv ) + /* + * Avoid the unconditional sync_local_execstate() call below if ASI is + * not enabled for any domain. + */ + return; + + /* + * Sync execution state, so that the page-tables cannot change while + * creating or destroying the stack mappings. + */ + sync_local_execstate(); + if ( cpu == smp_processor_id() || !current->domain->arch.asi || + /* EFI page-tables have all pCPU stacks mapped. */ + efi_rs_using_pgtables() ) + return; + + vcpu_set_stack_mappings(current, cpu, false); +} + +void arch_smp_post_callfunc(unsigned int cpu) +{ + if ( cpu == smp_processor_id() || !current->domain->arch.asi || + /* EFI page-tables have all pCPU stacks mapped. */ + efi_rs_using_pgtables() ) + return; + + ASSERT(current == this_cpu(curr_vcpu)); + destroy_perdomain_mapping(current, PCPU_STACK_VIRT(cpu), + (1U << STACK_ORDER)); + + flush_area_local((void *)PCPU_STACK_VIRT(cpu), FLUSH_ORDER(STACK_ORDER)); +} diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c index a740a64022..72e9f2147a 100644 --- a/xen/arch/x86/smpboot.c +++ b/xen/arch/x86/smpboot.c @@ -582,7 +582,21 @@ static int do_boot_cpu(int apicid, int cpu) printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip); - stack_start = stack_base[cpu] + STACK_SIZE - sizeof(struct cpu_info); + if ( opt_asi_hvm || opt_asi_pv ) + { + /* + * Uniformly run with the stack mappings in the per-domain area if ASI + * is enabled for any domain type. + */ + vcpu_set_stack_mappings(idle_vcpu[cpu], cpu, true); + + ASSERT(IS_ALIGNED(PCPU_STACK_VIRT(cpu), STACK_SIZE)); + + stack_start = (void *)PCPU_STACK_VIRT(cpu) + STACK_SIZE - + sizeof(struct cpu_info); + } + else + stack_start = stack_base[cpu] + STACK_SIZE - sizeof(struct cpu_info); /* This grunge runs the startup process for the targeted processor. 
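 * stack_start set above is the initial stack pointer the AP will load, so
 * with ASI it already points at the per-CPU mapped alias populated by
 * vcpu_set_stack_mappings().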
*/ @@ -1030,7 +1044,7 @@ void *cpu_alloc_stack(unsigned int cpu) stack = alloc_xenheap_pages(STACK_ORDER, memflags); if ( stack ) - memguard_guard_stack(stack); + memguard_guard_stack(stack, cpu); return stack; } @@ -1146,6 +1160,8 @@ static struct notifier_block cpu_smpboot_nfb = { void __init smp_prepare_cpus(void) { + BUILD_BUG_ON(PCPU_STACK_VIRT(CONFIG_NR_CPUS) > PCPU_STACK_VIRT_END); + register_cpu_notifier(&cpu_smpboot_nfb); mtrr_aps_sync_begin(); diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c index 6421abc3ca..8231874fe2 100644 --- a/xen/arch/x86/traps.c +++ b/xen/arch/x86/traps.c @@ -609,10 +609,12 @@ void show_stack_overflow(unsigned int cpu, const struct cpu_user_regs *regs) unsigned long esp = regs->rsp; unsigned long curr_stack_base = esp & ~(STACK_SIZE - 1); unsigned long esp_top, esp_bottom; + const void *stack = current->domain->arch.asi ? (void *)PCPU_STACK_VIRT(cpu) + : stack_base[cpu]; - if ( _p(curr_stack_base) != stack_base[cpu] ) + if ( _p(curr_stack_base) != stack ) printk("Current stack base %p differs from expected %p\n", - _p(curr_stack_base), stack_base[cpu]); + _p(curr_stack_base), stack); esp_bottom = (esp | (STACK_SIZE - 1)) + 1; esp_top = esp_bottom - PRIMARY_STACK_SIZE; diff --git a/xen/common/smp.c b/xen/common/smp.c index a011f541f1..04f5aede0d 100644 --- a/xen/common/smp.c +++ b/xen/common/smp.c @@ -29,6 +29,7 @@ static struct call_data_struct { void (*func) (void *info); void *info; int wait; + unsigned int caller; cpumask_t selected; } call_data; @@ -63,6 +64,7 @@ void on_selected_cpus( call_data.func = func; call_data.info = info; call_data.wait = wait; + call_data.caller = smp_processor_id(); smp_send_call_function_mask(&call_data.selected); @@ -82,6 +84,12 @@ void smp_call_function_interrupt(void) if ( !cpumask_test_cpu(cpu, &call_data.selected) ) return; + /* + * TODO: use bounce buffers to pass callfunc data, so that when using ASI + * there's no need to map remote CPU stacks. + */ + arch_smp_pre_callfunc(call_data.caller); + irq_enter(); if ( unlikely(!func) ) @@ -102,6 +110,8 @@ void smp_call_function_interrupt(void) } irq_exit(); + + arch_smp_post_callfunc(call_data.caller); } /* diff --git a/xen/common/stop_machine.c b/xen/common/stop_machine.c index 398cfd507c..142059c363 100644 --- a/xen/common/stop_machine.c +++ b/xen/common/stop_machine.c @@ -40,6 +40,7 @@ enum stopmachine_state { struct stopmachine_data { unsigned int nr_cpus; + unsigned int caller; enum stopmachine_state state; atomic_t done; @@ -104,6 +105,7 @@ int stop_machine_run(int (*fn)(void *data), void *data, unsigned int cpu) stopmachine_data.fn_result = 0; atomic_set(&stopmachine_data.done, 0); stopmachine_data.state = STOPMACHINE_START; + stopmachine_data.caller = this; smp_wmb(); @@ -148,6 +150,12 @@ static void cf_check stopmachine_action(void *data) BUG_ON(cpu != smp_processor_id()); + /* + * TODO: use bounce buffers to pass callfunc data, so that when using ASI + * there's no need to map remote CPU stacks. 
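+ *
+ * stopmachine_data.caller records the CPU that invoked stop_machine_run(),
+ * so each participating CPU can map (and afterwards tear down) the caller's
+ * stack around the callback.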
+ */ + arch_smp_pre_callfunc(stopmachine_data.caller); + smp_mb(); while ( state != STOPMACHINE_EXIT ) @@ -180,6 +188,8 @@ static void cf_check stopmachine_action(void *data) } local_irq_enable(); + + arch_smp_post_callfunc(stopmachine_data.caller); } static int cf_check cpu_callback( diff --git a/xen/include/xen/smp.h b/xen/include/xen/smp.h index 2ca9ff1bfc..610c279ca2 100644 --- a/xen/include/xen/smp.h +++ b/xen/include/xen/smp.h @@ -76,4 +76,9 @@ extern void *stack_base[NR_CPUS]; void initialize_cpu_data(unsigned int cpu); int setup_cpu_root_pgt(unsigned int cpu); +#ifndef HAS_ARCH_SMP_CALLFUNC +static inline void arch_smp_pre_callfunc(unsigned int cpu) {} +static inline void arch_smp_post_callfunc(unsigned int cpu) {} +#endif + #endif /* __XEN_SMP_H__ */
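---
A minimal, standalone sketch of the address arithmetic behind
PCPU_STACK_VIRT(). It assumes the usual x86 values (4 KiB pages, STACK_ORDER
of 3, a 1 GiB per-domain slot as implied by PERDOMAIN_SLOT_MBYTES); the slot
base and NR_CPUS below are hard-coded stand-ins for
PERDOMAIN_VIRT_SLOT(PCPU_STACK_SLOT) and CONFIG_NR_CPUS. It only mirrors the
BUILD_BUG_ON() added to smp_prepare_cpus() and the IS_ALIGNED() assertion
added to do_boot_cpu(), not the actual mapping code.

#include <assert.h>
#include <stdio.h>

/* Assumed x86 values: 4 KiB pages, 8-page (32 KiB) stacks. */
#define PAGE_SIZE               4096UL
#define STACK_ORDER             3
#define STACK_SIZE              (PAGE_SIZE << STACK_ORDER)

/* Hard-coded stand-in for PERDOMAIN_VIRT_SLOT(PCPU_STACK_SLOT). */
#define PCPU_STACK_VIRT_START   0xffff8200c0000000UL
#define PERDOMAIN_SLOT_MBYTES   1024UL      /* 1 GiB per-domain slot */
#define PCPU_STACK_VIRT_END     (PCPU_STACK_VIRT_START + \
                                 (PERDOMAIN_SLOT_MBYTES << 20))

/* Same arithmetic as the patch: one STACK_SIZE window per CPU. */
#define PCPU_STACK_VIRT(cpu)    (PCPU_STACK_VIRT_START + \
                                 ((unsigned long)(cpu) << STACK_ORDER) * PAGE_SIZE)

#define NR_CPUS                 256         /* stand-in for CONFIG_NR_CPUS */

int main(void)
{
    unsigned int cpu;

    /* Mirrors the BUILD_BUG_ON() added to smp_prepare_cpus(). */
    assert(PCPU_STACK_VIRT(NR_CPUS) <= PCPU_STACK_VIRT_END);

    for ( cpu = 0; cpu < 4; cpu++ )
    {
        unsigned long base = PCPU_STACK_VIRT(cpu);

        /* Mirrors the IS_ALIGNED() assertion added to do_boot_cpu(). */
        assert(!(base & (STACK_SIZE - 1)));
        printf("CPU%u stack window: %#lx-%#lx\n",
               cpu, base, base + STACK_SIZE - 1);
    }

    return 0;
}

At runtime vcpu_set_stack_mappings() then populates each page of the selected
CPU's window, using shadow-stack permissions for the pages identified by
is_shstk_slot().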