if ( rc )
return rc;
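+ /*
+ * Prepare the per-CPU stack mappings in the per-domain area: the idle
+ * domain and domains running with ASI get their own stack area set up,
+ * while other domains reuse the stack L3 entry of the idle page-tables.
+ */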
+ if ( opt_asi_hvm || opt_asi_pv )
+ {
+ if ( is_idle_vcpu(v) || d->arch.asi )
+ create_perdomain_mapping(v, PCPU_STACK_VIRT(0),
+ nr_cpu_ids << STACK_ORDER, false);
+ else if ( !v->vcpu_id )
+ {
+ l3_pgentry_t *idle_perdomain =
+ __map_domain_page(idle_vcpu[0]->domain->arch.perdomain_l3_pg);
+ l3_pgentry_t *guest_perdomain =
+ __map_domain_page(d->arch.perdomain_l3_pg);
+
+ l3e_write(&guest_perdomain[PCPU_STACK_SLOT],
+ idle_perdomain[PCPU_STACK_SLOT]);
+
+ unmap_domain_page(guest_perdomain);
+ unmap_domain_page(idle_perdomain);
+ }
+ }
+
rc = mapcache_vcpu_init(v);
if ( rc )
return rc;
}
vcpu_restore_fpu_nonlazy(n, false);
nd->arch.ctxt_switch->to(n);
+ if ( nd->arch.asi )
+ {
+ /*
+ * Tear down any previous stack mappings and map the stack of the
+ * current pCPU. This is safe because we are not yet running on the
+ * page-tables of 'n'.
+ */
+ destroy_perdomain_mapping(n, PCPU_STACK_VIRT(0),
+ nr_cpu_ids << STACK_ORDER);
+ vcpu_set_stack_mappings(n, cpu, true);
+ }
}
psr_ctxt_switch_to(nd);
/* Slot 260: per-domain mappings (including map cache). */
#define PERDOMAIN_VIRT_START (PML4_ADDR(260))
#define PERDOMAIN_SLOT_MBYTES (PML4_ENTRY_BYTES >> (20 + PAGETABLE_ORDER))
-#define PERDOMAIN_SLOTS 3
+#define PERDOMAIN_SLOTS 4
#define PERDOMAIN_VIRT_SLOT(s) (PERDOMAIN_VIRT_START + (s) * \
(PERDOMAIN_SLOT_MBYTES << 20))
/* Slot 4: mirror of per-domain mappings (for compat xlat area accesses). */
#define ARG_XLAT_START(v) \
(ARG_XLAT_VIRT_START + ((v)->vcpu_id << ARG_XLAT_VA_SHIFT))
+/* Per-CPU stacks area when using ASI. */
+#define PCPU_STACK_SLOT 3
+#define PCPU_STACK_VIRT_START PERDOMAIN_VIRT_SLOT(PCPU_STACK_SLOT)
+#define PCPU_STACK_VIRT_END (PCPU_STACK_VIRT_START + \
+ (PERDOMAIN_SLOT_MBYTES << 20))
+#define PCPU_STACK_VIRT(cpu) (PCPU_STACK_VIRT_START + \
+ ((cpu) << STACK_ORDER) * PAGE_SIZE)
+
#define ELFSIZE 64
#define ARCH_CRASH_SAVE_VMCOREINFO
* 0 - IST Shadow Stacks (4x 1k, read-only)
*/
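+/*
+ * Return whether stack page @i holds shadow stacks: the IST shadow stacks
+ * live in page 0 and the primary shadow stack in page PRIMARY_SHSTK_SLOT.
+ */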
+static inline bool is_shstk_slot(unsigned int i)
+{
+ return (i == 0 || i == PRIMARY_SHSTK_SLOT);
+}
+
/*
* Identify which stack page the stack pointer is on. Returns an index
* as per the comment above.
#define compat_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
#define compat_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
-void memguard_guard_stack(void *p);
+void memguard_guard_stack(void *p, unsigned int cpu);
void memguard_unguard_stack(void *p);
/*
void *cpu_alloc_stack(unsigned int cpu);
+/*
+ * Setup the per-CPU area stack mappings.
+ *
+ * @v: vCPU where the mappings are to appear.
+ * @stack_cpu: CPU whose stacks should be mapped.
+ * @map_shstk: create mappings for shadow stack regions.
+ */
+void vcpu_set_stack_mappings(const struct vcpu *v, unsigned int stack_cpu,
+ bool map_shstk);
+
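+/*
+ * Hooks to map and unmap the stack of the CPU originating an SMP function
+ * call, so that its on-stack data can be accessed when ASI is in use.
+ */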
+#define HAS_ARCH_SMP_CALLFUNC
+void arch_smp_pre_callfunc(unsigned int cpu);
+void arch_smp_post_callfunc(unsigned int cpu);
+
#endif /* !__ASSEMBLY__ */
#endif
* doing the final put_page(), and remove it from the iommu if so.
*/
+#include <xen/cpu.h>
#include <xen/init.h>
#include <xen/ioreq.h>
#include <xen/kernel.h>
return rc;
}
-void populate_perdomain_mapping(const struct vcpu *v, unsigned long va,
- mfn_t *mfn, unsigned long nr)
+static void populate_perdomain_mapping_flags(const struct vcpu *v,
+ unsigned long va, mfn_t *mfn,
+ unsigned long nr,
+ unsigned int flags)
{
l1_pgentry_t *l1tab = NULL, *pl1e;
const l3_pgentry_t *l3tab;
ASSERT_UNREACHABLE();
free_domheap_page(l1e_get_page(*pl1e));
}
- l1e_write(pl1e, l1e_from_mfn(mfn[i], __PAGE_HYPERVISOR_RW));
+ l1e_write(pl1e, l1e_from_mfn(mfn[i], flags));
}
return;
free_domheap_page(l1e_get_page(*pl1e));
}
- l1e_write(pl1e, l1e_from_mfn(*mfn, __PAGE_HYPERVISOR_RW));
+ l1e_write(pl1e, l1e_from_mfn(*mfn, flags));
}
unmap_domain_page(l1tab);
unmap_domain_page(l3tab);
}
+void populate_perdomain_mapping(const struct vcpu *v, unsigned long va,
+ mfn_t *mfn, unsigned long nr)
+{
+ populate_perdomain_mapping_flags(v, va, mfn, nr, __PAGE_HYPERVISOR_RW);
+}
+
+void vcpu_set_stack_mappings(const struct vcpu *v, unsigned int stack_cpu,
+ bool map_shstk)
+{
+ unsigned int i;
+
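+ /*
+ * Map each page of the stack of @stack_cpu into the per-domain area of
+ * @v, using shadow stack permissions for the shadow stack pages.
+ */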
+ for ( i = 0; i < (1U << STACK_ORDER); i++ )
+ {
+ unsigned int flags = is_shstk_slot(i) ? __PAGE_HYPERVISOR_SHSTK
+ : __PAGE_HYPERVISOR_RW;
+ mfn_t mfn = virt_to_mfn(stack_base[stack_cpu] + i * PAGE_SIZE);
+
+ if ( is_shstk_slot(i) && !map_shstk )
+ continue;
+
+ populate_perdomain_mapping_flags(v,
+ PCPU_STACK_VIRT(stack_cpu) + i * PAGE_SIZE, &mfn, 1, flags);
+ }
+}
+
void destroy_perdomain_mapping(const struct vcpu *v, unsigned long va,
unsigned int nr)
{
l3tab = __map_domain_page(d->arch.asi ? v->arch.pervcpu_l3_pg
: d->arch.perdomain_l3_pg);
- for ( i = 0; i < PERDOMAIN_SLOTS; ++i)
+ for ( i = 0; i < PERDOMAIN_SLOTS; ++i )
+ {
+ if ( i == PCPU_STACK_SLOT && !d->arch.asi )
+ /* Without ASI the stack L3e is shared with the idle page-tables. */
+ continue;
+
if ( l3e_get_flags(l3tab[i]) & _PAGE_PRESENT )
{
struct page_info *l2pg = l3e_get_page(l3tab[i]);
unmap_domain_page(l2tab);
free_domheap_page(l2pg);
}
+ }
unmap_domain_page(l3tab);
free_domheap_page(d->arch.asi ? v->arch.pervcpu_l3_pg
v->arch.pervcpu_l3_pg = NULL;
}
-static void write_sss_token(unsigned long *ptr)
+static void write_sss_token(unsigned long *ptr, unsigned long va)
{
/*
* A supervisor shadow stack token is its own linear address, with the
* busy bit (0) clear.
*/
- *ptr = (unsigned long)ptr;
+ *ptr = va;
}
-void memguard_guard_stack(void *p)
+void memguard_guard_stack(void *p, unsigned int cpu)
{
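+ /*
+ * Shadow stack tokens must encode the linear address the stack will be
+ * accessed through, which is the per-CPU stack area when ASI is enabled
+ * for any domain type.
+ */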
+ unsigned long va = (opt_asi_hvm || opt_asi_pv) ? PCPU_STACK_VIRT(cpu)
+ : (unsigned long)p;
+
/* IST Shadow stacks. 4x 1k in stack page 0. */
if ( IS_ENABLED(CONFIG_XEN_SHSTK) )
{
- write_sss_token(p + (IST_MCE * IST_SHSTK_SIZE) - 8);
- write_sss_token(p + (IST_NMI * IST_SHSTK_SIZE) - 8);
- write_sss_token(p + (IST_DB * IST_SHSTK_SIZE) - 8);
- write_sss_token(p + (IST_DF * IST_SHSTK_SIZE) - 8);
+ write_sss_token(p + (IST_MCE * IST_SHSTK_SIZE) - 8,
+ va + (IST_MCE * IST_SHSTK_SIZE) - 8);
+ write_sss_token(p + (IST_NMI * IST_SHSTK_SIZE) - 8,
+ va + (IST_NMI * IST_SHSTK_SIZE) - 8);
+ write_sss_token(p + (IST_DB * IST_SHSTK_SIZE) - 8,
+ va + (IST_DB * IST_SHSTK_SIZE) - 8);
+ write_sss_token(p + (IST_DF * IST_SHSTK_SIZE) - 8,
+ va + (IST_DF * IST_SHSTK_SIZE) - 8);
}
map_pages_to_xen((unsigned long)p, virt_to_mfn(p), 1, PAGE_HYPERVISOR_SHSTK);
/* Primary Shadow Stack. 1x 4k in stack page 5. */
p += PRIMARY_SHSTK_SLOT * PAGE_SIZE;
+ va += PRIMARY_SHSTK_SLOT * PAGE_SIZE;
if ( IS_ENABLED(CONFIG_XEN_SHSTK) )
- write_sss_token(p + PAGE_SIZE - 8);
+ write_sss_token(p + PAGE_SIZE - 8, va + PAGE_SIZE - 8);
map_pages_to_xen((unsigned long)p, virt_to_mfn(p), 1, PAGE_HYPERVISOR_SHSTK);
}
scheduler_init();
set_current(idle_vcpu[0]);
this_cpu(curr_vcpu) = current;
+ if ( opt_asi_hvm || opt_asi_pv )
+ /*
+ * Set the per-domain slot in the idle page-tables so that the per-CPU
+ * stack mappings can be accessed.
+ */
+ l4e_write(&idle_pg_table[l4_table_offset(PERDOMAIN_VIRT_START)],
+ l4e_from_page(idle_vcpu[0]->domain->arch.perdomain_l3_pg,
+ __PAGE_HYPERVISOR_RW));
}
void srat_detect_node(int cpu)
/* Update SYSCALL trampolines */
percpu_traps_init();
- stack_base[0] = stack;
-
rc = setup_cpu_root_pgt(0);
if ( rc )
panic("Error %d setting up PV root page table\n", rc);
system_state = SYS_STATE_boot;
- bsp_stack = cpu_alloc_stack(0);
- if ( !bsp_stack )
- panic("No memory for BSP stack\n");
-
console_init_ring();
vesa_init();
alternative_branches();
+ /*
+ * Allocate the BSP stack closer to the point where the AP stacks also
+ * get allocated, and after the speculation mitigations have been
+ * initialized: to set up the shadow stack tokens correctly Xen needs
+ * to know whether per-CPU mapped stacks are in use.
+ */
+ bsp_stack = cpu_alloc_stack(0);
+ if ( !bsp_stack )
+ panic("No memory for BSP stack\n");
+
/*
* NB: when running as a PV shim VCPUOP_up/down is wired to the shim
* physical cpu_add/remove functions, so launch the guest with only
info->last_spec_ctrl = default_xen_spec_ctrl;
}
+ stack_base[0] = bsp_stack;
+
/* Copy the cpu info block, and move onto the BSP stack. */
- bsp_info = get_cpu_info_from_stack((unsigned long)bsp_stack);
+ if ( opt_asi_hvm || opt_asi_pv )
+ {
+ vcpu_set_stack_mappings(idle_vcpu[0], 0, true);
+ bsp_info = get_cpu_info_from_stack(PCPU_STACK_VIRT(0));
+ }
+ else
+ bsp_info = get_cpu_info_from_stack((unsigned long)bsp_stack);
+
*bsp_info = *info;
asm volatile ("mov %[stk], %%rsp; jmp %c[fn]" ::
*/
#include <xen/cpu.h>
+#include <xen/efi.h>
#include <xen/irq.h>
#include <xen/sched.h>
#include <xen/delay.h>
#include <asm/hpet.h>
#include <asm/setup.h>
+#include <asm/spec_ctrl.h>
+
/* Helper functions to prepare APIC register values. */
static unsigned int prepare_ICR(unsigned int shortcut, int vector)
{
ret = cpu_down(cpu);
return ret;
}
+
+void arch_smp_pre_callfunc(unsigned int cpu)
+{
+ if ( !opt_asi_hvm && !opt_asi_pv )
+ /*
+ * Avoid the unconditional sync_local_execstate() call below if ASI is
+ * not enabled for any domain.
+ */
+ return;
+
+ /*
+ * Sync execution state, so that the page-tables cannot change while
+ * creating or destroying the stack mappings.
+ */
+ sync_local_execstate();
+ if ( cpu == smp_processor_id() || !current->domain->arch.asi ||
+ /* EFI page-tables have all pCPU stacks mapped. */
+ efi_rs_using_pgtables() )
+ return;
+
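+ /*
+ * Map the stack of the CPU originating the function call, as the
+ * callfunc data is passed on it.
+ */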
+ vcpu_set_stack_mappings(current, cpu, false);
+}
+
+void arch_smp_post_callfunc(unsigned int cpu)
+{
+ if ( cpu == smp_processor_id() || !current->domain->arch.asi ||
+ /* EFI page-tables have all pCPU stacks mapped. */
+ efi_rs_using_pgtables() )
+ return;
+
+ ASSERT(current == this_cpu(curr_vcpu));
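+ /* Remove the temporary mapping of the caller stack from the current context. */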
+ destroy_perdomain_mapping(current, PCPU_STACK_VIRT(cpu),
+ (1U << STACK_ORDER));
+
+ flush_area_local((void *)PCPU_STACK_VIRT(cpu), FLUSH_ORDER(STACK_ORDER));
+}
printk("Booting processor %d/%d eip %lx\n",
cpu, apicid, start_eip);
- stack_start = stack_base[cpu] + STACK_SIZE - sizeof(struct cpu_info);
+ if ( opt_asi_hvm || opt_asi_pv )
+ {
+ /*
+ * Uniformly run with the stack mappings in the per-domain area if ASI
+ * is enabled for any domain type.
+ */
+ vcpu_set_stack_mappings(idle_vcpu[cpu], cpu, true);
+
+ ASSERT(IS_ALIGNED(PCPU_STACK_VIRT(cpu), STACK_SIZE));
+
+ stack_start = (void *)PCPU_STACK_VIRT(cpu) + STACK_SIZE -
+ sizeof(struct cpu_info);
+ }
+ else
+ stack_start = stack_base[cpu] + STACK_SIZE - sizeof(struct cpu_info);
/* This grunge runs the startup process for the targeted processor. */
stack = alloc_xenheap_pages(STACK_ORDER, memflags);
if ( stack )
- memguard_guard_stack(stack);
+ memguard_guard_stack(stack, cpu);
return stack;
}
void __init smp_prepare_cpus(void)
{
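+ /* The per-CPU stacks of all possible CPUs must fit in the reserved slot. */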
+ BUILD_BUG_ON(PCPU_STACK_VIRT(CONFIG_NR_CPUS) > PCPU_STACK_VIRT_END);
+
register_cpu_notifier(&cpu_smpboot_nfb);
mtrr_aps_sync_begin();
unsigned long esp = regs->rsp;
unsigned long curr_stack_base = esp & ~(STACK_SIZE - 1);
unsigned long esp_top, esp_bottom;
+ const void *stack = current->domain->arch.asi ? (void *)PCPU_STACK_VIRT(cpu)
+ : stack_base[cpu];
- if ( _p(curr_stack_base) != stack_base[cpu] )
+ if ( _p(curr_stack_base) != stack )
printk("Current stack base %p differs from expected %p\n",
- _p(curr_stack_base), stack_base[cpu]);
+ _p(curr_stack_base), stack);
esp_bottom = (esp | (STACK_SIZE - 1)) + 1;
esp_top = esp_bottom - PRIMARY_STACK_SIZE;
void (*func) (void *info);
void *info;
int wait;
+ unsigned int caller;
cpumask_t selected;
} call_data;
call_data.func = func;
call_data.info = info;
call_data.wait = wait;
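+ /* Record the calling CPU so remote CPUs can map its stack when using ASI. */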
+ call_data.caller = smp_processor_id();
smp_send_call_function_mask(&call_data.selected);
if ( !cpumask_test_cpu(cpu, &call_data.selected) )
return;
+ /*
+ * TODO: use bounce buffers to pass callfunc data, so that when using ASI
+ * there's no need to map remote CPU stacks.
+ */
+ arch_smp_pre_callfunc(call_data.caller);
+
irq_enter();
if ( unlikely(!func) )
}
irq_exit();
+
+ arch_smp_post_callfunc(call_data.caller);
}
/*
struct stopmachine_data {
unsigned int nr_cpus;
+ unsigned int caller;
enum stopmachine_state state;
atomic_t done;
stopmachine_data.fn_result = 0;
atomic_set(&stopmachine_data.done, 0);
stopmachine_data.state = STOPMACHINE_START;
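+ /* Record the initiating CPU so its stack can be mapped when using ASI. */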
+ stopmachine_data.caller = this;
smp_wmb();
BUG_ON(cpu != smp_processor_id());
+ /*
+ * TODO: use bounce buffers to pass callfunc data, so that when using ASI
+ * there's no need to map remote CPU stacks.
+ */
+ arch_smp_pre_callfunc(stopmachine_data.caller);
+
smp_mb();
while ( state != STOPMACHINE_EXIT )
}
local_irq_enable();
+
+ arch_smp_post_callfunc(stopmachine_data.caller);
}
static int cf_check cpu_callback(
void initialize_cpu_data(unsigned int cpu);
int setup_cpu_root_pgt(unsigned int cpu);
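+/* Architectures not defining HAS_ARCH_SMP_CALLFUNC need no pre/post work. */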
+#ifndef HAS_ARCH_SMP_CALLFUNC
+static inline void arch_smp_pre_callfunc(unsigned int cpu) {}
+static inline void arch_smp_post_callfunc(unsigned int cpu) {}
+#endif
+
#endif /* __XEN_SMP_H__ */