init_idle_domain();
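+ /* Allocate the BSP's stub page before trap_init() writes the syscall stubs. */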
+ this_cpu(stubs.addr) = alloc_stub_page(smp_processor_id(),
+ &this_cpu(stubs).mfn);
+ BUG_ON(!this_cpu(stubs.addr));
+
trap_init();
rcu_init();
#include <xen/kernel.h>
#include <xen/mm.h>
#include <xen/domain.h>
+#include <xen/domain_page.h>
#include <xen/sched.h>
#include <xen/sched-if.h>
#include <xen/irq.h>
return rc;
}
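+/* Offset of a CPU's stub buffer within the page shared by its group. */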
+#define STUB_BUF_CPU_OFFS(cpu) (((cpu) & (STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE)
+
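+/*
+ * Map (and, for the first CPU of a group, allocate) the stub page for @cpu
+ * at its fixed VA slot just below XEN_VIRT_END.  Returns the VA of the
+ * mapping (0 on failure); the page's MFN is passed back via @mfn.
+ */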
+unsigned long alloc_stub_page(unsigned int cpu, unsigned long *mfn)
+{
+ unsigned long stub_va;
+ struct page_info *pg;
+
+ BUILD_BUG_ON(STUBS_PER_PAGE & (STUBS_PER_PAGE - 1));
+
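+ /* A non-zero incoming *mfn means a CPU sharing the page already allocated it. */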
+ if ( *mfn )
+ pg = mfn_to_page(*mfn);
+ else
+ {
+ nodeid_t node = cpu_to_node(cpu);
+ unsigned int memflags = node != NUMA_NO_NODE ? MEMF_node(node) : 0;
+
+ pg = alloc_domheap_page(NULL, memflags);
+ if ( !pg )
+ return 0;
+
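+ /* Fill the fresh page with int3 (0xcc) so unused stub space traps if executed. */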
+ unmap_domain_page(memset(__map_domain_page(pg), 0xcc, PAGE_SIZE));
+ }
+
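+ /* Every CPU gets its own page-sized VA slot counting down from XEN_VIRT_END. */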
+ stub_va = XEN_VIRT_END - (cpu + 1) * PAGE_SIZE;
+ if ( map_pages_to_xen(stub_va, page_to_mfn(pg), 1,
+ PAGE_HYPERVISOR_RX | MAP_SMALL_PAGES) )
+ {
+ if ( !*mfn )
+ free_domheap_page(pg);
+ stub_va = 0;
+ }
+ else if ( !*mfn )
+ *mfn = page_to_mfn(pg);
+
+ return stub_va;
+}
+
void cpu_exit_clear(unsigned int cpu)
{
cpu_uninit(cpu);
free_cpumask_var(per_cpu(cpu_sibling_mask, cpu));
free_cpumask_var(per_cpu(cpu_core_mask, cpu));
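+ /* Tear down this CPU's stub mapping, freeing the page once no CPU uses it. */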
+ if ( per_cpu(stubs.addr, cpu) )
+ {
+ unsigned long mfn = per_cpu(stubs.mfn, cpu);
+ unsigned char *stub_page = map_domain_page(mfn);
+ unsigned int i;
+
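+ /*
+  * Reset this CPU's slot to int3; the page can only be freed once every
+  * slot reads 0xcc again, i.e. no other CPU still has live stubs in it.
+  */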
+ memset(stub_page + STUB_BUF_CPU_OFFS(cpu), 0xcc, STUB_BUF_SIZE);
+ for ( i = 0; i < STUBS_PER_PAGE; ++i )
+ if ( stub_page[i * STUB_BUF_SIZE] != 0xcc )
+ break;
+ unmap_domain_page(stub_page);
+ destroy_xen_mappings(per_cpu(stubs.addr, cpu) & PAGE_MASK,
+ (per_cpu(stubs.addr, cpu) | ~PAGE_MASK) + 1);
+ if ( i == STUBS_PER_PAGE )
+ free_domheap_page(mfn_to_page(mfn));
+ }
+
order = get_order_from_pages(NR_RESERVED_GDT_PAGES);
free_xenheap_pages(per_cpu(gdt_table, cpu), order);
static int cpu_smpboot_alloc(unsigned int cpu)
{
- unsigned int order, memflags = 0;
+ unsigned int i, order, memflags = 0;
nodeid_t node = cpu_to_node(cpu);
struct desc_struct *gdt;
+ unsigned long stub_page;
if ( node != NUMA_NO_NODE )
memflags = MEMF_node(node);
goto oom;
memcpy(idt_tables[cpu], idt_table, IDT_ENTRIES * sizeof(idt_entry_t));
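+ /*
+  * Reuse the stub page of an already-online CPU in the same STUBS_PER_PAGE
+  * group if it is on the same node; otherwise alloc_stub_page() below
+  * allocates a fresh page.
+  */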
+ for ( stub_page = 0, i = cpu & ~(STUBS_PER_PAGE - 1);
+ i < nr_cpu_ids && i <= (cpu | (STUBS_PER_PAGE - 1)); ++i )
+ if ( cpu_online(i) && cpu_to_node(i) == node )
+ {
+ per_cpu(stubs.mfn, cpu) = per_cpu(stubs.mfn, i);
+ break;
+ }
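+ /* The loop cannot have matched @cpu itself: it is not online yet. */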
+ BUG_ON(i == cpu);
+ stub_page = alloc_stub_page(cpu, &per_cpu(stubs.mfn, cpu));
+ if ( !stub_page )
+ goto oom;
+ per_cpu(stubs.addr, cpu) = stub_page + STUB_BUF_CPU_OFFS(cpu);
+
if ( zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, cpu)) &&
zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) )
return 0;
movb $0,TRAPBOUNCE_flags(%rdx)
jmp compat_test_all_events
-ENTRY(compat_syscall)
+/* See lstar_enter for entry register state. */
+ENTRY(cstar_enter)
+ sti
+ movq 8(%rsp),%rax /* Restore %rax. */
+ movq $FLAT_KERNEL_SS,8(%rsp)
+ pushq %r11
+ pushq $FLAT_USER_CS32
+ pushq %rcx
+ pushq $0
+ SAVE_VOLATILE TRAP_syscall
+ GET_CURRENT(%rbx)
+ movq VCPU_domain(%rbx),%rcx
+ cmpb $0,DOMAIN_is_32bit_pv(%rcx)
+ je switch_to_kernel
cmpb $0,VCPU_syscall32_disables_events(%rbx)
movzwl VCPU_syscall32_sel(%rbx),%esi
movq VCPU_syscall32_addr(%rbx),%rax
#include <public/xen.h>
#include <irq_vectors.h>
- ALIGN
/* %rbx: struct vcpu */
-switch_to_kernel:
+ENTRY(switch_to_kernel)
leaq VCPU_trap_bounce(%rbx),%rdx
/* TB_eip = (32-bit syscall && syscall32_addr) ?
* syscall32_addr : syscall_addr */
* When entering SYSCALL from user mode:
* Vector directly to the registered arch.syscall_addr.
*
- * Initial work is done by per-CPU stack trampolines. At this point %rsp
- * has been initialised to point at the correct Xen stack, and %rsp, %rflags
- * and %cs have been saved. All other registers are still to be saved onto
- * the stack, starting with %rip, and an appropriate %ss must be saved into
- * the space left by the trampoline.
+ * Initial work is done by per-CPU trampoline stubs.  At this point %rsp has
+ * been initialised to point at the correct Xen stack, the guest %rsp has
+ * been saved, and %rax needs to be restored from the %ss save slot.  All
+ * other registers are still to be saved onto the stack, starting with
+ * RFLAGS, and an appropriate %ss must be saved into the space left by the
+ * trampoline.
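+ * On entry here, (%rsp) holds the saved guest %rsp and 8(%rsp) holds the
+ * guest %rax, in the slot that subsequently receives %ss.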
*/
-ENTRY(syscall_enter)
+ENTRY(lstar_enter)
sti
- movl $FLAT_KERNEL_SS,24(%rsp)
+ movq 8(%rsp),%rax /* Restore %rax. */
+ movq $FLAT_KERNEL_SS,8(%rsp)
+ pushq %r11
+ pushq $FLAT_KERNEL_CS64
pushq %rcx
pushq $0
- movq 24(%rsp),%r11 /* Re-load user RFLAGS into %r11 before saving */
SAVE_VOLATILE TRAP_syscall
GET_CURRENT(%rbx)
- movq VCPU_domain(%rbx),%rcx
- testb $1,DOMAIN_is_32bit_pv(%rcx)
- jnz compat_syscall
testb $TF_kernel_mode,VCPU_thread_flags(%rbx)
jz switch_to_kernel
return 0;
}
-static int write_stack_trampoline(
- char *stack, char *stack_bottom, uint16_t cs_seg)
+static unsigned int write_stub_trampoline(
+ unsigned char *stub, unsigned long stub_va,
+ unsigned long stack_bottom, unsigned long target_va)
{
- /* movq %rsp, saversp(%rip) */
- stack[0] = 0x48;
- stack[1] = 0x89;
- stack[2] = 0x25;
- *(u32 *)&stack[3] = (stack_bottom - &stack[7]) - 16;
-
- /* leaq saversp(%rip), %rsp */
- stack[7] = 0x48;
- stack[8] = 0x8d;
- stack[9] = 0x25;
- *(u32 *)&stack[10] = (stack_bottom - &stack[14]) - 16;
-
- /* pushq %r11 */
- stack[14] = 0x41;
- stack[15] = 0x53;
-
- /* pushq $<cs_seg> */
- stack[16] = 0x68;
- *(u32 *)&stack[17] = cs_seg;
-
- /* movq $syscall_enter,%r11 */
- stack[21] = 0x49;
- stack[22] = 0xbb;
- *(void **)&stack[23] = (void *)syscall_enter;
-
- /* jmpq *%r11 */
- stack[31] = 0x41;
- stack[32] = 0xff;
- stack[33] = 0xe3;
-
- return 34;
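+ /*
+  * The stub saves %rax in what will become the frame's %ss slot, switches
+  * %rsp to the Xen stack, pushes the saved guest %rsp, and jumps to
+  * target_va.  On entry to the target, (%rsp) holds the guest %rsp and
+  * 8(%rsp) holds the guest %rax.
+  */
+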
+ /* movabsq %rax, stack_bottom - 8 */
+ stub[0] = 0x48;
+ stub[1] = 0xa3;
+ *(uint64_t *)&stub[2] = stack_bottom - 8;
+
+ /* movq %rsp, %rax */
+ stub[10] = 0x48;
+ stub[11] = 0x89;
+ stub[12] = 0xe0;
+
+ /* movabsq $stack_bottom - 8, %rsp */
+ stub[13] = 0x48;
+ stub[14] = 0xbc;
+ *(uint64_t *)&stub[15] = stack_bottom - 8;
+
+ /* pushq %rax */
+ stub[23] = 0x50;
+
+ /* jmp target_va */
+ stub[24] = 0xe9;
+ *(int32_t *)&stub[25] = target_va - (stub_va + 29);
+
+ /* The stub is 29 bytes; round up to a multiple of 16 bytes. */
+ return 32;
}
+DEFINE_PER_CPU(struct stubs, stubs);
+void lstar_enter(void);
+void cstar_enter(void);
+
void __devinit subarch_percpu_traps_init(void)
{
- char *stack_bottom, *stack;
-
- stack_bottom = (char *)get_stack_bottom();
- stack = (char *)((unsigned long)stack_bottom & ~(STACK_SIZE - 1));
+ unsigned long stack_bottom = get_stack_bottom();
+ unsigned long stub_va = this_cpu(stubs.addr);
+ unsigned char *stub_page;
+ unsigned int offset;
/* IST_MAX IST pages + 1 syscall page + 1 guard page + primary stack. */
BUILD_BUG_ON((IST_MAX + 2) * PAGE_SIZE + PRIMARY_STACK_SIZE > STACK_SIZE);
- /* Trampoline for SYSCALL entry from long mode. */
- stack = &stack[IST_MAX * PAGE_SIZE]; /* Skip the IST stacks. */
- wrmsrl(MSR_LSTAR, (unsigned long)stack);
- stack += write_stack_trampoline(stack, stack_bottom, FLAT_KERNEL_CS64);
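+ /* The stubs are mapped RX, so write them via a writable mapping of the MFN. */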
+ stub_page = map_domain_page(this_cpu(stubs.mfn));
+
+ /* Trampoline for SYSCALL entry from 64-bit mode. */
+ wrmsrl(MSR_LSTAR, stub_va);
+ offset = write_stub_trampoline(stub_page + (stub_va & ~PAGE_MASK),
+ stub_va, stack_bottom,
+ (unsigned long)lstar_enter);
+ stub_va += offset;
if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR )
{
/* SYSENTER entry. */
- wrmsrl(MSR_IA32_SYSENTER_ESP, (unsigned long)stack_bottom);
+ wrmsrl(MSR_IA32_SYSENTER_ESP, stack_bottom);
wrmsrl(MSR_IA32_SYSENTER_EIP, (unsigned long)sysenter_entry);
wrmsr(MSR_IA32_SYSENTER_CS, __HYPERVISOR_CS, 0);
}
/* Trampoline for SYSCALL entry from compatibility mode. */
- stack = (char *)L1_CACHE_ALIGN((unsigned long)stack);
- wrmsrl(MSR_CSTAR, (unsigned long)stack);
- stack += write_stack_trampoline(stack, stack_bottom, FLAT_USER_CS32);
+ wrmsrl(MSR_CSTAR, stub_va);
+ offset += write_stub_trampoline(stub_page + (stub_va & ~PAGE_MASK),
+ stub_va, stack_bottom,
+ (unsigned long)cstar_enter);
+
+ /* Don't consume more than half of the stub space here. */
+ ASSERT(offset <= STUB_BUF_SIZE / 2);
+
+ unmap_domain_page(stub_page);
/* Common SYSCALL parameters. */
wrmsr(MSR_STAR, 0, (FLAT_RING3_CS32<<16) | __HYPERVISOR_CS);
.comment 0 : { *(.comment) }
}
+ASSERT(__image_base__ > XEN_VIRT_START ||
+ _end <= XEN_VIRT_END - NR_CPUS * PAGE_SIZE,
+ "Xen image overlaps stubs area")
ASSERT(kexec_reloc_size - kexec_reloc <= PAGE_SIZE, "kexec_reloc is too large")
/* Primary stack is restricted to 8kB by guard pages. */
#define PRIMARY_STACK_SIZE 8192
+/* Total size of syscall and emulation stubs. */
+#define STUB_BUF_SHIFT (L1_CACHE_SHIFT > 7 ? L1_CACHE_SHIFT : 7)
+#define STUB_BUF_SIZE (1 << STUB_BUF_SHIFT)
+
/* Return value for zero-size _xmalloc(), distinguished from NULL. */
#define ZERO_BLOCK_PTR ((void *)0xBAD0BAD0BAD0BAD0UL)
#define _PAGE_GNTTAB 0
#endif
-#define __PAGE_HYPERVISOR \
- (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
-#define __PAGE_HYPERVISOR_NOCACHE \
- (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED)
+#define __PAGE_HYPERVISOR_RX (_PAGE_PRESENT | _PAGE_ACCESSED)
+#define __PAGE_HYPERVISOR (__PAGE_HYPERVISOR_RX | \
+ _PAGE_DIRTY | _PAGE_RW)
+#define __PAGE_HYPERVISOR_NOCACHE (__PAGE_HYPERVISOR | _PAGE_PCD)
#define MAP_SMALL_PAGES _PAGE_AVAIL0 /* don't use superpage mappings */
void enable_nmis(void);
void do_reserved_trap(struct cpu_user_regs *regs);
-void syscall_enter(void);
void sysenter_entry(void);
void sysenter_eflags_saved(void);
void compat_hypercall(void);
void int80_direct_trap(void);
+#define STUBS_PER_PAGE (PAGE_SIZE / STUB_BUF_SIZE)
+
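+/* Per-CPU stub state: entry point in the stub area and its backing MFN. */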
+struct stubs {
+ union {
+ void (*func)(void);
+ unsigned long addr;
+ };
+ unsigned long mfn;
+};
+
+DECLARE_PER_CPU(struct stubs, stubs);
+unsigned long alloc_stub_page(unsigned int cpu, unsigned long *mfn);
+
extern int hypercall(void);
int cpuid_hypervisor_leaves( uint32_t idx, uint32_t sub_idx,
#define _PAGE_GUEST_KERNEL (1U<<12)
#define PAGE_HYPERVISOR (__PAGE_HYPERVISOR | _PAGE_GLOBAL)
+#define PAGE_HYPERVISOR_RX (__PAGE_HYPERVISOR_RX | _PAGE_GLOBAL)
#define PAGE_HYPERVISOR_NOCACHE (__PAGE_HYPERVISOR_NOCACHE | _PAGE_GLOBAL)
#endif /* __X86_64_PAGE_H__ */