From 0f1cb96e9785294f149ab3c7feb90c0eb9daeede Mon Sep 17 00:00:00 2001 From: Keir Fraser Date: Tue, 26 May 2009 15:01:36 +0100 Subject: [PATCH] x86 hvm: Allow cross-vendor migration Intercept #UD and emulate SYSCALL/SYSENTER/SYSEXIT as necessary. Signed-off-by: Christoph Egger Signed-off-by: Keir Fraser --- xen/arch/x86/hvm/svm/svm.c | 100 +++++++++++- xen/arch/x86/hvm/svm/vmcb.c | 8 +- xen/arch/x86/hvm/vmx/vmcs.c | 3 +- xen/arch/x86/hvm/vmx/vmx.c | 36 +++++ xen/arch/x86/x86_emulate/x86_emulate.c | 212 ++++++++++++++++++++++++- xen/include/asm-x86/hvm/svm/vmcb.h | 9 ++ xen/include/public/arch-x86/hvm/save.h | 4 +- 7 files changed, 354 insertions(+), 18 deletions(-) diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c index a70f0ffd95..d800e8cd1b 100644 --- a/xen/arch/x86/hvm/svm/svm.c +++ b/xen/arch/x86/hvm/svm/svm.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -199,9 +200,9 @@ static int svm_vmcb_save(struct vcpu *v, struct hvm_hw_cpu *c) c->cr3 = v->arch.hvm_vcpu.guest_cr[3]; c->cr4 = v->arch.hvm_vcpu.guest_cr[4]; - c->sysenter_cs = vmcb->sysenter_cs; - c->sysenter_esp = vmcb->sysenter_esp; - c->sysenter_eip = vmcb->sysenter_eip; + c->sysenter_cs = v->arch.hvm_svm.guest_sysenter_cs; + c->sysenter_esp = v->arch.hvm_svm.guest_sysenter_esp; + c->sysenter_eip = v->arch.hvm_svm.guest_sysenter_eip; c->pending_event = 0; c->error_code = 0; @@ -258,9 +259,9 @@ static int svm_vmcb_restore(struct vcpu *v, struct hvm_hw_cpu *c) svm_update_guest_cr(v, 2); svm_update_guest_cr(v, 4); - vmcb->sysenter_cs = c->sysenter_cs; - vmcb->sysenter_esp = c->sysenter_esp; - vmcb->sysenter_eip = c->sysenter_eip; + v->arch.hvm_svm.guest_sysenter_cs = c->sysenter_cs; + v->arch.hvm_svm.guest_sysenter_esp = c->sysenter_esp; + v->arch.hvm_svm.guest_sysenter_eip = c->sysenter_eip; if ( paging_mode_hap(v->domain) ) { @@ -286,7 +287,7 @@ static int svm_vmcb_restore(struct vcpu *v, struct hvm_hw_cpu *c) return 0; } - + static void svm_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; @@ -985,6 +986,16 @@ static int svm_msr_read_intercept(struct cpu_user_regs *regs) msr_content = v->arch.hvm_vcpu.guest_efer; break; + case MSR_IA32_SYSENTER_CS: + msr_content = v->arch.hvm_svm.guest_sysenter_cs; + break; + case MSR_IA32_SYSENTER_ESP: + msr_content = v->arch.hvm_svm.guest_sysenter_esp; + break; + case MSR_IA32_SYSENTER_EIP: + msr_content = v->arch.hvm_svm.guest_sysenter_eip; + break; + case MSR_IA32_MC4_MISC: /* Threshold register */ case MSR_F10_MC4_MISC1 ... MSR_F10_MC4_MISC3: /* @@ -1067,6 +1078,16 @@ static int svm_msr_write_intercept(struct cpu_user_regs *regs) case MSR_K8_VM_HSAVE_PA: goto gpf; + case MSR_IA32_SYSENTER_CS: + v->arch.hvm_svm.guest_sysenter_cs = msr_content; + break; + case MSR_IA32_SYSENTER_ESP: + v->arch.hvm_svm.guest_sysenter_esp = msr_content; + break; + case MSR_IA32_SYSENTER_EIP: + v->arch.hvm_svm.guest_sysenter_eip = msr_content; + break; + case MSR_IA32_DEBUGCTLMSR: vmcb->debugctlmsr = msr_content; if ( !msr_content || !cpu_has_svm_lbrv ) @@ -1165,6 +1186,66 @@ static void svm_vmexit_do_rdtsc(struct cpu_user_regs *regs) hvm_rdtsc_intercept(regs); } +static void svm_dump_regs(const char *from, struct cpu_user_regs *regs) +{ + printk("Dumping guest's current registers at %s...\n", from); + printk("Size of regs = 0x%lx, address = %p\n", + sizeof(struct cpu_user_regs), regs); + + printk("r15 = 0x%016"PRIx64", r14 = 0x%016"PRIx64"\n", + regs->r15, regs->r14); + printk("r13 = 0x%016"PRIx64", r12 = 0x%016"PRIx64"\n", + regs->r13, regs->r12); + printk("rbp = 0x%016"PRIx64", rbx = 0x%016"PRIx64"\n", + regs->rbp, regs->rbx); + printk("r11 = 0x%016"PRIx64", r10 = 0x%016"PRIx64"\n", + regs->r11, regs->r10); + printk("r9 = 0x%016"PRIx64", r8 = 0x%016"PRIx64"\n", + regs->r9, regs->r8); + printk("rax = 0x%016"PRIx64", rcx = 0x%016"PRIx64"\n", + regs->rax, regs->rcx); + printk("rdx = 0x%016"PRIx64", rsi = 0x%016"PRIx64"\n", + regs->rdx, regs->rsi); + printk("rdi = 0x%016"PRIx64", rsp = 0x%016"PRIx64"\n", + regs->rdi, regs->rsp); + printk("error code = 0x%08"PRIx32", entry_vector = 0x%08"PRIx32"\n", + regs->error_code, regs->entry_vector); + printk("rip = 0x%016"PRIx64", rflags = 0x%016"PRIx64"\n", + regs->rip, regs->rflags); +} + +static void svm_vmexit_ud_intercept(struct cpu_user_regs *regs) +{ + struct hvm_emulate_ctxt ctxt; + int rc; + + hvm_emulate_prepare(&ctxt, regs); + + rc = hvm_emulate_one(&ctxt); + + switch ( rc ) + { + case X86EMUL_UNHANDLEABLE: + gdprintk(XENLOG_WARNING, + "instruction emulation failed @ %04x:%lx: " + "%02x %02x %02x %02x %02x %02x\n", + hvmemul_get_seg_reg(x86_seg_cs, &ctxt)->sel, + ctxt.insn_buf_eip, + ctxt.insn_buf[0], ctxt.insn_buf[1], + ctxt.insn_buf[2], ctxt.insn_buf[3], + ctxt.insn_buf[4], ctxt.insn_buf[5]); + return; + case X86EMUL_EXCEPTION: + if ( ctxt.exn_pending ) + hvm_inject_exception(ctxt.exn_vector, ctxt.exn_error_code, 0); + break; + default: + break; + } + + hvm_emulate_writeback(&ctxt); +} + static void wbinvd_ipi(void *info) { wbinvd(); @@ -1229,6 +1310,7 @@ asmlinkage void svm_vmexit_handler(struct cpu_user_regs *regs) if ( unlikely(exit_reason == VMEXIT_INVALID) ) { svm_dump_vmcb(__func__, vmcb); + svm_dump_regs(__func__, regs); goto exit_and_crash; } @@ -1305,6 +1387,10 @@ asmlinkage void svm_vmexit_handler(struct cpu_user_regs *regs) break; } + case VMEXIT_EXCEPTION_UD: + svm_vmexit_ud_intercept(regs); + break; + /* Asynchronous event, handled when we STGI'd after the VMEXIT. */ case VMEXIT_EXCEPTION_MC: HVMTRACE_0D(MCE); diff --git a/xen/arch/x86/hvm/svm/vmcb.c b/xen/arch/x86/hvm/svm/vmcb.c index 080c05c762..9c4a3cca89 100644 --- a/xen/arch/x86/hvm/svm/vmcb.c +++ b/xen/arch/x86/hvm/svm/vmcb.c @@ -150,9 +150,6 @@ static int construct_vmcb(struct vcpu *v) svm_disable_intercept_for_msr(v, MSR_LSTAR); svm_disable_intercept_for_msr(v, MSR_STAR); svm_disable_intercept_for_msr(v, MSR_SYSCALL_MASK); - svm_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_CS); - svm_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_ESP); - svm_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_EIP); vmcb->msrpm_base_pa = (u64)virt_to_maddr(arch_svm->msrpm); vmcb->iopm_base_pa = (u64)virt_to_maddr(hvm_io_bitmap); @@ -222,7 +219,10 @@ static int construct_vmcb(struct vcpu *v) paging_update_paging_modes(v); - vmcb->exception_intercepts = HVM_TRAP_MASK | (1U << TRAP_no_device); + vmcb->exception_intercepts = + HVM_TRAP_MASK + | (1U << TRAP_no_device) + | (1U << TRAP_invalid_op); if ( paging_mode_hap(v->domain) ) { diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c index aea69c7b94..5401e5638d 100644 --- a/xen/arch/x86/hvm/vmx/vmcs.c +++ b/xen/arch/x86/hvm/vmx/vmcs.c @@ -668,7 +668,8 @@ static int construct_vmcs(struct vcpu *v) __vmwrite(EXCEPTION_BITMAP, HVM_TRAP_MASK | (paging_mode_hap(d) ? 0 : (1U << TRAP_page_fault)) - | (1U << TRAP_no_device)); + | (1U << TRAP_no_device) + | (1U << TRAP_invalid_op)); v->arch.hvm_vcpu.guest_cr[0] = X86_CR0_PE | X86_CR0_ET; hvm_update_guest_cr(v, 0); diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c index 64d9f39c89..5fb1421ac6 100644 --- a/xen/arch/x86/hvm/vmx/vmx.c +++ b/xen/arch/x86/hvm/vmx/vmx.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -2248,6 +2249,38 @@ asmlinkage void vmx_enter_realmode(struct cpu_user_regs *regs) regs->eflags |= (X86_EFLAGS_VM | X86_EFLAGS_IOPL); } +static void vmx_vmexit_ud_intercept(struct cpu_user_regs *regs) +{ + struct hvm_emulate_ctxt ctxt; + int rc; + + hvm_emulate_prepare(&ctxt, regs); + + rc = hvm_emulate_one(&ctxt); + + switch ( rc ) + { + case X86EMUL_UNHANDLEABLE: + gdprintk(XENLOG_WARNING, + "instruction emulation failed @ %04x:%lx: " + "%02x %02x %02x %02x %02x %02x\n", + hvmemul_get_seg_reg(x86_seg_cs, &ctxt)->sel, + ctxt.insn_buf_eip, + ctxt.insn_buf[0], ctxt.insn_buf[1], + ctxt.insn_buf[2], ctxt.insn_buf[3], + ctxt.insn_buf[4], ctxt.insn_buf[5]); + return; + case X86EMUL_EXCEPTION: + if ( ctxt.exn_pending ) + hvm_inject_exception(ctxt.exn_vector, ctxt.exn_error_code, 0); + break; + default: + break; + } + + hvm_emulate_writeback(&ctxt); +} + asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs) { unsigned int exit_reason, idtv_info; @@ -2434,6 +2467,9 @@ asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs) HVMTRACE_0D(MCE); do_machine_check(regs); break; + case TRAP_invalid_op: + vmx_vmexit_ud_intercept(regs); + break; default: goto exit_and_crash; } diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c b/xen/arch/x86/x86_emulate/x86_emulate.c index b5c3fc236a..e6a94455d5 100644 --- a/xen/arch/x86/x86_emulate/x86_emulate.c +++ b/xen/arch/x86/x86_emulate/x86_emulate.c @@ -172,7 +172,7 @@ static uint8_t opcode_table[256] = { static uint8_t twobyte_table[256] = { /* 0x00 - 0x07 */ - SrcMem16|ModRM, ImplicitOps|ModRM, 0, 0, 0, 0, ImplicitOps, 0, + SrcMem16|ModRM, ImplicitOps|ModRM, 0, 0, 0, ImplicitOps, ImplicitOps, 0, /* 0x08 - 0x0F */ ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps|ModRM, 0, 0, /* 0x10 - 0x17 */ @@ -186,7 +186,8 @@ static uint8_t twobyte_table[256] = { /* 0x28 - 0x2F */ 0, 0, 0, 0, 0, 0, 0, 0, /* 0x30 - 0x37 */ - ImplicitOps, ImplicitOps, ImplicitOps, 0, 0, 0, 0, 0, + ImplicitOps, ImplicitOps, ImplicitOps, 0, + ImplicitOps, ImplicitOps, 0, 0, /* 0x38 - 0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40 - 0x47 */ @@ -280,7 +281,17 @@ struct operand { }; /* MSRs. */ -#define MSR_TSC 0x10 +#define MSR_TSC 0x00000010 +#define MSR_SYSENTER_CS 0x00000174 +#define MSR_SYSENTER_ESP 0x00000175 +#define MSR_SYSENTER_EIP 0x00000176 +#define MSR_EFER 0xc0000080 +#define EFER_SCE (1u<<0) +#define EFER_LMA (1u<<10) +#define MSR_STAR 0xc0000081 +#define MSR_LSTAR 0xc0000082 +#define MSR_CSTAR 0xc0000083 +#define MSR_FMASK 0xc0000084 /* Control register flags. */ #define CR0_PE (1<<0) @@ -941,6 +952,20 @@ in_protmode( return !(in_realmode(ctxt, ops) || (ctxt->regs->eflags & EFLG_VM)); } +static int +in_longmode( + struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + uint64_t efer; + + if (ops->read_msr == NULL) + return -1; + + ops->read_msr(MSR_EFER, &efer, ctxt); + return !!(efer & EFER_LMA); +} + static int realmode_load_seg( enum x86_segment seg, @@ -3544,6 +3569,71 @@ x86_emulate( break; } + case 0x05: /* syscall */ { + uint64_t msr_content; + struct segment_register cs = { 0 }, ss = { 0 }; + int rc; + + fail_if(ops->read_msr == NULL); + fail_if(ops->read_segment == NULL); + fail_if(ops->write_segment == NULL); + + generate_exception_if(in_realmode(ctxt, ops), EXC_UD, 0); + generate_exception_if(!in_protmode(ctxt, ops), EXC_UD, 0); + generate_exception_if(lock_prefix, EXC_UD, 0); + + /* Inject #UD if syscall/sysret are disabled. */ + rc = ops->read_msr(MSR_EFER, &msr_content, ctxt); + fail_if(rc != 0); + generate_exception_if((msr_content & EFER_SCE) == 0, EXC_UD, 0); + + rc = ops->read_msr(MSR_STAR, &msr_content, ctxt); + fail_if(rc != 0); + + msr_content >>= 32; + cs.sel = (uint16_t)(msr_content & 0xfffc); + ss.sel = (uint16_t)(msr_content + 8); + + cs.base = ss.base = 0; /* flat segment */ + cs.limit = ss.limit = ~0u; /* 4GB limit */ + cs.attr.bytes = 0xc9b; /* G+DB+P+S+Code */ + ss.attr.bytes = 0xc93; /* G+DB+P+S+Data */ + + if ( in_longmode(ctxt, ops) ) + { + cs.attr.fields.db = 0; + cs.attr.fields.l = 1; + + _regs.rcx = _regs.rip; + _regs.r11 = _regs.eflags & ~EFLG_RF; + + rc = ops->read_msr(mode_64bit() ? MSR_LSTAR : MSR_CSTAR, + &msr_content, ctxt); + fail_if(rc != 0); + + _regs.rip = msr_content; + + rc = ops->read_msr(MSR_FMASK, &msr_content, ctxt); + fail_if(rc != 0); + _regs.eflags &= ~(msr_content | EFLG_RF); + } + else + { + rc = ops->read_msr(MSR_STAR, &msr_content, ctxt); + fail_if(rc != 0); + + _regs.rcx = _regs.rip; + _regs.eip = (uint32_t)msr_content; + _regs.eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); + } + + if ( (rc = ops->write_segment(x86_seg_cs, &cs, ctxt)) || + (rc = ops->write_segment(x86_seg_ss, &ss, ctxt)) ) + goto done; + + break; + } + case 0x06: /* clts */ generate_exception_if(!mode_ring0(), EXC_GP, 0); fail_if((ops->read_cr == NULL) || (ops->write_cr == NULL)); @@ -3645,6 +3735,122 @@ x86_emulate( dst.type = OP_NONE; break; + case 0x34: /* sysenter */ { + uint64_t msr_content; + struct segment_register cs, ss; + int rc; + + fail_if(ops->read_msr == NULL); + fail_if(ops->read_segment == NULL); + fail_if(ops->write_segment == NULL); + + generate_exception_if(mode_ring0(), EXC_GP, 0); + generate_exception_if(in_realmode(ctxt, ops), EXC_GP, 0); + generate_exception_if(!in_protmode(ctxt, ops), EXC_GP, 0); + generate_exception_if(lock_prefix, EXC_UD, 0); + + rc = ops->read_msr(MSR_SYSENTER_CS, &msr_content, ctxt); + fail_if(rc != 0); + + if ( mode_64bit() ) + generate_exception_if(msr_content == 0, EXC_GP, 0); + else + generate_exception_if((msr_content & 0xfffc) == 0, EXC_GP, 0); + + _regs.eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); + + ops->read_segment(x86_seg_cs, &cs, ctxt); + cs.sel = (uint16_t)msr_content & ~3; /* SELECTOR_RPL_MASK */ + cs.base = 0; /* flat segment */ + cs.limit = ~0u; /* 4GB limit */ + cs.attr.bytes = 0xc9b; /* G+DB+P+S+Code */ + + ss.sel = cs.sel + 8; + ss.base = 0; /* flat segment */ + ss.limit = ~0u; /* 4GB limit */ + ss.attr.bytes = 0xc93; /* G+DB+P+S+Data */ + + if ( in_longmode(ctxt, ops) ) + { + cs.attr.fields.db = 0; + cs.attr.fields.l = 1; + } + + rc = ops->write_segment(x86_seg_cs, &cs, ctxt); + fail_if(rc != 0); + rc = ops->write_segment(x86_seg_ss, &ss, ctxt); + fail_if(rc != 0); + + rc = ops->read_msr(MSR_SYSENTER_EIP, &msr_content, ctxt); + fail_if(rc != 0); + _regs.rip = msr_content; + + rc = ops->read_msr(MSR_SYSENTER_ESP, &msr_content, ctxt); + fail_if(rc != 0); + _regs.rsp = msr_content; + + break; + } + + case 0x35: /* sysexit */ { + uint64_t msr_content; + struct segment_register cs, ss; + int user64 = !!(rex_prefix & 8); /* REX.W */ + int rc; + + fail_if(ops->read_msr == NULL); + fail_if(ops->read_segment == NULL); + fail_if(ops->write_segment == NULL); + + generate_exception_if(!mode_ring0(), EXC_GP, 0); + generate_exception_if(in_realmode(ctxt, ops), EXC_GP, 0); + generate_exception_if(!in_protmode(ctxt, ops), EXC_GP, 0); + generate_exception_if(lock_prefix, EXC_UD, 0); + + rc = ops->read_msr(MSR_SYSENTER_CS, &msr_content, ctxt); + fail_if(rc != 0); + rc = ops->read_segment(x86_seg_cs, &cs, ctxt); + fail_if(rc != 0); + + if ( user64 ) + { + cs.sel = (uint16_t)(msr_content + 32); + ss.sel = (cs.sel + 8); + generate_exception_if(msr_content == 0, EXC_GP, 0); + } + else + { + cs.sel = (uint16_t)(msr_content + 16); + ss.sel = (uint16_t)(msr_content + 24); + generate_exception_if((msr_content & 0xfffc) == 0, EXC_GP, 0); + } + + cs.sel |= 0x3; /* SELECTOR_RPL_MASK */ + cs.base = 0; /* flat segment */ + cs.limit = ~0u; /* 4GB limit */ + cs.attr.bytes = 0xcfb; /* G+DB+P+DPL3+S+Code */ + + ss.sel |= 0x3; /* SELECTOR_RPL_MASK */ + ss.base = 0; /* flat segment */ + ss.limit = ~0u; /* 4GB limit */ + ss.attr.bytes = 0xcf3; /* G+DB+P+DPL3+S+Data */ + + if ( user64 ) + { + cs.attr.fields.db = 0; + cs.attr.fields.l = 1; + } + + rc = ops->write_segment(x86_seg_cs, &cs, ctxt); + fail_if(rc != 0); + rc = ops->write_segment(x86_seg_ss, &ss, ctxt); + fail_if(rc != 0); + + _regs.rip = _regs.rdx; + _regs.rsp = _regs.rcx; + break; + } + case 0x6f: /* movq mm/m64,mm */ { uint8_t stub[] = { 0x0f, 0x6f, modrm, 0xc3 }; struct fpu_insn_ctxt fic = { .insn_bytes = sizeof(stub)-1 }; diff --git a/xen/include/asm-x86/hvm/svm/vmcb.h b/xen/include/asm-x86/hvm/svm/vmcb.h index b15c69a4f9..47efa8d75a 100644 --- a/xen/include/asm-x86/hvm/svm/vmcb.h +++ b/xen/include/asm-x86/hvm/svm/vmcb.h @@ -459,6 +459,15 @@ struct arch_svm_struct { unsigned long *msrpm; int launch_core; bool_t vmcb_in_sync; /* VMCB sync'ed with VMSAVE? */ + + /* Upper four bytes are undefined in the VMCB, therefore we can't + * use the fields in the VMCB. Write a 64bit value and then read a 64bit + * value is fine unless there's a VMRUN/VMEXIT in between which clears + * the upper four bytes. + */ + uint64_t guest_sysenter_cs; + uint64_t guest_sysenter_esp; + uint64_t guest_sysenter_eip; }; struct vmcb_struct *alloc_vmcb(void); diff --git a/xen/include/public/arch-x86/hvm/save.h b/xen/include/public/arch-x86/hvm/save.h index bfdc7267c9..d3692d6a86 100644 --- a/xen/include/public/arch-x86/hvm/save.h +++ b/xen/include/public/arch-x86/hvm/save.h @@ -123,9 +123,7 @@ struct hvm_hw_cpu { uint32_t tr_arbytes; uint32_t ldtr_arbytes; - uint32_t sysenter_cs; - uint32_t padding0; - + uint64_t sysenter_cs; uint64_t sysenter_esp; uint64_t sysenter_eip; -- 2.39.5