ia64/xen-unstable

xen/arch/x86/hvm/vmx/vmx.c @ 17416:0553004fa328

x86, vmx: Enable VPID (Virtual Processor Identification)

Allows TLB entries to be retained across VM entry and VM exit; Xen now
identifies distinct address spaces through the new virtual-processor ID
(VPID) field of the VMCS.

Signed-off-by: Xin Li <xin.b.li@intel.com>
Signed-off-by: Jun Nakajima <jun.nakajima@intel.com>
Signed-off-by: Xiaohui Xin <Xiaohui.xin@intel.com>
Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author   Keir Fraser <keir.fraser@citrix.com>
date     Wed Apr 09 14:34:49 2008 +0100
parents  9b635405ef90
children 3cac47973e15
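In outline: each HVM domain is allocated a contiguous block of MAX_VIRT_CPUS
VPID values (vmx_alloc_vpid() below; VPID 0 remains reserved for Xen itself),
guest CR3 updates flush that vcpu's tagged translations via
vpid_sync_vcpu_all(), and vmx_flush_guest_tlbs() falls back to vpid_sync_all()
when every VPID must be invalidated. A minimal sketch of the per-vcpu tag,
assuming the layout used when the VMCS is constructed (the real assignment is
made outside this file):

    /* Sketch only (assumed): per-vcpu VPID = domain's vpid_base + vcpu_id. */
    v->arch.hvm_vmx.vpid = d->arch.hvm_domain.vmx.vpid_base + v->vcpu_id;
    __vmwrite(VIRTUAL_PROCESSOR_ID, v->arch.hvm_vmx.vpid);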
line source
1 /*
2 * vmx.c: handling VMX architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 */
19 #include <xen/config.h>
20 #include <xen/init.h>
21 #include <xen/lib.h>
22 #include <xen/trace.h>
23 #include <xen/sched.h>
24 #include <xen/irq.h>
25 #include <xen/softirq.h>
26 #include <xen/domain_page.h>
27 #include <xen/hypercall.h>
28 #include <xen/perfc.h>
29 #include <asm/current.h>
30 #include <asm/io.h>
31 #include <asm/regs.h>
32 #include <asm/cpufeature.h>
33 #include <asm/processor.h>
34 #include <asm/types.h>
35 #include <asm/debugreg.h>
36 #include <asm/msr.h>
37 #include <asm/spinlock.h>
38 #include <asm/paging.h>
39 #include <asm/p2m.h>
40 #include <asm/hvm/hvm.h>
41 #include <asm/hvm/support.h>
42 #include <asm/hvm/vmx/vmx.h>
43 #include <asm/hvm/vmx/vmcs.h>
44 #include <public/sched.h>
45 #include <public/hvm/ioreq.h>
46 #include <asm/hvm/vpic.h>
47 #include <asm/hvm/vlapic.h>
48 #include <asm/x86_emulate.h>
49 #include <asm/hvm/vpt.h>
50 #include <public/hvm/save.h>
51 #include <asm/hvm/trace.h>
53 enum handler_return { HNDL_done, HNDL_unhandled, HNDL_exception_raised };
55 static void vmx_ctxt_switch_from(struct vcpu *v);
56 static void vmx_ctxt_switch_to(struct vcpu *v);
58 static int vmx_alloc_vlapic_mapping(struct domain *d);
59 static void vmx_free_vlapic_mapping(struct domain *d);
60 static int vmx_alloc_vpid(struct domain *d);
61 static void vmx_free_vpid(struct domain *d);
62 static void vmx_install_vlapic_mapping(struct vcpu *v);
63 static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr);
64 static void vmx_update_guest_efer(struct vcpu *v);
65 static void vmx_cpuid_intercept(
66 unsigned int *eax, unsigned int *ebx,
67 unsigned int *ecx, unsigned int *edx);
68 static void vmx_wbinvd_intercept(void);
69 static void vmx_fpu_dirty_intercept(void);
70 static int vmx_msr_read_intercept(struct cpu_user_regs *regs);
71 static int vmx_msr_write_intercept(struct cpu_user_regs *regs);
72 static void vmx_invlpg_intercept(unsigned long vaddr);
74 static int vmx_domain_initialise(struct domain *d)
75 {
76 int rc;
78 d->arch.hvm_domain.vmx.ept_control.etmt = EPT_DEFAULT_MT;
79 d->arch.hvm_domain.vmx.ept_control.gaw = EPT_DEFAULT_GAW;
80 d->arch.hvm_domain.vmx.ept_control.asr =
81 pagetable_get_pfn(d->arch.phys_table);
83 if ( (rc = vmx_alloc_vpid(d)) != 0 )
84 return rc;
86 if ( (rc = vmx_alloc_vlapic_mapping(d)) != 0 )
87 {
88 vmx_free_vpid(d);
89 return rc;
90 }
92 return 0;
93 }
95 static void vmx_domain_destroy(struct domain *d)
96 {
97 ept_sync_domain(d);
98 vmx_free_vlapic_mapping(d);
99 vmx_free_vpid(d);
100 }
102 static int vmx_vcpu_initialise(struct vcpu *v)
103 {
104 int rc;
106 spin_lock_init(&v->arch.hvm_vmx.vmcs_lock);
108 v->arch.schedule_tail = vmx_do_resume;
109 v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
110 v->arch.ctxt_switch_to = vmx_ctxt_switch_to;
112 if ( (rc = vmx_create_vmcs(v)) != 0 )
113 {
114 dprintk(XENLOG_WARNING,
115 "Failed to create VMCS for vcpu %d: err=%d.\n",
116 v->vcpu_id, rc);
117 return rc;
118 }
120 vpmu_initialise(v);
122 vmx_install_vlapic_mapping(v);
124 /* %eax == 1 signals full real-mode support to the guest loader. */
125 if ( v->vcpu_id == 0 )
126 v->arch.guest_context.user_regs.eax = 1;
128 return 0;
129 }
131 static void vmx_vcpu_destroy(struct vcpu *v)
132 {
133 vmx_destroy_vmcs(v);
134 vpmu_destroy(v);
135 }
137 #ifdef __x86_64__
139 static DEFINE_PER_CPU(struct vmx_msr_state, host_msr_state);
141 static u32 msr_index[VMX_MSR_COUNT] =
142 {
143 MSR_LSTAR, MSR_STAR, MSR_SYSCALL_MASK
144 };
146 static void vmx_save_host_msrs(void)
147 {
148 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
149 int i;
151 for ( i = 0; i < VMX_MSR_COUNT; i++ )
152 rdmsrl(msr_index[i], host_msr_state->msrs[i]);
153 }
155 #define WRITE_MSR(address) \
156 guest_msr_state->msrs[VMX_INDEX_MSR_ ## address] = msr_content; \
157 set_bit(VMX_INDEX_MSR_ ## address, &guest_msr_state->flags); \
158 wrmsrl(MSR_ ## address, msr_content); \
159 set_bit(VMX_INDEX_MSR_ ## address, &host_msr_state->flags); \
160 break
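/*
 * WRITE_MSR(X) caches the new value in the vcpu's guest MSR state, marks
 * that MSR dirty in both the guest and host flag masks so the context-switch
 * path knows to reload and later restore it, and writes the real MSR
 * immediately.
 */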
162 static enum handler_return long_mode_do_msr_read(struct cpu_user_regs *regs)
163 {
164 u64 msr_content = 0;
165 u32 ecx = regs->ecx;
166 struct vcpu *v = current;
167 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
169 switch ( ecx )
170 {
171 case MSR_EFER:
172 msr_content = v->arch.hvm_vcpu.guest_efer;
173 break;
175 case MSR_FS_BASE:
176 msr_content = __vmread(GUEST_FS_BASE);
177 goto check_long_mode;
179 case MSR_GS_BASE:
180 msr_content = __vmread(GUEST_GS_BASE);
181 goto check_long_mode;
183 case MSR_SHADOW_GS_BASE:
184 msr_content = v->arch.hvm_vmx.shadow_gs;
185 check_long_mode:
186 if ( !(hvm_long_mode_enabled(v)) )
187 {
188 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
189 return HNDL_exception_raised;
190 }
191 break;
193 case MSR_STAR:
194 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_STAR];
195 break;
197 case MSR_LSTAR:
198 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_LSTAR];
199 break;
201 case MSR_CSTAR:
202 msr_content = v->arch.hvm_vmx.cstar;
203 break;
205 case MSR_SYSCALL_MASK:
206 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
207 break;
209 default:
210 return HNDL_unhandled;
211 }
213 HVM_DBG_LOG(DBG_LEVEL_0, "msr 0x%x content 0x%"PRIx64, ecx, msr_content);
215 regs->eax = (u32)(msr_content >> 0);
216 regs->edx = (u32)(msr_content >> 32);
218 return HNDL_done;
219 }
221 static enum handler_return long_mode_do_msr_write(struct cpu_user_regs *regs)
222 {
223 u64 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
224 u32 ecx = regs->ecx;
225 struct vcpu *v = current;
226 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
227 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
229 HVM_DBG_LOG(DBG_LEVEL_0, "msr 0x%x content 0x%"PRIx64, ecx, msr_content);
231 switch ( ecx )
232 {
233 case MSR_EFER:
234 if ( hvm_set_efer(msr_content) )
235 goto exception_raised;
236 break;
238 case MSR_FS_BASE:
239 case MSR_GS_BASE:
240 case MSR_SHADOW_GS_BASE:
241 if ( !hvm_long_mode_enabled(v) )
242 goto gp_fault;
244 if ( !is_canonical_address(msr_content) )
245 goto uncanonical_address;
247 if ( ecx == MSR_FS_BASE )
248 __vmwrite(GUEST_FS_BASE, msr_content);
249 else if ( ecx == MSR_GS_BASE )
250 __vmwrite(GUEST_GS_BASE, msr_content);
251 else
252 {
253 v->arch.hvm_vmx.shadow_gs = msr_content;
254 wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
255 }
257 break;
259 case MSR_STAR:
260 WRITE_MSR(STAR);
262 case MSR_LSTAR:
263 if ( !is_canonical_address(msr_content) )
264 goto uncanonical_address;
265 WRITE_MSR(LSTAR);
267 case MSR_CSTAR:
268 if ( !is_canonical_address(msr_content) )
269 goto uncanonical_address;
270 v->arch.hvm_vmx.cstar = msr_content;
271 break;
273 case MSR_SYSCALL_MASK:
274 WRITE_MSR(SYSCALL_MASK);
276 default:
277 return HNDL_unhandled;
278 }
280 return HNDL_done;
282 uncanonical_address:
283 HVM_DBG_LOG(DBG_LEVEL_0, "Not cano address of msr write %x", ecx);
284 gp_fault:
285 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
286 exception_raised:
287 return HNDL_exception_raised;
288 }
290 /*
291 * To avoid MSR save/restore at every VM exit/entry time, we restore
292 * the x86_64 specific MSRs at domain switch time. Since these MSRs
293 * are not modified once set for para domains, we don't save them,
294 * but simply reset them to values set in percpu_traps_init().
295 */
296 static void vmx_restore_host_msrs(void)
297 {
298 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
299 int i;
301 while ( host_msr_state->flags )
302 {
303 i = find_first_set_bit(host_msr_state->flags);
304 wrmsrl(msr_index[i], host_msr_state->msrs[i]);
305 clear_bit(i, &host_msr_state->flags);
306 }
308 if ( cpu_has_nx && !(read_efer() & EFER_NX) )
309 write_efer(read_efer() | EFER_NX);
310 }
312 static void vmx_save_guest_msrs(struct vcpu *v)
313 {
314 /* MSR_SHADOW_GS_BASE may have been changed by swapgs instruction. */
315 rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
316 }
318 static void vmx_restore_guest_msrs(struct vcpu *v)
319 {
320 struct vmx_msr_state *guest_msr_state, *host_msr_state;
321 unsigned long guest_flags;
322 int i;
324 guest_msr_state = &v->arch.hvm_vmx.msr_state;
325 host_msr_state = &this_cpu(host_msr_state);
327 wrmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
329 guest_flags = guest_msr_state->flags;
331 while ( guest_flags )
332 {
333 i = find_first_set_bit(guest_flags);
335 HVM_DBG_LOG(DBG_LEVEL_2,
336 "restore guest's index %d msr %x with value %lx",
337 i, msr_index[i], guest_msr_state->msrs[i]);
338 set_bit(i, &host_msr_state->flags);
339 wrmsrl(msr_index[i], guest_msr_state->msrs[i]);
340 clear_bit(i, &guest_flags);
341 }
343 if ( (v->arch.hvm_vcpu.guest_efer ^ read_efer()) & (EFER_NX | EFER_SCE) )
344 {
345 HVM_DBG_LOG(DBG_LEVEL_2,
346 "restore guest's EFER with value %lx",
347 v->arch.hvm_vcpu.guest_efer);
348 write_efer((read_efer() & ~(EFER_NX | EFER_SCE)) |
349 (v->arch.hvm_vcpu.guest_efer & (EFER_NX | EFER_SCE)));
350 }
351 }
353 #else /* __i386__ */
355 #define vmx_save_host_msrs() ((void)0)
357 static void vmx_restore_host_msrs(void)
358 {
359 if ( cpu_has_nx && !(read_efer() & EFER_NX) )
360 write_efer(read_efer() | EFER_NX);
361 }
363 #define vmx_save_guest_msrs(v) ((void)0)
365 static void vmx_restore_guest_msrs(struct vcpu *v)
366 {
367 if ( (v->arch.hvm_vcpu.guest_efer ^ read_efer()) & EFER_NX )
368 {
369 HVM_DBG_LOG(DBG_LEVEL_2,
370 "restore guest's EFER with value %lx",
371 v->arch.hvm_vcpu.guest_efer);
372 write_efer((read_efer() & ~EFER_NX) |
373 (v->arch.hvm_vcpu.guest_efer & EFER_NX));
374 }
375 }
377 static enum handler_return long_mode_do_msr_read(struct cpu_user_regs *regs)
378 {
379 u64 msr_content = 0;
380 struct vcpu *v = current;
382 switch ( regs->ecx )
383 {
384 case MSR_EFER:
385 msr_content = v->arch.hvm_vcpu.guest_efer;
386 break;
388 default:
389 return HNDL_unhandled;
390 }
392 regs->eax = msr_content >> 0;
393 regs->edx = msr_content >> 32;
395 return HNDL_done;
396 }
398 static enum handler_return long_mode_do_msr_write(struct cpu_user_regs *regs)
399 {
400 u64 msr_content = regs->eax | ((u64)regs->edx << 32);
402 switch ( regs->ecx )
403 {
404 case MSR_EFER:
405 if ( hvm_set_efer(msr_content) )
406 return HNDL_exception_raised;
407 break;
409 default:
410 return HNDL_unhandled;
411 }
413 return HNDL_done;
414 }
416 #endif /* __i386__ */
418 static int vmx_guest_x86_mode(struct vcpu *v)
419 {
420 unsigned int cs_ar_bytes;
422 if ( unlikely(!(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE)) )
423 return 0;
424 if ( unlikely(guest_cpu_user_regs()->eflags & X86_EFLAGS_VM) )
425 return 1;
426 cs_ar_bytes = __vmread(GUEST_CS_AR_BYTES);
427 if ( hvm_long_mode_enabled(v) &&
428 likely(cs_ar_bytes & X86_SEG_AR_CS_LM_ACTIVE) )
429 return 8;
430 return (likely(cs_ar_bytes & X86_SEG_AR_DEF_OP_SIZE) ? 4 : 2);
431 }
433 static void vmx_save_dr(struct vcpu *v)
434 {
435 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
436 return;
438 /* Clear the DR dirty flag and re-enable intercepts for DR accesses. */
439 v->arch.hvm_vcpu.flag_dr_dirty = 0;
440 v->arch.hvm_vmx.exec_control |= CPU_BASED_MOV_DR_EXITING;
441 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
443 v->arch.guest_context.debugreg[0] = read_debugreg(0);
444 v->arch.guest_context.debugreg[1] = read_debugreg(1);
445 v->arch.guest_context.debugreg[2] = read_debugreg(2);
446 v->arch.guest_context.debugreg[3] = read_debugreg(3);
447 v->arch.guest_context.debugreg[6] = read_debugreg(6);
448 /* DR7 must be saved as it is used by vmx_restore_dr(). */
449 v->arch.guest_context.debugreg[7] = __vmread(GUEST_DR7);
450 }
452 static void __restore_debug_registers(struct vcpu *v)
453 {
454 if ( v->arch.hvm_vcpu.flag_dr_dirty )
455 return;
457 v->arch.hvm_vcpu.flag_dr_dirty = 1;
459 write_debugreg(0, v->arch.guest_context.debugreg[0]);
460 write_debugreg(1, v->arch.guest_context.debugreg[1]);
461 write_debugreg(2, v->arch.guest_context.debugreg[2]);
462 write_debugreg(3, v->arch.guest_context.debugreg[3]);
463 write_debugreg(6, v->arch.guest_context.debugreg[6]);
464 /* DR7 is loaded from the VMCS. */
465 }
467 /*
468 * DR7 is saved and restored on every vmexit. Other debug registers only
469 * need to be restored if their value is going to affect execution -- i.e.,
470 * if one of the breakpoints is enabled. So mask out all bits that don't
471 * enable some breakpoint functionality.
472 */
473 static void vmx_restore_dr(struct vcpu *v)
474 {
475 /* NB. __vmread() is not usable here, so we cannot read from the VMCS. */
476 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
477 __restore_debug_registers(v);
478 }
480 static void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
481 {
482 uint32_t ev;
484 vmx_vmcs_enter(v);
486 c->cr0 = v->arch.hvm_vcpu.guest_cr[0];
487 c->cr2 = v->arch.hvm_vcpu.guest_cr[2];
488 c->cr3 = v->arch.hvm_vcpu.guest_cr[3];
489 c->cr4 = v->arch.hvm_vcpu.guest_cr[4];
491 c->msr_efer = v->arch.hvm_vcpu.guest_efer;
493 c->sysenter_cs = __vmread(GUEST_SYSENTER_CS);
494 c->sysenter_esp = __vmread(GUEST_SYSENTER_ESP);
495 c->sysenter_eip = __vmread(GUEST_SYSENTER_EIP);
497 c->pending_event = 0;
498 c->error_code = 0;
499 if ( ((ev = __vmread(VM_ENTRY_INTR_INFO)) & INTR_INFO_VALID_MASK) &&
500 hvm_event_needs_reinjection((ev >> 8) & 7, ev & 0xff) )
501 {
502 c->pending_event = ev;
503 c->error_code = __vmread(VM_ENTRY_EXCEPTION_ERROR_CODE);
504 }
506 vmx_vmcs_exit(v);
507 }
509 static int vmx_restore_cr0_cr3(
510 struct vcpu *v, unsigned long cr0, unsigned long cr3)
511 {
512 unsigned long mfn = 0;
513 p2m_type_t p2mt;
515 if ( paging_mode_shadow(v->domain) )
516 {
517 if ( cr0 & X86_CR0_PG )
518 {
519 mfn = mfn_x(gfn_to_mfn(v->domain, cr3 >> PAGE_SHIFT, &p2mt));
520 if ( !p2m_is_ram(p2mt) || !get_page(mfn_to_page(mfn), v->domain) )
521 {
522 gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%lx\n", cr3);
523 return -EINVAL;
524 }
525 }
527 if ( hvm_paging_enabled(v) )
528 put_page(pagetable_get_page(v->arch.guest_table));
530 v->arch.guest_table = pagetable_from_pfn(mfn);
531 }
533 v->arch.hvm_vcpu.guest_cr[0] = cr0 | X86_CR0_ET;
534 v->arch.hvm_vcpu.guest_cr[3] = cr3;
536 return 0;
537 }
539 static int vmx_vmcs_restore(struct vcpu *v, struct hvm_hw_cpu *c)
540 {
541 int rc;
543 if ( c->pending_valid &&
544 ((c->pending_type == 1) || (c->pending_type > 6) ||
545 (c->pending_reserved != 0)) )
546 {
547 gdprintk(XENLOG_ERR, "Invalid pending event 0x%"PRIx32".\n",
548 c->pending_event);
549 return -EINVAL;
550 }
552 rc = vmx_restore_cr0_cr3(v, c->cr0, c->cr3);
553 if ( rc )
554 return rc;
556 vmx_vmcs_enter(v);
558 v->arch.hvm_vcpu.guest_cr[2] = c->cr2;
559 v->arch.hvm_vcpu.guest_cr[4] = c->cr4;
560 vmx_update_guest_cr(v, 0);
561 vmx_update_guest_cr(v, 2);
562 vmx_update_guest_cr(v, 4);
564 #ifdef HVM_DEBUG_SUSPEND
565 printk("%s: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
566 __func__, c->cr3, c->cr0, c->cr4);
567 #endif
569 v->arch.hvm_vcpu.guest_efer = c->msr_efer;
570 vmx_update_guest_efer(v);
572 __vmwrite(GUEST_SYSENTER_CS, c->sysenter_cs);
573 __vmwrite(GUEST_SYSENTER_ESP, c->sysenter_esp);
574 __vmwrite(GUEST_SYSENTER_EIP, c->sysenter_eip);
576 __vmwrite(GUEST_DR7, c->dr7);
578 vmx_vmcs_exit(v);
580 paging_update_paging_modes(v);
582 if ( c->pending_valid )
583 {
584 gdprintk(XENLOG_INFO, "Re-injecting 0x%"PRIx32", 0x%"PRIx32"\n",
585 c->pending_event, c->error_code);
587 if ( hvm_event_needs_reinjection(c->pending_type, c->pending_vector) )
588 {
589 vmx_vmcs_enter(v);
590 __vmwrite(VM_ENTRY_INTR_INFO, c->pending_event);
591 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, c->error_code);
592 vmx_vmcs_exit(v);
593 }
594 }
596 return 0;
597 }
599 #if defined(__x86_64__) && defined(HVM_DEBUG_SUSPEND)
600 static void dump_msr_state(struct vmx_msr_state *m)
601 {
602 int i = 0;
603 printk("**** msr state ****\n");
604 printk("shadow_gs=0x%lx, flags=0x%lx, msr_items:", m->shadow_gs, m->flags);
605 for ( i = 0; i < VMX_MSR_COUNT; i++ )
606 printk("0x%lx,", m->msrs[i]);
607 printk("\n");
608 }
609 #else
610 #define dump_msr_state(m) ((void)0)
611 #endif
613 static void vmx_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
614 {
615 #ifdef __x86_64__
616 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
617 unsigned long guest_flags = guest_state->flags;
619 data->shadow_gs = v->arch.hvm_vmx.shadow_gs;
620 data->msr_cstar = v->arch.hvm_vmx.cstar;
622 /* save msrs */
623 data->msr_flags = guest_flags;
624 data->msr_lstar = guest_state->msrs[VMX_INDEX_MSR_LSTAR];
625 data->msr_star = guest_state->msrs[VMX_INDEX_MSR_STAR];
626 data->msr_syscall_mask = guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
627 #endif
629 data->tsc = hvm_get_guest_time(v);
631 dump_msr_state(guest_state);
632 }
634 static void vmx_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
635 {
636 #ifdef __x86_64__
637 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
639 /* restore msrs */
640 guest_state->flags = data->msr_flags;
641 guest_state->msrs[VMX_INDEX_MSR_LSTAR] = data->msr_lstar;
642 guest_state->msrs[VMX_INDEX_MSR_STAR] = data->msr_star;
643 guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK] = data->msr_syscall_mask;
645 v->arch.hvm_vmx.cstar = data->msr_cstar;
646 v->arch.hvm_vmx.shadow_gs = data->shadow_gs;
647 #endif
649 hvm_set_guest_time(v, data->tsc);
651 dump_msr_state(guest_state);
652 }
655 static void vmx_save_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
656 {
657 vmx_save_cpu_state(v, ctxt);
658 vmx_vmcs_save(v, ctxt);
659 }
661 static int vmx_load_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
662 {
663 vmx_load_cpu_state(v, ctxt);
665 if ( vmx_vmcs_restore(v, ctxt) )
666 {
667 gdprintk(XENLOG_ERR, "vmx_vmcs restore failed!\n");
668 domain_crash(v->domain);
669 return -EINVAL;
670 }
672 return 0;
673 }
675 static void vmx_fpu_enter(struct vcpu *v)
676 {
677 setup_fpu(v);
678 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
679 v->arch.hvm_vmx.host_cr0 &= ~X86_CR0_TS;
680 __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
681 }
683 static void vmx_fpu_leave(struct vcpu *v)
684 {
685 ASSERT(!v->fpu_dirtied);
686 ASSERT(read_cr0() & X86_CR0_TS);
688 if ( !(v->arch.hvm_vmx.host_cr0 & X86_CR0_TS) )
689 {
690 v->arch.hvm_vmx.host_cr0 |= X86_CR0_TS;
691 __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
692 }
694 /*
695 * If the guest does not have TS enabled then we must cause and handle an
696 * exception on first use of the FPU. If the guest *does* have TS enabled
697 * then this is not necessary: no FPU activity can occur until the guest
698 * clears CR0.TS, and we will initialise the FPU when that happens.
699 */
700 if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
701 {
702 v->arch.hvm_vcpu.hw_cr[0] |= X86_CR0_TS;
703 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
704 __vm_set_bit(EXCEPTION_BITMAP, TRAP_no_device);
705 }
706 }
708 static void vmx_ctxt_switch_from(struct vcpu *v)
709 {
710 vmx_fpu_leave(v);
711 vmx_save_guest_msrs(v);
712 vmx_restore_host_msrs();
713 vmx_save_dr(v);
714 vpmu_save(v);
715 }
717 static void vmx_ctxt_switch_to(struct vcpu *v)
718 {
719 /* HOST_CR4 in VMCS is always mmu_cr4_features. Sync CR4 now. */
720 if ( unlikely(read_cr4() != mmu_cr4_features) )
721 write_cr4(mmu_cr4_features);
723 vmx_restore_guest_msrs(v);
724 vmx_restore_dr(v);
725 vpmu_load(v);
726 }
728 static void vmx_get_segment_register(struct vcpu *v, enum x86_segment seg,
729 struct segment_register *reg)
730 {
731 uint32_t attr = 0;
733 vmx_vmcs_enter(v);
735 switch ( seg )
736 {
737 case x86_seg_cs:
738 reg->sel = __vmread(GUEST_CS_SELECTOR);
739 reg->limit = __vmread(GUEST_CS_LIMIT);
740 reg->base = __vmread(GUEST_CS_BASE);
741 attr = __vmread(GUEST_CS_AR_BYTES);
742 break;
743 case x86_seg_ds:
744 reg->sel = __vmread(GUEST_DS_SELECTOR);
745 reg->limit = __vmread(GUEST_DS_LIMIT);
746 reg->base = __vmread(GUEST_DS_BASE);
747 attr = __vmread(GUEST_DS_AR_BYTES);
748 break;
749 case x86_seg_es:
750 reg->sel = __vmread(GUEST_ES_SELECTOR);
751 reg->limit = __vmread(GUEST_ES_LIMIT);
752 reg->base = __vmread(GUEST_ES_BASE);
753 attr = __vmread(GUEST_ES_AR_BYTES);
754 break;
755 case x86_seg_fs:
756 reg->sel = __vmread(GUEST_FS_SELECTOR);
757 reg->limit = __vmread(GUEST_FS_LIMIT);
758 reg->base = __vmread(GUEST_FS_BASE);
759 attr = __vmread(GUEST_FS_AR_BYTES);
760 break;
761 case x86_seg_gs:
762 reg->sel = __vmread(GUEST_GS_SELECTOR);
763 reg->limit = __vmread(GUEST_GS_LIMIT);
764 reg->base = __vmread(GUEST_GS_BASE);
765 attr = __vmread(GUEST_GS_AR_BYTES);
766 break;
767 case x86_seg_ss:
768 reg->sel = __vmread(GUEST_SS_SELECTOR);
769 reg->limit = __vmread(GUEST_SS_LIMIT);
770 reg->base = __vmread(GUEST_SS_BASE);
771 attr = __vmread(GUEST_SS_AR_BYTES);
772 break;
773 case x86_seg_tr:
774 reg->sel = __vmread(GUEST_TR_SELECTOR);
775 reg->limit = __vmread(GUEST_TR_LIMIT);
776 reg->base = __vmread(GUEST_TR_BASE);
777 attr = __vmread(GUEST_TR_AR_BYTES);
778 break;
779 case x86_seg_gdtr:
780 reg->limit = __vmread(GUEST_GDTR_LIMIT);
781 reg->base = __vmread(GUEST_GDTR_BASE);
782 break;
783 case x86_seg_idtr:
784 reg->limit = __vmread(GUEST_IDTR_LIMIT);
785 reg->base = __vmread(GUEST_IDTR_BASE);
786 break;
787 case x86_seg_ldtr:
788 reg->sel = __vmread(GUEST_LDTR_SELECTOR);
789 reg->limit = __vmread(GUEST_LDTR_LIMIT);
790 reg->base = __vmread(GUEST_LDTR_BASE);
791 attr = __vmread(GUEST_LDTR_AR_BYTES);
792 break;
793 default:
794 BUG();
795 }
797 vmx_vmcs_exit(v);
799 reg->attr.bytes = (attr & 0xff) | ((attr >> 4) & 0xf00);
800 /* Unusable flag is folded into Present flag. */
801 if ( attr & (1u<<16) )
802 reg->attr.fields.p = 0;
803 }
805 static void vmx_set_segment_register(struct vcpu *v, enum x86_segment seg,
806 struct segment_register *reg)
807 {
808 uint32_t attr;
810 attr = reg->attr.bytes;
811 attr = ((attr & 0xf00) << 4) | (attr & 0xff);
813 /* Not-present must mean unusable. */
814 if ( !reg->attr.fields.p )
815 attr |= (1u << 16);
817 vmx_vmcs_enter(v);
819 switch ( seg )
820 {
821 case x86_seg_cs:
822 __vmwrite(GUEST_CS_SELECTOR, reg->sel);
823 __vmwrite(GUEST_CS_LIMIT, reg->limit);
824 __vmwrite(GUEST_CS_BASE, reg->base);
825 __vmwrite(GUEST_CS_AR_BYTES, attr);
826 break;
827 case x86_seg_ds:
828 __vmwrite(GUEST_DS_SELECTOR, reg->sel);
829 __vmwrite(GUEST_DS_LIMIT, reg->limit);
830 __vmwrite(GUEST_DS_BASE, reg->base);
831 __vmwrite(GUEST_DS_AR_BYTES, attr);
832 break;
833 case x86_seg_es:
834 __vmwrite(GUEST_ES_SELECTOR, reg->sel);
835 __vmwrite(GUEST_ES_LIMIT, reg->limit);
836 __vmwrite(GUEST_ES_BASE, reg->base);
837 __vmwrite(GUEST_ES_AR_BYTES, attr);
838 break;
839 case x86_seg_fs:
840 __vmwrite(GUEST_FS_SELECTOR, reg->sel);
841 __vmwrite(GUEST_FS_LIMIT, reg->limit);
842 __vmwrite(GUEST_FS_BASE, reg->base);
843 __vmwrite(GUEST_FS_AR_BYTES, attr);
844 break;
845 case x86_seg_gs:
846 __vmwrite(GUEST_GS_SELECTOR, reg->sel);
847 __vmwrite(GUEST_GS_LIMIT, reg->limit);
848 __vmwrite(GUEST_GS_BASE, reg->base);
849 __vmwrite(GUEST_GS_AR_BYTES, attr);
850 break;
851 case x86_seg_ss:
852 __vmwrite(GUEST_SS_SELECTOR, reg->sel);
853 __vmwrite(GUEST_SS_LIMIT, reg->limit);
854 __vmwrite(GUEST_SS_BASE, reg->base);
855 __vmwrite(GUEST_SS_AR_BYTES, attr);
856 break;
857 case x86_seg_tr:
858 __vmwrite(GUEST_TR_SELECTOR, reg->sel);
859 __vmwrite(GUEST_TR_LIMIT, reg->limit);
860 __vmwrite(GUEST_TR_BASE, reg->base);
861 __vmwrite(GUEST_TR_AR_BYTES, attr);
862 break;
863 case x86_seg_gdtr:
864 __vmwrite(GUEST_GDTR_LIMIT, reg->limit);
865 __vmwrite(GUEST_GDTR_BASE, reg->base);
866 break;
867 case x86_seg_idtr:
868 __vmwrite(GUEST_IDTR_LIMIT, reg->limit);
869 __vmwrite(GUEST_IDTR_BASE, reg->base);
870 break;
871 case x86_seg_ldtr:
872 __vmwrite(GUEST_LDTR_SELECTOR, reg->sel);
873 __vmwrite(GUEST_LDTR_LIMIT, reg->limit);
874 __vmwrite(GUEST_LDTR_BASE, reg->base);
875 __vmwrite(GUEST_LDTR_AR_BYTES, attr);
876 break;
877 default:
878 BUG();
879 }
881 vmx_vmcs_exit(v);
882 }
884 static void vmx_set_tsc_offset(struct vcpu *v, u64 offset)
885 {
886 vmx_vmcs_enter(v);
887 __vmwrite(TSC_OFFSET, offset);
888 #if defined (__i386__)
889 __vmwrite(TSC_OFFSET_HIGH, offset >> 32);
890 #endif
891 vmx_vmcs_exit(v);
892 }
894 void do_nmi(struct cpu_user_regs *);
896 static void vmx_init_hypercall_page(struct domain *d, void *hypercall_page)
897 {
898 char *p;
899 int i;
901 for ( i = 0; i < (PAGE_SIZE / 32); i++ )
902 {
903 p = (char *)(hypercall_page + (i * 32));
904 *(u8 *)(p + 0) = 0xb8; /* mov imm32, %eax */
905 *(u32 *)(p + 1) = i;
906 *(u8 *)(p + 5) = 0x0f; /* vmcall */
907 *(u8 *)(p + 6) = 0x01;
908 *(u8 *)(p + 7) = 0xc1;
909 *(u8 *)(p + 8) = 0xc3; /* ret */
910 }
912 /* Don't support HYPERVISOR_iret at the moment */
913 *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
914 }
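/*
 * The loop above lays out one 32-byte stub per hypercall at
 * hypercall_page + i*32: "mov $i, %eax; vmcall; ret". Guests invoke
 * hypercall i by calling its stub rather than executing VMCALL directly.
 */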
916 static unsigned int vmx_get_interrupt_shadow(struct vcpu *v)
917 {
918 return __vmread(GUEST_INTERRUPTIBILITY_INFO);
919 }
921 static void vmx_set_interrupt_shadow(struct vcpu *v, unsigned int intr_shadow)
922 {
923 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, intr_shadow);
924 }
926 static void vmx_load_pdptrs(struct vcpu *v)
927 {
928 unsigned long cr3 = v->arch.hvm_vcpu.guest_cr[3], mfn;
929 uint64_t *guest_pdptrs;
930 p2m_type_t p2mt;
931 char *p;
933 /* EPT needs to load PDPTRS into VMCS for PAE. */
934 if ( !hvm_pae_enabled(v) || (v->arch.hvm_vcpu.guest_efer & EFER_LMA) )
935 return;
937 if ( cr3 & 0x1fUL )
938 goto crash;
940 mfn = mfn_x(gfn_to_mfn(v->domain, cr3 >> PAGE_SHIFT, &p2mt));
941 if ( !p2m_is_ram(p2mt) )
942 goto crash;
944 p = map_domain_page(mfn);
946 guest_pdptrs = (uint64_t *)(p + (cr3 & ~PAGE_MASK));
948 /*
949 * We do not check the PDPTRs for validity. The CPU will do this during
950 * vm entry, and we can handle the failure there and crash the guest.
951 * The only thing we could do better here is #GP instead.
952 */
954 vmx_vmcs_enter(v);
956 __vmwrite(GUEST_PDPTR0, guest_pdptrs[0]);
957 __vmwrite(GUEST_PDPTR1, guest_pdptrs[1]);
958 __vmwrite(GUEST_PDPTR2, guest_pdptrs[2]);
959 __vmwrite(GUEST_PDPTR3, guest_pdptrs[3]);
960 #ifdef CONFIG_X86_PAE
961 __vmwrite(GUEST_PDPTR0_HIGH, guest_pdptrs[0] >> 32);
962 __vmwrite(GUEST_PDPTR1_HIGH, guest_pdptrs[1] >> 32);
963 __vmwrite(GUEST_PDPTR2_HIGH, guest_pdptrs[2] >> 32);
964 __vmwrite(GUEST_PDPTR3_HIGH, guest_pdptrs[3] >> 32);
965 #endif
967 vmx_vmcs_exit(v);
969 unmap_domain_page(p);
970 return;
972 crash:
973 domain_crash(v->domain);
974 }
976 static void vmx_update_host_cr3(struct vcpu *v)
977 {
978 vmx_vmcs_enter(v);
979 __vmwrite(HOST_CR3, v->arch.cr3);
980 vmx_vmcs_exit(v);
981 }
983 static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr)
984 {
985 vmx_vmcs_enter(v);
987 switch ( cr )
988 {
989 case 0: {
990 unsigned long hw_cr0_mask =
991 X86_CR0_NE | X86_CR0_PG | X86_CR0_PE;
993 if ( paging_mode_shadow(v->domain) )
994 hw_cr0_mask |= X86_CR0_WP;
996 if ( paging_mode_hap(v->domain) )
997 {
998 /* We manage GUEST_CR3 when guest CR0.PE is zero. */
999 uint32_t cr3_ctls = (CPU_BASED_CR3_LOAD_EXITING |
1000 CPU_BASED_CR3_STORE_EXITING);
1001 v->arch.hvm_vmx.exec_control &= ~cr3_ctls;
1002 if ( !hvm_paging_enabled(v) )
1003 v->arch.hvm_vmx.exec_control |= cr3_ctls;
1004 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
1006 /* Changing CR0.PE can change some bits in real CR4. */
1007 vmx_update_guest_cr(v, 4);
1008 }
1010 if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
1011 {
1012 if ( v != current )
1013 hw_cr0_mask |= X86_CR0_TS;
1014 else if ( v->arch.hvm_vcpu.hw_cr[0] & X86_CR0_TS )
1015 vmx_fpu_enter(v);
1016 }
1018 v->arch.hvm_vmx.vmxemul &= ~VMXEMUL_REALMODE;
1019 if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) )
1020 v->arch.hvm_vmx.vmxemul |= VMXEMUL_REALMODE;
1022 v->arch.hvm_vcpu.hw_cr[0] =
1023 v->arch.hvm_vcpu.guest_cr[0] | hw_cr0_mask;
1024 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
1025 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[0]);
1026 break;
1027 }
1028 case 2:
1029 /* CR2 is updated in exit stub. */
1030 break;
1031 case 3:
1032 if ( paging_mode_hap(v->domain) )
1033 {
1034 if ( !hvm_paging_enabled(v) )
1035 v->arch.hvm_vcpu.hw_cr[3] =
1036 v->domain->arch.hvm_domain.params[HVM_PARAM_IDENT_PT];
1037 vmx_load_pdptrs(v);
1038 }
1040 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr[3]);
1041 vpid_sync_vcpu_all(v);
1042 break;
1043 case 4:
1044 v->arch.hvm_vcpu.hw_cr[4] = HVM_CR4_HOST_MASK;
1045 if ( paging_mode_hap(v->domain) )
1046 v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_PAE;
1047 v->arch.hvm_vcpu.hw_cr[4] |= v->arch.hvm_vcpu.guest_cr[4];
1048 if ( paging_mode_hap(v->domain) && !hvm_paging_enabled(v) )
1049 {
1050 v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_PSE;
1051 v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_PAE;
1052 }
1053 __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
1054 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[4]);
1055 break;
1056 default:
1057 BUG();
1058 }
1060 vmx_vmcs_exit(v);
1061 }
1063 static void vmx_update_guest_efer(struct vcpu *v)
1064 {
1065 #ifdef __x86_64__
1066 unsigned long vm_entry_value;
1068 vmx_vmcs_enter(v);
1070 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
1071 if ( v->arch.hvm_vcpu.guest_efer & EFER_LMA )
1072 vm_entry_value |= VM_ENTRY_IA32E_MODE;
1073 else
1074 vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
1075 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1077 vmx_vmcs_exit(v);
1078 #endif
1080 if ( v == current )
1081 write_efer((read_efer() & ~(EFER_NX|EFER_SCE)) |
1082 (v->arch.hvm_vcpu.guest_efer & (EFER_NX|EFER_SCE)));
1083 }
1085 static void vmx_flush_guest_tlbs(void)
1086 {
1087 /*
1088 * If VPID (i.e. tagged TLB support) is not enabled, the fact that
1089 * we're in Xen at all means any guest will have a clean TLB when
1090 * it's next run, because VMRESUME will flush it for us.
1091 *
1092 * If enabled, we invalidate all translations associated with all
1093 * VPID values.
1094 */
1095 vpid_sync_all();
1096 }
1098 static void __ept_sync_domain(void *info)
1099 {
1100 struct domain *d = info;
1101 __invept(1, d->arch.hvm_domain.vmx.ept_control.eptp, 0);
1102 }
1104 void ept_sync_domain(struct domain *d)
1105 {
1106 /* Only if using EPT and this domain has some VCPUs to dirty. */
1107 if ( d->arch.hvm_domain.hap_enabled && d->vcpu[0] )
1108 on_each_cpu(__ept_sync_domain, d, 1, 1);
1109 }
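/*
 * INVEPT only invalidates EPT-derived translations on the processor that
 * executes it, so ept_sync_domain() broadcasts __ept_sync_domain() to every
 * CPU with on_each_cpu().
 */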
1111 static void __vmx_inject_exception(
1112 struct vcpu *v, int trap, int type, int error_code)
1114 unsigned long intr_fields;
1116 /*
1117 * NB. Callers do not need to worry about clearing STI/MOV-SS blocking:
1118 * "If the VM entry is injecting, there is no blocking by STI or by
1119 * MOV SS following the VM entry, regardless of the contents of the
1120 * interruptibility-state field [in the guest-state area before the
1121 * VM entry]", PRM Vol. 3, 22.6.1 (Interruptibility State).
1122 */
1124 intr_fields = (INTR_INFO_VALID_MASK | (type<<8) | trap);
1125 if ( error_code != HVM_DELIVER_NO_ERROR_CODE ) {
1126 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
1127 intr_fields |= INTR_INFO_DELIVER_CODE_MASK;
1130 __vmwrite(VM_ENTRY_INTR_INFO, intr_fields);
1132 if ( trap == TRAP_page_fault )
1133 HVMTRACE_2D(PF_INJECT, v, v->arch.hvm_vcpu.guest_cr[2], error_code);
1134 else
1135 HVMTRACE_2D(INJ_EXC, v, trap, error_code);
1138 void vmx_inject_hw_exception(struct vcpu *v, int trap, int error_code)
1140 unsigned long intr_info = __vmread(VM_ENTRY_INTR_INFO);
1142 if ( unlikely(intr_info & INTR_INFO_VALID_MASK) &&
1143 (((intr_info >> 8) & 7) == X86_EVENTTYPE_HW_EXCEPTION) )
1145 trap = hvm_combine_hw_exceptions((uint8_t)intr_info, trap);
1146 if ( trap == TRAP_double_fault )
1147 error_code = 0;
1150 __vmx_inject_exception(v, trap, X86_EVENTTYPE_HW_EXCEPTION, error_code);
1153 void vmx_inject_extint(struct vcpu *v, int trap)
1155 __vmx_inject_exception(v, trap, X86_EVENTTYPE_EXT_INTR,
1156 HVM_DELIVER_NO_ERROR_CODE);
1159 void vmx_inject_nmi(struct vcpu *v)
1161 __vmx_inject_exception(v, 2, X86_EVENTTYPE_NMI,
1162 HVM_DELIVER_NO_ERROR_CODE);
1165 static void vmx_inject_exception(
1166 unsigned int trapnr, int errcode, unsigned long cr2)
1168 struct vcpu *curr = current;
1170 vmx_inject_hw_exception(curr, trapnr, errcode);
1172 if ( trapnr == TRAP_page_fault )
1173 curr->arch.hvm_vcpu.guest_cr[2] = cr2;
1175 if ( (trapnr == TRAP_debug) &&
1176 (guest_cpu_user_regs()->eflags & X86_EFLAGS_TF) )
1178 __restore_debug_registers(curr);
1179 write_debugreg(6, read_debugreg(6) | 0x4000);
1183 static int vmx_event_pending(struct vcpu *v)
1185 ASSERT(v == current);
1186 return (__vmread(VM_ENTRY_INTR_INFO) & INTR_INFO_VALID_MASK);
1189 static int vmx_do_pmu_interrupt(struct cpu_user_regs *regs)
1191 return vpmu_do_interrupt(regs);
1194 static struct hvm_function_table vmx_function_table = {
1195 .name = "VMX",
1196 .domain_initialise = vmx_domain_initialise,
1197 .domain_destroy = vmx_domain_destroy,
1198 .vcpu_initialise = vmx_vcpu_initialise,
1199 .vcpu_destroy = vmx_vcpu_destroy,
1200 .save_cpu_ctxt = vmx_save_vmcs_ctxt,
1201 .load_cpu_ctxt = vmx_load_vmcs_ctxt,
1202 .get_interrupt_shadow = vmx_get_interrupt_shadow,
1203 .set_interrupt_shadow = vmx_set_interrupt_shadow,
1204 .guest_x86_mode = vmx_guest_x86_mode,
1205 .get_segment_register = vmx_get_segment_register,
1206 .set_segment_register = vmx_set_segment_register,
1207 .update_host_cr3 = vmx_update_host_cr3,
1208 .update_guest_cr = vmx_update_guest_cr,
1209 .update_guest_efer = vmx_update_guest_efer,
1210 .flush_guest_tlbs = vmx_flush_guest_tlbs,
1211 .set_tsc_offset = vmx_set_tsc_offset,
1212 .inject_exception = vmx_inject_exception,
1213 .init_hypercall_page = vmx_init_hypercall_page,
1214 .event_pending = vmx_event_pending,
1215 .do_pmu_interrupt = vmx_do_pmu_interrupt,
1216 .cpu_up = vmx_cpu_up,
1217 .cpu_down = vmx_cpu_down,
1218 .cpuid_intercept = vmx_cpuid_intercept,
1219 .wbinvd_intercept = vmx_wbinvd_intercept,
1220 .fpu_dirty_intercept = vmx_fpu_dirty_intercept,
1221 .msr_read_intercept = vmx_msr_read_intercept,
1222 .msr_write_intercept = vmx_msr_write_intercept,
1223 .invlpg_intercept = vmx_invlpg_intercept
1224 };
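/*
 * vmx_function_table is the VMX backend for the generic HVM layer: all
 * architecture-neutral HVM code dispatches through these hooks, so new
 * VMX-specific handlers are registered here.
 */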
1226 static unsigned long *vpid_bitmap;
1227 #define VPID_BITMAP_SIZE ((1u << VMCS_VPID_WIDTH) / MAX_VIRT_CPUS)
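/*
 * VPIDs are allocated to domains in blocks of MAX_VIRT_CPUS, so the bitmap
 * needs one bit per block: (1 << VMCS_VPID_WIDTH) possible VPID values
 * divided by MAX_VIRT_CPUS. Bit 0 (VPID 0) is reserved for Xen itself; see
 * vmx_alloc_vpid()/vmx_free_vpid() below.
 */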
1229 void start_vmx(void)
1230 {
1231 static int bootstrapped;
1233 vmx_save_host_msrs();
1235 if ( bootstrapped )
1236 {
1237 if ( hvm_enabled && !vmx_cpu_up() )
1238 {
1239 printk("VMX: FATAL: failed to initialise CPU%d!\n",
1240 smp_processor_id());
1241 BUG();
1242 }
1243 return;
1244 }
1246 bootstrapped = 1;
1248 /* Xen does not fill x86_capability words except 0. */
1249 boot_cpu_data.x86_capability[4] = cpuid_ecx(1);
1251 if ( !test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability) )
1252 return;
1254 set_in_cr4(X86_CR4_VMXE);
1256 if ( !vmx_cpu_up() )
1257 {
1258 printk("VMX: failed to initialise.\n");
1259 return;
1260 }
1262 if ( cpu_has_vmx_ept )
1263 {
1264 printk("VMX: EPT is available.\n");
1265 vmx_function_table.hap_supported = 1;
1266 }
1268 if ( cpu_has_vmx_vpid )
1269 {
1270 printk("VMX: VPID is available.\n");
1272 vpid_bitmap = xmalloc_array(
1273 unsigned long, BITS_TO_LONGS(VPID_BITMAP_SIZE));
1274 BUG_ON(vpid_bitmap == NULL);
1275 memset(vpid_bitmap, 0, BITS_TO_LONGS(VPID_BITMAP_SIZE) * sizeof(long));
1277 /* VPID 0 is used by VMX root mode (the hypervisor). */
1278 __set_bit(0, vpid_bitmap);
1279 }
1281 setup_vmcs_dump();
1283 hvm_enable(&vmx_function_table);
1284 }
1286 /*
1287 * Not all cases receive valid value in the VM-exit instruction length field.
1288 * Callers must know what they're doing!
1289 */
1290 static int __get_instruction_length(void)
1292 int len;
1293 len = __vmread(VM_EXIT_INSTRUCTION_LEN); /* Safe: callers audited */
1294 BUG_ON((len < 1) || (len > 15));
1295 return len;
1298 static void __update_guest_eip(unsigned long inst_len)
1300 struct cpu_user_regs *regs = guest_cpu_user_regs();
1301 unsigned long x;
1303 regs->eip += inst_len;
1304 regs->eflags &= ~X86_EFLAGS_RF;
1306 x = __vmread(GUEST_INTERRUPTIBILITY_INFO);
1307 if ( x & (VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS) )
1309 x &= ~(VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS);
1310 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, x);
1313 if ( regs->eflags & X86_EFLAGS_TF )
1314 vmx_inject_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE, 0);
1317 static void vmx_fpu_dirty_intercept(void)
1319 struct vcpu *curr = current;
1321 vmx_fpu_enter(curr);
1323 /* Disable TS in guest CR0 unless the guest wants the exception too. */
1324 if ( !(curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
1326 curr->arch.hvm_vcpu.hw_cr[0] &= ~X86_CR0_TS;
1327 __vmwrite(GUEST_CR0, curr->arch.hvm_vcpu.hw_cr[0]);
1331 #define bitmaskof(idx) (1U << ((idx) & 31))
1332 static void vmx_cpuid_intercept(
1333 unsigned int *eax, unsigned int *ebx,
1334 unsigned int *ecx, unsigned int *edx)
1336 unsigned int input = *eax;
1337 unsigned int count = *ecx;
1339 hvm_cpuid(input, eax, ebx, ecx, edx);
1341 switch ( input )
1343 case 0x00000001:
1344 /* Mask AMD-only features. */
1345 *ecx &= ~(bitmaskof(X86_FEATURE_POPCNT));
1346 break;
1348 case 0x00000004:
1349 cpuid_count(input, count, eax, ebx, ecx, edx);
1350 *eax &= 0x3FFF; /* one core */
1351 break;
1353 case 0x00000006:
1354 case 0x00000009:
1355 *eax = *ebx = *ecx = *edx = 0;
1356 break;
1358 case 0x80000001:
1359 /* Only a few features are advertised in Intel's 0x80000001. */
1360 *ecx &= (bitmaskof(X86_FEATURE_LAHF_LM));
1361 *edx &= (bitmaskof(X86_FEATURE_NX) |
1362 bitmaskof(X86_FEATURE_LM) |
1363 bitmaskof(X86_FEATURE_SYSCALL));
1364 break;
1367 HVMTRACE_3D(CPUID, current, input,
1368 ((uint64_t)*eax << 32) | *ebx, ((uint64_t)*ecx << 32) | *edx);
1371 static void vmx_do_cpuid(struct cpu_user_regs *regs)
1373 unsigned int eax, ebx, ecx, edx;
1375 eax = regs->eax;
1376 ebx = regs->ebx;
1377 ecx = regs->ecx;
1378 edx = regs->edx;
1380 vmx_cpuid_intercept(&eax, &ebx, &ecx, &edx);
1382 regs->eax = eax;
1383 regs->ebx = ebx;
1384 regs->ecx = ecx;
1385 regs->edx = edx;
1388 static void vmx_dr_access(unsigned long exit_qualification,
1389 struct cpu_user_regs *regs)
1391 struct vcpu *v = current;
1393 HVMTRACE_0D(DR_WRITE, v);
1395 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
1396 __restore_debug_registers(v);
1398 /* Allow guest direct access to DR registers */
1399 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MOV_DR_EXITING;
1400 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
1403 static void vmx_invlpg_intercept(unsigned long vaddr)
1405 struct vcpu *curr = current;
1406 HVMTRACE_2D(INVLPG, curr, /*invlpga=*/ 0, vaddr);
1407 paging_invlpg(curr, vaddr);
1410 #define CASE_SET_REG(REG, reg) \
1411 case VMX_CONTROL_REG_ACCESS_GPR_ ## REG: regs->reg = value; break
1412 #define CASE_GET_REG(REG, reg) \
1413 case VMX_CONTROL_REG_ACCESS_GPR_ ## REG: value = regs->reg; break
1415 #define CASE_EXTEND_SET_REG \
1416 CASE_EXTEND_REG(S)
1417 #define CASE_EXTEND_GET_REG \
1418 CASE_EXTEND_REG(G)
1420 #ifdef __i386__
1421 #define CASE_EXTEND_REG(T)
1422 #else
1423 #define CASE_EXTEND_REG(T) \
1424 CASE_ ## T ## ET_REG(R8, r8); \
1425 CASE_ ## T ## ET_REG(R9, r9); \
1426 CASE_ ## T ## ET_REG(R10, r10); \
1427 CASE_ ## T ## ET_REG(R11, r11); \
1428 CASE_ ## T ## ET_REG(R12, r12); \
1429 CASE_ ## T ## ET_REG(R13, r13); \
1430 CASE_ ## T ## ET_REG(R14, r14); \
1431 CASE_ ## T ## ET_REG(R15, r15)
1432 #endif
1434 static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
1436 unsigned long value;
1437 struct vcpu *v = current;
1438 struct vlapic *vlapic = vcpu_vlapic(v);
1440 switch ( gp )
1442 CASE_GET_REG(EAX, eax);
1443 CASE_GET_REG(ECX, ecx);
1444 CASE_GET_REG(EDX, edx);
1445 CASE_GET_REG(EBX, ebx);
1446 CASE_GET_REG(EBP, ebp);
1447 CASE_GET_REG(ESI, esi);
1448 CASE_GET_REG(EDI, edi);
1449 CASE_GET_REG(ESP, esp);
1450 CASE_EXTEND_GET_REG;
1451 default:
1452 gdprintk(XENLOG_ERR, "invalid gp: %d\n", gp);
1453 goto exit_and_crash;
1456 HVMTRACE_2D(CR_WRITE, v, cr, value);
1458 HVM_DBG_LOG(DBG_LEVEL_1, "CR%d, value = %lx", cr, value);
1460 switch ( cr )
1462 case 0:
1463 return !hvm_set_cr0(value);
1465 case 3:
1466 return !hvm_set_cr3(value);
1468 case 4:
1469 return !hvm_set_cr4(value);
1471 case 8:
1472 vlapic_set_reg(vlapic, APIC_TASKPRI, ((value & 0x0F) << 4));
1473 break;
1475 default:
1476 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
1477 goto exit_and_crash;
1480 return 1;
1482 exit_and_crash:
1483 domain_crash(v->domain);
1484 return 0;
1487 /*
1488 * Read from control registers. CR0 and CR4 are read from the shadow.
1489 */
1490 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
1492 unsigned long value = 0;
1493 struct vcpu *v = current;
1494 struct vlapic *vlapic = vcpu_vlapic(v);
1496 switch ( cr )
1498 case 3:
1499 value = (unsigned long)v->arch.hvm_vcpu.guest_cr[3];
1500 break;
1501 case 8:
1502 value = (unsigned long)vlapic_get_reg(vlapic, APIC_TASKPRI);
1503 value = (value & 0xF0) >> 4;
1504 break;
1505 default:
1506 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
1507 domain_crash(v->domain);
1508 break;
1511 switch ( gp ) {
1512 CASE_SET_REG(EAX, eax);
1513 CASE_SET_REG(ECX, ecx);
1514 CASE_SET_REG(EDX, edx);
1515 CASE_SET_REG(EBX, ebx);
1516 CASE_SET_REG(EBP, ebp);
1517 CASE_SET_REG(ESI, esi);
1518 CASE_SET_REG(EDI, edi);
1519 CASE_SET_REG(ESP, esp);
1520 CASE_EXTEND_SET_REG;
1521 default:
1522 printk("invalid gp: %d\n", gp);
1523 domain_crash(v->domain);
1524 break;
1527 HVMTRACE_2D(CR_READ, v, cr, value);
1529 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR%d, value = %lx", cr, value);
1532 static int vmx_cr_access(unsigned long exit_qualification,
1533 struct cpu_user_regs *regs)
1535 unsigned int gp, cr;
1536 unsigned long value;
1537 struct vcpu *v = current;
1539 switch ( exit_qualification & VMX_CONTROL_REG_ACCESS_TYPE )
1541 case VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR:
1542 gp = exit_qualification & VMX_CONTROL_REG_ACCESS_GPR;
1543 cr = exit_qualification & VMX_CONTROL_REG_ACCESS_NUM;
1544 return mov_to_cr(gp, cr, regs);
1545 case VMX_CONTROL_REG_ACCESS_TYPE_MOV_FROM_CR:
1546 gp = exit_qualification & VMX_CONTROL_REG_ACCESS_GPR;
1547 cr = exit_qualification & VMX_CONTROL_REG_ACCESS_NUM;
1548 mov_from_cr(cr, gp, regs);
1549 break;
1550 case VMX_CONTROL_REG_ACCESS_TYPE_CLTS:
1551 v->arch.hvm_vcpu.guest_cr[0] &= ~X86_CR0_TS;
1552 vmx_update_guest_cr(v, 0);
1553 HVMTRACE_0D(CLTS, current);
1554 break;
1555 case VMX_CONTROL_REG_ACCESS_TYPE_LMSW:
1556 value = v->arch.hvm_vcpu.guest_cr[0];
1557 value = (value & ~0xFFFF) | ((exit_qualification >> 16) & 0xFFFF);
1558 HVMTRACE_1D(LMSW, current, value);
1559 return !hvm_set_cr0(value);
1560 default:
1561 BUG();
1564 return 1;
1567 static const struct lbr_info {
1568 u32 base, count;
1569 } p4_lbr[] = {
1570 { MSR_P4_LER_FROM_LIP, 1 },
1571 { MSR_P4_LER_TO_LIP, 1 },
1572 { MSR_P4_LASTBRANCH_TOS, 1 },
1573 { MSR_P4_LASTBRANCH_0_FROM_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO },
1574 { MSR_P4_LASTBRANCH_0_TO_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO },
1575 { 0, 0 }
1576 }, c2_lbr[] = {
1577 { MSR_IA32_LASTINTFROMIP, 1 },
1578 { MSR_IA32_LASTINTTOIP, 1 },
1579 { MSR_C2_LASTBRANCH_TOS, 1 },
1580 { MSR_C2_LASTBRANCH_0_FROM_IP, NUM_MSR_C2_LASTBRANCH_FROM_TO },
1581 { MSR_C2_LASTBRANCH_0_TO_IP, NUM_MSR_C2_LASTBRANCH_FROM_TO },
1582 { 0, 0 }
1583 #ifdef __i386__
1584 }, pm_lbr[] = {
1585 { MSR_IA32_LASTINTFROMIP, 1 },
1586 { MSR_IA32_LASTINTTOIP, 1 },
1587 { MSR_PM_LASTBRANCH_TOS, 1 },
1588 { MSR_PM_LASTBRANCH_0, NUM_MSR_PM_LASTBRANCH },
1589 { 0, 0 }
1590 #endif
1591 };
1593 static const struct lbr_info *last_branch_msr_get(void)
1595 switch ( boot_cpu_data.x86 )
1597 case 6:
1598 switch ( boot_cpu_data.x86_model )
1600 #ifdef __i386__
1601 /* PentiumM */
1602 case 9: case 13:
1603 /* Core Solo/Duo */
1604 case 14:
1605 return pm_lbr;
1606 break;
1607 #endif
1608 /* Core2 Duo */
1609 case 15:
1610 return c2_lbr;
1611 break;
1613 break;
1615 case 15:
1616 switch ( boot_cpu_data.x86_model )
1618 /* Pentium4/Xeon with em64t */
1619 case 3: case 4: case 6:
1620 return p4_lbr;
1621 break;
1623 break;
1626 return NULL;
1629 static int is_last_branch_msr(u32 ecx)
1631 const struct lbr_info *lbr = last_branch_msr_get();
1633 if ( lbr == NULL )
1634 return 0;
1636 for ( ; lbr->count; lbr++ )
1637 if ( (ecx >= lbr->base) && (ecx < (lbr->base + lbr->count)) )
1638 return 1;
1640 return 0;
1643 static int vmx_msr_read_intercept(struct cpu_user_regs *regs)
1645 u64 msr_content = 0;
1646 u32 ecx = regs->ecx, eax, edx;
1647 struct vcpu *v = current;
1648 int index;
1649 u64 *var_range_base = (u64*)v->arch.hvm_vcpu.mtrr.var_ranges;
1650 u64 *fixed_range_base = (u64*)v->arch.hvm_vcpu.mtrr.fixed_ranges;
1652 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x", ecx);
1654 switch ( ecx )
1656 case MSR_IA32_TSC:
1657 msr_content = hvm_get_guest_time(v);
1658 break;
1659 case MSR_IA32_SYSENTER_CS:
1660 msr_content = (u32)__vmread(GUEST_SYSENTER_CS);
1661 break;
1662 case MSR_IA32_SYSENTER_ESP:
1663 msr_content = __vmread(GUEST_SYSENTER_ESP);
1664 break;
1665 case MSR_IA32_SYSENTER_EIP:
1666 msr_content = __vmread(GUEST_SYSENTER_EIP);
1667 break;
1668 case MSR_IA32_APICBASE:
1669 msr_content = vcpu_vlapic(v)->hw.apic_base_msr;
1670 break;
1671 case MSR_IA32_CR_PAT:
1672 msr_content = v->arch.hvm_vcpu.pat_cr;
1673 break;
1674 case MSR_MTRRcap:
1675 msr_content = v->arch.hvm_vcpu.mtrr.mtrr_cap;
1676 break;
1677 case MSR_MTRRdefType:
1678 msr_content = v->arch.hvm_vcpu.mtrr.def_type
1679 | (v->arch.hvm_vcpu.mtrr.enabled << 10);
1680 break;
1681 case MSR_MTRRfix64K_00000:
1682 msr_content = fixed_range_base[0];
1683 break;
1684 case MSR_MTRRfix16K_80000:
1685 case MSR_MTRRfix16K_A0000:
1686 index = regs->ecx - MSR_MTRRfix16K_80000;
1687 msr_content = fixed_range_base[index + 1];
1688 break;
1689 case MSR_MTRRfix4K_C0000...MSR_MTRRfix4K_F8000:
1690 index = regs->ecx - MSR_MTRRfix4K_C0000;
1691 msr_content = fixed_range_base[index + 3];
1692 break;
1693 case MSR_IA32_MTRR_PHYSBASE0...MSR_IA32_MTRR_PHYSMASK7:
1694 index = regs->ecx - MSR_IA32_MTRR_PHYSBASE0;
1695 msr_content = var_range_base[index];
1696 break;
1697 case MSR_IA32_DEBUGCTLMSR:
1698 msr_content = __vmread(GUEST_IA32_DEBUGCTL);
1699 #ifdef __i386__
1700 msr_content |= (u64)__vmread(GUEST_IA32_DEBUGCTL_HIGH) << 32;
1701 #endif
1702 break;
1703 case MSR_IA32_VMX_BASIC...MSR_IA32_VMX_PROCBASED_CTLS2:
1704 goto gp_fault;
1705 case MSR_IA32_MCG_CAP:
1706 case MSR_IA32_MCG_STATUS:
1707 case MSR_IA32_MC0_STATUS:
1708 case MSR_IA32_MC1_STATUS:
1709 case MSR_IA32_MC2_STATUS:
1710 case MSR_IA32_MC3_STATUS:
1711 case MSR_IA32_MC4_STATUS:
1712 case MSR_IA32_MC5_STATUS:
1713 /* No point in letting the guest see real MCEs */
1714 msr_content = 0;
1715 break;
1716 case MSR_IA32_MISC_ENABLE:
1717 rdmsrl(MSR_IA32_MISC_ENABLE, msr_content);
1718 /* Debug Trace Store is not supported. */
1719 msr_content |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
1720 MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL;
1721 break;
1722 default:
1723 if ( vpmu_do_rdmsr(regs) )
1724 goto done;
1725 switch ( long_mode_do_msr_read(regs) )
1727 case HNDL_unhandled:
1728 break;
1729 case HNDL_exception_raised:
1730 return X86EMUL_EXCEPTION;
1731 case HNDL_done:
1732 goto done;
1735 if ( vmx_read_guest_msr(v, ecx, &msr_content) == 0 )
1736 break;
1738 if ( is_last_branch_msr(ecx) )
1740 msr_content = 0;
1741 break;
1744 if ( rdmsr_hypervisor_regs(ecx, &eax, &edx) ||
1745 rdmsr_safe(ecx, eax, edx) == 0 )
1747 regs->eax = eax;
1748 regs->edx = edx;
1749 goto done;
1752 goto gp_fault;
1755 regs->eax = msr_content & 0xFFFFFFFF;
1756 regs->edx = msr_content >> 32;
1758 done:
1759 hvmtrace_msr_read(v, ecx, msr_content);
1760 HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx",
1761 ecx, (unsigned long)regs->eax,
1762 (unsigned long)regs->edx);
1763 return X86EMUL_OKAY;
1765 gp_fault:
1766 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
1767 return X86EMUL_EXCEPTION;
1770 static int vmx_alloc_vlapic_mapping(struct domain *d)
1771 {
1772 void *apic_va;
1774 if ( !cpu_has_vmx_virtualize_apic_accesses )
1775 return 0;
1777 apic_va = alloc_xenheap_page();
1778 if ( apic_va == NULL )
1779 return -ENOMEM;
1780 share_xen_page_with_guest(virt_to_page(apic_va), d, XENSHARE_writable);
1781 set_mmio_p2m_entry(
1782 d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), _mfn(virt_to_mfn(apic_va)));
1783 d->arch.hvm_domain.vmx.apic_access_mfn = virt_to_mfn(apic_va);
1785 return 0;
1786 }
1788 static void vmx_free_vlapic_mapping(struct domain *d)
1789 {
1790 unsigned long mfn = d->arch.hvm_domain.vmx.apic_access_mfn;
1791 if ( mfn != 0 )
1792 free_xenheap_page(mfn_to_virt(mfn));
1793 }
1795 static int vmx_alloc_vpid(struct domain *d)
1796 {
1797 int idx;
1799 if ( !cpu_has_vmx_vpid )
1800 return 0;
1802 do {
1803 idx = find_first_zero_bit(vpid_bitmap, VPID_BITMAP_SIZE);
1804 if ( idx >= VPID_BITMAP_SIZE )
1805 {
1806 dprintk(XENLOG_WARNING, "VMX VPID space exhausted.\n");
1807 return -EBUSY;
1808 }
1809 }
1810 while ( test_and_set_bit(idx, vpid_bitmap) );
1812 d->arch.hvm_domain.vmx.vpid_base = idx * MAX_VIRT_CPUS;
1813 return 0;
1814 }
1816 static void vmx_free_vpid(struct domain *d)
1817 {
1818 if ( !cpu_has_vmx_vpid )
1819 return;
1821 clear_bit(d->arch.hvm_domain.vmx.vpid_base / MAX_VIRT_CPUS, vpid_bitmap);
1822 }
1824 static void vmx_install_vlapic_mapping(struct vcpu *v)
1826 paddr_t virt_page_ma, apic_page_ma;
1828 if ( !cpu_has_vmx_virtualize_apic_accesses )
1829 return;
1831 virt_page_ma = page_to_maddr(vcpu_vlapic(v)->regs_page);
1832 apic_page_ma = v->domain->arch.hvm_domain.vmx.apic_access_mfn;
1833 apic_page_ma <<= PAGE_SHIFT;
1835 vmx_vmcs_enter(v);
1836 __vmwrite(VIRTUAL_APIC_PAGE_ADDR, virt_page_ma);
1837 __vmwrite(APIC_ACCESS_ADDR, apic_page_ma);
1838 vmx_vmcs_exit(v);
1841 void vmx_vlapic_msr_changed(struct vcpu *v)
1843 struct vlapic *vlapic = vcpu_vlapic(v);
1844 uint32_t ctl;
1846 if ( !cpu_has_vmx_virtualize_apic_accesses )
1847 return;
1849 vmx_vmcs_enter(v);
1850 ctl = __vmread(SECONDARY_VM_EXEC_CONTROL);
1851 ctl &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1852 if ( !vlapic_hw_disabled(vlapic) &&
1853 (vlapic_base_address(vlapic) == APIC_DEFAULT_PHYS_BASE) )
1854 ctl |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1855 __vmwrite(SECONDARY_VM_EXEC_CONTROL, ctl);
1856 vmx_vmcs_exit(v);
1859 extern bool_t mtrr_var_range_msr_set(struct mtrr_state *v,
1860 u32 msr, u64 msr_content);
1861 extern bool_t mtrr_fix_range_msr_set(struct mtrr_state *v,
1862 int row, u64 msr_content);
1863 extern bool_t mtrr_def_type_msr_set(struct mtrr_state *v, u64 msr_content);
1864 extern bool_t pat_msr_set(u64 *pat, u64 msr);
1866 static int vmx_msr_write_intercept(struct cpu_user_regs *regs)
1868 u32 ecx = regs->ecx;
1869 u64 msr_content;
1870 struct vcpu *v = current;
1871 int index;
1873 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x, eax=%x, edx=%x",
1874 ecx, (u32)regs->eax, (u32)regs->edx);
1876 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
1878 hvmtrace_msr_write(v, ecx, msr_content);
1880 switch ( ecx )
1882 case MSR_IA32_TSC:
1883 hvm_set_guest_time(v, msr_content);
1884 pt_reset(v);
1885 break;
1886 case MSR_IA32_SYSENTER_CS:
1887 __vmwrite(GUEST_SYSENTER_CS, msr_content);
1888 break;
1889 case MSR_IA32_SYSENTER_ESP:
1890 __vmwrite(GUEST_SYSENTER_ESP, msr_content);
1891 break;
1892 case MSR_IA32_SYSENTER_EIP:
1893 __vmwrite(GUEST_SYSENTER_EIP, msr_content);
1894 break;
1895 case MSR_IA32_APICBASE:
1896 vlapic_msr_set(vcpu_vlapic(v), msr_content);
1897 break;
1898 case MSR_IA32_CR_PAT:
1899 if ( !pat_msr_set(&v->arch.hvm_vcpu.pat_cr, msr_content) )
1900 goto gp_fault;
1901 break;
1902 case MSR_MTRRdefType:
1903 if ( !mtrr_def_type_msr_set(&v->arch.hvm_vcpu.mtrr, msr_content) )
1904 goto gp_fault;
1905 break;
1906 case MSR_MTRRfix64K_00000:
1907 if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr, 0, msr_content) )
1908 goto gp_fault;
1909 break;
1910 case MSR_MTRRfix16K_80000:
1911 case MSR_MTRRfix16K_A0000:
1912 index = regs->ecx - MSR_MTRRfix16K_80000 + 1;
1913 if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr,
1914 index, msr_content) )
1915 goto gp_fault;
1916 break;
1917 case MSR_MTRRfix4K_C0000...MSR_MTRRfix4K_F8000:
1918 index = regs->ecx - MSR_MTRRfix4K_C0000 + 3;
1919 if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr,
1920 index, msr_content) )
1921 goto gp_fault;
1922 break;
1923 case MSR_IA32_MTRR_PHYSBASE0...MSR_IA32_MTRR_PHYSMASK7:
1924 if ( !mtrr_var_range_msr_set(&v->arch.hvm_vcpu.mtrr,
1925 regs->ecx, msr_content) )
1926 goto gp_fault;
1927 break;
1928 case MSR_MTRRcap:
1929 goto gp_fault;
1930 case MSR_IA32_DEBUGCTLMSR: {
1931 int i, rc = 0;
1933 if ( !msr_content || (msr_content & ~3) )
1934 break;
1936 if ( msr_content & 1 )
1938 const struct lbr_info *lbr = last_branch_msr_get();
1939 if ( lbr == NULL )
1940 break;
1942 for ( ; (rc == 0) && lbr->count; lbr++ )
1943 for ( i = 0; (rc == 0) && (i < lbr->count); i++ )
1944 if ( (rc = vmx_add_guest_msr(v, lbr->base + i)) == 0 )
1945 vmx_disable_intercept_for_msr(v, lbr->base + i);
1948 if ( (rc < 0) ||
1949 (vmx_add_host_load_msr(v, ecx) < 0) )
1950 vmx_inject_hw_exception(v, TRAP_machine_check, 0);
1951 else
1953 __vmwrite(GUEST_IA32_DEBUGCTL, msr_content);
1954 #ifdef __i386__
1955 __vmwrite(GUEST_IA32_DEBUGCTL_HIGH, msr_content >> 32);
1956 #endif
1959 break;
1961 case MSR_IA32_VMX_BASIC...MSR_IA32_VMX_PROCBASED_CTLS2:
1962 goto gp_fault;
1963 default:
1964 if ( vpmu_do_wrmsr(regs) )
1965 return X86EMUL_OKAY;
1966 switch ( long_mode_do_msr_write(regs) )
1968 case HNDL_unhandled:
1969 if ( (vmx_write_guest_msr(v, ecx, msr_content) != 0) &&
1970 !is_last_branch_msr(ecx) )
1971 wrmsr_hypervisor_regs(ecx, regs->eax, regs->edx);
1972 break;
1973 case HNDL_exception_raised:
1974 return X86EMUL_EXCEPTION;
1975 case HNDL_done:
1976 break;
1978 break;
1981 return X86EMUL_OKAY;
1983 gp_fault:
1984 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
1985 return X86EMUL_EXCEPTION;
1988 static void vmx_do_hlt(struct cpu_user_regs *regs)
1990 unsigned long intr_info = __vmread(VM_ENTRY_INTR_INFO);
1991 struct vcpu *curr = current;
1993 /* Check for pending exception. */
1994 if ( intr_info & INTR_INFO_VALID_MASK )
1996 HVMTRACE_1D(HLT, curr, /*int pending=*/ 1);
1997 return;
2000 HVMTRACE_1D(HLT, curr, /*int pending=*/ 0);
2001 hvm_hlt(regs->eflags);
2004 static void vmx_do_extint(struct cpu_user_regs *regs)
2006 unsigned int vector;
2008 asmlinkage void do_IRQ(struct cpu_user_regs *);
2009 fastcall void smp_apic_timer_interrupt(struct cpu_user_regs *);
2010 fastcall void smp_event_check_interrupt(void);
2011 fastcall void smp_invalidate_interrupt(void);
2012 fastcall void smp_call_function_interrupt(void);
2013 fastcall void smp_spurious_interrupt(struct cpu_user_regs *regs);
2014 fastcall void smp_error_interrupt(struct cpu_user_regs *regs);
2015 fastcall void smp_pmu_apic_interrupt(struct cpu_user_regs *regs);
2016 #ifdef CONFIG_X86_MCE_P4THERMAL
2017 fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs);
2018 #endif
2020 vector = __vmread(VM_EXIT_INTR_INFO);
2021 BUG_ON(!(vector & INTR_INFO_VALID_MASK));
2023 vector &= INTR_INFO_VECTOR_MASK;
2024 HVMTRACE_1D(INTR, current, vector);
2026 switch ( vector )
2028 case LOCAL_TIMER_VECTOR:
2029 smp_apic_timer_interrupt(regs);
2030 break;
2031 case EVENT_CHECK_VECTOR:
2032 smp_event_check_interrupt();
2033 break;
2034 case INVALIDATE_TLB_VECTOR:
2035 smp_invalidate_interrupt();
2036 break;
2037 case CALL_FUNCTION_VECTOR:
2038 smp_call_function_interrupt();
2039 break;
2040 case SPURIOUS_APIC_VECTOR:
2041 smp_spurious_interrupt(regs);
2042 break;
2043 case ERROR_APIC_VECTOR:
2044 smp_error_interrupt(regs);
2045 break;
2046 case PMU_APIC_VECTOR:
2047 smp_pmu_apic_interrupt(regs);
2048 break;
2049 #ifdef CONFIG_X86_MCE_P4THERMAL
2050 case THERMAL_APIC_VECTOR:
2051 smp_thermal_interrupt(regs);
2052 break;
2053 #endif
2054 default:
2055 regs->entry_vector = vector;
2056 do_IRQ(regs);
2057 break;
2061 static void wbinvd_ipi(void *info)
2063 wbinvd();
2066 static void vmx_wbinvd_intercept(void)
2068 if ( list_empty(&(domain_hvm_iommu(current->domain)->pdev_list)) )
2069 return;
2071 if ( cpu_has_wbinvd_exiting )
2072 on_each_cpu(wbinvd_ipi, NULL, 1, 1);
2073 else
2074 wbinvd();
2077 static void ept_handle_violation(unsigned long qualification, paddr_t gpa)
2079 if ( unlikely(((qualification >> 7) & 0x3) != 0x3) )
2080 {
2081 domain_crash(current->domain);
2082 return;
2083 }
2085 handle_mmio();
2086 }
2088 static void vmx_failed_vmentry(unsigned int exit_reason,
2089 struct cpu_user_regs *regs)
2090 {
2091 unsigned int failed_vmentry_reason = (uint16_t)exit_reason;
2092 unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
2093 struct vcpu *curr = current;
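/*
 * The processor refused to enter the guest at all. Log the reason,
 * dump the VMCS for post-mortem inspection, and crash the domain,
 * since the guest cannot make further progress.
 */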
2095 printk("Failed vm entry (exit reason 0x%x) ", exit_reason);
2096 switch ( failed_vmentry_reason )
2097 {
2098 case EXIT_REASON_INVALID_GUEST_STATE:
2099 printk("caused by invalid guest state (%ld).\n", exit_qualification);
2100 break;
2101 case EXIT_REASON_MSR_LOADING:
2102 printk("caused by MSR entry %ld loading.\n", exit_qualification);
2103 break;
2104 case EXIT_REASON_MACHINE_CHECK:
2105 printk("caused by machine check.\n");
2106 HVMTRACE_0D(MCE, curr);
2107 do_machine_check(regs);
2108 break;
2109 default:
2110 printk("reason not known yet!\n");
2111 break;
2112 }
2114 printk("************* VMCS Area **************\n");
2115 vmcs_dump_vcpu(curr);
2116 printk("**************************************\n");
2118 domain_crash(curr->domain);
2119 }
2121 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
2122 {
2123 unsigned int exit_reason, idtv_info;
2124 unsigned long exit_qualification, inst_len = 0;
2125 struct vcpu *v = current;
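/*
 * With hardware-assisted paging the guest owns CR3 directly, so the
 * cached value is refreshed from the VMCS on every exit while guest
 * paging is enabled.
 */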
2127 if ( paging_mode_hap(v->domain) && hvm_paging_enabled(v) )
2128 v->arch.hvm_vcpu.guest_cr[3] = v->arch.hvm_vcpu.hw_cr[3] =
2129 __vmread(GUEST_CR3);
2131 exit_reason = __vmread(VM_EXIT_REASON);
2133 hvmtrace_vmexit(v, regs->eip, exit_reason);
2135 perfc_incra(vmexits, exit_reason);
2137 if ( exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT )
2138 local_irq_enable();
2140 if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
2141 return vmx_failed_vmentry(exit_reason, regs);
2143 hvm_maybe_deassert_evtchn_irq();
2145 /* Event delivery caused this intercept? Queue for redelivery. */
2146 idtv_info = __vmread(IDT_VECTORING_INFO);
2147 if ( unlikely(idtv_info & INTR_INFO_VALID_MASK) &&
2148 (exit_reason != EXIT_REASON_TASK_SWITCH) )
2149 {
2150 if ( hvm_event_needs_reinjection((idtv_info>>8)&7, idtv_info&0xff) )
2151 {
2152 /* See SDM 3B 25.7.1.1 and .2 for info about masking resvd bits. */
2153 __vmwrite(VM_ENTRY_INTR_INFO,
2154 idtv_info & ~INTR_INFO_RESVD_BITS_MASK);
2155 if ( idtv_info & INTR_INFO_DELIVER_CODE_MASK )
2156 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE,
2157 __vmread(IDT_VECTORING_ERROR_CODE));
2158 }
2160 /*
2161 * Clear NMI-blocking interruptibility info if an NMI delivery faulted.
2162 * Re-delivery will re-set it (see SDM 3B 25.7.1.2).
2163 */
2164 if ( (idtv_info & INTR_INFO_INTR_TYPE_MASK) == (X86_EVENTTYPE_NMI<<8) )
2165 __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
2166 __vmread(GUEST_INTERRUPTIBILITY_INFO) &
2167 ~VMX_INTR_SHADOW_NMI);
2168 }
2170 switch ( exit_reason )
2171 {
2172 case EXIT_REASON_EXCEPTION_NMI:
2173 {
2174 /*
2175 * Software-interrupt (INT n) exiting is not enabled, so this exit was
2176 * caused either by (1) an exception (e.g. #PG) raised in the guest, or
2177 * (2) an NMI.
2178 */
2179 unsigned int intr_info, vector;
2181 intr_info = __vmread(VM_EXIT_INTR_INFO);
2182 BUG_ON(!(intr_info & INTR_INFO_VALID_MASK));
2184 vector = intr_info & INTR_INFO_VECTOR_MASK;
2186 /*
2187 * Re-set the NMI shadow if vmexit caused by a guest IRET fault (see 3B
2188 * 25.7.1.2, "Resuming Guest Software after Handling an Exception").
2189 * (NB. If we emulate this IRET for any reason, we should re-clear!)
2190 */
2191 if ( unlikely(intr_info & INTR_INFO_NMI_UNBLOCKED_BY_IRET) &&
2192 !(__vmread(IDT_VECTORING_INFO) & INTR_INFO_VALID_MASK) &&
2193 (vector != TRAP_double_fault) )
2194 __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
2195 __vmread(GUEST_INTERRUPTIBILITY_INFO)|VMX_INTR_SHADOW_NMI);
2197 perfc_incra(cause_vector, vector);
2199 switch ( vector )
2200 {
2201 case TRAP_debug:
2202 case TRAP_int3:
2203 if ( !v->domain->debugger_attached )
2204 goto exit_and_crash;
2205 domain_pause_for_debugger();
2206 break;
2207 case TRAP_no_device:
2208 vmx_fpu_dirty_intercept();
2209 break;
2210 case TRAP_page_fault:
2211 exit_qualification = __vmread(EXIT_QUALIFICATION);
2212 regs->error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
2214 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2215 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
2216 (unsigned long)regs->eax, (unsigned long)regs->ebx,
2217 (unsigned long)regs->ecx, (unsigned long)regs->edx,
2218 (unsigned long)regs->esi, (unsigned long)regs->edi);
2220 if ( paging_fault(exit_qualification, regs) )
2221 {
2222 hvmtrace_pf_xen(v, exit_qualification, regs->error_code);
2223 break;
2224 }
2226 v->arch.hvm_vcpu.guest_cr[2] = exit_qualification;
2227 vmx_inject_hw_exception(v, TRAP_page_fault, regs->error_code);
2228 break;
2229 case TRAP_nmi:
2230 if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) !=
2231 (X86_EVENTTYPE_NMI << 8) )
2232 goto exit_and_crash;
2233 HVMTRACE_0D(NMI, v);
2234 do_nmi(regs); /* Real NMI, vector 2: normal processing. */
2235 break;
2236 case TRAP_machine_check:
2237 HVMTRACE_0D(MCE, v);
2238 do_machine_check(regs);
2239 break;
2240 default:
2241 goto exit_and_crash;
2242 }
2243 break;
2244 }
2245 case EXIT_REASON_EXTERNAL_INTERRUPT:
2246 vmx_do_extint(regs);
2247 break;
2248 case EXIT_REASON_TRIPLE_FAULT:
2249 hvm_triple_fault();
2250 break;
2251 case EXIT_REASON_PENDING_VIRT_INTR:
2252 /* Disable the interrupt window. */
2253 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2254 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2255 v->arch.hvm_vmx.exec_control);
2256 break;
2257 case EXIT_REASON_PENDING_VIRT_NMI:
2258 /* Disable the NMI window. */
2259 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
2260 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2261 v->arch.hvm_vmx.exec_control);
2262 break;
2263 case EXIT_REASON_TASK_SWITCH: {
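/*
 * Exit-qualification bits 31:30 give the source of the task switch
 * (CALL, IRET, JMP, or an IDT task gate) and bits 15:0 the selector
 * of the target TSS; map the source onto reasons[] and let the
 * common task-switch code do the rest.
 */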
2264 const enum hvm_task_switch_reason reasons[] = {
2265 TSW_call_or_int, TSW_iret, TSW_jmp, TSW_call_or_int };
2266 int32_t errcode = -1;
2267 exit_qualification = __vmread(EXIT_QUALIFICATION);
2268 if ( (idtv_info & INTR_INFO_VALID_MASK) &&
2269 (idtv_info & INTR_INFO_DELIVER_CODE_MASK) )
2270 errcode = __vmread(IDT_VECTORING_ERROR_CODE);
2271 hvm_task_switch((uint16_t)exit_qualification,
2272 reasons[(exit_qualification >> 30) & 3],
2273 errcode);
2274 break;
2275 }
2276 case EXIT_REASON_CPUID:
2277 inst_len = __get_instruction_length(); /* Safe: CPUID */
2278 __update_guest_eip(inst_len);
2279 vmx_do_cpuid(regs);
2280 break;
2281 case EXIT_REASON_HLT:
2282 inst_len = __get_instruction_length(); /* Safe: HLT */
2283 __update_guest_eip(inst_len);
2284 vmx_do_hlt(regs);
2285 break;
2286 case EXIT_REASON_INVLPG:
2287 {
2288 inst_len = __get_instruction_length(); /* Safe: INVLPG */
2289 __update_guest_eip(inst_len);
2290 exit_qualification = __vmread(EXIT_QUALIFICATION);
2291 vmx_invlpg_intercept(exit_qualification);
2292 break;
2293 }
2294 case EXIT_REASON_VMCALL:
2295 {
2296 int rc;
2297 HVMTRACE_1D(VMMCALL, v, regs->eax);
2298 inst_len = __get_instruction_length(); /* Safe: VMCALL */
2299 rc = hvm_do_hypercall(regs);
2300 if ( rc != HVM_HCALL_preempted )
2301 {
2302 __update_guest_eip(inst_len);
2303 if ( rc == HVM_HCALL_invalidate )
2304 send_invalidate_req();
2305 }
2306 break;
2307 }
2308 case EXIT_REASON_CR_ACCESS:
2309 {
2310 exit_qualification = __vmread(EXIT_QUALIFICATION);
2311 inst_len = __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
2312 if ( vmx_cr_access(exit_qualification, regs) )
2313 __update_guest_eip(inst_len);
2314 break;
2315 }
2316 case EXIT_REASON_DR_ACCESS:
2317 exit_qualification = __vmread(EXIT_QUALIFICATION);
2318 vmx_dr_access(exit_qualification, regs);
2319 break;
2320 case EXIT_REASON_MSR_READ:
2321 inst_len = __get_instruction_length(); /* Safe: RDMSR */
2322 if ( vmx_msr_read_intercept(regs) == X86EMUL_OKAY )
2323 __update_guest_eip(inst_len);
2324 break;
2325 case EXIT_REASON_MSR_WRITE:
2326 inst_len = __get_instruction_length(); /* Safe: WRMSR */
2327 if ( vmx_msr_write_intercept(regs) == X86EMUL_OKAY )
2328 __update_guest_eip(inst_len);
2329 break;
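/*
 * None of the instructions below are meant to be usable by an HVM
 * guest in this tree (nested VMX is not supported, and MONITOR/MWAIT
 * are not exposed), so all of them simply take #UD.
 */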
2331 case EXIT_REASON_MWAIT_INSTRUCTION:
2332 case EXIT_REASON_MONITOR_INSTRUCTION:
2333 case EXIT_REASON_VMCLEAR:
2334 case EXIT_REASON_VMLAUNCH:
2335 case EXIT_REASON_VMPTRLD:
2336 case EXIT_REASON_VMPTRST:
2337 case EXIT_REASON_VMREAD:
2338 case EXIT_REASON_VMRESUME:
2339 case EXIT_REASON_VMWRITE:
2340 case EXIT_REASON_VMXOFF:
2341 case EXIT_REASON_VMXON:
2342 vmx_inject_hw_exception(v, TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
2343 break;
2345 case EXIT_REASON_TPR_BELOW_THRESHOLD:
2346 break;
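/*
 * Port I/O and accesses to the virtual APIC page are both decoded
 * and forwarded through the common MMIO/ioreq emulation path; if the
 * access cannot be handled, a #GP is injected instead.
 */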
2348 case EXIT_REASON_IO_INSTRUCTION:
2349 case EXIT_REASON_APIC_ACCESS:
2350 if ( !handle_mmio() )
2351 hvm_inject_exception(TRAP_gp_fault, 0, 0);
2352 break;
2354 case EXIT_REASON_INVD:
2355 case EXIT_REASON_WBINVD:
2356 {
2357 inst_len = __get_instruction_length(); /* Safe: INVD, WBINVD */
2358 __update_guest_eip(inst_len);
2359 vmx_wbinvd_intercept();
2360 break;
2361 }
2363 case EXIT_REASON_EPT_VIOLATION:
2364 {
2365 paddr_t gpa = __vmread(GUEST_PHYSICAL_ADDRESS);
2366 #ifdef CONFIG_X86_PAE
2367 gpa |= (paddr_t)__vmread(GUEST_PHYSICAL_ADDRESS_HIGH) << 32;
2368 #endif
2369 exit_qualification = __vmread(EXIT_QUALIFICATION);
2370 ept_handle_violation(exit_qualification, gpa);
2371 break;
2372 }
2374 default:
2375 exit_and_crash:
2376 gdprintk(XENLOG_ERR, "Bad vmexit (reason %x)\n", exit_reason);
2377 domain_crash(v->domain);
2378 break;
2379 }
2380 }
2382 asmlinkage void vmx_trace_vmentry(void)
2383 {
2384 hvmtrace_vmentry(current);
2385 }
2387 /*
2388 * Local variables:
2389 * mode: C
2390 * c-set-style: "BSD"
2391 * c-basic-offset: 4
2392 * tab-width: 4
2393 * indent-tabs-mode: nil
2394 * End:
2395 */