
view xen/arch/x86/hvm/vmx/vmx.c @ 19825:81edfffb3aff

Scale the guest's TSC when the target machine's TSC frequency differs from
the frequency the guest expects.

For now, each guest rdtsc instruction is trapped and emulated; this may be
optimized later.
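
As a rough sketch of the approach (not the code in this changeset), an RDTSC
exit handler can rescale the host TSC by the ratio of the frequency the guest
expects to the host frequency. The names guest_tsc_khz, host_tsc_khz and
scale_guest_tsc() below are hypothetical; in Xen the host frequency would come
from cpu_khz.

    #include <stdint.h>

    /* Sketch: rescale a raw host TSC value for a guest that expects a
     * different TSC frequency. Split the division to reduce the risk of
     * 64-bit overflow for large TSC values. */
    static inline uint64_t scale_guest_tsc(uint64_t host_tsc,
                                           uint32_t guest_tsc_khz,
                                           uint32_t host_tsc_khz)
    {
        uint64_t q = host_tsc / host_tsc_khz;
        uint64_t r = host_tsc % host_tsc_khz;
        return q * guest_tsc_khz + (r * guest_tsc_khz) / host_tsc_khz;
    }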

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed Jun 24 11:05:22 2009 +0100 (2009-06-24)
parents af06333d4c5d
1 /*
2 * vmx.c: handling VMX architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 */
19 #include <xen/config.h>
20 #include <xen/init.h>
21 #include <xen/lib.h>
22 #include <xen/trace.h>
23 #include <xen/sched.h>
24 #include <xen/irq.h>
25 #include <xen/softirq.h>
26 #include <xen/domain_page.h>
27 #include <xen/hypercall.h>
28 #include <xen/perfc.h>
29 #include <asm/current.h>
30 #include <asm/io.h>
31 #include <asm/regs.h>
32 #include <asm/cpufeature.h>
33 #include <asm/processor.h>
34 #include <asm/types.h>
35 #include <asm/debugreg.h>
36 #include <asm/msr.h>
37 #include <asm/spinlock.h>
38 #include <asm/paging.h>
39 #include <asm/p2m.h>
40 #include <asm/hvm/emulate.h>
41 #include <asm/hvm/hvm.h>
42 #include <asm/hvm/support.h>
43 #include <asm/hvm/vmx/vmx.h>
44 #include <asm/hvm/vmx/vmcs.h>
45 #include <public/sched.h>
46 #include <public/hvm/ioreq.h>
47 #include <asm/hvm/vpic.h>
48 #include <asm/hvm/vlapic.h>
49 #include <asm/x86_emulate.h>
50 #include <asm/hvm/vpt.h>
51 #include <public/hvm/save.h>
52 #include <asm/hvm/trace.h>
53 #include <asm/xenoprof.h>
55 enum handler_return { HNDL_done, HNDL_unhandled, HNDL_exception_raised };
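/* HNDL_done: the MSR access was fully handled here. HNDL_unhandled: the
 * caller should fall back to generic handling. HNDL_exception_raised: a
 * fault has already been injected into the guest (callers below map this
 * to X86EMUL_EXCEPTION). */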
57 static void vmx_ctxt_switch_from(struct vcpu *v);
58 static void vmx_ctxt_switch_to(struct vcpu *v);
60 static int vmx_alloc_vlapic_mapping(struct domain *d);
61 static void vmx_free_vlapic_mapping(struct domain *d);
62 static int vmx_alloc_vpid(struct domain *d);
63 static void vmx_free_vpid(struct domain *d);
64 static void vmx_install_vlapic_mapping(struct vcpu *v);
65 static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr);
66 static void vmx_update_guest_efer(struct vcpu *v);
67 static void vmx_cpuid_intercept(
68 unsigned int *eax, unsigned int *ebx,
69 unsigned int *ecx, unsigned int *edx);
70 static void vmx_wbinvd_intercept(void);
71 static void vmx_fpu_dirty_intercept(void);
72 static int vmx_msr_read_intercept(struct cpu_user_regs *regs);
73 static int vmx_msr_write_intercept(struct cpu_user_regs *regs);
74 static void vmx_invlpg_intercept(unsigned long vaddr);
76 static int vmx_domain_initialise(struct domain *d)
77 {
78 int rc;
80 d->arch.hvm_domain.vmx.ept_control.etmt = EPT_DEFAULT_MT;
81 d->arch.hvm_domain.vmx.ept_control.gaw = EPT_DEFAULT_GAW;
82 d->arch.hvm_domain.vmx.ept_control.asr =
83 pagetable_get_pfn(d->arch.phys_table);
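/* Note: etmt, gaw and asr are bitfields of a union with the full EPT
 * pointer (ept_control.eptp), which is what __ept_sync_domain() below
 * passes to INVEPT and what the VMCS EPT_POINTER field is loaded from
 * (outside this file). */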
85 if ( (rc = vmx_alloc_vpid(d)) != 0 )
86 return rc;
88 if ( (rc = vmx_alloc_vlapic_mapping(d)) != 0 )
89 {
90 vmx_free_vpid(d);
91 return rc;
92 }
94 return 0;
95 }
97 static void vmx_domain_destroy(struct domain *d)
98 {
99 ept_sync_domain(d);
100 vmx_free_vlapic_mapping(d);
101 vmx_free_vpid(d);
102 }
104 static int vmx_vcpu_initialise(struct vcpu *v)
105 {
106 int rc;
108 spin_lock_init(&v->arch.hvm_vmx.vmcs_lock);
110 v->arch.schedule_tail = vmx_do_resume;
111 v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
112 v->arch.ctxt_switch_to = vmx_ctxt_switch_to;
114 if ( (rc = vmx_create_vmcs(v)) != 0 )
115 {
116 dprintk(XENLOG_WARNING,
117 "Failed to create VMCS for vcpu %d: err=%d.\n",
118 v->vcpu_id, rc);
119 return rc;
120 }
122 vpmu_initialise(v);
124 vmx_install_vlapic_mapping(v);
126 /* %eax == 1 signals full real-mode support to the guest loader. */
127 if ( v->vcpu_id == 0 )
128 v->arch.guest_context.user_regs.eax = 1;
130 return 0;
131 }
133 static void vmx_vcpu_destroy(struct vcpu *v)
134 {
135 vmx_destroy_vmcs(v);
136 vpmu_destroy(v);
137 passive_domain_destroy(v);
138 }
140 #ifdef __x86_64__
142 static DEFINE_PER_CPU(struct vmx_msr_state, host_msr_state);
144 static u32 msr_index[VMX_MSR_COUNT] =
145 {
146 MSR_LSTAR, MSR_STAR, MSR_SYSCALL_MASK
147 };
149 static void vmx_save_host_msrs(void)
150 {
151 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
152 int i;
154 for ( i = 0; i < VMX_MSR_COUNT; i++ )
155 rdmsrl(msr_index[i], host_msr_state->msrs[i]);
156 }
158 #define WRITE_MSR(address) \
159 guest_msr_state->msrs[VMX_INDEX_MSR_ ## address] = msr_content; \
160 set_bit(VMX_INDEX_MSR_ ## address, &guest_msr_state->flags); \
161 wrmsrl(MSR_ ## address, msr_content); \
162 set_bit(VMX_INDEX_MSR_ ## address, &host_msr_state->flags); \
163 break
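/*
 * For illustration, WRITE_MSR(STAR) expands to:
 *     guest_msr_state->msrs[VMX_INDEX_MSR_STAR] = msr_content;
 *     set_bit(VMX_INDEX_MSR_STAR, &guest_msr_state->flags);
 *     wrmsrl(MSR_STAR, msr_content);
 *     set_bit(VMX_INDEX_MSR_STAR, &host_msr_state->flags);
 *     break;
 * i.e. it records the guest value, loads it into the hardware MSR, and flags
 * the MSR so vmx_restore_host_msrs() puts the host value back on switch-out.
 */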
165 static enum handler_return long_mode_do_msr_read(struct cpu_user_regs *regs)
166 {
167 u64 msr_content = 0;
168 u32 ecx = regs->ecx;
169 struct vcpu *v = current;
170 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
172 switch ( ecx )
173 {
174 case MSR_EFER:
175 msr_content = v->arch.hvm_vcpu.guest_efer;
176 break;
178 case MSR_FS_BASE:
179 msr_content = __vmread(GUEST_FS_BASE);
180 goto check_long_mode;
182 case MSR_GS_BASE:
183 msr_content = __vmread(GUEST_GS_BASE);
184 goto check_long_mode;
186 case MSR_SHADOW_GS_BASE:
187 msr_content = v->arch.hvm_vmx.shadow_gs;
188 check_long_mode:
189 if ( !(hvm_long_mode_enabled(v)) )
190 {
191 vmx_inject_hw_exception(TRAP_gp_fault, 0);
192 return HNDL_exception_raised;
193 }
194 break;
196 case MSR_STAR:
197 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_STAR];
198 break;
200 case MSR_LSTAR:
201 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_LSTAR];
202 break;
204 case MSR_CSTAR:
205 msr_content = v->arch.hvm_vmx.cstar;
206 break;
208 case MSR_SYSCALL_MASK:
209 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
210 break;
212 default:
213 return HNDL_unhandled;
214 }
216 HVM_DBG_LOG(DBG_LEVEL_0, "msr 0x%x content 0x%"PRIx64, ecx, msr_content);
218 regs->eax = (u32)(msr_content >> 0);
219 regs->edx = (u32)(msr_content >> 32);
221 return HNDL_done;
222 }
224 static enum handler_return long_mode_do_msr_write(struct cpu_user_regs *regs)
225 {
226 u64 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
227 u32 ecx = regs->ecx;
228 struct vcpu *v = current;
229 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
230 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
232 HVM_DBG_LOG(DBG_LEVEL_0, "msr 0x%x content 0x%"PRIx64, ecx, msr_content);
234 switch ( ecx )
235 {
236 case MSR_EFER:
237 if ( hvm_set_efer(msr_content) )
238 goto exception_raised;
239 break;
241 case MSR_FS_BASE:
242 case MSR_GS_BASE:
243 case MSR_SHADOW_GS_BASE:
244 if ( !hvm_long_mode_enabled(v) )
245 goto gp_fault;
247 if ( !is_canonical_address(msr_content) )
248 goto uncanonical_address;
250 if ( ecx == MSR_FS_BASE )
251 __vmwrite(GUEST_FS_BASE, msr_content);
252 else if ( ecx == MSR_GS_BASE )
253 __vmwrite(GUEST_GS_BASE, msr_content);
254 else
255 {
256 v->arch.hvm_vmx.shadow_gs = msr_content;
257 wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
258 }
260 break;
262 case MSR_STAR:
263 WRITE_MSR(STAR);
265 case MSR_LSTAR:
266 if ( !is_canonical_address(msr_content) )
267 goto uncanonical_address;
268 WRITE_MSR(LSTAR);
270 case MSR_CSTAR:
271 if ( !is_canonical_address(msr_content) )
272 goto uncanonical_address;
273 v->arch.hvm_vmx.cstar = msr_content;
274 break;
276 case MSR_SYSCALL_MASK:
277 WRITE_MSR(SYSCALL_MASK);
279 default:
280 return HNDL_unhandled;
281 }
283 return HNDL_done;
285 uncanonical_address:
286 HVM_DBG_LOG(DBG_LEVEL_0, "Non-canonical address for MSR write %x", ecx);
287 gp_fault:
288 vmx_inject_hw_exception(TRAP_gp_fault, 0);
289 exception_raised:
290 return HNDL_exception_raised;
291 }
293 /*
294 * To avoid MSR save/restore at every VM exit/entry time, we restore
295 * the x86_64 specific MSRs at domain switch time. Since these MSRs
296 * are not modified once set for para domains, we don't save them,
297 * but simply reset them to values set in percpu_traps_init().
298 */
299 static void vmx_restore_host_msrs(void)
300 {
301 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
302 int i;
304 while ( host_msr_state->flags )
305 {
306 i = find_first_set_bit(host_msr_state->flags);
307 wrmsrl(msr_index[i], host_msr_state->msrs[i]);
308 clear_bit(i, &host_msr_state->flags);
309 }
310 }
312 static void vmx_save_guest_msrs(struct vcpu *v)
313 {
314 /* MSR_SHADOW_GS_BASE may have been changed by swapgs instruction. */
315 rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
316 }
318 static void vmx_restore_guest_msrs(struct vcpu *v)
319 {
320 struct vmx_msr_state *guest_msr_state, *host_msr_state;
321 unsigned long guest_flags;
322 int i;
324 guest_msr_state = &v->arch.hvm_vmx.msr_state;
325 host_msr_state = &this_cpu(host_msr_state);
327 wrmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
329 guest_flags = guest_msr_state->flags;
331 while ( guest_flags )
332 {
333 i = find_first_set_bit(guest_flags);
335 HVM_DBG_LOG(DBG_LEVEL_2,
336 "restore guest's index %d msr %x with value %lx",
337 i, msr_index[i], guest_msr_state->msrs[i]);
338 set_bit(i, &host_msr_state->flags);
339 wrmsrl(msr_index[i], guest_msr_state->msrs[i]);
340 clear_bit(i, &guest_flags);
341 }
343 if ( (v->arch.hvm_vcpu.guest_efer ^ read_efer()) & EFER_SCE )
344 {
345 HVM_DBG_LOG(DBG_LEVEL_2,
346 "restore guest's EFER with value %lx",
347 v->arch.hvm_vcpu.guest_efer);
348 write_efer((read_efer() & ~EFER_SCE) |
349 (v->arch.hvm_vcpu.guest_efer & EFER_SCE));
350 }
351 }
353 #else /* __i386__ */
355 #define vmx_save_host_msrs() ((void)0)
356 #define vmx_restore_host_msrs() ((void)0)
358 #define vmx_save_guest_msrs(v) ((void)0)
359 #define vmx_restore_guest_msrs(v) ((void)0)
361 static enum handler_return long_mode_do_msr_read(struct cpu_user_regs *regs)
362 {
363 u64 msr_content = 0;
364 struct vcpu *v = current;
366 switch ( regs->ecx )
367 {
368 case MSR_EFER:
369 msr_content = v->arch.hvm_vcpu.guest_efer;
370 break;
372 default:
373 return HNDL_unhandled;
374 }
376 regs->eax = msr_content >> 0;
377 regs->edx = msr_content >> 32;
379 return HNDL_done;
380 }
382 static enum handler_return long_mode_do_msr_write(struct cpu_user_regs *regs)
383 {
384 u64 msr_content = regs->eax | ((u64)regs->edx << 32);
386 switch ( regs->ecx )
387 {
388 case MSR_EFER:
389 if ( hvm_set_efer(msr_content) )
390 return HNDL_exception_raised;
391 break;
393 default:
394 return HNDL_unhandled;
395 }
397 return HNDL_done;
398 }
400 #endif /* __i386__ */
402 static int vmx_guest_x86_mode(struct vcpu *v)
403 {
404 unsigned int cs_ar_bytes;
406 if ( unlikely(!(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE)) )
407 return 0;
408 if ( unlikely(guest_cpu_user_regs()->eflags & X86_EFLAGS_VM) )
409 return 1;
410 cs_ar_bytes = __vmread(GUEST_CS_AR_BYTES);
411 if ( hvm_long_mode_enabled(v) &&
412 likely(cs_ar_bytes & X86_SEG_AR_CS_LM_ACTIVE) )
413 return 8;
414 return (likely(cs_ar_bytes & X86_SEG_AR_DEF_OP_SIZE) ? 4 : 2);
415 }
417 static void vmx_save_dr(struct vcpu *v)
418 {
419 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
420 return;
422 /* Clear the DR dirty flag and re-enable intercepts for DR accesses. */
423 v->arch.hvm_vcpu.flag_dr_dirty = 0;
424 v->arch.hvm_vmx.exec_control |= CPU_BASED_MOV_DR_EXITING;
425 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
427 v->arch.guest_context.debugreg[0] = read_debugreg(0);
428 v->arch.guest_context.debugreg[1] = read_debugreg(1);
429 v->arch.guest_context.debugreg[2] = read_debugreg(2);
430 v->arch.guest_context.debugreg[3] = read_debugreg(3);
431 v->arch.guest_context.debugreg[6] = read_debugreg(6);
432 /* DR7 must be saved as it is used by vmx_restore_dr(). */
433 v->arch.guest_context.debugreg[7] = __vmread(GUEST_DR7);
434 }
436 static void __restore_debug_registers(struct vcpu *v)
437 {
438 if ( v->arch.hvm_vcpu.flag_dr_dirty )
439 return;
441 v->arch.hvm_vcpu.flag_dr_dirty = 1;
443 write_debugreg(0, v->arch.guest_context.debugreg[0]);
444 write_debugreg(1, v->arch.guest_context.debugreg[1]);
445 write_debugreg(2, v->arch.guest_context.debugreg[2]);
446 write_debugreg(3, v->arch.guest_context.debugreg[3]);
447 write_debugreg(6, v->arch.guest_context.debugreg[6]);
448 /* DR7 is loaded from the VMCS. */
449 }
451 /*
452 * DR7 is saved and restored on every vmexit. Other debug registers only
453 * need to be restored if their value is going to affect execution -- i.e.,
454 * if one of the breakpoints is enabled. So mask out all bits that don't
455 * enable some breakpoint functionality.
456 */
457 static void vmx_restore_dr(struct vcpu *v)
458 {
459 /* NB. __vmread() is not usable here, so we cannot read from the VMCS. */
460 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
461 __restore_debug_registers(v);
462 }
464 static void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
465 {
466 uint32_t ev;
468 vmx_vmcs_enter(v);
470 c->cr0 = v->arch.hvm_vcpu.guest_cr[0];
471 c->cr2 = v->arch.hvm_vcpu.guest_cr[2];
472 c->cr3 = v->arch.hvm_vcpu.guest_cr[3];
473 c->cr4 = v->arch.hvm_vcpu.guest_cr[4];
475 c->msr_efer = v->arch.hvm_vcpu.guest_efer;
477 c->sysenter_cs = __vmread(GUEST_SYSENTER_CS);
478 c->sysenter_esp = __vmread(GUEST_SYSENTER_ESP);
479 c->sysenter_eip = __vmread(GUEST_SYSENTER_EIP);
481 c->pending_event = 0;
482 c->error_code = 0;
483 if ( ((ev = __vmread(VM_ENTRY_INTR_INFO)) & INTR_INFO_VALID_MASK) &&
484 hvm_event_needs_reinjection((ev >> 8) & 7, ev & 0xff) )
485 {
486 c->pending_event = ev;
487 c->error_code = __vmread(VM_ENTRY_EXCEPTION_ERROR_CODE);
488 }
490 vmx_vmcs_exit(v);
491 }
493 static int vmx_restore_cr0_cr3(
494 struct vcpu *v, unsigned long cr0, unsigned long cr3)
495 {
496 unsigned long mfn = 0;
497 p2m_type_t p2mt;
499 if ( paging_mode_shadow(v->domain) )
500 {
501 if ( cr0 & X86_CR0_PG )
502 {
503 mfn = mfn_x(gfn_to_mfn(v->domain, cr3 >> PAGE_SHIFT, &p2mt));
504 if ( !p2m_is_ram(p2mt) || !get_page(mfn_to_page(mfn), v->domain) )
505 {
506 gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%lx\n", cr3);
507 return -EINVAL;
508 }
509 }
511 if ( hvm_paging_enabled(v) )
512 put_page(pagetable_get_page(v->arch.guest_table));
514 v->arch.guest_table = pagetable_from_pfn(mfn);
515 }
517 v->arch.hvm_vcpu.guest_cr[0] = cr0 | X86_CR0_ET;
518 v->arch.hvm_vcpu.guest_cr[3] = cr3;
520 return 0;
521 }
523 static int vmx_vmcs_restore(struct vcpu *v, struct hvm_hw_cpu *c)
524 {
525 int rc;
527 if ( c->pending_valid &&
528 ((c->pending_type == 1) || (c->pending_type > 6) ||
529 (c->pending_reserved != 0)) )
530 {
531 gdprintk(XENLOG_ERR, "Invalid pending event 0x%"PRIx32".\n",
532 c->pending_event);
533 return -EINVAL;
534 }
536 rc = vmx_restore_cr0_cr3(v, c->cr0, c->cr3);
537 if ( rc )
538 return rc;
540 vmx_vmcs_enter(v);
542 v->arch.hvm_vcpu.guest_cr[2] = c->cr2;
543 v->arch.hvm_vcpu.guest_cr[4] = c->cr4;
544 vmx_update_guest_cr(v, 0);
545 vmx_update_guest_cr(v, 2);
546 vmx_update_guest_cr(v, 4);
548 v->arch.hvm_vcpu.guest_efer = c->msr_efer;
549 vmx_update_guest_efer(v);
551 __vmwrite(GUEST_SYSENTER_CS, c->sysenter_cs);
552 __vmwrite(GUEST_SYSENTER_ESP, c->sysenter_esp);
553 __vmwrite(GUEST_SYSENTER_EIP, c->sysenter_eip);
555 __vmwrite(GUEST_DR7, c->dr7);
557 vmx_vmcs_exit(v);
559 paging_update_paging_modes(v);
561 if ( c->pending_valid )
562 {
563 gdprintk(XENLOG_INFO, "Re-injecting 0x%"PRIx32", 0x%"PRIx32"\n",
564 c->pending_event, c->error_code);
566 if ( hvm_event_needs_reinjection(c->pending_type, c->pending_vector) )
567 {
568 vmx_vmcs_enter(v);
569 __vmwrite(VM_ENTRY_INTR_INFO, c->pending_event);
570 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, c->error_code);
571 vmx_vmcs_exit(v);
572 }
573 }
575 return 0;
576 }
578 static void vmx_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
579 {
580 #ifdef __x86_64__
581 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
582 unsigned long guest_flags = guest_state->flags;
584 data->shadow_gs = v->arch.hvm_vmx.shadow_gs;
585 data->msr_cstar = v->arch.hvm_vmx.cstar;
587 /* save msrs */
588 data->msr_flags = guest_flags;
589 data->msr_lstar = guest_state->msrs[VMX_INDEX_MSR_LSTAR];
590 data->msr_star = guest_state->msrs[VMX_INDEX_MSR_STAR];
591 data->msr_syscall_mask = guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
592 #endif
594 data->tsc = hvm_get_guest_tsc(v);
595 }
597 static void vmx_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
598 {
599 #ifdef __x86_64__
600 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
602 /* restore msrs */
603 guest_state->flags = data->msr_flags & 7;
604 guest_state->msrs[VMX_INDEX_MSR_LSTAR] = data->msr_lstar;
605 guest_state->msrs[VMX_INDEX_MSR_STAR] = data->msr_star;
606 guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK] = data->msr_syscall_mask;
608 v->arch.hvm_vmx.cstar = data->msr_cstar;
609 v->arch.hvm_vmx.shadow_gs = data->shadow_gs;
610 #endif
612 hvm_set_guest_tsc(v, data->tsc);
613 }
616 static void vmx_save_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
617 {
618 vmx_save_cpu_state(v, ctxt);
619 vmx_vmcs_save(v, ctxt);
620 }
622 static int vmx_load_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
623 {
624 vmx_load_cpu_state(v, ctxt);
626 if ( vmx_vmcs_restore(v, ctxt) )
627 {
628 gdprintk(XENLOG_ERR, "vmx_vmcs restore failed!\n");
629 domain_crash(v->domain);
630 return -EINVAL;
631 }
633 return 0;
634 }
636 static void vmx_fpu_enter(struct vcpu *v)
637 {
638 setup_fpu(v);
639 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
640 v->arch.hvm_vmx.host_cr0 &= ~X86_CR0_TS;
641 __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
642 }
644 static void vmx_fpu_leave(struct vcpu *v)
645 {
646 ASSERT(!v->fpu_dirtied);
647 ASSERT(read_cr0() & X86_CR0_TS);
649 if ( !(v->arch.hvm_vmx.host_cr0 & X86_CR0_TS) )
650 {
651 v->arch.hvm_vmx.host_cr0 |= X86_CR0_TS;
652 __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
653 }
655 /*
656 * If the guest does not have TS enabled then we must cause and handle an
657 * exception on first use of the FPU. If the guest *does* have TS enabled
658 * then this is not necessary: no FPU activity can occur until the guest
659 * clears CR0.TS, and we will initialise the FPU when that happens.
660 */
661 if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
662 {
663 v->arch.hvm_vcpu.hw_cr[0] |= X86_CR0_TS;
664 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
665 __vm_set_bit(EXCEPTION_BITMAP, TRAP_no_device);
666 }
667 }
669 static void vmx_ctxt_switch_from(struct vcpu *v)
670 {
671 vmx_fpu_leave(v);
672 vmx_save_guest_msrs(v);
673 vmx_restore_host_msrs();
674 vmx_save_dr(v);
675 vpmu_save(v);
676 }
678 static void vmx_ctxt_switch_to(struct vcpu *v)
679 {
680 /* HOST_CR4 in VMCS is always mmu_cr4_features. Sync CR4 now. */
681 if ( unlikely(read_cr4() != mmu_cr4_features) )
682 write_cr4(mmu_cr4_features);
684 vmx_restore_guest_msrs(v);
685 vmx_restore_dr(v);
686 vpmu_load(v);
687 }
690 /* SDM volume 3b section 22.3.1.2: we can only enter virtual 8086 mode
691 * if all of CS, SS, DS, ES, FS and GS are 16bit ring-3 data segments.
692 * The guest thinks it's got ring-0 segments, so we need to fudge
693 * things. We store the ring-3 version in the VMCS to avoid lots of
694 * shuffling on vmenter and vmexit, and translate in these accessors. */
696 #define rm_cs_attr (((union segment_attributes) { \
697 .fields = { .type = 0xb, .s = 1, .dpl = 0, .p = 1, .avl = 0, \
698 .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
699 #define rm_ds_attr (((union segment_attributes) { \
700 .fields = { .type = 0x3, .s = 1, .dpl = 0, .p = 1, .avl = 0, \
701 .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
702 #define vm86_ds_attr (((union segment_attributes) { \
703 .fields = { .type = 0x3, .s = 1, .dpl = 3, .p = 1, .avl = 0, \
704 .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
705 #define vm86_tr_attr (((union segment_attributes) { \
706 .fields = { .type = 0xb, .s = 0, .dpl = 0, .p = 1, .avl = 0, \
707 .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
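/*
 * Assuming the usual segment_attributes bit layout (type:4, s:1, dpl:2, p:1
 * in the low byte), these evaluate to rm_cs_attr = 0x9b, rm_ds_attr = 0x93,
 * vm86_ds_attr = 0xf3 and vm86_tr_attr = 0x8b -- the familiar real-mode/vm86
 * descriptor access-rights bytes.
 */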
709 static void vmx_get_segment_register(struct vcpu *v, enum x86_segment seg,
710 struct segment_register *reg)
711 {
712 uint32_t attr = 0;
714 vmx_vmcs_enter(v);
716 switch ( seg )
717 {
718 case x86_seg_cs:
719 reg->sel = __vmread(GUEST_CS_SELECTOR);
720 reg->limit = __vmread(GUEST_CS_LIMIT);
721 reg->base = __vmread(GUEST_CS_BASE);
722 attr = __vmread(GUEST_CS_AR_BYTES);
723 break;
724 case x86_seg_ds:
725 reg->sel = __vmread(GUEST_DS_SELECTOR);
726 reg->limit = __vmread(GUEST_DS_LIMIT);
727 reg->base = __vmread(GUEST_DS_BASE);
728 attr = __vmread(GUEST_DS_AR_BYTES);
729 break;
730 case x86_seg_es:
731 reg->sel = __vmread(GUEST_ES_SELECTOR);
732 reg->limit = __vmread(GUEST_ES_LIMIT);
733 reg->base = __vmread(GUEST_ES_BASE);
734 attr = __vmread(GUEST_ES_AR_BYTES);
735 break;
736 case x86_seg_fs:
737 reg->sel = __vmread(GUEST_FS_SELECTOR);
738 reg->limit = __vmread(GUEST_FS_LIMIT);
739 reg->base = __vmread(GUEST_FS_BASE);
740 attr = __vmread(GUEST_FS_AR_BYTES);
741 break;
742 case x86_seg_gs:
743 reg->sel = __vmread(GUEST_GS_SELECTOR);
744 reg->limit = __vmread(GUEST_GS_LIMIT);
745 reg->base = __vmread(GUEST_GS_BASE);
746 attr = __vmread(GUEST_GS_AR_BYTES);
747 break;
748 case x86_seg_ss:
749 reg->sel = __vmread(GUEST_SS_SELECTOR);
750 reg->limit = __vmread(GUEST_SS_LIMIT);
751 reg->base = __vmread(GUEST_SS_BASE);
752 attr = __vmread(GUEST_SS_AR_BYTES);
753 break;
754 case x86_seg_tr:
755 reg->sel = __vmread(GUEST_TR_SELECTOR);
756 reg->limit = __vmread(GUEST_TR_LIMIT);
757 reg->base = __vmread(GUEST_TR_BASE);
758 attr = __vmread(GUEST_TR_AR_BYTES);
759 break;
760 case x86_seg_gdtr:
761 reg->limit = __vmread(GUEST_GDTR_LIMIT);
762 reg->base = __vmread(GUEST_GDTR_BASE);
763 break;
764 case x86_seg_idtr:
765 reg->limit = __vmread(GUEST_IDTR_LIMIT);
766 reg->base = __vmread(GUEST_IDTR_BASE);
767 break;
768 case x86_seg_ldtr:
769 reg->sel = __vmread(GUEST_LDTR_SELECTOR);
770 reg->limit = __vmread(GUEST_LDTR_LIMIT);
771 reg->base = __vmread(GUEST_LDTR_BASE);
772 attr = __vmread(GUEST_LDTR_AR_BYTES);
773 break;
774 default:
775 BUG();
776 }
778 vmx_vmcs_exit(v);
780 reg->attr.bytes = (attr & 0xff) | ((attr >> 4) & 0xf00);
781 /* Unusable flag is folded into Present flag. */
782 if ( attr & (1u<<16) )
783 reg->attr.fields.p = 0;
785 /* Adjust for virtual 8086 mode */
786 if ( v->arch.hvm_vmx.vmx_realmode && seg <= x86_seg_tr
787 && !(v->arch.hvm_vmx.vm86_segment_mask & (1u << seg)) )
788 {
789 struct segment_register *sreg = &v->arch.hvm_vmx.vm86_saved_seg[seg];
790 if ( seg == x86_seg_tr )
791 *reg = *sreg;
792 else if ( reg->base != sreg->base || seg == x86_seg_ss )
793 {
794 /* If the guest's reloaded the segment, remember the new version.
795 * We can't tell if the guest reloaded the segment with another
796 * one that has the same base. By default we assume it hasn't,
797 * since we don't want to lose big-real-mode segment attributes,
798 * but for SS we assume it has: the Ubuntu graphical bootloader
799 * does this and gets badly confused if we leave the old SS in
800 * place. */
801 reg->attr.bytes = (seg == x86_seg_cs ? rm_cs_attr : rm_ds_attr);
802 *sreg = *reg;
803 }
804 else
805 {
806 /* Always give realmode guests a selector that matches the base
807 * but keep the attr and limit from before */
808 *reg = *sreg;
809 reg->sel = reg->base >> 4;
810 }
811 }
812 }
814 static void vmx_set_segment_register(struct vcpu *v, enum x86_segment seg,
815 struct segment_register *reg)
816 {
817 uint32_t attr, sel, limit;
818 uint64_t base;
820 sel = reg->sel;
821 attr = reg->attr.bytes;
822 limit = reg->limit;
823 base = reg->base;
825 /* Adjust CS/SS/DS/ES/FS/GS/TR for virtual 8086 mode */
826 if ( v->arch.hvm_vmx.vmx_realmode && seg <= x86_seg_tr )
827 {
828 /* Remember the proper contents */
829 v->arch.hvm_vmx.vm86_saved_seg[seg] = *reg;
831 if ( seg == x86_seg_tr )
832 {
833 if ( v->domain->arch.hvm_domain.params[HVM_PARAM_VM86_TSS] )
834 {
835 sel = 0;
836 attr = vm86_tr_attr;
837 limit = 0xff;
838 base = v->domain->arch.hvm_domain.params[HVM_PARAM_VM86_TSS];
839 v->arch.hvm_vmx.vm86_segment_mask &= ~(1u << seg);
840 }
841 else
842 v->arch.hvm_vmx.vm86_segment_mask |= (1u << seg);
843 }
844 else
845 {
846 /* Try to fake it out as a 16bit data segment. This could
847 * cause confusion for the guest if it reads the selector,
848 * but otherwise we have to emulate if *any* segment hasn't
849 * been reloaded. */
850 if ( base < 0x100000 && !(base & 0xf) && limit >= 0xffff
851 && reg->attr.fields.p )
852 {
853 sel = base >> 4;
854 attr = vm86_ds_attr;
855 limit = 0xffff;
856 v->arch.hvm_vmx.vm86_segment_mask &= ~(1u << seg);
857 }
858 else
859 v->arch.hvm_vmx.vm86_segment_mask |= (1u << seg);
860 }
861 }
863 attr = ((attr & 0xf00) << 4) | (attr & 0xff);
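/* The repacking above is the inverse of the one in vmx_get_segment_register():
 * the 12 attribute bits kept in segment_register.attr are spread back into
 * VMX access-rights bits 0-7 and 12-15. */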
865 /* Not-present must mean unusable. */
866 if ( !reg->attr.fields.p )
867 attr |= (1u << 16);
869 /* VMX has strict consistency requirement for flag G. */
870 attr |= !!(limit >> 20) << 15;
872 vmx_vmcs_enter(v);
874 switch ( seg )
875 {
876 case x86_seg_cs:
877 __vmwrite(GUEST_CS_SELECTOR, sel);
878 __vmwrite(GUEST_CS_LIMIT, limit);
879 __vmwrite(GUEST_CS_BASE, base);
880 __vmwrite(GUEST_CS_AR_BYTES, attr);
881 break;
882 case x86_seg_ds:
883 __vmwrite(GUEST_DS_SELECTOR, sel);
884 __vmwrite(GUEST_DS_LIMIT, limit);
885 __vmwrite(GUEST_DS_BASE, base);
886 __vmwrite(GUEST_DS_AR_BYTES, attr);
887 break;
888 case x86_seg_es:
889 __vmwrite(GUEST_ES_SELECTOR, sel);
890 __vmwrite(GUEST_ES_LIMIT, limit);
891 __vmwrite(GUEST_ES_BASE, base);
892 __vmwrite(GUEST_ES_AR_BYTES, attr);
893 break;
894 case x86_seg_fs:
895 __vmwrite(GUEST_FS_SELECTOR, sel);
896 __vmwrite(GUEST_FS_LIMIT, limit);
897 __vmwrite(GUEST_FS_BASE, base);
898 __vmwrite(GUEST_FS_AR_BYTES, attr);
899 break;
900 case x86_seg_gs:
901 __vmwrite(GUEST_GS_SELECTOR, sel);
902 __vmwrite(GUEST_GS_LIMIT, limit);
903 __vmwrite(GUEST_GS_BASE, base);
904 __vmwrite(GUEST_GS_AR_BYTES, attr);
905 break;
906 case x86_seg_ss:
907 __vmwrite(GUEST_SS_SELECTOR, sel);
908 __vmwrite(GUEST_SS_LIMIT, limit);
909 __vmwrite(GUEST_SS_BASE, base);
910 __vmwrite(GUEST_SS_AR_BYTES, attr);
911 break;
912 case x86_seg_tr:
913 __vmwrite(GUEST_TR_SELECTOR, sel);
914 __vmwrite(GUEST_TR_LIMIT, limit);
915 __vmwrite(GUEST_TR_BASE, base);
916 /* VMX checks that the busy flag (bit 1) is set. */
917 __vmwrite(GUEST_TR_AR_BYTES, attr | 2);
918 break;
919 case x86_seg_gdtr:
920 __vmwrite(GUEST_GDTR_LIMIT, limit);
921 __vmwrite(GUEST_GDTR_BASE, base);
922 break;
923 case x86_seg_idtr:
924 __vmwrite(GUEST_IDTR_LIMIT, limit);
925 __vmwrite(GUEST_IDTR_BASE, base);
926 break;
927 case x86_seg_ldtr:
928 __vmwrite(GUEST_LDTR_SELECTOR, sel);
929 __vmwrite(GUEST_LDTR_LIMIT, limit);
930 __vmwrite(GUEST_LDTR_BASE, base);
931 __vmwrite(GUEST_LDTR_AR_BYTES, attr);
932 break;
933 default:
934 BUG();
935 }
937 vmx_vmcs_exit(v);
938 }
940 static void vmx_set_tsc_offset(struct vcpu *v, u64 offset)
941 {
942 vmx_vmcs_enter(v);
943 __vmwrite(TSC_OFFSET, offset);
944 #if defined (__i386__)
945 __vmwrite(TSC_OFFSET_HIGH, offset >> 32);
946 #endif
947 vmx_vmcs_exit(v);
948 }
950 static void vmx_enable_rdtsc_exiting(struct vcpu *v)
951 {
952 vmx_vmcs_enter(v);
953 v->arch.hvm_vmx.exec_control |= CPU_BASED_RDTSC_EXITING;
954 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
955 vmx_vmcs_exit(v);
956 }
958 void do_nmi(struct cpu_user_regs *);
960 static void vmx_init_hypercall_page(struct domain *d, void *hypercall_page)
961 {
962 char *p;
963 int i;
965 for ( i = 0; i < (PAGE_SIZE / 32); i++ )
966 {
967 p = (char *)(hypercall_page + (i * 32));
968 *(u8 *)(p + 0) = 0xb8; /* mov imm32, %eax */
969 *(u32 *)(p + 1) = i;
970 *(u8 *)(p + 5) = 0x0f; /* vmcall */
971 *(u8 *)(p + 6) = 0x01;
972 *(u8 *)(p + 7) = 0xc1;
973 *(u8 *)(p + 8) = 0xc3; /* ret */
974 }
976 /* Don't support HYPERVISOR_iret at the moment */
977 *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
978 }
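/*
 * Usage sketch (illustrative only; assumes the 32-bit guest hypercall ABI):
 * a guest invokes hypercall 'nr' by calling into the page at offset nr * 32;
 * the stub loads nr into %eax, executes VMCALL (0f 01 c1) and returns.
 * Arguments travel in the normal Xen hypercall registers, roughly:
 *
 *     long rc;
 *     asm volatile ( "call *%[addr]"
 *                    : "=a" (rc)
 *                    : [addr] "r" (hypercall_page + __HYPERVISOR_xen_version * 32),
 *                      "b" (XENVER_version), "c" (0)
 *                    : "memory" );
 *
 * The ud2 planted at the __HYPERVISOR_iret slot makes a guest that tries
 * HYPERVISOR_iret take #UD instead of issuing a meaningless VMCALL.
 */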
980 static unsigned int vmx_get_interrupt_shadow(struct vcpu *v)
981 {
982 return __vmread(GUEST_INTERRUPTIBILITY_INFO);
983 }
985 static void vmx_set_interrupt_shadow(struct vcpu *v, unsigned int intr_shadow)
986 {
987 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, intr_shadow);
988 }
990 static void vmx_load_pdptrs(struct vcpu *v)
991 {
992 unsigned long cr3 = v->arch.hvm_vcpu.guest_cr[3], mfn;
993 uint64_t *guest_pdptrs;
994 p2m_type_t p2mt;
995 char *p;
997 /* EPT needs to load PDPTRS into VMCS for PAE. */
998 if ( !hvm_pae_enabled(v) || (v->arch.hvm_vcpu.guest_efer & EFER_LMA) )
999 return;
1001 if ( cr3 & 0x1fUL )
1002 goto crash;
1004 mfn = mfn_x(gfn_to_mfn(v->domain, cr3 >> PAGE_SHIFT, &p2mt));
1005 if ( !p2m_is_ram(p2mt) )
1006 goto crash;
1008 p = map_domain_page(mfn);
1010 guest_pdptrs = (uint64_t *)(p + (cr3 & ~PAGE_MASK));
1012 /*
1013 * We do not check the PDPTRs for validity. The CPU will do this during
1014 * vm entry, and we can handle the failure there and crash the guest.
1015 * The only thing we could do better here is #GP instead.
1016 */
1018 vmx_vmcs_enter(v);
1020 __vmwrite(GUEST_PDPTR0, guest_pdptrs[0]);
1021 __vmwrite(GUEST_PDPTR1, guest_pdptrs[1]);
1022 __vmwrite(GUEST_PDPTR2, guest_pdptrs[2]);
1023 __vmwrite(GUEST_PDPTR3, guest_pdptrs[3]);
1024 #ifdef __i386__
1025 __vmwrite(GUEST_PDPTR0_HIGH, guest_pdptrs[0] >> 32);
1026 __vmwrite(GUEST_PDPTR1_HIGH, guest_pdptrs[1] >> 32);
1027 __vmwrite(GUEST_PDPTR2_HIGH, guest_pdptrs[2] >> 32);
1028 __vmwrite(GUEST_PDPTR3_HIGH, guest_pdptrs[3] >> 32);
1029 #endif
1031 vmx_vmcs_exit(v);
1033 unmap_domain_page(p);
1034 return;
1036 crash:
1037 domain_crash(v->domain);
1038 }
1040 static void vmx_update_host_cr3(struct vcpu *v)
1041 {
1042 vmx_vmcs_enter(v);
1043 __vmwrite(HOST_CR3, v->arch.cr3);
1044 vmx_vmcs_exit(v);
1045 }
1047 void vmx_update_debug_state(struct vcpu *v)
1048 {
1049 unsigned long intercepts, mask;
1051 ASSERT(v == current);
1053 mask = 1u << TRAP_int3;
1054 if ( !cpu_has_monitor_trap_flag )
1055 mask |= 1u << TRAP_debug;
1057 intercepts = __vmread(EXCEPTION_BITMAP);
1058 if ( v->arch.hvm_vcpu.debug_state_latch )
1059 intercepts |= mask;
1060 else
1061 intercepts &= ~mask;
1062 __vmwrite(EXCEPTION_BITMAP, intercepts);
1063 }
1065 static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr)
1067 vmx_vmcs_enter(v);
1069 switch ( cr )
1071 case 0: {
1072 int realmode;
1073 unsigned long hw_cr0_mask = X86_CR0_NE;
1075 if ( !vmx_unrestricted_guest(v) )
1076 hw_cr0_mask |= X86_CR0_PG | X86_CR0_PE;
1078 if ( paging_mode_shadow(v->domain) )
1079 hw_cr0_mask |= X86_CR0_WP;
1081 if ( paging_mode_hap(v->domain) )
1083 /* We manage GUEST_CR3 when guest CR0.PE is zero. */
1084 uint32_t cr3_ctls = (CPU_BASED_CR3_LOAD_EXITING |
1085 CPU_BASED_CR3_STORE_EXITING);
1086 v->arch.hvm_vmx.exec_control &= ~cr3_ctls;
1087 if ( !hvm_paging_enabled(v) )
1088 v->arch.hvm_vmx.exec_control |= cr3_ctls;
1089 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
1091 /* Changing CR0.PE can change some bits in real CR4. */
1092 vmx_update_guest_cr(v, 4);
1095 if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
1097 if ( v != current )
1098 hw_cr0_mask |= X86_CR0_TS;
1099 else if ( v->arch.hvm_vcpu.hw_cr[0] & X86_CR0_TS )
1100 vmx_fpu_enter(v);
1103 realmode = !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE);
1105 if ( (!vmx_unrestricted_guest(v)) &&
1106 (realmode != v->arch.hvm_vmx.vmx_realmode) )
1108 enum x86_segment s;
1109 struct segment_register reg[x86_seg_tr + 1];
1111 /* Entering or leaving real mode: adjust the segment registers.
1112 * Need to read them all either way, as realmode reads can update
1113 * the saved values we'll use when returning to prot mode. */
1114 for ( s = x86_seg_cs ; s <= x86_seg_tr ; s++ )
1115 vmx_get_segment_register(v, s, &reg[s]);
1116 v->arch.hvm_vmx.vmx_realmode = realmode;
1118 if ( realmode )
1120 for ( s = x86_seg_cs ; s <= x86_seg_tr ; s++ )
1121 vmx_set_segment_register(v, s, &reg[s]);
1122 v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_VME;
1123 __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
1124 __vmwrite(EXCEPTION_BITMAP, 0xffffffff);
1126 else
1128 for ( s = x86_seg_cs ; s <= x86_seg_tr ; s++ )
1129 if ( !(v->arch.hvm_vmx.vm86_segment_mask & (1<<s)) )
1130 vmx_set_segment_register(
1131 v, s, &v->arch.hvm_vmx.vm86_saved_seg[s]);
1132 v->arch.hvm_vcpu.hw_cr[4] =
1133 ((v->arch.hvm_vcpu.hw_cr[4] & ~X86_CR4_VME)
1134 |(v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_VME));
1135 __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
1136 __vmwrite(EXCEPTION_BITMAP,
1137 HVM_TRAP_MASK
1138 | (paging_mode_hap(v->domain) ?
1139 0 : (1U << TRAP_page_fault))
1140 | (1U << TRAP_no_device));
1141 vmx_update_debug_state(v);
1145 v->arch.hvm_vcpu.hw_cr[0] =
1146 v->arch.hvm_vcpu.guest_cr[0] | hw_cr0_mask;
1147 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
1148 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[0]);
1149 break;
1151 case 2:
1152 /* CR2 is updated in exit stub. */
1153 break;
1154 case 3:
1155 if ( paging_mode_hap(v->domain) )
1157 if ( !hvm_paging_enabled(v) )
1158 v->arch.hvm_vcpu.hw_cr[3] =
1159 v->domain->arch.hvm_domain.params[HVM_PARAM_IDENT_PT];
1160 vmx_load_pdptrs(v);
1163 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr[3]);
1164 vpid_sync_vcpu_all(v);
1165 break;
1166 case 4:
1167 v->arch.hvm_vcpu.hw_cr[4] = HVM_CR4_HOST_MASK;
1168 if ( paging_mode_hap(v->domain) )
1169 v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_PAE;
1170 v->arch.hvm_vcpu.hw_cr[4] |= v->arch.hvm_vcpu.guest_cr[4];
1171 if ( v->arch.hvm_vmx.vmx_realmode )
1172 v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_VME;
1173 if ( paging_mode_hap(v->domain) && !hvm_paging_enabled(v) )
1175 v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_PSE;
1176 v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_PAE;
1178 __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
1179 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[4]);
1180 break;
1181 default:
1182 BUG();
1185 vmx_vmcs_exit(v);
1188 static void vmx_update_guest_efer(struct vcpu *v)
1190 #ifdef __x86_64__
1191 unsigned long vm_entry_value;
1193 vmx_vmcs_enter(v);
1195 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
1196 if ( v->arch.hvm_vcpu.guest_efer & EFER_LMA )
1197 vm_entry_value |= VM_ENTRY_IA32E_MODE;
1198 else
1199 vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
1200 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1202 vmx_vmcs_exit(v);
1203 #endif
1205 if ( v == current )
1206 write_efer((read_efer() & ~EFER_SCE) |
1207 (v->arch.hvm_vcpu.guest_efer & EFER_SCE));
1210 static void vmx_flush_guest_tlbs(void)
1212 /*
1213 * If VPID (i.e. tagged TLB support) is not enabled, the fact that
1214 * we're in Xen at all means any guest will have a clean TLB when
1215 * it's next run, because VMRESUME will flush it for us.
1217 * If enabled, we invalidate all translations associated with all
1218 * VPID values.
1219 */
1220 vpid_sync_all();
1223 static void __ept_sync_domain(void *info)
1225 struct domain *d = info;
1226 __invept(1, d->arch.hvm_domain.vmx.ept_control.eptp, 0);
1229 void ept_sync_domain(struct domain *d)
1231 /* Only if using EPT and this domain has some VCPUs to dirty. */
1232 if ( d->arch.hvm_domain.hap_enabled && d->vcpu && d->vcpu[0] )
1234 ASSERT(local_irq_is_enabled());
1235 on_each_cpu(__ept_sync_domain, d, 1);
1239 static void __vmx_inject_exception(int trap, int type, int error_code)
1241 unsigned long intr_fields;
1242 struct vcpu *curr = current;
1244 /*
1245 * NB. Callers do not need to worry about clearing STI/MOV-SS blocking:
1246 * "If the VM entry is injecting, there is no blocking by STI or by
1247 * MOV SS following the VM entry, regardless of the contents of the
1248 * interruptibility-state field [in the guest-state area before the
1249 * VM entry]", PRM Vol. 3, 22.6.1 (Interruptibility State).
1250 */
1252 intr_fields = (INTR_INFO_VALID_MASK | (type<<8) | trap);
1253 if ( error_code != HVM_DELIVER_NO_ERROR_CODE ) {
1254 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
1255 intr_fields |= INTR_INFO_DELIVER_CODE_MASK;
1258 __vmwrite(VM_ENTRY_INTR_INFO, intr_fields);
1260 /* Can't inject exceptions in virtual 8086 mode because they would
1261 * use the protected-mode IDT. Emulate at the next vmenter instead. */
1262 if ( curr->arch.hvm_vmx.vmx_realmode )
1263 curr->arch.hvm_vmx.vmx_emulate = 1;
1266 void vmx_inject_hw_exception(int trap, int error_code)
1268 unsigned long intr_info = __vmread(VM_ENTRY_INTR_INFO);
1269 struct vcpu *curr = current;
1271 switch ( trap )
1273 case TRAP_debug:
1274 if ( guest_cpu_user_regs()->eflags & X86_EFLAGS_TF )
1276 __restore_debug_registers(curr);
1277 write_debugreg(6, read_debugreg(6) | 0x4000);
1279 if ( cpu_has_monitor_trap_flag )
1280 break;
1281 case TRAP_int3:
1282 if ( curr->domain->debugger_attached )
1284 /* Debug/Int3: Trap to debugger. */
1285 domain_pause_for_debugger();
1286 return;
1290 if ( unlikely(intr_info & INTR_INFO_VALID_MASK) &&
1291 (((intr_info >> 8) & 7) == X86_EVENTTYPE_HW_EXCEPTION) )
1293 trap = hvm_combine_hw_exceptions((uint8_t)intr_info, trap);
1294 if ( trap == TRAP_double_fault )
1295 error_code = 0;
1298 __vmx_inject_exception(trap, X86_EVENTTYPE_HW_EXCEPTION, error_code);
1300 if ( trap == TRAP_page_fault )
1301 HVMTRACE_LONG_2D(PF_INJECT, error_code,
1302 TRC_PAR_LONG(current->arch.hvm_vcpu.guest_cr[2]));
1303 else
1304 HVMTRACE_2D(INJ_EXC, trap, error_code);
1307 void vmx_inject_extint(int trap)
1309 __vmx_inject_exception(trap, X86_EVENTTYPE_EXT_INTR,
1310 HVM_DELIVER_NO_ERROR_CODE);
1313 void vmx_inject_nmi(void)
1315 __vmx_inject_exception(2, X86_EVENTTYPE_NMI,
1316 HVM_DELIVER_NO_ERROR_CODE);
1319 static void vmx_inject_exception(
1320 unsigned int trapnr, int errcode, unsigned long cr2)
1322 if ( trapnr == TRAP_page_fault )
1323 current->arch.hvm_vcpu.guest_cr[2] = cr2;
1325 vmx_inject_hw_exception(trapnr, errcode);
1328 static int vmx_event_pending(struct vcpu *v)
1330 ASSERT(v == current);
1331 return (__vmread(VM_ENTRY_INTR_INFO) & INTR_INFO_VALID_MASK);
1334 static int vmx_do_pmu_interrupt(struct cpu_user_regs *regs)
1336 return vpmu_do_interrupt(regs);
1339 static void vmx_set_uc_mode(struct vcpu *v)
1341 if ( paging_mode_hap(v->domain) )
1342 ept_change_entry_emt_with_range(
1343 v->domain, 0, v->domain->arch.p2m->max_mapped_pfn);
1344 vpid_sync_all();
1347 static void vmx_set_info_guest(struct vcpu *v)
1349 unsigned long intr_shadow;
1351 vmx_vmcs_enter(v);
1353 __vmwrite(GUEST_DR7, v->arch.guest_context.debugreg[7]);
1355 /*
1356 * If the interruptibility-state field indicates blocking by STI,
1357 * setting the TF flag in the EFLAGS may cause VM entry to fail
1358 * and crash the guest. See SDM 3B 22.3.1.5.
1359 * Clearing the VMX_INTR_SHADOW_STI flag looks hackish, but setting
1360 * GUEST_PENDING_DBG_EXCEPTIONS.BS here instead would cause an
1361 * immediate vmexit and hence make no progress.
1362 */
1363 intr_shadow = __vmread(GUEST_INTERRUPTIBILITY_INFO);
1364 if ( v->domain->debugger_attached &&
1365 (v->arch.guest_context.user_regs.eflags & X86_EFLAGS_TF) &&
1366 (intr_shadow & VMX_INTR_SHADOW_STI) )
1368 intr_shadow &= ~VMX_INTR_SHADOW_STI;
1369 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, intr_shadow);
1372 vmx_vmcs_exit(v);
1375 static struct hvm_function_table vmx_function_table = {
1376 .name = "VMX",
1377 .domain_initialise = vmx_domain_initialise,
1378 .domain_destroy = vmx_domain_destroy,
1379 .vcpu_initialise = vmx_vcpu_initialise,
1380 .vcpu_destroy = vmx_vcpu_destroy,
1381 .save_cpu_ctxt = vmx_save_vmcs_ctxt,
1382 .load_cpu_ctxt = vmx_load_vmcs_ctxt,
1383 .get_interrupt_shadow = vmx_get_interrupt_shadow,
1384 .set_interrupt_shadow = vmx_set_interrupt_shadow,
1385 .guest_x86_mode = vmx_guest_x86_mode,
1386 .get_segment_register = vmx_get_segment_register,
1387 .set_segment_register = vmx_set_segment_register,
1388 .update_host_cr3 = vmx_update_host_cr3,
1389 .update_guest_cr = vmx_update_guest_cr,
1390 .update_guest_efer = vmx_update_guest_efer,
1391 .flush_guest_tlbs = vmx_flush_guest_tlbs,
1392 .set_tsc_offset = vmx_set_tsc_offset,
1393 .inject_exception = vmx_inject_exception,
1394 .init_hypercall_page = vmx_init_hypercall_page,
1395 .event_pending = vmx_event_pending,
1396 .do_pmu_interrupt = vmx_do_pmu_interrupt,
1397 .cpu_up = vmx_cpu_up,
1398 .cpu_down = vmx_cpu_down,
1399 .cpuid_intercept = vmx_cpuid_intercept,
1400 .wbinvd_intercept = vmx_wbinvd_intercept,
1401 .fpu_dirty_intercept = vmx_fpu_dirty_intercept,
1402 .msr_read_intercept = vmx_msr_read_intercept,
1403 .msr_write_intercept = vmx_msr_write_intercept,
1404 .invlpg_intercept = vmx_invlpg_intercept,
1405 .set_uc_mode = vmx_set_uc_mode,
1406 .set_info_guest = vmx_set_info_guest,
1407 .enable_rdtsc_exiting = vmx_enable_rdtsc_exiting
1408 };
1410 static unsigned long *vpid_bitmap;
1411 #define VPID_BITMAP_SIZE ((1u << VMCS_VPID_WIDTH) / XEN_LEGACY_MAX_VCPUS)
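/*
 * Each domain is allocated a contiguous block of XEN_LEGACY_MAX_VCPUS VPIDs
 * starting at vpid_base (see vmx_alloc_vpid() below), so a vcpu's VPID is,
 * roughly, vpid_base + vcpu_id. VPID 0 is reserved for VMX root mode (Xen
 * itself), as marked in start_vmx().
 */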
1413 void start_vmx(void)
1415 static bool_t bootstrapped;
1417 vmx_save_host_msrs();
1419 if ( test_and_set_bool(bootstrapped) )
1421 if ( hvm_enabled && !vmx_cpu_up() )
1423 printk("VMX: FATAL: failed to initialise CPU%d!\n",
1424 smp_processor_id());
1425 BUG();
1427 return;
1430 /* Xen does not fill x86_capability words except 0. */
1431 boot_cpu_data.x86_capability[4] = cpuid_ecx(1);
1433 if ( !test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability) )
1434 return;
1436 set_in_cr4(X86_CR4_VMXE);
1438 if ( !vmx_cpu_up() )
1440 printk("VMX: failed to initialise.\n");
1441 return;
1444 if ( cpu_has_vmx_ept )
1445 vmx_function_table.hap_supported = 1;
1447 if ( cpu_has_vmx_vpid )
1449 vpid_bitmap = xmalloc_array(
1450 unsigned long, BITS_TO_LONGS(VPID_BITMAP_SIZE));
1451 BUG_ON(vpid_bitmap == NULL);
1452 memset(vpid_bitmap, 0, BITS_TO_LONGS(VPID_BITMAP_SIZE) * sizeof(long));
1454 /* VPID 0 is used by VMX root mode (the hypervisor). */
1455 __set_bit(0, vpid_bitmap);
1458 setup_vmcs_dump();
1460 hvm_enable(&vmx_function_table);
1463 /*
1464 * Not all cases receive a valid value in the VM-exit instruction length field.
1465 * Callers must know what they're doing!
1466 */
1467 static int __get_instruction_length(void)
1469 int len;
1470 len = __vmread(VM_EXIT_INSTRUCTION_LEN); /* Safe: callers audited */
1471 BUG_ON((len < 1) || (len > 15));
1472 return len;
1475 static void __update_guest_eip(unsigned long inst_len)
1477 struct cpu_user_regs *regs = guest_cpu_user_regs();
1478 unsigned long x;
1480 regs->eip += inst_len;
1481 regs->eflags &= ~X86_EFLAGS_RF;
1483 x = __vmread(GUEST_INTERRUPTIBILITY_INFO);
1484 if ( x & (VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS) )
1486 x &= ~(VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS);
1487 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, x);
1490 if ( regs->eflags & X86_EFLAGS_TF )
1491 vmx_inject_hw_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE);
1494 static void vmx_fpu_dirty_intercept(void)
1496 struct vcpu *curr = current;
1498 vmx_fpu_enter(curr);
1500 /* Disable TS in guest CR0 unless the guest wants the exception too. */
1501 if ( !(curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
1503 curr->arch.hvm_vcpu.hw_cr[0] &= ~X86_CR0_TS;
1504 __vmwrite(GUEST_CR0, curr->arch.hvm_vcpu.hw_cr[0]);
1508 #define bitmaskof(idx) (1U << ((idx) & 31))
1509 static void vmx_cpuid_intercept(
1510 unsigned int *eax, unsigned int *ebx,
1511 unsigned int *ecx, unsigned int *edx)
1513 unsigned int input = *eax;
1514 struct segment_register cs;
1515 struct vcpu *v = current;
1517 hvm_cpuid(input, eax, ebx, ecx, edx);
1519 switch ( input )
1521 case 0x80000001:
1522 /* SYSCALL is visible iff running in long mode. */
1523 hvm_get_segment_register(v, x86_seg_cs, &cs);
1524 if ( cs.attr.fields.l )
1525 *edx |= bitmaskof(X86_FEATURE_SYSCALL);
1526 else
1527 *edx &= ~(bitmaskof(X86_FEATURE_SYSCALL));
1528 break;
1531 HVMTRACE_5D (CPUID, input, *eax, *ebx, *ecx, *edx);
1534 static void vmx_do_cpuid(struct cpu_user_regs *regs)
1536 unsigned int eax, ebx, ecx, edx;
1538 eax = regs->eax;
1539 ebx = regs->ebx;
1540 ecx = regs->ecx;
1541 edx = regs->edx;
1543 vmx_cpuid_intercept(&eax, &ebx, &ecx, &edx);
1545 regs->eax = eax;
1546 regs->ebx = ebx;
1547 regs->ecx = ecx;
1548 regs->edx = edx;
1551 static void vmx_dr_access(unsigned long exit_qualification,
1552 struct cpu_user_regs *regs)
1554 struct vcpu *v = current;
1556 HVMTRACE_0D(DR_WRITE);
1558 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
1559 __restore_debug_registers(v);
1561 /* Allow guest direct access to DR registers */
1562 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MOV_DR_EXITING;
1563 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
1566 static void vmx_invlpg_intercept(unsigned long vaddr)
1568 struct vcpu *curr = current;
1569 HVMTRACE_LONG_2D(INVLPG, /*invlpga=*/ 0, TRC_PAR_LONG(vaddr));
1570 if ( paging_invlpg(curr, vaddr) )
1571 vpid_sync_vcpu_gva(curr, vaddr);
1574 #define CASE_SET_REG(REG, reg) \
1575 case VMX_CONTROL_REG_ACCESS_GPR_ ## REG: regs->reg = value; break
1576 #define CASE_GET_REG(REG, reg) \
1577 case VMX_CONTROL_REG_ACCESS_GPR_ ## REG: value = regs->reg; break
1579 #define CASE_EXTEND_SET_REG \
1580 CASE_EXTEND_REG(S)
1581 #define CASE_EXTEND_GET_REG \
1582 CASE_EXTEND_REG(G)
1584 #ifdef __i386__
1585 #define CASE_EXTEND_REG(T)
1586 #else
1587 #define CASE_EXTEND_REG(T) \
1588 CASE_ ## T ## ET_REG(R8, r8); \
1589 CASE_ ## T ## ET_REG(R9, r9); \
1590 CASE_ ## T ## ET_REG(R10, r10); \
1591 CASE_ ## T ## ET_REG(R11, r11); \
1592 CASE_ ## T ## ET_REG(R12, r12); \
1593 CASE_ ## T ## ET_REG(R13, r13); \
1594 CASE_ ## T ## ET_REG(R14, r14); \
1595 CASE_ ## T ## ET_REG(R15, r15)
1596 #endif
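/*
 * For illustration, CASE_GET_REG(EAX, eax) expands to
 *     case VMX_CONTROL_REG_ACCESS_GPR_EAX: value = regs->eax; break;
 * and CASE_EXTEND_GET_REG adds the equivalent cases for r8-r15 when building
 * for x86_64 (it expands to nothing on i386).
 */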
1598 static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
1600 unsigned long value;
1601 struct vcpu *v = current;
1602 struct vlapic *vlapic = vcpu_vlapic(v);
1604 switch ( gp )
1606 CASE_GET_REG(EAX, eax);
1607 CASE_GET_REG(ECX, ecx);
1608 CASE_GET_REG(EDX, edx);
1609 CASE_GET_REG(EBX, ebx);
1610 CASE_GET_REG(EBP, ebp);
1611 CASE_GET_REG(ESI, esi);
1612 CASE_GET_REG(EDI, edi);
1613 CASE_GET_REG(ESP, esp);
1614 CASE_EXTEND_GET_REG;
1615 default:
1616 gdprintk(XENLOG_ERR, "invalid gp: %d\n", gp);
1617 goto exit_and_crash;
1620 HVMTRACE_LONG_2D(CR_WRITE, cr, TRC_PAR_LONG(value));
1622 HVM_DBG_LOG(DBG_LEVEL_1, "CR%d, value = %lx", cr, value);
1624 switch ( cr )
1626 case 0:
1627 return !hvm_set_cr0(value);
1629 case 3:
1630 return !hvm_set_cr3(value);
1632 case 4:
1633 return !hvm_set_cr4(value);
1635 case 8:
1636 vlapic_set_reg(vlapic, APIC_TASKPRI, ((value & 0x0F) << 4));
1637 break;
1639 default:
1640 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
1641 goto exit_and_crash;
1644 return 1;
1646 exit_and_crash:
1647 domain_crash(v->domain);
1648 return 0;
1651 /*
1652 * Read from control registers. CR0 and CR4 are read from the shadow.
1653 */
1654 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
1656 unsigned long value = 0;
1657 struct vcpu *v = current;
1658 struct vlapic *vlapic = vcpu_vlapic(v);
1660 switch ( cr )
1662 case 3:
1663 value = (unsigned long)v->arch.hvm_vcpu.guest_cr[3];
1664 break;
1665 case 8:
1666 value = (unsigned long)vlapic_get_reg(vlapic, APIC_TASKPRI);
1667 value = (value & 0xF0) >> 4;
1668 break;
1669 default:
1670 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
1671 domain_crash(v->domain);
1672 break;
1675 switch ( gp ) {
1676 CASE_SET_REG(EAX, eax);
1677 CASE_SET_REG(ECX, ecx);
1678 CASE_SET_REG(EDX, edx);
1679 CASE_SET_REG(EBX, ebx);
1680 CASE_SET_REG(EBP, ebp);
1681 CASE_SET_REG(ESI, esi);
1682 CASE_SET_REG(EDI, edi);
1683 CASE_SET_REG(ESP, esp);
1684 CASE_EXTEND_SET_REG;
1685 default:
1686 printk("invalid gp: %d\n", gp);
1687 domain_crash(v->domain);
1688 break;
1691 HVMTRACE_LONG_2D(CR_READ, cr, TRC_PAR_LONG(value));
1693 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR%d, value = %lx", cr, value);
1696 static int vmx_cr_access(unsigned long exit_qualification,
1697 struct cpu_user_regs *regs)
1699 unsigned int gp, cr;
1700 unsigned long value;
1701 struct vcpu *v = current;
1703 switch ( exit_qualification & VMX_CONTROL_REG_ACCESS_TYPE )
1705 case VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR:
1706 gp = exit_qualification & VMX_CONTROL_REG_ACCESS_GPR;
1707 cr = exit_qualification & VMX_CONTROL_REG_ACCESS_NUM;
1708 return mov_to_cr(gp, cr, regs);
1709 case VMX_CONTROL_REG_ACCESS_TYPE_MOV_FROM_CR:
1710 gp = exit_qualification & VMX_CONTROL_REG_ACCESS_GPR;
1711 cr = exit_qualification & VMX_CONTROL_REG_ACCESS_NUM;
1712 mov_from_cr(cr, gp, regs);
1713 break;
1714 case VMX_CONTROL_REG_ACCESS_TYPE_CLTS:
1715 v->arch.hvm_vcpu.guest_cr[0] &= ~X86_CR0_TS;
1716 vmx_update_guest_cr(v, 0);
1717 HVMTRACE_0D(CLTS);
1718 break;
1719 case VMX_CONTROL_REG_ACCESS_TYPE_LMSW:
1720 value = v->arch.hvm_vcpu.guest_cr[0];
1721 /* LMSW can: (1) set bits 0-3; (2) clear bits 1-3. */
1722 value = (value & ~0xe) | ((exit_qualification >> 16) & 0xf);
1723 HVMTRACE_LONG_1D(LMSW, value);
1724 return !hvm_set_cr0(value);
1725 default:
1726 BUG();
1729 return 1;
1732 static const struct lbr_info {
1733 u32 base, count;
1734 } p4_lbr[] = {
1735 { MSR_P4_LER_FROM_LIP, 1 },
1736 { MSR_P4_LER_TO_LIP, 1 },
1737 { MSR_P4_LASTBRANCH_TOS, 1 },
1738 { MSR_P4_LASTBRANCH_0_FROM_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO },
1739 { MSR_P4_LASTBRANCH_0_TO_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO },
1740 { 0, 0 }
1741 }, c2_lbr[] = {
1742 { MSR_IA32_LASTINTFROMIP, 1 },
1743 { MSR_IA32_LASTINTTOIP, 1 },
1744 { MSR_C2_LASTBRANCH_TOS, 1 },
1745 { MSR_C2_LASTBRANCH_0_FROM_IP, NUM_MSR_C2_LASTBRANCH_FROM_TO },
1746 { MSR_C2_LASTBRANCH_0_TO_IP, NUM_MSR_C2_LASTBRANCH_FROM_TO },
1747 { 0, 0 }
1748 #ifdef __i386__
1749 }, pm_lbr[] = {
1750 { MSR_IA32_LASTINTFROMIP, 1 },
1751 { MSR_IA32_LASTINTTOIP, 1 },
1752 { MSR_PM_LASTBRANCH_TOS, 1 },
1753 { MSR_PM_LASTBRANCH_0, NUM_MSR_PM_LASTBRANCH },
1754 { 0, 0 }
1755 #endif
1756 };
1758 static const struct lbr_info *last_branch_msr_get(void)
1760 switch ( boot_cpu_data.x86 )
1762 case 6:
1763 switch ( boot_cpu_data.x86_model )
1765 #ifdef __i386__
1766 /* PentiumM */
1767 case 9: case 13:
1768 /* Core Solo/Duo */
1769 case 14:
1770 return pm_lbr;
1771 break;
1772 #endif
1773 /* Core2 Duo */
1774 case 15:
1775 return c2_lbr;
1776 break;
1778 break;
1780 case 15:
1781 switch ( boot_cpu_data.x86_model )
1783 /* Pentium4/Xeon with em64t */
1784 case 3: case 4: case 6:
1785 return p4_lbr;
1786 break;
1788 break;
1791 return NULL;
1794 static int is_last_branch_msr(u32 ecx)
1796 const struct lbr_info *lbr = last_branch_msr_get();
1798 if ( lbr == NULL )
1799 return 0;
1801 for ( ; lbr->count; lbr++ )
1802 if ( (ecx >= lbr->base) && (ecx < (lbr->base + lbr->count)) )
1803 return 1;
1805 return 0;
1808 static int vmx_msr_read_intercept(struct cpu_user_regs *regs)
1810 u64 msr_content = 0;
1811 u32 ecx = regs->ecx, eax, edx;
1813 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x", ecx);
1815 switch ( ecx )
1817 case MSR_IA32_SYSENTER_CS:
1818 msr_content = (u32)__vmread(GUEST_SYSENTER_CS);
1819 break;
1820 case MSR_IA32_SYSENTER_ESP:
1821 msr_content = __vmread(GUEST_SYSENTER_ESP);
1822 break;
1823 case MSR_IA32_SYSENTER_EIP:
1824 msr_content = __vmread(GUEST_SYSENTER_EIP);
1825 break;
1826 case MSR_IA32_DEBUGCTLMSR:
1827 msr_content = __vmread(GUEST_IA32_DEBUGCTL);
1828 #ifdef __i386__
1829 msr_content |= (u64)__vmread(GUEST_IA32_DEBUGCTL_HIGH) << 32;
1830 #endif
1831 break;
1832 case MSR_IA32_VMX_BASIC...MSR_IA32_VMX_PROCBASED_CTLS2:
1833 goto gp_fault;
1834 case MSR_IA32_MISC_ENABLE:
1835 rdmsrl(MSR_IA32_MISC_ENABLE, msr_content);
1836 /* Debug Trace Store is not supported. */
1837 msr_content |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
1838 MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL;
1839 break;
1840 default:
1841 if ( vpmu_do_rdmsr(regs) )
1842 goto done;
1843 if ( passive_domain_do_rdmsr(regs) )
1844 goto done;
1845 switch ( long_mode_do_msr_read(regs) )
1847 case HNDL_unhandled:
1848 break;
1849 case HNDL_exception_raised:
1850 return X86EMUL_EXCEPTION;
1851 case HNDL_done:
1852 goto done;
1855 if ( vmx_read_guest_msr(ecx, &msr_content) == 0 )
1856 break;
1858 if ( is_last_branch_msr(ecx) )
1860 msr_content = 0;
1861 break;
1864 if ( rdmsr_viridian_regs(ecx, &eax, &edx) ||
1865 rdmsr_hypervisor_regs(ecx, &eax, &edx) ||
1866 rdmsr_safe(ecx, eax, edx) == 0 )
1868 regs->eax = eax;
1869 regs->edx = edx;
1870 goto done;
1873 goto gp_fault;
1876 regs->eax = (uint32_t)msr_content;
1877 regs->edx = (uint32_t)(msr_content >> 32);
1879 done:
1880 HVMTRACE_3D (MSR_READ, ecx, regs->eax, regs->edx);
1881 HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx",
1882 ecx, (unsigned long)regs->eax,
1883 (unsigned long)regs->edx);
1884 return X86EMUL_OKAY;
1886 gp_fault:
1887 vmx_inject_hw_exception(TRAP_gp_fault, 0);
1888 return X86EMUL_EXCEPTION;
1891 static int vmx_alloc_vlapic_mapping(struct domain *d)
1893 void *apic_va;
1895 if ( !cpu_has_vmx_virtualize_apic_accesses )
1896 return 0;
1898 apic_va = alloc_xenheap_page();
1899 if ( apic_va == NULL )
1900 return -ENOMEM;
1901 share_xen_page_with_guest(virt_to_page(apic_va), d, XENSHARE_writable);
1902 set_mmio_p2m_entry(
1903 d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), _mfn(virt_to_mfn(apic_va)));
1904 d->arch.hvm_domain.vmx.apic_access_mfn = virt_to_mfn(apic_va);
1906 return 0;
1909 static void vmx_free_vlapic_mapping(struct domain *d)
1911 unsigned long mfn = d->arch.hvm_domain.vmx.apic_access_mfn;
1912 if ( mfn != 0 )
1913 free_xenheap_page(mfn_to_virt(mfn));
1916 static int vmx_alloc_vpid(struct domain *d)
1918 int idx;
1920 if ( !cpu_has_vmx_vpid )
1921 return 0;
1923 do {
1924 idx = find_first_zero_bit(vpid_bitmap, VPID_BITMAP_SIZE);
1925 if ( idx >= VPID_BITMAP_SIZE )
1927 dprintk(XENLOG_WARNING, "VMX VPID space exhausted.\n");
1928 return -EBUSY;
1931 while ( test_and_set_bit(idx, vpid_bitmap) );
1933 d->arch.hvm_domain.vmx.vpid_base = idx * XEN_LEGACY_MAX_VCPUS;
1934 return 0;
1937 static void vmx_free_vpid(struct domain *d)
1939 if ( !cpu_has_vmx_vpid )
1940 return;
1942 clear_bit(d->arch.hvm_domain.vmx.vpid_base / XEN_LEGACY_MAX_VCPUS,
1943 vpid_bitmap);
1946 static void vmx_install_vlapic_mapping(struct vcpu *v)
1948 paddr_t virt_page_ma, apic_page_ma;
1950 if ( !cpu_has_vmx_virtualize_apic_accesses )
1951 return;
1953 virt_page_ma = page_to_maddr(vcpu_vlapic(v)->regs_page);
1954 apic_page_ma = v->domain->arch.hvm_domain.vmx.apic_access_mfn;
1955 apic_page_ma <<= PAGE_SHIFT;
1957 vmx_vmcs_enter(v);
1958 __vmwrite(VIRTUAL_APIC_PAGE_ADDR, virt_page_ma);
1959 __vmwrite(APIC_ACCESS_ADDR, apic_page_ma);
1960 vmx_vmcs_exit(v);
1963 void vmx_vlapic_msr_changed(struct vcpu *v)
1965 struct vlapic *vlapic = vcpu_vlapic(v);
1966 uint32_t ctl;
1968 if ( !cpu_has_vmx_virtualize_apic_accesses )
1969 return;
1971 vmx_vmcs_enter(v);
1972 ctl = __vmread(SECONDARY_VM_EXEC_CONTROL);
1973 ctl &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1974 if ( !vlapic_hw_disabled(vlapic) &&
1975 (vlapic_base_address(vlapic) == APIC_DEFAULT_PHYS_BASE) )
1976 ctl |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1977 __vmwrite(SECONDARY_VM_EXEC_CONTROL, ctl);
1978 vmx_vmcs_exit(v);
1981 static int vmx_msr_write_intercept(struct cpu_user_regs *regs)
1983 u32 ecx = regs->ecx;
1984 u64 msr_content;
1985 struct vcpu *v = current;
1987 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x, eax=%x, edx=%x",
1988 ecx, (u32)regs->eax, (u32)regs->edx);
1990 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
1992 HVMTRACE_3D (MSR_WRITE, ecx, regs->eax, regs->edx);
1994 switch ( ecx )
1996 case MSR_IA32_SYSENTER_CS:
1997 __vmwrite(GUEST_SYSENTER_CS, msr_content);
1998 break;
1999 case MSR_IA32_SYSENTER_ESP:
2000 __vmwrite(GUEST_SYSENTER_ESP, msr_content);
2001 break;
2002 case MSR_IA32_SYSENTER_EIP:
2003 __vmwrite(GUEST_SYSENTER_EIP, msr_content);
2004 break;
2005 case MSR_IA32_DEBUGCTLMSR: {
2006 int i, rc = 0;
2008 if ( !msr_content || (msr_content & ~3) )
2009 break;
2011 if ( msr_content & 1 )
2013 const struct lbr_info *lbr = last_branch_msr_get();
2014 if ( lbr == NULL )
2015 break;
2017 for ( ; (rc == 0) && lbr->count; lbr++ )
2018 for ( i = 0; (rc == 0) && (i < lbr->count); i++ )
2019 if ( (rc = vmx_add_guest_msr(lbr->base + i)) == 0 )
2020 vmx_disable_intercept_for_msr(v, lbr->base + i);
2023 if ( (rc < 0) ||
2024 (vmx_add_host_load_msr(ecx) < 0) )
2025 vmx_inject_hw_exception(TRAP_machine_check, 0);
2026 else
2028 __vmwrite(GUEST_IA32_DEBUGCTL, msr_content);
2029 #ifdef __i386__
2030 __vmwrite(GUEST_IA32_DEBUGCTL_HIGH, msr_content >> 32);
2031 #endif
2034 break;
2036 case MSR_IA32_VMX_BASIC...MSR_IA32_VMX_PROCBASED_CTLS2:
2037 goto gp_fault;
2038 default:
2039 if ( vpmu_do_wrmsr(regs) )
2040 return X86EMUL_OKAY;
2041 if ( passive_domain_do_wrmsr(regs) )
2042 return X86EMUL_OKAY;
2044 if ( wrmsr_viridian_regs(ecx, regs->eax, regs->edx) )
2045 break;
2047 switch ( long_mode_do_msr_write(regs) )
2049 case HNDL_unhandled:
2050 if ( (vmx_write_guest_msr(ecx, msr_content) != 0) &&
2051 !is_last_branch_msr(ecx) )
2052 wrmsr_hypervisor_regs(ecx, regs->eax, regs->edx);
2053 break;
2054 case HNDL_exception_raised:
2055 return X86EMUL_EXCEPTION;
2056 case HNDL_done:
2057 break;
2058 }
2059 break;
2060 }
2062 return X86EMUL_OKAY;
2064 gp_fault:
2065 vmx_inject_hw_exception(TRAP_gp_fault, 0);
2066 return X86EMUL_EXCEPTION;
2067 }
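/* A physical interrupt caused this VM exit; dispatch it to the
 * appropriate Xen handler according to the vector in VM_EXIT_INTR_INFO. */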
2069 static void vmx_do_extint(struct cpu_user_regs *regs)
2070 {
2071 unsigned int vector;
2073 asmlinkage void do_IRQ(struct cpu_user_regs *);
2074 fastcall void smp_apic_timer_interrupt(struct cpu_user_regs *);
2075 fastcall void smp_event_check_interrupt(void);
2076 fastcall void smp_invalidate_interrupt(void);
2077 fastcall void smp_call_function_interrupt(void);
2078 fastcall void smp_spurious_interrupt(struct cpu_user_regs *regs);
2079 fastcall void smp_error_interrupt(struct cpu_user_regs *regs);
2080 fastcall void smp_pmu_apic_interrupt(struct cpu_user_regs *regs);
2081 fastcall void smp_cmci_interrupt(struct cpu_user_regs *regs);
2082 #ifdef CONFIG_X86_MCE_THERMAL
2083 fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs);
2084 #endif
2086 vector = __vmread(VM_EXIT_INTR_INFO);
2087 BUG_ON(!(vector & INTR_INFO_VALID_MASK));
2089 vector &= INTR_INFO_VECTOR_MASK;
2090 HVMTRACE_1D(INTR, vector);
2092 switch ( vector )
2093 {
2094 case LOCAL_TIMER_VECTOR:
2095 smp_apic_timer_interrupt(regs);
2096 break;
2097 case EVENT_CHECK_VECTOR:
2098 smp_event_check_interrupt();
2099 break;
2100 case INVALIDATE_TLB_VECTOR:
2101 smp_invalidate_interrupt();
2102 break;
2103 case CALL_FUNCTION_VECTOR:
2104 smp_call_function_interrupt();
2105 break;
2106 case SPURIOUS_APIC_VECTOR:
2107 smp_spurious_interrupt(regs);
2108 break;
2109 case ERROR_APIC_VECTOR:
2110 smp_error_interrupt(regs);
2111 break;
2112 case CMCI_APIC_VECTOR:
2113 smp_cmci_interrupt(regs);
2114 break;
2115 case PMU_APIC_VECTOR:
2116 smp_pmu_apic_interrupt(regs);
2117 break;
2118 #ifdef CONFIG_X86_MCE_THERMAL
2119 case THERMAL_APIC_VECTOR:
2120 smp_thermal_interrupt(regs);
2121 break;
2122 #endif
2123 default:
2124 regs->entry_vector = vector;
2125 do_IRQ(regs);
2126 break;
2127 }
2128 }
2130 static void wbinvd_ipi(void *info)
2131 {
2132 wbinvd();
2133 }
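/* Guest WBINVD only needs servicing when the domain has passthrough
 * devices; with WBINVD exiting available the flush is broadcast to all
 * CPUs, otherwise only the local cache is flushed. */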
2135 static void vmx_wbinvd_intercept(void)
2136 {
2137 if ( !has_arch_pdevs(current->domain) )
2138 return;
2140 if ( cpu_has_wbinvd_exiting )
2141 on_each_cpu(wbinvd_ipi, NULL, 1);
2142 else
2143 wbinvd();
2144 }
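/* Classify an EPT violation: MMIO accesses are emulated, log-dirty and
 * populate-on-demand faults fix up the p2m, and anything else crashes
 * the domain after a diagnostic dump. */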
2146 static void ept_handle_violation(unsigned long qualification, paddr_t gpa)
2147 {
2148 unsigned long gla_validity = qualification & EPT_GLA_VALIDITY_MASK;
2149 struct domain *d = current->domain;
2150 unsigned long gla, gfn = gpa >> PAGE_SHIFT;
2151 mfn_t mfn;
2152 p2m_type_t t;
2154 mfn = gfn_to_mfn_guest(d, gfn, &t);
2156 /* There are three legitimate reasons for taking an EPT violation.
2157 * One is a guest access to MMIO space. */
2158 if ( gla_validity == EPT_GLA_VALIDITY_MATCH && p2m_is_mmio(t) )
2159 {
2160 handle_mmio();
2161 return;
2162 }
2164 /* The second is log-dirty mode, writing to a read-only page;
2165 * The third is populating a populate-on-demand page. */
2166 if ( (gla_validity == EPT_GLA_VALIDITY_MATCH
2167 || gla_validity == EPT_GLA_VALIDITY_GPT_WALK)
2168 && p2m_is_ram(t) && (t != p2m_ram_ro) )
2169 {
2170 if ( paging_mode_log_dirty(d) )
2171 {
2172 paging_mark_dirty(d, mfn_x(mfn));
2173 p2m_change_type(d, gfn, p2m_ram_logdirty, p2m_ram_rw);
2174 flush_tlb_mask(&d->domain_dirty_cpumask);
2175 }
2176 return;
2177 }
2179 /* Everything else is an error. */
2180 gla = __vmread(GUEST_LINEAR_ADDRESS);
2181 gdprintk(XENLOG_ERR, "EPT violation %#lx (%c%c%c/%c%c%c), "
2182 "gpa %#"PRIpaddr", mfn %#lx, type %i.\n",
2183 qualification,
2184 (qualification & EPT_READ_VIOLATION) ? 'r' : '-',
2185 (qualification & EPT_WRITE_VIOLATION) ? 'w' : '-',
2186 (qualification & EPT_EXEC_VIOLATION) ? 'x' : '-',
2187 (qualification & EPT_EFFECTIVE_READ) ? 'r' : '-',
2188 (qualification & EPT_EFFECTIVE_WRITE) ? 'w' : '-',
2189 (qualification & EPT_EFFECTIVE_EXEC) ? 'x' : '-',
2190 gpa, mfn_x(mfn), t);
2192 if ( qualification & EPT_GAW_VIOLATION )
2193 gdprintk(XENLOG_ERR, " --- GPA too wide (max %u bits)\n",
2194 9 * (unsigned) d->arch.hvm_domain.vmx.ept_control.gaw + 21);
2196 switch ( gla_validity )
2197 {
2198 case EPT_GLA_VALIDITY_PDPTR_LOAD:
2199 gdprintk(XENLOG_ERR, " --- PDPTR load failed\n");
2200 break;
2201 case EPT_GLA_VALIDITY_GPT_WALK:
2202 gdprintk(XENLOG_ERR, " --- guest PT walk to %#lx failed\n", gla);
2203 break;
2204 case EPT_GLA_VALIDITY_RSVD:
2205 gdprintk(XENLOG_ERR, " --- GLA_validity 2 (reserved)\n");
2206 break;
2207 case EPT_GLA_VALIDITY_MATCH:
2208 gdprintk(XENLOG_ERR, " --- guest access to %#lx failed\n", gla);
2209 break;
2210 }
2212 domain_crash(d);
2213 }
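/* Report why hardware refused the VM entry, dump the VMCS for debugging,
 * and crash the domain. */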
2215 static void vmx_failed_vmentry(unsigned int exit_reason,
2216 struct cpu_user_regs *regs)
2217 {
2218 unsigned int failed_vmentry_reason = (uint16_t)exit_reason;
2219 unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
2220 struct vcpu *curr = current;
2222 printk("Failed vm entry (exit reason 0x%x) ", exit_reason);
2223 switch ( failed_vmentry_reason )
2224 {
2225 case EXIT_REASON_INVALID_GUEST_STATE:
2226 printk("caused by invalid guest state (%ld).\n", exit_qualification);
2227 break;
2228 case EXIT_REASON_MSR_LOADING:
2229 printk("caused by MSR entry %ld loading.\n", exit_qualification);
2230 break;
2231 case EXIT_REASON_MCE_DURING_VMENTRY:
2232 printk("caused by machine check.\n");
2233 HVMTRACE_0D(MCE);
2234 do_machine_check(regs);
2235 break;
2236 default:
2237 printk("reason not known yet!\n");
2238 break;
2239 }
2241 printk("************* VMCS Area **************\n");
2242 vmcs_dump_vcpu(curr);
2243 printk("**************************************\n");
2245 domain_crash(curr->domain);
2246 }
2248 asmlinkage void vmx_enter_realmode(struct cpu_user_regs *regs)
2249 {
2250 struct vcpu *v = current;
2252 /* Adjust RFLAGS to enter virtual 8086 mode with IOPL == 3. Since
2253 * we have CR4.VME == 1 and our own TSS with an empty interrupt
2254 * redirection bitmap, all software INTs will be handled by vm86 */
2255 v->arch.hvm_vmx.vm86_saved_eflags = regs->eflags;
2256 regs->eflags |= (X86_EFLAGS_VM | X86_EFLAGS_IOPL);
2257 }
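/* An intercepted #UD is handed to the HVM emulator; any exception the
 * emulator raises is reflected back into the guest. */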
2259 static void vmx_vmexit_ud_intercept(struct cpu_user_regs *regs)
2260 {
2261 struct hvm_emulate_ctxt ctxt;
2262 int rc;
2264 hvm_emulate_prepare(&ctxt, regs);
2266 rc = hvm_emulate_one(&ctxt);
2268 switch ( rc )
2269 {
2270 case X86EMUL_UNHANDLEABLE:
2271 gdprintk(XENLOG_WARNING,
2272 "instruction emulation failed @ %04x:%lx: "
2273 "%02x %02x %02x %02x %02x %02x\n",
2274 hvmemul_get_seg_reg(x86_seg_cs, &ctxt)->sel,
2275 ctxt.insn_buf_eip,
2276 ctxt.insn_buf[0], ctxt.insn_buf[1],
2277 ctxt.insn_buf[2], ctxt.insn_buf[3],
2278 ctxt.insn_buf[4], ctxt.insn_buf[5]);
2279 return;
2280 case X86EMUL_EXCEPTION:
2281 if ( ctxt.exn_pending )
2282 hvm_inject_exception(ctxt.exn_vector, ctxt.exn_error_code, 0);
2283 break;
2284 default:
2285 break;
2286 }
2288 hvm_emulate_writeback(&ctxt);
2289 }
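/* Main VM exit dispatcher: record the exit reason, service physical
 * interrupts before re-enabling IRQs, let the real-mode emulator claim
 * the exit if necessary, re-queue any event whose delivery was cut short,
 * and finally switch on the exit reason. */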
2291 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
2292 {
2293 unsigned int exit_reason, idtv_info;
2294 unsigned long exit_qualification, inst_len = 0;
2295 struct vcpu *v = current;
2297 if ( paging_mode_hap(v->domain) && hvm_paging_enabled(v) )
2298 v->arch.hvm_vcpu.guest_cr[3] = v->arch.hvm_vcpu.hw_cr[3] =
2299 __vmread(GUEST_CR3);
2301 exit_reason = __vmread(VM_EXIT_REASON);
2303 if ( hvm_long_mode_enabled(v) )
2304 HVMTRACE_ND(VMEXIT64, 1/*cycles*/, 3, exit_reason,
2305 (uint32_t)regs->eip, (uint32_t)((uint64_t)regs->eip >> 32),
2306 0, 0, 0);
2307 else
2308 HVMTRACE_ND(VMEXIT, 1/*cycles*/, 2, exit_reason,
2309 (uint32_t)regs->eip,
2310 0, 0, 0, 0);
2312 perfc_incra(vmexits, exit_reason);
2314 /* Handle the interrupt we missed before allowing any more in. */
2315 if ( exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT )
2316 vmx_do_extint(regs);
2318 /* Now enable interrupts so it's safe to take locks. */
2319 local_irq_enable();
2321 if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
2322 return vmx_failed_vmentry(exit_reason, regs);
2324 if ( v->arch.hvm_vmx.vmx_realmode )
2325 {
2326 unsigned int vector;
2328 /* Put RFLAGS back the way the guest wants it */
2329 regs->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IOPL);
2330 regs->eflags |= (v->arch.hvm_vmx.vm86_saved_eflags & X86_EFLAGS_IOPL);
2332 /* Unless this exit was for an interrupt, we've hit something
2333 * vm86 can't handle. Try again, using the emulator. */
2334 switch ( exit_reason )
2335 {
2336 case EXIT_REASON_EXCEPTION_NMI:
2337 vector = __vmread(VM_EXIT_INTR_INFO) & INTR_INFO_VECTOR_MASK;
2338 if ( vector != TRAP_page_fault
2339 && vector != TRAP_nmi
2340 && vector != TRAP_machine_check )
2341 {
2342 perfc_incr(realmode_exits);
2343 v->arch.hvm_vmx.vmx_emulate = 1;
2344 return;
2345 }
2346 case EXIT_REASON_EXTERNAL_INTERRUPT:
2347 case EXIT_REASON_INIT:
2348 case EXIT_REASON_SIPI:
2349 case EXIT_REASON_PENDING_VIRT_INTR:
2350 case EXIT_REASON_PENDING_VIRT_NMI:
2351 case EXIT_REASON_MCE_DURING_VMENTRY:
2352 break;
2353 default:
2354 v->arch.hvm_vmx.vmx_emulate = 1;
2355 perfc_incr(realmode_exits);
2356 return;
2357 }
2358 }
2360 hvm_maybe_deassert_evtchn_irq();
2362 /* Event delivery caused this intercept? Queue for redelivery. */
2363 idtv_info = __vmread(IDT_VECTORING_INFO);
2364 if ( unlikely(idtv_info & INTR_INFO_VALID_MASK) &&
2365 (exit_reason != EXIT_REASON_TASK_SWITCH) )
2366 {
2367 if ( hvm_event_needs_reinjection((idtv_info>>8)&7, idtv_info&0xff) )
2368 {
2369 /* See SDM 3B 25.7.1.1 and .2 for info about masking resvd bits. */
2370 __vmwrite(VM_ENTRY_INTR_INFO,
2371 idtv_info & ~INTR_INFO_RESVD_BITS_MASK);
2372 if ( idtv_info & INTR_INFO_DELIVER_CODE_MASK )
2373 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE,
2374 __vmread(IDT_VECTORING_ERROR_CODE));
2375 }
2377 /*
2378 * Clear NMI-blocking interruptibility info if an NMI delivery faulted.
2379 * Re-delivery will re-set it (see SDM 3B 25.7.1.2).
2380 */
2381 if ( (idtv_info & INTR_INFO_INTR_TYPE_MASK) == (X86_EVENTTYPE_NMI<<8) )
2382 __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
2383 __vmread(GUEST_INTERRUPTIBILITY_INFO) &
2384 ~VMX_INTR_SHADOW_NMI);
2385 }
2387 switch ( exit_reason )
2388 {
2389 case EXIT_REASON_EXCEPTION_NMI:
2390 {
2391 /*
2392 * We don't set the software-interrupt exiting (INT n).
2393 * (1) We can get an exception (e.g. #PG) in the guest, or
2394 * (2) NMI
2395 */
2396 unsigned int intr_info, vector;
2398 intr_info = __vmread(VM_EXIT_INTR_INFO);
2399 BUG_ON(!(intr_info & INTR_INFO_VALID_MASK));
2401 vector = intr_info & INTR_INFO_VECTOR_MASK;
2403 /*
2404 * Re-set the NMI shadow if vmexit caused by a guest IRET fault (see 3B
2405 * 25.7.1.2, "Resuming Guest Software after Handling an Exception").
2406 * (NB. If we emulate this IRET for any reason, we should re-clear!)
2407 */
2408 if ( unlikely(intr_info & INTR_INFO_NMI_UNBLOCKED_BY_IRET) &&
2409 !(idtv_info & INTR_INFO_VALID_MASK) &&
2410 (vector != TRAP_double_fault) )
2411 __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
2412 __vmread(GUEST_INTERRUPTIBILITY_INFO)
2413 | VMX_INTR_SHADOW_NMI);
2415 perfc_incra(cause_vector, vector);
2417 switch ( vector )
2418 {
2419 case TRAP_debug:
2420 /*
2421 * Updates DR6 where debugger can peek (See 3B 23.2.1,
2422 * Table 23-1, "Exit Qualification for Debug Exceptions").
2423 */
2424 exit_qualification = __vmread(EXIT_QUALIFICATION);
2425 write_debugreg(6, exit_qualification | 0xffff0ff0);
2426 if ( !v->domain->debugger_attached || cpu_has_monitor_trap_flag )
2427 goto exit_and_crash;
2428 domain_pause_for_debugger();
2429 break;
2430 case TRAP_int3:
2431 if ( !v->domain->debugger_attached )
2432 goto exit_and_crash;
2433 inst_len = __get_instruction_length(); /* Safe: INT3 */
2434 __update_guest_eip(inst_len);
2435 domain_pause_for_debugger();
2436 break;
2437 case TRAP_no_device:
2438 vmx_fpu_dirty_intercept();
2439 break;
2440 case TRAP_page_fault:
2441 exit_qualification = __vmread(EXIT_QUALIFICATION);
2442 regs->error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
2444 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2445 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
2446 (unsigned long)regs->eax, (unsigned long)regs->ebx,
2447 (unsigned long)regs->ecx, (unsigned long)regs->edx,
2448 (unsigned long)regs->esi, (unsigned long)regs->edi);
2450 if ( paging_fault(exit_qualification, regs) )
2451 {
2452 if ( trace_will_trace_event(TRC_SHADOW) )
2453 break;
2454 if ( hvm_long_mode_enabled(v) )
2455 HVMTRACE_LONG_2D(PF_XEN, regs->error_code,
2456 TRC_PAR_LONG(exit_qualification) );
2457 else
2458 HVMTRACE_2D(PF_XEN,
2459 regs->error_code, exit_qualification );
2460 break;
2461 }
2463 v->arch.hvm_vcpu.guest_cr[2] = exit_qualification;
2464 vmx_inject_hw_exception(TRAP_page_fault, regs->error_code);
2465 break;
2466 case TRAP_nmi:
2467 if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) !=
2468 (X86_EVENTTYPE_NMI << 8) )
2469 goto exit_and_crash;
2470 HVMTRACE_0D(NMI);
2471 do_nmi(regs); /* Real NMI, vector 2: normal processing. */
2472 break;
2473 case TRAP_machine_check:
2474 HVMTRACE_0D(MCE);
2475 do_machine_check(regs);
2476 break;
2477 case TRAP_invalid_op:
2478 vmx_vmexit_ud_intercept(regs);
2479 break;
2480 default:
2481 goto exit_and_crash;
2482 }
2483 break;
2484 }
2485 case EXIT_REASON_EXTERNAL_INTERRUPT:
2486 /* Already handled above. */
2487 break;
2488 case EXIT_REASON_TRIPLE_FAULT:
2489 hvm_triple_fault();
2490 break;
2491 case EXIT_REASON_PENDING_VIRT_INTR:
2492 /* Disable the interrupt window. */
2493 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2494 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2495 v->arch.hvm_vmx.exec_control);
2496 break;
2497 case EXIT_REASON_PENDING_VIRT_NMI:
2498 /* Disable the NMI window. */
2499 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
2500 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2501 v->arch.hvm_vmx.exec_control);
2502 break;
2503 case EXIT_REASON_TASK_SWITCH: {
2504 const enum hvm_task_switch_reason reasons[] = {
2505 TSW_call_or_int, TSW_iret, TSW_jmp, TSW_call_or_int };
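/* Exit qualification for a task switch: bits 15:0 hold the target TSS
 * selector, bits 31:30 the switch source (CALL/INT, IRET, JMP, or an
 * IDT task gate), indexing the reasons[] table above. */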
2506 int32_t errcode = -1;
2507 exit_qualification = __vmread(EXIT_QUALIFICATION);
2508 if ( (idtv_info & INTR_INFO_VALID_MASK) &&
2509 (idtv_info & INTR_INFO_DELIVER_CODE_MASK) )
2510 errcode = __vmread(IDT_VECTORING_ERROR_CODE);
2511 hvm_task_switch((uint16_t)exit_qualification,
2512 reasons[(exit_qualification >> 30) & 3],
2513 errcode);
2514 break;
2515 }
2516 case EXIT_REASON_CPUID:
2517 inst_len = __get_instruction_length(); /* Safe: CPUID */
2518 __update_guest_eip(inst_len);
2519 vmx_do_cpuid(regs);
2520 break;
2521 case EXIT_REASON_HLT:
2522 inst_len = __get_instruction_length(); /* Safe: HLT */
2523 __update_guest_eip(inst_len);
2524 hvm_hlt(regs->eflags);
2525 break;
2526 case EXIT_REASON_INVLPG:
2527 {
2528 inst_len = __get_instruction_length(); /* Safe: INVLPG */
2529 __update_guest_eip(inst_len);
2530 exit_qualification = __vmread(EXIT_QUALIFICATION);
2531 vmx_invlpg_intercept(exit_qualification);
2532 break;
2533 }
2534 case EXIT_REASON_RDTSC:
2535 inst_len = __get_instruction_length();
2536 __update_guest_eip(inst_len);
2537 hvm_rdtsc_intercept(regs);
2538 break;
2539 case EXIT_REASON_VMCALL:
2540 {
2541 int rc;
2542 HVMTRACE_1D(VMMCALL, regs->eax);
2543 inst_len = __get_instruction_length(); /* Safe: VMCALL */
2544 rc = hvm_do_hypercall(regs);
2545 if ( rc != HVM_HCALL_preempted )
2546 {
2547 __update_guest_eip(inst_len);
2548 if ( rc == HVM_HCALL_invalidate )
2549 send_invalidate_req();
2550 }
2551 break;
2552 }
2553 case EXIT_REASON_CR_ACCESS:
2554 {
2555 exit_qualification = __vmread(EXIT_QUALIFICATION);
2556 inst_len = __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
2557 if ( vmx_cr_access(exit_qualification, regs) )
2558 __update_guest_eip(inst_len);
2559 break;
2560 }
2561 case EXIT_REASON_DR_ACCESS:
2562 exit_qualification = __vmread(EXIT_QUALIFICATION);
2563 vmx_dr_access(exit_qualification, regs);
2564 break;
2565 case EXIT_REASON_MSR_READ:
2566 inst_len = __get_instruction_length(); /* Safe: RDMSR */
2567 if ( hvm_msr_read_intercept(regs) == X86EMUL_OKAY )
2568 __update_guest_eip(inst_len);
2569 break;
2570 case EXIT_REASON_MSR_WRITE:
2571 inst_len = __get_instruction_length(); /* Safe: WRMSR */
2572 if ( hvm_msr_write_intercept(regs) == X86EMUL_OKAY )
2573 __update_guest_eip(inst_len);
2574 break;
2576 case EXIT_REASON_MWAIT_INSTRUCTION:
2577 case EXIT_REASON_MONITOR_INSTRUCTION:
2578 case EXIT_REASON_VMCLEAR:
2579 case EXIT_REASON_VMLAUNCH:
2580 case EXIT_REASON_VMPTRLD:
2581 case EXIT_REASON_VMPTRST:
2582 case EXIT_REASON_VMREAD:
2583 case EXIT_REASON_VMRESUME:
2584 case EXIT_REASON_VMWRITE:
2585 case EXIT_REASON_VMXOFF:
2586 case EXIT_REASON_VMXON:
2587 vmx_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
2588 break;
2590 case EXIT_REASON_TPR_BELOW_THRESHOLD:
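/* Nothing to do here: pending interrupts are re-evaluated on the
 * path back into the guest. */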
2591 break;
2593 case EXIT_REASON_IO_INSTRUCTION:
2594 case EXIT_REASON_APIC_ACCESS:
2595 if ( !handle_mmio() )
2596 vmx_inject_hw_exception(TRAP_gp_fault, 0);
2597 break;
2599 case EXIT_REASON_INVD:
2600 case EXIT_REASON_WBINVD:
2601 {
2602 inst_len = __get_instruction_length(); /* Safe: INVD, WBINVD */
2603 __update_guest_eip(inst_len);
2604 vmx_wbinvd_intercept();
2605 break;
2606 }
2608 case EXIT_REASON_EPT_VIOLATION:
2609 {
2610 paddr_t gpa = __vmread(GUEST_PHYSICAL_ADDRESS);
2611 #ifdef __i386__
2612 gpa |= (paddr_t)__vmread(GUEST_PHYSICAL_ADDRESS_HIGH) << 32;
2613 #endif
2614 exit_qualification = __vmread(EXIT_QUALIFICATION);
2615 ept_handle_violation(exit_qualification, gpa);
2616 break;
2617 }
2619 case EXIT_REASON_MONITOR_TRAP_FLAG:
2620 {
2621 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
2622 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
2623 if ( v->domain->debugger_attached && v->arch.hvm_vcpu.single_step )
2624 domain_pause_for_debugger();
2625 break;
2626 }
2628 default:
2629 exit_and_crash:
2630 gdprintk(XENLOG_ERR, "Bad vmexit (reason %x)\n", exit_reason);
2631 domain_crash(v->domain);
2632 break;
2633 }
2634 }
2636 asmlinkage void vmx_trace_vmentry(void)
2637 {
2638 HVMTRACE_ND (VMENTRY, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0);
2639 }
2641 /*
2642 * Local variables:
2643 * mode: C
2644 * c-set-style: "BSD"
2645 * c-basic-offset: 4
2646 * tab-width: 4
2647 * indent-tabs-mode: nil
2648 * End:
2649 */