ia64/xen-unstable: xen/arch/x86/hvm/vmx/vmx.c @ changeset 18788:07d0be88571f

hvm: fix single stepping on debugger

Without this fix, the debuggee domain dies with an unexpected trap
when single-stepping over an emulated instruction.

Signed-off-by: Kouya Shimura <kouya@jp.fujitsu.com>
Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
Author:   Keir Fraser <keir.fraser@citrix.com>
Date:     Tue Nov 11 11:47:03 2008 +0000 (2008-11-11)
Parents:  7be8e7eefbd7
Children: 6595393a3d28
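For orientation before the listing: in this revision, __update_guest_eip() (line 1310 below) injects a debug trap into the guest when EFLAGS.TF is set after the emulator has advanced EIP past an instruction, and vmx_inject_hw_exception() (line 1118 below) routes #DB and #BP to an attached debugger via domain_pause_for_debugger(), first restoring the guest debug registers and setting the single-step (BS) bit in DR6 when EFLAGS.TF is set. Below is a condensed sketch of the EIP-update path, with editorial comments added and the interrupt-shadow handling elided; it is not a verbatim quote of the listing.

    static void __update_guest_eip(unsigned long inst_len)
    {
        struct cpu_user_regs *regs = guest_cpu_user_regs();

        /* Step over the instruction that was just emulated. */
        regs->eip += inst_len;
        regs->eflags &= ~X86_EFLAGS_RF;

        /* ... clearing of the STI/MOV-SS interrupt shadow elided ... */

        /* The guest was single-stepping (TF set): reflect a #DB so the
         * debugger sees the expected trap rather than the domain dying. */
        if ( regs->eflags & X86_EFLAGS_TF )
            vmx_inject_hw_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE);
    }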
1 /*
2 * vmx.c: handling VMX architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 */
19 #include <xen/config.h>
20 #include <xen/init.h>
21 #include <xen/lib.h>
22 #include <xen/trace.h>
23 #include <xen/sched.h>
24 #include <xen/irq.h>
25 #include <xen/softirq.h>
26 #include <xen/domain_page.h>
27 #include <xen/hypercall.h>
28 #include <xen/perfc.h>
29 #include <asm/current.h>
30 #include <asm/io.h>
31 #include <asm/regs.h>
32 #include <asm/cpufeature.h>
33 #include <asm/processor.h>
34 #include <asm/types.h>
35 #include <asm/debugreg.h>
36 #include <asm/msr.h>
37 #include <asm/spinlock.h>
38 #include <asm/paging.h>
39 #include <asm/p2m.h>
40 #include <asm/hvm/hvm.h>
41 #include <asm/hvm/support.h>
42 #include <asm/hvm/vmx/vmx.h>
43 #include <asm/hvm/vmx/vmcs.h>
44 #include <public/sched.h>
45 #include <public/hvm/ioreq.h>
46 #include <asm/hvm/vpic.h>
47 #include <asm/hvm/vlapic.h>
48 #include <asm/x86_emulate.h>
49 #include <asm/hvm/vpt.h>
50 #include <public/hvm/save.h>
51 #include <asm/hvm/trace.h>
52 #include <asm/xenoprof.h>
54 enum handler_return { HNDL_done, HNDL_unhandled, HNDL_exception_raised };
56 static void vmx_ctxt_switch_from(struct vcpu *v);
57 static void vmx_ctxt_switch_to(struct vcpu *v);
59 static int vmx_alloc_vlapic_mapping(struct domain *d);
60 static void vmx_free_vlapic_mapping(struct domain *d);
61 static int vmx_alloc_vpid(struct domain *d);
62 static void vmx_free_vpid(struct domain *d);
63 static void vmx_install_vlapic_mapping(struct vcpu *v);
64 static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr);
65 static void vmx_update_guest_efer(struct vcpu *v);
66 static void vmx_cpuid_intercept(
67 unsigned int *eax, unsigned int *ebx,
68 unsigned int *ecx, unsigned int *edx);
69 static void vmx_wbinvd_intercept(void);
70 static void vmx_fpu_dirty_intercept(void);
71 static int vmx_msr_read_intercept(struct cpu_user_regs *regs);
72 static int vmx_msr_write_intercept(struct cpu_user_regs *regs);
73 static void vmx_invlpg_intercept(unsigned long vaddr);
75 static int vmx_domain_initialise(struct domain *d)
76 {
77 int rc;
79 d->arch.hvm_domain.vmx.ept_control.etmt = EPT_DEFAULT_MT;
80 d->arch.hvm_domain.vmx.ept_control.gaw = EPT_DEFAULT_GAW;
81 d->arch.hvm_domain.vmx.ept_control.asr =
82 pagetable_get_pfn(d->arch.phys_table);
84 if ( (rc = vmx_alloc_vpid(d)) != 0 )
85 return rc;
87 if ( (rc = vmx_alloc_vlapic_mapping(d)) != 0 )
88 {
89 vmx_free_vpid(d);
90 return rc;
91 }
93 return 0;
94 }
96 static void vmx_domain_destroy(struct domain *d)
97 {
98 ept_sync_domain(d);
99 vmx_free_vlapic_mapping(d);
100 vmx_free_vpid(d);
101 }
103 static int vmx_vcpu_initialise(struct vcpu *v)
104 {
105 int rc;
107 spin_lock_init(&v->arch.hvm_vmx.vmcs_lock);
109 v->arch.schedule_tail = vmx_do_resume;
110 v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
111 v->arch.ctxt_switch_to = vmx_ctxt_switch_to;
113 if ( (rc = vmx_create_vmcs(v)) != 0 )
114 {
115 dprintk(XENLOG_WARNING,
116 "Failed to create VMCS for vcpu %d: err=%d.\n",
117 v->vcpu_id, rc);
118 return rc;
119 }
121 vpmu_initialise(v);
123 vmx_install_vlapic_mapping(v);
125 /* %eax == 1 signals full real-mode support to the guest loader. */
126 if ( v->vcpu_id == 0 )
127 v->arch.guest_context.user_regs.eax = 1;
129 return 0;
130 }
132 static void vmx_vcpu_destroy(struct vcpu *v)
133 {
134 vmx_destroy_vmcs(v);
135 vpmu_destroy(v);
136 passive_domain_destroy(v);
137 }
139 #ifdef __x86_64__
141 static DEFINE_PER_CPU(struct vmx_msr_state, host_msr_state);
143 static u32 msr_index[VMX_MSR_COUNT] =
144 {
145 MSR_LSTAR, MSR_STAR, MSR_SYSCALL_MASK
146 };
148 static void vmx_save_host_msrs(void)
149 {
150 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
151 int i;
153 for ( i = 0; i < VMX_MSR_COUNT; i++ )
154 rdmsrl(msr_index[i], host_msr_state->msrs[i]);
155 }
157 #define WRITE_MSR(address) \
158 guest_msr_state->msrs[VMX_INDEX_MSR_ ## address] = msr_content; \
159 set_bit(VMX_INDEX_MSR_ ## address, &guest_msr_state->flags); \
160 wrmsrl(MSR_ ## address, msr_content); \
161 set_bit(VMX_INDEX_MSR_ ## address, &host_msr_state->flags); \
162 break
164 static enum handler_return long_mode_do_msr_read(struct cpu_user_regs *regs)
165 {
166 u64 msr_content = 0;
167 u32 ecx = regs->ecx;
168 struct vcpu *v = current;
169 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
171 switch ( ecx )
172 {
173 case MSR_EFER:
174 msr_content = v->arch.hvm_vcpu.guest_efer;
175 break;
177 case MSR_FS_BASE:
178 msr_content = __vmread(GUEST_FS_BASE);
179 goto check_long_mode;
181 case MSR_GS_BASE:
182 msr_content = __vmread(GUEST_GS_BASE);
183 goto check_long_mode;
185 case MSR_SHADOW_GS_BASE:
186 msr_content = v->arch.hvm_vmx.shadow_gs;
187 check_long_mode:
188 if ( !(hvm_long_mode_enabled(v)) )
189 {
190 vmx_inject_hw_exception(TRAP_gp_fault, 0);
191 return HNDL_exception_raised;
192 }
193 break;
195 case MSR_STAR:
196 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_STAR];
197 break;
199 case MSR_LSTAR:
200 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_LSTAR];
201 break;
203 case MSR_CSTAR:
204 msr_content = v->arch.hvm_vmx.cstar;
205 break;
207 case MSR_SYSCALL_MASK:
208 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
209 break;
211 default:
212 return HNDL_unhandled;
213 }
215 HVM_DBG_LOG(DBG_LEVEL_0, "msr 0x%x content 0x%"PRIx64, ecx, msr_content);
217 regs->eax = (u32)(msr_content >> 0);
218 regs->edx = (u32)(msr_content >> 32);
220 return HNDL_done;
221 }
223 static enum handler_return long_mode_do_msr_write(struct cpu_user_regs *regs)
224 {
225 u64 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
226 u32 ecx = regs->ecx;
227 struct vcpu *v = current;
228 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
229 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
231 HVM_DBG_LOG(DBG_LEVEL_0, "msr 0x%x content 0x%"PRIx64, ecx, msr_content);
233 switch ( ecx )
234 {
235 case MSR_EFER:
236 if ( hvm_set_efer(msr_content) )
237 goto exception_raised;
238 break;
240 case MSR_FS_BASE:
241 case MSR_GS_BASE:
242 case MSR_SHADOW_GS_BASE:
243 if ( !hvm_long_mode_enabled(v) )
244 goto gp_fault;
246 if ( !is_canonical_address(msr_content) )
247 goto uncanonical_address;
249 if ( ecx == MSR_FS_BASE )
250 __vmwrite(GUEST_FS_BASE, msr_content);
251 else if ( ecx == MSR_GS_BASE )
252 __vmwrite(GUEST_GS_BASE, msr_content);
253 else
254 {
255 v->arch.hvm_vmx.shadow_gs = msr_content;
256 wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
257 }
259 break;
261 case MSR_STAR:
262 WRITE_MSR(STAR);
264 case MSR_LSTAR:
265 if ( !is_canonical_address(msr_content) )
266 goto uncanonical_address;
267 WRITE_MSR(LSTAR);
269 case MSR_CSTAR:
270 if ( !is_canonical_address(msr_content) )
271 goto uncanonical_address;
272 v->arch.hvm_vmx.cstar = msr_content;
273 break;
275 case MSR_SYSCALL_MASK:
276 WRITE_MSR(SYSCALL_MASK);
278 default:
279 return HNDL_unhandled;
280 }
282 return HNDL_done;
284 uncanonical_address:
285 HVM_DBG_LOG(DBG_LEVEL_0, "Not cano address of msr write %x", ecx);
286 gp_fault:
287 vmx_inject_hw_exception(TRAP_gp_fault, 0);
288 exception_raised:
289 return HNDL_exception_raised;
290 }
292 /*
293 * To avoid MSR save/restore at every VM exit/entry time, we restore
294 * the x86_64 specific MSRs at domain switch time. Since these MSRs
295 * are not modified once set for para domains, we don't save them,
296 * but simply reset them to values set in percpu_traps_init().
297 */
298 static void vmx_restore_host_msrs(void)
299 {
300 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
301 int i;
303 while ( host_msr_state->flags )
304 {
305 i = find_first_set_bit(host_msr_state->flags);
306 wrmsrl(msr_index[i], host_msr_state->msrs[i]);
307 clear_bit(i, &host_msr_state->flags);
308 }
310 if ( cpu_has_nx && !(read_efer() & EFER_NX) )
311 write_efer(read_efer() | EFER_NX);
312 }
314 static void vmx_save_guest_msrs(struct vcpu *v)
315 {
316 /* MSR_SHADOW_GS_BASE may have been changed by swapgs instruction. */
317 rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
318 }
320 static void vmx_restore_guest_msrs(struct vcpu *v)
321 {
322 struct vmx_msr_state *guest_msr_state, *host_msr_state;
323 unsigned long guest_flags;
324 int i;
326 guest_msr_state = &v->arch.hvm_vmx.msr_state;
327 host_msr_state = &this_cpu(host_msr_state);
329 wrmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
331 guest_flags = guest_msr_state->flags;
333 while ( guest_flags )
334 {
335 i = find_first_set_bit(guest_flags);
337 HVM_DBG_LOG(DBG_LEVEL_2,
338 "restore guest's index %d msr %x with value %lx",
339 i, msr_index[i], guest_msr_state->msrs[i]);
340 set_bit(i, &host_msr_state->flags);
341 wrmsrl(msr_index[i], guest_msr_state->msrs[i]);
342 clear_bit(i, &guest_flags);
343 }
345 if ( (v->arch.hvm_vcpu.guest_efer ^ read_efer()) & (EFER_NX | EFER_SCE) )
346 {
347 HVM_DBG_LOG(DBG_LEVEL_2,
348 "restore guest's EFER with value %lx",
349 v->arch.hvm_vcpu.guest_efer);
350 write_efer((read_efer() & ~(EFER_NX | EFER_SCE)) |
351 (v->arch.hvm_vcpu.guest_efer & (EFER_NX | EFER_SCE)));
352 }
353 }
355 #else /* __i386__ */
357 #define vmx_save_host_msrs() ((void)0)
359 static void vmx_restore_host_msrs(void)
360 {
361 if ( cpu_has_nx && !(read_efer() & EFER_NX) )
362 write_efer(read_efer() | EFER_NX);
363 }
365 #define vmx_save_guest_msrs(v) ((void)0)
367 static void vmx_restore_guest_msrs(struct vcpu *v)
368 {
369 if ( (v->arch.hvm_vcpu.guest_efer ^ read_efer()) & EFER_NX )
370 {
371 HVM_DBG_LOG(DBG_LEVEL_2,
372 "restore guest's EFER with value %lx",
373 v->arch.hvm_vcpu.guest_efer);
374 write_efer((read_efer() & ~EFER_NX) |
375 (v->arch.hvm_vcpu.guest_efer & EFER_NX));
376 }
377 }
379 static enum handler_return long_mode_do_msr_read(struct cpu_user_regs *regs)
380 {
381 u64 msr_content = 0;
382 struct vcpu *v = current;
384 switch ( regs->ecx )
385 {
386 case MSR_EFER:
387 msr_content = v->arch.hvm_vcpu.guest_efer;
388 break;
390 default:
391 return HNDL_unhandled;
392 }
394 regs->eax = msr_content >> 0;
395 regs->edx = msr_content >> 32;
397 return HNDL_done;
398 }
400 static enum handler_return long_mode_do_msr_write(struct cpu_user_regs *regs)
401 {
402 u64 msr_content = regs->eax | ((u64)regs->edx << 32);
404 switch ( regs->ecx )
405 {
406 case MSR_EFER:
407 if ( hvm_set_efer(msr_content) )
408 return HNDL_exception_raised;
409 break;
411 default:
412 return HNDL_unhandled;
413 }
415 return HNDL_done;
416 }
418 #endif /* __i386__ */
420 static int vmx_guest_x86_mode(struct vcpu *v)
421 {
422 unsigned int cs_ar_bytes;
424 if ( unlikely(!(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE)) )
425 return 0;
426 if ( unlikely(guest_cpu_user_regs()->eflags & X86_EFLAGS_VM) )
427 return 1;
428 cs_ar_bytes = __vmread(GUEST_CS_AR_BYTES);
429 if ( hvm_long_mode_enabled(v) &&
430 likely(cs_ar_bytes & X86_SEG_AR_CS_LM_ACTIVE) )
431 return 8;
432 return (likely(cs_ar_bytes & X86_SEG_AR_DEF_OP_SIZE) ? 4 : 2);
433 }
435 static void vmx_save_dr(struct vcpu *v)
436 {
437 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
438 return;
440 /* Clear the DR dirty flag and re-enable intercepts for DR accesses. */
441 v->arch.hvm_vcpu.flag_dr_dirty = 0;
442 v->arch.hvm_vmx.exec_control |= CPU_BASED_MOV_DR_EXITING;
443 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
445 v->arch.guest_context.debugreg[0] = read_debugreg(0);
446 v->arch.guest_context.debugreg[1] = read_debugreg(1);
447 v->arch.guest_context.debugreg[2] = read_debugreg(2);
448 v->arch.guest_context.debugreg[3] = read_debugreg(3);
449 v->arch.guest_context.debugreg[6] = read_debugreg(6);
450 /* DR7 must be saved as it is used by vmx_restore_dr(). */
451 v->arch.guest_context.debugreg[7] = __vmread(GUEST_DR7);
452 }
454 static void __restore_debug_registers(struct vcpu *v)
455 {
456 if ( v->arch.hvm_vcpu.flag_dr_dirty )
457 return;
459 v->arch.hvm_vcpu.flag_dr_dirty = 1;
461 write_debugreg(0, v->arch.guest_context.debugreg[0]);
462 write_debugreg(1, v->arch.guest_context.debugreg[1]);
463 write_debugreg(2, v->arch.guest_context.debugreg[2]);
464 write_debugreg(3, v->arch.guest_context.debugreg[3]);
465 write_debugreg(6, v->arch.guest_context.debugreg[6]);
466 /* DR7 is loaded from the VMCS. */
467 }
469 /*
470 * DR7 is saved and restored on every vmexit. Other debug registers only
471 * need to be restored if their value is going to affect execution -- i.e.,
472 * if one of the breakpoints is enabled. So mask out all bits that don't
473 * enable some breakpoint functionality.
474 */
475 static void vmx_restore_dr(struct vcpu *v)
476 {
477 /* NB. __vmread() is not usable here, so we cannot read from the VMCS. */
478 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
479 __restore_debug_registers(v);
480 }
482 static void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
483 {
484 uint32_t ev;
486 vmx_vmcs_enter(v);
488 c->cr0 = v->arch.hvm_vcpu.guest_cr[0];
489 c->cr2 = v->arch.hvm_vcpu.guest_cr[2];
490 c->cr3 = v->arch.hvm_vcpu.guest_cr[3];
491 c->cr4 = v->arch.hvm_vcpu.guest_cr[4];
493 c->msr_efer = v->arch.hvm_vcpu.guest_efer;
495 c->sysenter_cs = __vmread(GUEST_SYSENTER_CS);
496 c->sysenter_esp = __vmread(GUEST_SYSENTER_ESP);
497 c->sysenter_eip = __vmread(GUEST_SYSENTER_EIP);
499 c->pending_event = 0;
500 c->error_code = 0;
501 if ( ((ev = __vmread(VM_ENTRY_INTR_INFO)) & INTR_INFO_VALID_MASK) &&
502 hvm_event_needs_reinjection((ev >> 8) & 7, ev & 0xff) )
503 {
504 c->pending_event = ev;
505 c->error_code = __vmread(VM_ENTRY_EXCEPTION_ERROR_CODE);
506 }
508 vmx_vmcs_exit(v);
509 }
511 static int vmx_restore_cr0_cr3(
512 struct vcpu *v, unsigned long cr0, unsigned long cr3)
513 {
514 unsigned long mfn = 0;
515 p2m_type_t p2mt;
517 if ( paging_mode_shadow(v->domain) )
518 {
519 if ( cr0 & X86_CR0_PG )
520 {
521 mfn = mfn_x(gfn_to_mfn(v->domain, cr3 >> PAGE_SHIFT, &p2mt));
522 if ( !p2m_is_ram(p2mt) || !get_page(mfn_to_page(mfn), v->domain) )
523 {
524 gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%lx\n", cr3);
525 return -EINVAL;
526 }
527 }
529 if ( hvm_paging_enabled(v) )
530 put_page(pagetable_get_page(v->arch.guest_table));
532 v->arch.guest_table = pagetable_from_pfn(mfn);
533 }
535 v->arch.hvm_vcpu.guest_cr[0] = cr0 | X86_CR0_ET;
536 v->arch.hvm_vcpu.guest_cr[3] = cr3;
538 return 0;
539 }
541 static int vmx_vmcs_restore(struct vcpu *v, struct hvm_hw_cpu *c)
542 {
543 int rc;
545 if ( c->pending_valid &&
546 ((c->pending_type == 1) || (c->pending_type > 6) ||
547 (c->pending_reserved != 0)) )
548 {
549 gdprintk(XENLOG_ERR, "Invalid pending event 0x%"PRIx32".\n",
550 c->pending_event);
551 return -EINVAL;
552 }
554 rc = vmx_restore_cr0_cr3(v, c->cr0, c->cr3);
555 if ( rc )
556 return rc;
558 vmx_vmcs_enter(v);
560 v->arch.hvm_vcpu.guest_cr[2] = c->cr2;
561 v->arch.hvm_vcpu.guest_cr[4] = c->cr4;
562 vmx_update_guest_cr(v, 0);
563 vmx_update_guest_cr(v, 2);
564 vmx_update_guest_cr(v, 4);
566 v->arch.hvm_vcpu.guest_efer = c->msr_efer;
567 vmx_update_guest_efer(v);
569 __vmwrite(GUEST_SYSENTER_CS, c->sysenter_cs);
570 __vmwrite(GUEST_SYSENTER_ESP, c->sysenter_esp);
571 __vmwrite(GUEST_SYSENTER_EIP, c->sysenter_eip);
573 __vmwrite(GUEST_DR7, c->dr7);
575 vmx_vmcs_exit(v);
577 paging_update_paging_modes(v);
579 if ( c->pending_valid )
580 {
581 gdprintk(XENLOG_INFO, "Re-injecting 0x%"PRIx32", 0x%"PRIx32"\n",
582 c->pending_event, c->error_code);
584 if ( hvm_event_needs_reinjection(c->pending_type, c->pending_vector) )
585 {
586 vmx_vmcs_enter(v);
587 __vmwrite(VM_ENTRY_INTR_INFO, c->pending_event);
588 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, c->error_code);
589 vmx_vmcs_exit(v);
590 }
591 }
593 return 0;
594 }
596 static void vmx_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
597 {
598 #ifdef __x86_64__
599 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
600 unsigned long guest_flags = guest_state->flags;
602 data->shadow_gs = v->arch.hvm_vmx.shadow_gs;
603 data->msr_cstar = v->arch.hvm_vmx.cstar;
605 /* save msrs */
606 data->msr_flags = guest_flags;
607 data->msr_lstar = guest_state->msrs[VMX_INDEX_MSR_LSTAR];
608 data->msr_star = guest_state->msrs[VMX_INDEX_MSR_STAR];
609 data->msr_syscall_mask = guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
610 #endif
612 data->tsc = hvm_get_guest_tsc(v);
613 }
615 static void vmx_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
616 {
617 #ifdef __x86_64__
618 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
620 /* restore msrs */
621 guest_state->flags = data->msr_flags & 7;
622 guest_state->msrs[VMX_INDEX_MSR_LSTAR] = data->msr_lstar;
623 guest_state->msrs[VMX_INDEX_MSR_STAR] = data->msr_star;
624 guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK] = data->msr_syscall_mask;
626 v->arch.hvm_vmx.cstar = data->msr_cstar;
627 v->arch.hvm_vmx.shadow_gs = data->shadow_gs;
628 #endif
630 hvm_set_guest_tsc(v, data->tsc);
631 }
634 static void vmx_save_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
635 {
636 vmx_save_cpu_state(v, ctxt);
637 vmx_vmcs_save(v, ctxt);
638 }
640 static int vmx_load_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
641 {
642 vmx_load_cpu_state(v, ctxt);
644 if ( vmx_vmcs_restore(v, ctxt) )
645 {
646 gdprintk(XENLOG_ERR, "vmx_vmcs restore failed!\n");
647 domain_crash(v->domain);
648 return -EINVAL;
649 }
651 return 0;
652 }
654 static void vmx_fpu_enter(struct vcpu *v)
655 {
656 setup_fpu(v);
657 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
658 v->arch.hvm_vmx.host_cr0 &= ~X86_CR0_TS;
659 __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
660 }
662 static void vmx_fpu_leave(struct vcpu *v)
663 {
664 ASSERT(!v->fpu_dirtied);
665 ASSERT(read_cr0() & X86_CR0_TS);
667 if ( !(v->arch.hvm_vmx.host_cr0 & X86_CR0_TS) )
668 {
669 v->arch.hvm_vmx.host_cr0 |= X86_CR0_TS;
670 __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
671 }
673 /*
674 * If the guest does not have TS enabled then we must cause and handle an
675 * exception on first use of the FPU. If the guest *does* have TS enabled
676 * then this is not necessary: no FPU activity can occur until the guest
677 * clears CR0.TS, and we will initialise the FPU when that happens.
678 */
679 if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
680 {
681 v->arch.hvm_vcpu.hw_cr[0] |= X86_CR0_TS;
682 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
683 __vm_set_bit(EXCEPTION_BITMAP, TRAP_no_device);
684 }
685 }
687 static void vmx_ctxt_switch_from(struct vcpu *v)
688 {
689 vmx_fpu_leave(v);
690 vmx_save_guest_msrs(v);
691 vmx_restore_host_msrs();
692 vmx_save_dr(v);
693 vpmu_save(v);
694 }
696 static void vmx_ctxt_switch_to(struct vcpu *v)
697 {
698 /* HOST_CR4 in VMCS is always mmu_cr4_features. Sync CR4 now. */
699 if ( unlikely(read_cr4() != mmu_cr4_features) )
700 write_cr4(mmu_cr4_features);
702 vmx_restore_guest_msrs(v);
703 vmx_restore_dr(v);
704 vpmu_load(v);
705 }
707 static void vmx_get_segment_register(struct vcpu *v, enum x86_segment seg,
708 struct segment_register *reg)
709 {
710 uint32_t attr = 0;
712 vmx_vmcs_enter(v);
714 switch ( seg )
715 {
716 case x86_seg_cs:
717 reg->sel = __vmread(GUEST_CS_SELECTOR);
718 reg->limit = __vmread(GUEST_CS_LIMIT);
719 reg->base = __vmread(GUEST_CS_BASE);
720 attr = __vmread(GUEST_CS_AR_BYTES);
721 break;
722 case x86_seg_ds:
723 reg->sel = __vmread(GUEST_DS_SELECTOR);
724 reg->limit = __vmread(GUEST_DS_LIMIT);
725 reg->base = __vmread(GUEST_DS_BASE);
726 attr = __vmread(GUEST_DS_AR_BYTES);
727 break;
728 case x86_seg_es:
729 reg->sel = __vmread(GUEST_ES_SELECTOR);
730 reg->limit = __vmread(GUEST_ES_LIMIT);
731 reg->base = __vmread(GUEST_ES_BASE);
732 attr = __vmread(GUEST_ES_AR_BYTES);
733 break;
734 case x86_seg_fs:
735 reg->sel = __vmread(GUEST_FS_SELECTOR);
736 reg->limit = __vmread(GUEST_FS_LIMIT);
737 reg->base = __vmread(GUEST_FS_BASE);
738 attr = __vmread(GUEST_FS_AR_BYTES);
739 break;
740 case x86_seg_gs:
741 reg->sel = __vmread(GUEST_GS_SELECTOR);
742 reg->limit = __vmread(GUEST_GS_LIMIT);
743 reg->base = __vmread(GUEST_GS_BASE);
744 attr = __vmread(GUEST_GS_AR_BYTES);
745 break;
746 case x86_seg_ss:
747 reg->sel = __vmread(GUEST_SS_SELECTOR);
748 reg->limit = __vmread(GUEST_SS_LIMIT);
749 reg->base = __vmread(GUEST_SS_BASE);
750 attr = __vmread(GUEST_SS_AR_BYTES);
751 break;
752 case x86_seg_tr:
753 reg->sel = __vmread(GUEST_TR_SELECTOR);
754 reg->limit = __vmread(GUEST_TR_LIMIT);
755 reg->base = __vmread(GUEST_TR_BASE);
756 attr = __vmread(GUEST_TR_AR_BYTES);
757 break;
758 case x86_seg_gdtr:
759 reg->limit = __vmread(GUEST_GDTR_LIMIT);
760 reg->base = __vmread(GUEST_GDTR_BASE);
761 break;
762 case x86_seg_idtr:
763 reg->limit = __vmread(GUEST_IDTR_LIMIT);
764 reg->base = __vmread(GUEST_IDTR_BASE);
765 break;
766 case x86_seg_ldtr:
767 reg->sel = __vmread(GUEST_LDTR_SELECTOR);
768 reg->limit = __vmread(GUEST_LDTR_LIMIT);
769 reg->base = __vmread(GUEST_LDTR_BASE);
770 attr = __vmread(GUEST_LDTR_AR_BYTES);
771 break;
772 default:
773 BUG();
774 }
776 vmx_vmcs_exit(v);
778 reg->attr.bytes = (attr & 0xff) | ((attr >> 4) & 0xf00);
779 /* Unusable flag is folded into Present flag. */
780 if ( attr & (1u<<16) )
781 reg->attr.fields.p = 0;
782 }
784 static void vmx_set_segment_register(struct vcpu *v, enum x86_segment seg,
785 struct segment_register *reg)
786 {
787 uint32_t attr;
789 attr = reg->attr.bytes;
790 attr = ((attr & 0xf00) << 4) | (attr & 0xff);
792 /* Not-present must mean unusable. */
793 if ( !reg->attr.fields.p )
794 attr |= (1u << 16);
796 /* VMX has strict consistency requirement for flag G. */
797 attr |= !!(reg->limit >> 20) << 15;
799 vmx_vmcs_enter(v);
801 switch ( seg )
802 {
803 case x86_seg_cs:
804 __vmwrite(GUEST_CS_SELECTOR, reg->sel);
805 __vmwrite(GUEST_CS_LIMIT, reg->limit);
806 __vmwrite(GUEST_CS_BASE, reg->base);
807 __vmwrite(GUEST_CS_AR_BYTES, attr);
808 break;
809 case x86_seg_ds:
810 __vmwrite(GUEST_DS_SELECTOR, reg->sel);
811 __vmwrite(GUEST_DS_LIMIT, reg->limit);
812 __vmwrite(GUEST_DS_BASE, reg->base);
813 __vmwrite(GUEST_DS_AR_BYTES, attr);
814 break;
815 case x86_seg_es:
816 __vmwrite(GUEST_ES_SELECTOR, reg->sel);
817 __vmwrite(GUEST_ES_LIMIT, reg->limit);
818 __vmwrite(GUEST_ES_BASE, reg->base);
819 __vmwrite(GUEST_ES_AR_BYTES, attr);
820 break;
821 case x86_seg_fs:
822 __vmwrite(GUEST_FS_SELECTOR, reg->sel);
823 __vmwrite(GUEST_FS_LIMIT, reg->limit);
824 __vmwrite(GUEST_FS_BASE, reg->base);
825 __vmwrite(GUEST_FS_AR_BYTES, attr);
826 break;
827 case x86_seg_gs:
828 __vmwrite(GUEST_GS_SELECTOR, reg->sel);
829 __vmwrite(GUEST_GS_LIMIT, reg->limit);
830 __vmwrite(GUEST_GS_BASE, reg->base);
831 __vmwrite(GUEST_GS_AR_BYTES, attr);
832 break;
833 case x86_seg_ss:
834 __vmwrite(GUEST_SS_SELECTOR, reg->sel);
835 __vmwrite(GUEST_SS_LIMIT, reg->limit);
836 __vmwrite(GUEST_SS_BASE, reg->base);
837 __vmwrite(GUEST_SS_AR_BYTES, attr);
838 break;
839 case x86_seg_tr:
840 __vmwrite(GUEST_TR_SELECTOR, reg->sel);
841 __vmwrite(GUEST_TR_LIMIT, reg->limit);
842 __vmwrite(GUEST_TR_BASE, reg->base);
843 /* VMX checks that the busy flag (bit 1) is set. */
844 __vmwrite(GUEST_TR_AR_BYTES, attr | 2);
845 break;
846 case x86_seg_gdtr:
847 __vmwrite(GUEST_GDTR_LIMIT, reg->limit);
848 __vmwrite(GUEST_GDTR_BASE, reg->base);
849 break;
850 case x86_seg_idtr:
851 __vmwrite(GUEST_IDTR_LIMIT, reg->limit);
852 __vmwrite(GUEST_IDTR_BASE, reg->base);
853 break;
854 case x86_seg_ldtr:
855 __vmwrite(GUEST_LDTR_SELECTOR, reg->sel);
856 __vmwrite(GUEST_LDTR_LIMIT, reg->limit);
857 __vmwrite(GUEST_LDTR_BASE, reg->base);
858 __vmwrite(GUEST_LDTR_AR_BYTES, attr);
859 break;
860 default:
861 BUG();
862 }
864 vmx_vmcs_exit(v);
865 }
867 static void vmx_set_tsc_offset(struct vcpu *v, u64 offset)
868 {
869 vmx_vmcs_enter(v);
870 __vmwrite(TSC_OFFSET, offset);
871 #if defined (__i386__)
872 __vmwrite(TSC_OFFSET_HIGH, offset >> 32);
873 #endif
874 vmx_vmcs_exit(v);
875 }
877 void do_nmi(struct cpu_user_regs *);
879 static void vmx_init_hypercall_page(struct domain *d, void *hypercall_page)
880 {
881 char *p;
882 int i;
884 for ( i = 0; i < (PAGE_SIZE / 32); i++ )
885 {
886 p = (char *)(hypercall_page + (i * 32));
887 *(u8 *)(p + 0) = 0xb8; /* mov imm32, %eax */
888 *(u32 *)(p + 1) = i;
889 *(u8 *)(p + 5) = 0x0f; /* vmcall */
890 *(u8 *)(p + 6) = 0x01;
891 *(u8 *)(p + 7) = 0xc1;
892 *(u8 *)(p + 8) = 0xc3; /* ret */
893 }
895 /* Don't support HYPERVISOR_iret at the moment */
896 *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
897 }
899 static unsigned int vmx_get_interrupt_shadow(struct vcpu *v)
900 {
901 return __vmread(GUEST_INTERRUPTIBILITY_INFO);
902 }
904 static void vmx_set_interrupt_shadow(struct vcpu *v, unsigned int intr_shadow)
905 {
906 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, intr_shadow);
907 }
909 static void vmx_load_pdptrs(struct vcpu *v)
910 {
911 unsigned long cr3 = v->arch.hvm_vcpu.guest_cr[3], mfn;
912 uint64_t *guest_pdptrs;
913 p2m_type_t p2mt;
914 char *p;
916 /* EPT needs to load PDPTRS into VMCS for PAE. */
917 if ( !hvm_pae_enabled(v) || (v->arch.hvm_vcpu.guest_efer & EFER_LMA) )
918 return;
920 if ( cr3 & 0x1fUL )
921 goto crash;
923 mfn = mfn_x(gfn_to_mfn(v->domain, cr3 >> PAGE_SHIFT, &p2mt));
924 if ( !p2m_is_ram(p2mt) )
925 goto crash;
927 p = map_domain_page(mfn);
929 guest_pdptrs = (uint64_t *)(p + (cr3 & ~PAGE_MASK));
931 /*
932 * We do not check the PDPTRs for validity. The CPU will do this during
933 * vm entry, and we can handle the failure there and crash the guest.
934 * The only thing we could do better here is #GP instead.
935 */
937 vmx_vmcs_enter(v);
939 __vmwrite(GUEST_PDPTR0, guest_pdptrs[0]);
940 __vmwrite(GUEST_PDPTR1, guest_pdptrs[1]);
941 __vmwrite(GUEST_PDPTR2, guest_pdptrs[2]);
942 __vmwrite(GUEST_PDPTR3, guest_pdptrs[3]);
943 #ifdef __i386__
944 __vmwrite(GUEST_PDPTR0_HIGH, guest_pdptrs[0] >> 32);
945 __vmwrite(GUEST_PDPTR1_HIGH, guest_pdptrs[1] >> 32);
946 __vmwrite(GUEST_PDPTR2_HIGH, guest_pdptrs[2] >> 32);
947 __vmwrite(GUEST_PDPTR3_HIGH, guest_pdptrs[3] >> 32);
948 #endif
950 vmx_vmcs_exit(v);
952 unmap_domain_page(p);
953 return;
955 crash:
956 domain_crash(v->domain);
957 }
959 static void vmx_update_host_cr3(struct vcpu *v)
960 {
961 vmx_vmcs_enter(v);
962 __vmwrite(HOST_CR3, v->arch.cr3);
963 vmx_vmcs_exit(v);
964 }
966 static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr)
967 {
968 vmx_vmcs_enter(v);
970 switch ( cr )
971 {
972 case 0: {
973 unsigned long hw_cr0_mask =
974 X86_CR0_NE | X86_CR0_PG | X86_CR0_PE;
976 if ( paging_mode_shadow(v->domain) )
977 hw_cr0_mask |= X86_CR0_WP;
979 if ( paging_mode_hap(v->domain) )
980 {
981 /* We manage GUEST_CR3 when guest CR0.PE is zero. */
982 uint32_t cr3_ctls = (CPU_BASED_CR3_LOAD_EXITING |
983 CPU_BASED_CR3_STORE_EXITING);
984 v->arch.hvm_vmx.exec_control &= ~cr3_ctls;
985 if ( !hvm_paging_enabled(v) )
986 v->arch.hvm_vmx.exec_control |= cr3_ctls;
987 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
989 /* Changing CR0.PE can change some bits in real CR4. */
990 vmx_update_guest_cr(v, 4);
991 }
993 if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
994 {
995 if ( v != current )
996 hw_cr0_mask |= X86_CR0_TS;
997 else if ( v->arch.hvm_vcpu.hw_cr[0] & X86_CR0_TS )
998 vmx_fpu_enter(v);
999 }
1001 v->arch.hvm_vmx.vmxemul &= ~VMXEMUL_REALMODE;
1002 if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) )
1003 v->arch.hvm_vmx.vmxemul |= VMXEMUL_REALMODE;
1005 v->arch.hvm_vcpu.hw_cr[0] =
1006 v->arch.hvm_vcpu.guest_cr[0] | hw_cr0_mask;
1007 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
1008 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[0]);
1009 break;
1010 }
1011 case 2:
1012 /* CR2 is updated in exit stub. */
1013 break;
1014 case 3:
1015 if ( paging_mode_hap(v->domain) )
1016 {
1017 if ( !hvm_paging_enabled(v) )
1018 v->arch.hvm_vcpu.hw_cr[3] =
1019 v->domain->arch.hvm_domain.params[HVM_PARAM_IDENT_PT];
1020 vmx_load_pdptrs(v);
1021 }
1023 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr[3]);
1024 vpid_sync_vcpu_all(v);
1025 break;
1026 case 4:
1027 v->arch.hvm_vcpu.hw_cr[4] = HVM_CR4_HOST_MASK;
1028 if ( paging_mode_hap(v->domain) )
1029 v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_PAE;
1030 v->arch.hvm_vcpu.hw_cr[4] |= v->arch.hvm_vcpu.guest_cr[4];
1031 if ( paging_mode_hap(v->domain) && !hvm_paging_enabled(v) )
1032 {
1033 v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_PSE;
1034 v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_PAE;
1035 }
1036 __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
1037 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[4]);
1038 break;
1039 default:
1040 BUG();
1041 }
1043 vmx_vmcs_exit(v);
1044 }
1046 static void vmx_update_guest_efer(struct vcpu *v)
1047 {
1048 #ifdef __x86_64__
1049 unsigned long vm_entry_value;
1051 vmx_vmcs_enter(v);
1053 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
1054 if ( v->arch.hvm_vcpu.guest_efer & EFER_LMA )
1055 vm_entry_value |= VM_ENTRY_IA32E_MODE;
1056 else
1057 vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
1058 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1060 vmx_vmcs_exit(v);
1061 #endif
1063 if ( v == current )
1064 write_efer((read_efer() & ~(EFER_NX|EFER_SCE)) |
1065 (v->arch.hvm_vcpu.guest_efer & (EFER_NX|EFER_SCE)));
1066 }
1068 static void vmx_flush_guest_tlbs(void)
1069 {
1070 /*
1071 * If VPID (i.e. tagged TLB support) is not enabled, the fact that
1072 * we're in Xen at all means any guest will have a clean TLB when
1073 * it's next run, because VMRESUME will flush it for us.
1075 * If enabled, we invalidate all translations associated with all
1076 * VPID values.
1077 */
1078 vpid_sync_all();
1079 }
1081 static void __ept_sync_domain(void *info)
1082 {
1083 struct domain *d = info;
1084 __invept(1, d->arch.hvm_domain.vmx.ept_control.eptp, 0);
1085 }
1087 void ept_sync_domain(struct domain *d)
1088 {
1089 /* Only if using EPT and this domain has some VCPUs to dirty. */
1090 if ( d->arch.hvm_domain.hap_enabled && d->vcpu[0] )
1091 {
1092 ASSERT(local_irq_is_enabled());
1093 on_each_cpu(__ept_sync_domain, d, 1, 1);
1094 }
1095 }
1097 static void __vmx_inject_exception(int trap, int type, int error_code)
1098 {
1099 unsigned long intr_fields;
1101 /*
1102 * NB. Callers do not need to worry about clearing STI/MOV-SS blocking:
1103 * "If the VM entry is injecting, there is no blocking by STI or by
1104 * MOV SS following the VM entry, regardless of the contents of the
1105 * interruptibility-state field [in the guest-state area before the
1106 * VM entry]", PRM Vol. 3, 22.6.1 (Interruptibility State).
1107 */
1109 intr_fields = (INTR_INFO_VALID_MASK | (type<<8) | trap);
1110 if ( error_code != HVM_DELIVER_NO_ERROR_CODE ) {
1111 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
1112 intr_fields |= INTR_INFO_DELIVER_CODE_MASK;
1113 }
1115 __vmwrite(VM_ENTRY_INTR_INFO, intr_fields);
1116 }
1118 void vmx_inject_hw_exception(int trap, int error_code)
1119 {
1120 unsigned long intr_info = __vmread(VM_ENTRY_INTR_INFO);
1121 struct vcpu *curr = current;
1123 switch ( trap )
1124 {
1125 case TRAP_debug:
1126 if ( guest_cpu_user_regs()->eflags & X86_EFLAGS_TF )
1127 {
1128 __restore_debug_registers(curr);
1129 write_debugreg(6, read_debugreg(6) | 0x4000);
1130 }
1131 case TRAP_int3:
1132 if ( curr->domain->debugger_attached )
1133 {
1134 /* Debug/Int3: Trap to debugger. */
1135 domain_pause_for_debugger();
1136 return;
1137 }
1138 }
1140 if ( unlikely(intr_info & INTR_INFO_VALID_MASK) &&
1141 (((intr_info >> 8) & 7) == X86_EVENTTYPE_HW_EXCEPTION) )
1142 {
1143 trap = hvm_combine_hw_exceptions((uint8_t)intr_info, trap);
1144 if ( trap == TRAP_double_fault )
1145 error_code = 0;
1146 }
1148 __vmx_inject_exception(trap, X86_EVENTTYPE_HW_EXCEPTION, error_code);
1150 if ( trap == TRAP_page_fault )
1151 HVMTRACE_LONG_2D(PF_INJECT, error_code,
1152 TRC_PAR_LONG(current->arch.hvm_vcpu.guest_cr[2]));
1153 else
1154 HVMTRACE_2D(INJ_EXC, trap, error_code);
1155 }
1157 void vmx_inject_extint(int trap)
1159 __vmx_inject_exception(trap, X86_EVENTTYPE_EXT_INTR,
1160 HVM_DELIVER_NO_ERROR_CODE);
1163 void vmx_inject_nmi(void)
1165 __vmx_inject_exception(2, X86_EVENTTYPE_NMI,
1166 HVM_DELIVER_NO_ERROR_CODE);
1169 static void vmx_inject_exception(
1170 unsigned int trapnr, int errcode, unsigned long cr2)
1172 if ( trapnr == TRAP_page_fault )
1173 current->arch.hvm_vcpu.guest_cr[2] = cr2;
1175 vmx_inject_hw_exception(trapnr, errcode);
1178 static int vmx_event_pending(struct vcpu *v)
1180 ASSERT(v == current);
1181 return (__vmread(VM_ENTRY_INTR_INFO) & INTR_INFO_VALID_MASK);
1184 static int vmx_do_pmu_interrupt(struct cpu_user_regs *regs)
1186 return vpmu_do_interrupt(regs);
1189 static void vmx_set_uc_mode(struct vcpu *v)
1191 if ( paging_mode_hap(v->domain) )
1192 ept_change_entry_emt_with_range(
1193 v->domain, 0, v->domain->arch.p2m->max_mapped_pfn);
1194 vpid_sync_all();
1197 static void vmx_set_info_guest(struct vcpu *v)
1199 vmx_vmcs_enter(v);
1200 __vmwrite(GUEST_DR7, v->arch.guest_context.debugreg[7]);
1201 vmx_vmcs_exit(v);
1204 static struct hvm_function_table vmx_function_table = {
1205 .name = "VMX",
1206 .domain_initialise = vmx_domain_initialise,
1207 .domain_destroy = vmx_domain_destroy,
1208 .vcpu_initialise = vmx_vcpu_initialise,
1209 .vcpu_destroy = vmx_vcpu_destroy,
1210 .save_cpu_ctxt = vmx_save_vmcs_ctxt,
1211 .load_cpu_ctxt = vmx_load_vmcs_ctxt,
1212 .get_interrupt_shadow = vmx_get_interrupt_shadow,
1213 .set_interrupt_shadow = vmx_set_interrupt_shadow,
1214 .guest_x86_mode = vmx_guest_x86_mode,
1215 .get_segment_register = vmx_get_segment_register,
1216 .set_segment_register = vmx_set_segment_register,
1217 .update_host_cr3 = vmx_update_host_cr3,
1218 .update_guest_cr = vmx_update_guest_cr,
1219 .update_guest_efer = vmx_update_guest_efer,
1220 .flush_guest_tlbs = vmx_flush_guest_tlbs,
1221 .set_tsc_offset = vmx_set_tsc_offset,
1222 .inject_exception = vmx_inject_exception,
1223 .init_hypercall_page = vmx_init_hypercall_page,
1224 .event_pending = vmx_event_pending,
1225 .do_pmu_interrupt = vmx_do_pmu_interrupt,
1226 .cpu_up = vmx_cpu_up,
1227 .cpu_down = vmx_cpu_down,
1228 .cpuid_intercept = vmx_cpuid_intercept,
1229 .wbinvd_intercept = vmx_wbinvd_intercept,
1230 .fpu_dirty_intercept = vmx_fpu_dirty_intercept,
1231 .msr_read_intercept = vmx_msr_read_intercept,
1232 .msr_write_intercept = vmx_msr_write_intercept,
1233 .invlpg_intercept = vmx_invlpg_intercept,
1234 .set_uc_mode = vmx_set_uc_mode,
1235 .set_info_guest = vmx_set_info_guest
1236 };
1238 static unsigned long *vpid_bitmap;
1239 #define VPID_BITMAP_SIZE ((1u << VMCS_VPID_WIDTH) / MAX_VIRT_CPUS)
1241 void start_vmx(void)
1243 static int bootstrapped;
1245 vmx_save_host_msrs();
1247 if ( bootstrapped )
1249 if ( hvm_enabled && !vmx_cpu_up() )
1251 printk("VMX: FATAL: failed to initialise CPU%d!\n",
1252 smp_processor_id());
1253 BUG();
1255 return;
1258 bootstrapped = 1;
1260 /* Xen does not fill x86_capability words except 0. */
1261 boot_cpu_data.x86_capability[4] = cpuid_ecx(1);
1263 if ( !test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability) )
1264 return;
1266 set_in_cr4(X86_CR4_VMXE);
1268 if ( !vmx_cpu_up() )
1270 printk("VMX: failed to initialise.\n");
1271 return;
1274 if ( cpu_has_vmx_ept )
1276 printk("VMX: EPT is available.\n");
1277 vmx_function_table.hap_supported = 1;
1280 if ( cpu_has_vmx_vpid )
1282 printk("VMX: VPID is available.\n");
1284 vpid_bitmap = xmalloc_array(
1285 unsigned long, BITS_TO_LONGS(VPID_BITMAP_SIZE));
1286 BUG_ON(vpid_bitmap == NULL);
1287 memset(vpid_bitmap, 0, BITS_TO_LONGS(VPID_BITMAP_SIZE) * sizeof(long));
1289 /* VPID 0 is used by VMX root mode (the hypervisor). */
1290 __set_bit(0, vpid_bitmap);
1293 setup_vmcs_dump();
1295 hvm_enable(&vmx_function_table);
1298 /*
1299 * Not all cases receive valid value in the VM-exit instruction length field.
1300 * Callers must know what they're doing!
1301 */
1302 static int __get_instruction_length(void)
1303 {
1304 int len;
1305 len = __vmread(VM_EXIT_INSTRUCTION_LEN); /* Safe: callers audited */
1306 BUG_ON((len < 1) || (len > 15));
1307 return len;
1308 }
1310 static void __update_guest_eip(unsigned long inst_len)
1311 {
1312 struct cpu_user_regs *regs = guest_cpu_user_regs();
1313 unsigned long x;
1315 regs->eip += inst_len;
1316 regs->eflags &= ~X86_EFLAGS_RF;
1318 x = __vmread(GUEST_INTERRUPTIBILITY_INFO);
1319 if ( x & (VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS) )
1320 {
1321 x &= ~(VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS);
1322 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, x);
1323 }
1325 if ( regs->eflags & X86_EFLAGS_TF )
1326 vmx_inject_hw_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE);
1327 }
1329 static void vmx_fpu_dirty_intercept(void)
1331 struct vcpu *curr = current;
1333 vmx_fpu_enter(curr);
1335 /* Disable TS in guest CR0 unless the guest wants the exception too. */
1336 if ( !(curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
1338 curr->arch.hvm_vcpu.hw_cr[0] &= ~X86_CR0_TS;
1339 __vmwrite(GUEST_CR0, curr->arch.hvm_vcpu.hw_cr[0]);
1343 #define bitmaskof(idx) (1U << ((idx) & 31))
1344 static void vmx_cpuid_intercept(
1345 unsigned int *eax, unsigned int *ebx,
1346 unsigned int *ecx, unsigned int *edx)
1348 unsigned int input = *eax;
1349 struct segment_register cs;
1350 struct vcpu *v = current;
1352 hvm_cpuid(input, eax, ebx, ecx, edx);
1354 switch ( input )
1356 case 0x80000001:
1357 /* SYSCALL is visible iff running in long mode. */
1358 hvm_get_segment_register(v, x86_seg_cs, &cs);
1359 if ( cs.attr.fields.l )
1360 *edx |= bitmaskof(X86_FEATURE_SYSCALL);
1361 else
1362 *edx &= ~(bitmaskof(X86_FEATURE_SYSCALL));
1363 break;
1366 HVMTRACE_5D (CPUID, input, *eax, *ebx, *ecx, *edx);
1369 static void vmx_do_cpuid(struct cpu_user_regs *regs)
1371 unsigned int eax, ebx, ecx, edx;
1373 eax = regs->eax;
1374 ebx = regs->ebx;
1375 ecx = regs->ecx;
1376 edx = regs->edx;
1378 vmx_cpuid_intercept(&eax, &ebx, &ecx, &edx);
1380 regs->eax = eax;
1381 regs->ebx = ebx;
1382 regs->ecx = ecx;
1383 regs->edx = edx;
1386 static void vmx_dr_access(unsigned long exit_qualification,
1387 struct cpu_user_regs *regs)
1389 struct vcpu *v = current;
1391 HVMTRACE_0D(DR_WRITE);
1393 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
1394 __restore_debug_registers(v);
1396 /* Allow guest direct access to DR registers */
1397 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MOV_DR_EXITING;
1398 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
1401 static void vmx_invlpg_intercept(unsigned long vaddr)
1403 struct vcpu *curr = current;
1404 HVMTRACE_LONG_2D(INVLPG, /*invlpga=*/ 0, TRC_PAR_LONG(vaddr));
1405 if ( paging_invlpg(curr, vaddr) )
1406 vpid_sync_vcpu_gva(curr, vaddr);
1409 #define CASE_SET_REG(REG, reg) \
1410 case VMX_CONTROL_REG_ACCESS_GPR_ ## REG: regs->reg = value; break
1411 #define CASE_GET_REG(REG, reg) \
1412 case VMX_CONTROL_REG_ACCESS_GPR_ ## REG: value = regs->reg; break
1414 #define CASE_EXTEND_SET_REG \
1415 CASE_EXTEND_REG(S)
1416 #define CASE_EXTEND_GET_REG \
1417 CASE_EXTEND_REG(G)
1419 #ifdef __i386__
1420 #define CASE_EXTEND_REG(T)
1421 #else
1422 #define CASE_EXTEND_REG(T) \
1423 CASE_ ## T ## ET_REG(R8, r8); \
1424 CASE_ ## T ## ET_REG(R9, r9); \
1425 CASE_ ## T ## ET_REG(R10, r10); \
1426 CASE_ ## T ## ET_REG(R11, r11); \
1427 CASE_ ## T ## ET_REG(R12, r12); \
1428 CASE_ ## T ## ET_REG(R13, r13); \
1429 CASE_ ## T ## ET_REG(R14, r14); \
1430 CASE_ ## T ## ET_REG(R15, r15)
1431 #endif
1433 static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
1435 unsigned long value;
1436 struct vcpu *v = current;
1437 struct vlapic *vlapic = vcpu_vlapic(v);
1439 switch ( gp )
1441 CASE_GET_REG(EAX, eax);
1442 CASE_GET_REG(ECX, ecx);
1443 CASE_GET_REG(EDX, edx);
1444 CASE_GET_REG(EBX, ebx);
1445 CASE_GET_REG(EBP, ebp);
1446 CASE_GET_REG(ESI, esi);
1447 CASE_GET_REG(EDI, edi);
1448 CASE_GET_REG(ESP, esp);
1449 CASE_EXTEND_GET_REG;
1450 default:
1451 gdprintk(XENLOG_ERR, "invalid gp: %d\n", gp);
1452 goto exit_and_crash;
1455 HVMTRACE_LONG_2D(CR_WRITE, cr, TRC_PAR_LONG(value));
1457 HVM_DBG_LOG(DBG_LEVEL_1, "CR%d, value = %lx", cr, value);
1459 switch ( cr )
1461 case 0:
1462 return !hvm_set_cr0(value);
1464 case 3:
1465 return !hvm_set_cr3(value);
1467 case 4:
1468 return !hvm_set_cr4(value);
1470 case 8:
1471 vlapic_set_reg(vlapic, APIC_TASKPRI, ((value & 0x0F) << 4));
1472 break;
1474 default:
1475 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
1476 goto exit_and_crash;
1479 return 1;
1481 exit_and_crash:
1482 domain_crash(v->domain);
1483 return 0;
1486 /*
1487 * Read from control registers. CR0 and CR4 are read from the shadow.
1488 */
1489 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
1491 unsigned long value = 0;
1492 struct vcpu *v = current;
1493 struct vlapic *vlapic = vcpu_vlapic(v);
1495 switch ( cr )
1497 case 3:
1498 value = (unsigned long)v->arch.hvm_vcpu.guest_cr[3];
1499 break;
1500 case 8:
1501 value = (unsigned long)vlapic_get_reg(vlapic, APIC_TASKPRI);
1502 value = (value & 0xF0) >> 4;
1503 break;
1504 default:
1505 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
1506 domain_crash(v->domain);
1507 break;
1510 switch ( gp ) {
1511 CASE_SET_REG(EAX, eax);
1512 CASE_SET_REG(ECX, ecx);
1513 CASE_SET_REG(EDX, edx);
1514 CASE_SET_REG(EBX, ebx);
1515 CASE_SET_REG(EBP, ebp);
1516 CASE_SET_REG(ESI, esi);
1517 CASE_SET_REG(EDI, edi);
1518 CASE_SET_REG(ESP, esp);
1519 CASE_EXTEND_SET_REG;
1520 default:
1521 printk("invalid gp: %d\n", gp);
1522 domain_crash(v->domain);
1523 break;
1526 HVMTRACE_LONG_2D(CR_READ, cr, TRC_PAR_LONG(value));
1528 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR%d, value = %lx", cr, value);
1531 static int vmx_cr_access(unsigned long exit_qualification,
1532 struct cpu_user_regs *regs)
1534 unsigned int gp, cr;
1535 unsigned long value;
1536 struct vcpu *v = current;
1538 switch ( exit_qualification & VMX_CONTROL_REG_ACCESS_TYPE )
1540 case VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR:
1541 gp = exit_qualification & VMX_CONTROL_REG_ACCESS_GPR;
1542 cr = exit_qualification & VMX_CONTROL_REG_ACCESS_NUM;
1543 return mov_to_cr(gp, cr, regs);
1544 case VMX_CONTROL_REG_ACCESS_TYPE_MOV_FROM_CR:
1545 gp = exit_qualification & VMX_CONTROL_REG_ACCESS_GPR;
1546 cr = exit_qualification & VMX_CONTROL_REG_ACCESS_NUM;
1547 mov_from_cr(cr, gp, regs);
1548 break;
1549 case VMX_CONTROL_REG_ACCESS_TYPE_CLTS:
1550 v->arch.hvm_vcpu.guest_cr[0] &= ~X86_CR0_TS;
1551 vmx_update_guest_cr(v, 0);
1552 HVMTRACE_0D(CLTS);
1553 break;
1554 case VMX_CONTROL_REG_ACCESS_TYPE_LMSW:
1555 value = v->arch.hvm_vcpu.guest_cr[0];
1556 /* LMSW can: (1) set bits 0-3; (2) clear bits 1-3. */
1557 value = (value & ~0xe) | ((exit_qualification >> 16) & 0xf);
1558 HVMTRACE_LONG_1D(LMSW, value);
1559 return !hvm_set_cr0(value);
1560 default:
1561 BUG();
1564 return 1;
1567 static const struct lbr_info {
1568 u32 base, count;
1569 } p4_lbr[] = {
1570 { MSR_P4_LER_FROM_LIP, 1 },
1571 { MSR_P4_LER_TO_LIP, 1 },
1572 { MSR_P4_LASTBRANCH_TOS, 1 },
1573 { MSR_P4_LASTBRANCH_0_FROM_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO },
1574 { MSR_P4_LASTBRANCH_0_TO_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO },
1575 { 0, 0 }
1576 }, c2_lbr[] = {
1577 { MSR_IA32_LASTINTFROMIP, 1 },
1578 { MSR_IA32_LASTINTTOIP, 1 },
1579 { MSR_C2_LASTBRANCH_TOS, 1 },
1580 { MSR_C2_LASTBRANCH_0_FROM_IP, NUM_MSR_C2_LASTBRANCH_FROM_TO },
1581 { MSR_C2_LASTBRANCH_0_TO_IP, NUM_MSR_C2_LASTBRANCH_FROM_TO },
1582 { 0, 0 }
1583 #ifdef __i386__
1584 }, pm_lbr[] = {
1585 { MSR_IA32_LASTINTFROMIP, 1 },
1586 { MSR_IA32_LASTINTTOIP, 1 },
1587 { MSR_PM_LASTBRANCH_TOS, 1 },
1588 { MSR_PM_LASTBRANCH_0, NUM_MSR_PM_LASTBRANCH },
1589 { 0, 0 }
1590 #endif
1591 };
1593 static const struct lbr_info *last_branch_msr_get(void)
1595 switch ( boot_cpu_data.x86 )
1597 case 6:
1598 switch ( boot_cpu_data.x86_model )
1600 #ifdef __i386__
1601 /* PentiumM */
1602 case 9: case 13:
1603 /* Core Solo/Duo */
1604 case 14:
1605 return pm_lbr;
1606 break;
1607 #endif
1608 /* Core2 Duo */
1609 case 15:
1610 return c2_lbr;
1611 break;
1613 break;
1615 case 15:
1616 switch ( boot_cpu_data.x86_model )
1618 /* Pentium4/Xeon with em64t */
1619 case 3: case 4: case 6:
1620 return p4_lbr;
1621 break;
1623 break;
1626 return NULL;
1629 static int is_last_branch_msr(u32 ecx)
1631 const struct lbr_info *lbr = last_branch_msr_get();
1633 if ( lbr == NULL )
1634 return 0;
1636 for ( ; lbr->count; lbr++ )
1637 if ( (ecx >= lbr->base) && (ecx < (lbr->base + lbr->count)) )
1638 return 1;
1640 return 0;
1643 static int vmx_msr_read_intercept(struct cpu_user_regs *regs)
1645 u64 msr_content = 0;
1646 u32 ecx = regs->ecx, eax, edx;
1648 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x", ecx);
1650 switch ( ecx )
1652 case MSR_IA32_SYSENTER_CS:
1653 msr_content = (u32)__vmread(GUEST_SYSENTER_CS);
1654 break;
1655 case MSR_IA32_SYSENTER_ESP:
1656 msr_content = __vmread(GUEST_SYSENTER_ESP);
1657 break;
1658 case MSR_IA32_SYSENTER_EIP:
1659 msr_content = __vmread(GUEST_SYSENTER_EIP);
1660 break;
1661 case MSR_IA32_DEBUGCTLMSR:
1662 msr_content = __vmread(GUEST_IA32_DEBUGCTL);
1663 #ifdef __i386__
1664 msr_content |= (u64)__vmread(GUEST_IA32_DEBUGCTL_HIGH) << 32;
1665 #endif
1666 break;
1667 case MSR_IA32_VMX_BASIC...MSR_IA32_VMX_PROCBASED_CTLS2:
1668 goto gp_fault;
1669 case MSR_IA32_MISC_ENABLE:
1670 rdmsrl(MSR_IA32_MISC_ENABLE, msr_content);
1671 /* Debug Trace Store is not supported. */
1672 msr_content |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
1673 MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL;
1674 break;
1675 default:
1676 if ( vpmu_do_rdmsr(regs) )
1677 goto done;
1678 if ( passive_domain_do_rdmsr(regs) )
1679 goto done;
1680 switch ( long_mode_do_msr_read(regs) )
1682 case HNDL_unhandled:
1683 break;
1684 case HNDL_exception_raised:
1685 return X86EMUL_EXCEPTION;
1686 case HNDL_done:
1687 goto done;
1690 if ( vmx_read_guest_msr(ecx, &msr_content) == 0 )
1691 break;
1693 if ( is_last_branch_msr(ecx) )
1695 msr_content = 0;
1696 break;
1699 if ( rdmsr_viridian_regs(ecx, &eax, &edx) ||
1700 rdmsr_hypervisor_regs(ecx, &eax, &edx) ||
1701 rdmsr_safe(ecx, eax, edx) == 0 )
1703 regs->eax = eax;
1704 regs->edx = edx;
1705 goto done;
1708 goto gp_fault;
1711 regs->eax = (uint32_t)msr_content;
1712 regs->edx = (uint32_t)(msr_content >> 32);
1714 done:
1715 HVMTRACE_3D (MSR_READ, ecx, regs->eax, regs->edx);
1716 HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx",
1717 ecx, (unsigned long)regs->eax,
1718 (unsigned long)regs->edx);
1719 return X86EMUL_OKAY;
1721 gp_fault:
1722 vmx_inject_hw_exception(TRAP_gp_fault, 0);
1723 return X86EMUL_EXCEPTION;
1726 static int vmx_alloc_vlapic_mapping(struct domain *d)
1728 void *apic_va;
1730 if ( !cpu_has_vmx_virtualize_apic_accesses )
1731 return 0;
1733 apic_va = alloc_xenheap_page();
1734 if ( apic_va == NULL )
1735 return -ENOMEM;
1736 share_xen_page_with_guest(virt_to_page(apic_va), d, XENSHARE_writable);
1737 set_mmio_p2m_entry(
1738 d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), _mfn(virt_to_mfn(apic_va)));
1739 d->arch.hvm_domain.vmx.apic_access_mfn = virt_to_mfn(apic_va);
1741 return 0;
1744 static void vmx_free_vlapic_mapping(struct domain *d)
1746 unsigned long mfn = d->arch.hvm_domain.vmx.apic_access_mfn;
1747 if ( mfn != 0 )
1748 free_xenheap_page(mfn_to_virt(mfn));
1751 static int vmx_alloc_vpid(struct domain *d)
1753 int idx;
1755 if ( !cpu_has_vmx_vpid )
1756 return 0;
1758 do {
1759 idx = find_first_zero_bit(vpid_bitmap, VPID_BITMAP_SIZE);
1760 if ( idx >= VPID_BITMAP_SIZE )
1762 dprintk(XENLOG_WARNING, "VMX VPID space exhausted.\n");
1763 return -EBUSY;
1766 while ( test_and_set_bit(idx, vpid_bitmap) );
1768 d->arch.hvm_domain.vmx.vpid_base = idx * MAX_VIRT_CPUS;
1769 return 0;
1772 static void vmx_free_vpid(struct domain *d)
1774 if ( !cpu_has_vmx_vpid )
1775 return;
1777 clear_bit(d->arch.hvm_domain.vmx.vpid_base / MAX_VIRT_CPUS, vpid_bitmap);
1780 static void vmx_install_vlapic_mapping(struct vcpu *v)
1782 paddr_t virt_page_ma, apic_page_ma;
1784 if ( !cpu_has_vmx_virtualize_apic_accesses )
1785 return;
1787 virt_page_ma = page_to_maddr(vcpu_vlapic(v)->regs_page);
1788 apic_page_ma = v->domain->arch.hvm_domain.vmx.apic_access_mfn;
1789 apic_page_ma <<= PAGE_SHIFT;
1791 vmx_vmcs_enter(v);
1792 __vmwrite(VIRTUAL_APIC_PAGE_ADDR, virt_page_ma);
1793 __vmwrite(APIC_ACCESS_ADDR, apic_page_ma);
1794 vmx_vmcs_exit(v);
1797 void vmx_vlapic_msr_changed(struct vcpu *v)
1799 struct vlapic *vlapic = vcpu_vlapic(v);
1800 uint32_t ctl;
1802 if ( !cpu_has_vmx_virtualize_apic_accesses )
1803 return;
1805 vmx_vmcs_enter(v);
1806 ctl = __vmread(SECONDARY_VM_EXEC_CONTROL);
1807 ctl &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1808 if ( !vlapic_hw_disabled(vlapic) &&
1809 (vlapic_base_address(vlapic) == APIC_DEFAULT_PHYS_BASE) )
1810 ctl |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1811 __vmwrite(SECONDARY_VM_EXEC_CONTROL, ctl);
1812 vmx_vmcs_exit(v);
1815 static int vmx_msr_write_intercept(struct cpu_user_regs *regs)
1817 u32 ecx = regs->ecx;
1818 u64 msr_content;
1819 struct vcpu *v = current;
1821 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x, eax=%x, edx=%x",
1822 ecx, (u32)regs->eax, (u32)regs->edx);
1824 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
1826 HVMTRACE_3D (MSR_WRITE, ecx, regs->eax, regs->edx);
1828 switch ( ecx )
1830 case MSR_IA32_SYSENTER_CS:
1831 __vmwrite(GUEST_SYSENTER_CS, msr_content);
1832 break;
1833 case MSR_IA32_SYSENTER_ESP:
1834 __vmwrite(GUEST_SYSENTER_ESP, msr_content);
1835 break;
1836 case MSR_IA32_SYSENTER_EIP:
1837 __vmwrite(GUEST_SYSENTER_EIP, msr_content);
1838 break;
1839 case MSR_IA32_DEBUGCTLMSR: {
1840 int i, rc = 0;
1842 if ( !msr_content || (msr_content & ~3) )
1843 break;
1845 if ( msr_content & 1 )
1847 const struct lbr_info *lbr = last_branch_msr_get();
1848 if ( lbr == NULL )
1849 break;
1851 for ( ; (rc == 0) && lbr->count; lbr++ )
1852 for ( i = 0; (rc == 0) && (i < lbr->count); i++ )
1853 if ( (rc = vmx_add_guest_msr(lbr->base + i)) == 0 )
1854 vmx_disable_intercept_for_msr(v, lbr->base + i);
1857 if ( (rc < 0) ||
1858 (vmx_add_host_load_msr(ecx) < 0) )
1859 vmx_inject_hw_exception(TRAP_machine_check, 0);
1860 else
1862 __vmwrite(GUEST_IA32_DEBUGCTL, msr_content);
1863 #ifdef __i386__
1864 __vmwrite(GUEST_IA32_DEBUGCTL_HIGH, msr_content >> 32);
1865 #endif
1868 break;
1870 case MSR_IA32_VMX_BASIC...MSR_IA32_VMX_PROCBASED_CTLS2:
1871 goto gp_fault;
1872 default:
1873 if ( vpmu_do_wrmsr(regs) )
1874 return X86EMUL_OKAY;
1875 if ( passive_domain_do_wrmsr(regs) )
1876 return X86EMUL_OKAY;
1878 if ( wrmsr_viridian_regs(ecx, regs->eax, regs->edx) )
1879 break;
1881 switch ( long_mode_do_msr_write(regs) )
1883 case HNDL_unhandled:
1884 if ( (vmx_write_guest_msr(ecx, msr_content) != 0) &&
1885 !is_last_branch_msr(ecx) )
1886 wrmsr_hypervisor_regs(ecx, regs->eax, regs->edx);
1887 break;
1888 case HNDL_exception_raised:
1889 return X86EMUL_EXCEPTION;
1890 case HNDL_done:
1891 break;
1893 break;
1896 return X86EMUL_OKAY;
1898 gp_fault:
1899 vmx_inject_hw_exception(TRAP_gp_fault, 0);
1900 return X86EMUL_EXCEPTION;
1903 static void vmx_do_extint(struct cpu_user_regs *regs)
1905 unsigned int vector;
1907 asmlinkage void do_IRQ(struct cpu_user_regs *);
1908 fastcall void smp_apic_timer_interrupt(struct cpu_user_regs *);
1909 fastcall void smp_event_check_interrupt(void);
1910 fastcall void smp_invalidate_interrupt(void);
1911 fastcall void smp_call_function_interrupt(void);
1912 fastcall void smp_spurious_interrupt(struct cpu_user_regs *regs);
1913 fastcall void smp_error_interrupt(struct cpu_user_regs *regs);
1914 fastcall void smp_pmu_apic_interrupt(struct cpu_user_regs *regs);
1915 #ifdef CONFIG_X86_MCE_P4THERMAL
1916 fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs);
1917 #endif
1919 vector = __vmread(VM_EXIT_INTR_INFO);
1920 BUG_ON(!(vector & INTR_INFO_VALID_MASK));
1922 vector &= INTR_INFO_VECTOR_MASK;
1923 HVMTRACE_1D(INTR, vector);
1925 switch ( vector )
1927 case LOCAL_TIMER_VECTOR:
1928 smp_apic_timer_interrupt(regs);
1929 break;
1930 case EVENT_CHECK_VECTOR:
1931 smp_event_check_interrupt();
1932 break;
1933 case INVALIDATE_TLB_VECTOR:
1934 smp_invalidate_interrupt();
1935 break;
1936 case CALL_FUNCTION_VECTOR:
1937 smp_call_function_interrupt();
1938 break;
1939 case SPURIOUS_APIC_VECTOR:
1940 smp_spurious_interrupt(regs);
1941 break;
1942 case ERROR_APIC_VECTOR:
1943 smp_error_interrupt(regs);
1944 break;
1945 case PMU_APIC_VECTOR:
1946 smp_pmu_apic_interrupt(regs);
1947 break;
1948 #ifdef CONFIG_X86_MCE_P4THERMAL
1949 case THERMAL_APIC_VECTOR:
1950 smp_thermal_interrupt(regs);
1951 break;
1952 #endif
1953 default:
1954 regs->entry_vector = vector;
1955 do_IRQ(regs);
1956 break;
1960 static void wbinvd_ipi(void *info)
1962 wbinvd();
1965 static void vmx_wbinvd_intercept(void)
1967 if ( !has_arch_pdevs(current->domain) )
1968 return;
1970 if ( cpu_has_wbinvd_exiting )
1971 on_each_cpu(wbinvd_ipi, NULL, 1, 1);
1972 else
1973 wbinvd();
1976 static void ept_handle_violation(unsigned long qualification, paddr_t gpa)
1978 unsigned long gla_validity = qualification & EPT_GLA_VALIDITY_MASK;
1979 struct domain *d = current->domain;
1980 unsigned long gla, gfn = gpa >> PAGE_SHIFT;
1981 mfn_t mfn;
1982 p2m_type_t t;
1984 mfn = gfn_to_mfn(d, gfn, &t);
1986 /* There are two legitimate reasons for taking an EPT violation.
1987 * One is a guest access to MMIO space. */
1988 if ( gla_validity == EPT_GLA_VALIDITY_MATCH && p2m_is_mmio(t) )
1990 handle_mmio();
1991 return;
1994 /* The other is log-dirty mode, writing to a read-only page */
1995 if ( paging_mode_log_dirty(d)
1996 && (gla_validity == EPT_GLA_VALIDITY_MATCH
1997 || gla_validity == EPT_GLA_VALIDITY_GPT_WALK)
1998 && p2m_is_ram(t) && (t != p2m_ram_ro) )
2000 paging_mark_dirty(d, mfn_x(mfn));
2001 p2m_change_type(d, gfn, p2m_ram_logdirty, p2m_ram_rw);
2002 flush_tlb_mask(d->domain_dirty_cpumask);
2003 return;
2006 /* Everything else is an error. */
2007 gla = __vmread(GUEST_LINEAR_ADDRESS);
2008 gdprintk(XENLOG_ERR, "EPT violation %#lx (%c%c%c/%c%c%c), "
2009 "gpa %#"PRIpaddr", mfn %#lx, type %i.\n",
2010 qualification,
2011 (qualification & EPT_READ_VIOLATION) ? 'r' : '-',
2012 (qualification & EPT_WRITE_VIOLATION) ? 'w' : '-',
2013 (qualification & EPT_EXEC_VIOLATION) ? 'x' : '-',
2014 (qualification & EPT_EFFECTIVE_READ) ? 'r' : '-',
2015 (qualification & EPT_EFFECTIVE_WRITE) ? 'w' : '-',
2016 (qualification & EPT_EFFECTIVE_EXEC) ? 'x' : '-',
2017 gpa, mfn_x(mfn), t);
2019 if ( qualification & EPT_GAW_VIOLATION )
2020 gdprintk(XENLOG_ERR, " --- GPA too wide (max %u bits)\n",
2021 9 * (unsigned) d->arch.hvm_domain.vmx.ept_control.gaw + 21);
2023 switch ( gla_validity )
2025 case EPT_GLA_VALIDITY_PDPTR_LOAD:
2026 gdprintk(XENLOG_ERR, " --- PDPTR load failed\n");
2027 break;
2028 case EPT_GLA_VALIDITY_GPT_WALK:
2029 gdprintk(XENLOG_ERR, " --- guest PT walk to %#lx failed\n", gla);
2030 break;
2031 case EPT_GLA_VALIDITY_RSVD:
2032 gdprintk(XENLOG_ERR, " --- GLA_validity 2 (reserved)\n");
2033 break;
2034 case EPT_GLA_VALIDITY_MATCH:
2035 gdprintk(XENLOG_ERR, " --- guest access to %#lx failed\n", gla);
2036 break;
2039 domain_crash(d);
2042 static void vmx_failed_vmentry(unsigned int exit_reason,
2043 struct cpu_user_regs *regs)
2045 unsigned int failed_vmentry_reason = (uint16_t)exit_reason;
2046 unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
2047 struct vcpu *curr = current;
2049 printk("Failed vm entry (exit reason 0x%x) ", exit_reason);
2050 switch ( failed_vmentry_reason )
2052 case EXIT_REASON_INVALID_GUEST_STATE:
2053 printk("caused by invalid guest state (%ld).\n", exit_qualification);
2054 break;
2055 case EXIT_REASON_MSR_LOADING:
2056 printk("caused by MSR entry %ld loading.\n", exit_qualification);
2057 break;
2058 case EXIT_REASON_MACHINE_CHECK:
2059 printk("caused by machine check.\n");
2060 HVMTRACE_0D(MCE);
2061 do_machine_check(regs);
2062 break;
2063 default:
2064 printk("reason not known yet!");
2065 break;
2068 printk("************* VMCS Area **************\n");
2069 vmcs_dump_vcpu(curr);
2070 printk("**************************************\n");
2072 domain_crash(curr->domain);
asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
{
    unsigned int exit_reason, idtv_info;
    unsigned long exit_qualification, inst_len = 0;
    struct vcpu *v = current;

    if ( paging_mode_hap(v->domain) && hvm_paging_enabled(v) )
        v->arch.hvm_vcpu.guest_cr[3] = v->arch.hvm_vcpu.hw_cr[3] =
            __vmread(GUEST_CR3);

    exit_reason = __vmread(VM_EXIT_REASON);

    HVMTRACE_ND(VMEXIT64, 1/*cycles*/, 3, exit_reason,
                (uint32_t)regs->eip, (uint32_t)((uint64_t)regs->eip >> 32),
                0, 0, 0);

    perfc_incra(vmexits, exit_reason);

    /* Handle the interrupt we missed before allowing any more in. */
    if ( exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT )
        vmx_do_extint(regs);

    /* Now enable interrupts so it's safe to take locks. */
    local_irq_enable();

    if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
        return vmx_failed_vmentry(exit_reason, regs);

    hvm_maybe_deassert_evtchn_irq();

    /* Event delivery caused this intercept? Queue for redelivery. */
    idtv_info = __vmread(IDT_VECTORING_INFO);
    if ( unlikely(idtv_info & INTR_INFO_VALID_MASK) &&
         (exit_reason != EXIT_REASON_TASK_SWITCH) )
    {
        if ( hvm_event_needs_reinjection((idtv_info>>8)&7, idtv_info&0xff) )
        {
            /* See SDM 3B 25.7.1.1 and .2 for info about masking resvd bits. */
            __vmwrite(VM_ENTRY_INTR_INFO,
                      idtv_info & ~INTR_INFO_RESVD_BITS_MASK);
            if ( idtv_info & INTR_INFO_DELIVER_CODE_MASK )
                __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE,
                          __vmread(IDT_VECTORING_ERROR_CODE));
        }

        /*
         * Clear NMI-blocking interruptibility info if an NMI delivery faulted.
         * Re-delivery will re-set it (see SDM 3B 25.7.1.2).
         */
        if ( (idtv_info & INTR_INFO_INTR_TYPE_MASK) == (X86_EVENTTYPE_NMI<<8) )
            __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
                      __vmread(GUEST_INTERRUPTIBILITY_INFO) &
                      ~VMX_INTR_SHADOW_NMI);
    }

    switch ( exit_reason )
    {
    case EXIT_REASON_EXCEPTION_NMI:
    {
        /*
         * We don't set the software-interrupt exiting (INT n).
         * (1) We can get an exception (e.g. #PG) in the guest, or
         * (2) NMI
         */
        unsigned int intr_info, vector;

        intr_info = __vmread(VM_EXIT_INTR_INFO);
        BUG_ON(!(intr_info & INTR_INFO_VALID_MASK));

        vector = intr_info & INTR_INFO_VECTOR_MASK;

        /*
         * Re-set the NMI shadow if vmexit caused by a guest IRET fault (see 3B
         * 25.7.1.2, "Resuming Guest Software after Handling an Exception").
         * (NB. If we emulate this IRET for any reason, we should re-clear!)
         */
        if ( unlikely(intr_info & INTR_INFO_NMI_UNBLOCKED_BY_IRET) &&
             !(__vmread(IDT_VECTORING_INFO) & INTR_INFO_VALID_MASK) &&
             (vector != TRAP_double_fault) )
            __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
                      __vmread(GUEST_INTERRUPTIBILITY_INFO)
                      | VMX_INTR_SHADOW_NMI);

        perfc_incra(cause_vector, vector);

        switch ( vector )
        {
        case TRAP_debug:
            /*
             * Updates DR6 where debugger can peek (See 3B 23.2.1,
             * Table 23-1, "Exit Qualification for Debug Exceptions").
             */
            exit_qualification = __vmread(EXIT_QUALIFICATION);
            write_debugreg(6, exit_qualification | 0xffff0ff0);
            if ( !v->domain->debugger_attached )
                goto exit_and_crash;
            domain_pause_for_debugger();
            break;
        case TRAP_int3:
            if ( !v->domain->debugger_attached )
                goto exit_and_crash;
            inst_len = __get_instruction_length(); /* Safe: INT3 */
            __update_guest_eip(inst_len);
            domain_pause_for_debugger();
            break;
        case TRAP_no_device:
            vmx_fpu_dirty_intercept();
            break;
        case TRAP_page_fault:
            exit_qualification = __vmread(EXIT_QUALIFICATION);
            regs->error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);

            HVM_DBG_LOG(DBG_LEVEL_VMMU,
                        "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
                        (unsigned long)regs->eax, (unsigned long)regs->ebx,
                        (unsigned long)regs->ecx, (unsigned long)regs->edx,
                        (unsigned long)regs->esi, (unsigned long)regs->edi);

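            /*
             * paging_fault() returns non-zero if the paging code handled
             * the fault itself; otherwise the #PF is reflected back to the
             * guest below, with CR2 set to the faulting address.
             */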
            if ( paging_fault(exit_qualification, regs) )
            {
                if ( trace_will_trace_event(TRC_SHADOW) )
                    break;
                if ( hvm_long_mode_enabled(v) )
                    HVMTRACE_LONG_2D(PF_XEN, regs->error_code,
                                     TRC_PAR_LONG(exit_qualification) );
                else
                    HVMTRACE_2D(PF_XEN,
                                regs->error_code, exit_qualification );
                break;
            }

            v->arch.hvm_vcpu.guest_cr[2] = exit_qualification;
            vmx_inject_hw_exception(TRAP_page_fault, regs->error_code);
            break;
        case TRAP_nmi:
            if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) !=
                 (X86_EVENTTYPE_NMI << 8) )
                goto exit_and_crash;
            HVMTRACE_0D(NMI);
            do_nmi(regs); /* Real NMI, vector 2: normal processing. */
            break;
        case TRAP_machine_check:
            HVMTRACE_0D(MCE);
            do_machine_check(regs);
            break;
        default:
            goto exit_and_crash;
        }
        break;
    }
    case EXIT_REASON_EXTERNAL_INTERRUPT:
        /* Already handled above. */
        break;
    case EXIT_REASON_TRIPLE_FAULT:
        hvm_triple_fault();
        break;
    case EXIT_REASON_PENDING_VIRT_INTR:
        /* Disable the interrupt window. */
        v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
        __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
                  v->arch.hvm_vmx.exec_control);
        break;
    case EXIT_REASON_PENDING_VIRT_NMI:
        /* Disable the NMI window. */
        v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
        __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
                  v->arch.hvm_vmx.exec_control);
        break;
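    /*
     * Task-switch exit qualification: bits 15:0 hold the target TSS
     * selector, bits 31:30 the switch source (CALL, IRET, JMP, or an IDT
     * task gate), which reasons[] maps onto the TSW_* values used by
     * hvm_task_switch().
     */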
    case EXIT_REASON_TASK_SWITCH: {
        const enum hvm_task_switch_reason reasons[] = {
            TSW_call_or_int, TSW_iret, TSW_jmp, TSW_call_or_int };
        int32_t errcode = -1;
        exit_qualification = __vmread(EXIT_QUALIFICATION);
        if ( (idtv_info & INTR_INFO_VALID_MASK) &&
             (idtv_info & INTR_INFO_DELIVER_CODE_MASK) )
            errcode = __vmread(IDT_VECTORING_ERROR_CODE);
        hvm_task_switch((uint16_t)exit_qualification,
                        reasons[(exit_qualification >> 30) & 3],
                        errcode);
        break;
    }
    case EXIT_REASON_CPUID:
        inst_len = __get_instruction_length(); /* Safe: CPUID */
        __update_guest_eip(inst_len);
        vmx_do_cpuid(regs);
        break;
    case EXIT_REASON_HLT:
        inst_len = __get_instruction_length(); /* Safe: HLT */
        __update_guest_eip(inst_len);
        hvm_hlt(regs->eflags);
        break;
    case EXIT_REASON_INVLPG:
    {
        inst_len = __get_instruction_length(); /* Safe: INVLPG */
        __update_guest_eip(inst_len);
        exit_qualification = __vmread(EXIT_QUALIFICATION);
        vmx_invlpg_intercept(exit_qualification);
        break;
    }
    case EXIT_REASON_RDTSC:
        inst_len = __get_instruction_length();
        __update_guest_eip(inst_len);
        hvm_rdtsc_intercept(regs);
        break;
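    /*
     * Hypercalls arrive via VMCALL. If the hypercall is preempted, RIP is
     * deliberately left pointing at the VMCALL so that it is re-executed
     * when the vcpu is rescheduled.
     */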
    case EXIT_REASON_VMCALL:
    {
        int rc;
        HVMTRACE_1D(VMMCALL, regs->eax);
        inst_len = __get_instruction_length(); /* Safe: VMCALL */
        rc = hvm_do_hypercall(regs);
        if ( rc != HVM_HCALL_preempted )
        {
            __update_guest_eip(inst_len);
            if ( rc == HVM_HCALL_invalidate )
                send_invalidate_req();
        }
        break;
    }
    case EXIT_REASON_CR_ACCESS:
    {
        exit_qualification = __vmread(EXIT_QUALIFICATION);
        inst_len = __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
        if ( vmx_cr_access(exit_qualification, regs) )
            __update_guest_eip(inst_len);
        break;
    }
    case EXIT_REASON_DR_ACCESS:
        exit_qualification = __vmread(EXIT_QUALIFICATION);
        vmx_dr_access(exit_qualification, regs);
        break;
    case EXIT_REASON_MSR_READ:
        inst_len = __get_instruction_length(); /* Safe: RDMSR */
        if ( hvm_msr_read_intercept(regs) == X86EMUL_OKAY )
            __update_guest_eip(inst_len);
        break;
    case EXIT_REASON_MSR_WRITE:
        inst_len = __get_instruction_length(); /* Safe: WRMSR */
        if ( hvm_msr_write_intercept(regs) == X86EMUL_OKAY )
            __update_guest_eip(inst_len);
        break;

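    /* None of these instructions are supported for HVM guests: inject #UD. */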
    case EXIT_REASON_MWAIT_INSTRUCTION:
    case EXIT_REASON_MONITOR_INSTRUCTION:
    case EXIT_REASON_VMCLEAR:
    case EXIT_REASON_VMLAUNCH:
    case EXIT_REASON_VMPTRLD:
    case EXIT_REASON_VMPTRST:
    case EXIT_REASON_VMREAD:
    case EXIT_REASON_VMRESUME:
    case EXIT_REASON_VMWRITE:
    case EXIT_REASON_VMXOFF:
    case EXIT_REASON_VMXON:
        vmx_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
        break;

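    /*
     * The guest lowered its TPR below the programmed threshold: nothing to
     * do here, as pending interrupts are re-evaluated on the path back
     * into the guest.
     */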
    case EXIT_REASON_TPR_BELOW_THRESHOLD:
        break;

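    /*
     * Both I/O port accesses and accesses to the virtual APIC page are
     * handed to the instruction emulator via handle_mmio(); if emulation
     * fails, inject #GP(0).
     */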
    case EXIT_REASON_IO_INSTRUCTION:
    case EXIT_REASON_APIC_ACCESS:
        if ( !handle_mmio() )
            vmx_inject_hw_exception(TRAP_gp_fault, 0);
        break;

    case EXIT_REASON_INVD:
    case EXIT_REASON_WBINVD:
    {
        inst_len = __get_instruction_length(); /* Safe: INVD, WBINVD */
        __update_guest_eip(inst_len);
        vmx_wbinvd_intercept();
        break;
    }

    case EXIT_REASON_EPT_VIOLATION:
    {
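        /*
         * GUEST_PHYSICAL_ADDRESS is a 64-bit VMCS field; on a 32-bit
         * hypervisor it must be read as two 32-bit halves.
         */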
        paddr_t gpa = __vmread(GUEST_PHYSICAL_ADDRESS);
#ifdef __i386__
        gpa |= (paddr_t)__vmread(GUEST_PHYSICAL_ADDRESS_HIGH) << 32;
#endif
        exit_qualification = __vmread(EXIT_QUALIFICATION);
        ept_handle_violation(exit_qualification, gpa);
        break;
    }

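    /*
     * Any exit reason we do not recognise, and any exception vector that
     * branches here via exit_and_crash, is fatal to the guest.
     */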
    default:
    exit_and_crash:
        gdprintk(XENLOG_ERR, "Bad vmexit (reason %x)\n", exit_reason);
        domain_crash(v->domain);
        break;
    }
}

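/* Emit a trace record on the VM entry path, just before the guest resumes. */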
asmlinkage void vmx_trace_vmentry(void)
{
    HVMTRACE_ND(VMENTRY, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0);
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */