ia64/xen-unstable
xen/arch/x86/hvm/vmx/vmx.c @ 15675:66147ca8f9c4

hvm: Define common (across VMX and SVM) set of event types.
Signed-off-by: Keir Fraser <keir@xensource.com>
author kfraser@localhost.localdomain
date Tue Jul 31 10:11:47 2007 +0100 (2007-07-31)
parents 9174a8cfb578
children 66055f773d19
1 /*
2 * vmx.c: handling VMX architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 *
18 */
20 #include <xen/config.h>
21 #include <xen/init.h>
22 #include <xen/lib.h>
23 #include <xen/trace.h>
24 #include <xen/sched.h>
25 #include <xen/irq.h>
26 #include <xen/softirq.h>
27 #include <xen/domain_page.h>
28 #include <xen/hypercall.h>
29 #include <xen/perfc.h>
30 #include <asm/current.h>
31 #include <asm/io.h>
32 #include <asm/regs.h>
33 #include <asm/cpufeature.h>
34 #include <asm/processor.h>
35 #include <asm/types.h>
36 #include <asm/msr.h>
37 #include <asm/spinlock.h>
38 #include <asm/paging.h>
39 #include <asm/p2m.h>
40 #include <asm/hvm/hvm.h>
41 #include <asm/hvm/support.h>
42 #include <asm/hvm/vmx/vmx.h>
43 #include <asm/hvm/vmx/vmcs.h>
44 #include <asm/hvm/vmx/cpu.h>
45 #include <public/sched.h>
46 #include <public/hvm/ioreq.h>
47 #include <asm/hvm/vpic.h>
48 #include <asm/hvm/vlapic.h>
49 #include <asm/x86_emulate.h>
50 #include <asm/hvm/vpt.h>
51 #include <public/hvm/save.h>
52 #include <asm/hvm/trace.h>
54 enum handler_return { HNDL_done, HNDL_unhandled, HNDL_exception_raised };
56 char *vmx_msr_bitmap;
58 static void vmx_ctxt_switch_from(struct vcpu *v);
59 static void vmx_ctxt_switch_to(struct vcpu *v);
61 static int vmx_alloc_vlapic_mapping(struct domain *d);
62 static void vmx_free_vlapic_mapping(struct domain *d);
63 static void vmx_install_vlapic_mapping(struct vcpu *v);
65 static int vmx_domain_initialise(struct domain *d)
66 {
67 return vmx_alloc_vlapic_mapping(d);
68 }
70 static void vmx_domain_destroy(struct domain *d)
71 {
72 vmx_free_vlapic_mapping(d);
73 }
75 static int vmx_vcpu_initialise(struct vcpu *v)
76 {
77 int rc;
79 spin_lock_init(&v->arch.hvm_vmx.vmcs_lock);
81 v->arch.schedule_tail = vmx_do_resume;
82 v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
83 v->arch.ctxt_switch_to = vmx_ctxt_switch_to;
85 if ( (rc = vmx_create_vmcs(v)) != 0 )
86 {
87 dprintk(XENLOG_WARNING,
88 "Failed to create VMCS for vcpu %d: err=%d.\n",
89 v->vcpu_id, rc);
90 return rc;
91 }
93 vmx_install_vlapic_mapping(v);
95 return 0;
96 }
98 static void vmx_vcpu_destroy(struct vcpu *v)
99 {
100 vmx_destroy_vmcs(v);
101 }
103 static int vmx_paging_enabled(struct vcpu *v)
104 {
105 unsigned long cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;
106 return (cr0 & (X86_CR0_PE | X86_CR0_PG)) == (X86_CR0_PE | X86_CR0_PG);
107 }
109 static int vmx_pgbit_test(struct vcpu *v)
110 {
111 unsigned long cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;
112 return cr0 & X86_CR0_PG;
113 }
115 static int vmx_pae_enabled(struct vcpu *v)
116 {
117 unsigned long cr4 = v->arch.hvm_vmx.cpu_shadow_cr4;
118 return vmx_paging_enabled(v) && (cr4 & X86_CR4_PAE);
119 }
121 static int vmx_nx_enabled(struct vcpu *v)
122 {
123 return v->arch.hvm_vmx.efer & EFER_NX;
124 }
126 #ifdef __x86_64__
128 static int vmx_lme_is_set(struct vcpu *v)
129 {
130 return v->arch.hvm_vmx.efer & EFER_LME;
131 }
133 static int vmx_long_mode_enabled(struct vcpu *v)
134 {
135 return v->arch.hvm_vmx.efer & EFER_LMA;
136 }
138 static void vmx_enable_long_mode(struct vcpu *v)
139 {
140 unsigned long vm_entry_value;
142 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
143 vm_entry_value |= VM_ENTRY_IA32E_MODE;
144 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
146 v->arch.hvm_vmx.efer |= EFER_LMA;
147 }
149 static void vmx_disable_long_mode(struct vcpu *v)
150 {
151 unsigned long vm_entry_value;
153 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
154 vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
155 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
157 v->arch.hvm_vmx.efer &= ~EFER_LMA;
158 }
160 static DEFINE_PER_CPU(struct vmx_msr_state, host_msr_state);
162 static u32 msr_index[VMX_MSR_COUNT] =
163 {
164 MSR_LSTAR, MSR_STAR, MSR_SYSCALL_MASK
165 };
167 static void vmx_save_host_msrs(void)
168 {
169 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
170 int i;
172 for ( i = 0; i < VMX_MSR_COUNT; i++ )
173 rdmsrl(msr_index[i], host_msr_state->msrs[i]);
174 }
176 #define WRITE_MSR(address) \
177 guest_msr_state->msrs[VMX_INDEX_MSR_ ## address] = msr_content; \
178 set_bit(VMX_INDEX_MSR_ ## address, &guest_msr_state->flags); \
179 wrmsrl(MSR_ ## address, msr_content); \
180 set_bit(VMX_INDEX_MSR_ ## address, &host_msr_state->flags); \
181 break
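/*
 * For illustration: within the MSR-write switch below, WRITE_MSR(STAR)
 * expands to roughly
 *     guest_msr_state->msrs[VMX_INDEX_MSR_STAR] = msr_content;
 *     set_bit(VMX_INDEX_MSR_STAR, &guest_msr_state->flags);
 *     wrmsrl(MSR_STAR, msr_content);
 *     set_bit(VMX_INDEX_MSR_STAR, &host_msr_state->flags);
 *     break;
 * i.e. it records the guest value for later context switches, loads it into
 * the physical MSR now, and marks the host copy as needing restoration.
 */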
183 static enum handler_return long_mode_do_msr_read(struct cpu_user_regs *regs)
184 {
185 u64 msr_content = 0;
186 u32 ecx = regs->ecx;
187 struct vcpu *v = current;
188 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
190 switch ( ecx )
191 {
192 case MSR_EFER:
193 msr_content = v->arch.hvm_vmx.efer;
194 break;
196 case MSR_FS_BASE:
197 msr_content = __vmread(GUEST_FS_BASE);
198 goto check_long_mode;
200 case MSR_GS_BASE:
201 msr_content = __vmread(GUEST_GS_BASE);
202 goto check_long_mode;
204 case MSR_SHADOW_GS_BASE:
205 msr_content = v->arch.hvm_vmx.shadow_gs;
206 check_long_mode:
207 if ( !(vmx_long_mode_enabled(v)) )
208 {
209 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
210 return HNDL_exception_raised;
211 }
212 break;
214 case MSR_STAR:
215 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_STAR];
216 break;
218 case MSR_LSTAR:
219 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_LSTAR];
220 break;
222 case MSR_CSTAR:
223 msr_content = v->arch.hvm_vmx.cstar;
224 break;
226 case MSR_SYSCALL_MASK:
227 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
228 break;
230 default:
231 return HNDL_unhandled;
232 }
234 HVM_DBG_LOG(DBG_LEVEL_0, "msr 0x%x content 0x%"PRIx64, ecx, msr_content);
236 regs->eax = (u32)(msr_content >> 0);
237 regs->edx = (u32)(msr_content >> 32);
239 return HNDL_done;
240 }
242 static enum handler_return long_mode_do_msr_write(struct cpu_user_regs *regs)
243 {
244 u64 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
245 u32 ecx = regs->ecx;
246 struct vcpu *v = current;
247 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
248 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
250 HVM_DBG_LOG(DBG_LEVEL_0, "msr 0x%x content 0x%"PRIx64, ecx, msr_content);
252 switch ( ecx )
253 {
254 case MSR_EFER:
255 /* offending reserved bit will cause #GP */
256 if ( (msr_content & ~(EFER_LME | EFER_LMA | EFER_NX | EFER_SCE)) ||
257 (!cpu_has_nx && (msr_content & EFER_NX)) ||
258 (!cpu_has_syscall && (msr_content & EFER_SCE)) )
259 {
260 gdprintk(XENLOG_WARNING, "Trying to set reserved bit in "
261 "EFER: %"PRIx64"\n", msr_content);
262 goto gp_fault;
263 }
265 if ( (msr_content & EFER_LME)
266 && !(v->arch.hvm_vmx.efer & EFER_LME) )
267 {
268 if ( unlikely(vmx_paging_enabled(v)) )
269 {
270 gdprintk(XENLOG_WARNING,
271 "Trying to set EFER.LME with paging enabled\n");
272 goto gp_fault;
273 }
274 }
275 else if ( !(msr_content & EFER_LME)
276 && (v->arch.hvm_vmx.efer & EFER_LME) )
277 {
278 if ( unlikely(vmx_paging_enabled(v)) )
279 {
280 gdprintk(XENLOG_WARNING,
281 "Trying to clear EFER.LME with paging enabled\n");
282 goto gp_fault;
283 }
284 }
286 if ( (msr_content ^ v->arch.hvm_vmx.efer) & (EFER_NX|EFER_SCE) )
287 write_efer((read_efer() & ~(EFER_NX|EFER_SCE)) |
288 (msr_content & (EFER_NX|EFER_SCE)));
290 v->arch.hvm_vmx.efer = msr_content;
291 break;
293 case MSR_FS_BASE:
294 case MSR_GS_BASE:
295 case MSR_SHADOW_GS_BASE:
296 if ( !vmx_long_mode_enabled(v) )
297 goto gp_fault;
299 if ( !is_canonical_address(msr_content) )
300 goto uncanonical_address;
302 if ( ecx == MSR_FS_BASE )
303 __vmwrite(GUEST_FS_BASE, msr_content);
304 else if ( ecx == MSR_GS_BASE )
305 __vmwrite(GUEST_GS_BASE, msr_content);
306 else
307 {
308 v->arch.hvm_vmx.shadow_gs = msr_content;
309 wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
310 }
312 break;
314 case MSR_STAR:
315 WRITE_MSR(STAR);
317 case MSR_LSTAR:
318 if ( !is_canonical_address(msr_content) )
319 goto uncanonical_address;
320 WRITE_MSR(LSTAR);
322 case MSR_CSTAR:
323 if ( !is_canonical_address(msr_content) )
324 goto uncanonical_address;
325 v->arch.hvm_vmx.cstar = msr_content;
326 break;
328 case MSR_SYSCALL_MASK:
329 WRITE_MSR(SYSCALL_MASK);
331 default:
332 return HNDL_unhandled;
333 }
335 return HNDL_done;
337 uncanonical_address:
338 HVM_DBG_LOG(DBG_LEVEL_0, "Non-canonical address for MSR write %x", ecx);
339 gp_fault:
340 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
341 return HNDL_exception_raised;
342 }
344 /*
345 * To avoid MSR save/restore at every VM exit/entry time, we restore
346 * the x86_64 specific MSRs at domain switch time. Since these MSRs
347 * are not modified once set for para domains, we don't save them,
348 * but simply reset them to values set in percpu_traps_init().
349 */
350 static void vmx_restore_host_msrs(void)
351 {
352 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
353 int i;
355 while ( host_msr_state->flags )
356 {
357 i = find_first_set_bit(host_msr_state->flags);
358 wrmsrl(msr_index[i], host_msr_state->msrs[i]);
359 clear_bit(i, &host_msr_state->flags);
360 }
362 if ( cpu_has_nx && !(read_efer() & EFER_NX) )
363 write_efer(read_efer() | EFER_NX);
364 }
366 static void vmx_save_guest_msrs(struct vcpu *v)
367 {
368 /* MSR_SHADOW_GS_BASE may have been changed by swapgs instruction. */
369 rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
370 }
372 static void vmx_restore_guest_msrs(struct vcpu *v)
373 {
374 struct vmx_msr_state *guest_msr_state, *host_msr_state;
375 unsigned long guest_flags;
376 int i;
378 guest_msr_state = &v->arch.hvm_vmx.msr_state;
379 host_msr_state = &this_cpu(host_msr_state);
381 wrmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
383 guest_flags = guest_msr_state->flags;
385 while ( guest_flags )
386 {
387 i = find_first_set_bit(guest_flags);
389 HVM_DBG_LOG(DBG_LEVEL_2,
390 "restore guest's index %d msr %x with value %lx",
391 i, msr_index[i], guest_msr_state->msrs[i]);
392 set_bit(i, &host_msr_state->flags);
393 wrmsrl(msr_index[i], guest_msr_state->msrs[i]);
394 clear_bit(i, &guest_flags);
395 }
397 if ( (v->arch.hvm_vmx.efer ^ read_efer()) & (EFER_NX | EFER_SCE) )
398 {
399 HVM_DBG_LOG(DBG_LEVEL_2,
400 "restore guest's EFER with value %lx",
401 v->arch.hvm_vmx.efer);
402 write_efer((read_efer() & ~(EFER_NX | EFER_SCE)) |
403 (v->arch.hvm_vmx.efer & (EFER_NX | EFER_SCE)));
404 }
405 }
407 #else /* __i386__ */
409 static int vmx_lme_is_set(struct vcpu *v)
410 { return 0; }
411 static int vmx_long_mode_enabled(struct vcpu *v)
412 { return 0; }
413 static void vmx_enable_long_mode(struct vcpu *v)
414 { BUG(); }
415 static void vmx_disable_long_mode(struct vcpu *v)
416 { BUG(); }
418 #define vmx_save_host_msrs() ((void)0)
420 static void vmx_restore_host_msrs(void)
421 {
422 if ( cpu_has_nx && !(read_efer() & EFER_NX) )
423 write_efer(read_efer() | EFER_NX);
424 }
426 #define vmx_save_guest_msrs(v) ((void)0)
428 static void vmx_restore_guest_msrs(struct vcpu *v)
429 {
430 if ( (v->arch.hvm_vmx.efer ^ read_efer()) & EFER_NX )
431 {
432 HVM_DBG_LOG(DBG_LEVEL_2,
433 "restore guest's EFER with value %lx",
434 v->arch.hvm_vmx.efer);
435 write_efer((read_efer() & ~EFER_NX) |
436 (v->arch.hvm_vmx.efer & EFER_NX));
437 }
438 }
440 static enum handler_return long_mode_do_msr_read(struct cpu_user_regs *regs)
441 {
442 u64 msr_content = 0;
443 struct vcpu *v = current;
445 switch ( regs->ecx ) {
446 case MSR_EFER:
447 msr_content = v->arch.hvm_vmx.efer;
448 break;
450 default:
451 return HNDL_unhandled;
452 }
454 regs->eax = msr_content >> 0;
455 regs->edx = msr_content >> 32;
457 return HNDL_done;
458 }
460 static enum handler_return long_mode_do_msr_write(struct cpu_user_regs *regs)
461 {
462 u64 msr_content = regs->eax | ((u64)regs->edx << 32);
463 struct vcpu *v = current;
465 switch ( regs->ecx )
466 {
467 case MSR_EFER:
468 /* offending reserved bit will cause #GP */
469 if ( (msr_content & ~EFER_NX) ||
470 (!cpu_has_nx && (msr_content & EFER_NX)) )
471 {
472 gdprintk(XENLOG_WARNING, "Trying to set reserved bit in "
473 "EFER: %"PRIx64"\n", msr_content);
474 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
475 return HNDL_exception_raised;
476 }
478 if ( (msr_content ^ v->arch.hvm_vmx.efer) & EFER_NX )
479 write_efer((read_efer() & ~EFER_NX) | (msr_content & EFER_NX));
481 v->arch.hvm_vmx.efer = msr_content;
482 break;
484 default:
485 return HNDL_unhandled;
486 }
488 return HNDL_done;
489 }
491 #endif /* __i386__ */
493 #define loaddebug(_v,_reg) \
494 __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
495 #define savedebug(_v,_reg) \
496 __asm__ __volatile__ ("mov %%db" #_reg ",%0" : : "r" ((_v)->debugreg[_reg]))
498 static int vmx_guest_x86_mode(struct vcpu *v)
499 {
500 unsigned int cs_ar_bytes;
502 ASSERT(v == current);
504 if ( unlikely(!(v->arch.hvm_vmx.cpu_shadow_cr0 & X86_CR0_PE)) )
505 return 0;
506 if ( unlikely(__vmread(GUEST_RFLAGS) & X86_EFLAGS_VM) )
507 return 1;
508 cs_ar_bytes = __vmread(GUEST_CS_AR_BYTES);
509 if ( vmx_long_mode_enabled(v) &&
510 likely(cs_ar_bytes & X86_SEG_AR_CS_LM_ACTIVE) )
511 return 8;
512 return (likely(cs_ar_bytes & X86_SEG_AR_DEF_OP_SIZE) ? 4 : 2);
513 }
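/*
 * Return values above: 0 = real mode (CR0.PE clear), 1 = virtual-8086 mode,
 * 8 = 64-bit long mode (CS.L set), 4 = 32-bit protected mode (CS.D set),
 * 2 = 16-bit protected mode.
 */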
515 static void vmx_save_dr(struct vcpu *v)
516 {
517 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
518 return;
520 /* Clear the DR dirty flag and re-enable intercepts for DR accesses. */
521 v->arch.hvm_vcpu.flag_dr_dirty = 0;
522 v->arch.hvm_vmx.exec_control |= CPU_BASED_MOV_DR_EXITING;
523 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
525 savedebug(&v->arch.guest_context, 0);
526 savedebug(&v->arch.guest_context, 1);
527 savedebug(&v->arch.guest_context, 2);
528 savedebug(&v->arch.guest_context, 3);
529 savedebug(&v->arch.guest_context, 6);
530 v->arch.guest_context.debugreg[7] = __vmread(GUEST_DR7);
531 }
533 static void __restore_debug_registers(struct vcpu *v)
534 {
535 loaddebug(&v->arch.guest_context, 0);
536 loaddebug(&v->arch.guest_context, 1);
537 loaddebug(&v->arch.guest_context, 2);
538 loaddebug(&v->arch.guest_context, 3);
539 /* No 4 and 5 */
540 loaddebug(&v->arch.guest_context, 6);
541 /* DR7 is loaded from the VMCS. */
542 }
544 void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
545 {
546 uint32_t ev;
548 vmx_vmcs_enter(v);
550 c->rip = __vmread(GUEST_RIP);
551 c->rsp = __vmread(GUEST_RSP);
552 c->rflags = __vmread(GUEST_RFLAGS);
554 c->cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;
555 c->cr2 = v->arch.hvm_vmx.cpu_cr2;
556 c->cr3 = v->arch.hvm_vmx.cpu_cr3;
557 c->cr4 = v->arch.hvm_vmx.cpu_shadow_cr4;
559 c->msr_efer = v->arch.hvm_vmx.efer;
561 #ifdef HVM_DEBUG_SUSPEND
562 printk("%s: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
563 __func__, c->cr3, c->cr0, c->cr4);
564 #endif
566 c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
567 c->idtr_base = __vmread(GUEST_IDTR_BASE);
569 c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
570 c->gdtr_base = __vmread(GUEST_GDTR_BASE);
572 c->cs_sel = __vmread(GUEST_CS_SELECTOR);
573 c->cs_limit = __vmread(GUEST_CS_LIMIT);
574 c->cs_base = __vmread(GUEST_CS_BASE);
575 c->cs_arbytes = __vmread(GUEST_CS_AR_BYTES);
577 c->ds_sel = __vmread(GUEST_DS_SELECTOR);
578 c->ds_limit = __vmread(GUEST_DS_LIMIT);
579 c->ds_base = __vmread(GUEST_DS_BASE);
580 c->ds_arbytes = __vmread(GUEST_DS_AR_BYTES);
582 c->es_sel = __vmread(GUEST_ES_SELECTOR);
583 c->es_limit = __vmread(GUEST_ES_LIMIT);
584 c->es_base = __vmread(GUEST_ES_BASE);
585 c->es_arbytes = __vmread(GUEST_ES_AR_BYTES);
587 c->ss_sel = __vmread(GUEST_SS_SELECTOR);
588 c->ss_limit = __vmread(GUEST_SS_LIMIT);
589 c->ss_base = __vmread(GUEST_SS_BASE);
590 c->ss_arbytes = __vmread(GUEST_SS_AR_BYTES);
592 c->fs_sel = __vmread(GUEST_FS_SELECTOR);
593 c->fs_limit = __vmread(GUEST_FS_LIMIT);
594 c->fs_base = __vmread(GUEST_FS_BASE);
595 c->fs_arbytes = __vmread(GUEST_FS_AR_BYTES);
597 c->gs_sel = __vmread(GUEST_GS_SELECTOR);
598 c->gs_limit = __vmread(GUEST_GS_LIMIT);
599 c->gs_base = __vmread(GUEST_GS_BASE);
600 c->gs_arbytes = __vmread(GUEST_GS_AR_BYTES);
602 c->tr_sel = __vmread(GUEST_TR_SELECTOR);
603 c->tr_limit = __vmread(GUEST_TR_LIMIT);
604 c->tr_base = __vmread(GUEST_TR_BASE);
605 c->tr_arbytes = __vmread(GUEST_TR_AR_BYTES);
607 c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR);
608 c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT);
609 c->ldtr_base = __vmread(GUEST_LDTR_BASE);
610 c->ldtr_arbytes = __vmread(GUEST_LDTR_AR_BYTES);
612 c->sysenter_cs = __vmread(GUEST_SYSENTER_CS);
613 c->sysenter_esp = __vmread(GUEST_SYSENTER_ESP);
614 c->sysenter_eip = __vmread(GUEST_SYSENTER_EIP);
616 /*
617 * Save any event/interrupt that was being injected when we last
618 * exited. IDT_VECTORING_INFO_FIELD has priority, as anything in
619 * VM_ENTRY_INTR_INFO_FIELD is either a fault caused by the first
620 * event, which will happen the next time, or an interrupt, which we
621 * never inject when IDT_VECTORING_INFO_FIELD is valid.
622 */
623 if ( (ev = __vmread(IDT_VECTORING_INFO_FIELD)) & INTR_INFO_VALID_MASK )
624 {
625 c->pending_event = ev;
626 c->error_code = __vmread(IDT_VECTORING_ERROR_CODE);
627 }
628 else if ( (ev = __vmread(VM_ENTRY_INTR_INFO_FIELD)) &
629 INTR_INFO_VALID_MASK )
630 {
631 c->pending_event = ev;
632 c->error_code = __vmread(VM_ENTRY_EXCEPTION_ERROR_CODE);
633 }
634 else
635 {
636 c->pending_event = 0;
637 c->error_code = 0;
638 }
640 vmx_vmcs_exit(v);
641 }
643 int vmx_vmcs_restore(struct vcpu *v, struct hvm_hw_cpu *c)
644 {
645 unsigned long mfn, old_base_mfn;
647 vmx_vmcs_enter(v);
649 __vmwrite(GUEST_RIP, c->rip);
650 __vmwrite(GUEST_RSP, c->rsp);
651 __vmwrite(GUEST_RFLAGS, c->rflags);
653 v->arch.hvm_vmx.cpu_cr0 = (c->cr0 | X86_CR0_PE | X86_CR0_PG |
654 X86_CR0_NE | X86_CR0_WP | X86_CR0_ET);
655 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
656 v->arch.hvm_vmx.cpu_shadow_cr0 = c->cr0;
657 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
659 v->arch.hvm_vmx.cpu_cr2 = c->cr2;
661 v->arch.hvm_vmx.efer = c->msr_efer;
663 #ifdef HVM_DEBUG_SUSPEND
664 printk("%s: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
665 __func__, c->cr3, c->cr0, c->cr4);
666 #endif
668 if ( !vmx_paging_enabled(v) )
669 {
670 HVM_DBG_LOG(DBG_LEVEL_VMMU, "%s: paging not enabled.", __func__);
671 goto skip_cr3;
672 }
674 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 = %"PRIx64, c->cr3);
675 /* current != vcpu here, as this is not called from arch_vmx_do_launch. */
676 mfn = gmfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT);
677 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
678 {
679 gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%"PRIx64".\n", c->cr3);
680 vmx_vmcs_exit(v);
681 return -EINVAL;
682 }
684 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
685 v->arch.guest_table = pagetable_from_pfn(mfn);
686 if ( old_base_mfn )
687 put_page(mfn_to_page(old_base_mfn));
689 skip_cr3:
690 v->arch.hvm_vmx.cpu_cr3 = c->cr3;
692 if ( vmx_long_mode_enabled(v) )
693 vmx_enable_long_mode(v);
695 __vmwrite(GUEST_CR4, (c->cr4 | HVM_CR4_HOST_MASK));
696 v->arch.hvm_vmx.cpu_shadow_cr4 = c->cr4;
697 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);
699 __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
700 __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
702 __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
703 __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
705 __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
706 __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
707 __vmwrite(GUEST_CS_BASE, c->cs_base);
708 __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes);
710 __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
711 __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
712 __vmwrite(GUEST_DS_BASE, c->ds_base);
713 __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes);
715 __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
716 __vmwrite(GUEST_ES_LIMIT, c->es_limit);
717 __vmwrite(GUEST_ES_BASE, c->es_base);
718 __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes);
720 __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
721 __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
722 __vmwrite(GUEST_SS_BASE, c->ss_base);
723 __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes);
725 __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
726 __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
727 __vmwrite(GUEST_FS_BASE, c->fs_base);
728 __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes);
730 __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
731 __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
732 __vmwrite(GUEST_GS_BASE, c->gs_base);
733 __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes);
735 __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
736 __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
737 __vmwrite(GUEST_TR_BASE, c->tr_base);
738 __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes);
740 __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
741 __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
742 __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
743 __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes);
745 __vmwrite(GUEST_SYSENTER_CS, c->sysenter_cs);
746 __vmwrite(GUEST_SYSENTER_ESP, c->sysenter_esp);
747 __vmwrite(GUEST_SYSENTER_EIP, c->sysenter_eip);
749 __vmwrite(GUEST_DR7, c->dr7);
751 vmx_vmcs_exit(v);
753 paging_update_paging_modes(v);
755 if ( c->pending_valid )
756 {
757 vmx_vmcs_enter(v);
759 gdprintk(XENLOG_INFO, "Re-injecting 0x%"PRIx32", 0x%"PRIx32"\n",
760 c->pending_event, c->error_code);
762 /* SVM uses type 3 ("Exception") for #OF and #BP; VMX uses type 6 */
763 if ( (c->pending_type == 3) &&
764 ((c->pending_vector == 3) || (c->pending_vector == 4)) )
765 c->pending_type = 6;
767 /* For software exceptions, we need to tell the hardware the
768 * instruction length as well (hmmm). */
769 if ( c->pending_type > 4 )
770 {
771 int addrbytes, ilen;
772 if ( (c->cs_arbytes & X86_SEG_AR_CS_LM_ACTIVE) &&
773 (c->msr_efer & EFER_LMA) )
774 addrbytes = 8;
775 else if ( c->cs_arbytes & X86_SEG_AR_DEF_OP_SIZE )
776 addrbytes = 4;
777 else
778 addrbytes = 2;
780 ilen = hvm_instruction_length(c->rip, addrbytes);
781 __vmwrite(VM_ENTRY_INSTRUCTION_LEN, ilen);
782 }
784 /* Sanity check */
785 if ( (c->pending_type == 1) || (c->pending_type > 6) ||
786 (c->pending_reserved != 0) )
787 {
788 gdprintk(XENLOG_ERR, "Invalid pending event 0x%"PRIx32".\n",
789 c->pending_event);
790 return -EINVAL;
791 }
793 /* Re-inject the exception */
794 __vmwrite(VM_ENTRY_INTR_INFO_FIELD, c->pending_event);
795 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, c->error_code);
796 v->arch.hvm_vmx.vector_injected = 1;
798 vmx_vmcs_exit(v);
799 }
801 return 0;
802 }
804 #if defined(__x86_64__) && defined(HVM_DEBUG_SUSPEND)
805 static void dump_msr_state(struct vmx_msr_state *m)
806 {
807 int i = 0;
808 printk("**** msr state ****\n");
809 printk("shadow_gs=0x%lx, flags=0x%lx, msr_items:", m->shadow_gs, m->flags);
810 for ( i = 0; i < VMX_MSR_COUNT; i++ )
811 printk("0x%lx,", m->msrs[i]);
812 printk("\n");
813 }
814 #else
815 #define dump_msr_state(m) ((void)0)
816 #endif
818 static void vmx_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
819 {
820 #ifdef __x86_64__
821 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
822 unsigned long guest_flags = guest_state->flags;
824 data->shadow_gs = v->arch.hvm_vmx.shadow_gs;
825 data->msr_cstar = v->arch.hvm_vmx.cstar;
827 /* save msrs */
828 data->msr_flags = guest_flags;
829 data->msr_lstar = guest_state->msrs[VMX_INDEX_MSR_LSTAR];
830 data->msr_star = guest_state->msrs[VMX_INDEX_MSR_STAR];
831 data->msr_syscall_mask = guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
832 #endif
834 data->tsc = hvm_get_guest_time(v);
836 dump_msr_state(guest_state);
837 }
839 static void vmx_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
840 {
841 #ifdef __x86_64__
842 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
844 /* restore msrs */
845 guest_state->flags = data->msr_flags;
846 guest_state->msrs[VMX_INDEX_MSR_LSTAR] = data->msr_lstar;
847 guest_state->msrs[VMX_INDEX_MSR_STAR] = data->msr_star;
848 guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK] = data->msr_syscall_mask;
850 v->arch.hvm_vmx.cstar = data->msr_cstar;
851 v->arch.hvm_vmx.shadow_gs = data->shadow_gs;
852 #endif
854 v->arch.hvm_vmx.vmxassist_enabled = !(data->cr0 & X86_CR0_PE);
856 hvm_set_guest_time(v, data->tsc);
858 dump_msr_state(guest_state);
859 }
862 static void vmx_save_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
863 {
864 vmx_save_cpu_state(v, ctxt);
865 vmx_vmcs_save(v, ctxt);
866 }
868 static int vmx_load_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
869 {
870 vmx_load_cpu_state(v, ctxt);
872 if ( vmx_vmcs_restore(v, ctxt) )
873 {
874 gdprintk(XENLOG_ERR, "vmx_vmcs restore failed!\n");
875 domain_crash(v->domain);
876 return -EINVAL;
877 }
879 return 0;
880 }
882 /*
883 * DR7 is saved and restored on every vmexit. Other debug registers only
884 * need to be restored if their value is going to affect execution -- i.e.,
885 * if one of the breakpoints is enabled. So mask out all bits that don't
886 * enable some breakpoint functionality.
887 */
888 #define DR7_ACTIVE_MASK 0xff
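/*
 * Bits 0-7 of DR7 are the local/global enable bits (L0/G0 .. L3/G3) for the
 * four hardware breakpoints, hence the 0xff mask: if none of them is set,
 * no breakpoint can fire and DR0-DR6 need not be reloaded.
 */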
890 static void vmx_restore_dr(struct vcpu *v)
891 {
892 /* NB. __vmread() is not usable here, so we cannot read from the VMCS. */
893 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
894 __restore_debug_registers(v);
895 }
897 static void vmx_ctxt_switch_from(struct vcpu *v)
898 {
899 vmx_save_guest_msrs(v);
900 vmx_restore_host_msrs();
901 vmx_save_dr(v);
902 }
904 static void vmx_ctxt_switch_to(struct vcpu *v)
905 {
906 vmx_restore_guest_msrs(v);
907 vmx_restore_dr(v);
908 }
910 static void vmx_store_cpu_guest_regs(
911 struct vcpu *v, struct cpu_user_regs *regs, unsigned long *crs)
912 {
913 vmx_vmcs_enter(v);
915 if ( regs != NULL )
916 {
917 regs->eflags = __vmread(GUEST_RFLAGS);
918 regs->ss = __vmread(GUEST_SS_SELECTOR);
919 regs->cs = __vmread(GUEST_CS_SELECTOR);
920 regs->eip = __vmread(GUEST_RIP);
921 regs->esp = __vmread(GUEST_RSP);
922 }
924 if ( crs != NULL )
925 {
926 crs[0] = v->arch.hvm_vmx.cpu_shadow_cr0;
927 crs[2] = v->arch.hvm_vmx.cpu_cr2;
928 crs[3] = v->arch.hvm_vmx.cpu_cr3;
929 crs[4] = v->arch.hvm_vmx.cpu_shadow_cr4;
930 }
932 vmx_vmcs_exit(v);
933 }
935 static void vmx_load_cpu_guest_regs(struct vcpu *v, struct cpu_user_regs *regs)
936 {
937 unsigned long base;
939 vmx_vmcs_enter(v);
941 __vmwrite(GUEST_SS_SELECTOR, regs->ss);
942 __vmwrite(GUEST_RSP, regs->esp);
944 /* NB. Bit 1 of RFLAGS must be set for VMENTRY to succeed. */
945 __vmwrite(GUEST_RFLAGS, regs->eflags | 2UL);
947 if ( regs->eflags & EF_VM )
948 {
949 /*
950 * The VMX spec (section 4.3.1.2, Checks on Guest Segment
951 * Registers) says that virtual-8086 mode guests' segment
952 * base-address fields in the VMCS must be equal to their
953 * corresponding segment selector field shifted right by
954 * four bits upon vmentry.
955 */
956 base = __vmread(GUEST_CS_BASE);
957 if ( (regs->cs << 4) != base )
958 __vmwrite(GUEST_CS_BASE, regs->cs << 4);
959 base = __vmread(GUEST_SS_BASE);
960 if ( (regs->ss << 4) != base )
961 __vmwrite(GUEST_SS_BASE, regs->ss << 4);
962 }
964 __vmwrite(GUEST_CS_SELECTOR, regs->cs);
965 __vmwrite(GUEST_RIP, regs->eip);
967 vmx_vmcs_exit(v);
968 }
970 static unsigned long vmx_get_ctrl_reg(struct vcpu *v, unsigned int num)
971 {
972 switch ( num )
973 {
974 case 0:
975 return v->arch.hvm_vmx.cpu_cr0;
976 case 2:
977 return v->arch.hvm_vmx.cpu_cr2;
978 case 3:
979 return v->arch.hvm_vmx.cpu_cr3;
980 case 4:
981 return v->arch.hvm_vmx.cpu_shadow_cr4;
982 default:
983 BUG();
984 }
985 return 0; /* dummy */
986 }
988 static unsigned long vmx_get_segment_base(struct vcpu *v, enum x86_segment seg)
989 {
990 unsigned long base = 0;
991 int long_mode = 0;
993 ASSERT(v == current);
995 if ( vmx_long_mode_enabled(v) &&
996 (__vmread(GUEST_CS_AR_BYTES) & X86_SEG_AR_CS_LM_ACTIVE) )
997 long_mode = 1;
999 switch ( seg )
1000 {
1001 case x86_seg_cs: if ( !long_mode ) base = __vmread(GUEST_CS_BASE); break;
1002 case x86_seg_ds: if ( !long_mode ) base = __vmread(GUEST_DS_BASE); break;
1003 case x86_seg_es: if ( !long_mode ) base = __vmread(GUEST_ES_BASE); break;
1004 case x86_seg_fs: base = __vmread(GUEST_FS_BASE); break;
1005 case x86_seg_gs: base = __vmread(GUEST_GS_BASE); break;
1006 case x86_seg_ss: if ( !long_mode ) base = __vmread(GUEST_SS_BASE); break;
1007 case x86_seg_tr: base = __vmread(GUEST_TR_BASE); break;
1008 case x86_seg_gdtr: base = __vmread(GUEST_GDTR_BASE); break;
1009 case x86_seg_idtr: base = __vmread(GUEST_IDTR_BASE); break;
1010 case x86_seg_ldtr: base = __vmread(GUEST_LDTR_BASE); break;
1011 default: BUG(); break;
1012 }
1014 return base;
1015 }
1017 static void vmx_get_segment_register(struct vcpu *v, enum x86_segment seg,
1018 struct segment_register *reg)
1020 u16 attr = 0;
1022 ASSERT(v == current);
1024 switch ( seg )
1026 case x86_seg_cs:
1027 reg->sel = __vmread(GUEST_CS_SELECTOR);
1028 reg->limit = __vmread(GUEST_CS_LIMIT);
1029 reg->base = __vmread(GUEST_CS_BASE);
1030 attr = __vmread(GUEST_CS_AR_BYTES);
1031 break;
1032 case x86_seg_ds:
1033 reg->sel = __vmread(GUEST_DS_SELECTOR);
1034 reg->limit = __vmread(GUEST_DS_LIMIT);
1035 reg->base = __vmread(GUEST_DS_BASE);
1036 attr = __vmread(GUEST_DS_AR_BYTES);
1037 break;
1038 case x86_seg_es:
1039 reg->sel = __vmread(GUEST_ES_SELECTOR);
1040 reg->limit = __vmread(GUEST_ES_LIMIT);
1041 reg->base = __vmread(GUEST_ES_BASE);
1042 attr = __vmread(GUEST_ES_AR_BYTES);
1043 break;
1044 case x86_seg_fs:
1045 reg->sel = __vmread(GUEST_FS_SELECTOR);
1046 reg->limit = __vmread(GUEST_FS_LIMIT);
1047 reg->base = __vmread(GUEST_FS_BASE);
1048 attr = __vmread(GUEST_FS_AR_BYTES);
1049 break;
1050 case x86_seg_gs:
1051 reg->sel = __vmread(GUEST_GS_SELECTOR);
1052 reg->limit = __vmread(GUEST_GS_LIMIT);
1053 reg->base = __vmread(GUEST_GS_BASE);
1054 attr = __vmread(GUEST_GS_AR_BYTES);
1055 break;
1056 case x86_seg_ss:
1057 reg->sel = __vmread(GUEST_SS_SELECTOR);
1058 reg->limit = __vmread(GUEST_SS_LIMIT);
1059 reg->base = __vmread(GUEST_SS_BASE);
1060 attr = __vmread(GUEST_SS_AR_BYTES);
1061 break;
1062 case x86_seg_tr:
1063 reg->sel = __vmread(GUEST_TR_SELECTOR);
1064 reg->limit = __vmread(GUEST_TR_LIMIT);
1065 reg->base = __vmread(GUEST_TR_BASE);
1066 attr = __vmread(GUEST_TR_AR_BYTES);
1067 break;
1068 case x86_seg_gdtr:
1069 reg->limit = __vmread(GUEST_GDTR_LIMIT);
1070 reg->base = __vmread(GUEST_GDTR_BASE);
1071 break;
1072 case x86_seg_idtr:
1073 reg->limit = __vmread(GUEST_IDTR_LIMIT);
1074 reg->base = __vmread(GUEST_IDTR_BASE);
1075 break;
1076 case x86_seg_ldtr:
1077 reg->sel = __vmread(GUEST_LDTR_SELECTOR);
1078 reg->limit = __vmread(GUEST_LDTR_LIMIT);
1079 reg->base = __vmread(GUEST_LDTR_BASE);
1080 attr = __vmread(GUEST_LDTR_AR_BYTES);
1081 break;
1082 default:
1083 BUG();
1086 reg->attr.bytes = (attr & 0xff) | ((attr >> 4) & 0xf00);
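/*
 * The VMCS access-rights field has a reserved gap in bits 8-11, so the line
 * above repacks it into the 12-bit descriptor attribute layout: type/S/DPL/P
 * stay in bits 0-7, and AVL/L/D-B/G move from bits 12-15 down to bits 8-11.
 */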
1089 /* Make sure that Xen intercepts any FP accesses from current. */
1090 static void vmx_stts(struct vcpu *v)
1091 {
1092 /* VMX depends on operating on the current vcpu */
1093 ASSERT(v == current);
1095 /*
1096 * If the guest does not have TS enabled then we must cause and handle an
1097 * exception on first use of the FPU. If the guest *does* have TS enabled
1098 * then this is not necessary: no FPU activity can occur until the guest
1099 * clears CR0.TS, and we will initialise the FPU when that happens.
1100 */
1101 if ( !(v->arch.hvm_vmx.cpu_shadow_cr0 & X86_CR0_TS) )
1102 {
1103 v->arch.hvm_vmx.cpu_cr0 |= X86_CR0_TS;
1104 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
1105 __vm_set_bit(EXCEPTION_BITMAP, TRAP_no_device);
1106 }
1107 }
1109 static void vmx_set_tsc_offset(struct vcpu *v, u64 offset)
1110 {
1111 vmx_vmcs_enter(v);
1112 __vmwrite(TSC_OFFSET, offset);
1113 #if defined (__i386__)
1114 __vmwrite(TSC_OFFSET_HIGH, offset >> 32);
1115 #endif
1116 vmx_vmcs_exit(v);
1117 }
1119 static void vmx_init_ap_context(
1120 struct vcpu_guest_context *ctxt, int vcpuid, int trampoline_vector)
1121 {
1122 memset(ctxt, 0, sizeof(*ctxt));
1123 ctxt->user_regs.eip = VMXASSIST_BASE;
1124 ctxt->user_regs.edx = vcpuid;
1125 ctxt->user_regs.ebx = trampoline_vector;
1126 }
1128 void do_nmi(struct cpu_user_regs *);
1130 static void vmx_init_hypercall_page(struct domain *d, void *hypercall_page)
1131 {
1132 char *p;
1133 int i;
1135 for ( i = 0; i < (PAGE_SIZE / 32); i++ )
1136 {
1137 p = (char *)(hypercall_page + (i * 32));
1138 *(u8 *)(p + 0) = 0xb8; /* mov imm32, %eax */
1139 *(u32 *)(p + 1) = i;
1140 *(u8 *)(p + 5) = 0x0f; /* vmcall */
1141 *(u8 *)(p + 6) = 0x01;
1142 *(u8 *)(p + 7) = 0xc1;
1143 *(u8 *)(p + 8) = 0xc3; /* ret */
1144 }
1146 /* Don't support HYPERVISOR_iret at the moment */
1147 *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
1148 }
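/*
 * Each 32-byte stub written above is, in effect:
 *     b8 <i, 32-bit LE>   mov  $<hypercall-number>, %eax
 *     0f 01 c1            vmcall
 *     c3                  ret
 * so a guest issues hypercall i by calling hypercall_page + i * 32.
 */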
1150 static int vmx_interrupts_enabled(struct vcpu *v, enum hvm_intack type)
1151 {
1152 unsigned long intr_shadow, eflags;
1154 ASSERT(v == current);
1156 intr_shadow = __vmread(GUEST_INTERRUPTIBILITY_INFO);
1158 if ( type == hvm_intack_nmi )
1159 return !(intr_shadow & (VMX_INTR_SHADOW_STI|
1160 VMX_INTR_SHADOW_MOV_SS|
1161 VMX_INTR_SHADOW_NMI));
1163 ASSERT((type == hvm_intack_pic) || (type == hvm_intack_lapic));
1164 eflags = __vmread(GUEST_RFLAGS);
1165 return (!irq_masked(eflags) &&
1166 !(intr_shadow & (VMX_INTR_SHADOW_STI|VMX_INTR_SHADOW_MOV_SS)));
1167 }
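/*
 * Note: the STI and MOV-SS shadows block maskable interrupts for one
 * instruction after the instruction that set them; the NMI shadow blocks
 * further NMIs until the guest's NMI handler executes IRET.
 */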
1169 static void vmx_update_host_cr3(struct vcpu *v)
1170 {
1171 ASSERT((v == current) || !vcpu_runnable(v));
1172 vmx_vmcs_enter(v);
1173 __vmwrite(HOST_CR3, v->arch.cr3);
1174 vmx_vmcs_exit(v);
1175 }
1177 static void vmx_update_guest_cr3(struct vcpu *v)
1178 {
1179 ASSERT((v == current) || !vcpu_runnable(v));
1180 vmx_vmcs_enter(v);
1181 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
1182 vmx_vmcs_exit(v);
1183 }
1185 static void vmx_flush_guest_tlbs(void)
1186 {
1187 /* No tagged TLB support on VMX yet. The fact that we're in Xen
1188 * at all means any guest will have a clean TLB when it's next run,
1189 * because VMRESUME will flush it for us. */
1190 }
1192 static void vmx_inject_exception(
1193 unsigned int trapnr, int errcode, unsigned long cr2)
1194 {
1195 struct vcpu *v = current;
1196 vmx_inject_hw_exception(v, trapnr, errcode);
1197 if ( trapnr == TRAP_page_fault )
1198 v->arch.hvm_vmx.cpu_cr2 = cr2;
1199 }
1201 static void vmx_update_vtpr(struct vcpu *v, unsigned long value)
1202 {
1203 /* VMX doesn't have a V_TPR field */
1204 }
1206 static int vmx_event_injection_faulted(struct vcpu *v)
1207 {
1208 unsigned int idtv_info_field;
1210 ASSERT(v == current);
1212 idtv_info_field = __vmread(IDT_VECTORING_INFO_FIELD);
1213 return (idtv_info_field & INTR_INFO_VALID_MASK);
1214 }
1216 static void disable_intercept_for_msr(u32 msr)
1217 {
1218 /*
1219 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
1220 * have the write-low and read-high bitmap offsets the wrong way round.
1221 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
1222 */
1223 if ( msr <= 0x1fff )
1224 {
1225 __clear_bit(msr, vmx_msr_bitmap + 0x000); /* read-low */
1226 __clear_bit(msr, vmx_msr_bitmap + 0x800); /* write-low */
1227 }
1228 else if ( (msr >= 0xc0000000) && (msr <= 0xc0001fff) )
1229 {
1230 msr &= 0x1fff;
1231 __clear_bit(msr, vmx_msr_bitmap + 0x400); /* read-high */
1232 __clear_bit(msr, vmx_msr_bitmap + 0xc00); /* write-high */
1233 }
1234 }
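/*
 * Layout of the 4K MSR bitmap page manipulated above (one bit per MSR, a
 * set bit means "intercept"):
 *     0x000-0x3ff  read bitmap,  MSRs 0x00000000-0x00001fff
 *     0x400-0x7ff  read bitmap,  MSRs 0xc0000000-0xc0001fff
 *     0x800-0xbff  write bitmap, MSRs 0x00000000-0x00001fff
 *     0xc00-0xfff  write bitmap, MSRs 0xc0000000-0xc0001fff
 * Clearing both the read and the write bit gives the guest direct access to
 * that MSR, e.g. disable_intercept_for_msr(MSR_FS_BASE) as done in
 * start_vmx() below.
 */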
1236 static struct hvm_function_table vmx_function_table = {
1237 .name = "VMX",
1238 .domain_initialise = vmx_domain_initialise,
1239 .domain_destroy = vmx_domain_destroy,
1240 .vcpu_initialise = vmx_vcpu_initialise,
1241 .vcpu_destroy = vmx_vcpu_destroy,
1242 .store_cpu_guest_regs = vmx_store_cpu_guest_regs,
1243 .load_cpu_guest_regs = vmx_load_cpu_guest_regs,
1244 .save_cpu_ctxt = vmx_save_vmcs_ctxt,
1245 .load_cpu_ctxt = vmx_load_vmcs_ctxt,
1246 .paging_enabled = vmx_paging_enabled,
1247 .long_mode_enabled = vmx_long_mode_enabled,
1248 .pae_enabled = vmx_pae_enabled,
1249 .nx_enabled = vmx_nx_enabled,
1250 .interrupts_enabled = vmx_interrupts_enabled,
1251 .guest_x86_mode = vmx_guest_x86_mode,
1252 .get_guest_ctrl_reg = vmx_get_ctrl_reg,
1253 .get_segment_base = vmx_get_segment_base,
1254 .get_segment_register = vmx_get_segment_register,
1255 .update_host_cr3 = vmx_update_host_cr3,
1256 .update_guest_cr3 = vmx_update_guest_cr3,
1257 .flush_guest_tlbs = vmx_flush_guest_tlbs,
1258 .update_vtpr = vmx_update_vtpr,
1259 .stts = vmx_stts,
1260 .set_tsc_offset = vmx_set_tsc_offset,
1261 .inject_exception = vmx_inject_exception,
1262 .init_ap_context = vmx_init_ap_context,
1263 .init_hypercall_page = vmx_init_hypercall_page,
1264 .event_injection_faulted = vmx_event_injection_faulted,
1265 .cpu_up = vmx_cpu_up,
1266 .cpu_down = vmx_cpu_down,
1267 };
1269 void start_vmx(void)
1271 static int bootstrapped;
1273 vmx_save_host_msrs();
1275 if ( bootstrapped )
1277 if ( hvm_enabled && !vmx_cpu_up() )
1279 printk("VMX: FATAL: failed to initialise CPU%d!\n",
1280 smp_processor_id());
1281 BUG();
1283 return;
1286 bootstrapped = 1;
1288 /* Xen does not fill x86_capability words except 0. */
1289 boot_cpu_data.x86_capability[4] = cpuid_ecx(1);
1291 if ( !test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability) )
1292 return;
1294 set_in_cr4(X86_CR4_VMXE);
1296 if ( !vmx_cpu_up() )
1298 printk("VMX: failed to initialise.\n");
1299 return;
1302 setup_vmcs_dump();
1304 hvm_enable(&vmx_function_table);
1306 if ( cpu_has_vmx_msr_bitmap )
1308 printk("VMX: MSR intercept bitmap enabled\n");
1309 vmx_msr_bitmap = alloc_xenheap_page();
1310 BUG_ON(vmx_msr_bitmap == NULL);
1311 memset(vmx_msr_bitmap, ~0, PAGE_SIZE);
1313 disable_intercept_for_msr(MSR_FS_BASE);
1314 disable_intercept_for_msr(MSR_GS_BASE);
1316 disable_intercept_for_msr(MSR_IA32_SYSENTER_CS);
1317 disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP);
1318 disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP);
1322 /*
1323 * Not all cases receive a valid value in the VM-exit instruction-length field.
1324 * Callers must know what they're doing!
1325 */
1326 static int __get_instruction_length(void)
1327 {
1328 int len;
1329 len = __vmread(VM_EXIT_INSTRUCTION_LEN); /* Safe: callers audited */
1330 BUG_ON((len < 1) || (len > 15));
1331 return len;
1332 }
1334 static void __update_guest_eip(unsigned long inst_len)
1335 {
1336 unsigned long x;
1338 x = __vmread(GUEST_RIP);
1339 __vmwrite(GUEST_RIP, x + inst_len);
1341 x = __vmread(GUEST_RFLAGS);
1342 if ( x & X86_EFLAGS_RF )
1343 __vmwrite(GUEST_RFLAGS, x & ~X86_EFLAGS_RF);
1345 x = __vmread(GUEST_INTERRUPTIBILITY_INFO);
1346 if ( x & (VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS) )
1347 {
1348 x &= ~(VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS);
1349 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, x);
1350 }
1351 }
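/*
 * Advancing RIP past an emulated instruction must also clear RF and any
 * STI/MOV-SS interruptibility shadow, just as the hardware would have done
 * had the guest executed the instruction itself.
 */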
1353 static void vmx_do_no_device_fault(void)
1354 {
1355 struct vcpu *v = current;
1357 setup_fpu(current);
1358 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
1360 /* Disable TS in guest CR0 unless the guest wants the exception too. */
1361 if ( !(v->arch.hvm_vmx.cpu_shadow_cr0 & X86_CR0_TS) )
1362 {
1363 v->arch.hvm_vmx.cpu_cr0 &= ~X86_CR0_TS;
1364 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
1365 }
1366 }
1368 #define bitmaskof(idx) (1U << ((idx) & 31))
1369 static void vmx_do_cpuid(struct cpu_user_regs *regs)
1371 unsigned int input = (unsigned int)regs->eax;
1372 unsigned int count = (unsigned int)regs->ecx;
1373 unsigned int eax, ebx, ecx, edx;
1375 if ( input == 0x00000004 )
1377 cpuid_count(input, count, &eax, &ebx, &ecx, &edx);
1378 eax &= NUM_CORES_RESET_MASK;
1380 else if ( input == 0x40000003 )
1382 /*
1383 * NB. Unsupported interface for private use of VMXASSIST only.
1384 * Note that this leaf lives at <max-hypervisor-leaf> + 1.
1385 */
1386 u64 value = ((u64)regs->edx << 32) | (u32)regs->ecx;
1387 unsigned long mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
1388 struct vcpu *v = current;
1389 char *p;
1391 gdprintk(XENLOG_INFO, "Input address is 0x%"PRIx64".\n", value);
1393 /* 8-byte aligned valid pseudophys address from vmxassist, please. */
1394 if ( (value & 7) || (mfn == INVALID_MFN) ||
1395 !v->arch.hvm_vmx.vmxassist_enabled )
1397 domain_crash(v->domain);
1398 return;
1401 p = map_domain_page(mfn);
1402 value = *((uint64_t *)(p + (value & (PAGE_SIZE - 1))));
1403 unmap_domain_page(p);
1405 gdprintk(XENLOG_INFO, "Output value is 0x%"PRIx64".\n", value);
1406 ecx = (u32)value;
1407 edx = (u32)(value >> 32);
1408 } else {
1409 hvm_cpuid(input, &eax, &ebx, &ecx, &edx);
1411 if ( input == 0x00000001 )
1413 /* Mask off reserved bits. */
1414 ecx &= ~VMX_VCPU_CPUID_L1_ECX_RESERVED;
1416 ebx &= NUM_THREADS_RESET_MASK;
1418 /* Unsupportable for virtualised CPUs. */
1419 ecx &= ~(bitmaskof(X86_FEATURE_VMXE) |
1420 bitmaskof(X86_FEATURE_EST) |
1421 bitmaskof(X86_FEATURE_TM2) |
1422 bitmaskof(X86_FEATURE_CID));
1424 edx &= ~(bitmaskof(X86_FEATURE_HT) |
1425 bitmaskof(X86_FEATURE_ACPI) |
1426 bitmaskof(X86_FEATURE_ACC));
1429 if ( input == 0x00000006 || input == 0x00000009 || input == 0x0000000A )
1430 eax = ebx = ecx = edx = 0x0;
1433 regs->eax = (unsigned long)eax;
1434 regs->ebx = (unsigned long)ebx;
1435 regs->ecx = (unsigned long)ecx;
1436 regs->edx = (unsigned long)edx;
1438 HVMTRACE_3D(CPUID, current, input,
1439 ((uint64_t)eax << 32) | ebx, ((uint64_t)ecx << 32) | edx);
1442 #define CASE_GET_REG_P(REG, reg) \
1443 case REG_ ## REG: reg_p = (unsigned long *)&(regs->reg); break
1445 #ifdef __i386__
1446 #define CASE_EXTEND_GET_REG_P
1447 #else
1448 #define CASE_EXTEND_GET_REG_P \
1449 CASE_GET_REG_P(R8, r8); \
1450 CASE_GET_REG_P(R9, r9); \
1451 CASE_GET_REG_P(R10, r10); \
1452 CASE_GET_REG_P(R11, r11); \
1453 CASE_GET_REG_P(R12, r12); \
1454 CASE_GET_REG_P(R13, r13); \
1455 CASE_GET_REG_P(R14, r14); \
1456 CASE_GET_REG_P(R15, r15)
1457 #endif
1459 static void vmx_dr_access(unsigned long exit_qualification,
1460 struct cpu_user_regs *regs)
1461 {
1462 struct vcpu *v = current;
1464 HVMTRACE_0D(DR_WRITE, v);
1466 v->arch.hvm_vcpu.flag_dr_dirty = 1;
1468 /* We could probably be smarter about this */
1469 __restore_debug_registers(v);
1471 /* Allow guest direct access to DR registers */
1472 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MOV_DR_EXITING;
1473 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
1474 v->arch.hvm_vmx.exec_control);
1475 }
1477 /*
1478 * Invalidate the TLB entry for va, and invalidate the shadow page
1479 * corresponding to the address va.
1480 */
1481 static void vmx_do_invlpg(unsigned long va)
1482 {
1483 unsigned long eip;
1484 struct vcpu *v = current;
1486 HVMTRACE_2D(INVLPG, v, /*invlpga=*/ 0, va);
1488 eip = __vmread(GUEST_RIP);
1490 HVM_DBG_LOG(DBG_LEVEL_VMMU, "eip=%lx, va=%lx",
1491 eip, va);
1493 /*
1494 * We do the safest thing first, then try to update the shadow,
1495 * copying from the guest.
1496 */
1497 paging_invlpg(v, va);
1498 }
1500 /* Get segment for OUTS according to guest instruction. */
1501 static enum x86_segment vmx_outs_get_segment(
1502 int long_mode, unsigned long eip, int inst_len)
1504 unsigned char inst[MAX_INST_LEN];
1505 enum x86_segment seg = x86_seg_ds;
1506 int i;
1507 extern int inst_copy_from_guest(unsigned char *, unsigned long, int);
1509 if ( likely(cpu_has_vmx_ins_outs_instr_info) )
1511 unsigned int instr_info = __vmread(VMX_INSTRUCTION_INFO);
1513 /* Get segment register according to bits 17:15. */
1514 switch ( (instr_info >> 15) & 7 )
1516 case 0: seg = x86_seg_es; break;
1517 case 1: seg = x86_seg_cs; break;
1518 case 2: seg = x86_seg_ss; break;
1519 case 3: seg = x86_seg_ds; break;
1520 case 4: seg = x86_seg_fs; break;
1521 case 5: seg = x86_seg_gs; break;
1522 default: BUG();
1525 goto out;
1528 if ( !long_mode )
1529 eip += __vmread(GUEST_CS_BASE);
1531 memset(inst, 0, MAX_INST_LEN);
1532 if ( inst_copy_from_guest(inst, eip, inst_len) != inst_len )
1534 gdprintk(XENLOG_ERR, "Failed to copy instruction from guest\n");
1535 domain_crash(current->domain);
1536 goto out;
1539 for ( i = 0; i < inst_len; i++ )
1541 switch ( inst[i] )
1543 case 0xf3: /* REPZ */
1544 case 0xf2: /* REPNZ */
1545 case 0xf0: /* LOCK */
1546 case 0x66: /* data32 */
1547 case 0x67: /* addr32 */
1548 #ifdef __x86_64__
1549 case 0x40 ... 0x4f: /* REX */
1550 #endif
1551 continue;
1552 case 0x2e: /* CS */
1553 seg = x86_seg_cs;
1554 continue;
1555 case 0x36: /* SS */
1556 seg = x86_seg_ss;
1557 continue;
1558 case 0x26: /* ES */
1559 seg = x86_seg_es;
1560 continue;
1561 case 0x64: /* FS */
1562 seg = x86_seg_fs;
1563 continue;
1564 case 0x65: /* GS */
1565 seg = x86_seg_gs;
1566 continue;
1567 case 0x3e: /* DS */
1568 seg = x86_seg_ds;
1569 continue;
1573 out:
1574 return seg;
1577 static int vmx_str_pio_check_descriptor(int long_mode, unsigned long eip,
1578 int inst_len, enum x86_segment seg,
1579 unsigned long *base, u32 *limit,
1580 u32 *ar_bytes)
1582 enum vmcs_field ar_field, base_field, limit_field;
1584 *base = 0;
1585 *limit = 0;
1586 if ( seg != x86_seg_es )
1587 seg = vmx_outs_get_segment(long_mode, eip, inst_len);
1589 switch ( seg )
1591 case x86_seg_cs:
1592 ar_field = GUEST_CS_AR_BYTES;
1593 base_field = GUEST_CS_BASE;
1594 limit_field = GUEST_CS_LIMIT;
1595 break;
1596 case x86_seg_ds:
1597 ar_field = GUEST_DS_AR_BYTES;
1598 base_field = GUEST_DS_BASE;
1599 limit_field = GUEST_DS_LIMIT;
1600 break;
1601 case x86_seg_es:
1602 ar_field = GUEST_ES_AR_BYTES;
1603 base_field = GUEST_ES_BASE;
1604 limit_field = GUEST_ES_LIMIT;
1605 break;
1606 case x86_seg_fs:
1607 ar_field = GUEST_FS_AR_BYTES;
1608 base_field = GUEST_FS_BASE;
1609 limit_field = GUEST_FS_LIMIT;
1610 break;
1611 case x86_seg_gs:
1612 ar_field = GUEST_GS_AR_BYTES;
1613 base_field = GUEST_GS_BASE;
1614 limit_field = GUEST_GS_LIMIT;
1615 break;
1616 case x86_seg_ss:
1617 ar_field = GUEST_SS_AR_BYTES;
1618 base_field = GUEST_SS_BASE;
1619 limit_field = GUEST_SS_LIMIT;
1620 break;
1621 default:
1622 BUG();
1623 return 0;
1626 if ( !long_mode || seg == x86_seg_fs || seg == x86_seg_gs )
1628 *base = __vmread(base_field);
1629 *limit = __vmread(limit_field);
1631 *ar_bytes = __vmread(ar_field);
1633 return !(*ar_bytes & X86_SEG_AR_SEG_UNUSABLE);
1637 static int vmx_str_pio_check_limit(u32 limit, unsigned int size,
1638 u32 ar_bytes, unsigned long addr,
1639 unsigned long base, int df,
1640 unsigned long *count)
1642 unsigned long ea = addr - base;
1644 /* Offset must be within limits. */
1645 ASSERT(ea == (u32)ea);
1646 if ( (u32)(ea + size - 1) < (u32)ea ||
1647 (ar_bytes & 0xc) != 0x4 ? ea + size - 1 > limit
1648 : ea <= limit )
1649 return 0;
1651 /* Check the limit for repeated instructions, as above we checked
1652 only the first instance. Truncate the count if a limit violation
1653 would occur. Note that the checking is not necessary for page
1654 granular segments as transfers crossing page boundaries will be
1655 broken up anyway. */
1656 if ( !(ar_bytes & X86_SEG_AR_GRANULARITY) && *count > 1 )
1658 if ( (ar_bytes & 0xc) != 0x4 )
1660 /* expand-up */
1661 if ( !df )
1663 if ( ea + *count * size - 1 < ea ||
1664 ea + *count * size - 1 > limit )
1665 *count = (limit + 1UL - ea) / size;
1667 else
1669 if ( *count - 1 > ea / size )
1670 *count = ea / size + 1;
1673 else
1675 /* expand-down */
1676 if ( !df )
1678 if ( *count - 1 > -(s32)ea / size )
1679 *count = -(s32)ea / size + 1UL;
1681 else
1683 if ( ea < (*count - 1) * size ||
1684 ea - (*count - 1) * size <= limit )
1685 *count = (ea - limit - 1) / size + 1;
1688 ASSERT(*count);
1691 return 1;
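/*
 * Worked example for the expand-up truncation case above: with a
 * byte-granular expand-up segment of limit 0xfff, ea = 0xff8, size = 4,
 * df = 0 and *count = 10, the repeated transfer would run past the limit,
 * so *count is clipped to (0xfff + 1 - 0xff8) / 4 = 2 iterations.
 */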
1694 #ifdef __x86_64__
1695 static int vmx_str_pio_lm_check_limit(struct cpu_user_regs *regs,
1696 unsigned int size,
1697 unsigned long addr,
1698 unsigned long *count)
1700 if ( !is_canonical_address(addr) ||
1701 !is_canonical_address(addr + size - 1) )
1702 return 0;
1704 if ( *count > (1UL << 48) / size )
1705 *count = (1UL << 48) / size;
1707 if ( !(regs->eflags & EF_DF) )
1709 if ( addr + *count * size - 1 < addr ||
1710 !is_canonical_address(addr + *count * size - 1) )
1711 *count = (addr & ~((1UL << 48) - 1)) / size;
1713 else
1715 if ( (*count - 1) * size > addr ||
1716 !is_canonical_address(addr + (*count - 1) * size) )
1717 *count = (addr & ~((1UL << 48) - 1)) / size + 1;
1720 ASSERT(*count);
1722 return 1;
1724 #endif
1726 static void vmx_send_str_pio(struct cpu_user_regs *regs,
1727 struct hvm_io_op *pio_opp,
1728 unsigned long inst_len, unsigned int port,
1729 int sign, unsigned int size, int dir,
1730 int df, unsigned long addr,
1731 unsigned long paddr, unsigned long count)
1733 /*
1734 * Handle string pio instructions that cross pages or that
1735 * are unaligned. See the comments in hvm_domain.c/handle_mmio()
1736 */
1737 if ( (addr & PAGE_MASK) != ((addr + size - 1) & PAGE_MASK) ) {
1738 unsigned long value = 0;
1740 pio_opp->flags |= OVERLAP;
1742 if ( dir == IOREQ_WRITE ) /* OUTS */
1744 if ( hvm_paging_enabled(current) )
1746 int rv = hvm_copy_from_guest_virt(&value, addr, size);
1747 if ( rv != 0 )
1749 /* Failed on the page-spanning copy. Inject PF into
1750 * the guest for the address where we failed. */
1751 addr += size - rv;
1752 gdprintk(XENLOG_DEBUG, "Pagefault reading non-io side "
1753 "of a page-spanning PIO: va=%#lx\n", addr);
1754 vmx_inject_exception(TRAP_page_fault, 0, addr);
1755 return;
1758 else
1759 (void) hvm_copy_from_guest_phys(&value, addr, size);
1760 } else /* dir != IOREQ_WRITE */
1761 /* Remember where to write the result, as a *VA*.
1762 * Must be a VA so we can handle the page overlap
1763 * correctly in hvm_pio_assist() */
1764 pio_opp->addr = addr;
1766 if ( count == 1 )
1767 regs->eip += inst_len;
1769 send_pio_req(port, 1, size, value, dir, df, 0);
1770 } else {
1771 unsigned long last_addr = sign > 0 ? addr + count * size - 1
1772 : addr - (count - 1) * size;
1774 if ( (addr & PAGE_MASK) != (last_addr & PAGE_MASK) )
1776 if ( sign > 0 )
1777 count = (PAGE_SIZE - (addr & ~PAGE_MASK)) / size;
1778 else
1779 count = (addr & ~PAGE_MASK) / size + 1;
1780 } else
1781 regs->eip += inst_len;
1783 send_pio_req(port, count, size, paddr, dir, df, 1);
1787 static void vmx_do_str_pio(unsigned long exit_qualification,
1788 unsigned long inst_len,
1789 struct cpu_user_regs *regs,
1790 struct hvm_io_op *pio_opp)
1792 unsigned int port, size;
1793 int dir, df, vm86;
1794 unsigned long addr, count = 1, base;
1795 paddr_t paddr;
1796 unsigned long gfn;
1797 u32 ar_bytes, limit;
1798 int sign;
1799 int long_mode = 0;
1801 vm86 = regs->eflags & X86_EFLAGS_VM ? 1 : 0;
1802 df = regs->eflags & X86_EFLAGS_DF ? 1 : 0;
1804 if ( test_bit(6, &exit_qualification) )
1805 port = (exit_qualification >> 16) & 0xFFFF;
1806 else
1807 port = regs->edx & 0xffff;
1809 size = (exit_qualification & 7) + 1;
1810 dir = test_bit(3, &exit_qualification); /* direction */
1812 if ( dir == IOREQ_READ )
1813 HVMTRACE_2D(IO_READ, current, port, size);
1814 else
1815 HVMTRACE_2D(IO_WRITE, current, port, size);
1817 sign = regs->eflags & X86_EFLAGS_DF ? -1 : 1;
1818 ar_bytes = __vmread(GUEST_CS_AR_BYTES);
1819 if ( vmx_long_mode_enabled(current) &&
1820 (ar_bytes & X86_SEG_AR_CS_LM_ACTIVE) )
1821 long_mode = 1;
1822 addr = __vmread(GUEST_LINEAR_ADDRESS);
1824 if ( test_bit(5, &exit_qualification) ) { /* "rep" prefix */
1825 pio_opp->flags |= REPZ;
1826 count = regs->ecx;
1827 if ( !long_mode &&
1828 (vm86 || !(ar_bytes & X86_SEG_AR_DEF_OP_SIZE)) )
1829 count &= 0xFFFF;
1832 /*
1833 * In protected mode, guest linear address is invalid if the
1834 * selector is null.
1835 */
1836 if ( !vmx_str_pio_check_descriptor(long_mode, regs->eip, inst_len,
1837 dir==IOREQ_WRITE ? x86_seg_ds :
1838 x86_seg_es, &base, &limit,
1839 &ar_bytes) ) {
1840 if ( !long_mode ) {
1841 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1842 return;
1844 addr = dir == IOREQ_WRITE ? base + regs->esi : regs->edi;
1847 if ( !long_mode )
1849 /* Segment must be readable for outs and writeable for ins. */
1850 if ( ((dir == IOREQ_WRITE)
1851 ? ((ar_bytes & 0xa) == 0x8)
1852 : ((ar_bytes & 0xa) != 0x2)) ||
1853 !vmx_str_pio_check_limit(limit, size, ar_bytes,
1854 addr, base, df, &count) )
1856 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1857 return;
1860 #ifdef __x86_64__
1861 else if ( !vmx_str_pio_lm_check_limit(regs, size, addr, &count) )
1863 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1864 return;
1866 #endif
1868 /* Translate the address to a physical address */
1869 gfn = paging_gva_to_gfn(current, addr);
1870 if ( gfn == INVALID_GFN )
1872 /* The guest does not have the RAM address mapped.
1873 * Need to send in a page fault */
1874 int errcode = 0;
1875 /* IO read --> memory write */
1876 if ( dir == IOREQ_READ ) errcode |= PFEC_write_access;
1877 vmx_inject_exception(TRAP_page_fault, errcode, addr);
1878 return;
1880 paddr = (paddr_t)gfn << PAGE_SHIFT | (addr & ~PAGE_MASK);
1882 vmx_send_str_pio(regs, pio_opp, inst_len, port, sign,
1883 size, dir, df, addr, paddr, count);
1886 static void vmx_io_instruction(unsigned long exit_qualification,
1887 unsigned long inst_len)
1889 struct cpu_user_regs *regs;
1890 struct hvm_io_op *pio_opp;
1892 pio_opp = &current->arch.hvm_vcpu.io_op;
1893 pio_opp->instr = INSTR_PIO;
1894 pio_opp->flags = 0;
1896 regs = &pio_opp->io_context;
1898 /* Copy current guest state into io instruction state structure. */
1899 memcpy(regs, guest_cpu_user_regs(), HVM_CONTEXT_STACK_BYTES);
1900 vmx_store_cpu_guest_regs(current, regs, NULL);
1902 HVM_DBG_LOG(DBG_LEVEL_IO, "vm86 %d, eip=%x:%lx, "
1903 "exit_qualification = %lx",
1904 regs->eflags & X86_EFLAGS_VM ? 1 : 0,
1905 regs->cs, (unsigned long)regs->eip, exit_qualification);
1907 if ( test_bit(4, &exit_qualification) ) /* string instruction */
1908 vmx_do_str_pio(exit_qualification, inst_len, regs, pio_opp);
1909 else
1911 unsigned int port, size;
1912 int dir, df;
1914 df = regs->eflags & X86_EFLAGS_DF ? 1 : 0;
1916 if ( test_bit(6, &exit_qualification) )
1917 port = (exit_qualification >> 16) & 0xFFFF;
1918 else
1919 port = regs->edx & 0xffff;
1921 size = (exit_qualification & 7) + 1;
1922 dir = test_bit(3, &exit_qualification); /* direction */
1924 if ( dir == IOREQ_READ )
1925 HVMTRACE_2D(IO_READ, current, port, size);
1926 else
1927 HVMTRACE_2D(IO_WRITE, current, port, size);
1929 if ( port == 0xe9 && dir == IOREQ_WRITE && size == 1 )
1930 hvm_print_line(current, regs->eax); /* guest debug output */
1932 regs->eip += inst_len;
1933 send_pio_req(port, 1, size, regs->eax, dir, df, 0);
1937 static void vmx_world_save(struct vcpu *v, struct vmx_assist_context *c)
1939 /* NB. Skip transition instruction. */
1940 c->eip = __vmread(GUEST_RIP);
1941 c->eip += __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
1943 c->esp = __vmread(GUEST_RSP);
1944 c->eflags = __vmread(GUEST_RFLAGS) & ~X86_EFLAGS_RF;
1946 c->cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;
1947 c->cr3 = v->arch.hvm_vmx.cpu_cr3;
1948 c->cr4 = v->arch.hvm_vmx.cpu_shadow_cr4;
1950 c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
1951 c->idtr_base = __vmread(GUEST_IDTR_BASE);
1953 c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
1954 c->gdtr_base = __vmread(GUEST_GDTR_BASE);
1956 c->cs_sel = __vmread(GUEST_CS_SELECTOR);
1957 c->cs_limit = __vmread(GUEST_CS_LIMIT);
1958 c->cs_base = __vmread(GUEST_CS_BASE);
1959 c->cs_arbytes.bytes = __vmread(GUEST_CS_AR_BYTES);
1961 c->ds_sel = __vmread(GUEST_DS_SELECTOR);
1962 c->ds_limit = __vmread(GUEST_DS_LIMIT);
1963 c->ds_base = __vmread(GUEST_DS_BASE);
1964 c->ds_arbytes.bytes = __vmread(GUEST_DS_AR_BYTES);
1966 c->es_sel = __vmread(GUEST_ES_SELECTOR);
1967 c->es_limit = __vmread(GUEST_ES_LIMIT);
1968 c->es_base = __vmread(GUEST_ES_BASE);
1969 c->es_arbytes.bytes = __vmread(GUEST_ES_AR_BYTES);
1971 c->ss_sel = __vmread(GUEST_SS_SELECTOR);
1972 c->ss_limit = __vmread(GUEST_SS_LIMIT);
1973 c->ss_base = __vmread(GUEST_SS_BASE);
1974 c->ss_arbytes.bytes = __vmread(GUEST_SS_AR_BYTES);
1976 c->fs_sel = __vmread(GUEST_FS_SELECTOR);
1977 c->fs_limit = __vmread(GUEST_FS_LIMIT);
1978 c->fs_base = __vmread(GUEST_FS_BASE);
1979 c->fs_arbytes.bytes = __vmread(GUEST_FS_AR_BYTES);
1981 c->gs_sel = __vmread(GUEST_GS_SELECTOR);
1982 c->gs_limit = __vmread(GUEST_GS_LIMIT);
1983 c->gs_base = __vmread(GUEST_GS_BASE);
1984 c->gs_arbytes.bytes = __vmread(GUEST_GS_AR_BYTES);
1986 c->tr_sel = __vmread(GUEST_TR_SELECTOR);
1987 c->tr_limit = __vmread(GUEST_TR_LIMIT);
1988 c->tr_base = __vmread(GUEST_TR_BASE);
1989 c->tr_arbytes.bytes = __vmread(GUEST_TR_AR_BYTES);
1991 c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR);
1992 c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT);
1993 c->ldtr_base = __vmread(GUEST_LDTR_BASE);
1994 c->ldtr_arbytes.bytes = __vmread(GUEST_LDTR_AR_BYTES);
1997 static int vmx_world_restore(struct vcpu *v, struct vmx_assist_context *c)
1999 unsigned long mfn, old_base_mfn;
2001 __vmwrite(GUEST_RIP, c->eip);
2002 __vmwrite(GUEST_RSP, c->esp);
2003 __vmwrite(GUEST_RFLAGS, c->eflags);
2005 v->arch.hvm_vmx.cpu_shadow_cr0 = c->cr0;
2006 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
2008 if ( !vmx_paging_enabled(v) )
2009 goto skip_cr3;
2011 if ( c->cr3 == v->arch.hvm_vmx.cpu_cr3 )
2013 /*
2014 * This is simple TLB flush, implying the guest has
2015 * removed some translation or changed page attributes.
2016 * We simply invalidate the shadow.
2017 */
2018 mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
2019 if ( mfn != pagetable_get_pfn(v->arch.guest_table) )
2020 goto bad_cr3;
2022 else
2024 /*
2025 * If different, make a shadow. Check if the PDBR is valid
2026 * first.
2027 */
2028 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %x", c->cr3);
2029 mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
2030 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
2031 goto bad_cr3;
2032 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
2033 v->arch.guest_table = pagetable_from_pfn(mfn);
2034 if ( old_base_mfn )
2035 put_page(mfn_to_page(old_base_mfn));
2036 v->arch.hvm_vmx.cpu_cr3 = c->cr3;
2039 skip_cr3:
2040 if ( !vmx_paging_enabled(v) )
2041 HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
2042 else
2043 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3);
2045 __vmwrite(GUEST_CR4, (c->cr4 | HVM_CR4_HOST_MASK));
2046 v->arch.hvm_vmx.cpu_shadow_cr4 = c->cr4;
2047 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);
2049 __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
2050 __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
2052 __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
2053 __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
2055 __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
2056 __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
2057 __vmwrite(GUEST_CS_BASE, c->cs_base);
2058 __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes.bytes);
2060 __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
2061 __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
2062 __vmwrite(GUEST_DS_BASE, c->ds_base);
2063 __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes.bytes);
2065 __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
2066 __vmwrite(GUEST_ES_LIMIT, c->es_limit);
2067 __vmwrite(GUEST_ES_BASE, c->es_base);
2068 __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes.bytes);
2070 __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
2071 __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
2072 __vmwrite(GUEST_SS_BASE, c->ss_base);
2073 __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes.bytes);
2075 __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
2076 __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
2077 __vmwrite(GUEST_FS_BASE, c->fs_base);
2078 __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes.bytes);
2080 __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
2081 __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
2082 __vmwrite(GUEST_GS_BASE, c->gs_base);
2083 __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes.bytes);
2085 __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
2086 __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
2087 __vmwrite(GUEST_TR_BASE, c->tr_base);
2088 __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes.bytes);
2090 __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
2091 __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
2092 __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
2093 __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes.bytes);
2095 paging_update_paging_modes(v);
2096 return 0;
2098 bad_cr3:
2099 gdprintk(XENLOG_ERR, "Invalid CR3 value=%x", c->cr3);
2100 return -EINVAL;
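/*
 * vmx_world_restore() above is the inverse of vmx_world_save().  The only
 * non-trivial step is CR3: when paging is enabled and the restored value
 * differs from the cached cpu_cr3, the new top-level frame is validated and
 * referenced with get_page() before the old one is released, and the paging
 * mode is recomputed via paging_update_paging_modes().
 */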
2103 enum { VMX_ASSIST_INVOKE = 0, VMX_ASSIST_RESTORE };
2105 static int vmx_assist(struct vcpu *v, int mode)
2107 struct vmx_assist_context c;
2108 struct hvm_hw_vpic *vpic = v->domain->arch.hvm_domain.vpic;
2109 u32 magic, cp;
2111 if ( hvm_copy_from_guest_phys(&magic, VMXASSIST_MAGIC_OFFSET,
2112 sizeof(magic)) )
2114 gdprintk(XENLOG_ERR, "No vmxassist: can't execute real mode code\n");
2115 domain_crash(v->domain);
2116 return 0;
2119 if ( magic != VMXASSIST_MAGIC )
2121 gdprintk(XENLOG_ERR, "vmxassist magic number does not match\n");
2122 domain_crash(v->domain);
2123 return 0;
2126 switch ( mode ) {
2127 /*
2128 * Transfer control to vmxassist.
2129 * Store the current context at VMXASSIST_OLD_CONTEXT and load
2130 * the context at VMXASSIST_NEW_CONTEXT. That context was created
2131 * by vmxassist, and resuming it transfers control to vmxassist.
2132 */
2133 case VMX_ASSIST_INVOKE:
2134 /* save the old context */
2135 if ( hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)) )
2136 goto error;
2137 if ( cp != 0 ) {
2138 vmx_world_save(v, &c);
2139 if ( hvm_copy_to_guest_phys(cp, &c, sizeof(c)) )
2140 goto error;
2143 /* restore the new context, this should activate vmxassist */
2144 if ( hvm_copy_from_guest_phys(&cp, VMXASSIST_NEW_CONTEXT, sizeof(cp)) )
2145 goto error;
2146 if ( cp != 0 ) {
2147 if ( hvm_copy_from_guest_phys(&c, cp, sizeof(c)) )
2148 goto error;
2149 if ( vmx_world_restore(v, &c) != 0 )
2150 goto error;
2151 v->arch.hvm_vmx.pm_irqbase[0] = vpic[0].irq_base;
2152 v->arch.hvm_vmx.pm_irqbase[1] = vpic[1].irq_base;
2153 vpic[0].irq_base = NR_EXCEPTION_HANDLER;
2154 vpic[1].irq_base = NR_EXCEPTION_HANDLER + 8;
2155 v->arch.hvm_vmx.vmxassist_enabled = 1;
2156 return 1;
2158 break;
2160 /*
2161 * Restore the VMXASSIST_OLD_CONTEXT that was saved by
2162 * VMX_ASSIST_INVOKE above.
2163 */
2164 case VMX_ASSIST_RESTORE:
2165 /* fetch the context saved by VMX_ASSIST_INVOKE above */
2166 if ( hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)) )
2167 goto error;
2168 if ( cp != 0 ) {
2169 if ( hvm_copy_from_guest_phys(&c, cp, sizeof(c)) )
2170 goto error;
2171 if ( vmx_world_restore(v, &c) != 0 )
2172 goto error;
2173 if ( v->arch.hvm_vmx.irqbase_mode ) {
2174 vpic[0].irq_base = c.rm_irqbase[0] & 0xf8;
2175 vpic[1].irq_base = c.rm_irqbase[1] & 0xf8;
2176 } else {
2177 vpic[0].irq_base = v->arch.hvm_vmx.pm_irqbase[0];
2178 vpic[1].irq_base = v->arch.hvm_vmx.pm_irqbase[1];
2180 v->arch.hvm_vmx.vmxassist_enabled = 0;
2181 return 1;
2183 break;
2186 error:
2187 gdprintk(XENLOG_ERR, "Failed to transfer to vmxassist\n");
2188 domain_crash(v->domain);
2189 return 0;
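/*
 * Summary of the vmxassist hand-off implemented above: a magic word at
 * VMXASSIST_MAGIC_OFFSET confirms the in-guest emulator is present, and
 * VMXASSIST_OLD_CONTEXT / VMXASSIST_NEW_CONTEXT hold guest-physical pointers
 * to context blocks.  INVOKE saves the current world into the old-context
 * slot, loads the vmxassist context, and redirects the virtual PIC bases so
 * vmxassist owns the exception vectors; RESTORE reloads the saved context
 * and puts the PIC bases back (either the real-mode bases supplied by
 * vmxassist or the remembered protected-mode ones).
 */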
2192 static int vmx_set_cr0(unsigned long value)
2194 struct vcpu *v = current;
2195 unsigned long mfn;
2196 unsigned long eip;
2197 int paging_enabled;
2198 unsigned long old_cr0;
2199 unsigned long old_base_mfn;
2201 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx", value);
2203 if ( (u32)value != value )
2205 HVM_DBG_LOG(DBG_LEVEL_1,
2206 "Guest attempts to set upper 32 bits in CR0: %lx",
2207 value);
2208 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2209 return 0;
2212 value &= ~HVM_CR0_GUEST_RESERVED_BITS;
2214 /* ET is reserved and should always be 1. */
2215 value |= X86_CR0_ET;
2217 if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PG )
2219 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2220 return 0;
2223 /* TS cleared? Then initialise FPU now. */
2224 if ( !(value & X86_CR0_TS) )
2226 setup_fpu(v);
2227 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
2230 old_cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;
2231 paging_enabled = old_cr0 & X86_CR0_PG;
2233 v->arch.hvm_vmx.cpu_cr0 = (value | X86_CR0_PE | X86_CR0_PG
2234 | X86_CR0_NE | X86_CR0_WP);
2235 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
2237 v->arch.hvm_vmx.cpu_shadow_cr0 = value;
2238 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
2240 /* Trying to enable paging. */
2241 if ( (value & X86_CR0_PE) && (value & X86_CR0_PG) && !paging_enabled )
2243 if ( vmx_lme_is_set(v) && !vmx_long_mode_enabled(v) )
2245 if ( !(v->arch.hvm_vmx.cpu_shadow_cr4 & X86_CR4_PAE) )
2247 HVM_DBG_LOG(DBG_LEVEL_1, "Guest enabled paging "
2248 "with EFER.LME set but not CR4.PAE");
2249 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2250 return 0;
2253 HVM_DBG_LOG(DBG_LEVEL_1, "Enabling long mode");
2254 vmx_enable_long_mode(v);
2257 /*
2258 * The guest CR3 must point to a valid guest-physical frame.
2259 */
2260 mfn = get_mfn_from_gpfn(v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT);
2261 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
2263 gdprintk(XENLOG_ERR, "Invalid CR3 value = %lx (mfn=%lx)\n",
2264 v->arch.hvm_vmx.cpu_cr3, mfn);
2265 domain_crash(v->domain);
2266 return 0;
2269 /*
2270 * Now arch.guest_table points to the machine-physical frame.
2271 */
2272 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
2273 v->arch.guest_table = pagetable_from_pfn(mfn);
2274 if ( old_base_mfn )
2275 put_page(mfn_to_page(old_base_mfn));
2277 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
2278 v->arch.hvm_vmx.cpu_cr3, mfn);
2280 paging_update_paging_modes(v);
2283 /* Trying to disable paging. */
2284 if ( ((value & (X86_CR0_PE | X86_CR0_PG)) != (X86_CR0_PE | X86_CR0_PG)) &&
2285 paging_enabled )
2287 /* When CR0.PG is cleared, LMA is cleared immediately. */
2288 if ( vmx_long_mode_enabled(v) )
2289 vmx_disable_long_mode(v);
2291 if ( v->arch.hvm_vmx.cpu_cr3 )
2293 put_page(mfn_to_page(get_mfn_from_gpfn(
2294 v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)));
2295 v->arch.guest_table = pagetable_null();
2299 /*
2300 * VMX does not implement real-mode virtualization. We emulate
2301 * real mode by performing a world switch to VMXAssist whenever
2302 * the guest clears the CR0.PE bit.
2303 */
2304 if ( (value & X86_CR0_PE) == 0 )
2306 if ( value & X86_CR0_PG )
2308 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2309 return 0;
2312 if ( vmx_assist(v, VMX_ASSIST_INVOKE) )
2314 eip = __vmread(GUEST_RIP);
2315 HVM_DBG_LOG(DBG_LEVEL_1,
2316 "Transfering control to vmxassist %%eip 0x%lx", eip);
2317 return 0; /* do not update eip! */
2320 else if ( v->arch.hvm_vmx.vmxassist_enabled )
2322 eip = __vmread(GUEST_RIP);
2323 HVM_DBG_LOG(DBG_LEVEL_1,
2324 "Enabling CR0.PE at %%eip 0x%lx", eip);
2325 if ( vmx_assist(v, VMX_ASSIST_RESTORE) )
2327 HVM_DBG_LOG(DBG_LEVEL_1,
2328 "Restoring to %%eip 0x%lx", eip);
2329 return 0; /* do not update eip! */
2332 else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
2333 paging_update_paging_modes(v);
2335 return 1;
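/*
 * vmx_set_cr0() above maintains two views of CR0: GUEST_CR0 always has PE,
 * PG, NE and WP forced on, while CR0_READ_SHADOW holds the value the guest
 * actually wrote.  Enabling paging validates CR4.PAE against EFER.LME, pins
 * the new top-level page table and recomputes the paging mode; disabling
 * paging drops that reference (and long mode, if active); and clearing PE
 * triggers the world switch into vmxassist for real-mode emulation, in which
 * case RIP is deliberately left unchanged.
 */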
2338 #define CASE_SET_REG(REG, reg) \
2339 case REG_ ## REG: regs->reg = value; break
2340 #define CASE_GET_REG(REG, reg) \
2341 case REG_ ## REG: value = regs->reg; break
2343 #define CASE_EXTEND_SET_REG \
2344 CASE_EXTEND_REG(S)
2345 #define CASE_EXTEND_GET_REG \
2346 CASE_EXTEND_REG(G)
2348 #ifdef __i386__
2349 #define CASE_EXTEND_REG(T)
2350 #else
2351 #define CASE_EXTEND_REG(T) \
2352 CASE_ ## T ## ET_REG(R8, r8); \
2353 CASE_ ## T ## ET_REG(R9, r9); \
2354 CASE_ ## T ## ET_REG(R10, r10); \
2355 CASE_ ## T ## ET_REG(R11, r11); \
2356 CASE_ ## T ## ET_REG(R12, r12); \
2357 CASE_ ## T ## ET_REG(R13, r13); \
2358 CASE_ ## T ## ET_REG(R14, r14); \
2359 CASE_ ## T ## ET_REG(R15, r15)
2360 #endif
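/*
 * The CASE_*_REG macros above generate the register-file switch cases used
 * by mov_to_cr()/mov_from_cr() below.  For example, CASE_GET_REG(EAX, eax)
 * expands to "case REG_EAX: value = regs->eax; break;", and
 * CASE_EXTEND_GET_REG / CASE_EXTEND_SET_REG add r8-r15 on 64-bit builds only.
 */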
2362 /*
2363 * Write to control registers
2364 */
2365 static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
2367 unsigned long value, old_cr, old_base_mfn, mfn;
2368 struct vcpu *v = current;
2369 struct vlapic *vlapic = vcpu_vlapic(v);
2371 switch ( gp )
2373 CASE_GET_REG(EAX, eax);
2374 CASE_GET_REG(ECX, ecx);
2375 CASE_GET_REG(EDX, edx);
2376 CASE_GET_REG(EBX, ebx);
2377 CASE_GET_REG(EBP, ebp);
2378 CASE_GET_REG(ESI, esi);
2379 CASE_GET_REG(EDI, edi);
2380 CASE_EXTEND_GET_REG;
2381 case REG_ESP:
2382 value = __vmread(GUEST_RSP);
2383 break;
2384 default:
2385 gdprintk(XENLOG_ERR, "invalid gp: %d\n", gp);
2386 goto exit_and_crash;
2389 HVMTRACE_2D(CR_WRITE, v, cr, value);
2391 HVM_DBG_LOG(DBG_LEVEL_1, "CR%d, value = %lx", cr, value);
2393 switch ( cr )
2395 case 0:
2396 return vmx_set_cr0(value);
2398 case 3:
2399 /*
2400 * If paging is not enabled yet, simply copy the value to CR3.
2401 */
2402 if ( !vmx_paging_enabled(v) )
2404 v->arch.hvm_vmx.cpu_cr3 = value;
2405 break;
2408 /*
2409 * We make a new one if the shadow does not exist.
2410 */
2411 if ( value == v->arch.hvm_vmx.cpu_cr3 ) {
2412 /*
2413 * This is a simple TLB flush, implying the guest has
2414 * removed some translation or changed page attributes.
2415 * We simply invalidate the shadow.
2416 */
2417 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
2418 if ( mfn != pagetable_get_pfn(v->arch.guest_table) )
2419 goto bad_cr3;
2420 paging_update_cr3(v);
2421 } else {
2422 /*
2423 * If different, make a shadow. Check if the PDBR is valid
2424 * first.
2425 */
2426 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
2427 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
2428 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
2429 goto bad_cr3;
2430 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
2431 v->arch.guest_table = pagetable_from_pfn(mfn);
2432 if ( old_base_mfn )
2433 put_page(mfn_to_page(old_base_mfn));
2434 v->arch.hvm_vmx.cpu_cr3 = value;
2435 update_cr3(v);
2436 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
2438 break;
2440 case 4: /* CR4 */
2441 old_cr = v->arch.hvm_vmx.cpu_shadow_cr4;
2443 if ( value & HVM_CR4_GUEST_RESERVED_BITS )
2445 HVM_DBG_LOG(DBG_LEVEL_1,
2446 "Guest attempts to set reserved bit in CR4: %lx",
2447 value);
2448 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2449 return 0;
2452 if ( (value & X86_CR4_PAE) && !(old_cr & X86_CR4_PAE) )
2454 if ( vmx_pgbit_test(v) )
2456 #if CONFIG_PAGING_LEVELS >= 3
2457 /* The guest is a 32-bit PAE guest. */
2458 unsigned long mfn, old_base_mfn;
2459 mfn = get_mfn_from_gpfn(v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT);
2460 if ( !mfn_valid(mfn) ||
2461 !get_page(mfn_to_page(mfn), v->domain) )
2462 goto bad_cr3;
2464 /*
2465 * Now arch.guest_table points to the machine-physical frame.
2466 */
2467 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
2468 v->arch.guest_table = pagetable_from_pfn(mfn);
2469 if ( old_base_mfn )
2470 put_page(mfn_to_page(old_base_mfn));
2472 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2473 "Update CR3 value = %lx, mfn = %lx",
2474 v->arch.hvm_vmx.cpu_cr3, mfn);
2475 #endif
2478 else if ( !(value & X86_CR4_PAE) )
2480 if ( unlikely(vmx_long_mode_enabled(v)) )
2482 HVM_DBG_LOG(DBG_LEVEL_1, "Guest cleared CR4.PAE while "
2483 "EFER.LMA is set");
2484 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2485 return 0;
2489 __vmwrite(GUEST_CR4, value | HVM_CR4_HOST_MASK);
2490 v->arch.hvm_vmx.cpu_shadow_cr4 = value;
2491 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);
2493 /*
2494 * Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates
2495 * all TLB entries except global entries.
2496 */
2497 if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
2498 paging_update_paging_modes(v);
2500 break;
2502 case 8:
2503 vlapic_set_reg(vlapic, APIC_TASKPRI, ((value & 0x0F) << 4));
2504 break;
2506 default:
2507 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
2508 domain_crash(v->domain);
2509 return 0;
2512 return 1;
2514 bad_cr3:
2515 gdprintk(XENLOG_ERR, "Invalid CR3\n");
2516 exit_and_crash:
2517 domain_crash(v->domain);
2518 return 0;
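/*
 * mov_to_cr() above reads the source GP register from the saved guest frame
 * (RSP must come from the VMCS instead), then dispatches on the CR number:
 * CR0 and CR4 go through the shadowed paths, CR3 either flushes the current
 * shadow or pins a new top-level page table, and CR8 stores its low 4 bits
 * into bits 7:4 of the virtual-APIC task-priority register.
 */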
2521 /*
2522 * Read from control registers. CR0 and CR4 are read from the shadow.
2523 */
2524 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
2526 unsigned long value = 0;
2527 struct vcpu *v = current;
2528 struct vlapic *vlapic = vcpu_vlapic(v);
2530 switch ( cr )
2532 case 3:
2533 value = (unsigned long)v->arch.hvm_vmx.cpu_cr3;
2534 break;
2535 case 8:
2536 value = (unsigned long)vlapic_get_reg(vlapic, APIC_TASKPRI);
2537 value = (value & 0xF0) >> 4;
2538 break;
2539 default:
2540 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
2541 domain_crash(v->domain);
2542 break;
2545 switch ( gp ) {
2546 CASE_SET_REG(EAX, eax);
2547 CASE_SET_REG(ECX, ecx);
2548 CASE_SET_REG(EDX, edx);
2549 CASE_SET_REG(EBX, ebx);
2550 CASE_SET_REG(EBP, ebp);
2551 CASE_SET_REG(ESI, esi);
2552 CASE_SET_REG(EDI, edi);
2553 CASE_EXTEND_SET_REG;
2554 case REG_ESP:
2555 __vmwrite(GUEST_RSP, value);
2556 regs->esp = value;
2557 break;
2558 default:
2559 printk("invalid gp: %d\n", gp);
2560 domain_crash(v->domain);
2561 break;
2564 HVMTRACE_2D(CR_READ, v, cr, value);
2566 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR%d, value = %lx", cr, value);
2569 static int vmx_cr_access(unsigned long exit_qualification,
2570 struct cpu_user_regs *regs)
2572 unsigned int gp, cr;
2573 unsigned long value;
2574 struct vcpu *v = current;
2576 switch ( exit_qualification & CONTROL_REG_ACCESS_TYPE ) {
2577 case TYPE_MOV_TO_CR:
2578 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
2579 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
2580 return mov_to_cr(gp, cr, regs);
2581 case TYPE_MOV_FROM_CR:
2582 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
2583 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
2584 mov_from_cr(cr, gp, regs);
2585 break;
2586 case TYPE_CLTS:
2587 /* We initialise the FPU now, to avoid needing another vmexit. */
2588 setup_fpu(v);
2589 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
2591 v->arch.hvm_vmx.cpu_cr0 &= ~X86_CR0_TS; /* clear TS */
2592 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
2594 v->arch.hvm_vmx.cpu_shadow_cr0 &= ~X86_CR0_TS; /* clear TS */
2595 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
2596 break;
2597 case TYPE_LMSW:
2598 value = v->arch.hvm_vmx.cpu_shadow_cr0;
2599 value = (value & ~0xF) |
2600 (((exit_qualification & LMSW_SOURCE_DATA) >> 16) & 0xF);
2601 return vmx_set_cr0(value);
2602 default:
2603 BUG();
2606 return 1;
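/*
 * The CR-access exit qualification decoded above identifies the control
 * register, the access type (MOV to/from CR, CLTS, LMSW) and, for MOV, the
 * GP register involved; for LMSW the source operand is recovered from the
 * high bits of the qualification (LMSW_SOURCE_DATA >> 16).  CLTS is handled
 * inline and sets up the FPU immediately to avoid a later #NM vmexit, while
 * LMSW merges its low 4 bits into the shadow CR0 and goes through
 * vmx_set_cr0().
 */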
2609 static int vmx_do_msr_read(struct cpu_user_regs *regs)
2611 u64 msr_content = 0;
2612 u32 ecx = regs->ecx, eax, edx;
2613 struct vcpu *v = current;
2615 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x", ecx);
2617 switch ( ecx )
2619 case MSR_IA32_TIME_STAMP_COUNTER:
2620 msr_content = hvm_get_guest_time(v);
2621 break;
2622 case MSR_IA32_SYSENTER_CS:
2623 msr_content = (u32)__vmread(GUEST_SYSENTER_CS);
2624 break;
2625 case MSR_IA32_SYSENTER_ESP:
2626 msr_content = __vmread(GUEST_SYSENTER_ESP);
2627 break;
2628 case MSR_IA32_SYSENTER_EIP:
2629 msr_content = __vmread(GUEST_SYSENTER_EIP);
2630 break;
2631 case MSR_IA32_APICBASE:
2632 msr_content = vcpu_vlapic(v)->hw.apic_base_msr;
2633 break;
2634 case MSR_IA32_VMX_BASIC...MSR_IA32_VMX_PROCBASED_CTLS2:
2635 goto gp_fault;
2636 case MSR_IA32_MCG_STATUS:
2637 case MSR_IA32_MC0_STATUS:
2638 case MSR_K8_MC1_STATUS:
2639 case MSR_K8_MC2_STATUS:
2640 case MSR_K8_MC3_STATUS:
2641 case MSR_K8_MC4_STATUS:
2642 /* No point in letting the guest see real MCEs */
2643 msr_content = 0;
2644 break;
2645 default:
2646 switch ( long_mode_do_msr_read(regs) )
2648 case HNDL_unhandled:
2649 break;
2650 case HNDL_exception_raised:
2651 return 0;
2652 case HNDL_done:
2653 goto done;
2656 if ( rdmsr_hypervisor_regs(ecx, &eax, &edx) ||
2657 rdmsr_safe(ecx, eax, edx) == 0 )
2659 regs->eax = eax;
2660 regs->edx = edx;
2661 goto done;
2664 goto gp_fault;
2667 regs->eax = msr_content & 0xFFFFFFFF;
2668 regs->edx = msr_content >> 32;
2670 done:
2671 HVMTRACE_2D(MSR_READ, v, ecx, msr_content);
2672 HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx",
2673 ecx, (unsigned long)regs->eax,
2674 (unsigned long)regs->edx);
2675 return 1;
2677 gp_fault:
2678 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2679 return 0;
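/*
 * A minimal sketch of the edx:eax packing used by the RDMSR/WRMSR handlers:
 * a 64-bit MSR value is returned in eax (low half) and edx (high half), and
 * reassembled the same way on a write.  Function names are illustrative only.
 */
#include <stdint.h>

static void msr_split(uint64_t msr, uint32_t *eax, uint32_t *edx)
{
    *eax = (uint32_t)(msr & 0xffffffffu);   /* low 32 bits  -> eax */
    *edx = (uint32_t)(msr >> 32);           /* high 32 bits -> edx */
}

static uint64_t msr_join(uint32_t eax, uint32_t edx)
{
    return (uint64_t)eax | ((uint64_t)edx << 32);
}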
2682 static int vmx_alloc_vlapic_mapping(struct domain *d)
2684 void *apic_va;
2686 if ( !cpu_has_vmx_virtualize_apic_accesses )
2687 return 0;
2689 apic_va = alloc_xenheap_page();
2690 if ( apic_va == NULL )
2691 return -ENOMEM;
2692 share_xen_page_with_guest(virt_to_page(apic_va), d, XENSHARE_writable);
2693 guest_physmap_add_page(
2694 d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), virt_to_mfn(apic_va));
2695 d->arch.hvm_domain.vmx_apic_access_mfn = virt_to_mfn(apic_va);
2697 return 0;
2700 static void vmx_free_vlapic_mapping(struct domain *d)
2702 unsigned long mfn = d->arch.hvm_domain.vmx_apic_access_mfn;
2703 if ( mfn != 0 )
2704 free_xenheap_page(mfn_to_virt(mfn));
2707 static void vmx_install_vlapic_mapping(struct vcpu *v)
2709 unsigned long virt_page_ma, apic_page_ma;
2711 if ( !cpu_has_vmx_virtualize_apic_accesses )
2712 return;
2714 virt_page_ma = page_to_maddr(vcpu_vlapic(v)->regs_page);
2715 apic_page_ma = v->domain->arch.hvm_domain.vmx_apic_access_mfn;
2716 apic_page_ma <<= PAGE_SHIFT;
2718 vmx_vmcs_enter(v);
2719 __vmwrite(VIRTUAL_APIC_PAGE_ADDR, virt_page_ma);
2720 __vmwrite(APIC_ACCESS_ADDR, apic_page_ma);
2721 vmx_vmcs_exit(v);
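/*
 * With "virtualize APIC accesses" enabled, two VMCS pointers matter:
 * VIRTUAL_APIC_PAGE_ADDR points at this vCPU's vlapic register page, and
 * APIC_ACCESS_ADDR points at the per-domain placeholder page that
 * vmx_alloc_vlapic_mapping() installed at APIC_DEFAULT_PHYS_BASE in the
 * guest physmap, so guest accesses to that frame are reported as
 * APIC-access vmexits and handled via handle_mmio() later in this file.
 */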
2724 void vmx_vlapic_msr_changed(struct vcpu *v)
2726 struct vlapic *vlapic = vcpu_vlapic(v);
2727 uint32_t ctl;
2729 if ( !cpu_has_vmx_virtualize_apic_accesses )
2730 return;
2732 vmx_vmcs_enter(v);
2733 ctl = __vmread(SECONDARY_VM_EXEC_CONTROL);
2734 ctl &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2735 if ( !vlapic_hw_disabled(vlapic) &&
2736 (vlapic_base_address(vlapic) == APIC_DEFAULT_PHYS_BASE) )
2737 ctl |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2738 __vmwrite(SECONDARY_VM_EXEC_CONTROL, ctl);
2739 vmx_vmcs_exit(v);
2742 static int vmx_do_msr_write(struct cpu_user_regs *regs)
2744 u32 ecx = regs->ecx;
2745 u64 msr_content;
2746 struct vcpu *v = current;
2748 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x, eax=%x, edx=%x",
2749 ecx, (u32)regs->eax, (u32)regs->edx);
2751 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
2752 HVMTRACE_2D(MSR_WRITE, v, ecx, msr_content);
2754 switch ( ecx )
2756 case MSR_IA32_TIME_STAMP_COUNTER:
2757 hvm_set_guest_time(v, msr_content);
2758 pt_reset(v);
2759 break;
2760 case MSR_IA32_SYSENTER_CS:
2761 __vmwrite(GUEST_SYSENTER_CS, msr_content);
2762 break;
2763 case MSR_IA32_SYSENTER_ESP:
2764 __vmwrite(GUEST_SYSENTER_ESP, msr_content);
2765 break;
2766 case MSR_IA32_SYSENTER_EIP:
2767 __vmwrite(GUEST_SYSENTER_EIP, msr_content);
2768 break;
2769 case MSR_IA32_APICBASE:
2770 vlapic_msr_set(vcpu_vlapic(v), msr_content);
2771 break;
2772 case MSR_IA32_VMX_BASIC...MSR_IA32_VMX_PROCBASED_CTLS2:
2773 goto gp_fault;
2774 default:
2775 switch ( long_mode_do_msr_write(regs) )
2777 case HNDL_unhandled:
2778 wrmsr_hypervisor_regs(ecx, regs->eax, regs->edx);
2779 break;
2780 case HNDL_exception_raised:
2781 return 0;
2782 case HNDL_done:
2783 break;
2785 break;
2788 return 1;
2790 gp_fault:
2791 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2792 return 0;
2795 static void vmx_do_hlt(void)
2797 unsigned long rflags;
2798 HVMTRACE_0D(HLT, current);
2799 rflags = __vmread(GUEST_RFLAGS);
2800 hvm_hlt(rflags);
2803 static void vmx_do_extint(struct cpu_user_regs *regs)
2805 unsigned int vector;
2807 asmlinkage void do_IRQ(struct cpu_user_regs *);
2808 fastcall void smp_apic_timer_interrupt(struct cpu_user_regs *);
2809 fastcall void smp_event_check_interrupt(void);
2810 fastcall void smp_invalidate_interrupt(void);
2811 fastcall void smp_call_function_interrupt(void);
2812 fastcall void smp_spurious_interrupt(struct cpu_user_regs *regs);
2813 fastcall void smp_error_interrupt(struct cpu_user_regs *regs);
2814 #ifdef CONFIG_X86_MCE_P4THERMAL
2815 fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs);
2816 #endif
2818 vector = __vmread(VM_EXIT_INTR_INFO);
2819 BUG_ON(!(vector & INTR_INFO_VALID_MASK));
2821 vector &= INTR_INFO_VECTOR_MASK;
2822 HVMTRACE_1D(INTR, current, vector);
2824 switch ( vector )
2826 case LOCAL_TIMER_VECTOR:
2827 smp_apic_timer_interrupt(regs);
2828 break;
2829 case EVENT_CHECK_VECTOR:
2830 smp_event_check_interrupt();
2831 break;
2832 case INVALIDATE_TLB_VECTOR:
2833 smp_invalidate_interrupt();
2834 break;
2835 case CALL_FUNCTION_VECTOR:
2836 smp_call_function_interrupt();
2837 break;
2838 case SPURIOUS_APIC_VECTOR:
2839 smp_spurious_interrupt(regs);
2840 break;
2841 case ERROR_APIC_VECTOR:
2842 smp_error_interrupt(regs);
2843 break;
2844 #ifdef CONFIG_X86_MCE_P4THERMAL
2845 case THERMAL_APIC_VECTOR:
2846 smp_thermal_interrupt(regs);
2847 break;
2848 #endif
2849 default:
2850 regs->entry_vector = vector;
2851 do_IRQ(regs);
2852 break;
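/*
 * An EXTERNAL_INTERRUPT vmexit means a host interrupt arrived while the
 * guest was running.  The vector is read from the exit-interruption
 * information and dispatched to the normal Xen handler (local timer, IPIs,
 * APIC error/spurious, or do_IRQ() for everything else); nothing is
 * injected into the guest here.
 */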
2856 static void vmx_failed_vmentry(unsigned int exit_reason,
2857 struct cpu_user_regs *regs)
2859 unsigned int failed_vmentry_reason = (uint16_t)exit_reason;
2860 unsigned long exit_qualification;
2862 exit_qualification = __vmread(EXIT_QUALIFICATION);
2863 printk("Failed vm entry (exit reason 0x%x) ", exit_reason);
2864 switch ( failed_vmentry_reason )
2866 case EXIT_REASON_INVALID_GUEST_STATE:
2867 printk("caused by invalid guest state (%ld).\n", exit_qualification);
2868 break;
2869 case EXIT_REASON_MSR_LOADING:
2870 printk("caused by MSR entry %ld loading.\n", exit_qualification);
2871 break;
2872 case EXIT_REASON_MACHINE_CHECK:
2873 printk("caused by machine check.\n");
2874 HVMTRACE_0D(MCE, current);
2875 vmx_store_cpu_guest_regs(current, regs, NULL);
2876 do_machine_check(regs);
2877 break;
2878 default:
2879 printk("reason not known yet!");
2880 break;
2883 printk("************* VMCS Area **************\n");
2884 vmcs_dump_vcpu();
2885 printk("**************************************\n");
2887 domain_crash(current->domain);
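/*
 * vmx_failed_vmentry() is reached from vmx_vmexit_handler() below when
 * VMX_EXIT_REASONS_FAILED_VMENTRY is set in the raw exit reason, meaning the
 * guest state or MSR-load area was rejected at VM entry.  It dumps the VMCS
 * and crashes the domain, since resuming such a guest would fail the same
 * way again.
 */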
2890 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
2892 unsigned int exit_reason;
2893 unsigned long exit_qualification, inst_len = 0;
2894 struct vcpu *v = current;
2896 exit_reason = __vmread(VM_EXIT_REASON);
2898 HVMTRACE_2D(VMEXIT, v, __vmread(GUEST_RIP), exit_reason);
2900 perfc_incra(vmexits, exit_reason);
2902 if ( exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT )
2903 local_irq_enable();
2905 if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
2906 return vmx_failed_vmentry(exit_reason, regs);
2908 switch ( exit_reason )
2910 case EXIT_REASON_EXCEPTION_NMI:
2912 /*
2913 * Software-interrupt exiting (INT n) is not enabled, so this exit
2914 * means either (1) a hardware exception (e.g. #PF) in the guest,
2915 * or (2) an NMI.
2916 */
2917 unsigned int intr_info, vector;
2919 intr_info = __vmread(VM_EXIT_INTR_INFO);
2920 BUG_ON(!(intr_info & INTR_INFO_VALID_MASK));
2922 vector = intr_info & INTR_INFO_VECTOR_MASK;
2924 /*
2925 * Re-set the NMI shadow if vmexit caused by a guest IRET fault (see 3B
2926 * 25.7.1.2, "Resuming Guest Software after Handling an Exception").
2927 * (NB. If we emulate this IRET for any reason, we should re-clear!)
2928 */
2929 if ( unlikely(intr_info & INTR_INFO_NMI_UNBLOCKED_BY_IRET) &&
2930 !(__vmread(IDT_VECTORING_INFO_FIELD) & INTR_INFO_VALID_MASK) &&
2931 (vector != TRAP_double_fault) )
2932 __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
2933 __vmread(GUEST_INTERRUPTIBILITY_INFO)|VMX_INTR_SHADOW_NMI);
2935 perfc_incra(cause_vector, vector);
2937 switch ( vector )
2939 case TRAP_debug:
2940 case TRAP_int3:
2941 if ( !v->domain->debugger_attached )
2942 goto exit_and_crash;
2943 domain_pause_for_debugger();
2944 break;
2945 case TRAP_no_device:
2946 vmx_do_no_device_fault();
2947 break;
2948 case TRAP_page_fault:
2949 exit_qualification = __vmread(EXIT_QUALIFICATION);
2950 regs->error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
2952 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2953 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
2954 (unsigned long)regs->eax, (unsigned long)regs->ebx,
2955 (unsigned long)regs->ecx, (unsigned long)regs->edx,
2956 (unsigned long)regs->esi, (unsigned long)regs->edi);
2958 if ( paging_fault(exit_qualification, regs) )
2960 HVMTRACE_2D(PF_XEN, v, exit_qualification, regs->error_code);
2961 break;
2964 v->arch.hvm_vmx.cpu_cr2 = exit_qualification;
2965 vmx_inject_hw_exception(v, TRAP_page_fault, regs->error_code);
2966 break;
2967 case TRAP_nmi:
2968 if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) !=
2969 (X86_EVENTTYPE_NMI << 8) )
2970 goto exit_and_crash;
2971 HVMTRACE_0D(NMI, v);
2972 vmx_store_cpu_guest_regs(v, regs, NULL);
2973 do_nmi(regs); /* Real NMI, vector 2: normal processing. */
2974 break;
2975 case TRAP_machine_check:
2976 HVMTRACE_0D(MCE, v);
2977 vmx_store_cpu_guest_regs(v, regs, NULL);
2978 do_machine_check(regs);
2979 break;
2980 default:
2981 goto exit_and_crash;
2983 break;
2985 case EXIT_REASON_EXTERNAL_INTERRUPT:
2986 vmx_do_extint(regs);
2987 break;
2988 case EXIT_REASON_TRIPLE_FAULT:
2989 hvm_triple_fault();
2990 break;
2991 case EXIT_REASON_PENDING_VIRT_INTR:
2992 /* Disable the interrupt window. */
2993 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2994 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2995 v->arch.hvm_vmx.exec_control);
2996 break;
2997 case EXIT_REASON_PENDING_VIRT_NMI:
2998 /* Disable the NMI window. */
2999 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
3000 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
3001 v->arch.hvm_vmx.exec_control);
3002 break;
3003 case EXIT_REASON_TASK_SWITCH:
3004 goto exit_and_crash;
3005 case EXIT_REASON_CPUID:
3006 inst_len = __get_instruction_length(); /* Safe: CPUID */
3007 __update_guest_eip(inst_len);
3008 vmx_do_cpuid(regs);
3009 break;
3010 case EXIT_REASON_HLT:
3011 inst_len = __get_instruction_length(); /* Safe: HLT */
3012 __update_guest_eip(inst_len);
3013 vmx_do_hlt();
3014 break;
3015 case EXIT_REASON_INVLPG:
3017 inst_len = __get_instruction_length(); /* Safe: INVLPG */
3018 __update_guest_eip(inst_len);
3019 exit_qualification = __vmread(EXIT_QUALIFICATION);
3020 vmx_do_invlpg(exit_qualification);
3021 break;
3023 case EXIT_REASON_VMCALL:
3025 int rc;
3026 HVMTRACE_1D(VMMCALL, v, regs->eax);
3027 inst_len = __get_instruction_length(); /* Safe: VMCALL */
3028 rc = hvm_do_hypercall(regs);
3029 if ( rc != HVM_HCALL_preempted )
3031 __update_guest_eip(inst_len);
3032 if ( rc == HVM_HCALL_invalidate )
3033 send_invalidate_req();
3035 break;
3037 case EXIT_REASON_CR_ACCESS:
3039 exit_qualification = __vmread(EXIT_QUALIFICATION);
3040 inst_len = __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
3041 if ( vmx_cr_access(exit_qualification, regs) )
3042 __update_guest_eip(inst_len);
3043 break;
3045 case EXIT_REASON_DR_ACCESS:
3046 exit_qualification = __vmread(EXIT_QUALIFICATION);
3047 vmx_dr_access(exit_qualification, regs);
3048 break;
3049 case EXIT_REASON_IO_INSTRUCTION:
3050 exit_qualification = __vmread(EXIT_QUALIFICATION);
3051 inst_len = __get_instruction_length(); /* Safe: IN, INS, OUT, OUTS */
3052 vmx_io_instruction(exit_qualification, inst_len);
3053 break;
3054 case EXIT_REASON_MSR_READ:
3055 inst_len = __get_instruction_length(); /* Safe: RDMSR */
3056 if ( vmx_do_msr_read(regs) )
3057 __update_guest_eip(inst_len);
3058 break;
3059 case EXIT_REASON_MSR_WRITE:
3060 inst_len = __get_instruction_length(); /* Safe: WRMSR */
3061 if ( vmx_do_msr_write(regs) )
3062 __update_guest_eip(inst_len);
3063 break;
3064 case EXIT_REASON_MWAIT_INSTRUCTION:
3065 case EXIT_REASON_MONITOR_INSTRUCTION:
3066 case EXIT_REASON_PAUSE_INSTRUCTION:
3067 goto exit_and_crash;
3068 case EXIT_REASON_VMCLEAR:
3069 case EXIT_REASON_VMLAUNCH:
3070 case EXIT_REASON_VMPTRLD:
3071 case EXIT_REASON_VMPTRST:
3072 case EXIT_REASON_VMREAD:
3073 case EXIT_REASON_VMRESUME:
3074 case EXIT_REASON_VMWRITE:
3075 case EXIT_REASON_VMXOFF:
3076 case EXIT_REASON_VMXON:
3077 /* Report an invalid-opcode exception when a VMX guest tries to
3078 execute any of the VMX instructions. */
3079 vmx_inject_hw_exception(v, TRAP_invalid_op, VMX_DELIVER_NO_ERROR_CODE);
3080 break;
3082 case EXIT_REASON_TPR_BELOW_THRESHOLD:
3083 break;
3085 case EXIT_REASON_APIC_ACCESS:
3087 unsigned long offset;
3088 exit_qualification = __vmread(EXIT_QUALIFICATION);
3089 offset = exit_qualification & 0x0fffUL;
3090 handle_mmio(APIC_DEFAULT_PHYS_BASE | offset);
3091 break;
3094 default:
3095 exit_and_crash:
3096 gdprintk(XENLOG_ERR, "Bad vmexit (reason %x)\n", exit_reason);
3097 domain_crash(v->domain);
3098 break;
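/*
 * The dispatch above follows a common pattern: read the exit qualification
 * where needed, compute the instruction length for trap-like exits
 * (__get_instruction_length() is only safe for the instruction types noted
 * on each case), advance RIP with __update_guest_eip(), and crash the
 * domain for exit reasons that should never occur for a properly
 * configured guest (task switch, MONITOR/MWAIT/PAUSE); attempts to execute
 * the VMX instructions themselves are instead reflected back as #UD.
 */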
3102 asmlinkage void vmx_trace_vmentry(void)
3104 struct vcpu *v = current;
3105 HVMTRACE_0D(VMENTRY, v);
3108 /*
3109 * Local variables:
3110 * mode: C
3111 * c-set-style: "BSD"
3112 * c-basic-offset: 4
3113 * tab-width: 4
3114 * indent-tabs-mode: nil
3115 * End:
3116 */