ia64/xen-unstable

view xen/arch/x86/hvm/vmx/vmx.c @ 14086:e8470a1a01af

hvm: Rename injection_pending() to event_injection_faulted().
Fix the VMX and SVM handlers to reflect the new semantics (which is
what is actually required by the one caller, in the shadow fault path).
Signed-off-by: Keir Fraser <keir@xensource.com>
author kfraser@localhost.localdomain
date Fri Feb 23 10:35:16 2007 +0000 (2007-02-23)
parents 3f7e8c763b55
children cdc765772f69
line source
1 /*
2 * vmx.c: handling VMX architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 *
18 */
20 #include <xen/config.h>
21 #include <xen/init.h>
22 #include <xen/lib.h>
23 #include <xen/trace.h>
24 #include <xen/sched.h>
25 #include <xen/irq.h>
26 #include <xen/softirq.h>
27 #include <xen/domain_page.h>
28 #include <xen/hypercall.h>
29 #include <xen/perfc.h>
30 #include <asm/current.h>
31 #include <asm/io.h>
32 #include <asm/regs.h>
33 #include <asm/cpufeature.h>
34 #include <asm/processor.h>
35 #include <asm/types.h>
36 #include <asm/msr.h>
37 #include <asm/spinlock.h>
38 #include <asm/paging.h>
39 #include <asm/p2m.h>
40 #include <asm/hvm/hvm.h>
41 #include <asm/hvm/support.h>
42 #include <asm/hvm/vmx/vmx.h>
43 #include <asm/hvm/vmx/vmcs.h>
44 #include <asm/hvm/vmx/cpu.h>
45 #include <public/sched.h>
46 #include <public/hvm/ioreq.h>
47 #include <asm/hvm/vpic.h>
48 #include <asm/hvm/vlapic.h>
49 #include <asm/x86_emulate.h>
50 #include <asm/hvm/vpt.h>
51 #include <public/hvm/save.h>
53 static void vmx_ctxt_switch_from(struct vcpu *v);
54 static void vmx_ctxt_switch_to(struct vcpu *v);
56 static int vmx_vcpu_initialise(struct vcpu *v)
57 {
58 int rc;
60 spin_lock_init(&v->arch.hvm_vmx.vmcs_lock);
62 v->arch.schedule_tail = arch_vmx_do_resume;
63 v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
64 v->arch.ctxt_switch_to = vmx_ctxt_switch_to;
66 if ( (rc = vmx_create_vmcs(v)) != 0 )
67 {
68 dprintk(XENLOG_WARNING,
69 "Failed to create VMCS for vcpu %d: err=%d.\n",
70 v->vcpu_id, rc);
71 return rc;
72 }
74 return 0;
75 }
77 static void vmx_vcpu_destroy(struct vcpu *v)
78 {
79 vmx_destroy_vmcs(v);
80 }
82 #ifdef __x86_64__
84 static DEFINE_PER_CPU(struct vmx_msr_state, host_msr_state);
86 static u32 msr_index[VMX_MSR_COUNT] =
87 {
88 MSR_LSTAR, MSR_STAR, MSR_CSTAR,
89 MSR_SYSCALL_MASK, MSR_EFER,
90 };
92 static void vmx_save_host_msrs(void)
93 {
94 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
95 int i;
97 for ( i = 0; i < VMX_MSR_COUNT; i++ )
98 rdmsrl(msr_index[i], host_msr_state->msrs[i]);
99 }
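/*
 * NB: the save/restore loops here and below index msrs[] and the flags
 * bitmap with the same i used for msr_index[], so msr_index[] is assumed to
 * stay in the same order as the VMX_INDEX_MSR_* enumeration (e.g.
 * msr_index[VMX_INDEX_MSR_LSTAR] == MSR_LSTAR).
 */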
101 #define WRITE_MSR(address) \
102 guest_msr_state->msrs[VMX_INDEX_MSR_ ## address] = msr_content; \
103 if ( !test_bit(VMX_INDEX_MSR_ ## address, &guest_msr_state->flags) )\
104 set_bit(VMX_INDEX_MSR_ ## address, &guest_msr_state->flags); \
105 wrmsrl(MSR_ ## address, msr_content); \
106 set_bit(VMX_INDEX_MSR_ ## address, &host_msr_state->flags); \
107 break
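/*
 * Illustrative expansion: WRITE_MSR(STAR), as used from
 * long_mode_do_msr_write() below, expands to
 *
 *   guest_msr_state->msrs[VMX_INDEX_MSR_STAR] = msr_content;
 *   if ( !test_bit(VMX_INDEX_MSR_STAR, &guest_msr_state->flags) )
 *       set_bit(VMX_INDEX_MSR_STAR, &guest_msr_state->flags);
 *   wrmsrl(MSR_STAR, msr_content);
 *   set_bit(VMX_INDEX_MSR_STAR, &host_msr_state->flags);
 *   break;
 *
 * i.e. it records the new guest value, marks the MSR dirty for both guest
 * and host reload, writes the hardware MSR, and leaves the switch.
 */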
109 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
110 {
111 u64 msr_content = 0;
112 struct vcpu *v = current;
113 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
115 switch ( (u32)regs->ecx ) {
116 case MSR_EFER:
117 HVM_DBG_LOG(DBG_LEVEL_2, "EFER msr_content 0x%"PRIx64, msr_content);
118 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_EFER];
119 break;
121 case MSR_FS_BASE:
122 msr_content = __vmread(GUEST_FS_BASE);
123 goto check_long_mode;
125 case MSR_GS_BASE:
126 msr_content = __vmread(GUEST_GS_BASE);
127 goto check_long_mode;
129 case MSR_SHADOW_GS_BASE:
130 msr_content = guest_msr_state->shadow_gs;
131 check_long_mode:
132 if ( !(vmx_long_mode_enabled(v)) )
133 {
134 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
135 return 0;
136 }
137 break;
139 case MSR_STAR:
140 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_STAR];
141 break;
143 case MSR_LSTAR:
144 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_LSTAR];
145 break;
147 case MSR_CSTAR:
148 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_CSTAR];
149 break;
151 case MSR_SYSCALL_MASK:
152 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
153 break;
155 default:
156 return 0;
157 }
159 HVM_DBG_LOG(DBG_LEVEL_2, "msr_content: 0x%"PRIx64, msr_content);
161 regs->eax = (u32)(msr_content >> 0);
162 regs->edx = (u32)(msr_content >> 32);
164 return 1;
165 }
167 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
168 {
169 u64 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
170 u32 ecx = regs->ecx;
171 struct vcpu *v = current;
172 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
173 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
175 HVM_DBG_LOG(DBG_LEVEL_1, "msr 0x%x msr_content 0x%"PRIx64"\n",
176 ecx, msr_content);
178 switch ( ecx )
179 {
180 case MSR_EFER:
181 /* offending reserved bit will cause #GP */
182 if ( msr_content & ~(EFER_LME | EFER_LMA | EFER_NX | EFER_SCE) )
183 {
184 gdprintk(XENLOG_WARNING, "Trying to set reserved bit in "
185 "EFER: %"PRIx64"\n", msr_content);
186 goto gp_fault;
187 }
189 if ( (msr_content & EFER_LME)
190 && !(guest_msr_state->msrs[VMX_INDEX_MSR_EFER] & EFER_LME) )
191 {
192 if ( unlikely(vmx_paging_enabled(v)) )
193 {
194 gdprintk(XENLOG_WARNING,
195 "Trying to set EFER.LME with paging enabled\n");
196 goto gp_fault;
197 }
198 }
199 else if ( !(msr_content & EFER_LME)
200 && (guest_msr_state->msrs[VMX_INDEX_MSR_EFER] & EFER_LME) )
201 {
202 if ( unlikely(vmx_paging_enabled(v)) )
203 {
204 gdprintk(XENLOG_WARNING,
205 "Trying to clear EFER.LME with paging enabled\n");
206 goto gp_fault;
207 }
208 }
210 guest_msr_state->msrs[VMX_INDEX_MSR_EFER] = msr_content;
211 break;
213 case MSR_FS_BASE:
214 case MSR_GS_BASE:
215 case MSR_SHADOW_GS_BASE:
216 if ( !vmx_long_mode_enabled(v) )
217 goto gp_fault;
219 if ( !is_canonical_address(msr_content) )
220 goto uncanonical_address;
222 if ( ecx == MSR_FS_BASE )
223 __vmwrite(GUEST_FS_BASE, msr_content);
224 else if ( ecx == MSR_GS_BASE )
225 __vmwrite(GUEST_GS_BASE, msr_content);
226 else
227 {
228 v->arch.hvm_vmx.msr_state.shadow_gs = msr_content;
229 wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
230 }
232 break;
234 case MSR_STAR:
235 WRITE_MSR(STAR);
237 case MSR_LSTAR:
238 if ( !is_canonical_address(msr_content) )
239 goto uncanonical_address;
240 WRITE_MSR(LSTAR);
242 case MSR_CSTAR:
243 if ( !is_canonical_address(msr_content) )
244 goto uncanonical_address;
245 WRITE_MSR(CSTAR);
247 case MSR_SYSCALL_MASK:
248 WRITE_MSR(SYSCALL_MASK);
250 default:
251 return 0;
252 }
254 return 1;
256 uncanonical_address:
257 HVM_DBG_LOG(DBG_LEVEL_1, "Non-canonical address for MSR write 0x%x\n", ecx);
258 gp_fault:
259 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
260 return 0;
261 }
263 /*
264 * To avoid MSR save/restore at every VM exit/entry time, we restore
265 * the x86_64-specific MSRs at domain switch time. Since these MSRs
266 * are not modified once set for para domains, we don't save them,
267 * but simply reset them to the values set in percpu_traps_init().
268 */
269 static void vmx_restore_host_msrs(void)
270 {
271 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
272 int i;
274 while ( host_msr_state->flags )
275 {
276 i = find_first_set_bit(host_msr_state->flags);
277 wrmsrl(msr_index[i], host_msr_state->msrs[i]);
278 clear_bit(i, &host_msr_state->flags);
279 }
280 }
282 static void vmx_save_guest_msrs(struct vcpu *v)
283 {
284 /* MSR_SHADOW_GS_BASE may have been changed by swapgs instruction. */
285 rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.msr_state.shadow_gs);
286 }
288 static void vmx_restore_guest_msrs(struct vcpu *v)
289 {
290 struct vmx_msr_state *guest_msr_state, *host_msr_state;
291 unsigned long guest_flags;
292 int i;
294 guest_msr_state = &v->arch.hvm_vmx.msr_state;
295 host_msr_state = &this_cpu(host_msr_state);
297 wrmsrl(MSR_SHADOW_GS_BASE, guest_msr_state->shadow_gs);
299 guest_flags = guest_msr_state->flags;
300 if ( !guest_flags )
301 return;
303 while ( guest_flags ) {
304 i = find_first_set_bit(guest_flags);
306 HVM_DBG_LOG(DBG_LEVEL_2,
307 "restore guest's index %d msr %x with value %lx",
308 i, msr_index[i], guest_msr_state->msrs[i]);
309 set_bit(i, &host_msr_state->flags);
310 wrmsrl(msr_index[i], guest_msr_state->msrs[i]);
311 clear_bit(i, &guest_flags);
312 }
313 }
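/*
 * Summary of the lazy MSR-switch protocol implemented above: a bit set in
 * guest_msr_state->flags means that guest MSR is live in hardware and must
 * be reloaded on the way back into the guest; the matching bit is then set
 * in host_msr_state->flags so that vmx_restore_host_msrs() knows to put the
 * host value back when we context-switch away from this vcpu.
 */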
315 #else /* __i386__ */
317 #define vmx_save_host_msrs() ((void)0)
318 #define vmx_restore_host_msrs() ((void)0)
319 #define vmx_save_guest_msrs(v) ((void)0)
320 #define vmx_restore_guest_msrs(v) ((void)0)
322 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
323 {
324 return 0;
325 }
327 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
328 {
329 return 0;
330 }
332 #endif /* __i386__ */
334 #define loaddebug(_v,_reg) \
335 __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
336 #define savedebug(_v,_reg) \
337 __asm__ __volatile__ ("mov %%db" #_reg ",%0" : "=r" ((_v)->debugreg[_reg]))
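/*
 * Illustrative expansion: loaddebug(&v->arch.guest_context, 3) becomes
 *
 *   __asm__ __volatile__ ("mov %0,%%db3"
 *                         : : "r" ((&v->arch.guest_context)->debugreg[3]));
 *
 * i.e. debug register DB3 is loaded from the saved guest_context value.
 */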
339 static inline void vmx_save_dr(struct vcpu *v)
340 {
341 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
342 return;
344 /* Clear the DR dirty flag and re-enable intercepts for DR accesses. */
345 v->arch.hvm_vcpu.flag_dr_dirty = 0;
346 v->arch.hvm_vcpu.u.vmx.exec_control |= CPU_BASED_MOV_DR_EXITING;
347 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vcpu.u.vmx.exec_control);
349 savedebug(&v->arch.guest_context, 0);
350 savedebug(&v->arch.guest_context, 1);
351 savedebug(&v->arch.guest_context, 2);
352 savedebug(&v->arch.guest_context, 3);
353 savedebug(&v->arch.guest_context, 6);
354 v->arch.guest_context.debugreg[7] = __vmread(GUEST_DR7);
355 }
357 static inline void __restore_debug_registers(struct vcpu *v)
358 {
359 loaddebug(&v->arch.guest_context, 0);
360 loaddebug(&v->arch.guest_context, 1);
361 loaddebug(&v->arch.guest_context, 2);
362 loaddebug(&v->arch.guest_context, 3);
363 /* No 4 and 5 */
364 loaddebug(&v->arch.guest_context, 6);
365 /* DR7 is loaded from the VMCS. */
366 }
368 int vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
369 {
370 c->eip = __vmread(GUEST_RIP);
371 c->esp = __vmread(GUEST_RSP);
372 c->eflags = __vmread(GUEST_RFLAGS);
374 c->cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;
375 c->cr3 = v->arch.hvm_vmx.cpu_cr3;
376 c->cr4 = v->arch.hvm_vmx.cpu_shadow_cr4;
378 #ifdef HVM_DEBUG_SUSPEND
379 printk("vmx_vmcs_save: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
380 c->cr3,
381 c->cr0,
382 c->cr4);
383 #endif
385 c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
386 c->idtr_base = __vmread(GUEST_IDTR_BASE);
388 c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
389 c->gdtr_base = __vmread(GUEST_GDTR_BASE);
391 c->cs_sel = __vmread(GUEST_CS_SELECTOR);
392 c->cs_limit = __vmread(GUEST_CS_LIMIT);
393 c->cs_base = __vmread(GUEST_CS_BASE);
394 c->cs_arbytes = __vmread(GUEST_CS_AR_BYTES);
396 c->ds_sel = __vmread(GUEST_DS_SELECTOR);
397 c->ds_limit = __vmread(GUEST_DS_LIMIT);
398 c->ds_base = __vmread(GUEST_DS_BASE);
399 c->ds_arbytes = __vmread(GUEST_DS_AR_BYTES);
401 c->es_sel = __vmread(GUEST_ES_SELECTOR);
402 c->es_limit = __vmread(GUEST_ES_LIMIT);
403 c->es_base = __vmread(GUEST_ES_BASE);
404 c->es_arbytes = __vmread(GUEST_ES_AR_BYTES);
406 c->ss_sel = __vmread(GUEST_SS_SELECTOR);
407 c->ss_limit = __vmread(GUEST_SS_LIMIT);
408 c->ss_base = __vmread(GUEST_SS_BASE);
409 c->ss_arbytes = __vmread(GUEST_SS_AR_BYTES);
411 c->fs_sel = __vmread(GUEST_FS_SELECTOR);
412 c->fs_limit = __vmread(GUEST_FS_LIMIT);
413 c->fs_base = __vmread(GUEST_FS_BASE);
414 c->fs_arbytes = __vmread(GUEST_FS_AR_BYTES);
416 c->gs_sel = __vmread(GUEST_GS_SELECTOR);
417 c->gs_limit = __vmread(GUEST_GS_LIMIT);
418 c->gs_base = __vmread(GUEST_GS_BASE);
419 c->gs_arbytes = __vmread(GUEST_GS_AR_BYTES);
421 c->tr_sel = __vmread(GUEST_TR_SELECTOR);
422 c->tr_limit = __vmread(GUEST_TR_LIMIT);
423 c->tr_base = __vmread(GUEST_TR_BASE);
424 c->tr_arbytes = __vmread(GUEST_TR_AR_BYTES);
426 c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR);
427 c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT);
428 c->ldtr_base = __vmread(GUEST_LDTR_BASE);
429 c->ldtr_arbytes = __vmread(GUEST_LDTR_AR_BYTES);
431 c->sysenter_cs = __vmread(GUEST_SYSENTER_CS);
432 c->sysenter_esp = __vmread(GUEST_SYSENTER_ESP);
433 c->sysenter_eip = __vmread(GUEST_SYSENTER_EIP);
435 return 1;
436 }
438 int vmx_vmcs_restore(struct vcpu *v, struct hvm_hw_cpu *c)
439 {
440 unsigned long mfn, old_base_mfn;
442 vmx_vmcs_enter(v);
444 __vmwrite(GUEST_RIP, c->eip);
445 __vmwrite(GUEST_RSP, c->esp);
446 __vmwrite(GUEST_RFLAGS, c->eflags);
448 v->arch.hvm_vmx.cpu_shadow_cr0 = c->cr0;
449 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
451 #ifdef HVM_DEBUG_SUSPEND
452 printk("vmx_vmcs_restore: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
453 c->cr3,
454 c->cr0,
455 c->cr4);
456 #endif
458 if (!vmx_paging_enabled(v)) {
459 printk("vmx_vmcs_restore: paging not enabled.");
460 goto skip_cr3;
461 }
463 if (c->cr3 == v->arch.hvm_vmx.cpu_cr3) {
464 /*
465 * This is a simple TLB flush, implying the guest has
466 * removed some translation or changed page attributes.
467 * We simply invalidate the shadow.
468 */
469 mfn = gmfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT);
470 if (mfn != pagetable_get_pfn(v->arch.guest_table)) {
471 goto bad_cr3;
472 }
473 } else {
474 /*
475 * If different, make a shadow. Check if the PDBR is valid
476 * first.
477 */
478 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %"PRIx64"", c->cr3);
479 /* current!=vcpu as not called by arch_vmx_do_launch */
480 mfn = gmfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT);
481 if( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain)) {
482 goto bad_cr3;
483 }
484 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
485 v->arch.guest_table = pagetable_from_pfn(mfn);
486 if (old_base_mfn)
487 put_page(mfn_to_page(old_base_mfn));
488 v->arch.hvm_vmx.cpu_cr3 = c->cr3;
489 }
491 skip_cr3:
492 #if defined(__x86_64__)
493 if (vmx_long_mode_enabled(v)) {
494 unsigned long vm_entry_value;
495 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
496 vm_entry_value |= VM_ENTRY_IA32E_MODE;
497 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
498 }
499 #endif
501 __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
502 v->arch.hvm_vmx.cpu_shadow_cr4 = c->cr4;
503 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);
505 __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
506 __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
508 __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
509 __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
511 __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
512 __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
513 __vmwrite(GUEST_CS_BASE, c->cs_base);
514 __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes);
516 __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
517 __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
518 __vmwrite(GUEST_DS_BASE, c->ds_base);
519 __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes);
521 __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
522 __vmwrite(GUEST_ES_LIMIT, c->es_limit);
523 __vmwrite(GUEST_ES_BASE, c->es_base);
524 __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes);
526 __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
527 __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
528 __vmwrite(GUEST_SS_BASE, c->ss_base);
529 __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes);
531 __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
532 __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
533 __vmwrite(GUEST_FS_BASE, c->fs_base);
534 __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes);
536 __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
537 __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
538 __vmwrite(GUEST_GS_BASE, c->gs_base);
539 __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes);
541 __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
542 __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
543 __vmwrite(GUEST_TR_BASE, c->tr_base);
544 __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes);
546 __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
547 __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
548 __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
549 __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes);
551 __vmwrite(GUEST_SYSENTER_CS, c->sysenter_cs);
552 __vmwrite(GUEST_SYSENTER_ESP, c->sysenter_esp);
553 __vmwrite(GUEST_SYSENTER_EIP, c->sysenter_eip);
555 vmx_vmcs_exit(v);
557 paging_update_paging_modes(v);
558 return 0;
560 bad_cr3:
561 gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%"PRIx64"", c->cr3);
562 vmx_vmcs_exit(v);
563 return -EINVAL;
564 }
566 #ifdef HVM_DEBUG_SUSPEND
567 static void dump_msr_state(struct vmx_msr_state *m)
568 {
569 int i = 0;
570 printk("**** msr state ****\n");
571 printk("shadow_gs=0x%lx, flags=0x%lx, msr_items:", m->shadow_gs, m->flags);
572 for (i = 0; i < VMX_MSR_COUNT; i++)
573 printk("0x%lx,", m->msrs[i]);
574 printk("\n");
575 }
576 #else
577 static void dump_msr_state(struct vmx_msr_state *m)
578 {
579 }
580 #endif
582 void vmx_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
583 {
584 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
585 unsigned long guest_flags = guest_state->flags;
587 data->shadow_gs = guest_state->shadow_gs;
589 /* save msrs */
590 data->flags = guest_flags;
591 data->msr_lstar = guest_state->msrs[VMX_INDEX_MSR_LSTAR];
592 data->msr_star = guest_state->msrs[VMX_INDEX_MSR_STAR];
593 data->msr_cstar = guest_state->msrs[VMX_INDEX_MSR_CSTAR];
594 data->msr_syscall_mask = guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
595 data->msr_efer = guest_state->msrs[VMX_INDEX_MSR_EFER];
597 data->tsc = hvm_get_guest_time(v);
599 dump_msr_state(guest_state);
600 }
602 void vmx_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
603 {
604 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
606 /* restore msrs */
607 guest_state->flags = data->flags;
608 guest_state->msrs[VMX_INDEX_MSR_LSTAR] = data->msr_lstar;
609 guest_state->msrs[VMX_INDEX_MSR_STAR] = data->msr_star;
610 guest_state->msrs[VMX_INDEX_MSR_CSTAR] = data->msr_cstar;
611 guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK] = data->msr_syscall_mask;
612 guest_state->msrs[VMX_INDEX_MSR_EFER] = data->msr_efer;
614 guest_state->shadow_gs = data->shadow_gs;
616 v->arch.hvm_vmx.vmxassist_enabled = !(data->cr0 & X86_CR0_PE);
618 hvm_set_guest_time(v, data->tsc);
620 dump_msr_state(guest_state);
621 }
624 void vmx_save_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
625 {
626 vmx_save_cpu_state(v, ctxt);
627 vmx_vmcs_enter(v);
628 vmx_vmcs_save(v, ctxt);
629 vmx_vmcs_exit(v);
630 }
632 int vmx_load_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
633 {
634 vmx_load_cpu_state(v, ctxt);
635 if (vmx_vmcs_restore(v, ctxt)) {
636 printk("vmx_vmcs restore failed!\n");
637 domain_crash(v->domain);
638 return -EINVAL;
639 }
641 return 0;
642 }
644 /*
645 * DR7 is saved and restored on every vmexit. Other debug registers only
646 * need to be restored if their value is going to affect execution -- i.e.,
647 * if one of the breakpoints is enabled. So mask out all bits that don't
648 * enable some breakpoint functionality.
649 */
650 #define DR7_ACTIVE_MASK 0xff
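/*
 * The low eight bits of DR7 are the per-breakpoint enable bits
 * (L0/G0 ... L3/G3), so (debugreg[7] & DR7_ACTIVE_MASK) != 0 exactly when
 * at least one of DR0-DR3 is armed and therefore worth restoring.
 */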
652 static inline void vmx_restore_dr(struct vcpu *v)
653 {
654 /* NB. __vmread() is not usable here, so we cannot read from the VMCS. */
655 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
656 __restore_debug_registers(v);
657 }
659 static void vmx_ctxt_switch_from(struct vcpu *v)
660 {
661 vmx_save_guest_msrs(v);
662 vmx_restore_host_msrs();
663 vmx_save_dr(v);
664 }
666 static void vmx_ctxt_switch_to(struct vcpu *v)
667 {
668 vmx_restore_guest_msrs(v);
669 vmx_restore_dr(v);
670 }
672 static void stop_vmx(void)
673 {
674 if ( !(read_cr4() & X86_CR4_VMXE) )
675 return;
677 __vmxoff();
678 clear_in_cr4(X86_CR4_VMXE);
679 }
681 static void vmx_store_cpu_guest_regs(
682 struct vcpu *v, struct cpu_user_regs *regs, unsigned long *crs)
683 {
684 vmx_vmcs_enter(v);
686 if ( regs != NULL )
687 {
688 regs->eflags = __vmread(GUEST_RFLAGS);
689 regs->ss = __vmread(GUEST_SS_SELECTOR);
690 regs->cs = __vmread(GUEST_CS_SELECTOR);
691 regs->eip = __vmread(GUEST_RIP);
692 regs->esp = __vmread(GUEST_RSP);
693 }
695 if ( crs != NULL )
696 {
697 crs[0] = v->arch.hvm_vmx.cpu_shadow_cr0;
698 crs[2] = v->arch.hvm_vmx.cpu_cr2;
699 crs[3] = v->arch.hvm_vmx.cpu_cr3;
700 crs[4] = v->arch.hvm_vmx.cpu_shadow_cr4;
701 }
703 vmx_vmcs_exit(v);
704 }
706 static void vmx_load_cpu_guest_regs(struct vcpu *v, struct cpu_user_regs *regs)
707 {
708 unsigned long base;
710 vmx_vmcs_enter(v);
712 __vmwrite(GUEST_SS_SELECTOR, regs->ss);
713 __vmwrite(GUEST_RSP, regs->esp);
715 /* NB. Bit 1 of RFLAGS must be set for VMENTRY to succeed. */
716 __vmwrite(GUEST_RFLAGS, regs->eflags | 2UL);
718 if ( regs->eflags & EF_TF )
719 __vm_set_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
720 else
721 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
723 if ( regs->eflags & EF_VM )
724 {
725 /*
726 * The VMX spec (section 4.3.1.2, Checks on Guest Segment
727 * Registers) says that virtual-8086 mode guests' segment
728 * base-address fields in the VMCS must be equal to their
729 * corresponding segment selector field shifted right by
730 * four bits upon vmentry.
731 */
732 base = __vmread(GUEST_CS_BASE);
733 if ( (regs->cs << 4) != base )
734 __vmwrite(GUEST_CS_BASE, regs->cs << 4);
735 base = __vmread(GUEST_SS_BASE);
736 if ( (regs->ss << 4) != base )
737 __vmwrite(GUEST_SS_BASE, regs->ss << 4);
738 }
740 __vmwrite(GUEST_CS_SELECTOR, regs->cs);
741 __vmwrite(GUEST_RIP, regs->eip);
743 vmx_vmcs_exit(v);
744 }
746 static unsigned long vmx_get_ctrl_reg(struct vcpu *v, unsigned int num)
747 {
748 switch ( num )
749 {
750 case 0:
751 return v->arch.hvm_vmx.cpu_cr0;
752 case 2:
753 return v->arch.hvm_vmx.cpu_cr2;
754 case 3:
755 return v->arch.hvm_vmx.cpu_cr3;
756 case 4:
757 return v->arch.hvm_vmx.cpu_shadow_cr4;
758 default:
759 BUG();
760 }
761 return 0; /* dummy */
762 }
764 static unsigned long vmx_get_segment_base(struct vcpu *v, enum x86_segment seg)
765 {
766 unsigned long base = 0;
767 int long_mode = 0;
769 ASSERT(v == current);
771 #ifdef __x86_64__
772 if ( vmx_long_mode_enabled(v) && (__vmread(GUEST_CS_AR_BYTES) & (1u<<13)) )
773 long_mode = 1;
774 #endif
776 switch ( seg )
777 {
778 case x86_seg_cs: if ( !long_mode ) base = __vmread(GUEST_CS_BASE); break;
779 case x86_seg_ds: if ( !long_mode ) base = __vmread(GUEST_DS_BASE); break;
780 case x86_seg_es: if ( !long_mode ) base = __vmread(GUEST_ES_BASE); break;
781 case x86_seg_fs: base = __vmread(GUEST_FS_BASE); break;
782 case x86_seg_gs: base = __vmread(GUEST_GS_BASE); break;
783 case x86_seg_ss: if ( !long_mode ) base = __vmread(GUEST_SS_BASE); break;
784 case x86_seg_tr: base = __vmread(GUEST_TR_BASE); break;
785 case x86_seg_gdtr: base = __vmread(GUEST_GDTR_BASE); break;
786 case x86_seg_idtr: base = __vmread(GUEST_IDTR_BASE); break;
787 case x86_seg_ldtr: base = __vmread(GUEST_LDTR_BASE); break;
788 default: BUG(); break;
789 }
791 return base;
792 }
794 static void vmx_get_segment_register(struct vcpu *v, enum x86_segment seg,
795 struct segment_register *reg)
796 {
797 u16 attr = 0;
799 ASSERT(v == current);
801 switch ( seg )
802 {
803 case x86_seg_cs:
804 reg->sel = __vmread(GUEST_CS_SELECTOR);
805 reg->limit = __vmread(GUEST_CS_LIMIT);
806 reg->base = __vmread(GUEST_CS_BASE);
807 attr = __vmread(GUEST_CS_AR_BYTES);
808 break;
809 case x86_seg_ds:
810 reg->sel = __vmread(GUEST_DS_SELECTOR);
811 reg->limit = __vmread(GUEST_DS_LIMIT);
812 reg->base = __vmread(GUEST_DS_BASE);
813 attr = __vmread(GUEST_DS_AR_BYTES);
814 break;
815 case x86_seg_es:
816 reg->sel = __vmread(GUEST_ES_SELECTOR);
817 reg->limit = __vmread(GUEST_ES_LIMIT);
818 reg->base = __vmread(GUEST_ES_BASE);
819 attr = __vmread(GUEST_ES_AR_BYTES);
820 break;
821 case x86_seg_fs:
822 reg->sel = __vmread(GUEST_FS_SELECTOR);
823 reg->limit = __vmread(GUEST_FS_LIMIT);
824 reg->base = __vmread(GUEST_FS_BASE);
825 attr = __vmread(GUEST_FS_AR_BYTES);
826 break;
827 case x86_seg_gs:
828 reg->sel = __vmread(GUEST_GS_SELECTOR);
829 reg->limit = __vmread(GUEST_GS_LIMIT);
830 reg->base = __vmread(GUEST_GS_BASE);
831 attr = __vmread(GUEST_GS_AR_BYTES);
832 break;
833 case x86_seg_ss:
834 reg->sel = __vmread(GUEST_SS_SELECTOR);
835 reg->limit = __vmread(GUEST_SS_LIMIT);
836 reg->base = __vmread(GUEST_SS_BASE);
837 attr = __vmread(GUEST_SS_AR_BYTES);
838 break;
839 case x86_seg_tr:
840 reg->sel = __vmread(GUEST_TR_SELECTOR);
841 reg->limit = __vmread(GUEST_TR_LIMIT);
842 reg->base = __vmread(GUEST_TR_BASE);
843 attr = __vmread(GUEST_TR_AR_BYTES);
844 break;
845 case x86_seg_gdtr:
846 reg->limit = __vmread(GUEST_GDTR_LIMIT);
847 reg->base = __vmread(GUEST_GDTR_BASE);
848 break;
849 case x86_seg_idtr:
850 reg->limit = __vmread(GUEST_IDTR_LIMIT);
851 reg->base = __vmread(GUEST_IDTR_BASE);
852 break;
853 case x86_seg_ldtr:
854 reg->sel = __vmread(GUEST_LDTR_SELECTOR);
855 reg->limit = __vmread(GUEST_LDTR_LIMIT);
856 reg->base = __vmread(GUEST_LDTR_BASE);
857 attr = __vmread(GUEST_LDTR_AR_BYTES);
858 break;
859 default:
860 BUG();
861 }
863 reg->attr.bytes = (attr & 0xff) | ((attr >> 4) & 0xf00);
864 }
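/*
 * Note on the attr packing above: the VMCS access-rights word keeps the
 * descriptor attribute bits in bits 0-7 and 12-15 (bits 8-11 are reserved),
 * while segment_register.attr uses a packed 12-bit layout. Shifting the
 * high nibble down by four drops the reserved bits, e.g. for a flat 32-bit
 * code segment:
 *
 *   attr = 0xc09b  ->  (0xc09b & 0xff) | ((0xc09b >> 4) & 0xf00) == 0xc9b
 */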
866 /* Make sure that Xen intercepts any FP accesses from the current vcpu. */
867 static void vmx_stts(struct vcpu *v)
868 {
869 /* VMX depends on operating on the current vcpu */
870 ASSERT(v == current);
872 /*
873 * If the guest does not have TS enabled then we must cause and handle an
874 * exception on first use of the FPU. If the guest *does* have TS enabled
875 * then this is not necessary: no FPU activity can occur until the guest
876 * clears CR0.TS, and we will initialise the FPU when that happens.
877 */
878 if ( !(v->arch.hvm_vmx.cpu_shadow_cr0 & X86_CR0_TS) )
879 {
880 v->arch.hvm_vmx.cpu_cr0 |= X86_CR0_TS;
881 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
882 __vm_set_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
883 }
884 }
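/*
 * The #NM intercept armed here is the counterpart of
 * vmx_do_no_device_fault() further down, which gives the FPU back to the
 * guest and clears EXCEPTION_BITMAP_NM again on first use.
 */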
886 static void vmx_set_tsc_offset(struct vcpu *v, u64 offset)
887 {
888 vmx_vmcs_enter(v);
889 __vmwrite(TSC_OFFSET, offset);
890 #if defined (__i386__)
891 __vmwrite(TSC_OFFSET_HIGH, offset >> 32);
892 #endif
893 vmx_vmcs_exit(v);
894 }
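/*
 * The guest-observed TSC is host TSC + TSC_OFFSET. TSC_OFFSET is a 64-bit
 * VMCS field; on i386 builds __vmwrite() only writes 32 bits at a time,
 * hence the extra write to TSC_OFFSET_HIGH above.
 */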
896 static void vmx_init_ap_context(
897 struct vcpu_guest_context *ctxt, int vcpuid, int trampoline_vector)
898 {
899 memset(ctxt, 0, sizeof(*ctxt));
900 ctxt->user_regs.eip = VMXASSIST_BASE;
901 ctxt->user_regs.edx = vcpuid;
902 ctxt->user_regs.ebx = trampoline_vector;
903 }
905 void do_nmi(struct cpu_user_regs *);
907 static void vmx_init_hypercall_page(struct domain *d, void *hypercall_page)
908 {
909 char *p;
910 int i;
912 memset(hypercall_page, 0, PAGE_SIZE);
914 for ( i = 0; i < (PAGE_SIZE / 32); i++ )
915 {
916 p = (char *)(hypercall_page + (i * 32));
917 *(u8 *)(p + 0) = 0xb8; /* mov imm32, %eax */
918 *(u32 *)(p + 1) = i;
919 *(u8 *)(p + 5) = 0x0f; /* vmcall */
920 *(u8 *)(p + 6) = 0x01;
921 *(u8 *)(p + 7) = 0xc1;
922 *(u8 *)(p + 8) = 0xc3; /* ret */
923 }
925 /* Don't support HYPERVISOR_iret at the moment */
926 *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
927 }
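/*
 * Each 32-byte stub written above disassembles to:
 *
 *   mov  $i, %eax    ; hypercall number
 *   vmcall           ; trap to Xen (0f 01 c1)
 *   ret
 *
 * so a guest would issue hypercall i as "call hypercall_page + i*32" with
 * arguments in registers, per the usual Xen hypercall convention
 * (HYPERVISOR_iret being deliberately stubbed out with ud2 above).
 */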
929 static int vmx_realmode(struct vcpu *v)
930 {
931 unsigned long rflags;
933 ASSERT(v == current);
935 rflags = __vmread(GUEST_RFLAGS);
936 return rflags & X86_EFLAGS_VM;
937 }
939 static int vmx_guest_x86_mode(struct vcpu *v)
940 {
941 unsigned long cs_ar_bytes;
943 ASSERT(v == current);
945 cs_ar_bytes = __vmread(GUEST_CS_AR_BYTES);
947 if ( vmx_long_mode_enabled(v) && (cs_ar_bytes & (1u<<13)) )
948 return 8;
950 if ( vmx_realmode(v) )
951 return 2;
953 return ((cs_ar_bytes & (1u<<14)) ? 4 : 2);
954 }
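/*
 * In the CS access-rights word read above, bit 13 is CS.L and bit 14 is
 * CS.D: with long mode enabled, CS.L=1 selects 64-bit code (return 8);
 * otherwise CS.D picks 32-bit (4) versus 16-bit (2) code, and virtual-8086
 * mode is always 16-bit.
 */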
956 static int vmx_pae_enabled(struct vcpu *v)
957 {
958 unsigned long cr4 = v->arch.hvm_vmx.cpu_shadow_cr4;
959 return (vmx_paging_enabled(v) && (cr4 & X86_CR4_PAE));
960 }
962 static void vmx_update_host_cr3(struct vcpu *v)
963 {
964 ASSERT( (v == current) || !vcpu_runnable(v) );
965 vmx_vmcs_enter(v);
966 __vmwrite(HOST_CR3, v->arch.cr3);
967 vmx_vmcs_exit(v);
968 }
970 static void vmx_update_guest_cr3(struct vcpu *v)
971 {
972 ASSERT( (v == current) || !vcpu_runnable(v) );
973 vmx_vmcs_enter(v);
974 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
975 vmx_vmcs_exit(v);
976 }
979 static void vmx_inject_exception(
980 unsigned int trapnr, int errcode, unsigned long cr2)
981 {
982 struct vcpu *v = current;
983 vmx_inject_hw_exception(v, trapnr, errcode);
984 if ( trapnr == TRAP_page_fault )
985 v->arch.hvm_vmx.cpu_cr2 = cr2;
986 }
988 static void vmx_update_vtpr(struct vcpu *v, unsigned long value)
989 {
990 /* VMX doesn't have a V_TPR field */
991 }
993 static int vmx_event_injection_faulted(struct vcpu *v)
994 {
995 unsigned int idtv_info_field;
997 ASSERT(v == current);
999 idtv_info_field = __vmread(IDT_VECTORING_INFO_FIELD);
1000 return (idtv_info_field & INTR_INFO_VALID_MASK);
1001 }
1003 /* Setup HVM interfaces */
1004 static void vmx_setup_hvm_funcs(void)
1005 {
1006 hvm_funcs.disable = stop_vmx;
1008 hvm_funcs.vcpu_initialise = vmx_vcpu_initialise;
1009 hvm_funcs.vcpu_destroy = vmx_vcpu_destroy;
1011 hvm_funcs.store_cpu_guest_regs = vmx_store_cpu_guest_regs;
1012 hvm_funcs.load_cpu_guest_regs = vmx_load_cpu_guest_regs;
1014 hvm_funcs.save_cpu_ctxt = vmx_save_vmcs_ctxt;
1015 hvm_funcs.load_cpu_ctxt = vmx_load_vmcs_ctxt;
1017 hvm_funcs.paging_enabled = vmx_paging_enabled;
1018 hvm_funcs.long_mode_enabled = vmx_long_mode_enabled;
1019 hvm_funcs.pae_enabled = vmx_pae_enabled;
1020 hvm_funcs.guest_x86_mode = vmx_guest_x86_mode;
1021 hvm_funcs.get_guest_ctrl_reg = vmx_get_ctrl_reg;
1022 hvm_funcs.get_segment_base = vmx_get_segment_base;
1023 hvm_funcs.get_segment_register = vmx_get_segment_register;
1025 hvm_funcs.update_host_cr3 = vmx_update_host_cr3;
1026 hvm_funcs.update_guest_cr3 = vmx_update_guest_cr3;
1028 hvm_funcs.update_vtpr = vmx_update_vtpr;
1030 hvm_funcs.stts = vmx_stts;
1031 hvm_funcs.set_tsc_offset = vmx_set_tsc_offset;
1033 hvm_funcs.inject_exception = vmx_inject_exception;
1035 hvm_funcs.init_ap_context = vmx_init_ap_context;
1037 hvm_funcs.init_hypercall_page = vmx_init_hypercall_page;
1039 hvm_funcs.event_injection_faulted = vmx_event_injection_faulted;
1040 }
1042 int start_vmx(void)
1043 {
1044 u32 eax, edx;
1045 struct vmcs_struct *vmcs;
1047 /*
1048 * Xen does not fill x86_capability words except 0.
1049 */
1050 boot_cpu_data.x86_capability[4] = cpuid_ecx(1);
1052 if ( !test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability) )
1053 return 0;
1055 rdmsr(IA32_FEATURE_CONTROL_MSR, eax, edx);
1057 if ( eax & IA32_FEATURE_CONTROL_MSR_LOCK )
1058 {
1059 if ( (eax & IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON) == 0x0 )
1060 {
1061 printk("VMX disabled by Feature Control MSR.\n");
1062 return 0;
1063 }
1064 }
1065 else
1066 {
1067 wrmsr(IA32_FEATURE_CONTROL_MSR,
1068 IA32_FEATURE_CONTROL_MSR_LOCK |
1069 IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON, 0);
1070 }
1072 set_in_cr4(X86_CR4_VMXE);
1074 vmx_init_vmcs_config();
1076 if ( smp_processor_id() == 0 )
1077 setup_vmcs_dump();
1079 if ( (vmcs = vmx_alloc_host_vmcs()) == NULL )
1080 {
1081 clear_in_cr4(X86_CR4_VMXE);
1082 printk("Failed to allocate host VMCS\n");
1083 return 0;
1084 }
1086 if ( __vmxon(virt_to_maddr(vmcs)) )
1087 {
1088 clear_in_cr4(X86_CR4_VMXE);
1089 printk("VMXON failed\n");
1090 vmx_free_host_vmcs(vmcs);
1091 return 0;
1092 }
1094 printk("VMXON is done\n");
1096 vmx_save_host_msrs();
1098 vmx_setup_hvm_funcs();
1100 hvm_enable();
1102 return 1;
1103 }
1105 /*
1106 * Not all cases receive a valid value in the VM-exit instruction length field.
1107 * Callers must know what they're doing!
1108 */
1109 static int __get_instruction_length(void)
1110 {
1111 int len;
1112 len = __vmread(VM_EXIT_INSTRUCTION_LEN); /* Safe: callers audited */
1113 BUG_ON((len < 1) || (len > 15));
1114 return len;
1115 }
1117 static void inline __update_guest_eip(unsigned long inst_len)
1118 {
1119 unsigned long current_eip;
1121 current_eip = __vmread(GUEST_RIP);
1122 __vmwrite(GUEST_RIP, current_eip + inst_len);
1123 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
1124 }
1126 static int vmx_do_page_fault(unsigned long va, struct cpu_user_regs *regs)
1128 int result;
1130 #if 0 /* keep for debugging */
1132 unsigned long eip, cs;
1134 cs = __vmread(GUEST_CS_BASE);
1135 eip = __vmread(GUEST_RIP);
1136 HVM_DBG_LOG(DBG_LEVEL_VMMU,
1137 "vmx_do_page_fault = 0x%lx, cs_base=%lx, "
1138 "eip = %lx, error_code = %lx\n",
1139 va, cs, eip, (unsigned long)regs->error_code);
1141 #endif
1143 result = paging_fault(va, regs);
1145 TRACE_VMEXIT(2, result);
1146 #if 0
1147 if ( !result )
1149 eip = __vmread(GUEST_RIP);
1150 printk("vmx pgfault to guest va=%lx eip=%lx\n", va, eip);
1152 #endif
1154 return result;
1157 static void vmx_do_no_device_fault(void)
1159 struct vcpu *v = current;
1161 setup_fpu(current);
1162 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
1164 /* Disable TS in guest CR0 unless the guest wants the exception too. */
1165 if ( !(v->arch.hvm_vmx.cpu_shadow_cr0 & X86_CR0_TS) )
1167 v->arch.hvm_vmx.cpu_cr0 &= ~X86_CR0_TS;
1168 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
1172 #define bitmaskof(idx) (1U << ((idx) & 31))
1173 static void vmx_do_cpuid(struct cpu_user_regs *regs)
1175 unsigned int input = (unsigned int)regs->eax;
1176 unsigned int count = (unsigned int)regs->ecx;
1177 unsigned int eax, ebx, ecx, edx;
1179 if ( input == 0x00000004 )
1181 cpuid_count(input, count, &eax, &ebx, &ecx, &edx);
1182 eax &= NUM_CORES_RESET_MASK;
1184 else if ( input == 0x40000003 )
1186 /*
1187 * NB. Unsupported interface for private use of VMXASSIST only.
1188 * Note that this leaf lives at <max-hypervisor-leaf> + 1.
1189 */
1190 u64 value = ((u64)regs->edx << 32) | (u32)regs->ecx;
1191 unsigned long mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
1192 struct vcpu *v = current;
1193 char *p;
1195 gdprintk(XENLOG_INFO, "Input address is 0x%"PRIx64".\n", value);
1197 /* 8-byte aligned valid pseudophys address from vmxassist, please. */
1198 if ( (value & 7) || (mfn == INVALID_MFN) ||
1199 !v->arch.hvm_vmx.vmxassist_enabled )
1201 domain_crash(v->domain);
1202 return;
1205 p = map_domain_page(mfn);
1206 value = *((uint64_t *)(p + (value & (PAGE_SIZE - 1))));
1207 unmap_domain_page(p);
1209 gdprintk(XENLOG_INFO, "Output value is 0x%"PRIx64".\n", value);
1210 ecx = (u32)value;
1211 edx = (u32)(value >> 32);
1212 } else {
1213 hvm_cpuid(input, &eax, &ebx, &ecx, &edx);
1215 if ( input == 0x00000001 )
1217 /* Mask off reserved bits. */
1218 ecx &= ~VMX_VCPU_CPUID_L1_ECX_RESERVED;
1220 ebx &= NUM_THREADS_RESET_MASK;
1222 /* Unsupportable for virtualised CPUs. */
1223 ecx &= ~(bitmaskof(X86_FEATURE_VMXE) |
1224 bitmaskof(X86_FEATURE_EST) |
1225 bitmaskof(X86_FEATURE_TM2) |
1226 bitmaskof(X86_FEATURE_CID));
1228 edx &= ~(bitmaskof(X86_FEATURE_HT) |
1229 bitmaskof(X86_FEATURE_ACPI) |
1230 bitmaskof(X86_FEATURE_ACC));
1233 if ( input == 0x00000006 || input == 0x00000009 || input == 0x0000000A )
1234 eax = ebx = ecx = edx = 0x0;
1237 regs->eax = (unsigned long)eax;
1238 regs->ebx = (unsigned long)ebx;
1239 regs->ecx = (unsigned long)ecx;
1240 regs->edx = (unsigned long)edx;
1243 #define CASE_GET_REG_P(REG, reg) \
1244 case REG_ ## REG: reg_p = (unsigned long *)&(regs->reg); break
1246 #ifdef __i386__
1247 #define CASE_EXTEND_GET_REG_P
1248 #else
1249 #define CASE_EXTEND_GET_REG_P \
1250 CASE_GET_REG_P(R8, r8); \
1251 CASE_GET_REG_P(R9, r9); \
1252 CASE_GET_REG_P(R10, r10); \
1253 CASE_GET_REG_P(R11, r11); \
1254 CASE_GET_REG_P(R12, r12); \
1255 CASE_GET_REG_P(R13, r13); \
1256 CASE_GET_REG_P(R14, r14); \
1257 CASE_GET_REG_P(R15, r15)
1258 #endif
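/*
 * Illustrative expansion: CASE_GET_REG_P(EAX, eax) becomes
 *
 *   case REG_EAX: reg_p = (unsigned long *)&(regs->eax); break;
 *
 * and CASE_EXTEND_GET_REG_P adds the same cases for r8-r15 on x86_64 builds
 * (it is empty on i386).
 */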
1260 static void vmx_dr_access(unsigned long exit_qualification,
1261 struct cpu_user_regs *regs)
1263 struct vcpu *v = current;
1265 v->arch.hvm_vcpu.flag_dr_dirty = 1;
1267 /* We could probably be smarter about this */
1268 __restore_debug_registers(v);
1270 /* Allow guest direct access to DR registers */
1271 v->arch.hvm_vcpu.u.vmx.exec_control &= ~CPU_BASED_MOV_DR_EXITING;
1272 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
1273 v->arch.hvm_vcpu.u.vmx.exec_control);
1276 /*
1277 * Invalidate the TLB entry for va, and the shadow page corresponding
1278 * to the address va.
1279 */
1280 static void vmx_do_invlpg(unsigned long va)
1282 unsigned long eip;
1283 struct vcpu *v = current;
1285 eip = __vmread(GUEST_RIP);
1287 HVM_DBG_LOG(DBG_LEVEL_VMMU, "eip=%lx, va=%lx",
1288 eip, va);
1290 /*
1291 * We do the safest thing first, then try to update the shadow,
1292 * copying from the guest.
1293 */
1294 paging_invlpg(v, va);
1298 static int vmx_check_descriptor(int long_mode, unsigned long eip, int inst_len,
1299 enum x86_segment seg, unsigned long *base,
1300 u32 *limit, u32 *ar_bytes)
1302 enum vmcs_field ar_field, base_field, limit_field;
1304 *base = 0;
1305 *limit = 0;
1306 if ( seg != x86_seg_es )
1308 unsigned char inst[MAX_INST_LEN];
1309 int i;
1310 extern int inst_copy_from_guest(unsigned char *, unsigned long, int);
1312 if ( !long_mode )
1313 eip += __vmread(GUEST_CS_BASE);
1314 memset(inst, 0, MAX_INST_LEN);
1315 if ( inst_copy_from_guest(inst, eip, inst_len) != inst_len )
1317 gdprintk(XENLOG_ERR, "Get guest instruction failed\n");
1318 domain_crash(current->domain);
1319 return 0;
1322 for ( i = 0; i < inst_len; i++ )
1324 switch ( inst[i] )
1326 case 0xf3: /* REPZ */
1327 case 0xf2: /* REPNZ */
1328 case 0xf0: /* LOCK */
1329 case 0x66: /* data32 */
1330 case 0x67: /* addr32 */
1331 #ifdef __x86_64__
1332 case 0x40 ... 0x4f: /* REX */
1333 #endif
1334 continue;
1335 case 0x2e: /* CS */
1336 seg = x86_seg_cs;
1337 continue;
1338 case 0x36: /* SS */
1339 seg = x86_seg_ss;
1340 continue;
1341 case 0x26: /* ES */
1342 seg = x86_seg_es;
1343 continue;
1344 case 0x64: /* FS */
1345 seg = x86_seg_fs;
1346 continue;
1347 case 0x65: /* GS */
1348 seg = x86_seg_gs;
1349 continue;
1350 case 0x3e: /* DS */
1351 seg = x86_seg_ds;
1352 continue;
1357 switch ( seg )
1359 case x86_seg_cs:
1360 ar_field = GUEST_CS_AR_BYTES;
1361 base_field = GUEST_CS_BASE;
1362 limit_field = GUEST_CS_LIMIT;
1363 break;
1364 case x86_seg_ds:
1365 ar_field = GUEST_DS_AR_BYTES;
1366 base_field = GUEST_DS_BASE;
1367 limit_field = GUEST_DS_LIMIT;
1368 break;
1369 case x86_seg_es:
1370 ar_field = GUEST_ES_AR_BYTES;
1371 base_field = GUEST_ES_BASE;
1372 limit_field = GUEST_ES_LIMIT;
1373 break;
1374 case x86_seg_fs:
1375 ar_field = GUEST_FS_AR_BYTES;
1376 base_field = GUEST_FS_BASE;
1377 limit_field = GUEST_FS_LIMIT;
1378 break;
1379 case x86_seg_gs:
1380 ar_field = GUEST_GS_AR_BYTES;
1381 base_field = GUEST_GS_BASE;
1382 limit_field = GUEST_GS_LIMIT;
1383 break;
1384 case x86_seg_ss:
1385 ar_field = GUEST_SS_AR_BYTES;
1386 base_field = GUEST_SS_BASE;
1387 limit_field = GUEST_SS_LIMIT;
1388 break;
1389 default:
1390 BUG();
1391 return 0;
1394 if ( !long_mode || seg == x86_seg_fs || seg == x86_seg_gs )
1396 *base = __vmread(base_field);
1397 *limit = __vmread(limit_field);
1399 *ar_bytes = __vmread(ar_field);
1401 return !(*ar_bytes & 0x10000);
1402 }
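/*
 * Bit 16 of the VMX access-rights field is the "segment unusable" flag, so
 * the return value above is non-zero exactly when the selected segment is
 * usable (and hence the linear address computed from it is meaningful).
 */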
1404 static void vmx_io_instruction(unsigned long exit_qualification,
1405 unsigned long inst_len)
1407 struct cpu_user_regs *regs;
1408 struct hvm_io_op *pio_opp;
1409 unsigned int port, size;
1410 int dir, df, vm86;
1412 pio_opp = &current->arch.hvm_vcpu.io_op;
1413 pio_opp->instr = INSTR_PIO;
1414 pio_opp->flags = 0;
1416 regs = &pio_opp->io_context;
1418 /* Copy current guest state into io instruction state structure. */
1419 memcpy(regs, guest_cpu_user_regs(), HVM_CONTEXT_STACK_BYTES);
1420 hvm_store_cpu_guest_regs(current, regs, NULL);
1422 vm86 = regs->eflags & X86_EFLAGS_VM ? 1 : 0;
1423 df = regs->eflags & X86_EFLAGS_DF ? 1 : 0;
1425 HVM_DBG_LOG(DBG_LEVEL_IO, "vm86 %d, eip=%x:%lx, "
1426 "exit_qualification = %lx",
1427 vm86, regs->cs, (unsigned long)regs->eip, exit_qualification);
1429 if ( test_bit(6, &exit_qualification) )
1430 port = (exit_qualification >> 16) & 0xFFFF;
1431 else
1432 port = regs->edx & 0xffff;
1434 TRACE_VMEXIT(1, port);
1436 size = (exit_qualification & 7) + 1;
1437 dir = test_bit(3, &exit_qualification); /* direction */
1439 if ( test_bit(4, &exit_qualification) ) { /* string instruction */
1440 unsigned long addr, count = 1, base;
1441 paddr_t paddr;
1442 unsigned long gfn;
1443 u32 ar_bytes, limit;
1444 int sign = regs->eflags & X86_EFLAGS_DF ? -1 : 1;
1445 int long_mode = 0;
1447 ar_bytes = __vmread(GUEST_CS_AR_BYTES);
1448 #ifdef __x86_64__
1449 if ( vmx_long_mode_enabled(current) && (ar_bytes & (1u<<13)) )
1450 long_mode = 1;
1451 #endif
1452 addr = __vmread(GUEST_LINEAR_ADDRESS);
1454 if ( test_bit(5, &exit_qualification) ) { /* "rep" prefix */
1455 pio_opp->flags |= REPZ;
1456 count = regs->ecx;
1457 if ( !long_mode && (vm86 || !(ar_bytes & (1u<<14))) )
1458 count &= 0xFFFF;
1461 /*
1462 * In protected mode, guest linear address is invalid if the
1463 * selector is null.
1464 */
1465 if ( !vmx_check_descriptor(long_mode, regs->eip, inst_len,
1466 dir==IOREQ_WRITE ? x86_seg_ds : x86_seg_es,
1467 &base, &limit, &ar_bytes) ) {
1468 if ( !long_mode ) {
1469 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1470 return;
1472 addr = dir == IOREQ_WRITE ? base + regs->esi : regs->edi;
1475 if ( !long_mode ) {
1476 unsigned long ea = addr - base;
1478 /* Segment must be readable for outs and writeable for ins. */
1479 if ( dir == IOREQ_WRITE ? (ar_bytes & 0xa) == 0x8
1480 : (ar_bytes & 0xa) != 0x2 ) {
1481 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1482 return;
1485 /* Offset must be within limits. */
1486 ASSERT(ea == (u32)ea);
1487 if ( (u32)(ea + size - 1) < (u32)ea ||
1488 (ar_bytes & 0xc) != 0x4 ? ea + size - 1 > limit
1489 : ea <= limit )
1491 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1492 return;
1495 /* Check the limit for repeated instructions, as above we checked
1496 only the first instance. Truncate the count if a limit violation
1497 would occur. Note that the checking is not necessary for page
1498 granular segments as transfers crossing page boundaries will be
1499 broken up anyway. */
1500 if ( !(ar_bytes & (1u<<15)) && count > 1 )
1502 if ( (ar_bytes & 0xc) != 0x4 )
1504 /* expand-up */
1505 if ( !df )
1507 if ( ea + count * size - 1 < ea ||
1508 ea + count * size - 1 > limit )
1509 count = (limit + 1UL - ea) / size;
1511 else
1513 if ( count - 1 > ea / size )
1514 count = ea / size + 1;
1517 else
1519 /* expand-down */
1520 if ( !df )
1522 if ( count - 1 > -(s32)ea / size )
1523 count = -(s32)ea / size + 1UL;
1525 else
1527 if ( ea < (count - 1) * size ||
1528 ea - (count - 1) * size <= limit )
1529 count = (ea - limit - 1) / size + 1;
1532 ASSERT(count);
1535 #ifdef __x86_64__
1536 else
1538 if ( !is_canonical_address(addr) ||
1539 !is_canonical_address(addr + size - 1) )
1541 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1542 return;
1544 if ( count > (1UL << 48) / size )
1545 count = (1UL << 48) / size;
1546 if ( !(regs->eflags & EF_DF) )
1548 if ( addr + count * size - 1 < addr ||
1549 !is_canonical_address(addr + count * size - 1) )
1550 count = (addr & ~((1UL << 48) - 1)) / size;
1552 else
1554 if ( (count - 1) * size > addr ||
1555 !is_canonical_address(addr + (count - 1) * size) )
1556 count = (addr & ~((1UL << 48) - 1)) / size + 1;
1558 ASSERT(count);
1560 #endif
1562 /* Translate the address to a physical address */
1563 gfn = paging_gva_to_gfn(current, addr);
1564 if ( gfn == INVALID_GFN )
1566 /* The guest does not have the RAM address mapped;
1567 * we need to inject a page fault. */
1568 int errcode = 0;
1569 /* IO read --> memory write */
1570 if ( dir == IOREQ_READ ) errcode |= PFEC_write_access;
1571 vmx_inject_exception(TRAP_page_fault, errcode, addr);
1572 return;
1574 paddr = (paddr_t)gfn << PAGE_SHIFT | (addr & ~PAGE_MASK);
1576 /*
1577 * Handle string pio instructions that cross pages or that
1578 * are unaligned. See the comments in hvm_domain.c/handle_mmio()
1579 */
1580 if ( (addr & PAGE_MASK) != ((addr + size - 1) & PAGE_MASK) ) {
1581 unsigned long value = 0;
1583 pio_opp->flags |= OVERLAP;
1585 if ( dir == IOREQ_WRITE ) /* OUTS */
1587 if ( hvm_paging_enabled(current) )
1589 int rv = hvm_copy_from_guest_virt(&value, addr, size);
1590 if ( rv != 0 )
1592 /* Failed on the page-spanning copy. Inject PF into
1593 * the guest for the address where we failed. */
1594 addr += size - rv;
1595 gdprintk(XENLOG_DEBUG, "Pagefault reading non-io side "
1596 "of a page-spanning PIO: va=%#lx\n", addr);
1597 vmx_inject_exception(TRAP_page_fault, 0, addr);
1598 return;
1601 else
1602 (void) hvm_copy_from_guest_phys(&value, addr, size);
1603 } else /* dir != IOREQ_WRITE */
1604 /* Remember where to write the result, as a *VA*.
1605 * Must be a VA so we can handle the page overlap
1606 * correctly in hvm_pio_assist() */
1607 pio_opp->addr = addr;
1609 if ( count == 1 )
1610 regs->eip += inst_len;
1612 send_pio_req(port, 1, size, value, dir, df, 0);
1613 } else {
1614 unsigned long last_addr = sign > 0 ? addr + count * size - 1
1615 : addr - (count - 1) * size;
1617 if ( (addr & PAGE_MASK) != (last_addr & PAGE_MASK) )
1619 if ( sign > 0 )
1620 count = (PAGE_SIZE - (addr & ~PAGE_MASK)) / size;
1621 else
1622 count = (addr & ~PAGE_MASK) / size + 1;
1623 } else
1624 regs->eip += inst_len;
1626 send_pio_req(port, count, size, paddr, dir, df, 1);
1628 } else {
1629 if ( port == 0xe9 && dir == IOREQ_WRITE && size == 1 )
1630 hvm_print_line(current, regs->eax); /* guest debug output */
1632 if ( dir == IOREQ_WRITE )
1633 TRACE_VMEXIT(2, regs->eax);
1635 regs->eip += inst_len;
1636 send_pio_req(port, 1, size, regs->eax, dir, df, 0);
1640 static void vmx_world_save(struct vcpu *v, struct vmx_assist_context *c)
1642 /* NB. Skip transition instruction. */
1643 c->eip = __vmread(GUEST_RIP);
1644 c->eip += __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
1646 c->esp = __vmread(GUEST_RSP);
1647 c->eflags = __vmread(GUEST_RFLAGS);
1649 c->cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;
1650 c->cr3 = v->arch.hvm_vmx.cpu_cr3;
1651 c->cr4 = v->arch.hvm_vmx.cpu_shadow_cr4;
1653 c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
1654 c->idtr_base = __vmread(GUEST_IDTR_BASE);
1656 c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
1657 c->gdtr_base = __vmread(GUEST_GDTR_BASE);
1659 c->cs_sel = __vmread(GUEST_CS_SELECTOR);
1660 c->cs_limit = __vmread(GUEST_CS_LIMIT);
1661 c->cs_base = __vmread(GUEST_CS_BASE);
1662 c->cs_arbytes.bytes = __vmread(GUEST_CS_AR_BYTES);
1664 c->ds_sel = __vmread(GUEST_DS_SELECTOR);
1665 c->ds_limit = __vmread(GUEST_DS_LIMIT);
1666 c->ds_base = __vmread(GUEST_DS_BASE);
1667 c->ds_arbytes.bytes = __vmread(GUEST_DS_AR_BYTES);
1669 c->es_sel = __vmread(GUEST_ES_SELECTOR);
1670 c->es_limit = __vmread(GUEST_ES_LIMIT);
1671 c->es_base = __vmread(GUEST_ES_BASE);
1672 c->es_arbytes.bytes = __vmread(GUEST_ES_AR_BYTES);
1674 c->ss_sel = __vmread(GUEST_SS_SELECTOR);
1675 c->ss_limit = __vmread(GUEST_SS_LIMIT);
1676 c->ss_base = __vmread(GUEST_SS_BASE);
1677 c->ss_arbytes.bytes = __vmread(GUEST_SS_AR_BYTES);
1679 c->fs_sel = __vmread(GUEST_FS_SELECTOR);
1680 c->fs_limit = __vmread(GUEST_FS_LIMIT);
1681 c->fs_base = __vmread(GUEST_FS_BASE);
1682 c->fs_arbytes.bytes = __vmread(GUEST_FS_AR_BYTES);
1684 c->gs_sel = __vmread(GUEST_GS_SELECTOR);
1685 c->gs_limit = __vmread(GUEST_GS_LIMIT);
1686 c->gs_base = __vmread(GUEST_GS_BASE);
1687 c->gs_arbytes.bytes = __vmread(GUEST_GS_AR_BYTES);
1689 c->tr_sel = __vmread(GUEST_TR_SELECTOR);
1690 c->tr_limit = __vmread(GUEST_TR_LIMIT);
1691 c->tr_base = __vmread(GUEST_TR_BASE);
1692 c->tr_arbytes.bytes = __vmread(GUEST_TR_AR_BYTES);
1694 c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR);
1695 c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT);
1696 c->ldtr_base = __vmread(GUEST_LDTR_BASE);
1697 c->ldtr_arbytes.bytes = __vmread(GUEST_LDTR_AR_BYTES);
1700 static int vmx_world_restore(struct vcpu *v, struct vmx_assist_context *c)
1702 unsigned long mfn, old_base_mfn;
1704 __vmwrite(GUEST_RIP, c->eip);
1705 __vmwrite(GUEST_RSP, c->esp);
1706 __vmwrite(GUEST_RFLAGS, c->eflags);
1708 v->arch.hvm_vmx.cpu_shadow_cr0 = c->cr0;
1709 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
1711 if ( !vmx_paging_enabled(v) )
1712 goto skip_cr3;
1714 if ( c->cr3 == v->arch.hvm_vmx.cpu_cr3 )
1716 /*
1717 * This is a simple TLB flush, implying the guest has
1718 * removed some translation or changed page attributes.
1719 * We simply invalidate the shadow.
1720 */
1721 mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
1722 if ( mfn != pagetable_get_pfn(v->arch.guest_table) )
1723 goto bad_cr3;
1725 else
1727 /*
1728 * If different, make a shadow. Check if the PDBR is valid
1729 * first.
1730 */
1731 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %x", c->cr3);
1732 mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
1733 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
1734 goto bad_cr3;
1735 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1736 v->arch.guest_table = pagetable_from_pfn(mfn);
1737 if (old_base_mfn)
1738 put_page(mfn_to_page(old_base_mfn));
1739 v->arch.hvm_vmx.cpu_cr3 = c->cr3;
1742 skip_cr3:
1743 if ( !vmx_paging_enabled(v) )
1744 HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
1745 else
1746 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3);
1748 __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
1749 v->arch.hvm_vmx.cpu_shadow_cr4 = c->cr4;
1750 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);
1752 __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
1753 __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
1755 __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
1756 __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
1758 __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
1759 __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
1760 __vmwrite(GUEST_CS_BASE, c->cs_base);
1761 __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes.bytes);
1763 __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
1764 __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
1765 __vmwrite(GUEST_DS_BASE, c->ds_base);
1766 __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes.bytes);
1768 __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
1769 __vmwrite(GUEST_ES_LIMIT, c->es_limit);
1770 __vmwrite(GUEST_ES_BASE, c->es_base);
1771 __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes.bytes);
1773 __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
1774 __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
1775 __vmwrite(GUEST_SS_BASE, c->ss_base);
1776 __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes.bytes);
1778 __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
1779 __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
1780 __vmwrite(GUEST_FS_BASE, c->fs_base);
1781 __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes.bytes);
1783 __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
1784 __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
1785 __vmwrite(GUEST_GS_BASE, c->gs_base);
1786 __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes.bytes);
1788 __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
1789 __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
1790 __vmwrite(GUEST_TR_BASE, c->tr_base);
1791 __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes.bytes);
1793 __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
1794 __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
1795 __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
1796 __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes.bytes);
1798 paging_update_paging_modes(v);
1799 return 0;
1801 bad_cr3:
1802 gdprintk(XENLOG_ERR, "Invalid CR3 value=%x", c->cr3);
1803 return -EINVAL;
1806 enum { VMX_ASSIST_INVOKE = 0, VMX_ASSIST_RESTORE };
1808 static int vmx_assist(struct vcpu *v, int mode)
1810 struct vmx_assist_context c;
1811 u32 magic;
1812 u32 cp;
1814 /* make sure vmxassist exists (this is not an error) */
1815 if (hvm_copy_from_guest_phys(&magic, VMXASSIST_MAGIC_OFFSET,
1816 sizeof(magic)))
1817 return 0;
1818 if (magic != VMXASSIST_MAGIC)
1819 return 0;
1821 switch (mode) {
1822 /*
1823 * Transfer control to vmxassist.
1824 * Store the current context in VMXASSIST_OLD_CONTEXT and load
1825 * the new VMXASSIST_NEW_CONTEXT context. This context was created
1826 * by vmxassist and will transfer control to it.
1827 */
1828 case VMX_ASSIST_INVOKE:
1829 /* save the old context */
1830 if (hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)))
1831 goto error;
1832 if (cp != 0) {
1833 vmx_world_save(v, &c);
1834 if (hvm_copy_to_guest_phys(cp, &c, sizeof(c)))
1835 goto error;
1838 /* restore the new context, this should activate vmxassist */
1839 if (hvm_copy_from_guest_phys(&cp, VMXASSIST_NEW_CONTEXT, sizeof(cp)))
1840 goto error;
1841 if (cp != 0) {
1842 if (hvm_copy_from_guest_phys(&c, cp, sizeof(c)))
1843 goto error;
1844 if ( vmx_world_restore(v, &c) != 0 )
1845 goto error;
1846 v->arch.hvm_vmx.vmxassist_enabled = 1;
1847 return 1;
1849 break;
1851 /*
1852 * Restore the VMXASSIST_OLD_CONTEXT that was saved by
1853 * VMX_ASSIST_INVOKE above.
1854 */
1855 case VMX_ASSIST_RESTORE:
1856 /* save the old context */
1857 if (hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)))
1858 goto error;
1859 if (cp != 0) {
1860 if (hvm_copy_from_guest_phys(&c, cp, sizeof(c)))
1861 goto error;
1862 if ( vmx_world_restore(v, &c) != 0 )
1863 goto error;
1864 v->arch.hvm_vmx.vmxassist_enabled = 0;
1865 return 1;
1867 break;
1870 error:
1871 gdprintk(XENLOG_ERR, "Failed to transfer to vmxassist\n");
1872 domain_crash(v->domain);
1873 return 0;
1876 static int vmx_set_cr0(unsigned long value)
1878 struct vcpu *v = current;
1879 unsigned long mfn;
1880 unsigned long eip;
1881 int paging_enabled;
1882 unsigned long vm_entry_value;
1883 unsigned long old_cr0;
1884 unsigned long old_base_mfn;
1886 /*
1887 * CR0: We don't want to lose PE and PG.
1888 */
1889 old_cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;
1890 paging_enabled = (old_cr0 & X86_CR0_PE) && (old_cr0 & X86_CR0_PG);
1892 /* TS cleared? Then initialise FPU now. */
1893 if ( !(value & X86_CR0_TS) )
1895 setup_fpu(v);
1896 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
1899 v->arch.hvm_vmx.cpu_cr0 = (value | X86_CR0_PE | X86_CR0_PG
1900 | X86_CR0_NE | X86_CR0_WP);
1901 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
1903 v->arch.hvm_vmx.cpu_shadow_cr0 = value;
1904 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
1906 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx\n", value);
1908 if ( (value & X86_CR0_PE) && (value & X86_CR0_PG) && !paging_enabled )
1910 /*
1911 * Trying to enable guest paging.
1912 * The guest CR3 must be pointing to the guest physical.
1913 */
1914 mfn = get_mfn_from_gpfn(v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT);
1915 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
1917 gdprintk(XENLOG_ERR, "Invalid CR3 value = %lx (mfn=%lx)\n",
1918 v->arch.hvm_vmx.cpu_cr3, mfn);
1919 domain_crash(v->domain);
1920 return 0;
1923 #if defined(__x86_64__)
1924 if ( vmx_lme_is_set(v) )
1926 if ( !(v->arch.hvm_vmx.cpu_shadow_cr4 & X86_CR4_PAE) )
1928 HVM_DBG_LOG(DBG_LEVEL_1, "Guest enabled paging "
1929 "with EFER.LME set but not CR4.PAE\n");
1930 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
1932 else
1934 HVM_DBG_LOG(DBG_LEVEL_1, "Enabling long mode\n");
1935 v->arch.hvm_vmx.msr_state.msrs[VMX_INDEX_MSR_EFER]
1936 |= EFER_LMA;
1937 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
1938 vm_entry_value |= VM_ENTRY_IA32E_MODE;
1939 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1942 #endif
1944 /*
1945 * Now arch.guest_table points to machine physical.
1946 */
1947 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1948 v->arch.guest_table = pagetable_from_pfn(mfn);
1949 if (old_base_mfn)
1950 put_page(mfn_to_page(old_base_mfn));
1951 paging_update_paging_modes(v);
1953 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
1954 (unsigned long) (mfn << PAGE_SHIFT));
1956 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
1957 v->arch.hvm_vmx.cpu_cr3, mfn);
1960 if ( !((value & X86_CR0_PE) && (value & X86_CR0_PG)) && paging_enabled )
1961 if ( v->arch.hvm_vmx.cpu_cr3 ) {
1962 put_page(mfn_to_page(get_mfn_from_gpfn(
1963 v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)));
1964 v->arch.guest_table = pagetable_null();
1967 /*
1968 * VMX does not implement real-mode virtualization. We emulate
1969 * real-mode by performing a world switch to VMXAssist whenever
1970 * a partition disables the CR0.PE bit.
1971 */
1972 if ( (value & X86_CR0_PE) == 0 )
1974 if ( value & X86_CR0_PG ) {
1975 /* inject GP here */
1976 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
1977 return 0;
1978 } else {
1979 /*
1980 * Disable paging here.
1981 * Same as the PE == 1 && PG == 0 case.
1982 */
1983 if ( vmx_long_mode_enabled(v) )
1985 v->arch.hvm_vmx.msr_state.msrs[VMX_INDEX_MSR_EFER]
1986 &= ~EFER_LMA;
1987 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
1988 vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
1989 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1993 if ( vmx_assist(v, VMX_ASSIST_INVOKE) )
1995 eip = __vmread(GUEST_RIP);
1996 HVM_DBG_LOG(DBG_LEVEL_1,
1997 "Transfering control to vmxassist %%eip 0x%lx\n", eip);
1998 return 0; /* do not update eip! */
2001 else if ( v->arch.hvm_vmx.vmxassist_enabled )
2003 eip = __vmread(GUEST_RIP);
2004 HVM_DBG_LOG(DBG_LEVEL_1,
2005 "Enabling CR0.PE at %%eip 0x%lx\n", eip);
2006 if ( vmx_assist(v, VMX_ASSIST_RESTORE) )
2008 eip = __vmread(GUEST_RIP);
2009 HVM_DBG_LOG(DBG_LEVEL_1,
2010 "Restoring to %%eip 0x%lx\n", eip);
2011 return 0; /* do not update eip! */
2014 else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
2016 if ( vmx_long_mode_enabled(v) )
2018 v->arch.hvm_vmx.msr_state.msrs[VMX_INDEX_MSR_EFER] &= ~EFER_LMA;
2019 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
2020 vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
2021 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
2023 paging_update_paging_modes(v);
2026 return 1;
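/*
 * Illustrative sketch (editor's addition, not part of vmx.c): the CR0
 * transitions handled above reduce to a few (PE, PG) cases. The helper
 * below is a standalone, hypothetical restatement of that decision using
 * toy "sk_" names; it models none of the VMCS or page-table work.
 */
#include <stdint.h>

#define SK_CR0_PE (1u << 0)
#define SK_CR0_PG (1u << 31)

enum sk_cr0_action {
    SK_ENABLE_PAGING,   /* PE and PG both set, paging was off      */
    SK_INVALID_GP,      /* PG set while PE clear: inject #GP       */
    SK_ENTER_REAL_MODE, /* PE cleared: world-switch to VMXAssist   */
    SK_LEAVE_PAGING,    /* PE set, PG clear: drop long mode (LMA)  */
    SK_NO_TRANSITION
};

static enum sk_cr0_action sk_classify_cr0_write(uint32_t new_cr0,
                                                int paging_was_enabled)
{
    int pe = !!(new_cr0 & SK_CR0_PE);
    int pg = !!(new_cr0 & SK_CR0_PG);

    if ( !pe && pg )
        return SK_INVALID_GP;
    if ( !pe )
        return SK_ENTER_REAL_MODE;
    if ( pg && !paging_was_enabled )
        return SK_ENABLE_PAGING;
    if ( !pg )
        return SK_LEAVE_PAGING;
    return SK_NO_TRANSITION;
}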
2029 #define CASE_SET_REG(REG, reg) \
2030 case REG_ ## REG: regs->reg = value; break
2031 #define CASE_GET_REG(REG, reg) \
2032 case REG_ ## REG: value = regs->reg; break
2034 #define CASE_EXTEND_SET_REG \
2035 CASE_EXTEND_REG(S)
2036 #define CASE_EXTEND_GET_REG \
2037 CASE_EXTEND_REG(G)
2039 #ifdef __i386__
2040 #define CASE_EXTEND_REG(T)
2041 #else
2042 #define CASE_EXTEND_REG(T) \
2043 CASE_ ## T ## ET_REG(R8, r8); \
2044 CASE_ ## T ## ET_REG(R9, r9); \
2045 CASE_ ## T ## ET_REG(R10, r10); \
2046 CASE_ ## T ## ET_REG(R11, r11); \
2047 CASE_ ## T ## ET_REG(R12, r12); \
2048 CASE_ ## T ## ET_REG(R13, r13); \
2049 CASE_ ## T ## ET_REG(R14, r14); \
2050 CASE_ ## T ## ET_REG(R15, r15)
2051 #endif
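/*
 * Illustrative sketch (editor's addition, not part of vmx.c): how the
 * CASE_*_REG token-pasting macros above expand. With the toy register
 * file below, TOY_CASE_GET_REG(EAX, eax) becomes
 * "case TOY_REG_EAX: value = regs->eax; break;". All names are toys.
 */
#include <assert.h>

struct toy_regs { unsigned long eax, ebx; };
enum { TOY_REG_EAX, TOY_REG_EBX };

#define TOY_CASE_GET_REG(REG, reg) \
    case TOY_REG_ ## REG: value = regs->reg; break

static unsigned long toy_get_reg(int gp, const struct toy_regs *regs)
{
    unsigned long value = 0;
    switch ( gp )
    {
        TOY_CASE_GET_REG(EAX, eax);  /* case TOY_REG_EAX: value = regs->eax; */
        TOY_CASE_GET_REG(EBX, ebx);
    }
    return value;
}

static void toy_get_reg_selftest(void)
{
    struct toy_regs r = { .eax = 0x1234, .ebx = 0x5678 };
    assert(toy_get_reg(TOY_REG_EAX, &r) == 0x1234);
    assert(toy_get_reg(TOY_REG_EBX, &r) == 0x5678);
}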
2053 /*
2054 * Write to control registers
2055 */
2056 static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
2058 unsigned long value, old_cr, old_base_mfn, mfn;
2059 struct vcpu *v = current;
2060 struct vlapic *vlapic = vcpu_vlapic(v);
2062 switch ( gp )
2064 CASE_GET_REG(EAX, eax);
2065 CASE_GET_REG(ECX, ecx);
2066 CASE_GET_REG(EDX, edx);
2067 CASE_GET_REG(EBX, ebx);
2068 CASE_GET_REG(EBP, ebp);
2069 CASE_GET_REG(ESI, esi);
2070 CASE_GET_REG(EDI, edi);
2071 CASE_EXTEND_GET_REG;
2072 case REG_ESP:
2073 value = __vmread(GUEST_RSP);
2074 break;
2075 default:
2076 gdprintk(XENLOG_ERR, "invalid gp: %d\n", gp);
2077 goto exit_and_crash;
2080 TRACE_VMEXIT(1, TYPE_MOV_TO_CR);
2081 TRACE_VMEXIT(2, cr);
2082 TRACE_VMEXIT(3, value);
2084 HVM_DBG_LOG(DBG_LEVEL_1, "CR%d, value = %lx", cr, value);
2086 switch ( cr )
2088 case 0:
2089 return vmx_set_cr0(value);
2091 case 3:
2092 /*
2093 * If paging is not enabled yet, simply copy the value to CR3.
2094 */
2095 if (!vmx_paging_enabled(v)) {
2096 v->arch.hvm_vmx.cpu_cr3 = value;
2097 break;
2100 /*
2101 * An unchanged CR3 means a TLB flush; a new CR3 needs a new shadow.
2102 */
2103 if (value == v->arch.hvm_vmx.cpu_cr3) {
2104 /*
2105 * This is simple TLB flush, implying the guest has
2106 * removed some translation or changed page attributes.
2107 * We simply invalidate the shadow.
2108 */
2109 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
2110 if (mfn != pagetable_get_pfn(v->arch.guest_table))
2111 goto bad_cr3;
2112 paging_update_cr3(v);
2113 } else {
2114 /*
2115 * If different, make a shadow. Check if the PDBR is valid
2116 * first.
2117 */
2118 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
2119 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
2120 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
2121 goto bad_cr3;
2122 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
2123 v->arch.guest_table = pagetable_from_pfn(mfn);
2124 if (old_base_mfn)
2125 put_page(mfn_to_page(old_base_mfn));
2126 v->arch.hvm_vmx.cpu_cr3 = value;
2127 update_cr3(v);
2128 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
2130 break;
2132 case 4: /* CR4 */
2133 old_cr = v->arch.hvm_vmx.cpu_shadow_cr4;
2135 if ( (value & X86_CR4_PAE) && !(old_cr & X86_CR4_PAE) )
2137 if ( vmx_pgbit_test(v) )
2139 /* The guest is a 32-bit PAE guest. */
2140 #if CONFIG_PAGING_LEVELS >= 3
2141 unsigned long mfn, old_base_mfn;
2142 mfn = get_mfn_from_gpfn(v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT);
2143 if ( !mfn_valid(mfn) ||
2144 !get_page(mfn_to_page(mfn), v->domain) )
2145 goto bad_cr3;
2147 /*
2148 * Point arch.guest_table at the machine frame backing the guest CR3.
2149 */
2151 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
2152 v->arch.guest_table = pagetable_from_pfn(mfn);
2153 if ( old_base_mfn )
2154 put_page(mfn_to_page(old_base_mfn));
2156 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
2157 (unsigned long) (mfn << PAGE_SHIFT));
2159 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2160 "Update CR3 value = %lx, mfn = %lx",
2161 v->arch.hvm_vmx.cpu_cr3, mfn);
2162 #endif
2165 else if ( !(value & X86_CR4_PAE) )
2167 if ( unlikely(vmx_long_mode_enabled(v)) )
2169 HVM_DBG_LOG(DBG_LEVEL_1, "Guest cleared CR4.PAE while "
2170 "EFER.LMA is set\n");
2171 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2175 __vmwrite(GUEST_CR4, value | VMX_CR4_HOST_MASK);
2176 v->arch.hvm_vmx.cpu_shadow_cr4 = value;
2177 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);
2179 /*
2180 * Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates
2181 * all TLB entries except global entries.
2182 */
2183 if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
2184 paging_update_paging_modes(v);
2185 break;
2187 case 8:
2188 vlapic_set_reg(vlapic, APIC_TASKPRI, ((value & 0x0F) << 4));
2189 break;
2191 default:
2192 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
2193 domain_crash(v->domain);
2194 return 0;
2197 return 1;
2199 bad_cr3:
2200 gdprintk(XENLOG_ERR, "Invalid CR3\n");
2201 exit_and_crash:
2202 domain_crash(v->domain);
2203 return 0;
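/*
 * Illustrative sketch (editor's addition, not part of vmx.c): the guest
 * CR3 write handled above splits into three cases. Hypothetical "sk_"
 * names; no shadow-pagetable work is modelled.
 */
enum sk_cr3_action {
    SK_CR3_CACHE_ONLY,  /* guest paging off: just remember the value   */
    SK_CR3_TLB_FLUSH,   /* same CR3 reloaded: flush, keep the shadow   */
    SK_CR3_NEW_SHADOW   /* different CR3: validate and build a shadow  */
};

static enum sk_cr3_action sk_classify_cr3_write(unsigned long new_cr3,
                                                unsigned long cur_cr3,
                                                int paging_enabled)
{
    if ( !paging_enabled )
        return SK_CR3_CACHE_ONLY;
    return (new_cr3 == cur_cr3) ? SK_CR3_TLB_FLUSH : SK_CR3_NEW_SHADOW;
}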
2206 /*
2207 * Read from control registers. CR0 and CR4 are read from the shadow.
2208 */
2209 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
2211 unsigned long value = 0;
2212 struct vcpu *v = current;
2213 struct vlapic *vlapic = vcpu_vlapic(v);
2215 switch ( cr )
2217 case 3:
2218 value = (unsigned long)v->arch.hvm_vmx.cpu_cr3;
2219 break;
2220 case 8:
2221 value = (unsigned long)vlapic_get_reg(vlapic, APIC_TASKPRI);
2222 value = (value & 0xF0) >> 4;
2223 break;
2224 default:
2225 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
2226 domain_crash(v->domain);
2227 break;
2230 switch ( gp ) {
2231 CASE_SET_REG(EAX, eax);
2232 CASE_SET_REG(ECX, ecx);
2233 CASE_SET_REG(EDX, edx);
2234 CASE_SET_REG(EBX, ebx);
2235 CASE_SET_REG(EBP, ebp);
2236 CASE_SET_REG(ESI, esi);
2237 CASE_SET_REG(EDI, edi);
2238 CASE_EXTEND_SET_REG;
2239 case REG_ESP:
2240 __vmwrite(GUEST_RSP, value);
2241 regs->esp = value;
2242 break;
2243 default:
2244 printk("invalid gp: %d\n", gp);
2245 domain_crash(v->domain);
2246 break;
2249 TRACE_VMEXIT(1, TYPE_MOV_FROM_CR);
2250 TRACE_VMEXIT(2, cr);
2251 TRACE_VMEXIT(3, value);
2253 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR%d, value = %lx", cr, value);
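/*
 * Illustrative sketch (editor's addition, not part of vmx.c): the
 * CR8 <-> APIC TPR mapping used in mov_to_cr()/mov_from_cr() above.
 * CR8 holds the priority class in bits 3:0; the TPR keeps it in bits
 * 7:4, so e.g. CR8 = 0x9 maps to TPR = 0x90 and back. "sk_" helpers
 * are hypothetical.
 */
#include <stdint.h>
#include <assert.h>

static uint32_t sk_cr8_to_tpr(uint64_t cr8) { return (uint32_t)((cr8 & 0x0F) << 4); }
static uint64_t sk_tpr_to_cr8(uint32_t tpr) { return (tpr & 0xF0) >> 4; }

static void sk_cr8_selftest(void)
{
    assert(sk_cr8_to_tpr(0x9) == 0x90);
    assert(sk_tpr_to_cr8(0x90) == 0x9);
    assert(sk_tpr_to_cr8(sk_cr8_to_tpr(0xF)) == 0xF);
}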
2256 static int vmx_cr_access(unsigned long exit_qualification,
2257 struct cpu_user_regs *regs)
2259 unsigned int gp, cr;
2260 unsigned long value;
2261 struct vcpu *v = current;
2263 switch (exit_qualification & CONTROL_REG_ACCESS_TYPE) {
2264 case TYPE_MOV_TO_CR:
2265 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
2266 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
2267 return mov_to_cr(gp, cr, regs);
2268 case TYPE_MOV_FROM_CR:
2269 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
2270 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
2271 mov_from_cr(cr, gp, regs);
2272 break;
2273 case TYPE_CLTS:
2274 TRACE_VMEXIT(1, TYPE_CLTS);
2276 /* We initialise the FPU now, to avoid needing another vmexit. */
2277 setup_fpu(v);
2278 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
2280 v->arch.hvm_vmx.cpu_cr0 &= ~X86_CR0_TS; /* clear TS */
2281 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
2283 v->arch.hvm_vmx.cpu_shadow_cr0 &= ~X86_CR0_TS; /* clear TS */
2284 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
2285 break;
2286 case TYPE_LMSW:
2287 value = v->arch.hvm_vmx.cpu_shadow_cr0;
2288 value = (value & ~0xF) |
2289 (((exit_qualification & LMSW_SOURCE_DATA) >> 16) & 0xF);
2290 TRACE_VMEXIT(1, TYPE_LMSW);
2291 TRACE_VMEXIT(2, value);
2292 return vmx_set_cr0(value);
2293 break;
2294 default:
2295 BUG();
2298 return 1;
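/*
 * Illustrative sketch (editor's addition, not part of vmx.c): how the
 * LMSW handler above merges the four bits LMSW may change (PE, MP, EM,
 * TS) into the current CR0 read shadow. The VMX exit qualification
 * carries the LMSW source operand in bits 31:16. "sk_" names are
 * hypothetical.
 */
#include <stdint.h>
#include <assert.h>

static unsigned long sk_lmsw_merge(unsigned long shadow_cr0,
                                   unsigned long exit_qualification)
{
    unsigned long src = (exit_qualification >> 16) & 0xF; /* PE|MP|EM|TS */
    return (shadow_cr0 & ~0xFUL) | src;
}

static void sk_lmsw_selftest(void)
{
    /* LMSW source 0x0001: low nibble of CR0 becomes just PE. */
    assert(sk_lmsw_merge(0x8000003BUL, 0x0001UL << 16) == 0x80000031UL);
}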
2301 static inline int vmx_do_msr_read(struct cpu_user_regs *regs)
2303 u64 msr_content = 0;
2304 u32 ecx = regs->ecx, eax, edx;
2305 struct vcpu *v = current;
2307 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x, eax=%x, edx=%x",
2308 ecx, (u32)regs->eax, (u32)regs->edx);
2310 switch (ecx) {
2311 case MSR_IA32_TIME_STAMP_COUNTER:
2312 msr_content = hvm_get_guest_time(v);
2313 break;
2314 case MSR_IA32_SYSENTER_CS:
2315 msr_content = (u32)__vmread(GUEST_SYSENTER_CS);
2316 break;
2317 case MSR_IA32_SYSENTER_ESP:
2318 msr_content = __vmread(GUEST_SYSENTER_ESP);
2319 break;
2320 case MSR_IA32_SYSENTER_EIP:
2321 msr_content = __vmread(GUEST_SYSENTER_EIP);
2322 break;
2323 case MSR_IA32_APICBASE:
2324 msr_content = vcpu_vlapic(v)->hw.apic_base_msr;
2325 break;
2326 default:
2327 if ( long_mode_do_msr_read(regs) )
2328 goto done;
2330 if ( rdmsr_hypervisor_regs(ecx, &eax, &edx) ||
2331 rdmsr_safe(ecx, eax, edx) == 0 )
2333 regs->eax = eax;
2334 regs->edx = edx;
2335 goto done;
2337 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2338 return 0;
2341 regs->eax = msr_content & 0xFFFFFFFF;
2342 regs->edx = msr_content >> 32;
2344 done:
2345 HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx",
2346 ecx, (unsigned long)regs->eax,
2347 (unsigned long)regs->edx);
2348 return 1;
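/*
 * Illustrative sketch (editor's addition, not part of vmx.c): RDMSR
 * returns the 64-bit MSR value split across EDX:EAX, and WRMSR
 * recombines it, exactly as done above and in vmx_do_msr_write() below.
 * "sk_" helpers are hypothetical.
 */
#include <stdint.h>
#include <assert.h>

static void sk_msr_split(uint64_t msr_content, uint32_t *eax, uint32_t *edx)
{
    *eax = (uint32_t)(msr_content & 0xFFFFFFFFu);
    *edx = (uint32_t)(msr_content >> 32);
}

static uint64_t sk_msr_combine(uint32_t eax, uint32_t edx)
{
    return (uint64_t)eax | ((uint64_t)edx << 32);
}

static void sk_msr_selftest(void)
{
    uint32_t eax, edx;
    sk_msr_split(0x0123456789ABCDEFull, &eax, &edx);
    assert(eax == 0x89ABCDEFu && edx == 0x01234567u);
    assert(sk_msr_combine(eax, edx) == 0x0123456789ABCDEFull);
}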
2351 static inline int vmx_do_msr_write(struct cpu_user_regs *regs)
2353 u32 ecx = regs->ecx;
2354 u64 msr_content;
2355 struct vcpu *v = current;
2357 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x, eax=%x, edx=%x",
2358 ecx, (u32)regs->eax, (u32)regs->edx);
2360 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
2362 switch (ecx) {
2363 case MSR_IA32_TIME_STAMP_COUNTER:
2364 hvm_set_guest_time(v, msr_content);
2365 pt_reset(v);
2366 break;
2367 case MSR_IA32_SYSENTER_CS:
2368 __vmwrite(GUEST_SYSENTER_CS, msr_content);
2369 break;
2370 case MSR_IA32_SYSENTER_ESP:
2371 __vmwrite(GUEST_SYSENTER_ESP, msr_content);
2372 break;
2373 case MSR_IA32_SYSENTER_EIP:
2374 __vmwrite(GUEST_SYSENTER_EIP, msr_content);
2375 break;
2376 case MSR_IA32_APICBASE:
2377 vlapic_msr_set(vcpu_vlapic(v), msr_content);
2378 break;
2379 default:
2380 if ( !long_mode_do_msr_write(regs) )
2381 wrmsr_hypervisor_regs(ecx, regs->eax, regs->edx);
2382 break;
2385 return 1;
2388 static void vmx_do_hlt(void)
2390 unsigned long rflags;
2391 rflags = __vmread(GUEST_RFLAGS);
2392 hvm_hlt(rflags);
2395 static inline void vmx_do_extint(struct cpu_user_regs *regs)
2397 unsigned int vector;
2399 asmlinkage void do_IRQ(struct cpu_user_regs *);
2400 fastcall void smp_apic_timer_interrupt(struct cpu_user_regs *);
2401 fastcall void smp_event_check_interrupt(void);
2402 fastcall void smp_invalidate_interrupt(void);
2403 fastcall void smp_call_function_interrupt(void);
2404 fastcall void smp_spurious_interrupt(struct cpu_user_regs *regs);
2405 fastcall void smp_error_interrupt(struct cpu_user_regs *regs);
2406 #ifdef CONFIG_X86_MCE_P4THERMAL
2407 fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs);
2408 #endif
2410 vector = __vmread(VM_EXIT_INTR_INFO);
2411 BUG_ON(!(vector & INTR_INFO_VALID_MASK));
2413 vector &= INTR_INFO_VECTOR_MASK;
2414 TRACE_VMEXIT(1, vector);
2416 switch(vector) {
2417 case LOCAL_TIMER_VECTOR:
2418 smp_apic_timer_interrupt(regs);
2419 break;
2420 case EVENT_CHECK_VECTOR:
2421 smp_event_check_interrupt();
2422 break;
2423 case INVALIDATE_TLB_VECTOR:
2424 smp_invalidate_interrupt();
2425 break;
2426 case CALL_FUNCTION_VECTOR:
2427 smp_call_function_interrupt();
2428 break;
2429 case SPURIOUS_APIC_VECTOR:
2430 smp_spurious_interrupt(regs);
2431 break;
2432 case ERROR_APIC_VECTOR:
2433 smp_error_interrupt(regs);
2434 break;
2435 #ifdef CONFIG_X86_MCE_P4THERMAL
2436 case THERMAL_APIC_VECTOR:
2437 smp_thermal_interrupt(regs);
2438 break;
2439 #endif
2440 default:
2441 regs->entry_vector = vector;
2442 do_IRQ(regs);
2443 break;
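/*
 * Illustrative sketch (editor's addition, not part of vmx.c): layout of
 * the VM-exit interruption-information field read above. Bits 7:0 hold
 * the vector, bits 10:8 the type, bit 11 the error-code-valid flag, and
 * bit 31 the validity flag. Struct and names are hypothetical stand-ins.
 */
#include <stdint.h>

struct sk_intr_info {
    unsigned int vector;      /* bits 7:0  */
    unsigned int type;        /* bits 10:8 */
    int error_code_valid;     /* bit 11    */
    int valid;                /* bit 31    */
};

static struct sk_intr_info sk_decode_intr_info(uint32_t raw)
{
    struct sk_intr_info i;
    i.vector           = raw & 0xFFu;
    i.type             = (raw >> 8) & 0x7u;
    i.error_code_valid = (raw >> 11) & 0x1u;
    i.valid            = (raw >> 31) & 0x1u;
    return i;
}
/* Example: 0x80000030 -> valid external interrupt (type 0), vector 0x30. */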
2447 #if defined (__x86_64__)
2448 void store_cpu_user_regs(struct cpu_user_regs *regs)
2450 regs->ss = __vmread(GUEST_SS_SELECTOR);
2451 regs->rsp = __vmread(GUEST_RSP);
2452 regs->rflags = __vmread(GUEST_RFLAGS);
2453 regs->cs = __vmread(GUEST_CS_SELECTOR);
2454 regs->ds = __vmread(GUEST_DS_SELECTOR);
2455 regs->es = __vmread(GUEST_ES_SELECTOR);
2456 regs->rip = __vmread(GUEST_RIP);
2458 #elif defined (__i386__)
2459 void store_cpu_user_regs(struct cpu_user_regs *regs)
2461 regs->ss = __vmread(GUEST_SS_SELECTOR);
2462 regs->esp = __vmread(GUEST_RSP);
2463 regs->eflags = __vmread(GUEST_RFLAGS);
2464 regs->cs = __vmread(GUEST_CS_SELECTOR);
2465 regs->ds = __vmread(GUEST_DS_SELECTOR);
2466 regs->es = __vmread(GUEST_ES_SELECTOR);
2467 regs->eip = __vmread(GUEST_RIP);
2469 #endif
2471 #ifdef XEN_DEBUGGER
2472 void save_cpu_user_regs(struct cpu_user_regs *regs)
2474 regs->xss = __vmread(GUEST_SS_SELECTOR);
2475 regs->esp = __vmread(GUEST_RSP);
2476 regs->eflags = __vmread(GUEST_RFLAGS);
2477 regs->xcs = __vmread(GUEST_CS_SELECTOR);
2478 regs->eip = __vmread(GUEST_RIP);
2480 regs->xgs = __vmread(GUEST_GS_SELECTOR);
2481 regs->xfs = __vmread(GUEST_FS_SELECTOR);
2482 regs->xes = __vmread(GUEST_ES_SELECTOR);
2483 regs->xds = __vmread(GUEST_DS_SELECTOR);
2486 void restore_cpu_user_regs(struct cpu_user_regs *regs)
2488 __vmwrite(GUEST_SS_SELECTOR, regs->xss);
2489 __vmwrite(GUEST_RSP, regs->esp);
2490 __vmwrite(GUEST_RFLAGS, regs->eflags);
2491 __vmwrite(GUEST_CS_SELECTOR, regs->xcs);
2492 __vmwrite(GUEST_RIP, regs->eip);
2494 __vmwrite(GUEST_GS_SELECTOR, regs->xgs);
2495 __vmwrite(GUEST_FS_SELECTOR, regs->xfs);
2496 __vmwrite(GUEST_ES_SELECTOR, regs->xes);
2497 __vmwrite(GUEST_DS_SELECTOR, regs->xds);
2499 #endif
2501 static void vmx_reflect_exception(struct vcpu *v)
2503 int error_code, intr_info, vector;
2505 intr_info = __vmread(VM_EXIT_INTR_INFO);
2506 vector = intr_info & 0xff;
2507 if ( intr_info & INTR_INFO_DELIVER_CODE_MASK )
2508 error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
2509 else
2510 error_code = VMX_DELIVER_NO_ERROR_CODE;
2512 #ifndef NDEBUG
2514 unsigned long rip;
2516 rip = __vmread(GUEST_RIP);
2517 HVM_DBG_LOG(DBG_LEVEL_1, "rip = %lx, error_code = %x",
2518 rip, error_code);
2520 #endif /* NDEBUG */
2522 /*
2523 * According to Intel Virtualization Technology Specification for
2524 * the IA-32 Intel Architecture (C97063-002 April 2005), section
2525 * 2.8.3, SW_EXCEPTION should be used for #BP and #OF, and
2526 * HW_EXCEPTION used for everything else. The main difference
2527 * appears to be that for SW_EXCEPTION, the EIP/RIP is incremented
2528 * by VM_ENTER_INSTRUCTION_LEN bytes, whereas for HW_EXCEPTION,
2529 * it is not.
2530 */
2531 if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_SW_EXCEPTION )
2533 int ilen = __get_instruction_length(); /* Safe: software exception */
2534 vmx_inject_sw_exception(v, vector, ilen);
2536 else
2538 vmx_inject_hw_exception(v, vector, error_code);
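/*
 * Illustrative sketch (editor's addition, not part of vmx.c): the
 * distinction drawn in the comment above. #BP (vector 3) and #OF
 * (vector 4) are delivered as software exceptions, which advance RIP by
 * the instruction length; everything else is re-injected as a hardware
 * exception. Toy helper with hypothetical names.
 */
#define SK_TRAP_int3     3
#define SK_TRAP_overflow 4

static int sk_is_sw_exception(int vector)
{
    return (vector == SK_TRAP_int3) || (vector == SK_TRAP_overflow);
}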
2542 static void vmx_failed_vmentry(unsigned int exit_reason)
2544 unsigned int failed_vmentry_reason = (uint16_t)exit_reason;
2545 unsigned long exit_qualification;
2547 exit_qualification = __vmread(EXIT_QUALIFICATION);
2548 printk("Failed vm entry (exit reason 0x%x) ", exit_reason);
2549 switch ( failed_vmentry_reason )
2551 case EXIT_REASON_INVALID_GUEST_STATE:
2552 printk("caused by invalid guest state (%ld).\n", exit_qualification);
2553 break;
2554 case EXIT_REASON_MSR_LOADING:
2555 printk("caused by MSR entry %ld loading.\n", exit_qualification);
2556 break;
2557 case EXIT_REASON_MACHINE_CHECK:
2558 printk("caused by machine check.\n");
2559 break;
2560 default:
2561 printk("reason not known yet!");
2562 break;
2565 printk("************* VMCS Area **************\n");
2566 vmcs_dump_vcpu();
2567 printk("**************************************\n");
2569 domain_crash(current->domain);
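/*
 * Illustrative sketch (editor's addition, not part of vmx.c): the basic
 * exit reason lives in the low 16 bits of VM_EXIT_REASON; bit 31 flags a
 * VM-entry failure, which is what the (uint16_t) cast above strips off.
 * The "sk_" mask name is a hypothetical stand-in.
 */
#include <stdint.h>

#define SK_EXIT_REASON_FAILED_VMENTRY (1u << 31)

static uint16_t sk_basic_exit_reason(uint32_t exit_reason)
{
    return (uint16_t)exit_reason;   /* low 16 bits only */
}

static int sk_is_failed_vmentry(uint32_t exit_reason)
{
    return (exit_reason & SK_EXIT_REASON_FAILED_VMENTRY) != 0;
}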
2572 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
2574 unsigned int exit_reason;
2575 unsigned long exit_qualification, inst_len = 0;
2576 struct vcpu *v = current;
2578 TRACE_3D(TRC_VMX_VMEXIT + v->vcpu_id, 0, 0, 0);
2580 exit_reason = __vmread(VM_EXIT_REASON);
2582 perfc_incra(vmexits, exit_reason);
2583 TRACE_VMEXIT(0, exit_reason);
2585 if ( exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT )
2586 local_irq_enable();
2588 if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
2589 return vmx_failed_vmentry(exit_reason);
2591 switch ( exit_reason )
2593 case EXIT_REASON_EXCEPTION_NMI:
2595 /*
2596 * We do not enable software-interrupt exiting (INT n), so this
2597 * exit means either (1) an exception (e.g. #PF) occurred in the
2598 * guest, or (2) an NMI was delivered.
2599 */
2600 unsigned int intr_info, vector;
2602 intr_info = __vmread(VM_EXIT_INTR_INFO);
2603 BUG_ON(!(intr_info & INTR_INFO_VALID_MASK));
2605 vector = intr_info & INTR_INFO_VECTOR_MASK;
2607 TRACE_VMEXIT(1, vector);
2608 perfc_incra(cause_vector, vector);
2610 switch ( vector )
2612 #ifdef XEN_DEBUGGER
2613 case TRAP_debug:
2615 save_cpu_user_regs(regs);
2616 pdb_handle_exception(1, regs, 1);
2617 restore_cpu_user_regs(regs);
2618 break;
2620 case TRAP_int3:
2622 save_cpu_user_regs(regs);
2623 pdb_handle_exception(3, regs, 1);
2624 restore_cpu_user_regs(regs);
2625 break;
2627 #else
2628 case TRAP_debug:
2630 if ( test_bit(_DOMF_debugging, &v->domain->domain_flags) )
2632 store_cpu_user_regs(regs);
2633 domain_pause_for_debugger();
2634 __vm_clear_bit(GUEST_PENDING_DBG_EXCEPTIONS,
2635 PENDING_DEBUG_EXC_BS);
2637 else
2639 vmx_reflect_exception(v);
2640 __vm_clear_bit(GUEST_PENDING_DBG_EXCEPTIONS,
2641 PENDING_DEBUG_EXC_BS);
2644 break;
2646 case TRAP_int3:
2648 if ( test_bit(_DOMF_debugging, &v->domain->domain_flags) )
2649 domain_pause_for_debugger();
2650 else
2651 vmx_reflect_exception(v);
2652 break;
2654 #endif
2655 case TRAP_no_device:
2657 vmx_do_no_device_fault();
2658 break;
2660 case TRAP_page_fault:
2662 exit_qualification = __vmread(EXIT_QUALIFICATION);
2663 regs->error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
2665 TRACE_VMEXIT(3, regs->error_code);
2666 TRACE_VMEXIT(4, exit_qualification);
2668 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2669 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
2670 (unsigned long)regs->eax, (unsigned long)regs->ebx,
2671 (unsigned long)regs->ecx, (unsigned long)regs->edx,
2672 (unsigned long)regs->esi, (unsigned long)regs->edi);
2674 if ( !vmx_do_page_fault(exit_qualification, regs) )
2676 /* Inject #PG using Interruption-Information Fields. */
2677 vmx_inject_hw_exception(v, TRAP_page_fault, regs->error_code);
2678 v->arch.hvm_vmx.cpu_cr2 = exit_qualification;
2679 TRACE_3D(TRC_VMX_INTR, v->domain->domain_id,
2680 TRAP_page_fault, exit_qualification);
2682 break;
2684 case TRAP_nmi:
2685 if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI )
2686 do_nmi(regs); /* Real NMI, vector 2: normal processing. */
2687 else
2688 vmx_reflect_exception(v);
2689 break;
2690 default:
2691 vmx_reflect_exception(v);
2692 break;
2694 break;
2696 case EXIT_REASON_EXTERNAL_INTERRUPT:
2697 vmx_do_extint(regs);
2698 break;
2699 case EXIT_REASON_TRIPLE_FAULT:
2700 hvm_triple_fault();
2701 break;
2702 case EXIT_REASON_PENDING_INTERRUPT:
2703 /* Disable the interrupt window. */
2704 v->arch.hvm_vcpu.u.vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2705 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2706 v->arch.hvm_vcpu.u.vmx.exec_control);
2707 break;
2708 case EXIT_REASON_TASK_SWITCH:
2709 goto exit_and_crash;
2710 case EXIT_REASON_CPUID:
2711 inst_len = __get_instruction_length(); /* Safe: CPUID */
2712 __update_guest_eip(inst_len);
2713 vmx_do_cpuid(regs);
2714 break;
2715 case EXIT_REASON_HLT:
2716 inst_len = __get_instruction_length(); /* Safe: HLT */
2717 __update_guest_eip(inst_len);
2718 vmx_do_hlt();
2719 break;
2720 case EXIT_REASON_INVLPG:
2722 inst_len = __get_instruction_length(); /* Safe: INVLPG */
2723 __update_guest_eip(inst_len);
2724 exit_qualification = __vmread(EXIT_QUALIFICATION);
2725 vmx_do_invlpg(exit_qualification);
2726 TRACE_VMEXIT(4, exit_qualification);
2727 break;
2729 case EXIT_REASON_VMCALL:
2731 inst_len = __get_instruction_length(); /* Safe: VMCALL */
2732 __update_guest_eip(inst_len);
2733 hvm_do_hypercall(regs);
2734 break;
2736 case EXIT_REASON_CR_ACCESS:
2738 exit_qualification = __vmread(EXIT_QUALIFICATION);
2739 inst_len = __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
2740 if ( vmx_cr_access(exit_qualification, regs) )
2741 __update_guest_eip(inst_len);
2742 TRACE_VMEXIT(4, exit_qualification);
2743 break;
2745 case EXIT_REASON_DR_ACCESS:
2746 exit_qualification = __vmread(EXIT_QUALIFICATION);
2747 vmx_dr_access(exit_qualification, regs);
2748 break;
2749 case EXIT_REASON_IO_INSTRUCTION:
2750 exit_qualification = __vmread(EXIT_QUALIFICATION);
2751 inst_len = __get_instruction_length(); /* Safe: IN, INS, OUT, OUTS */
2752 vmx_io_instruction(exit_qualification, inst_len);
2753 TRACE_VMEXIT(4, exit_qualification);
2754 break;
2755 case EXIT_REASON_MSR_READ:
2756 inst_len = __get_instruction_length(); /* Safe: RDMSR */
2757 if ( vmx_do_msr_read(regs) )
2758 __update_guest_eip(inst_len);
2759 TRACE_VMEXIT(1, regs->ecx);
2760 TRACE_VMEXIT(2, regs->eax);
2761 TRACE_VMEXIT(3, regs->edx);
2762 break;
2763 case EXIT_REASON_MSR_WRITE:
2764 inst_len = __get_instruction_length(); /* Safe: WRMSR */
2765 if ( vmx_do_msr_write(regs) )
2766 __update_guest_eip(inst_len);
2767 TRACE_VMEXIT(1, regs->ecx);
2768 TRACE_VMEXIT(2, regs->eax);
2769 TRACE_VMEXIT(3, regs->edx);
2770 break;
2771 case EXIT_REASON_MWAIT_INSTRUCTION:
2772 case EXIT_REASON_MONITOR_INSTRUCTION:
2773 case EXIT_REASON_PAUSE_INSTRUCTION:
2774 goto exit_and_crash;
2775 case EXIT_REASON_VMCLEAR:
2776 case EXIT_REASON_VMLAUNCH:
2777 case EXIT_REASON_VMPTRLD:
2778 case EXIT_REASON_VMPTRST:
2779 case EXIT_REASON_VMREAD:
2780 case EXIT_REASON_VMRESUME:
2781 case EXIT_REASON_VMWRITE:
2782 case EXIT_REASON_VMXOFF:
2783 case EXIT_REASON_VMXON:
2784 /* Report an invalid-opcode exception when a VMX guest tries to
2785 * execute any of the VMX instructions. */
2786 vmx_inject_hw_exception(v, TRAP_invalid_op, VMX_DELIVER_NO_ERROR_CODE);
2787 break;
2789 case EXIT_REASON_TPR_BELOW_THRESHOLD:
2790 break;
2792 default:
2793 exit_and_crash:
2794 gdprintk(XENLOG_ERR, "Bad vmexit (reason %x)\n", exit_reason);
2795 domain_crash(v->domain);
2796 break;
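/*
 * Illustrative sketch (editor's addition, not part of vmx.c): the
 * dispatch pattern used above. Exits that emulate a completed
 * instruction (CPUID, HLT, RDMSR, ...) read the instruction length and
 * advance the guest RIP; faulting paths (e.g. a re-injected page fault)
 * leave RIP untouched so the guest retries. Hypothetical stand-in type.
 */
#include <stdint.h>

struct sk_guest_state { uint64_t rip; };

static void sk_complete_instruction(struct sk_guest_state *g,
                                    unsigned int inst_len)
{
    g->rip += inst_len;   /* e.g. 2 for CPUID, 1 for HLT */
}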
2800 asmlinkage void vmx_trace_vmentry(void)
2802 struct vcpu *v = current;
2803 TRACE_5D(TRC_VMX_VMENTRY + current->vcpu_id,
2804 v->arch.hvm_vcpu.hvm_trace_values[0],
2805 v->arch.hvm_vcpu.hvm_trace_values[1],
2806 v->arch.hvm_vcpu.hvm_trace_values[2],
2807 v->arch.hvm_vcpu.hvm_trace_values[3],
2808 v->arch.hvm_vcpu.hvm_trace_values[4]);
2810 TRACE_VMEXIT(0, 0);
2811 TRACE_VMEXIT(1, 0);
2812 TRACE_VMEXIT(2, 0);
2813 TRACE_VMEXIT(3, 0);
2814 TRACE_VMEXIT(4, 0);
2817 /*
2818 * Local variables:
2819 * mode: C
2820 * c-set-style: "BSD"
2821 * c-basic-offset: 4
2822 * tab-width: 4
2823 * indent-tabs-mode: nil
2824 * End:
2825 */