ia64/xen-unstable

view xen/arch/x86/hvm/vmx/vmx.c @ 14728:adf7f391be71

Fix comment.

Signed-off-by: Steven Hand <steven@xensource.com>

author    Steven Hand <steven@xensource.com>
date      Thu Apr 05 06:51:53 2007 +0100 (2007-04-05)
parents   f4318c89291a
children  1a347b19142a

line source
1 /*
2 * vmx.c: handling VMX architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 *
18 */
20 #include <xen/config.h>
21 #include <xen/init.h>
22 #include <xen/lib.h>
23 #include <xen/trace.h>
24 #include <xen/sched.h>
25 #include <xen/irq.h>
26 #include <xen/softirq.h>
27 #include <xen/domain_page.h>
28 #include <xen/hypercall.h>
29 #include <xen/perfc.h>
30 #include <asm/current.h>
31 #include <asm/io.h>
32 #include <asm/regs.h>
33 #include <asm/cpufeature.h>
34 #include <asm/processor.h>
35 #include <asm/types.h>
36 #include <asm/msr.h>
37 #include <asm/spinlock.h>
38 #include <asm/paging.h>
39 #include <asm/p2m.h>
40 #include <asm/hvm/hvm.h>
41 #include <asm/hvm/support.h>
42 #include <asm/hvm/vmx/vmx.h>
43 #include <asm/hvm/vmx/vmcs.h>
44 #include <asm/hvm/vmx/cpu.h>
45 #include <public/sched.h>
46 #include <public/hvm/ioreq.h>
47 #include <asm/hvm/vpic.h>
48 #include <asm/hvm/vlapic.h>
49 #include <asm/x86_emulate.h>
50 #include <asm/hvm/vpt.h>
51 #include <public/hvm/save.h>
52 #include <asm/hvm/trace.h>
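53 /* Global MSR intercept bitmap; allocated and initialised in start_vmx() when the CPU supports it. */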
54 char *vmx_msr_bitmap;
56 static void vmx_ctxt_switch_from(struct vcpu *v);
57 static void vmx_ctxt_switch_to(struct vcpu *v);
59 static int vmx_vcpu_initialise(struct vcpu *v)
60 {
61 int rc;
63 spin_lock_init(&v->arch.hvm_vmx.vmcs_lock);
65 v->arch.schedule_tail = vmx_do_resume;
66 v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
67 v->arch.ctxt_switch_to = vmx_ctxt_switch_to;
69 if ( (rc = vmx_create_vmcs(v)) != 0 )
70 {
71 dprintk(XENLOG_WARNING,
72 "Failed to create VMCS for vcpu %d: err=%d.\n",
73 v->vcpu_id, rc);
74 return rc;
75 }
77 return 0;
78 }
80 static void vmx_vcpu_destroy(struct vcpu *v)
81 {
82 vmx_destroy_vmcs(v);
83 }
85 #ifdef __x86_64__
87 static DEFINE_PER_CPU(struct vmx_msr_state, host_msr_state);
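88 /* The order of msr_index[] must match the VMX_INDEX_MSR_* indices used for msrs[] and flags. */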
89 static u32 msr_index[VMX_MSR_COUNT] =
90 {
91 MSR_LSTAR, MSR_STAR, MSR_CSTAR,
92 MSR_SYSCALL_MASK, MSR_EFER,
93 };
95 static void vmx_save_host_msrs(void)
96 {
97 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
98 int i;
100 for ( i = 0; i < VMX_MSR_COUNT; i++ )
101 rdmsrl(msr_index[i], host_msr_state->msrs[i]);
102 }
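103 /* Update the guest's saved copy of the MSR, mark it dirty in both guest and host flags, and write the value to hardware. Ends with 'break': only for use inside the switch statement below. */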
104 #define WRITE_MSR(address) \
105 guest_msr_state->msrs[VMX_INDEX_MSR_ ## address] = msr_content; \
106 if ( !test_bit(VMX_INDEX_MSR_ ## address, &guest_msr_state->flags) )\
107 set_bit(VMX_INDEX_MSR_ ## address, &guest_msr_state->flags); \
108 wrmsrl(MSR_ ## address, msr_content); \
109 set_bit(VMX_INDEX_MSR_ ## address, &host_msr_state->flags); \
110 break
112 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
113 {
114 u64 msr_content = 0;
115 struct vcpu *v = current;
116 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
118 switch ( (u32)regs->ecx ) {
119 case MSR_EFER:
120 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_EFER];
121 HVM_DBG_LOG(DBG_LEVEL_2, "EFER msr_content 0x%"PRIx64, msr_content);
122 break;
124 case MSR_FS_BASE:
125 msr_content = __vmread(GUEST_FS_BASE);
126 goto check_long_mode;
128 case MSR_GS_BASE:
129 msr_content = __vmread(GUEST_GS_BASE);
130 goto check_long_mode;
132 case MSR_SHADOW_GS_BASE:
133 msr_content = guest_msr_state->shadow_gs;
134 check_long_mode:
135 if ( !(vmx_long_mode_enabled(v)) )
136 {
137 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
138 return 0;
139 }
140 break;
142 case MSR_STAR:
143 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_STAR];
144 break;
146 case MSR_LSTAR:
147 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_LSTAR];
148 break;
150 case MSR_CSTAR:
151 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_CSTAR];
152 break;
154 case MSR_SYSCALL_MASK:
155 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
156 break;
158 default:
159 return 0;
160 }
162 HVM_DBG_LOG(DBG_LEVEL_2, "msr_content: 0x%"PRIx64, msr_content);
164 regs->eax = (u32)(msr_content >> 0);
165 regs->edx = (u32)(msr_content >> 32);
167 return 1;
168 }
170 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
171 {
172 u64 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
173 u32 ecx = regs->ecx;
174 struct vcpu *v = current;
175 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
176 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
178 HVM_DBG_LOG(DBG_LEVEL_1, "msr 0x%x msr_content 0x%"PRIx64"\n",
179 ecx, msr_content);
181 switch ( ecx )
182 {
183 case MSR_EFER:
184 /* offending reserved bit will cause #GP */
185 if ( msr_content & ~(EFER_LME | EFER_LMA | EFER_NX | EFER_SCE) )
186 {
187 gdprintk(XENLOG_WARNING, "Trying to set reserved bit in "
188 "EFER: %"PRIx64"\n", msr_content);
189 goto gp_fault;
190 }
192 if ( (msr_content & EFER_LME)
193 && !(guest_msr_state->msrs[VMX_INDEX_MSR_EFER] & EFER_LME) )
194 {
195 if ( unlikely(vmx_paging_enabled(v)) )
196 {
197 gdprintk(XENLOG_WARNING,
198 "Trying to set EFER.LME with paging enabled\n");
199 goto gp_fault;
200 }
201 }
202 else if ( !(msr_content & EFER_LME)
203 && (guest_msr_state->msrs[VMX_INDEX_MSR_EFER] & EFER_LME) )
204 {
205 if ( unlikely(vmx_paging_enabled(v)) )
206 {
207 gdprintk(XENLOG_WARNING,
208 "Trying to clear EFER.LME with paging enabled\n");
209 goto gp_fault;
210 }
211 }
213 guest_msr_state->msrs[VMX_INDEX_MSR_EFER] = msr_content;
214 break;
216 case MSR_FS_BASE:
217 case MSR_GS_BASE:
218 case MSR_SHADOW_GS_BASE:
219 if ( !vmx_long_mode_enabled(v) )
220 goto gp_fault;
222 if ( !is_canonical_address(msr_content) )
223 goto uncanonical_address;
225 if ( ecx == MSR_FS_BASE )
226 __vmwrite(GUEST_FS_BASE, msr_content);
227 else if ( ecx == MSR_GS_BASE )
228 __vmwrite(GUEST_GS_BASE, msr_content);
229 else
230 {
231 v->arch.hvm_vmx.msr_state.shadow_gs = msr_content;
232 wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
233 }
235 break;
237 case MSR_STAR:
238 WRITE_MSR(STAR);
240 case MSR_LSTAR:
241 if ( !is_canonical_address(msr_content) )
242 goto uncanonical_address;
243 WRITE_MSR(LSTAR);
245 case MSR_CSTAR:
246 if ( !is_canonical_address(msr_content) )
247 goto uncanonical_address;
248 WRITE_MSR(CSTAR);
250 case MSR_SYSCALL_MASK:
251 WRITE_MSR(SYSCALL_MASK);
253 default:
254 return 0;
255 }
257 return 1;
259 uncanonical_address:
260 HVM_DBG_LOG(DBG_LEVEL_1, "Not cano address of msr write %x\n", ecx);
261 gp_fault:
262 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
263 return 0;
264 }
266 /*
267 * To avoid MSR save/restore at every VM exit/entry time, we restore
268 * the x86_64 specific MSRs at domain switch time. Since these MSRs
269 * are not modified once set for para domains, we don't save them,
270 * but simply reset them to values set in percpu_traps_init().
271 */
272 static void vmx_restore_host_msrs(void)
273 {
274 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
275 int i;
277 while ( host_msr_state->flags )
278 {
279 i = find_first_set_bit(host_msr_state->flags);
280 wrmsrl(msr_index[i], host_msr_state->msrs[i]);
281 clear_bit(i, &host_msr_state->flags);
282 }
283 }
285 static void vmx_save_guest_msrs(struct vcpu *v)
286 {
287 /* MSR_SHADOW_GS_BASE may have been changed by swapgs instruction. */
288 rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.msr_state.shadow_gs);
289 }
291 static void vmx_restore_guest_msrs(struct vcpu *v)
292 {
293 struct vmx_msr_state *guest_msr_state, *host_msr_state;
294 unsigned long guest_flags;
295 int i;
297 guest_msr_state = &v->arch.hvm_vmx.msr_state;
298 host_msr_state = &this_cpu(host_msr_state);
300 wrmsrl(MSR_SHADOW_GS_BASE, guest_msr_state->shadow_gs);
302 guest_flags = guest_msr_state->flags;
303 if ( !guest_flags )
304 return;
306 while ( guest_flags ) {
307 i = find_first_set_bit(guest_flags);
309 HVM_DBG_LOG(DBG_LEVEL_2,
310 "restore guest's index %d msr %x with value %lx",
311 i, msr_index[i], guest_msr_state->msrs[i]);
312 set_bit(i, &host_msr_state->flags);
313 wrmsrl(msr_index[i], guest_msr_state->msrs[i]);
314 clear_bit(i, &guest_flags);
315 }
316 }
318 #else /* __i386__ */
320 #define vmx_save_host_msrs() ((void)0)
321 #define vmx_restore_host_msrs() ((void)0)
322 #define vmx_save_guest_msrs(v) ((void)0)
323 #define vmx_restore_guest_msrs(v) ((void)0)
325 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
326 {
327 return 0;
328 }
330 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
331 {
332 return 0;
333 }
335 #endif /* __i386__ */
337 #define loaddebug(_v,_reg) \
338 __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
339 #define savedebug(_v,_reg) \
340 __asm__ __volatile__ ("mov %%db" #_reg ",%0" : "=r" ((_v)->debugreg[_reg]))
342 static inline void vmx_save_dr(struct vcpu *v)
343 {
344 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
345 return;
347 /* Clear the DR dirty flag and re-enable intercepts for DR accesses. */
348 v->arch.hvm_vcpu.flag_dr_dirty = 0;
349 v->arch.hvm_vcpu.u.vmx.exec_control |= CPU_BASED_MOV_DR_EXITING;
350 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vcpu.u.vmx.exec_control);
352 savedebug(&v->arch.guest_context, 0);
353 savedebug(&v->arch.guest_context, 1);
354 savedebug(&v->arch.guest_context, 2);
355 savedebug(&v->arch.guest_context, 3);
356 savedebug(&v->arch.guest_context, 6);
357 v->arch.guest_context.debugreg[7] = __vmread(GUEST_DR7);
358 }
360 static inline void __restore_debug_registers(struct vcpu *v)
361 {
362 loaddebug(&v->arch.guest_context, 0);
363 loaddebug(&v->arch.guest_context, 1);
364 loaddebug(&v->arch.guest_context, 2);
365 loaddebug(&v->arch.guest_context, 3);
366 /* No DR4 or DR5: they are aliases of DR6 and DR7. */
367 loaddebug(&v->arch.guest_context, 6);
368 /* DR7 is loaded from the VMCS. */
369 }
371 int vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
372 {
373 c->eip = __vmread(GUEST_RIP);
374 c->esp = __vmread(GUEST_RSP);
375 c->eflags = __vmread(GUEST_RFLAGS);
377 c->cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;
378 c->cr3 = v->arch.hvm_vmx.cpu_cr3;
379 c->cr4 = v->arch.hvm_vmx.cpu_shadow_cr4;
381 #ifdef HVM_DEBUG_SUSPEND
382 printk("vmx_vmcs_save: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
383 c->cr3,
384 c->cr0,
385 c->cr4);
386 #endif
388 c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
389 c->idtr_base = __vmread(GUEST_IDTR_BASE);
391 c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
392 c->gdtr_base = __vmread(GUEST_GDTR_BASE);
394 c->cs_sel = __vmread(GUEST_CS_SELECTOR);
395 c->cs_limit = __vmread(GUEST_CS_LIMIT);
396 c->cs_base = __vmread(GUEST_CS_BASE);
397 c->cs_arbytes = __vmread(GUEST_CS_AR_BYTES);
399 c->ds_sel = __vmread(GUEST_DS_SELECTOR);
400 c->ds_limit = __vmread(GUEST_DS_LIMIT);
401 c->ds_base = __vmread(GUEST_DS_BASE);
402 c->ds_arbytes = __vmread(GUEST_DS_AR_BYTES);
404 c->es_sel = __vmread(GUEST_ES_SELECTOR);
405 c->es_limit = __vmread(GUEST_ES_LIMIT);
406 c->es_base = __vmread(GUEST_ES_BASE);
407 c->es_arbytes = __vmread(GUEST_ES_AR_BYTES);
409 c->ss_sel = __vmread(GUEST_SS_SELECTOR);
410 c->ss_limit = __vmread(GUEST_SS_LIMIT);
411 c->ss_base = __vmread(GUEST_SS_BASE);
412 c->ss_arbytes = __vmread(GUEST_SS_AR_BYTES);
414 c->fs_sel = __vmread(GUEST_FS_SELECTOR);
415 c->fs_limit = __vmread(GUEST_FS_LIMIT);
416 c->fs_base = __vmread(GUEST_FS_BASE);
417 c->fs_arbytes = __vmread(GUEST_FS_AR_BYTES);
419 c->gs_sel = __vmread(GUEST_GS_SELECTOR);
420 c->gs_limit = __vmread(GUEST_GS_LIMIT);
421 c->gs_base = __vmread(GUEST_GS_BASE);
422 c->gs_arbytes = __vmread(GUEST_GS_AR_BYTES);
424 c->tr_sel = __vmread(GUEST_TR_SELECTOR);
425 c->tr_limit = __vmread(GUEST_TR_LIMIT);
426 c->tr_base = __vmread(GUEST_TR_BASE);
427 c->tr_arbytes = __vmread(GUEST_TR_AR_BYTES);
429 c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR);
430 c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT);
431 c->ldtr_base = __vmread(GUEST_LDTR_BASE);
432 c->ldtr_arbytes = __vmread(GUEST_LDTR_AR_BYTES);
434 c->sysenter_cs = __vmread(GUEST_SYSENTER_CS);
435 c->sysenter_esp = __vmread(GUEST_SYSENTER_ESP);
436 c->sysenter_eip = __vmread(GUEST_SYSENTER_EIP);
438 return 1;
439 }
441 int vmx_vmcs_restore(struct vcpu *v, struct hvm_hw_cpu *c)
442 {
443 unsigned long mfn, old_base_mfn;
445 vmx_vmcs_enter(v);
447 __vmwrite(GUEST_RIP, c->eip);
448 __vmwrite(GUEST_RSP, c->esp);
449 __vmwrite(GUEST_RFLAGS, c->eflags);
451 v->arch.hvm_vmx.cpu_shadow_cr0 = c->cr0;
452 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
454 #ifdef HVM_DEBUG_SUSPEND
455 printk("vmx_vmcs_restore: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
456 c->cr3,
457 c->cr0,
458 c->cr4);
459 #endif
461 if (!vmx_paging_enabled(v)) {
462 printk("vmx_vmcs_restore: paging not enabled.");
463 goto skip_cr3;
464 }
466 if (c->cr3 == v->arch.hvm_vmx.cpu_cr3) {
467 /*
468 * This is a simple TLB flush, implying the guest has
469 * removed some translation or changed page attributes.
470 * We simply invalidate the shadow.
471 */
472 mfn = gmfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT);
473 if (mfn != pagetable_get_pfn(v->arch.guest_table)) {
474 goto bad_cr3;
475 }
476 } else {
477 /*
478 * If different, make a shadow. Check if the PDBR is valid
479 * first.
480 */
481 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %"PRIx64"", c->cr3);
482 /* current != v here, because this is not called from arch_vmx_do_launch. */
483 mfn = gmfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT);
484 if( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain)) {
485 goto bad_cr3;
486 }
487 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
488 v->arch.guest_table = pagetable_from_pfn(mfn);
489 if (old_base_mfn)
490 put_page(mfn_to_page(old_base_mfn));
491 v->arch.hvm_vmx.cpu_cr3 = c->cr3;
492 }
494 skip_cr3:
495 #if defined(__x86_64__)
496 if (vmx_long_mode_enabled(v)) {
497 unsigned long vm_entry_value;
498 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
499 vm_entry_value |= VM_ENTRY_IA32E_MODE;
500 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
501 }
502 #endif
504 __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
505 v->arch.hvm_vmx.cpu_shadow_cr4 = c->cr4;
506 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);
508 __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
509 __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
511 __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
512 __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
514 __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
515 __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
516 __vmwrite(GUEST_CS_BASE, c->cs_base);
517 __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes);
519 __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
520 __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
521 __vmwrite(GUEST_DS_BASE, c->ds_base);
522 __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes);
524 __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
525 __vmwrite(GUEST_ES_LIMIT, c->es_limit);
526 __vmwrite(GUEST_ES_BASE, c->es_base);
527 __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes);
529 __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
530 __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
531 __vmwrite(GUEST_SS_BASE, c->ss_base);
532 __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes);
534 __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
535 __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
536 __vmwrite(GUEST_FS_BASE, c->fs_base);
537 __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes);
539 __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
540 __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
541 __vmwrite(GUEST_GS_BASE, c->gs_base);
542 __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes);
544 __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
545 __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
546 __vmwrite(GUEST_TR_BASE, c->tr_base);
547 __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes);
549 __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
550 __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
551 __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
552 __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes);
554 __vmwrite(GUEST_SYSENTER_CS, c->sysenter_cs);
555 __vmwrite(GUEST_SYSENTER_ESP, c->sysenter_esp);
556 __vmwrite(GUEST_SYSENTER_EIP, c->sysenter_eip);
558 vmx_vmcs_exit(v);
560 paging_update_paging_modes(v);
561 return 0;
563 bad_cr3:
564 gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%"PRIx64"", c->cr3);
565 vmx_vmcs_exit(v);
566 return -EINVAL;
567 }
569 #ifdef HVM_DEBUG_SUSPEND
570 static void dump_msr_state(struct vmx_msr_state *m)
571 {
572 int i = 0;
573 printk("**** msr state ****\n");
574 printk("shadow_gs=0x%lx, flags=0x%lx, msr_items:", m->shadow_gs, m->flags);
575 for (i = 0; i < VMX_MSR_COUNT; i++)
576 printk("0x%lx,", m->msrs[i]);
577 printk("\n");
578 }
579 #else
580 static void dump_msr_state(struct vmx_msr_state *m)
581 {
582 }
583 #endif
585 void vmx_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
586 {
587 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
588 unsigned long guest_flags = guest_state->flags;
590 data->shadow_gs = guest_state->shadow_gs;
592 /* save msrs */
593 data->flags = guest_flags;
594 data->msr_lstar = guest_state->msrs[VMX_INDEX_MSR_LSTAR];
595 data->msr_star = guest_state->msrs[VMX_INDEX_MSR_STAR];
596 data->msr_cstar = guest_state->msrs[VMX_INDEX_MSR_CSTAR];
597 data->msr_syscall_mask = guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
598 data->msr_efer = guest_state->msrs[VMX_INDEX_MSR_EFER];
600 data->tsc = hvm_get_guest_time(v);
602 dump_msr_state(guest_state);
603 }
605 void vmx_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
606 {
607 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
609 /* restore msrs */
610 guest_state->flags = data->flags;
611 guest_state->msrs[VMX_INDEX_MSR_LSTAR] = data->msr_lstar;
612 guest_state->msrs[VMX_INDEX_MSR_STAR] = data->msr_star;
613 guest_state->msrs[VMX_INDEX_MSR_CSTAR] = data->msr_cstar;
614 guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK] = data->msr_syscall_mask;
615 guest_state->msrs[VMX_INDEX_MSR_EFER] = data->msr_efer;
617 guest_state->shadow_gs = data->shadow_gs;
619 v->arch.hvm_vmx.vmxassist_enabled = !(data->cr0 & X86_CR0_PE);
621 hvm_set_guest_time(v, data->tsc);
623 dump_msr_state(guest_state);
624 }
627 void vmx_save_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
628 {
629 vmx_save_cpu_state(v, ctxt);
630 vmx_vmcs_enter(v);
631 vmx_vmcs_save(v, ctxt);
632 vmx_vmcs_exit(v);
633 }
635 int vmx_load_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
636 {
637 vmx_load_cpu_state(v, ctxt);
638 if (vmx_vmcs_restore(v, ctxt)) {
639 printk("vmx_vmcs restore failed!\n");
640 domain_crash(v->domain);
641 return -EINVAL;
642 }
644 return 0;
645 }
647 /*
648 * DR7 is saved and restored on every vmexit. Other debug registers only
649 * need to be restored if their value is going to affect execution -- i.e.,
650 * if one of the breakpoints is enabled. So mask out all bits that don't
651 * enable some breakpoint functionality.
652 */
653 #define DR7_ACTIVE_MASK 0xff
655 static inline void vmx_restore_dr(struct vcpu *v)
656 {
657 /* NB. __vmread() is not usable here, so we cannot read from the VMCS. */
658 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
659 __restore_debug_registers(v);
660 }
662 static void vmx_ctxt_switch_from(struct vcpu *v)
663 {
664 vmx_save_guest_msrs(v);
665 vmx_restore_host_msrs();
666 vmx_save_dr(v);
667 }
669 static void vmx_ctxt_switch_to(struct vcpu *v)
670 {
671 vmx_restore_guest_msrs(v);
672 vmx_restore_dr(v);
673 }
675 static void stop_vmx(void)
676 {
677 if ( !(read_cr4() & X86_CR4_VMXE) )
678 return;
680 __vmxoff();
681 clear_in_cr4(X86_CR4_VMXE);
682 }
684 static void vmx_store_cpu_guest_regs(
685 struct vcpu *v, struct cpu_user_regs *regs, unsigned long *crs)
686 {
687 vmx_vmcs_enter(v);
689 if ( regs != NULL )
690 {
691 regs->eflags = __vmread(GUEST_RFLAGS);
692 regs->ss = __vmread(GUEST_SS_SELECTOR);
693 regs->cs = __vmread(GUEST_CS_SELECTOR);
694 regs->eip = __vmread(GUEST_RIP);
695 regs->esp = __vmread(GUEST_RSP);
696 }
698 if ( crs != NULL )
699 {
700 crs[0] = v->arch.hvm_vmx.cpu_shadow_cr0;
701 crs[2] = v->arch.hvm_vmx.cpu_cr2;
702 crs[3] = v->arch.hvm_vmx.cpu_cr3;
703 crs[4] = v->arch.hvm_vmx.cpu_shadow_cr4;
704 }
706 vmx_vmcs_exit(v);
707 }
709 static void vmx_load_cpu_guest_regs(struct vcpu *v, struct cpu_user_regs *regs)
710 {
711 unsigned long base;
713 vmx_vmcs_enter(v);
715 __vmwrite(GUEST_SS_SELECTOR, regs->ss);
716 __vmwrite(GUEST_RSP, regs->esp);
718 /* NB. Bit 1 of RFLAGS must be set for VMENTRY to succeed. */
719 __vmwrite(GUEST_RFLAGS, regs->eflags | 2UL);
721 if ( regs->eflags & EF_VM )
722 {
723 /*
724 * The VMX spec (section 4.3.1.2, Checks on Guest Segment
725 * Registers) says that virtual-8086 mode guests' segment
726 * base-address fields in the VMCS must be equal to their
727 * corresponding segment selector field shifted right by
728 * four bits upon vmentry.
729 */
730 base = __vmread(GUEST_CS_BASE);
731 if ( (regs->cs << 4) != base )
732 __vmwrite(GUEST_CS_BASE, regs->cs << 4);
733 base = __vmread(GUEST_SS_BASE);
734 if ( (regs->ss << 4) != base )
735 __vmwrite(GUEST_SS_BASE, regs->ss << 4);
736 }
738 __vmwrite(GUEST_CS_SELECTOR, regs->cs);
739 __vmwrite(GUEST_RIP, regs->eip);
741 vmx_vmcs_exit(v);
742 }
744 static unsigned long vmx_get_ctrl_reg(struct vcpu *v, unsigned int num)
745 {
746 switch ( num )
747 {
748 case 0:
749 return v->arch.hvm_vmx.cpu_cr0;
750 case 2:
751 return v->arch.hvm_vmx.cpu_cr2;
752 case 3:
753 return v->arch.hvm_vmx.cpu_cr3;
754 case 4:
755 return v->arch.hvm_vmx.cpu_shadow_cr4;
756 default:
757 BUG();
758 }
759 return 0; /* dummy */
760 }
762 static unsigned long vmx_get_segment_base(struct vcpu *v, enum x86_segment seg)
763 {
764 unsigned long base = 0;
765 int long_mode = 0;
767 ASSERT(v == current);
769 #ifdef __x86_64__
770 if ( vmx_long_mode_enabled(v) && (__vmread(GUEST_CS_AR_BYTES) & (1u<<13)) )
771 long_mode = 1;
772 #endif
774 switch ( seg )
775 {
776 case x86_seg_cs: if ( !long_mode ) base = __vmread(GUEST_CS_BASE); break;
777 case x86_seg_ds: if ( !long_mode ) base = __vmread(GUEST_DS_BASE); break;
778 case x86_seg_es: if ( !long_mode ) base = __vmread(GUEST_ES_BASE); break;
779 case x86_seg_fs: base = __vmread(GUEST_FS_BASE); break;
780 case x86_seg_gs: base = __vmread(GUEST_GS_BASE); break;
781 case x86_seg_ss: if ( !long_mode ) base = __vmread(GUEST_SS_BASE); break;
782 case x86_seg_tr: base = __vmread(GUEST_TR_BASE); break;
783 case x86_seg_gdtr: base = __vmread(GUEST_GDTR_BASE); break;
784 case x86_seg_idtr: base = __vmread(GUEST_IDTR_BASE); break;
785 case x86_seg_ldtr: base = __vmread(GUEST_LDTR_BASE); break;
786 default: BUG(); break;
787 }
789 return base;
790 }
792 static void vmx_get_segment_register(struct vcpu *v, enum x86_segment seg,
793 struct segment_register *reg)
794 {
795 u16 attr = 0;
797 ASSERT(v == current);
799 switch ( seg )
800 {
801 case x86_seg_cs:
802 reg->sel = __vmread(GUEST_CS_SELECTOR);
803 reg->limit = __vmread(GUEST_CS_LIMIT);
804 reg->base = __vmread(GUEST_CS_BASE);
805 attr = __vmread(GUEST_CS_AR_BYTES);
806 break;
807 case x86_seg_ds:
808 reg->sel = __vmread(GUEST_DS_SELECTOR);
809 reg->limit = __vmread(GUEST_DS_LIMIT);
810 reg->base = __vmread(GUEST_DS_BASE);
811 attr = __vmread(GUEST_DS_AR_BYTES);
812 break;
813 case x86_seg_es:
814 reg->sel = __vmread(GUEST_ES_SELECTOR);
815 reg->limit = __vmread(GUEST_ES_LIMIT);
816 reg->base = __vmread(GUEST_ES_BASE);
817 attr = __vmread(GUEST_ES_AR_BYTES);
818 break;
819 case x86_seg_fs:
820 reg->sel = __vmread(GUEST_FS_SELECTOR);
821 reg->limit = __vmread(GUEST_FS_LIMIT);
822 reg->base = __vmread(GUEST_FS_BASE);
823 attr = __vmread(GUEST_FS_AR_BYTES);
824 break;
825 case x86_seg_gs:
826 reg->sel = __vmread(GUEST_GS_SELECTOR);
827 reg->limit = __vmread(GUEST_GS_LIMIT);
828 reg->base = __vmread(GUEST_GS_BASE);
829 attr = __vmread(GUEST_GS_AR_BYTES);
830 break;
831 case x86_seg_ss:
832 reg->sel = __vmread(GUEST_SS_SELECTOR);
833 reg->limit = __vmread(GUEST_SS_LIMIT);
834 reg->base = __vmread(GUEST_SS_BASE);
835 attr = __vmread(GUEST_SS_AR_BYTES);
836 break;
837 case x86_seg_tr:
838 reg->sel = __vmread(GUEST_TR_SELECTOR);
839 reg->limit = __vmread(GUEST_TR_LIMIT);
840 reg->base = __vmread(GUEST_TR_BASE);
841 attr = __vmread(GUEST_TR_AR_BYTES);
842 break;
843 case x86_seg_gdtr:
844 reg->limit = __vmread(GUEST_GDTR_LIMIT);
845 reg->base = __vmread(GUEST_GDTR_BASE);
846 break;
847 case x86_seg_idtr:
848 reg->limit = __vmread(GUEST_IDTR_LIMIT);
849 reg->base = __vmread(GUEST_IDTR_BASE);
850 break;
851 case x86_seg_ldtr:
852 reg->sel = __vmread(GUEST_LDTR_SELECTOR);
853 reg->limit = __vmread(GUEST_LDTR_LIMIT);
854 reg->base = __vmread(GUEST_LDTR_BASE);
855 attr = __vmread(GUEST_LDTR_AR_BYTES);
856 break;
857 default:
858 BUG();
859 }
861 reg->attr.bytes = (attr & 0xff) | ((attr >> 4) & 0xf00);
862 }
864 /* Make sure that Xen intercepts any FPU accesses from the current vcpu. */
865 static void vmx_stts(struct vcpu *v)
866 {
867 /* VMX depends on operating on the current vcpu */
868 ASSERT(v == current);
870 /*
871 * If the guest does not have TS enabled then we must cause and handle an
872 * exception on first use of the FPU. If the guest *does* have TS enabled
873 * then this is not necessary: no FPU activity can occur until the guest
874 * clears CR0.TS, and we will initialise the FPU when that happens.
875 */
876 if ( !(v->arch.hvm_vmx.cpu_shadow_cr0 & X86_CR0_TS) )
877 {
878 v->arch.hvm_vmx.cpu_cr0 |= X86_CR0_TS;
879 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
880 __vm_set_bit(EXCEPTION_BITMAP, TRAP_no_device);
881 }
882 }
884 static void vmx_set_tsc_offset(struct vcpu *v, u64 offset)
885 {
886 vmx_vmcs_enter(v);
887 __vmwrite(TSC_OFFSET, offset);
888 #if defined (__i386__)
889 __vmwrite(TSC_OFFSET_HIGH, offset >> 32);
890 #endif
891 vmx_vmcs_exit(v);
892 }
894 static void vmx_init_ap_context(
895 struct vcpu_guest_context *ctxt, int vcpuid, int trampoline_vector)
896 {
897 memset(ctxt, 0, sizeof(*ctxt));
898 ctxt->user_regs.eip = VMXASSIST_BASE;
899 ctxt->user_regs.edx = vcpuid;
900 ctxt->user_regs.ebx = trampoline_vector;
901 }
903 void do_nmi(struct cpu_user_regs *);
905 static void vmx_init_hypercall_page(struct domain *d, void *hypercall_page)
906 {
907 char *p;
908 int i;
910 memset(hypercall_page, 0, PAGE_SIZE);
912 for ( i = 0; i < (PAGE_SIZE / 32); i++ )
913 {
914 p = (char *)(hypercall_page + (i * 32));
915 *(u8 *)(p + 0) = 0xb8; /* mov imm32, %eax */
916 *(u32 *)(p + 1) = i;
917 *(u8 *)(p + 5) = 0x0f; /* vmcall */
918 *(u8 *)(p + 6) = 0x01;
919 *(u8 *)(p + 7) = 0xc1;
920 *(u8 *)(p + 8) = 0xc3; /* ret */
921 }
923 /* Don't support HYPERVISOR_iret at the moment */
924 *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
925 }
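926 /* Each 32-byte stub assembles to: b8 <imm32> 0f 01 c1 c3, i.e. mov $<hypercall#>,%eax; vmcall; ret. */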
927 static int vmx_realmode(struct vcpu *v)
928 {
929 unsigned long rflags;
931 ASSERT(v == current);
933 rflags = __vmread(GUEST_RFLAGS);
934 return rflags & X86_EFLAGS_VM;
935 }
937 static int vmx_guest_x86_mode(struct vcpu *v)
938 {
939 unsigned long cs_ar_bytes;
941 ASSERT(v == current);
943 cs_ar_bytes = __vmread(GUEST_CS_AR_BYTES);
945 if ( vmx_long_mode_enabled(v) && (cs_ar_bytes & (1u<<13)) )
946 return 8;
948 if ( vmx_realmode(v) )
949 return 2;
951 return ((cs_ar_bytes & (1u<<14)) ? 4 : 2);
952 }
954 static int vmx_pae_enabled(struct vcpu *v)
955 {
956 unsigned long cr4 = v->arch.hvm_vmx.cpu_shadow_cr4;
957 return (vmx_paging_enabled(v) && (cr4 & X86_CR4_PAE));
958 }
960 static int vmx_interrupts_enabled(struct vcpu *v)
961 {
962 unsigned long eflags = __vmread(GUEST_RFLAGS);
963 return !irq_masked(eflags);
964 }
967 static void vmx_update_host_cr3(struct vcpu *v)
968 {
969 ASSERT( (v == current) || !vcpu_runnable(v) );
970 vmx_vmcs_enter(v);
971 __vmwrite(HOST_CR3, v->arch.cr3);
972 vmx_vmcs_exit(v);
973 }
975 static void vmx_update_guest_cr3(struct vcpu *v)
976 {
977 ASSERT( (v == current) || !vcpu_runnable(v) );
978 vmx_vmcs_enter(v);
979 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
980 vmx_vmcs_exit(v);
981 }
984 static void vmx_inject_exception(
985 unsigned int trapnr, int errcode, unsigned long cr2)
986 {
987 struct vcpu *v = current;
988 vmx_inject_hw_exception(v, trapnr, errcode);
989 if ( trapnr == TRAP_page_fault )
990 v->arch.hvm_vmx.cpu_cr2 = cr2;
991 }
993 static void vmx_update_vtpr(struct vcpu *v, unsigned long value)
994 {
995 /* VMX doesn't have a V_TPR field */
996 }
998 static int vmx_event_injection_faulted(struct vcpu *v)
999 {
1000 unsigned int idtv_info_field;
1002 ASSERT(v == current);
1004 idtv_info_field = __vmread(IDT_VECTORING_INFO_FIELD);
1005 return (idtv_info_field & INTR_INFO_VALID_MASK);
1008 static void disable_intercept_for_msr(u32 msr)
1009 {
1010 /*
1011 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
1012 * have the write-low and read-high bitmap offsets the wrong way round.
1013 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
1014 */
1015 if ( msr <= 0x1fff )
1016 {
1017 __clear_bit(msr, vmx_msr_bitmap + 0x000); /* read-low */
1018 __clear_bit(msr, vmx_msr_bitmap + 0x800); /* write-low */
1019 }
1020 else if ( (msr >= 0xc0000000) && (msr <= 0xc0001fff) )
1021 {
1022 msr &= 0x1fff;
1023 __clear_bit(msr, vmx_msr_bitmap + 0x400); /* read-high */
1024 __clear_bit(msr, vmx_msr_bitmap + 0xc00); /* write-high */
1025 }
1026 }
1028 static struct hvm_function_table vmx_function_table = {
1029 .name = "VMX",
1030 .disable = stop_vmx,
1031 .vcpu_initialise = vmx_vcpu_initialise,
1032 .vcpu_destroy = vmx_vcpu_destroy,
1033 .store_cpu_guest_regs = vmx_store_cpu_guest_regs,
1034 .load_cpu_guest_regs = vmx_load_cpu_guest_regs,
1035 .save_cpu_ctxt = vmx_save_vmcs_ctxt,
1036 .load_cpu_ctxt = vmx_load_vmcs_ctxt,
1037 .paging_enabled = vmx_paging_enabled,
1038 .long_mode_enabled = vmx_long_mode_enabled,
1039 .pae_enabled = vmx_pae_enabled,
1040 .interrupts_enabled = vmx_interrupts_enabled,
1041 .guest_x86_mode = vmx_guest_x86_mode,
1042 .get_guest_ctrl_reg = vmx_get_ctrl_reg,
1043 .get_segment_base = vmx_get_segment_base,
1044 .get_segment_register = vmx_get_segment_register,
1045 .update_host_cr3 = vmx_update_host_cr3,
1046 .update_guest_cr3 = vmx_update_guest_cr3,
1047 .update_vtpr = vmx_update_vtpr,
1048 .stts = vmx_stts,
1049 .set_tsc_offset = vmx_set_tsc_offset,
1050 .inject_exception = vmx_inject_exception,
1051 .init_ap_context = vmx_init_ap_context,
1052 .init_hypercall_page = vmx_init_hypercall_page,
1053 .event_injection_faulted = vmx_event_injection_faulted
1054 };
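1055 /* Per-CPU VMX bring-up; CPU 0 additionally registers the HVM function table and allocates the MSR bitmap. */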
1056 int start_vmx(void)
1057 {
1058 u32 eax, edx;
1059 struct vmcs_struct *vmcs;
1061 /*
1062 * Xen does not fill x86_capability words except 0.
1063 */
1064 boot_cpu_data.x86_capability[4] = cpuid_ecx(1);
1066 if ( !test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability) )
1067 return 0;
1069 rdmsr(IA32_FEATURE_CONTROL_MSR, eax, edx);
1071 if ( eax & IA32_FEATURE_CONTROL_MSR_LOCK )
1072 {
1073 if ( (eax & IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON) == 0x0 )
1074 {
1075 printk("VMX disabled by Feature Control MSR.\n");
1076 return 0;
1077 }
1078 }
1079 else
1080 {
1081 wrmsr(IA32_FEATURE_CONTROL_MSR,
1082 IA32_FEATURE_CONTROL_MSR_LOCK |
1083 IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON, 0);
1084 }
1086 set_in_cr4(X86_CR4_VMXE);
1088 vmx_init_vmcs_config();
1090 if ( smp_processor_id() == 0 )
1091 setup_vmcs_dump();
1093 if ( (vmcs = vmx_alloc_host_vmcs()) == NULL )
1094 {
1095 clear_in_cr4(X86_CR4_VMXE);
1096 printk("Failed to allocate host VMCS\n");
1097 return 0;
1098 }
1100 if ( __vmxon(virt_to_maddr(vmcs)) )
1101 {
1102 clear_in_cr4(X86_CR4_VMXE);
1103 printk("VMXON failed\n");
1104 vmx_free_host_vmcs(vmcs);
1105 return 0;
1106 }
1108 vmx_save_host_msrs();
1110 if ( smp_processor_id() != 0 )
1111 return 1;
1113 hvm_enable(&vmx_function_table);
1115 if ( cpu_has_vmx_msr_bitmap )
1116 {
1117 printk("VMX: MSR intercept bitmap enabled\n");
1118 vmx_msr_bitmap = alloc_xenheap_page();
1119 BUG_ON(vmx_msr_bitmap == NULL);
1120 memset(vmx_msr_bitmap, ~0, PAGE_SIZE);
1121 disable_intercept_for_msr(MSR_FS_BASE);
1122 disable_intercept_for_msr(MSR_GS_BASE);
1123 }
1125 return 1;
1126 }
1128 /*
1129 * Not all cases receive valid value in the VM-exit instruction length field.
1130 * Callers must know what they're doing!
1131 */
1132 static int __get_instruction_length(void)
1133 {
1134 int len;
1135 len = __vmread(VM_EXIT_INSTRUCTION_LEN); /* Safe: callers audited */
1136 BUG_ON((len < 1) || (len > 15));
1137 return len;
1138 }
1140 static inline void __update_guest_eip(unsigned long inst_len)
1141 {
1142 unsigned long current_eip;
1144 current_eip = __vmread(GUEST_RIP);
1145 __vmwrite(GUEST_RIP, current_eip + inst_len);
1146 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
1147 }
1149 static void vmx_do_no_device_fault(void)
1150 {
1151 struct vcpu *v = current;
1153 setup_fpu(current);
1154 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
1156 /* Disable TS in guest CR0 unless the guest wants the exception too. */
1157 if ( !(v->arch.hvm_vmx.cpu_shadow_cr0 & X86_CR0_TS) )
1158 {
1159 v->arch.hvm_vmx.cpu_cr0 &= ~X86_CR0_TS;
1160 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
1161 }
1162 }
1164 #define bitmaskof(idx) (1U << ((idx) & 31))
1165 static void vmx_do_cpuid(struct cpu_user_regs *regs)
1167 unsigned int input = (unsigned int)regs->eax;
1168 unsigned int count = (unsigned int)regs->ecx;
1169 unsigned int eax, ebx, ecx, edx;
1171 if ( input == 0x00000004 )
1173 cpuid_count(input, count, &eax, &ebx, &ecx, &edx);
1174 eax &= NUM_CORES_RESET_MASK;
1176 else if ( input == 0x40000003 )
1178 /*
1179 * NB. Unsupported interface for private use of VMXASSIST only.
1180 * Note that this leaf lives at <max-hypervisor-leaf> + 1.
1181 */
1182 u64 value = ((u64)regs->edx << 32) | (u32)regs->ecx;
1183 unsigned long mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
1184 struct vcpu *v = current;
1185 char *p;
1187 gdprintk(XENLOG_INFO, "Input address is 0x%"PRIx64".\n", value);
1189 /* 8-byte aligned valid pseudophys address from vmxassist, please. */
1190 if ( (value & 7) || (mfn == INVALID_MFN) ||
1191 !v->arch.hvm_vmx.vmxassist_enabled )
1193 domain_crash(v->domain);
1194 return;
1197 p = map_domain_page(mfn);
1198 value = *((uint64_t *)(p + (value & (PAGE_SIZE - 1))));
1199 unmap_domain_page(p);
1201 gdprintk(XENLOG_INFO, "Output value is 0x%"PRIx64".\n", value);
1202 ecx = (u32)value;
1203 edx = (u32)(value >> 32);
1204 } else {
1205 hvm_cpuid(input, &eax, &ebx, &ecx, &edx);
1207 if ( input == 0x00000001 )
1209 /* Mask off reserved bits. */
1210 ecx &= ~VMX_VCPU_CPUID_L1_ECX_RESERVED;
1212 ebx &= NUM_THREADS_RESET_MASK;
1214 /* Unsupportable for virtualised CPUs. */
1215 ecx &= ~(bitmaskof(X86_FEATURE_VMXE) |
1216 bitmaskof(X86_FEATURE_EST) |
1217 bitmaskof(X86_FEATURE_TM2) |
1218 bitmaskof(X86_FEATURE_CID));
1220 edx &= ~(bitmaskof(X86_FEATURE_HT) |
1221 bitmaskof(X86_FEATURE_ACPI) |
1222 bitmaskof(X86_FEATURE_ACC));
1225 if ( input == 0x00000006 || input == 0x00000009 || input == 0x0000000A )
1226 eax = ebx = ecx = edx = 0x0;
1229 regs->eax = (unsigned long)eax;
1230 regs->ebx = (unsigned long)ebx;
1231 regs->ecx = (unsigned long)ecx;
1232 regs->edx = (unsigned long)edx;
1234 HVMTRACE_3D(CPUID, current, input,
1235 ((uint64_t)eax << 32) | ebx, ((uint64_t)ecx << 32) | edx);
1238 #define CASE_GET_REG_P(REG, reg) \
1239 case REG_ ## REG: reg_p = (unsigned long *)&(regs->reg); break
1241 #ifdef __i386__
1242 #define CASE_EXTEND_GET_REG_P
1243 #else
1244 #define CASE_EXTEND_GET_REG_P \
1245 CASE_GET_REG_P(R8, r8); \
1246 CASE_GET_REG_P(R9, r9); \
1247 CASE_GET_REG_P(R10, r10); \
1248 CASE_GET_REG_P(R11, r11); \
1249 CASE_GET_REG_P(R12, r12); \
1250 CASE_GET_REG_P(R13, r13); \
1251 CASE_GET_REG_P(R14, r14); \
1252 CASE_GET_REG_P(R15, r15)
1253 #endif
1255 static void vmx_dr_access(unsigned long exit_qualification,
1256 struct cpu_user_regs *regs)
1257 {
1258 struct vcpu *v = current;
1260 HVMTRACE_0D(DR_WRITE, v);
1262 v->arch.hvm_vcpu.flag_dr_dirty = 1;
1264 /* We could probably be smarter about this */
1265 __restore_debug_registers(v);
1267 /* Allow guest direct access to DR registers */
1268 v->arch.hvm_vcpu.u.vmx.exec_control &= ~CPU_BASED_MOV_DR_EXITING;
1269 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
1270 v->arch.hvm_vcpu.u.vmx.exec_control);
1271 }
1273 /*
1274 * Invalidate the TLB entry for va, and invalidate the shadow page
1275 * corresponding to the address va.
1276 */
1277 static void vmx_do_invlpg(unsigned long va)
1278 {
1279 unsigned long eip;
1280 struct vcpu *v = current;
1282 HVMTRACE_2D(INVLPG, v, /*invlpga=*/ 0, va);
1284 eip = __vmread(GUEST_RIP);
1286 HVM_DBG_LOG(DBG_LEVEL_VMMU, "eip=%lx, va=%lx",
1287 eip, va);
1289 /*
1290 * We do the safest thing first, then try to update the shadow,
1291 * copying from the guest.
1292 */
1293 paging_invlpg(v, va);
1294 }
1297 static int vmx_check_descriptor(int long_mode, unsigned long eip, int inst_len,
1298 enum x86_segment seg, unsigned long *base,
1299 u32 *limit, u32 *ar_bytes)
1301 enum vmcs_field ar_field, base_field, limit_field;
1303 *base = 0;
1304 *limit = 0;
1305 if ( seg != x86_seg_es )
1307 unsigned char inst[MAX_INST_LEN];
1308 int i;
1309 extern int inst_copy_from_guest(unsigned char *, unsigned long, int);
1311 if ( !long_mode )
1312 eip += __vmread(GUEST_CS_BASE);
1313 memset(inst, 0, MAX_INST_LEN);
1314 if ( inst_copy_from_guest(inst, eip, inst_len) != inst_len )
1316 gdprintk(XENLOG_ERR, "Get guest instruction failed\n");
1317 domain_crash(current->domain);
1318 return 0;
1321 for ( i = 0; i < inst_len; i++ )
1323 switch ( inst[i] )
1325 case 0xf3: /* REPZ */
1326 case 0xf2: /* REPNZ */
1327 case 0xf0: /* LOCK */
1328 case 0x66: /* data32 */
1329 case 0x67: /* addr32 */
1330 #ifdef __x86_64__
1331 case 0x40 ... 0x4f: /* REX */
1332 #endif
1333 continue;
1334 case 0x2e: /* CS */
1335 seg = x86_seg_cs;
1336 continue;
1337 case 0x36: /* SS */
1338 seg = x86_seg_ss;
1339 continue;
1340 case 0x26: /* ES */
1341 seg = x86_seg_es;
1342 continue;
1343 case 0x64: /* FS */
1344 seg = x86_seg_fs;
1345 continue;
1346 case 0x65: /* GS */
1347 seg = x86_seg_gs;
1348 continue;
1349 case 0x3e: /* DS */
1350 seg = x86_seg_ds;
1351 continue;
1356 switch ( seg )
1358 case x86_seg_cs:
1359 ar_field = GUEST_CS_AR_BYTES;
1360 base_field = GUEST_CS_BASE;
1361 limit_field = GUEST_CS_LIMIT;
1362 break;
1363 case x86_seg_ds:
1364 ar_field = GUEST_DS_AR_BYTES;
1365 base_field = GUEST_DS_BASE;
1366 limit_field = GUEST_DS_LIMIT;
1367 break;
1368 case x86_seg_es:
1369 ar_field = GUEST_ES_AR_BYTES;
1370 base_field = GUEST_ES_BASE;
1371 limit_field = GUEST_ES_LIMIT;
1372 break;
1373 case x86_seg_fs:
1374 ar_field = GUEST_FS_AR_BYTES;
1375 base_field = GUEST_FS_BASE;
1376 limit_field = GUEST_FS_LIMIT;
1377 break;
1378 case x86_seg_gs:
1379 ar_field = GUEST_GS_AR_BYTES;
1380 base_field = GUEST_GS_BASE;
1381 limit_field = GUEST_GS_LIMIT;
1382 break;
1383 case x86_seg_ss:
1384 ar_field = GUEST_SS_AR_BYTES;
1385 base_field = GUEST_SS_BASE;
1386 limit_field = GUEST_SS_LIMIT;
1387 break;
1388 default:
1389 BUG();
1390 return 0;
1393 if ( !long_mode || seg == x86_seg_fs || seg == x86_seg_gs )
1395 *base = __vmread(base_field);
1396 *limit = __vmread(limit_field);
1398 *ar_bytes = __vmread(ar_field);
1400 return !(*ar_bytes & 0x10000);
1403 static void vmx_io_instruction(unsigned long exit_qualification,
1404 unsigned long inst_len)
1406 struct cpu_user_regs *regs;
1407 struct hvm_io_op *pio_opp;
1408 unsigned int port, size;
1409 int dir, df, vm86;
1411 pio_opp = &current->arch.hvm_vcpu.io_op;
1412 pio_opp->instr = INSTR_PIO;
1413 pio_opp->flags = 0;
1415 regs = &pio_opp->io_context;
1417 /* Copy current guest state into io instruction state structure. */
1418 memcpy(regs, guest_cpu_user_regs(), HVM_CONTEXT_STACK_BYTES);
1419 hvm_store_cpu_guest_regs(current, regs, NULL);
1421 vm86 = regs->eflags & X86_EFLAGS_VM ? 1 : 0;
1422 df = regs->eflags & X86_EFLAGS_DF ? 1 : 0;
1424 HVM_DBG_LOG(DBG_LEVEL_IO, "vm86 %d, eip=%x:%lx, "
1425 "exit_qualification = %lx",
1426 vm86, regs->cs, (unsigned long)regs->eip, exit_qualification);
1428 if ( test_bit(6, &exit_qualification) )
1429 port = (exit_qualification >> 16) & 0xFFFF;
1430 else
1431 port = regs->edx & 0xffff;
1433 size = (exit_qualification & 7) + 1;
1434 dir = test_bit(3, &exit_qualification); /* direction */
1436 if (dir==IOREQ_READ)
1437 HVMTRACE_2D(IO_READ, current, port, size);
1438 else
1439 HVMTRACE_2D(IO_WRITE, current, port, size);
1441 if ( test_bit(4, &exit_qualification) ) { /* string instruction */
1442 unsigned long addr, count = 1, base;
1443 paddr_t paddr;
1444 unsigned long gfn;
1445 u32 ar_bytes, limit;
1446 int sign = regs->eflags & X86_EFLAGS_DF ? -1 : 1;
1447 int long_mode = 0;
1449 ar_bytes = __vmread(GUEST_CS_AR_BYTES);
1450 #ifdef __x86_64__
1451 if ( vmx_long_mode_enabled(current) && (ar_bytes & (1u<<13)) )
1452 long_mode = 1;
1453 #endif
1454 addr = __vmread(GUEST_LINEAR_ADDRESS);
1456 if ( test_bit(5, &exit_qualification) ) { /* "rep" prefix */
1457 pio_opp->flags |= REPZ;
1458 count = regs->ecx;
1459 if ( !long_mode && (vm86 || !(ar_bytes & (1u<<14))) )
1460 count &= 0xFFFF;
1463 /*
1464 * In protected mode, guest linear address is invalid if the
1465 * selector is null.
1466 */
1467 if ( !vmx_check_descriptor(long_mode, regs->eip, inst_len,
1468 dir==IOREQ_WRITE ? x86_seg_ds : x86_seg_es,
1469 &base, &limit, &ar_bytes) ) {
1470 if ( !long_mode ) {
1471 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1472 return;
1474 addr = dir == IOREQ_WRITE ? base + regs->esi : regs->edi;
1477 if ( !long_mode ) {
1478 unsigned long ea = addr - base;
1480 /* Segment must be readable for outs and writeable for ins. */
1481 if ( dir == IOREQ_WRITE ? (ar_bytes & 0xa) == 0x8
1482 : (ar_bytes & 0xa) != 0x2 ) {
1483 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1484 return;
1487 /* Offset must be within limits. */
1488 ASSERT(ea == (u32)ea);
1489 if ( (u32)(ea + size - 1) < (u32)ea ||
1490 (ar_bytes & 0xc) != 0x4 ? ea + size - 1 > limit
1491 : ea <= limit )
1493 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1494 return;
1497 /* Check the limit for repeated instructions, as above we checked
1498 only the first instance. Truncate the count if a limit violation
1499 would occur. Note that the checking is not necessary for page
1500 granular segments as transfers crossing page boundaries will be
1501 broken up anyway. */
1502 if ( !(ar_bytes & (1u<<15)) && count > 1 )
1504 if ( (ar_bytes & 0xc) != 0x4 )
1506 /* expand-up */
1507 if ( !df )
1509 if ( ea + count * size - 1 < ea ||
1510 ea + count * size - 1 > limit )
1511 count = (limit + 1UL - ea) / size;
1513 else
1515 if ( count - 1 > ea / size )
1516 count = ea / size + 1;
1519 else
1521 /* expand-down */
1522 if ( !df )
1524 if ( count - 1 > -(s32)ea / size )
1525 count = -(s32)ea / size + 1UL;
1527 else
1529 if ( ea < (count - 1) * size ||
1530 ea - (count - 1) * size <= limit )
1531 count = (ea - limit - 1) / size + 1;
1534 ASSERT(count);
1537 #ifdef __x86_64__
1538 else
1540 if ( !is_canonical_address(addr) ||
1541 !is_canonical_address(addr + size - 1) )
1543 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1544 return;
1546 if ( count > (1UL << 48) / size )
1547 count = (1UL << 48) / size;
1548 if ( !(regs->eflags & EF_DF) )
1550 if ( addr + count * size - 1 < addr ||
1551 !is_canonical_address(addr + count * size - 1) )
1552 count = (addr & ~((1UL << 48) - 1)) / size;
1554 else
1556 if ( (count - 1) * size > addr ||
1557 !is_canonical_address(addr + (count - 1) * size) )
1558 count = (addr & ~((1UL << 48) - 1)) / size + 1;
1560 ASSERT(count);
1562 #endif
1564 /* Translate the address to a physical address */
1565 gfn = paging_gva_to_gfn(current, addr);
1566 if ( gfn == INVALID_GFN )
1568 /* The guest does not have the RAM address mapped.
1569 * Need to send in a page fault */
1570 int errcode = 0;
1571 /* IO read --> memory write */
1572 if ( dir == IOREQ_READ ) errcode |= PFEC_write_access;
1573 vmx_inject_exception(TRAP_page_fault, errcode, addr);
1574 return;
1576 paddr = (paddr_t)gfn << PAGE_SHIFT | (addr & ~PAGE_MASK);
1578 /*
1579 * Handle string pio instructions that cross pages or that
1580 * are unaligned. See the comments in hvm_domain.c/handle_mmio()
1581 */
1582 if ( (addr & PAGE_MASK) != ((addr + size - 1) & PAGE_MASK) ) {
1583 unsigned long value = 0;
1585 pio_opp->flags |= OVERLAP;
1587 if ( dir == IOREQ_WRITE ) /* OUTS */
1589 if ( hvm_paging_enabled(current) )
1591 int rv = hvm_copy_from_guest_virt(&value, addr, size);
1592 if ( rv != 0 )
1594 /* Failed on the page-spanning copy. Inject PF into
1595 * the guest for the address where we failed. */
1596 addr += size - rv;
1597 gdprintk(XENLOG_DEBUG, "Pagefault reading non-io side "
1598 "of a page-spanning PIO: va=%#lx\n", addr);
1599 vmx_inject_exception(TRAP_page_fault, 0, addr);
1600 return;
1603 else
1604 (void) hvm_copy_from_guest_phys(&value, addr, size);
1605 } else /* dir != IOREQ_WRITE */
1606 /* Remember where to write the result, as a *VA*.
1607 * Must be a VA so we can handle the page overlap
1608 * correctly in hvm_pio_assist() */
1609 pio_opp->addr = addr;
1611 if ( count == 1 )
1612 regs->eip += inst_len;
1614 send_pio_req(port, 1, size, value, dir, df, 0);
1615 } else {
1616 unsigned long last_addr = sign > 0 ? addr + count * size - 1
1617 : addr - (count - 1) * size;
1619 if ( (addr & PAGE_MASK) != (last_addr & PAGE_MASK) )
1621 if ( sign > 0 )
1622 count = (PAGE_SIZE - (addr & ~PAGE_MASK)) / size;
1623 else
1624 count = (addr & ~PAGE_MASK) / size + 1;
1625 } else
1626 regs->eip += inst_len;
1628 send_pio_req(port, count, size, paddr, dir, df, 1);
1630 } else {
1631 if ( port == 0xe9 && dir == IOREQ_WRITE && size == 1 )
1632 hvm_print_line(current, regs->eax); /* guest debug output */
1634 regs->eip += inst_len;
1635 send_pio_req(port, 1, size, regs->eax, dir, df, 0);
1639 static void vmx_world_save(struct vcpu *v, struct vmx_assist_context *c)
1640 {
1641 /* NB. Skip transition instruction. */
1642 c->eip = __vmread(GUEST_RIP);
1643 c->eip += __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
1645 c->esp = __vmread(GUEST_RSP);
1646 c->eflags = __vmread(GUEST_RFLAGS);
1648 c->cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;
1649 c->cr3 = v->arch.hvm_vmx.cpu_cr3;
1650 c->cr4 = v->arch.hvm_vmx.cpu_shadow_cr4;
1652 c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
1653 c->idtr_base = __vmread(GUEST_IDTR_BASE);
1655 c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
1656 c->gdtr_base = __vmread(GUEST_GDTR_BASE);
1658 c->cs_sel = __vmread(GUEST_CS_SELECTOR);
1659 c->cs_limit = __vmread(GUEST_CS_LIMIT);
1660 c->cs_base = __vmread(GUEST_CS_BASE);
1661 c->cs_arbytes.bytes = __vmread(GUEST_CS_AR_BYTES);
1663 c->ds_sel = __vmread(GUEST_DS_SELECTOR);
1664 c->ds_limit = __vmread(GUEST_DS_LIMIT);
1665 c->ds_base = __vmread(GUEST_DS_BASE);
1666 c->ds_arbytes.bytes = __vmread(GUEST_DS_AR_BYTES);
1668 c->es_sel = __vmread(GUEST_ES_SELECTOR);
1669 c->es_limit = __vmread(GUEST_ES_LIMIT);
1670 c->es_base = __vmread(GUEST_ES_BASE);
1671 c->es_arbytes.bytes = __vmread(GUEST_ES_AR_BYTES);
1673 c->ss_sel = __vmread(GUEST_SS_SELECTOR);
1674 c->ss_limit = __vmread(GUEST_SS_LIMIT);
1675 c->ss_base = __vmread(GUEST_SS_BASE);
1676 c->ss_arbytes.bytes = __vmread(GUEST_SS_AR_BYTES);
1678 c->fs_sel = __vmread(GUEST_FS_SELECTOR);
1679 c->fs_limit = __vmread(GUEST_FS_LIMIT);
1680 c->fs_base = __vmread(GUEST_FS_BASE);
1681 c->fs_arbytes.bytes = __vmread(GUEST_FS_AR_BYTES);
1683 c->gs_sel = __vmread(GUEST_GS_SELECTOR);
1684 c->gs_limit = __vmread(GUEST_GS_LIMIT);
1685 c->gs_base = __vmread(GUEST_GS_BASE);
1686 c->gs_arbytes.bytes = __vmread(GUEST_GS_AR_BYTES);
1688 c->tr_sel = __vmread(GUEST_TR_SELECTOR);
1689 c->tr_limit = __vmread(GUEST_TR_LIMIT);
1690 c->tr_base = __vmread(GUEST_TR_BASE);
1691 c->tr_arbytes.bytes = __vmread(GUEST_TR_AR_BYTES);
1693 c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR);
1694 c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT);
1695 c->ldtr_base = __vmread(GUEST_LDTR_BASE);
1696 c->ldtr_arbytes.bytes = __vmread(GUEST_LDTR_AR_BYTES);
1697 }
1699 static int vmx_world_restore(struct vcpu *v, struct vmx_assist_context *c)
1701 unsigned long mfn, old_base_mfn;
1703 __vmwrite(GUEST_RIP, c->eip);
1704 __vmwrite(GUEST_RSP, c->esp);
1705 __vmwrite(GUEST_RFLAGS, c->eflags);
1707 v->arch.hvm_vmx.cpu_shadow_cr0 = c->cr0;
1708 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
1710 if ( !vmx_paging_enabled(v) )
1711 goto skip_cr3;
1713 if ( c->cr3 == v->arch.hvm_vmx.cpu_cr3 )
1715 /*
1716 * This is a simple TLB flush, implying the guest has
1717 * removed some translation or changed page attributes.
1718 * We simply invalidate the shadow.
1719 */
1720 mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
1721 if ( mfn != pagetable_get_pfn(v->arch.guest_table) )
1722 goto bad_cr3;
1724 else
1726 /*
1727 * If different, make a shadow. Check if the PDBR is valid
1728 * first.
1729 */
1730 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %x", c->cr3);
1731 mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
1732 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
1733 goto bad_cr3;
1734 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1735 v->arch.guest_table = pagetable_from_pfn(mfn);
1736 if (old_base_mfn)
1737 put_page(mfn_to_page(old_base_mfn));
1738 v->arch.hvm_vmx.cpu_cr3 = c->cr3;
1741 skip_cr3:
1742 if ( !vmx_paging_enabled(v) )
1743 HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
1744 else
1745 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3);
1747 __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
1748 v->arch.hvm_vmx.cpu_shadow_cr4 = c->cr4;
1749 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);
1751 __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
1752 __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
1754 __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
1755 __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
1757 __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
1758 __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
1759 __vmwrite(GUEST_CS_BASE, c->cs_base);
1760 __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes.bytes);
1762 __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
1763 __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
1764 __vmwrite(GUEST_DS_BASE, c->ds_base);
1765 __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes.bytes);
1767 __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
1768 __vmwrite(GUEST_ES_LIMIT, c->es_limit);
1769 __vmwrite(GUEST_ES_BASE, c->es_base);
1770 __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes.bytes);
1772 __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
1773 __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
1774 __vmwrite(GUEST_SS_BASE, c->ss_base);
1775 __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes.bytes);
1777 __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
1778 __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
1779 __vmwrite(GUEST_FS_BASE, c->fs_base);
1780 __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes.bytes);
1782 __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
1783 __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
1784 __vmwrite(GUEST_GS_BASE, c->gs_base);
1785 __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes.bytes);
1787 __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
1788 __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
1789 __vmwrite(GUEST_TR_BASE, c->tr_base);
1790 __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes.bytes);
1792 __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
1793 __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
1794 __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
1795 __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes.bytes);
1797 paging_update_paging_modes(v);
1798 return 0;
1800 bad_cr3:
1801 gdprintk(XENLOG_ERR, "Invalid CR3 value=%x", c->cr3);
1802 return -EINVAL;
1805 enum { VMX_ASSIST_INVOKE = 0, VMX_ASSIST_RESTORE };
1807 static int vmx_assist(struct vcpu *v, int mode)
1809 struct vmx_assist_context c;
1810 u32 magic;
1811 u32 cp;
1813 /* make sure vmxassist exists (this is not an error) */
1814 if (hvm_copy_from_guest_phys(&magic, VMXASSIST_MAGIC_OFFSET,
1815 sizeof(magic)))
1816 return 0;
1817 if (magic != VMXASSIST_MAGIC)
1818 return 0;
1820 switch (mode) {
1821 /*
1822 * Transfer control to vmxassist.
1823 * Store the current context in VMXASSIST_OLD_CONTEXT and load
1824 * the new VMXASSIST_NEW_CONTEXT context. This context was created
1825 * by vmxassist and will transfer control to it.
1826 */
1827 case VMX_ASSIST_INVOKE:
1828 /* save the old context */
1829 if (hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)))
1830 goto error;
1831 if (cp != 0) {
1832 vmx_world_save(v, &c);
1833 if (hvm_copy_to_guest_phys(cp, &c, sizeof(c)))
1834 goto error;
1837 /* restore the new context, this should activate vmxassist */
1838 if (hvm_copy_from_guest_phys(&cp, VMXASSIST_NEW_CONTEXT, sizeof(cp)))
1839 goto error;
1840 if (cp != 0) {
1841 if (hvm_copy_from_guest_phys(&c, cp, sizeof(c)))
1842 goto error;
1843 if ( vmx_world_restore(v, &c) != 0 )
1844 goto error;
1845 v->arch.hvm_vmx.vmxassist_enabled = 1;
1846 return 1;
1848 break;
1850 /*
1851 * Restore the VMXASSIST_OLD_CONTEXT that was saved by
1852 * VMX_ASSIST_INVOKE above.
1853 */
1854 case VMX_ASSIST_RESTORE:
1855 /* save the old context */
1856 if (hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)))
1857 goto error;
1858 if (cp != 0) {
1859 if (hvm_copy_from_guest_phys(&c, cp, sizeof(c)))
1860 goto error;
1861 if ( vmx_world_restore(v, &c) != 0 )
1862 goto error;
1863 v->arch.hvm_vmx.vmxassist_enabled = 0;
1864 return 1;
1866 break;
1869 error:
1870 gdprintk(XENLOG_ERR, "Failed to transfer to vmxassist\n");
1871 domain_crash(v->domain);
1872 return 0;
1875 static int vmx_set_cr0(unsigned long value)
1877 struct vcpu *v = current;
1878 unsigned long mfn;
1879 unsigned long eip;
1880 int paging_enabled;
1881 unsigned long vm_entry_value;
1882 unsigned long old_cr0;
1883 unsigned long old_base_mfn;
1885 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx\n", value);
1887 /* ET is reserved and should always be 1. */
1888 value |= X86_CR0_ET;
1890 if ( (value & (X86_CR0_PE|X86_CR0_PG)) == X86_CR0_PG )
1892 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
1893 return 0;
1896 /* TS cleared? Then initialise FPU now. */
1897 if ( !(value & X86_CR0_TS) )
1899 setup_fpu(v);
1900 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
1903 old_cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;
1904 paging_enabled = old_cr0 & X86_CR0_PG;
1906 v->arch.hvm_vmx.cpu_cr0 = (value | X86_CR0_PE | X86_CR0_PG
1907 | X86_CR0_NE | X86_CR0_WP);
1908 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
1910 v->arch.hvm_vmx.cpu_shadow_cr0 = value;
1911 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
1913 if ( (value & X86_CR0_PE) && (value & X86_CR0_PG) && !paging_enabled )
1915 /*
1916 * Trying to enable guest paging.
1917 * The guest CR3 must be pointing to the guest physical.
1918 */
1919 mfn = get_mfn_from_gpfn(v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT);
1920 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
1922 gdprintk(XENLOG_ERR, "Invalid CR3 value = %lx (mfn=%lx)\n",
1923 v->arch.hvm_vmx.cpu_cr3, mfn);
1924 domain_crash(v->domain);
1925 return 0;
1928 #if defined(__x86_64__)
1929 if ( vmx_lme_is_set(v) )
1931 if ( !(v->arch.hvm_vmx.cpu_shadow_cr4 & X86_CR4_PAE) )
1933 HVM_DBG_LOG(DBG_LEVEL_1, "Guest enabled paging "
1934 "with EFER.LME set but not CR4.PAE\n");
1935 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
1937 else
1939 HVM_DBG_LOG(DBG_LEVEL_1, "Enabling long mode\n");
1940 v->arch.hvm_vmx.msr_state.msrs[VMX_INDEX_MSR_EFER]
1941 |= EFER_LMA;
1942 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
1943 vm_entry_value |= VM_ENTRY_IA32E_MODE;
1944 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1947 #endif
1949 /*
1950 * Now arch.guest_table points to machine physical.
1951 */
1952 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1953 v->arch.guest_table = pagetable_from_pfn(mfn);
1954 if (old_base_mfn)
1955 put_page(mfn_to_page(old_base_mfn));
1956 paging_update_paging_modes(v);
1958 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
1959 (unsigned long) (mfn << PAGE_SHIFT));
1961 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
1962 v->arch.hvm_vmx.cpu_cr3, mfn);
1965 if ( !((value & X86_CR0_PE) && (value & X86_CR0_PG)) && paging_enabled )
1966 if ( v->arch.hvm_vmx.cpu_cr3 ) {
1967 put_page(mfn_to_page(get_mfn_from_gpfn(
1968 v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)));
1969 v->arch.guest_table = pagetable_null();
1972 /*
1973 * VMX does not implement real-mode virtualization. We emulate
1974 * real-mode by performing a world switch to VMXAssist whenever
1975 * a partition disables the CR0.PE bit.
1976 */
1977 if ( (value & X86_CR0_PE) == 0 )
1979 if ( value & X86_CR0_PG ) {
1980 /* inject GP here */
1981 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
1982 return 0;
1983 } else {
1984 /*
1985 * Disable paging here.
1986 * Same as the PE == 1 && PG == 0 case.
1987 */
1988 if ( vmx_long_mode_enabled(v) )
1990 v->arch.hvm_vmx.msr_state.msrs[VMX_INDEX_MSR_EFER]
1991 &= ~EFER_LMA;
1992 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
1993 vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
1994 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1998 if ( vmx_assist(v, VMX_ASSIST_INVOKE) )
1999 {
2000 eip = __vmread(GUEST_RIP);
2001 HVM_DBG_LOG(DBG_LEVEL_1,
2002 "Transferring control to vmxassist %%eip 0x%lx\n", eip);
2003 return 0; /* do not update eip! */
2004 }
2005 }
2006 else if ( v->arch.hvm_vmx.vmxassist_enabled )
2007 {
2008 eip = __vmread(GUEST_RIP);
2009 HVM_DBG_LOG(DBG_LEVEL_1,
2010 "Enabling CR0.PE at %%eip 0x%lx\n", eip);
2011 if ( vmx_assist(v, VMX_ASSIST_RESTORE) )
2012 {
2013 eip = __vmread(GUEST_RIP);
2014 HVM_DBG_LOG(DBG_LEVEL_1,
2015 "Restoring to %%eip 0x%lx\n", eip);
2016 return 0; /* do not update eip! */
2017 }
2018 }
2019 else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
2020 {
2021 if ( vmx_long_mode_enabled(v) )
2022 {
2023 v->arch.hvm_vmx.msr_state.msrs[VMX_INDEX_MSR_EFER] &= ~EFER_LMA;
2024 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
2025 vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
2026 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
2027 }
2028 paging_update_paging_modes(v);
2029 }
2031 return 1;
2032 }
2034 #define CASE_SET_REG(REG, reg) \
2035 case REG_ ## REG: regs->reg = value; break
2036 #define CASE_GET_REG(REG, reg) \
2037 case REG_ ## REG: value = regs->reg; break
2039 #define CASE_EXTEND_SET_REG \
2040 CASE_EXTEND_REG(S)
2041 #define CASE_EXTEND_GET_REG \
2042 CASE_EXTEND_REG(G)
2044 #ifdef __i386__
2045 #define CASE_EXTEND_REG(T)
2046 #else
2047 #define CASE_EXTEND_REG(T) \
2048 CASE_ ## T ## ET_REG(R8, r8); \
2049 CASE_ ## T ## ET_REG(R9, r9); \
2050 CASE_ ## T ## ET_REG(R10, r10); \
2051 CASE_ ## T ## ET_REG(R11, r11); \
2052 CASE_ ## T ## ET_REG(R12, r12); \
2053 CASE_ ## T ## ET_REG(R13, r13); \
2054 CASE_ ## T ## ET_REG(R14, r14); \
2055 CASE_ ## T ## ET_REG(R15, r15)
2056 #endif
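/*
 * For illustration: inside "switch ( gp )" below, CASE_GET_REG(EAX, eax)
 * expands to
 *     case REG_EAX: value = regs->eax; break;
 * and CASE_SET_REG(EAX, eax) to
 *     case REG_EAX: regs->eax = value; break;
 * On x86_64 the CASE_EXTEND_*_REG macros add the matching cases for
 * r8..r15; on i386 they expand to nothing.
 */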
2058 /*
2059 * Write to control registers
2060 */
2061 static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
2062 {
2063 unsigned long value, old_cr, old_base_mfn, mfn;
2064 struct vcpu *v = current;
2065 struct vlapic *vlapic = vcpu_vlapic(v);
2067 switch ( gp )
2068 {
2069 CASE_GET_REG(EAX, eax);
2070 CASE_GET_REG(ECX, ecx);
2071 CASE_GET_REG(EDX, edx);
2072 CASE_GET_REG(EBX, ebx);
2073 CASE_GET_REG(EBP, ebp);
2074 CASE_GET_REG(ESI, esi);
2075 CASE_GET_REG(EDI, edi);
2076 CASE_EXTEND_GET_REG;
2077 case REG_ESP:
2078 value = __vmread(GUEST_RSP);
2079 break;
2080 default:
2081 gdprintk(XENLOG_ERR, "invalid gp: %d\n", gp);
2082 goto exit_and_crash;
2083 }
2085 HVMTRACE_2D(CR_WRITE, v, cr, value);
2087 HVM_DBG_LOG(DBG_LEVEL_1, "CR%d, value = %lx", cr, value);
2089 switch ( cr )
2090 {
2091 case 0:
2092 return vmx_set_cr0(value);
2094 case 3:
2095 /*
2096 * If paging is not enabled yet, simply copy the value to CR3.
2097 */
2098 if (!vmx_paging_enabled(v)) {
2099 v->arch.hvm_vmx.cpu_cr3 = value;
2100 break;
2101 }
2103 /*
2104 * We make a new one if the shadow does not exist.
2105 */
2106 if (value == v->arch.hvm_vmx.cpu_cr3) {
2107 /*
2108 * This is a simple TLB flush, implying the guest has
2109 * removed some translation or changed page attributes.
2110 * We simply invalidate the shadow.
2111 */
2112 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
2113 if (mfn != pagetable_get_pfn(v->arch.guest_table))
2114 goto bad_cr3;
2115 paging_update_cr3(v);
2116 } else {
2117 /*
2118 * If different, make a shadow. Check if the PDBR is valid
2119 * first.
2120 */
2121 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
2122 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
2123 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
2124 goto bad_cr3;
2125 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
2126 v->arch.guest_table = pagetable_from_pfn(mfn);
2127 if (old_base_mfn)
2128 put_page(mfn_to_page(old_base_mfn));
2129 v->arch.hvm_vmx.cpu_cr3 = value;
2130 update_cr3(v);
2131 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
2132 }
2133 break;
2135 case 4: /* CR4 */
2136 old_cr = v->arch.hvm_vmx.cpu_shadow_cr4;
2138 if ( (value & X86_CR4_PAE) && !(old_cr & X86_CR4_PAE) )
2139 {
2140 if ( vmx_pgbit_test(v) )
2141 {
2142 /* The guest is a 32-bit PAE guest. */
2143 #if CONFIG_PAGING_LEVELS >= 3
2144 unsigned long mfn, old_base_mfn;
2145 mfn = get_mfn_from_gpfn(v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT);
2146 if ( !mfn_valid(mfn) ||
2147 !get_page(mfn_to_page(mfn), v->domain) )
2148 goto bad_cr3;
2150 /*
2151 * Now arch.guest_table points to machine physical.
2152 */
2154 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
2155 v->arch.guest_table = pagetable_from_pfn(mfn);
2156 if ( old_base_mfn )
2157 put_page(mfn_to_page(old_base_mfn));
2159 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
2160 (unsigned long) (mfn << PAGE_SHIFT));
2162 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2163 "Update CR3 value = %lx, mfn = %lx",
2164 v->arch.hvm_vmx.cpu_cr3, mfn);
2165 #endif
2166 }
2167 }
2168 else if ( !(value & X86_CR4_PAE) )
2169 {
2170 if ( unlikely(vmx_long_mode_enabled(v)) )
2171 {
2172 HVM_DBG_LOG(DBG_LEVEL_1, "Guest cleared CR4.PAE while "
2173 "EFER.LMA is set\n");
2174 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2175 }
2176 }
2178 __vmwrite(GUEST_CR4, value | VMX_CR4_HOST_MASK);
2179 v->arch.hvm_vmx.cpu_shadow_cr4 = value;
2180 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);
2182 /*
2183 * Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates
2184 * all TLB entries except global entries.
2185 */
2186 if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
2187 paging_update_paging_modes(v);
2188 break;
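/*
 * CR8 is the task-priority register: its low four bits correspond to
 * bits 7:4 of the local APIC TPR. For example, a guest "mov $0x9,%cr8"
 * stores 0x90 into APIC_TASKPRI below; mov_from_cr() reverses the shift
 * on reads.
 */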
2190 case 8:
2191 vlapic_set_reg(vlapic, APIC_TASKPRI, ((value & 0x0F) << 4));
2192 break;
2194 default:
2195 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
2196 domain_crash(v->domain);
2197 return 0;
2198 }
2200 return 1;
2202 bad_cr3:
2203 gdprintk(XENLOG_ERR, "Invalid CR3\n");
2204 exit_and_crash:
2205 domain_crash(v->domain);
2206 return 0;
2207 }
2209 /*
2210 * Read from control registers. CR0 and CR4 are read from the shadow.
2211 */
2212 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
2213 {
2214 unsigned long value = 0;
2215 struct vcpu *v = current;
2216 struct vlapic *vlapic = vcpu_vlapic(v);
2218 switch ( cr )
2219 {
2220 case 3:
2221 value = (unsigned long)v->arch.hvm_vmx.cpu_cr3;
2222 break;
2223 case 8:
2224 value = (unsigned long)vlapic_get_reg(vlapic, APIC_TASKPRI);
2225 value = (value & 0xF0) >> 4;
2226 break;
2227 default:
2228 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
2229 domain_crash(v->domain);
2230 break;
2231 }
2233 switch ( gp ) {
2234 CASE_SET_REG(EAX, eax);
2235 CASE_SET_REG(ECX, ecx);
2236 CASE_SET_REG(EDX, edx);
2237 CASE_SET_REG(EBX, ebx);
2238 CASE_SET_REG(EBP, ebp);
2239 CASE_SET_REG(ESI, esi);
2240 CASE_SET_REG(EDI, edi);
2241 CASE_EXTEND_SET_REG;
2242 case REG_ESP:
2243 __vmwrite(GUEST_RSP, value);
2244 regs->esp = value;
2245 break;
2246 default:
2247 printk("invalid gp: %d\n", gp);
2248 domain_crash(v->domain);
2249 break;
2250 }
2252 HVMTRACE_2D(CR_READ, v, cr, value);
2254 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR%d, value = %lx", cr, value);
2255 }
2257 static int vmx_cr_access(unsigned long exit_qualification,
2258 struct cpu_user_regs *regs)
2259 {
2260 unsigned int gp, cr;
2261 unsigned long value;
2262 struct vcpu *v = current;
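/*
 * Per the Intel SDM, the CR-access exit qualification encodes the CR
 * number in bits 3:0, the access type (MOV to CR, MOV from CR, CLTS,
 * LMSW) in bits 5:4, the general-purpose register in bits 11:8, and the
 * LMSW source operand in bits 31:16. For example, "mov %ecx,%cr0"
 * produces a qualification of 0x100; the masks below extract each field.
 */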
2264 switch (exit_qualification & CONTROL_REG_ACCESS_TYPE) {
2265 case TYPE_MOV_TO_CR:
2266 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
2267 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
2268 return mov_to_cr(gp, cr, regs);
2269 case TYPE_MOV_FROM_CR:
2270 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
2271 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
2272 mov_from_cr(cr, gp, regs);
2273 break;
2274 case TYPE_CLTS:
2275 /* We initialise the FPU now, to avoid needing another vmexit. */
2276 setup_fpu(v);
2277 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
2279 v->arch.hvm_vmx.cpu_cr0 &= ~X86_CR0_TS; /* clear TS */
2280 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
2282 v->arch.hvm_vmx.cpu_shadow_cr0 &= ~X86_CR0_TS; /* clear TS */
2283 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
2284 break;
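/*
 * LMSW only affects the low four bits of CR0 (PE, MP, EM, TS). The
 * 16-bit source operand arrives in bits 31:16 of the exit
 * qualification, so the LMSW case shifts it down, keeps the low nibble
 * and merges it into the current CR0 shadow before calling
 * vmx_set_cr0().
 */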
2285 case TYPE_LMSW:
2286 value = v->arch.hvm_vmx.cpu_shadow_cr0;
2287 value = (value & ~0xF) |
2288 (((exit_qualification & LMSW_SOURCE_DATA) >> 16) & 0xF);
2289 return vmx_set_cr0(value);
2290 default:
2291 BUG();
2292 }
2294 return 1;
2295 }
2297 static inline int vmx_do_msr_read(struct cpu_user_regs *regs)
2298 {
2299 u64 msr_content = 0;
2300 u32 ecx = regs->ecx, eax, edx;
2301 struct vcpu *v = current;
2303 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x, eax=%x, edx=%x",
2304 ecx, (u32)regs->eax, (u32)regs->edx);
2306 switch (ecx) {
2307 case MSR_IA32_TIME_STAMP_COUNTER:
2308 msr_content = hvm_get_guest_time(v);
2309 break;
2310 case MSR_IA32_SYSENTER_CS:
2311 msr_content = (u32)__vmread(GUEST_SYSENTER_CS);
2312 break;
2313 case MSR_IA32_SYSENTER_ESP:
2314 msr_content = __vmread(GUEST_SYSENTER_ESP);
2315 break;
2316 case MSR_IA32_SYSENTER_EIP:
2317 msr_content = __vmread(GUEST_SYSENTER_EIP);
2318 break;
2319 case MSR_IA32_APICBASE:
2320 msr_content = vcpu_vlapic(v)->hw.apic_base_msr;
2321 break;
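/*
 * MSRs not handled above fall through a chain of fallbacks: the
 * long-mode MSRs (EFER, STAR, ...), then Xen's virtualised MSR range
 * via rdmsr_hypervisor_regs(), then the physical MSR via rdmsr_safe().
 * Only if all of these fail is #GP(0) injected into the guest.
 */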
2322 default:
2323 if ( long_mode_do_msr_read(regs) )
2324 goto done;
2326 if ( rdmsr_hypervisor_regs(ecx, &eax, &edx) ||
2327 rdmsr_safe(ecx, eax, edx) == 0 )
2328 {
2329 regs->eax = eax;
2330 regs->edx = edx;
2331 goto done;
2332 }
2333 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2334 return 0;
2335 }
2337 regs->eax = msr_content & 0xFFFFFFFF;
2338 regs->edx = msr_content >> 32;
2340 done:
2341 HVMTRACE_2D(MSR_READ, v, ecx, msr_content);
2342 HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx",
2343 ecx, (unsigned long)regs->eax,
2344 (unsigned long)regs->edx);
2345 return 1;
2346 }
2348 static inline int vmx_do_msr_write(struct cpu_user_regs *regs)
2349 {
2350 u32 ecx = regs->ecx;
2351 u64 msr_content;
2352 struct vcpu *v = current;
2354 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x, eax=%x, edx=%x",
2355 ecx, (u32)regs->eax, (u32)regs->edx);
2357 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
2358 HVMTRACE_2D(MSR_WRITE, v, ecx, msr_content);
2360 switch (ecx) {
2361 case MSR_IA32_TIME_STAMP_COUNTER:
2362 hvm_set_guest_time(v, msr_content);
2363 pt_reset(v);
2364 break;
2365 case MSR_IA32_SYSENTER_CS:
2366 __vmwrite(GUEST_SYSENTER_CS, msr_content);
2367 break;
2368 case MSR_IA32_SYSENTER_ESP:
2369 __vmwrite(GUEST_SYSENTER_ESP, msr_content);
2370 break;
2371 case MSR_IA32_SYSENTER_EIP:
2372 __vmwrite(GUEST_SYSENTER_EIP, msr_content);
2373 break;
2374 case MSR_IA32_APICBASE:
2375 vlapic_msr_set(vcpu_vlapic(v), msr_content);
2376 break;
2377 default:
2378 if ( !long_mode_do_msr_write(regs) )
2379 wrmsr_hypervisor_regs(ecx, regs->eax, regs->edx);
2380 break;
2381 }
2383 return 1;
2384 }
2386 static void vmx_do_hlt(void)
2387 {
2388 unsigned long rflags;
2389 HVMTRACE_0D(HLT, current);
2390 rflags = __vmread(GUEST_RFLAGS);
2391 hvm_hlt(rflags);
2392 }
2394 static inline void vmx_do_extint(struct cpu_user_regs *regs)
2395 {
2396 unsigned int vector;
2398 asmlinkage void do_IRQ(struct cpu_user_regs *);
2399 fastcall void smp_apic_timer_interrupt(struct cpu_user_regs *);
2400 fastcall void smp_event_check_interrupt(void);
2401 fastcall void smp_invalidate_interrupt(void);
2402 fastcall void smp_call_function_interrupt(void);
2403 fastcall void smp_spurious_interrupt(struct cpu_user_regs *regs);
2404 fastcall void smp_error_interrupt(struct cpu_user_regs *regs);
2405 #ifdef CONFIG_X86_MCE_P4THERMAL
2406 fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs);
2407 #endif
2409 vector = __vmread(VM_EXIT_INTR_INFO);
2410 BUG_ON(!(vector & INTR_INFO_VALID_MASK));
2412 vector &= INTR_INFO_VECTOR_MASK;
2413 HVMTRACE_1D(INTR, current, vector);
2415 switch(vector) {
2416 case LOCAL_TIMER_VECTOR:
2417 smp_apic_timer_interrupt(regs);
2418 break;
2419 case EVENT_CHECK_VECTOR:
2420 smp_event_check_interrupt();
2421 break;
2422 case INVALIDATE_TLB_VECTOR:
2423 smp_invalidate_interrupt();
2424 break;
2425 case CALL_FUNCTION_VECTOR:
2426 smp_call_function_interrupt();
2427 break;
2428 case SPURIOUS_APIC_VECTOR:
2429 smp_spurious_interrupt(regs);
2430 break;
2431 case ERROR_APIC_VECTOR:
2432 smp_error_interrupt(regs);
2433 break;
2434 #ifdef CONFIG_X86_MCE_P4THERMAL
2435 case THERMAL_APIC_VECTOR:
2436 smp_thermal_interrupt(regs);
2437 break;
2438 #endif
2439 default:
2440 regs->entry_vector = vector;
2441 do_IRQ(regs);
2442 break;
2443 }
2444 }
2446 static void vmx_reflect_exception(struct vcpu *v)
2447 {
2448 int error_code, intr_info, vector;
2450 intr_info = __vmread(VM_EXIT_INTR_INFO);
2451 vector = intr_info & 0xff;
2452 if ( intr_info & INTR_INFO_DELIVER_CODE_MASK )
2453 error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
2454 else
2455 error_code = VMX_DELIVER_NO_ERROR_CODE;
2457 #ifndef NDEBUG
2458 {
2459 unsigned long rip;
2461 rip = __vmread(GUEST_RIP);
2462 HVM_DBG_LOG(DBG_LEVEL_1, "rip = %lx, error_code = %x",
2463 rip, error_code);
2464 }
2465 #endif /* NDEBUG */
2467 /*
2468 * According to Intel Virtualization Technology Specification for
2469 * the IA-32 Intel Architecture (C97063-002 April 2005), section
2470 * 2.8.3, SW_EXCEPTION should be used for #BP and #OF, and
2471 * HW_EXCEPTION used for everything else. The main difference
2472 * appears to be that for SW_EXCEPTION, the EIP/RIP is incremented
2473 * by VM_ENTRY_INSTRUCTION_LEN bytes, whereas for HW_EXCEPTION,
2474 * it is not.
2475 */
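/*
 * Concretely: a guest INT3 (#BP) arrives as a software exception with
 * an instruction length of one byte, so re-injecting it via
 * vmx_inject_sw_exception() resumes the guest after the 0xCC opcode,
 * just as on bare metal.
 */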
2476 if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_SW_EXCEPTION )
2477 {
2478 int ilen = __get_instruction_length(); /* Safe: software exception */
2479 vmx_inject_sw_exception(v, vector, ilen);
2480 }
2481 else
2482 {
2483 vmx_inject_hw_exception(v, vector, error_code);
2484 }
2485 }
2487 static void vmx_failed_vmentry(unsigned int exit_reason)
2488 {
2489 unsigned int failed_vmentry_reason = (uint16_t)exit_reason;
2490 unsigned long exit_qualification;
2492 exit_qualification = __vmread(EXIT_QUALIFICATION);
2493 printk("Failed vm entry (exit reason 0x%x) ", exit_reason);
2494 switch ( failed_vmentry_reason )
2495 {
2496 case EXIT_REASON_INVALID_GUEST_STATE:
2497 printk("caused by invalid guest state (%ld).\n", exit_qualification);
2498 break;
2499 case EXIT_REASON_MSR_LOADING:
2500 printk("caused by MSR entry %ld loading.\n", exit_qualification);
2501 break;
2502 case EXIT_REASON_MACHINE_CHECK:
2503 printk("caused by machine check.\n");
2504 break;
2505 default:
2506 printk("reason not known yet!\n");
2507 break;
2508 }
2510 printk("************* VMCS Area **************\n");
2511 vmcs_dump_vcpu();
2512 printk("**************************************\n");
2514 domain_crash(current->domain);
2515 }
2517 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
2518 {
2519 unsigned int exit_reason;
2520 unsigned long exit_qualification, inst_len = 0;
2521 struct vcpu *v = current;
2523 exit_reason = __vmread(VM_EXIT_REASON);
2525 HVMTRACE_2D(VMEXIT, v, __vmread(GUEST_RIP), exit_reason);
2527 perfc_incra(vmexits, exit_reason);
2529 if ( exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT )
2530 local_irq_enable();
2532 if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
2533 return vmx_failed_vmentry(exit_reason);
2535 switch ( exit_reason )
2536 {
2537 case EXIT_REASON_EXCEPTION_NMI:
2538 {
2539 /*
2540 * We do not enable software-interrupt (INT n) exiting, so an exit
2541 * here means either (1) an exception (e.g. #PF) in the guest, or
2542 * (2) an NMI.
2543 */
2544 unsigned int intr_info, vector;
2546 intr_info = __vmread(VM_EXIT_INTR_INFO);
2547 BUG_ON(!(intr_info & INTR_INFO_VALID_MASK));
2549 vector = intr_info & INTR_INFO_VECTOR_MASK;
2551 perfc_incra(cause_vector, vector);
2553 switch ( vector )
2554 {
2555 case TRAP_debug:
2556 case TRAP_int3:
2557 if ( !v->domain->debugger_attached )
2558 goto exit_and_crash;
2559 domain_pause_for_debugger();
2560 break;
2561 case TRAP_no_device:
2562 vmx_do_no_device_fault();
2563 break;
2564 case TRAP_page_fault:
2565 exit_qualification = __vmread(EXIT_QUALIFICATION);
2566 regs->error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
2568 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2569 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
2570 (unsigned long)regs->eax, (unsigned long)regs->ebx,
2571 (unsigned long)regs->ecx, (unsigned long)regs->edx,
2572 (unsigned long)regs->esi, (unsigned long)regs->edi);
2574 if ( paging_fault(exit_qualification, regs) )
2575 {
2576 HVMTRACE_2D(PF_XEN, v, exit_qualification, regs->error_code);
2577 break;
2578 }
2580 v->arch.hvm_vmx.cpu_cr2 = exit_qualification;
2581 vmx_inject_hw_exception(v, TRAP_page_fault, regs->error_code);
2582 break;
2583 case TRAP_nmi:
2584 HVMTRACE_0D(NMI, v);
2585 if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI )
2586 do_nmi(regs); /* Real NMI, vector 2: normal processing. */
2587 else
2588 vmx_reflect_exception(v);
2589 break;
2590 default:
2591 goto exit_and_crash;
2592 }
2593 break;
2594 }
2595 case EXIT_REASON_EXTERNAL_INTERRUPT:
2596 vmx_do_extint(regs);
2597 break;
2598 case EXIT_REASON_TRIPLE_FAULT:
2599 hvm_triple_fault();
2600 break;
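/*
 * An interrupt-window exit fires as soon as the guest can accept an
 * interrupt. Xen enables it only while it has a pending interrupt it
 * could not inject immediately, so the handler below simply switches
 * the window back off; the actual injection happens on the next VM
 * entry.
 */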
2601 case EXIT_REASON_PENDING_INTERRUPT:
2602 /* Disable the interrupt window. */
2603 v->arch.hvm_vcpu.u.vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2604 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2605 v->arch.hvm_vcpu.u.vmx.exec_control);
2606 break;
2607 case EXIT_REASON_TASK_SWITCH:
2608 goto exit_and_crash;
2609 case EXIT_REASON_CPUID:
2610 inst_len = __get_instruction_length(); /* Safe: CPUID */
2611 __update_guest_eip(inst_len);
2612 vmx_do_cpuid(regs);
2613 break;
2614 case EXIT_REASON_HLT:
2615 inst_len = __get_instruction_length(); /* Safe: HLT */
2616 __update_guest_eip(inst_len);
2617 vmx_do_hlt();
2618 break;
2619 case EXIT_REASON_INVLPG:
2620 {
2621 inst_len = __get_instruction_length(); /* Safe: INVLPG */
2622 __update_guest_eip(inst_len);
2623 exit_qualification = __vmread(EXIT_QUALIFICATION);
2624 vmx_do_invlpg(exit_qualification);
2625 break;
2626 }
2627 case EXIT_REASON_VMCALL:
2628 {
2629 HVMTRACE_1D(VMMCALL, v, regs->eax);
2630 inst_len = __get_instruction_length(); /* Safe: VMCALL */
2631 if ( !hvm_do_hypercall(regs) )
2632 __update_guest_eip(inst_len); /* not preempted */
2633 break;
2634 }
2635 case EXIT_REASON_CR_ACCESS:
2636 {
2637 exit_qualification = __vmread(EXIT_QUALIFICATION);
2638 inst_len = __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
2639 if ( vmx_cr_access(exit_qualification, regs) )
2640 __update_guest_eip(inst_len);
2641 break;
2642 }
2643 case EXIT_REASON_DR_ACCESS:
2644 exit_qualification = __vmread(EXIT_QUALIFICATION);
2645 vmx_dr_access(exit_qualification, regs);
2646 break;
2647 case EXIT_REASON_IO_INSTRUCTION:
2648 exit_qualification = __vmread(EXIT_QUALIFICATION);
2649 inst_len = __get_instruction_length(); /* Safe: IN, INS, OUT, OUTS */
2650 vmx_io_instruction(exit_qualification, inst_len);
2651 break;
2652 case EXIT_REASON_MSR_READ:
2653 inst_len = __get_instruction_length(); /* Safe: RDMSR */
2654 if ( vmx_do_msr_read(regs) )
2655 __update_guest_eip(inst_len);
2656 break;
2657 case EXIT_REASON_MSR_WRITE:
2658 inst_len = __get_instruction_length(); /* Safe: WRMSR */
2659 if ( vmx_do_msr_write(regs) )
2660 __update_guest_eip(inst_len);
2661 break;
2662 case EXIT_REASON_MWAIT_INSTRUCTION:
2663 case EXIT_REASON_MONITOR_INSTRUCTION:
2664 case EXIT_REASON_PAUSE_INSTRUCTION:
2665 goto exit_and_crash;
2666 case EXIT_REASON_VMCLEAR:
2667 case EXIT_REASON_VMLAUNCH:
2668 case EXIT_REASON_VMPTRLD:
2669 case EXIT_REASON_VMPTRST:
2670 case EXIT_REASON_VMREAD:
2671 case EXIT_REASON_VMRESUME:
2672 case EXIT_REASON_VMWRITE:
2673 case EXIT_REASON_VMXOFF:
2674 case EXIT_REASON_VMXON:
2675 /* Report an invalid-opcode exception when a VMX guest tries to execute
2676 any of the VMX instructions. */
2677 vmx_inject_hw_exception(v, TRAP_invalid_op, VMX_DELIVER_NO_ERROR_CODE);
2678 break;
2680 case EXIT_REASON_TPR_BELOW_THRESHOLD:
2681 break;
2683 default:
2684 exit_and_crash:
2685 gdprintk(XENLOG_ERR, "Bad vmexit (reason %x)\n", exit_reason);
2686 domain_crash(v->domain);
2687 break;
2688 }
2689 }
2691 asmlinkage void vmx_trace_vmentry(void)
2692 {
2693 struct vcpu *v = current;
2694 HVMTRACE_0D(VMENTRY, v);
2695 }
2697 /*
2698 * Local variables:
2699 * mode: C
2700 * c-set-style: "BSD"
2701 * c-basic-offset: 4
2702 * tab-width: 4
2703 * indent-tabs-mode: nil
2704 * End:
2705 */