xen/arch/x86/hvm/vmx/vmx.c @ 12599:93e657836d07 (ia64/xen-unstable)

[XEN] Remove VALID_MFN(); replace uses with mfn_valid().
Signed-off-by: Keir Fraser <keir@xensource.com>
author kfraser@localhost.localdomain
date Mon Nov 27 17:48:24 2006 +0000 (2006-11-27)
parents 6d892ea6194d
children 62b0b520ea53
1 /*
2 * vmx.c: handling VMX architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 *
18 */
20 #include <xen/config.h>
21 #include <xen/init.h>
22 #include <xen/lib.h>
23 #include <xen/trace.h>
24 #include <xen/sched.h>
25 #include <xen/irq.h>
26 #include <xen/softirq.h>
27 #include <xen/domain_page.h>
28 #include <xen/hypercall.h>
29 #include <xen/perfc.h>
30 #include <asm/current.h>
31 #include <asm/io.h>
32 #include <asm/regs.h>
33 #include <asm/cpufeature.h>
34 #include <asm/processor.h>
35 #include <asm/types.h>
36 #include <asm/msr.h>
37 #include <asm/spinlock.h>
38 #include <asm/hvm/hvm.h>
39 #include <asm/hvm/support.h>
40 #include <asm/hvm/vmx/vmx.h>
41 #include <asm/hvm/vmx/vmcs.h>
42 #include <asm/hvm/vmx/cpu.h>
43 #include <asm/shadow.h>
44 #include <public/sched.h>
45 #include <public/hvm/ioreq.h>
46 #include <asm/hvm/vpic.h>
47 #include <asm/hvm/vlapic.h>
48 #include <asm/x86_emulate.h>
50 static void vmx_ctxt_switch_from(struct vcpu *v);
51 static void vmx_ctxt_switch_to(struct vcpu *v);
53 static int vmx_vcpu_initialise(struct vcpu *v)
54 {
55 int rc;
57 spin_lock_init(&v->arch.hvm_vmx.vmcs_lock);
59 v->arch.schedule_tail = arch_vmx_do_resume;
60 v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
61 v->arch.ctxt_switch_to = vmx_ctxt_switch_to;
63 if ( (rc = vmx_create_vmcs(v)) != 0 )
64 {
65 dprintk(XENLOG_WARNING,
66 "Failed to create VMCS for vcpu %d: err=%d.\n",
67 v->vcpu_id, rc);
68 return rc;
69 }
71 return 0;
72 }
74 static void vmx_vcpu_destroy(struct vcpu *v)
75 {
76 vmx_destroy_vmcs(v);
77 }
79 #ifdef __x86_64__
81 static DEFINE_PER_CPU(struct vmx_msr_state, host_msr_state);
83 static u32 msr_index[VMX_MSR_COUNT] =
84 {
85 MSR_LSTAR, MSR_STAR, MSR_CSTAR,
86 MSR_SYSCALL_MASK, MSR_EFER,
87 };
89 static void vmx_save_host_msrs(void)
90 {
91 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
92 int i;
94 for ( i = 0; i < VMX_MSR_COUNT; i++ )
95 rdmsrl(msr_index[i], host_msr_state->msrs[i]);
96 }
98 #define CASE_READ_MSR(address) \
99 case MSR_ ## address: \
100 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_ ## address]; \
101 break
103 #define CASE_WRITE_MSR(address) \
104 case MSR_ ## address: \
105 guest_msr_state->msrs[VMX_INDEX_MSR_ ## address] = msr_content; \
106 if ( !test_bit(VMX_INDEX_MSR_ ## address, &guest_msr_state->flags) )\
107 set_bit(VMX_INDEX_MSR_ ## address, &guest_msr_state->flags); \
108 wrmsrl(MSR_ ## address, msr_content); \
109 set_bit(VMX_INDEX_MSR_ ## address, &host_msr_state->flags); \
110 break
112 #define IS_CANO_ADDRESS(add) 1
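/*
 * NB: the macro above is a placeholder -- it treats every address as
 * canonical.  A real check would require bits 63:48 of the value to be a
 * sign-extension of bit 47 (the 48-bit virtual-address limit of current
 * x86-64 implementations).
 */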
113 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
114 {
115 u64 msr_content = 0;
116 struct vcpu *v = current;
117 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
119 switch ( regs->ecx ) {
120 case MSR_EFER:
121 HVM_DBG_LOG(DBG_LEVEL_2, "EFER msr_content 0x%"PRIx64, msr_content);
122 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_EFER];
123 break;
125 case MSR_FS_BASE:
126 if ( !(vmx_long_mode_enabled(v)) )
127 goto exit_and_crash;
129 msr_content = __vmread(GUEST_FS_BASE);
130 break;
132 case MSR_GS_BASE:
133 if ( !(vmx_long_mode_enabled(v)) )
134 goto exit_and_crash;
136 msr_content = __vmread(GUEST_GS_BASE);
137 break;
139 case MSR_SHADOW_GS_BASE:
140 msr_content = guest_msr_state->shadow_gs;
141 break;
143 CASE_READ_MSR(STAR);
144 CASE_READ_MSR(LSTAR);
145 CASE_READ_MSR(CSTAR);
146 CASE_READ_MSR(SYSCALL_MASK);
148 default:
149 return 0;
150 }
152 HVM_DBG_LOG(DBG_LEVEL_2, "msr_content: 0x%"PRIx64, msr_content);
154 regs->eax = (u32)(msr_content >> 0);
155 regs->edx = (u32)(msr_content >> 32);
157 return 1;
159 exit_and_crash:
160 gdprintk(XENLOG_ERR, "Fatal error reading MSR %lx\n", (long)regs->ecx);
161 domain_crash(v->domain);
162 return 1; /* handled */
163 }
165 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
166 {
167 u64 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
168 struct vcpu *v = current;
169 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
170 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
172 HVM_DBG_LOG(DBG_LEVEL_1, "msr 0x%lx msr_content 0x%"PRIx64"\n",
173 (unsigned long)regs->ecx, msr_content);
175 switch ( regs->ecx ) {
176 case MSR_EFER:
177 /* offending reserved bit will cause #GP */
178 if ( msr_content & ~(EFER_LME | EFER_LMA | EFER_NX | EFER_SCE) )
179 {
180 printk("Trying to set reserved bit in EFER: %"PRIx64"\n",
181 msr_content);
182 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
183 return 0;
184 }
186 if ( (msr_content & EFER_LME)
187 && !(guest_msr_state->msrs[VMX_INDEX_MSR_EFER] & EFER_LME) )
188 {
189 if ( unlikely(vmx_paging_enabled(v)) )
190 {
191 printk("Trying to set EFER.LME with paging enabled\n");
192 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
193 return 0;
194 }
195 }
196 else if ( !(msr_content & EFER_LME)
197 && (guest_msr_state->msrs[VMX_INDEX_MSR_EFER] & EFER_LME) )
198 {
199 if ( unlikely(vmx_paging_enabled(v)) )
200 {
201 printk("Trying to clear EFER.LME with paging enabled\n");
202 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
203 return 0;
204 }
205 }
207 guest_msr_state->msrs[VMX_INDEX_MSR_EFER] = msr_content;
208 break;
210 case MSR_FS_BASE:
211 case MSR_GS_BASE:
212 if ( !vmx_long_mode_enabled(v) )
213 goto exit_and_crash;
215 if ( !IS_CANO_ADDRESS(msr_content) )
216 {
217 HVM_DBG_LOG(DBG_LEVEL_1, "Non-canonical address in MSR write\n");
218 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
219 return 0;
220 }
222 if ( regs->ecx == MSR_FS_BASE )
223 __vmwrite(GUEST_FS_BASE, msr_content);
224 else
225 __vmwrite(GUEST_GS_BASE, msr_content);
227 break;
229 case MSR_SHADOW_GS_BASE:
230 if ( !(vmx_long_mode_enabled(v)) )
231 goto exit_and_crash;
233 v->arch.hvm_vmx.msr_state.shadow_gs = msr_content;
234 wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
235 break;
237 CASE_WRITE_MSR(STAR);
238 CASE_WRITE_MSR(LSTAR);
239 CASE_WRITE_MSR(CSTAR);
240 CASE_WRITE_MSR(SYSCALL_MASK);
242 default:
243 return 0;
244 }
246 return 1;
248 exit_and_crash:
249 gdprintk(XENLOG_ERR, "Fatal error writing MSR %lx\n", (long)regs->ecx);
250 domain_crash(v->domain);
251 return 1; /* handled */
252 }
254 /*
255 * To avoid MSR save/restore at every VM exit/entry time, we restore
256 * the x86_64 specific MSRs at domain switch time. Since these MSRs
257 * are not modified once set for para domains, we don't save them,
258 * but simply reset them to values set in percpu_traps_init().
259 */
260 static void vmx_restore_host_msrs(void)
261 {
262 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
263 int i;
265 while ( host_msr_state->flags )
266 {
267 i = find_first_set_bit(host_msr_state->flags);
268 wrmsrl(msr_index[i], host_msr_state->msrs[i]);
269 clear_bit(i, &host_msr_state->flags);
270 }
271 }
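/*
 * Worked example (assuming VMX_INDEX_MSR_LSTAR == 0 and
 * VMX_INDEX_MSR_STAR == 1, matching the msr_index[] table above): if the
 * guest has only written LSTAR and STAR, host_msr_state->flags is 0x3 and
 * the loop above rewrites exactly those two MSRs from the values captured
 * by vmx_save_host_msrs(), leaving the others untouched.
 */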
273 static void vmx_restore_guest_msrs(struct vcpu *v)
274 {
275 struct vmx_msr_state *guest_msr_state, *host_msr_state;
276 unsigned long guest_flags;
277 int i;
279 guest_msr_state = &v->arch.hvm_vmx.msr_state;
280 host_msr_state = &this_cpu(host_msr_state);
282 wrmsrl(MSR_SHADOW_GS_BASE, guest_msr_state->shadow_gs);
284 guest_flags = guest_msr_state->flags;
285 if ( !guest_flags )
286 return;
288 while ( guest_flags ) {
289 i = find_first_set_bit(guest_flags);
291 HVM_DBG_LOG(DBG_LEVEL_2,
292 "restore guest's index %d msr %x with value %lx",
293 i, msr_index[i], guest_msr_state->msrs[i]);
294 set_bit(i, &host_msr_state->flags);
295 wrmsrl(msr_index[i], guest_msr_state->msrs[i]);
296 clear_bit(i, &guest_flags);
297 }
298 }
300 #else /* __i386__ */
302 #define vmx_save_host_msrs() ((void)0)
303 #define vmx_restore_host_msrs() ((void)0)
304 #define vmx_restore_guest_msrs(v) ((void)0)
306 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
307 {
308 return 0;
309 }
311 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
312 {
313 return 0;
314 }
316 #endif /* __i386__ */
318 #define loaddebug(_v,_reg) \
319 __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
320 #define savedebug(_v,_reg) \
321 __asm__ __volatile__ ("mov %%db" #_reg ",%0" : "=r" ((_v)->debugreg[_reg]))
323 static inline void vmx_save_dr(struct vcpu *v)
324 {
325 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
326 return;
328 /* Clear the DR dirty flag and re-enable intercepts for DR accesses. */
329 v->arch.hvm_vcpu.flag_dr_dirty = 0;
330 v->arch.hvm_vcpu.u.vmx.exec_control |= CPU_BASED_MOV_DR_EXITING;
331 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vcpu.u.vmx.exec_control);
333 savedebug(&v->arch.guest_context, 0);
334 savedebug(&v->arch.guest_context, 1);
335 savedebug(&v->arch.guest_context, 2);
336 savedebug(&v->arch.guest_context, 3);
337 savedebug(&v->arch.guest_context, 6);
338 v->arch.guest_context.debugreg[7] = __vmread(GUEST_DR7);
339 }
341 static inline void __restore_debug_registers(struct vcpu *v)
342 {
343 loaddebug(&v->arch.guest_context, 0);
344 loaddebug(&v->arch.guest_context, 1);
345 loaddebug(&v->arch.guest_context, 2);
346 loaddebug(&v->arch.guest_context, 3);
347 /* No 4 and 5 */
348 loaddebug(&v->arch.guest_context, 6);
349 /* DR7 is loaded from the VMCS. */
350 }
352 /*
353 * DR7 is saved and restored on every vmexit. Other debug registers only
354 * need to be restored if their value is going to affect execution -- i.e.,
355 * if one of the breakpoints is enabled. So mask out all bits that don't
356 * enable some breakpoint functionality.
357 */
358 #define DR7_ACTIVE_MASK 0xff
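/*
 * The low 8 bits of DR7 are the per-breakpoint enable bits
 * (L0/G0 .. L3/G3), so a mask of 0xff covers every local and global
 * breakpoint enable.
 */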
360 static inline void vmx_restore_dr(struct vcpu *v)
361 {
362 /* NB. __vmread() is not usable here, so we cannot read from the VMCS. */
363 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
364 __restore_debug_registers(v);
365 }
367 static void vmx_ctxt_switch_from(struct vcpu *v)
368 {
369 hvm_freeze_time(v);
371 /* NB. MSR_SHADOW_GS_BASE may be changed by the swapgs instruction in the guest,
372 * so we must save it. */
373 rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.msr_state.shadow_gs);
375 vmx_restore_host_msrs();
376 vmx_save_dr(v);
377 }
379 static void vmx_ctxt_switch_to(struct vcpu *v)
380 {
381 vmx_restore_guest_msrs(v);
382 vmx_restore_dr(v);
383 }
385 static void stop_vmx(void)
386 {
387 if ( !(read_cr4() & X86_CR4_VMXE) )
388 return;
390 __vmxoff();
391 clear_in_cr4(X86_CR4_VMXE);
392 }
394 static void vmx_store_cpu_guest_regs(
395 struct vcpu *v, struct cpu_user_regs *regs, unsigned long *crs)
396 {
397 vmx_vmcs_enter(v);
399 if ( regs != NULL )
400 {
401 regs->eflags = __vmread(GUEST_RFLAGS);
402 regs->ss = __vmread(GUEST_SS_SELECTOR);
403 regs->cs = __vmread(GUEST_CS_SELECTOR);
404 regs->ds = __vmread(GUEST_DS_SELECTOR);
405 regs->es = __vmread(GUEST_ES_SELECTOR);
406 regs->gs = __vmread(GUEST_GS_SELECTOR);
407 regs->fs = __vmread(GUEST_FS_SELECTOR);
408 regs->eip = __vmread(GUEST_RIP);
409 regs->esp = __vmread(GUEST_RSP);
410 }
412 if ( crs != NULL )
413 {
414 crs[0] = v->arch.hvm_vmx.cpu_shadow_cr0;
415 crs[2] = v->arch.hvm_vmx.cpu_cr2;
416 crs[3] = __vmread(GUEST_CR3);
417 crs[4] = v->arch.hvm_vmx.cpu_shadow_cr4;
418 }
420 vmx_vmcs_exit(v);
421 }
423 /*
424 * The VMX spec (section 4.3.1.2, Checks on Guest Segment
425 * Registers) says that virtual-8086 mode guests' segment
426 * base-address fields in the VMCS must be equal to their
427 * corresponding segment selector field shifted right by
428 * four bits upon vmentry.
429 *
430 * This function (called only for VM86-mode guests) fixes
431 * the bases to be consistent with the selectors in regs
432 * if they're not already. Without this, we can fail the
433 * vmentry check mentioned above.
434 */
435 static void fixup_vm86_seg_bases(struct cpu_user_regs *regs)
436 {
437 unsigned long base;
439 base = __vmread(GUEST_ES_BASE);
440 if (regs->es << 4 != base)
441 __vmwrite(GUEST_ES_BASE, regs->es << 4);
442 base = __vmread(GUEST_CS_BASE);
443 if (regs->cs << 4 != base)
444 __vmwrite(GUEST_CS_BASE, regs->cs << 4);
445 base = __vmread(GUEST_SS_BASE);
446 if (regs->ss << 4 != base)
447 __vmwrite(GUEST_SS_BASE, regs->ss << 4);
448 base = __vmread(GUEST_DS_BASE);
449 if (regs->ds << 4 != base)
450 __vmwrite(GUEST_DS_BASE, regs->ds << 4);
451 base = __vmread(GUEST_FS_BASE);
452 if (regs->fs << 4 != base)
453 __vmwrite(GUEST_FS_BASE, regs->fs << 4);
454 base = __vmread(GUEST_GS_BASE);
455 if (regs->gs << 4 != base)
456 __vmwrite(GUEST_GS_BASE, regs->gs << 4);
457 }
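/*
 * Example: a VM86 guest running with %cs == 0x1234 must have
 * GUEST_CS_BASE == 0x12340 (selector << 4).  The writes above are skipped
 * whenever the VMCS already holds the architecturally required base, to
 * avoid redundant VMWRITEs.
 */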
459 static void vmx_load_cpu_guest_regs(struct vcpu *v, struct cpu_user_regs *regs)
460 {
461 vmx_vmcs_enter(v);
463 __vmwrite(GUEST_SS_SELECTOR, regs->ss);
464 __vmwrite(GUEST_DS_SELECTOR, regs->ds);
465 __vmwrite(GUEST_ES_SELECTOR, regs->es);
466 __vmwrite(GUEST_GS_SELECTOR, regs->gs);
467 __vmwrite(GUEST_FS_SELECTOR, regs->fs);
469 __vmwrite(GUEST_RSP, regs->esp);
471 /* NB. Bit 1 of RFLAGS must be set for VMENTRY to succeed. */
472 __vmwrite(GUEST_RFLAGS, regs->eflags | 2UL);
473 if (regs->eflags & EF_TF)
474 __vm_set_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
475 else
476 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
477 if (regs->eflags & EF_VM)
478 fixup_vm86_seg_bases(regs);
480 __vmwrite(GUEST_CS_SELECTOR, regs->cs);
481 __vmwrite(GUEST_RIP, regs->eip);
483 vmx_vmcs_exit(v);
484 }
486 static unsigned long vmx_get_ctrl_reg(struct vcpu *v, unsigned int num)
487 {
488 switch ( num )
489 {
490 case 0:
491 return v->arch.hvm_vmx.cpu_cr0;
492 case 2:
493 return v->arch.hvm_vmx.cpu_cr2;
494 case 3:
495 return v->arch.hvm_vmx.cpu_cr3;
496 case 4:
497 return v->arch.hvm_vmx.cpu_shadow_cr4;
498 default:
499 BUG();
500 }
501 return 0; /* dummy */
502 }
504 static unsigned long vmx_get_segment_base(struct vcpu *v, enum segment seg)
505 {
506 unsigned long base;
508 BUG_ON(v != current);
509 switch ( seg )
510 {
511 case seg_cs: base = __vmread(GUEST_CS_BASE); break;
512 case seg_ds: base = __vmread(GUEST_DS_BASE); break;
513 case seg_es: base = __vmread(GUEST_ES_BASE); break;
514 case seg_fs: base = __vmread(GUEST_FS_BASE); break;
515 case seg_gs: base = __vmread(GUEST_GS_BASE); break;
516 case seg_ss: base = __vmread(GUEST_SS_BASE); break;
517 case seg_tr: base = __vmread(GUEST_TR_BASE); break;
518 case seg_gdtr: base = __vmread(GUEST_GDTR_BASE); break;
519 case seg_idtr: base = __vmread(GUEST_IDTR_BASE); break;
520 case seg_ldtr: base = __vmread(GUEST_LDTR_BASE); break;
521 default: BUG(); base = 0; break;
522 }
523 return base;
524 }
526 /* Make sure that xen intercepts any FP accesses from current */
527 static void vmx_stts(struct vcpu *v)
528 {
529 /* VMX depends on operating on the current vcpu */
530 ASSERT(v == current);
532 /*
533 * If the guest does not have TS enabled then we must cause and handle an
534 * exception on first use of the FPU. If the guest *does* have TS enabled
535 * then this is not necessary: no FPU activity can occur until the guest
536 * clears CR0.TS, and we will initialise the FPU when that happens.
537 */
538 if ( !(v->arch.hvm_vmx.cpu_shadow_cr0 & X86_CR0_TS) )
539 {
540 v->arch.hvm_vmx.cpu_cr0 |= X86_CR0_TS;
541 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
542 __vm_set_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
543 }
544 }
546 static void vmx_set_tsc_offset(struct vcpu *v, u64 offset)
547 {
548 vmx_vmcs_enter(v);
549 __vmwrite(TSC_OFFSET, offset);
550 #if defined (__i386__)
551 __vmwrite(TSC_OFFSET_HIGH, offset >> 32);
552 #endif
553 vmx_vmcs_exit(v);
554 }
556 static void vmx_init_ap_context(
557 struct vcpu_guest_context *ctxt, int vcpuid, int trampoline_vector)
558 {
559 memset(ctxt, 0, sizeof(*ctxt));
560 ctxt->user_regs.eip = VMXASSIST_BASE;
561 ctxt->user_regs.edx = vcpuid;
562 ctxt->user_regs.ebx = trampoline_vector;
563 }
565 void do_nmi(struct cpu_user_regs *);
567 static void vmx_init_hypercall_page(struct domain *d, void *hypercall_page)
568 {
569 char *p;
570 int i;
572 memset(hypercall_page, 0, PAGE_SIZE);
574 for ( i = 0; i < (PAGE_SIZE / 32); i++ )
575 {
576 p = (char *)(hypercall_page + (i * 32));
577 *(u8 *)(p + 0) = 0xb8; /* mov imm32, %eax */
578 *(u32 *)(p + 1) = i;
579 *(u8 *)(p + 5) = 0x0f; /* vmcall */
580 *(u8 *)(p + 6) = 0x01;
581 *(u8 *)(p + 7) = 0xc1;
582 *(u8 *)(p + 8) = 0xc3; /* ret */
583 }
585 /* Don't support HYPERVISOR_iret at the moment */
586 *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
587 }
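/*
 * Each 32-byte slot written above disassembles to:
 *     b8 <i> 00 00 00        mov  $<hypercall-number>, %eax
 *     0f 01 c1               vmcall
 *     c3                     ret
 * so a guest issues hypercall <i> with "call hypercall_page + i*32".
 * The __HYPERVISOR_iret slot is replaced by 0f 0b (ud2) since it is
 * unsupported here.
 */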
589 static int vmx_realmode(struct vcpu *v)
590 {
591 unsigned long rflags;
593 ASSERT(v == current);
595 rflags = __vmread(GUEST_RFLAGS);
596 return rflags & X86_EFLAGS_VM;
597 }
599 static int vmx_guest_x86_mode(struct vcpu *v)
600 {
601 unsigned long cs_ar_bytes;
603 ASSERT(v == current);
605 cs_ar_bytes = __vmread(GUEST_CS_AR_BYTES);
607 if ( vmx_long_mode_enabled(v) )
608 return ((cs_ar_bytes & (1u<<13)) ?
609 X86EMUL_MODE_PROT64 : X86EMUL_MODE_PROT32);
611 if ( vmx_realmode(v) )
612 return X86EMUL_MODE_REAL;
614 return ((cs_ar_bytes & (1u<<14)) ?
615 X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16);
616 }
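/*
 * In the CS access-rights field read above, bit 13 is the 'L' (64-bit
 * code segment) flag and bit 14 is the D/B default operation-size flag,
 * which is exactly what the two tests decode.
 */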
618 static int vmx_pae_enabled(struct vcpu *v)
619 {
620 unsigned long cr4 = v->arch.hvm_vmx.cpu_shadow_cr4;
621 return (vmx_paging_enabled(v) && (cr4 & X86_CR4_PAE));
622 }
624 /* Setup HVM interfaces */
625 static void vmx_setup_hvm_funcs(void)
626 {
627 if ( hvm_enabled )
628 return;
630 hvm_funcs.disable = stop_vmx;
632 hvm_funcs.vcpu_initialise = vmx_vcpu_initialise;
633 hvm_funcs.vcpu_destroy = vmx_vcpu_destroy;
635 hvm_funcs.store_cpu_guest_regs = vmx_store_cpu_guest_regs;
636 hvm_funcs.load_cpu_guest_regs = vmx_load_cpu_guest_regs;
638 hvm_funcs.realmode = vmx_realmode;
639 hvm_funcs.paging_enabled = vmx_paging_enabled;
640 hvm_funcs.long_mode_enabled = vmx_long_mode_enabled;
641 hvm_funcs.pae_enabled = vmx_pae_enabled;
642 hvm_funcs.guest_x86_mode = vmx_guest_x86_mode;
643 hvm_funcs.get_guest_ctrl_reg = vmx_get_ctrl_reg;
644 hvm_funcs.get_segment_base = vmx_get_segment_base;
646 hvm_funcs.update_host_cr3 = vmx_update_host_cr3;
648 hvm_funcs.stts = vmx_stts;
649 hvm_funcs.set_tsc_offset = vmx_set_tsc_offset;
651 hvm_funcs.init_ap_context = vmx_init_ap_context;
653 hvm_funcs.init_hypercall_page = vmx_init_hypercall_page;
654 }
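/*
 * hvm_funcs is the vendor-neutral HVM dispatch table; the AMD SVM backend
 * fills in the same hooks on its hardware, so common Xen code never calls
 * the VMX routines directly.
 */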
656 int start_vmx(void)
657 {
658 u32 eax, edx;
659 struct vmcs_struct *vmcs;
661 /*
662 * Xen does not fill x86_capability words except 0.
663 */
664 boot_cpu_data.x86_capability[4] = cpuid_ecx(1);
666 if ( !test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability) )
667 return 0;
669 rdmsr(IA32_FEATURE_CONTROL_MSR, eax, edx);
671 if ( eax & IA32_FEATURE_CONTROL_MSR_LOCK )
672 {
673 if ( (eax & IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON) == 0x0 )
674 {
675 printk("VMX disabled by Feature Control MSR.\n");
676 return 0;
677 }
678 }
679 else
680 {
681 wrmsr(IA32_FEATURE_CONTROL_MSR,
682 IA32_FEATURE_CONTROL_MSR_LOCK |
683 IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON, 0);
684 }
686 set_in_cr4(X86_CR4_VMXE);
688 vmx_init_vmcs_config();
690 if ( smp_processor_id() == 0 )
691 setup_vmcs_dump();
693 if ( (vmcs = vmx_alloc_host_vmcs()) == NULL )
694 {
695 clear_in_cr4(X86_CR4_VMXE);
696 printk("Failed to allocate host VMCS\n");
697 return 0;
698 }
700 if ( __vmxon(virt_to_maddr(vmcs)) )
701 {
702 clear_in_cr4(X86_CR4_VMXE);
703 printk("VMXON failed\n");
704 vmx_free_host_vmcs(vmcs);
705 return 0;
706 }
708 printk("VMXON is done\n");
710 vmx_save_host_msrs();
712 vmx_setup_hvm_funcs();
714 hvm_enabled = 1;
716 return 1;
717 }
719 /*
720 * Not every VM exit provides a valid value in the VM-exit instruction-length field.
721 * Callers must know what they're doing!
722 */
723 static int __get_instruction_length(void)
724 {
725 int len;
726 len = __vmread(VM_EXIT_INSTRUCTION_LEN); /* Safe: callers audited */
727 BUG_ON((len < 1) || (len > 15));
728 return len;
729 }
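/*
 * The 1..15 bound is the architectural limit: no x86 instruction is
 * longer than 15 bytes, so anything outside that range means the
 * instruction-length field was not valid for this exit.
 */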
731 static void inline __update_guest_eip(unsigned long inst_len)
732 {
733 unsigned long current_eip;
735 current_eip = __vmread(GUEST_RIP);
736 __vmwrite(GUEST_RIP, current_eip + inst_len);
737 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
738 }
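/*
 * Zeroing GUEST_INTERRUPTIBILITY_INFO clears any "blocking by STI" /
 * "blocking by MOV SS" state that applied to the instruction just
 * skipped, so a pending interrupt can be delivered at the new RIP.
 */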
740 static int vmx_do_page_fault(unsigned long va, struct cpu_user_regs *regs)
741 {
742 int result;
744 #if 0 /* keep for debugging */
745 {
746 unsigned long eip, cs;
748 cs = __vmread(GUEST_CS_BASE);
749 eip = __vmread(GUEST_RIP);
750 HVM_DBG_LOG(DBG_LEVEL_VMMU,
751 "vmx_do_page_fault = 0x%lx, cs_base=%lx, "
752 "eip = %lx, error_code = %lx\n",
753 va, cs, eip, (unsigned long)regs->error_code);
754 }
755 #endif
757 result = shadow_fault(va, regs);
759 TRACE_VMEXIT(2, result);
760 #if 0
761 if ( !result )
762 {
763 eip = __vmread(GUEST_RIP);
764 printk("vmx pgfault to guest va=%lx eip=%lx\n", va, eip);
765 }
766 #endif
768 return result;
769 }
771 static void vmx_do_no_device_fault(void)
772 {
773 struct vcpu *v = current;
775 setup_fpu(current);
776 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
778 /* Disable TS in guest CR0 unless the guest wants the exception too. */
779 if ( !(v->arch.hvm_vmx.cpu_shadow_cr0 & X86_CR0_TS) )
780 {
781 v->arch.hvm_vmx.cpu_cr0 &= ~X86_CR0_TS;
782 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
783 }
784 }
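/*
 * This is the lazy-FPU path: #NM is intercepted only while the guest's
 * FPU state is not loaded.  setup_fpu() loads it and the intercept is
 * dropped above, so further FPU use by the guest no longer causes a
 * vmexit.  If the guest itself has CR0.TS set, TS is left set so the
 * guest still observes its own #NM.
 */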
786 #define bitmaskof(idx) (1U << ((idx)&31))
787 static void vmx_do_cpuid(struct cpu_user_regs *regs)
788 {
789 unsigned int input = (unsigned int)regs->eax;
790 unsigned int count = (unsigned int)regs->ecx;
791 unsigned int eax, ebx, ecx, edx;
792 unsigned long eip;
793 struct vcpu *v = current;
795 eip = __vmread(GUEST_RIP);
797 HVM_DBG_LOG(DBG_LEVEL_3, "(eax) 0x%08lx, (ebx) 0x%08lx, "
798 "(ecx) 0x%08lx, (edx) 0x%08lx, (esi) 0x%08lx, (edi) 0x%08lx",
799 (unsigned long)regs->eax, (unsigned long)regs->ebx,
800 (unsigned long)regs->ecx, (unsigned long)regs->edx,
801 (unsigned long)regs->esi, (unsigned long)regs->edi);
803 if ( input == CPUID_LEAF_0x4 )
804 {
805 cpuid_count(input, count, &eax, &ebx, &ecx, &edx);
806 eax &= NUM_CORES_RESET_MASK;
807 }
808 else if ( input == 0x40000003 )
809 {
810 /*
811 * NB. Unsupported interface for private use of VMXASSIST only.
812 * Note that this leaf lives at <max-hypervisor-leaf> + 1.
813 */
814 u64 value = ((u64)regs->edx << 32) | (u32)regs->ecx;
815 unsigned long mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
816 char *p;
818 gdprintk(XENLOG_INFO, "Input address is 0x%"PRIx64".\n", value);
820 /* 8-byte aligned valid pseudophys address from vmxassist, please. */
821 if ( (value & 7) || (mfn == INVALID_MFN) ||
822 !v->arch.hvm_vmx.vmxassist_enabled )
823 {
824 domain_crash(v->domain);
825 return;
826 }
828 p = map_domain_page(mfn);
829 value = *((uint64_t *)(p + (value & (PAGE_SIZE - 1))));
830 unmap_domain_page(p);
832 gdprintk(XENLOG_INFO, "Output value is 0x%"PRIx64".\n", value);
833 ecx = (u32)(value >> 0);
834 edx = (u32)(value >> 32);
835 }
836 else if ( !cpuid_hypervisor_leaves(input, &eax, &ebx, &ecx, &edx) )
837 {
838 cpuid(input, &eax, &ebx, &ecx, &edx);
840 if ( input == CPUID_LEAF_0x1 )
841 {
842 /* Mask off reserved bits. */
843 ecx &= ~VMX_VCPU_CPUID_L1_ECX_RESERVED;
845 if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
846 clear_bit(X86_FEATURE_APIC, &edx);
848 #if CONFIG_PAGING_LEVELS >= 3
849 if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
850 #endif
851 clear_bit(X86_FEATURE_PAE, &edx);
852 clear_bit(X86_FEATURE_PSE36, &edx);
854 ebx &= NUM_THREADS_RESET_MASK;
856 /* Unsupportable for virtualised CPUs. */
857 ecx &= ~(bitmaskof(X86_FEATURE_VMXE) |
858 bitmaskof(X86_FEATURE_EST) |
859 bitmaskof(X86_FEATURE_TM2) |
860 bitmaskof(X86_FEATURE_CID) |
861 bitmaskof(X86_FEATURE_MWAIT) );
863 edx &= ~( bitmaskof(X86_FEATURE_HT) |
864 bitmaskof(X86_FEATURE_ACPI) |
865 bitmaskof(X86_FEATURE_ACC) );
866 }
867 else if ( ( input == CPUID_LEAF_0x6 )
868 || ( input == CPUID_LEAF_0x9 )
869 || ( input == CPUID_LEAF_0xA ))
870 {
871 eax = ebx = ecx = edx = 0x0;
872 }
873 #ifdef __i386__
874 else if ( input == CPUID_LEAF_0x80000001 )
875 {
876 clear_bit(X86_FEATURE_LAHF_LM & 31, &ecx);
878 clear_bit(X86_FEATURE_LM & 31, &edx);
879 clear_bit(X86_FEATURE_SYSCALL & 31, &edx);
880 }
881 #endif
882 }
884 regs->eax = (unsigned long) eax;
885 regs->ebx = (unsigned long) ebx;
886 regs->ecx = (unsigned long) ecx;
887 regs->edx = (unsigned long) edx;
889 HVM_DBG_LOG(DBG_LEVEL_3, "eip@%lx, input: 0x%lx, "
890 "output: eax = 0x%08lx, ebx = 0x%08lx, "
891 "ecx = 0x%08lx, edx = 0x%08lx",
892 (unsigned long)eip, (unsigned long)input,
893 (unsigned long)eax, (unsigned long)ebx,
894 (unsigned long)ecx, (unsigned long)edx);
895 }
897 #define CASE_GET_REG_P(REG, reg) \
898 case REG_ ## REG: reg_p = (unsigned long *)&(regs->reg); break
900 #ifdef __i386__
901 #define CASE_EXTEND_GET_REG_P
902 #else
903 #define CASE_EXTEND_GET_REG_P \
904 CASE_GET_REG_P(R8, r8); \
905 CASE_GET_REG_P(R9, r9); \
906 CASE_GET_REG_P(R10, r10); \
907 CASE_GET_REG_P(R11, r11); \
908 CASE_GET_REG_P(R12, r12); \
909 CASE_GET_REG_P(R13, r13); \
910 CASE_GET_REG_P(R14, r14); \
911 CASE_GET_REG_P(R15, r15)
912 #endif
914 static void vmx_dr_access(unsigned long exit_qualification,
915 struct cpu_user_regs *regs)
916 {
917 struct vcpu *v = current;
919 v->arch.hvm_vcpu.flag_dr_dirty = 1;
921 /* We could probably be smarter about this */
922 __restore_debug_registers(v);
924 /* Allow guest direct access to DR registers */
925 v->arch.hvm_vcpu.u.vmx.exec_control &= ~CPU_BASED_MOV_DR_EXITING;
926 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
927 v->arch.hvm_vcpu.u.vmx.exec_control);
928 }
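/*
 * Lazy debug-register handling: the guest's first MOV to/from a debug
 * register lands here, the real DRs are loaded, and the MOV-DR intercept
 * is turned off so subsequent accesses run at full speed.  vmx_save_dr()
 * re-enables the intercept and snapshots the registers at context-switch
 * time.
 */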
930 /*
931 * Invalidate the TLB for va. Invalidate the shadow page corresponding
932 * to the address va.
933 */
934 static void vmx_do_invlpg(unsigned long va)
935 {
936 unsigned long eip;
937 struct vcpu *v = current;
939 eip = __vmread(GUEST_RIP);
941 HVM_DBG_LOG(DBG_LEVEL_VMMU, "eip=%lx, va=%lx",
942 eip, va);
944 /*
945 * Drop the shadow mapping for this VA; it will be re-copied from the
946 * guest page tables on demand.
947 */
948 shadow_invlpg(v, va);
949 }
952 static int check_for_null_selector(unsigned long eip, int inst_len, int dir)
953 {
954 unsigned char inst[MAX_INST_LEN];
955 unsigned long sel;
956 int i;
957 int inst_copy_from_guest(unsigned char *, unsigned long, int);
959 /* INS can only use ES segment register, and it can't be overridden */
960 if ( dir == IOREQ_READ )
961 {
962 sel = __vmread(GUEST_ES_SELECTOR);
963 return sel == 0 ? 1 : 0;
964 }
966 memset(inst, 0, MAX_INST_LEN);
967 if ( inst_copy_from_guest(inst, eip, inst_len) != inst_len )
968 {
969 gdprintk(XENLOG_ERR, "Get guest instruction failed\n");
970 domain_crash(current->domain);
971 return 0;
972 }
974 for ( i = 0; i < inst_len; i++ )
975 {
976 switch ( inst[i] )
977 {
978 case 0xf3: /* REPZ */
979 case 0xf2: /* REPNZ */
980 case 0xf0: /* LOCK */
981 case 0x66: /* data32 */
982 case 0x67: /* addr32 */
983 continue;
984 case 0x2e: /* CS */
985 sel = __vmread(GUEST_CS_SELECTOR);
986 break;
987 case 0x36: /* SS */
988 sel = __vmread(GUEST_SS_SELECTOR);
989 break;
990 case 0x26: /* ES */
991 sel = __vmread(GUEST_ES_SELECTOR);
992 break;
993 case 0x64: /* FS */
994 sel = __vmread(GUEST_FS_SELECTOR);
995 break;
996 case 0x65: /* GS */
997 sel = __vmread(GUEST_GS_SELECTOR);
998 break;
999 case 0x3e: /* DS */
1000 /* FALLTHROUGH */
1001 default:
1002 /* DS is the default */
1003 sel = __vmread(GUEST_DS_SELECTOR);
1005 return sel == 0 ? 1 : 0;
1006 }
1007 }
1008 return 0;
1009 }
1011 static void vmx_io_instruction(unsigned long exit_qualification,
1012 unsigned long inst_len)
1013 {
1014 struct cpu_user_regs *regs;
1015 struct hvm_io_op *pio_opp;
1016 unsigned long port, size;
1017 int dir, df, vm86;
1019 pio_opp = &current->arch.hvm_vcpu.io_op;
1020 pio_opp->instr = INSTR_PIO;
1021 pio_opp->flags = 0;
1023 regs = &pio_opp->io_context;
1025 /* Copy current guest state into io instruction state structure. */
1026 memcpy(regs, guest_cpu_user_regs(), HVM_CONTEXT_STACK_BYTES);
1027 hvm_store_cpu_guest_regs(current, regs, NULL);
1029 vm86 = regs->eflags & X86_EFLAGS_VM ? 1 : 0;
1030 df = regs->eflags & X86_EFLAGS_DF ? 1 : 0;
1032 HVM_DBG_LOG(DBG_LEVEL_IO, "vm86 %d, eip=%x:%lx, "
1033 "exit_qualification = %lx",
1034 vm86, regs->cs, (unsigned long)regs->eip, exit_qualification);
1036 if ( test_bit(6, &exit_qualification) )
1037 port = (exit_qualification >> 16) & 0xFFFF;
1038 else
1039 port = regs->edx & 0xffff;
1041 TRACE_VMEXIT(1, port);
1043 size = (exit_qualification & 7) + 1;
1044 dir = test_bit(3, &exit_qualification); /* direction */
1046 if ( test_bit(4, &exit_qualification) ) { /* string instruction */
1047 unsigned long addr, count = 1;
1048 int sign = regs->eflags & X86_EFLAGS_DF ? -1 : 1;
1050 addr = __vmread(GUEST_LINEAR_ADDRESS);
1052 /*
1053 * In protected mode, guest linear address is invalid if the
1054 * selector is null.
1055 */
1056 if ( !vm86 && check_for_null_selector(regs->eip, inst_len, dir) )
1057 addr = dir == IOREQ_WRITE ? regs->esi : regs->edi;
1059 if ( test_bit(5, &exit_qualification) ) { /* "rep" prefix */
1060 pio_opp->flags |= REPZ;
1061 count = vm86 ? regs->ecx & 0xFFFF : regs->ecx;
1062 }
1064 /*
1065 * Handle string pio instructions that cross pages or that
1066 * are unaligned. See the comments in hvm_domain.c/handle_mmio()
1067 */
1068 if ( (addr & PAGE_MASK) != ((addr + size - 1) & PAGE_MASK) ) {
1069 unsigned long value = 0;
1071 pio_opp->flags |= OVERLAP;
1073 if ( dir == IOREQ_WRITE ) /* OUTS */
1074 {
1075 if ( hvm_paging_enabled(current) )
1076 (void)hvm_copy_from_guest_virt(&value, addr, size);
1077 else
1078 (void)hvm_copy_from_guest_phys(&value, addr, size);
1079 } else
1080 pio_opp->addr = addr;
1082 if ( count == 1 )
1083 regs->eip += inst_len;
1085 send_pio_req(port, 1, size, value, dir, df, 0);
1086 } else {
1087 unsigned long last_addr = sign > 0 ? addr + count * size - 1
1088 : addr - (count - 1) * size;
1090 if ( (addr & PAGE_MASK) != (last_addr & PAGE_MASK) )
1091 {
1092 if ( sign > 0 )
1093 count = (PAGE_SIZE - (addr & ~PAGE_MASK)) / size;
1094 else
1095 count = (addr & ~PAGE_MASK) / size + 1;
1096 } else
1097 regs->eip += inst_len;
1099 send_pio_req(port, count, size, addr, dir, df, 1);
1100 }
1101 } else {
1102 if ( port == 0xe9 && dir == IOREQ_WRITE && size == 1 )
1103 hvm_print_line(current, regs->eax); /* guest debug output */
1105 if ( dir == IOREQ_WRITE )
1106 TRACE_VMEXIT(2, regs->eax);
1108 regs->eip += inst_len;
1109 send_pio_req(port, 1, size, regs->eax, dir, df, 0);
1110 }
1111 }

1113 static void vmx_world_save(struct vcpu *v, struct vmx_assist_context *c)
1114 {
1115 /* NB. Skip transition instruction. */
1116 c->eip = __vmread(GUEST_RIP);
1117 c->eip += __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
1119 c->esp = __vmread(GUEST_RSP);
1120 c->eflags = __vmread(GUEST_RFLAGS);
1122 c->cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;
1123 c->cr3 = v->arch.hvm_vmx.cpu_cr3;
1124 c->cr4 = v->arch.hvm_vmx.cpu_shadow_cr4;
1126 c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
1127 c->idtr_base = __vmread(GUEST_IDTR_BASE);
1129 c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
1130 c->gdtr_base = __vmread(GUEST_GDTR_BASE);
1132 c->cs_sel = __vmread(GUEST_CS_SELECTOR);
1133 c->cs_limit = __vmread(GUEST_CS_LIMIT);
1134 c->cs_base = __vmread(GUEST_CS_BASE);
1135 c->cs_arbytes.bytes = __vmread(GUEST_CS_AR_BYTES);
1137 c->ds_sel = __vmread(GUEST_DS_SELECTOR);
1138 c->ds_limit = __vmread(GUEST_DS_LIMIT);
1139 c->ds_base = __vmread(GUEST_DS_BASE);
1140 c->ds_arbytes.bytes = __vmread(GUEST_DS_AR_BYTES);
1142 c->es_sel = __vmread(GUEST_ES_SELECTOR);
1143 c->es_limit = __vmread(GUEST_ES_LIMIT);
1144 c->es_base = __vmread(GUEST_ES_BASE);
1145 c->es_arbytes.bytes = __vmread(GUEST_ES_AR_BYTES);
1147 c->ss_sel = __vmread(GUEST_SS_SELECTOR);
1148 c->ss_limit = __vmread(GUEST_SS_LIMIT);
1149 c->ss_base = __vmread(GUEST_SS_BASE);
1150 c->ss_arbytes.bytes = __vmread(GUEST_SS_AR_BYTES);
1152 c->fs_sel = __vmread(GUEST_FS_SELECTOR);
1153 c->fs_limit = __vmread(GUEST_FS_LIMIT);
1154 c->fs_base = __vmread(GUEST_FS_BASE);
1155 c->fs_arbytes.bytes = __vmread(GUEST_FS_AR_BYTES);
1157 c->gs_sel = __vmread(GUEST_GS_SELECTOR);
1158 c->gs_limit = __vmread(GUEST_GS_LIMIT);
1159 c->gs_base = __vmread(GUEST_GS_BASE);
1160 c->gs_arbytes.bytes = __vmread(GUEST_GS_AR_BYTES);
1162 c->tr_sel = __vmread(GUEST_TR_SELECTOR);
1163 c->tr_limit = __vmread(GUEST_TR_LIMIT);
1164 c->tr_base = __vmread(GUEST_TR_BASE);
1165 c->tr_arbytes.bytes = __vmread(GUEST_TR_AR_BYTES);
1167 c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR);
1168 c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT);
1169 c->ldtr_base = __vmread(GUEST_LDTR_BASE);
1170 c->ldtr_arbytes.bytes = __vmread(GUEST_LDTR_AR_BYTES);
1173 static int vmx_world_restore(struct vcpu *v, struct vmx_assist_context *c)
1175 unsigned long mfn, old_base_mfn;
1177 __vmwrite(GUEST_RIP, c->eip);
1178 __vmwrite(GUEST_RSP, c->esp);
1179 __vmwrite(GUEST_RFLAGS, c->eflags);
1181 v->arch.hvm_vmx.cpu_shadow_cr0 = c->cr0;
1182 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
1184 if ( !vmx_paging_enabled(v) )
1185 goto skip_cr3;
1187 if ( c->cr3 == v->arch.hvm_vmx.cpu_cr3 )
1189 /*
1190 * This is a simple TLB flush, implying the guest has
1191 * removed some translation or changed page attributes.
1192 * We simply invalidate the shadow.
1193 */
1194 mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
1195 if ( mfn != pagetable_get_pfn(v->arch.guest_table) )
1196 goto bad_cr3;
1198 else
1200 /*
1201 * If different, make a shadow. Check if the PDBR is valid
1202 * first.
1203 */
1204 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %x", c->cr3);
1205 mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
1206 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
1207 goto bad_cr3;
1208 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1209 v->arch.guest_table = pagetable_from_pfn(mfn);
1210 if (old_base_mfn)
1211 put_page(mfn_to_page(old_base_mfn));
1212 /*
1213 * arch.shadow_table should now hold the next CR3 for shadow
1214 */
1215 v->arch.hvm_vmx.cpu_cr3 = c->cr3;
1218 skip_cr3:
1219 if ( !vmx_paging_enabled(v) )
1220 HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
1221 else
1222 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3);
1224 __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
1225 v->arch.hvm_vmx.cpu_shadow_cr4 = c->cr4;
1226 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);
1228 __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
1229 __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
1231 __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
1232 __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
1234 __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
1235 __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
1236 __vmwrite(GUEST_CS_BASE, c->cs_base);
1237 __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes.bytes);
1239 __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
1240 __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
1241 __vmwrite(GUEST_DS_BASE, c->ds_base);
1242 __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes.bytes);
1244 __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
1245 __vmwrite(GUEST_ES_LIMIT, c->es_limit);
1246 __vmwrite(GUEST_ES_BASE, c->es_base);
1247 __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes.bytes);
1249 __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
1250 __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
1251 __vmwrite(GUEST_SS_BASE, c->ss_base);
1252 __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes.bytes);
1254 __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
1255 __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
1256 __vmwrite(GUEST_FS_BASE, c->fs_base);
1257 __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes.bytes);
1259 __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
1260 __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
1261 __vmwrite(GUEST_GS_BASE, c->gs_base);
1262 __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes.bytes);
1264 __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
1265 __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
1266 __vmwrite(GUEST_TR_BASE, c->tr_base);
1267 __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes.bytes);
1269 __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
1270 __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
1271 __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
1272 __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes.bytes);
1274 shadow_update_paging_modes(v);
1275 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
1276 return 0;
1278 bad_cr3:
1279 gdprintk(XENLOG_ERR, "Invalid CR3 value=%x", c->cr3);
1280 return -EINVAL;
1283 enum { VMX_ASSIST_INVOKE = 0, VMX_ASSIST_RESTORE };
1285 static int vmx_assist(struct vcpu *v, int mode)
1287 struct vmx_assist_context c;
1288 u32 magic;
1289 u32 cp;
1291 /* make sure vmxassist exists (this is not an error) */
1292 if (hvm_copy_from_guest_phys(&magic, VMXASSIST_MAGIC_OFFSET,
1293 sizeof(magic)))
1294 return 0;
1295 if (magic != VMXASSIST_MAGIC)
1296 return 0;
1298 switch (mode) {
1299 /*
1300 * Transfer control to vmxassist.
1301 * Store the current context in VMXASSIST_OLD_CONTEXT and load
1302 * the new VMXASSIST_NEW_CONTEXT context. This context was created
1303 * by vmxassist and will transfer control to it.
1304 */
1305 case VMX_ASSIST_INVOKE:
1306 /* save the old context */
1307 if (hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)))
1308 goto error;
1309 if (cp != 0) {
1310 vmx_world_save(v, &c);
1311 if (hvm_copy_to_guest_phys(cp, &c, sizeof(c)))
1312 goto error;
1315 /* restore the new context, this should activate vmxassist */
1316 if (hvm_copy_from_guest_phys(&cp, VMXASSIST_NEW_CONTEXT, sizeof(cp)))
1317 goto error;
1318 if (cp != 0) {
1319 if (hvm_copy_from_guest_phys(&c, cp, sizeof(c)))
1320 goto error;
1321 if ( vmx_world_restore(v, &c) != 0 )
1322 goto error;
1323 v->arch.hvm_vmx.vmxassist_enabled = 1;
1324 return 1;
1326 break;
1328 /*
1329 * Restore the VMXASSIST_OLD_CONTEXT that was saved by
1330 * VMX_ASSIST_INVOKE above.
1331 */
1332 case VMX_ASSIST_RESTORE:
1333 /* save the old context */
1334 if (hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)))
1335 goto error;
1336 if (cp != 0) {
1337 if (hvm_copy_from_guest_phys(&c, cp, sizeof(c)))
1338 goto error;
1339 if ( vmx_world_restore(v, &c) != 0 )
1340 goto error;
1341 v->arch.hvm_vmx.vmxassist_enabled = 0;
1342 return 1;
1344 break;
1347 error:
1348 gdprintk(XENLOG_ERR, "Failed to transfer to vmxassist\n");
1349 domain_crash(v->domain);
1350 return 0;
1353 static int vmx_set_cr0(unsigned long value)
1355 struct vcpu *v = current;
1356 unsigned long mfn;
1357 unsigned long eip;
1358 int paging_enabled;
1359 unsigned long vm_entry_value;
1360 unsigned long old_cr0;
1361 unsigned long old_base_mfn;
1363 /*
1364 * CR0: We don't want to lose PE and PG.
1365 */
1366 old_cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;
1367 paging_enabled = (old_cr0 & X86_CR0_PE) && (old_cr0 & X86_CR0_PG);
1369 /* TS cleared? Then initialise FPU now. */
1370 if ( !(value & X86_CR0_TS) )
1372 setup_fpu(v);
1373 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
1376 v->arch.hvm_vmx.cpu_cr0 = (value | X86_CR0_PE | X86_CR0_PG
1377 | X86_CR0_NE | X86_CR0_WP);
1378 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
1380 v->arch.hvm_vmx.cpu_shadow_cr0 = value;
1381 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
1383 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx\n", value);
1385 if ( (value & X86_CR0_PE) && (value & X86_CR0_PG) && !paging_enabled )
1387 /*
1388 * Trying to enable guest paging.
1389 * The guest CR3 must be pointing to the guest physical.
1390 */
1391 mfn = get_mfn_from_gpfn(v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT);
1392 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
1394 gdprintk(XENLOG_ERR, "Invalid CR3 value = %lx (mfn=%lx)\n",
1395 v->arch.hvm_vmx.cpu_cr3, mfn);
1396 domain_crash(v->domain);
1397 return 0;
1400 #if defined(__x86_64__)
1401 if ( vmx_lme_is_set(v) )
1403 if ( !(v->arch.hvm_vmx.cpu_shadow_cr4 & X86_CR4_PAE) )
1405 HVM_DBG_LOG(DBG_LEVEL_1, "Guest enabled paging "
1406 "with EFER.LME set but not CR4.PAE\n");
1407 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
1409 else
1411 HVM_DBG_LOG(DBG_LEVEL_1, "Enabling long mode\n");
1412 v->arch.hvm_vmx.msr_state.msrs[VMX_INDEX_MSR_EFER]
1413 |= EFER_LMA;
1414 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
1415 vm_entry_value |= VM_ENTRY_IA32E_MODE;
1416 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1419 #endif
1421 /*
1422 * Now arch.guest_table points to machine physical.
1423 */
1424 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1425 v->arch.guest_table = pagetable_from_pfn(mfn);
1426 if (old_base_mfn)
1427 put_page(mfn_to_page(old_base_mfn));
1428 shadow_update_paging_modes(v);
1430 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
1431 (unsigned long) (mfn << PAGE_SHIFT));
1433 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
1434 /*
1435 * arch->shadow_table should hold the next CR3 for shadow
1436 */
1437 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
1438 v->arch.hvm_vmx.cpu_cr3, mfn);
1441 if ( !((value & X86_CR0_PE) && (value & X86_CR0_PG)) && paging_enabled )
1442 if ( v->arch.hvm_vmx.cpu_cr3 ) {
1443 put_page(mfn_to_page(get_mfn_from_gpfn(
1444 v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)));
1445 v->arch.guest_table = pagetable_null();
1448 /*
1449 * VMX does not implement real-mode virtualization. We emulate
1450 * real-mode by performing a world switch to VMXAssist whenever
1451 * a partition disables the CR0.PE bit.
1452 */
1453 if ( (value & X86_CR0_PE) == 0 )
1455 if ( value & X86_CR0_PG ) {
1456 /* inject GP here */
1457 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
1458 return 0;
1459 } else {
1460 /*
1461 * Disable paging here.
1462 * Same as the PE == 1 && PG == 0 case.
1463 */
1464 if ( vmx_long_mode_enabled(v) )
1466 v->arch.hvm_vmx.msr_state.msrs[VMX_INDEX_MSR_EFER]
1467 &= ~EFER_LMA;
1468 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
1469 vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
1470 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1474 if ( vmx_assist(v, VMX_ASSIST_INVOKE) )
1476 eip = __vmread(GUEST_RIP);
1477 HVM_DBG_LOG(DBG_LEVEL_1,
1478 "Transferring control to vmxassist %%eip 0x%lx\n", eip);
1479 return 0; /* do not update eip! */
1482 else if ( v->arch.hvm_vmx.vmxassist_enabled )
1484 eip = __vmread(GUEST_RIP);
1485 HVM_DBG_LOG(DBG_LEVEL_1,
1486 "Enabling CR0.PE at %%eip 0x%lx\n", eip);
1487 if ( vmx_assist(v, VMX_ASSIST_RESTORE) )
1489 eip = __vmread(GUEST_RIP);
1490 HVM_DBG_LOG(DBG_LEVEL_1,
1491 "Restoring to %%eip 0x%lx\n", eip);
1492 return 0; /* do not update eip! */
1495 else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
1497 if ( vmx_long_mode_enabled(v) )
1499 v->arch.hvm_vmx.msr_state.msrs[VMX_INDEX_MSR_EFER] &= ~EFER_LMA;
1500 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
1501 vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
1502 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1504 shadow_update_paging_modes(v);
1505 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
1508 return 1;
1511 #define CASE_SET_REG(REG, reg) \
1512 case REG_ ## REG: regs->reg = value; break
1513 #define CASE_GET_REG(REG, reg) \
1514 case REG_ ## REG: value = regs->reg; break
1516 #define CASE_EXTEND_SET_REG \
1517 CASE_EXTEND_REG(S)
1518 #define CASE_EXTEND_GET_REG \
1519 CASE_EXTEND_REG(G)
1521 #ifdef __i386__
1522 #define CASE_EXTEND_REG(T)
1523 #else
1524 #define CASE_EXTEND_REG(T) \
1525 CASE_ ## T ## ET_REG(R8, r8); \
1526 CASE_ ## T ## ET_REG(R9, r9); \
1527 CASE_ ## T ## ET_REG(R10, r10); \
1528 CASE_ ## T ## ET_REG(R11, r11); \
1529 CASE_ ## T ## ET_REG(R12, r12); \
1530 CASE_ ## T ## ET_REG(R13, r13); \
1531 CASE_ ## T ## ET_REG(R14, r14); \
1532 CASE_ ## T ## ET_REG(R15, r15)
1533 #endif
1535 /*
1536 * Write to control registers
1537 */
1538 static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
1540 unsigned long value, old_cr, old_base_mfn, mfn;
1541 struct vcpu *v = current;
1542 struct vlapic *vlapic = vcpu_vlapic(v);
1544 switch ( gp )
1546 CASE_GET_REG(EAX, eax);
1547 CASE_GET_REG(ECX, ecx);
1548 CASE_GET_REG(EDX, edx);
1549 CASE_GET_REG(EBX, ebx);
1550 CASE_GET_REG(EBP, ebp);
1551 CASE_GET_REG(ESI, esi);
1552 CASE_GET_REG(EDI, edi);
1553 CASE_EXTEND_GET_REG;
1554 case REG_ESP:
1555 value = __vmread(GUEST_RSP);
1556 break;
1557 default:
1558 gdprintk(XENLOG_ERR, "invalid gp: %d\n", gp);
1559 goto exit_and_crash;
1562 TRACE_VMEXIT(1, TYPE_MOV_TO_CR);
1563 TRACE_VMEXIT(2, cr);
1564 TRACE_VMEXIT(3, value);
1566 HVM_DBG_LOG(DBG_LEVEL_1, "CR%d, value = %lx", cr, value);
1568 switch ( cr )
1570 case 0:
1571 return vmx_set_cr0(value);
1573 case 3:
1574 /*
1575 * If paging is not enabled yet, simply copy the value to CR3.
1576 */
1577 if (!vmx_paging_enabled(v)) {
1578 v->arch.hvm_vmx.cpu_cr3 = value;
1579 break;
1582 /*
1583 * We make a new one if the shadow does not exist.
1584 */
1585 if (value == v->arch.hvm_vmx.cpu_cr3) {
1586 /*
1587 * This is a simple TLB flush, implying the guest has
1588 * removed some translation or changed page attributes.
1589 * We simply invalidate the shadow.
1590 */
1591 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
1592 if (mfn != pagetable_get_pfn(v->arch.guest_table))
1593 goto bad_cr3;
1594 shadow_update_cr3(v);
1595 } else {
1596 /*
1597 * If different, make a shadow. Check if the PDBR is valid
1598 * first.
1599 */
1600 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
1601 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
1602 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
1603 goto bad_cr3;
1604 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1605 v->arch.guest_table = pagetable_from_pfn(mfn);
1606 if (old_base_mfn)
1607 put_page(mfn_to_page(old_base_mfn));
1608 /*
1609 * arch.shadow_table should now hold the next CR3 for shadow
1610 */
1611 v->arch.hvm_vmx.cpu_cr3 = value;
1612 update_cr3(v);
1613 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx",
1614 value);
1615 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
1617 break;
1619 case 4: /* CR4 */
1620 old_cr = v->arch.hvm_vmx.cpu_shadow_cr4;
1622 if ( (value & X86_CR4_PAE) && !(old_cr & X86_CR4_PAE) )
1624 if ( vmx_pgbit_test(v) )
1626 /* The guest is a 32-bit PAE guest. */
1627 #if CONFIG_PAGING_LEVELS >= 3
1628 unsigned long mfn, old_base_mfn;
1629 mfn = get_mfn_from_gpfn(v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT);
1630 if ( !mfn_valid(mfn) ||
1631 !get_page(mfn_to_page(mfn), v->domain) )
1632 goto bad_cr3;
1634 /*
1635 * Now arch.guest_table points to machine physical.
1636 */
1638 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1639 v->arch.guest_table = pagetable_from_pfn(mfn);
1640 if ( old_base_mfn )
1641 put_page(mfn_to_page(old_base_mfn));
1643 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
1644 (unsigned long) (mfn << PAGE_SHIFT));
1646 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
1648 /*
1649 * arch->shadow_table should hold the next CR3 for shadow
1650 */
1652 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
1653 v->arch.hvm_vmx.cpu_cr3, mfn);
1654 #endif
1657 else if ( !(value & X86_CR4_PAE) )
1659 if ( unlikely(vmx_long_mode_enabled(v)) )
1661 HVM_DBG_LOG(DBG_LEVEL_1, "Guest cleared CR4.PAE while "
1662 "EFER.LMA is set\n");
1663 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
1667 __vmwrite(GUEST_CR4, value| VMX_CR4_HOST_MASK);
1668 v->arch.hvm_vmx.cpu_shadow_cr4 = value;
1669 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);
1671 /*
1672 * Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates
1673 * all TLB entries except global entries.
1674 */
1675 if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
1676 shadow_update_paging_modes(v);
1677 break;
1679 case 8:
1680 vlapic_set_reg(vlapic, APIC_TASKPRI, ((value & 0x0F) << 4));
1681 break;
1683 default:
1684 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
1685 domain_crash(v->domain);
1686 return 0;
1689 return 1;
1691 bad_cr3:
1692 gdprintk(XENLOG_ERR, "Invalid CR3\n");
1693 exit_and_crash:
1694 domain_crash(v->domain);
1695 return 0;
1698 /*
1699 * Read from control registers. CR0 and CR4 are read from the shadow.
1700 */
1701 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
1703 unsigned long value = 0;
1704 struct vcpu *v = current;
1705 struct vlapic *vlapic = vcpu_vlapic(v);
1707 switch ( cr )
1709 case 3:
1710 value = (unsigned long)v->arch.hvm_vmx.cpu_cr3;
1711 break;
1712 case 8:
1713 value = (unsigned long)vlapic_get_reg(vlapic, APIC_TASKPRI);
1714 value = (value & 0xF0) >> 4;
1715 break;
1716 default:
1717 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
1718 domain_crash(v->domain);
1719 break;
1722 switch ( gp ) {
1723 CASE_SET_REG(EAX, eax);
1724 CASE_SET_REG(ECX, ecx);
1725 CASE_SET_REG(EDX, edx);
1726 CASE_SET_REG(EBX, ebx);
1727 CASE_SET_REG(EBP, ebp);
1728 CASE_SET_REG(ESI, esi);
1729 CASE_SET_REG(EDI, edi);
1730 CASE_EXTEND_SET_REG;
1731 case REG_ESP:
1732 __vmwrite(GUEST_RSP, value);
1733 regs->esp = value;
1734 break;
1735 default:
1736 printk("invalid gp: %d\n", gp);
1737 domain_crash(v->domain);
1738 break;
1741 TRACE_VMEXIT(1, TYPE_MOV_FROM_CR);
1742 TRACE_VMEXIT(2, cr);
1743 TRACE_VMEXIT(3, value);
1745 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR%d, value = %lx", cr, value);
1748 static int vmx_cr_access(unsigned long exit_qualification,
1749 struct cpu_user_regs *regs)
1751 unsigned int gp, cr;
1752 unsigned long value;
1753 struct vcpu *v = current;
1755 switch (exit_qualification & CONTROL_REG_ACCESS_TYPE) {
1756 case TYPE_MOV_TO_CR:
1757 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
1758 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
1759 return mov_to_cr(gp, cr, regs);
1760 case TYPE_MOV_FROM_CR:
1761 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
1762 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
1763 mov_from_cr(cr, gp, regs);
1764 break;
1765 case TYPE_CLTS:
1766 TRACE_VMEXIT(1, TYPE_CLTS);
1768 /* We initialise the FPU now, to avoid needing another vmexit. */
1769 setup_fpu(v);
1770 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
1772 v->arch.hvm_vmx.cpu_cr0 &= ~X86_CR0_TS; /* clear TS */
1773 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
1775 v->arch.hvm_vmx.cpu_shadow_cr0 &= ~X86_CR0_TS; /* clear TS */
1776 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
1777 break;
1778 case TYPE_LMSW:
1779 value = v->arch.hvm_vmx.cpu_shadow_cr0;
1780 value = (value & ~0xF) |
1781 (((exit_qualification & LMSW_SOURCE_DATA) >> 16) & 0xF);
1782 TRACE_VMEXIT(1, TYPE_LMSW);
1783 TRACE_VMEXIT(2, value);
1784 return vmx_set_cr0(value);
1785 break;
1786 default:
1787 BUG();
1790 return 1;
1793 static inline void vmx_do_msr_read(struct cpu_user_regs *regs)
1795 u64 msr_content = 0;
1796 u32 eax, edx;
1797 struct vcpu *v = current;
1799 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%lx, eax=%lx, edx=%lx",
1800 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1801 (unsigned long)regs->edx);
1802 switch (regs->ecx) {
1803 case MSR_IA32_TIME_STAMP_COUNTER:
1804 msr_content = hvm_get_guest_time(v);
1805 break;
1806 case MSR_IA32_SYSENTER_CS:
1807 msr_content = (u32)__vmread(GUEST_SYSENTER_CS);
1808 break;
1809 case MSR_IA32_SYSENTER_ESP:
1810 msr_content = __vmread(GUEST_SYSENTER_ESP);
1811 break;
1812 case MSR_IA32_SYSENTER_EIP:
1813 msr_content = __vmread(GUEST_SYSENTER_EIP);
1814 break;
1815 case MSR_IA32_APICBASE:
1816 msr_content = vcpu_vlapic(v)->apic_base_msr;
1817 break;
1818 default:
1819 if ( long_mode_do_msr_read(regs) )
1820 return;
1822 if ( rdmsr_hypervisor_regs(regs->ecx, &eax, &edx) )
1824 regs->eax = eax;
1825 regs->edx = edx;
1826 return;
1829 rdmsr_safe(regs->ecx, regs->eax, regs->edx);
1830 return;
1833 regs->eax = msr_content & 0xFFFFFFFF;
1834 regs->edx = msr_content >> 32;
1836 HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%lx, eax=%lx, edx=%lx",
1837 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1838 (unsigned long)regs->edx);
1841 static inline void vmx_do_msr_write(struct cpu_user_regs *regs)
1843 u64 msr_content;
1844 struct vcpu *v = current;
1846 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%lx, eax=%lx, edx=%lx",
1847 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1848 (unsigned long)regs->edx);
1850 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
1852 switch (regs->ecx) {
1853 case MSR_IA32_TIME_STAMP_COUNTER:
1855 struct periodic_time *pt =
1856 &(v->domain->arch.hvm_domain.pl_time.periodic_tm);
1857 if ( pt->enabled && pt->first_injected
1858 && v->vcpu_id == pt->bind_vcpu )
1859 pt->first_injected = 0;
1861 hvm_set_guest_time(v, msr_content);
1862 break;
1863 case MSR_IA32_SYSENTER_CS:
1864 __vmwrite(GUEST_SYSENTER_CS, msr_content);
1865 break;
1866 case MSR_IA32_SYSENTER_ESP:
1867 __vmwrite(GUEST_SYSENTER_ESP, msr_content);
1868 break;
1869 case MSR_IA32_SYSENTER_EIP:
1870 __vmwrite(GUEST_SYSENTER_EIP, msr_content);
1871 break;
1872 case MSR_IA32_APICBASE:
1873 vlapic_msr_set(vcpu_vlapic(v), msr_content);
1874 break;
1875 default:
1876 if ( !long_mode_do_msr_write(regs) )
1877 wrmsr_hypervisor_regs(regs->ecx, regs->eax, regs->edx);
1878 break;
1881 HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%lx, eax=%lx, edx=%lx",
1882 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1883 (unsigned long)regs->edx);
1886 static void vmx_do_hlt(void)
1888 unsigned long rflags;
1889 rflags = __vmread(GUEST_RFLAGS);
1890 hvm_hlt(rflags);
1893 static inline void vmx_do_extint(struct cpu_user_regs *regs)
1895 unsigned int vector;
1897 asmlinkage void do_IRQ(struct cpu_user_regs *);
1898 fastcall void smp_apic_timer_interrupt(struct cpu_user_regs *);
1899 fastcall void smp_event_check_interrupt(void);
1900 fastcall void smp_invalidate_interrupt(void);
1901 fastcall void smp_call_function_interrupt(void);
1902 fastcall void smp_spurious_interrupt(struct cpu_user_regs *regs);
1903 fastcall void smp_error_interrupt(struct cpu_user_regs *regs);
1904 #ifdef CONFIG_X86_MCE_P4THERMAL
1905 fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs);
1906 #endif
1908 vector = __vmread(VM_EXIT_INTR_INFO);
1909 BUG_ON(!(vector & INTR_INFO_VALID_MASK));
1911 vector &= INTR_INFO_VECTOR_MASK;
1912 TRACE_VMEXIT(1, vector);
1914 switch(vector) {
1915 case LOCAL_TIMER_VECTOR:
1916 smp_apic_timer_interrupt(regs);
1917 break;
1918 case EVENT_CHECK_VECTOR:
1919 smp_event_check_interrupt();
1920 break;
1921 case INVALIDATE_TLB_VECTOR:
1922 smp_invalidate_interrupt();
1923 break;
1924 case CALL_FUNCTION_VECTOR:
1925 smp_call_function_interrupt();
1926 break;
1927 case SPURIOUS_APIC_VECTOR:
1928 smp_spurious_interrupt(regs);
1929 break;
1930 case ERROR_APIC_VECTOR:
1931 smp_error_interrupt(regs);
1932 break;
1933 #ifdef CONFIG_X86_MCE_P4THERMAL
1934 case THERMAL_APIC_VECTOR:
1935 smp_thermal_interrupt(regs);
1936 break;
1937 #endif
1938 default:
1939 regs->entry_vector = vector;
1940 do_IRQ(regs);
1941 break;
1945 #if defined (__x86_64__)
1946 void store_cpu_user_regs(struct cpu_user_regs *regs)
1948 regs->ss = __vmread(GUEST_SS_SELECTOR);
1949 regs->rsp = __vmread(GUEST_RSP);
1950 regs->rflags = __vmread(GUEST_RFLAGS);
1951 regs->cs = __vmread(GUEST_CS_SELECTOR);
1952 regs->ds = __vmread(GUEST_DS_SELECTOR);
1953 regs->es = __vmread(GUEST_ES_SELECTOR);
1954 regs->rip = __vmread(GUEST_RIP);
1956 #elif defined (__i386__)
1957 void store_cpu_user_regs(struct cpu_user_regs *regs)
1959 regs->ss = __vmread(GUEST_SS_SELECTOR);
1960 regs->esp = __vmread(GUEST_RSP);
1961 regs->eflags = __vmread(GUEST_RFLAGS);
1962 regs->cs = __vmread(GUEST_CS_SELECTOR);
1963 regs->ds = __vmread(GUEST_DS_SELECTOR);
1964 regs->es = __vmread(GUEST_ES_SELECTOR);
1965 regs->eip = __vmread(GUEST_RIP);
1967 #endif
1969 #ifdef XEN_DEBUGGER
1970 void save_cpu_user_regs(struct cpu_user_regs *regs)
1972 regs->xss = __vmread(GUEST_SS_SELECTOR);
1973 regs->esp = __vmread(GUEST_RSP);
1974 regs->eflags = __vmread(GUEST_RFLAGS);
1975 regs->xcs = __vmread(GUEST_CS_SELECTOR);
1976 regs->eip = __vmread(GUEST_RIP);
1978 regs->xgs = __vmread(GUEST_GS_SELECTOR);
1979 regs->xfs = __vmread(GUEST_FS_SELECTOR);
1980 regs->xes = __vmread(GUEST_ES_SELECTOR);
1981 regs->xds = __vmread(GUEST_DS_SELECTOR);
1984 void restore_cpu_user_regs(struct cpu_user_regs *regs)
1986 __vmwrite(GUEST_SS_SELECTOR, regs->xss);
1987 __vmwrite(GUEST_RSP, regs->esp);
1988 __vmwrite(GUEST_RFLAGS, regs->eflags);
1989 __vmwrite(GUEST_CS_SELECTOR, regs->xcs);
1990 __vmwrite(GUEST_RIP, regs->eip);
1992 __vmwrite(GUEST_GS_SELECTOR, regs->xgs);
1993 __vmwrite(GUEST_FS_SELECTOR, regs->xfs);
1994 __vmwrite(GUEST_ES_SELECTOR, regs->xes);
1995 __vmwrite(GUEST_DS_SELECTOR, regs->xds);
1997 #endif
static void vmx_reflect_exception(struct vcpu *v)
{
    int error_code, intr_info, vector;

    intr_info = __vmread(VM_EXIT_INTR_INFO);
    vector = intr_info & 0xff;
    if ( intr_info & INTR_INFO_DELIVER_CODE_MASK )
        error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
    else
        error_code = VMX_DELIVER_NO_ERROR_CODE;

#ifndef NDEBUG
    {
        unsigned long rip;

        rip = __vmread(GUEST_RIP);
        HVM_DBG_LOG(DBG_LEVEL_1, "rip = %lx, error_code = %x",
                    rip, error_code);
    }
#endif /* NDEBUG */
    /*
     * According to the Intel Virtualization Technology Specification for
     * the IA-32 Intel Architecture (C97063-002, April 2005), section
     * 2.8.3, SW_EXCEPTION should be used for #BP and #OF, and
     * HW_EXCEPTION for everything else.  The main difference appears to
     * be that for SW_EXCEPTION the EIP/RIP is incremented by
     * VM_ENTER_INSTRUCTION_LEN bytes, whereas for HW_EXCEPTION it is not.
     */
    if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_SW_EXCEPTION )
    {
        int ilen = __get_instruction_length(); /* Safe: software exception */
        vmx_inject_sw_exception(v, vector, ilen);
    }
    else
    {
        vmx_inject_hw_exception(v, vector, error_code);
    }
}
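/*
 * Main VM exit dispatcher, entered from the low-level VMX exit stub with
 * the guest's general-purpose registers already saved in *regs.  Decodes
 * VM_EXIT_REASON and hands each exit off to the appropriate handler;
 * unrecoverable exits crash the domain.
 */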
asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
{
    unsigned int exit_reason;
    unsigned long exit_qualification, inst_len = 0;
    struct vcpu *v = current;

    exit_reason = __vmread(VM_EXIT_REASON);

    perfc_incra(vmexits, exit_reason);
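    /*
     * For an external-interrupt exit, leave interrupts disabled until
     * vmx_do_extint() dispatches the pending vector; for every other exit
     * reason it is safe to re-enable them here.
     */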
    if ( exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT )
        local_irq_enable();
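    /*
     * VMX_EXIT_REASONS_FAILED_VMENTRY (bit 31 of the exit reason) is set
     * when the hardware rejected the VM entry itself; dump the VMCS and
     * crash the domain, as there is no guest state worth resuming.
     */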
    if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
    {
        unsigned int failed_vmentry_reason = exit_reason & 0xFFFF;

        exit_qualification = __vmread(EXIT_QUALIFICATION);
        printk("Failed vm entry (exit reason 0x%x) ", exit_reason);
        switch ( failed_vmentry_reason )
        {
        case EXIT_REASON_INVALID_GUEST_STATE:
            printk("caused by invalid guest state (%ld).\n", exit_qualification);
            break;
        case EXIT_REASON_MSR_LOADING:
            printk("caused by MSR entry %ld loading.\n", exit_qualification);
            break;
        case EXIT_REASON_MACHINE_CHECK:
            printk("caused by machine check.\n");
            break;
        default:
            printk("reason not known yet!\n");
            break;
        }
        printk("************* VMCS Area **************\n");
        vmcs_dump_vcpu();
        printk("**************************************\n");
        goto exit_and_crash;
    }
    TRACE_VMEXIT(0, exit_reason);

    switch ( exit_reason )
    {
    case EXIT_REASON_EXCEPTION_NMI:
    {
        /*
         * Software-interrupt (INT n) exiting is not enabled, so this exit
         * was caused either by (1) an exception (e.g. #PF) raised in the
         * guest, or (2) an NMI.
         */
        unsigned int intr_info, vector;

        intr_info = __vmread(VM_EXIT_INTR_INFO);
        BUG_ON(!(intr_info & INTR_INFO_VALID_MASK));

        vector = intr_info & INTR_INFO_VECTOR_MASK;

        TRACE_VMEXIT(1, vector);
        perfc_incra(cause_vector, vector);

        switch ( vector )
        {
#ifdef XEN_DEBUGGER
        case TRAP_debug:
        {
            save_cpu_user_regs(regs);
            pdb_handle_exception(1, regs, 1);
            restore_cpu_user_regs(regs);
            break;
        }
        case TRAP_int3:
        {
            save_cpu_user_regs(regs);
            pdb_handle_exception(3, regs, 1);
            restore_cpu_user_regs(regs);
            break;
        }
#else
        case TRAP_debug:
        {
            if ( test_bit(_DOMF_debugging, &v->domain->domain_flags) )
            {
                store_cpu_user_regs(regs);
                domain_pause_for_debugger();
                __vm_clear_bit(GUEST_PENDING_DBG_EXCEPTIONS,
                               PENDING_DEBUG_EXC_BS);
            }
            else
            {
                vmx_reflect_exception(v);
                __vm_clear_bit(GUEST_PENDING_DBG_EXCEPTIONS,
                               PENDING_DEBUG_EXC_BS);
            }

            break;
        }
        case TRAP_int3:
        {
            if ( test_bit(_DOMF_debugging, &v->domain->domain_flags) )
                domain_pause_for_debugger();
            else
                vmx_reflect_exception(v);
            break;
        }
#endif
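        /*
         * Device-not-available (#NM): the guest touched FPU/SSE state while
         * CR0.TS was set; lazily give the FPU back to the guest rather than
         * reflecting the fault.
         */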
        case TRAP_no_device:
        {
            vmx_do_no_device_fault();
            break;
        }
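        /*
         * For a page-fault exit the exit qualification holds the faulting
         * linear address (the value that would normally be written to CR2).
         */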
        case TRAP_page_fault:
        {
            exit_qualification = __vmread(EXIT_QUALIFICATION);
            regs->error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);

            TRACE_VMEXIT(3, regs->error_code);
            TRACE_VMEXIT(4, exit_qualification);

            HVM_DBG_LOG(DBG_LEVEL_VMMU,
                        "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
                        (unsigned long)regs->eax, (unsigned long)regs->ebx,
                        (unsigned long)regs->ecx, (unsigned long)regs->edx,
                        (unsigned long)regs->esi, (unsigned long)regs->edi);
            if ( !vmx_do_page_fault(exit_qualification, regs) )
            {
                /* Inject #PF using the Interruption-Information Fields. */
                vmx_inject_hw_exception(v, TRAP_page_fault, regs->error_code);
                v->arch.hvm_vmx.cpu_cr2 = exit_qualification;
                TRACE_3D(TRC_VMX_INTR, v->domain->domain_id,
                         TRAP_page_fault, exit_qualification);
            }
            break;
        }
        case TRAP_nmi:
            if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI )
                do_nmi(regs); /* Real NMI, vector 2: normal processing. */
            else
                vmx_reflect_exception(v);
            break;
        default:
            vmx_reflect_exception(v);
            break;
        }
        break;
    }
    case EXIT_REASON_EXTERNAL_INTERRUPT:
        vmx_do_extint(regs);
        break;
    case EXIT_REASON_TRIPLE_FAULT:
        goto exit_and_crash;
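    /*
     * An interrupt-window exit: the guest is now able to accept an event,
     * so the interrupt-window exiting control is cleared and the pending
     * interrupt is injected on the way back into the guest.
     */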
    case EXIT_REASON_PENDING_INTERRUPT:
        /* Disable the interrupt window. */
        v->arch.hvm_vcpu.u.vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
        __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
                  v->arch.hvm_vcpu.u.vmx.exec_control);
        break;
    case EXIT_REASON_TASK_SWITCH:
        goto exit_and_crash;
    case EXIT_REASON_CPUID:
        inst_len = __get_instruction_length(); /* Safe: CPUID */
        __update_guest_eip(inst_len);
        vmx_do_cpuid(regs);
        break;
    case EXIT_REASON_HLT:
        inst_len = __get_instruction_length(); /* Safe: HLT */
        __update_guest_eip(inst_len);
        vmx_do_hlt();
        break;
    case EXIT_REASON_INVLPG:
    {
        inst_len = __get_instruction_length(); /* Safe: INVLPG */
        __update_guest_eip(inst_len);
        exit_qualification = __vmread(EXIT_QUALIFICATION);
        vmx_do_invlpg(exit_qualification);
        TRACE_VMEXIT(4, exit_qualification);
        break;
    }
    case EXIT_REASON_VMCALL:
    {
        inst_len = __get_instruction_length(); /* Safe: VMCALL */
        __update_guest_eip(inst_len);
        hvm_do_hypercall(regs);
        break;
    }
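    /*
     * Control-register access (MOV to/from CRn, CLTS, LMSW): the exit
     * qualification encodes which register and access type.  The guest RIP
     * is advanced only if vmx_cr_access() fully handled the access.
     */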
    case EXIT_REASON_CR_ACCESS:
    {
        exit_qualification = __vmread(EXIT_QUALIFICATION);
        inst_len = __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
        if ( vmx_cr_access(exit_qualification, regs) )
            __update_guest_eip(inst_len);
        TRACE_VMEXIT(4, exit_qualification);
        break;
    }
    case EXIT_REASON_DR_ACCESS:
        exit_qualification = __vmread(EXIT_QUALIFICATION);
        vmx_dr_access(exit_qualification, regs);
        break;
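    /*
     * Port I/O (IN/INS/OUT/OUTS): vmx_io_instruction() decodes the exit
     * qualification and, when the port is not handled internally, forwards
     * the request to the device model; the saved instruction length lets
     * RIP be advanced once the I/O completes.
     */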
    case EXIT_REASON_IO_INSTRUCTION:
        exit_qualification = __vmread(EXIT_QUALIFICATION);
        inst_len = __get_instruction_length(); /* Safe: IN, INS, OUT, OUTS */
        vmx_io_instruction(exit_qualification, inst_len);
        TRACE_VMEXIT(4, exit_qualification);
        break;
    case EXIT_REASON_MSR_READ:
        inst_len = __get_instruction_length(); /* Safe: RDMSR */
        __update_guest_eip(inst_len);
        vmx_do_msr_read(regs);
        TRACE_VMEXIT(1, regs->ecx);
        TRACE_VMEXIT(2, regs->eax);
        TRACE_VMEXIT(3, regs->edx);
        break;
    case EXIT_REASON_MSR_WRITE:
        inst_len = __get_instruction_length(); /* Safe: WRMSR */
        __update_guest_eip(inst_len);
        vmx_do_msr_write(regs);
        TRACE_VMEXIT(1, regs->ecx);
        TRACE_VMEXIT(2, regs->eax);
        TRACE_VMEXIT(3, regs->edx);
        break;
    case EXIT_REASON_MWAIT_INSTRUCTION:
    case EXIT_REASON_MONITOR_INSTRUCTION:
    case EXIT_REASON_PAUSE_INSTRUCTION:
        goto exit_and_crash;
    case EXIT_REASON_VMCLEAR:
    case EXIT_REASON_VMLAUNCH:
    case EXIT_REASON_VMPTRLD:
    case EXIT_REASON_VMPTRST:
    case EXIT_REASON_VMREAD:
    case EXIT_REASON_VMRESUME:
    case EXIT_REASON_VMWRITE:
    case EXIT_REASON_VMXOFF:
    case EXIT_REASON_VMXON:
        /* Report an invalid-opcode exception when a VMX guest tries to
         * execute any of the VMX instructions. */
        vmx_inject_hw_exception(v, TRAP_invalid_op, VMX_DELIVER_NO_ERROR_CODE);
        break;
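    /*
     * The guest lowered its task priority below the threshold programmed in
     * the VMCS; flag the vlapic so the TPR threshold is recalculated before
     * the next VM entry.
     */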
    case EXIT_REASON_TPR_BELOW_THRESHOLD:
        vcpu_vlapic(v)->flush_tpr_threshold = 1;
        break;
    default:
    exit_and_crash:
        gdprintk(XENLOG_ERR, "Bad vmexit (reason %x)\n", exit_reason);
        domain_crash(v->domain);
        break;
    }
}
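/*
 * Tracing helpers called around VM entry/exit: emit the per-vcpu trace
 * records and reset the scratch TRACE_VMEXIT values for the next exit.
 */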
asmlinkage void vmx_trace_vmentry(void)
{
    struct vcpu *v = current;
    TRACE_5D(TRC_VMX_VMENTRY + current->vcpu_id,
             v->arch.hvm_vcpu.hvm_trace_values[0],
             v->arch.hvm_vcpu.hvm_trace_values[1],
             v->arch.hvm_vcpu.hvm_trace_values[2],
             v->arch.hvm_vcpu.hvm_trace_values[3],
             v->arch.hvm_vcpu.hvm_trace_values[4]);

    TRACE_VMEXIT(0, 0);
    TRACE_VMEXIT(1, 0);
    TRACE_VMEXIT(2, 0);
    TRACE_VMEXIT(3, 0);
    TRACE_VMEXIT(4, 0);
}
asmlinkage void vmx_trace_vmexit(void)
{
    TRACE_3D(TRC_VMX_VMEXIT + current->vcpu_id, 0, 0, 0);
}
/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */