xen/arch/x86/hvm/vmx/vmx.c @ 12350:5a4517468f4f (direct-io.hg)

[HVM] Remove HVM halt timer. It's no longer needed since interrupts
can wake it up now.

Signed-off-by: Xin Li <xin.b.li@intel.com>
author kfraser@localhost.localdomain
date Fri Nov 10 11:01:15 2006 +0000 (2006-11-10)
parents 9f9f569b0a1d
children 0b385df5f236
1 /*
2 * vmx.c: handling VMX architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 *
18 */
20 #include <xen/config.h>
21 #include <xen/init.h>
22 #include <xen/lib.h>
23 #include <xen/trace.h>
24 #include <xen/sched.h>
25 #include <xen/irq.h>
26 #include <xen/softirq.h>
27 #include <xen/domain_page.h>
28 #include <xen/hypercall.h>
29 #include <xen/perfc.h>
30 #include <asm/current.h>
31 #include <asm/io.h>
32 #include <asm/regs.h>
33 #include <asm/cpufeature.h>
34 #include <asm/processor.h>
35 #include <asm/types.h>
36 #include <asm/msr.h>
37 #include <asm/spinlock.h>
38 #include <asm/hvm/hvm.h>
39 #include <asm/hvm/support.h>
40 #include <asm/hvm/vmx/vmx.h>
41 #include <asm/hvm/vmx/vmcs.h>
42 #include <asm/hvm/vmx/cpu.h>
43 #include <asm/shadow.h>
44 #include <public/sched.h>
45 #include <public/hvm/ioreq.h>
46 #include <asm/hvm/vpic.h>
47 #include <asm/hvm/vlapic.h>
48 #include <asm/x86_emulate.h>
50 static void vmx_ctxt_switch_from(struct vcpu *v);
51 static void vmx_ctxt_switch_to(struct vcpu *v);
53 static int vmx_vcpu_initialise(struct vcpu *v)
54 {
55 int rc;
57 spin_lock_init(&v->arch.hvm_vmx.vmcs_lock);
59 v->arch.schedule_tail = arch_vmx_do_resume;
60 v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
61 v->arch.ctxt_switch_to = vmx_ctxt_switch_to;
63 if ( (rc = vmx_create_vmcs(v)) != 0 )
64 {
65 dprintk(XENLOG_WARNING,
66 "Failed to create VMCS for vcpu %d: err=%d.\n",
67 v->vcpu_id, rc);
68 return rc;
69 }
71 return 0;
72 }
74 static void vmx_vcpu_destroy(struct vcpu *v)
75 {
76 vmx_destroy_vmcs(v);
77 }
79 #ifdef __x86_64__
81 static DEFINE_PER_CPU(struct vmx_msr_state, percpu_msr);
83 static u32 msr_data_index[VMX_MSR_COUNT] =
84 {
85 MSR_LSTAR, MSR_STAR, MSR_CSTAR,
86 MSR_SYSCALL_MASK, MSR_EFER,
87 };
89 static void vmx_save_segments(struct vcpu *v)
90 {
91 rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.msr_content.shadow_gs);
92 }
94 /*
95 * To avoid MSR save/restore at every VM exit/entry time, we restore
96 * the x86_64 specific MSRs at domain switch time. Since those MSRs are
97 * are not modified once set for generic domains, we don't save them,
98 * but simply reset them to the values set at percpu_traps_init().
99 */
100 static void vmx_load_msrs(void)
101 {
102 struct vmx_msr_state *host_state = &this_cpu(percpu_msr);
103 int i;
105 while ( host_state->flags )
106 {
107 i = find_first_set_bit(host_state->flags);
108 wrmsrl(msr_data_index[i], host_state->msr_items[i]);
109 clear_bit(i, &host_state->flags);
110 }
111 }
113 static void vmx_save_init_msrs(void)
114 {
115 struct vmx_msr_state *host_state = &this_cpu(percpu_msr);
116 int i;
118 for ( i = 0; i < VMX_MSR_COUNT; i++ )
119 rdmsrl(msr_data_index[i], host_state->msr_items[i]);
120 }
122 #define CASE_READ_MSR(address) \
123 case MSR_ ## address: \
124 msr_content = msr->msr_items[VMX_INDEX_MSR_ ## address]; \
125 break
127 #define CASE_WRITE_MSR(address) \
128 case MSR_ ## address: \
129 { \
130 msr->msr_items[VMX_INDEX_MSR_ ## address] = msr_content; \
131 if (!test_bit(VMX_INDEX_MSR_ ## address, &msr->flags)) { \
132 set_bit(VMX_INDEX_MSR_ ## address, &msr->flags); \
133 } \
134 wrmsrl(MSR_ ## address, msr_content); \
135 set_bit(VMX_INDEX_MSR_ ## address, &host_state->flags); \
136 } \
137 break
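/* NB: IS_CANO_ADDRESS below is a stub that accepts every value; a complete
 * check would reject non-canonical addresses, i.e. those whose bits 63:48
 * are not copies of bit 47. */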
139 #define IS_CANO_ADDRESS(add) 1
140 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
141 {
142 u64 msr_content = 0;
143 struct vcpu *v = current;
144 struct vmx_msr_state *msr = &v->arch.hvm_vmx.msr_content;
146 switch ( regs->ecx ) {
147 case MSR_EFER:
148 msr_content = msr->msr_items[VMX_INDEX_MSR_EFER];
149 HVM_DBG_LOG(DBG_LEVEL_2, "EFER msr_content 0x%"PRIx64, msr_content);
150 break;
152 case MSR_FS_BASE:
153 if ( !(vmx_long_mode_enabled(v)) )
154 /* XXX should it be GP fault */
155 domain_crash_synchronous();
157 msr_content = __vmread(GUEST_FS_BASE);
158 break;
160 case MSR_GS_BASE:
161 if ( !(vmx_long_mode_enabled(v)) )
162 domain_crash_synchronous();
164 msr_content = __vmread(GUEST_GS_BASE);
165 break;
167 case MSR_SHADOW_GS_BASE:
168 msr_content = msr->shadow_gs;
169 break;
171 CASE_READ_MSR(STAR);
172 CASE_READ_MSR(LSTAR);
173 CASE_READ_MSR(CSTAR);
174 CASE_READ_MSR(SYSCALL_MASK);
176 default:
177 return 0;
178 }
180 HVM_DBG_LOG(DBG_LEVEL_2, "msr_content: 0x%"PRIx64, msr_content);
182 regs->eax = (u32)(msr_content >> 0);
183 regs->edx = (u32)(msr_content >> 32);
185 return 1;
186 }
188 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
189 {
190 u64 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
191 struct vcpu *v = current;
192 struct vmx_msr_state *msr = &v->arch.hvm_vmx.msr_content;
193 struct vmx_msr_state *host_state = &this_cpu(percpu_msr);
195 HVM_DBG_LOG(DBG_LEVEL_1, "msr 0x%lx msr_content 0x%"PRIx64"\n",
196 (unsigned long)regs->ecx, msr_content);
198 switch ( regs->ecx ) {
199 case MSR_EFER:
200 /* offending reserved bit will cause #GP */
201 if ( msr_content & ~(EFER_LME | EFER_LMA | EFER_NX | EFER_SCE) )
202 {
203 printk("Trying to set reserved bit in EFER: %"PRIx64"\n",
204 msr_content);
205 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
206 return 0;
207 }
209 if ( (msr_content & EFER_LME)
210 && !(msr->msr_items[VMX_INDEX_MSR_EFER] & EFER_LME) )
211 {
212 if ( unlikely(vmx_paging_enabled(v)) )
213 {
214 printk("Trying to set EFER.LME with paging enabled\n");
215 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
216 return 0;
217 }
218 }
219 else if ( !(msr_content & EFER_LME)
220 && (msr->msr_items[VMX_INDEX_MSR_EFER] & EFER_LME) )
221 {
222 if ( unlikely(vmx_paging_enabled(v)) )
223 {
224 printk("Trying to clear EFER.LME with paging enabled\n");
225 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
226 return 0;
227 }
228 }
230 msr->msr_items[VMX_INDEX_MSR_EFER] = msr_content;
231 break;
233 case MSR_FS_BASE:
234 case MSR_GS_BASE:
235 if ( !(vmx_long_mode_enabled(v)) )
236 domain_crash_synchronous();
238 if ( !IS_CANO_ADDRESS(msr_content) )
239 {
240 HVM_DBG_LOG(DBG_LEVEL_1, "Not a canonical address in MSR write\n");
241 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
242 return 0;
243 }
245 if ( regs->ecx == MSR_FS_BASE )
246 __vmwrite(GUEST_FS_BASE, msr_content);
247 else
248 __vmwrite(GUEST_GS_BASE, msr_content);
250 break;
252 case MSR_SHADOW_GS_BASE:
253 if ( !(vmx_long_mode_enabled(v)) )
254 domain_crash_synchronous();
256 v->arch.hvm_vmx.msr_content.shadow_gs = msr_content;
257 wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
258 break;
260 CASE_WRITE_MSR(STAR);
261 CASE_WRITE_MSR(LSTAR);
262 CASE_WRITE_MSR(CSTAR);
263 CASE_WRITE_MSR(SYSCALL_MASK);
265 default:
266 return 0;
267 }
269 return 1;
270 }
272 static void vmx_restore_msrs(struct vcpu *v)
273 {
274 int i = 0;
275 struct vmx_msr_state *guest_state;
276 struct vmx_msr_state *host_state;
277 unsigned long guest_flags;
279 guest_state = &v->arch.hvm_vmx.msr_content;
280 host_state = &this_cpu(percpu_msr);
282 wrmsrl(MSR_SHADOW_GS_BASE, guest_state->shadow_gs);
283 guest_flags = guest_state->flags;
284 if (!guest_flags)
285 return;
287 while (guest_flags){
288 i = find_first_set_bit(guest_flags);
290 HVM_DBG_LOG(DBG_LEVEL_2,
291 "restore guest's index %d msr %lx with %lx\n",
292 i, (unsigned long)msr_data_index[i],
293 (unsigned long)guest_state->msr_items[i]);
294 set_bit(i, &host_state->flags);
295 wrmsrl(msr_data_index[i], guest_state->msr_items[i]);
296 clear_bit(i, &guest_flags);
297 }
298 }
300 #else /* __i386__ */
302 #define vmx_save_segments(v) ((void)0)
303 #define vmx_load_msrs() ((void)0)
304 #define vmx_restore_msrs(v) ((void)0)
305 #define vmx_save_init_msrs() ((void)0)
307 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
308 {
309 return 0;
310 }
312 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
313 {
314 return 0;
315 }
317 #endif /* __i386__ */
319 #define loaddebug(_v,_reg) \
320 __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
321 #define savedebug(_v,_reg) \
322 __asm__ __volatile__ ("mov %%db" #_reg ",%0" : "=r" ((_v)->debugreg[_reg]))
324 static inline void vmx_save_dr(struct vcpu *v)
325 {
326 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
327 return;
329 /* Clear the DR dirty flag and re-enable intercepts for DR accesses. */
330 v->arch.hvm_vcpu.flag_dr_dirty = 0;
331 v->arch.hvm_vcpu.u.vmx.exec_control |= CPU_BASED_MOV_DR_EXITING;
332 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vcpu.u.vmx.exec_control);
334 savedebug(&v->arch.guest_context, 0);
335 savedebug(&v->arch.guest_context, 1);
336 savedebug(&v->arch.guest_context, 2);
337 savedebug(&v->arch.guest_context, 3);
338 savedebug(&v->arch.guest_context, 6);
339 v->arch.guest_context.debugreg[7] = __vmread(GUEST_DR7);
340 }
342 static inline void __restore_debug_registers(struct vcpu *v)
343 {
344 loaddebug(&v->arch.guest_context, 0);
345 loaddebug(&v->arch.guest_context, 1);
346 loaddebug(&v->arch.guest_context, 2);
347 loaddebug(&v->arch.guest_context, 3);
348 /* DR4 and DR5 are skipped: they are obsolete aliases of DR6 and DR7. */
349 loaddebug(&v->arch.guest_context, 6);
350 /* DR7 is loaded from the VMCS. */
351 }
353 /*
354 * DR7 is saved and restored on every vmexit. Other debug registers only
355 * need to be restored if their value is going to affect execution -- i.e.,
356 * if one of the breakpoints is enabled. So mask out all bits that don't
357 * enable some breakpoint functionality.
358 */
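/* Bits 0-7 of DR7 are the per-breakpoint local/global enable bits (L0,G0 .. L3,G3). */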
359 #define DR7_ACTIVE_MASK 0xff
361 static inline void vmx_restore_dr(struct vcpu *v)
362 {
363 /* NB. __vmread() is not usable here, so we cannot read from the VMCS. */
364 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
365 __restore_debug_registers(v);
366 }
368 static void vmx_freeze_time(struct vcpu *v)
369 {
370 struct periodic_time *pt=&v->domain->arch.hvm_domain.pl_time.periodic_tm;
372 if ( pt->enabled && pt->first_injected
373 && (v->vcpu_id == pt->bind_vcpu)
374 && !v->arch.hvm_vcpu.guest_time ) {
375 v->arch.hvm_vcpu.guest_time = hvm_get_guest_time(v);
376 if ( !test_bit(_VCPUF_blocked, &v->vcpu_flags) )
377 stop_timer(&pt->timer);
378 }
379 }
381 static void vmx_ctxt_switch_from(struct vcpu *v)
382 {
383 vmx_freeze_time(v);
384 vmx_save_segments(v);
385 vmx_load_msrs();
386 vmx_save_dr(v);
387 }
389 static void vmx_ctxt_switch_to(struct vcpu *v)
390 {
391 vmx_restore_msrs(v);
392 vmx_restore_dr(v);
393 }
395 static void stop_vmx(void)
396 {
397 if ( !(read_cr4() & X86_CR4_VMXE) )
398 return;
399 __vmxoff();
400 clear_in_cr4(X86_CR4_VMXE);
401 }
403 void vmx_migrate_timers(struct vcpu *v)
404 {
405 struct periodic_time *pt = &v->domain->arch.hvm_domain.pl_time.periodic_tm;
406 struct RTCState *vrtc = &v->domain->arch.hvm_domain.pl_time.vrtc;
407 struct PMTState *vpmt = &v->domain->arch.hvm_domain.pl_time.vpmt;
409 if ( pt->enabled )
410 {
411 migrate_timer(&pt->timer, v->processor);
412 }
413 migrate_timer(&vcpu_vlapic(v)->vlapic_timer, v->processor);
414 migrate_timer(&vrtc->second_timer, v->processor);
415 migrate_timer(&vrtc->second_timer2, v->processor);
416 migrate_timer(&vpmt->timer, v->processor);
417 }
419 static void vmx_store_cpu_guest_regs(
420 struct vcpu *v, struct cpu_user_regs *regs, unsigned long *crs)
421 {
422 vmx_vmcs_enter(v);
424 if ( regs != NULL )
425 {
426 regs->eflags = __vmread(GUEST_RFLAGS);
427 regs->ss = __vmread(GUEST_SS_SELECTOR);
428 regs->cs = __vmread(GUEST_CS_SELECTOR);
429 regs->ds = __vmread(GUEST_DS_SELECTOR);
430 regs->es = __vmread(GUEST_ES_SELECTOR);
431 regs->gs = __vmread(GUEST_GS_SELECTOR);
432 regs->fs = __vmread(GUEST_FS_SELECTOR);
433 regs->eip = __vmread(GUEST_RIP);
434 regs->esp = __vmread(GUEST_RSP);
435 }
437 if ( crs != NULL )
438 {
439 crs[0] = v->arch.hvm_vmx.cpu_shadow_cr0;
440 crs[2] = v->arch.hvm_vmx.cpu_cr2;
441 crs[3] = __vmread(GUEST_CR3);
442 crs[4] = v->arch.hvm_vmx.cpu_shadow_cr4;
443 }
445 vmx_vmcs_exit(v);
446 }
448 /*
449 * The VMX spec (section 4.3.1.2, Checks on Guest Segment
450 * Registers) says that virtual-8086 mode guests' segment
451 * base-address fields in the VMCS must be equal to their
452 * corresponding segment selector field shifted right by
453 * four bits upon vmentry.
454 *
455 * This function (called only for VM86-mode guests) fixes
456 * the bases to be consistent with the selectors in regs
457 * if they're not already. Without this, we can fail the
458 * vmentry check mentioned above.
459 */
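/* For example, a virtual-8086 selector of 0x1234 requires a base of 0x12340
 * (selector << 4). */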
460 static void fixup_vm86_seg_bases(struct cpu_user_regs *regs)
461 {
462 unsigned long base;
464 base = __vmread(GUEST_ES_BASE);
465 if (regs->es << 4 != base)
466 __vmwrite(GUEST_ES_BASE, regs->es << 4);
467 base = __vmread(GUEST_CS_BASE);
468 if (regs->cs << 4 != base)
469 __vmwrite(GUEST_CS_BASE, regs->cs << 4);
470 base = __vmread(GUEST_SS_BASE);
471 if (regs->ss << 4 != base)
472 __vmwrite(GUEST_SS_BASE, regs->ss << 4);
473 base = __vmread(GUEST_DS_BASE);
474 if (regs->ds << 4 != base)
475 __vmwrite(GUEST_DS_BASE, regs->ds << 4);
476 base = __vmread(GUEST_FS_BASE);
477 if (regs->fs << 4 != base)
478 __vmwrite(GUEST_FS_BASE, regs->fs << 4);
479 base = __vmread(GUEST_GS_BASE);
480 if (regs->gs << 4 != base)
481 __vmwrite(GUEST_GS_BASE, regs->gs << 4);
482 }
484 static void vmx_load_cpu_guest_regs(struct vcpu *v, struct cpu_user_regs *regs)
485 {
486 vmx_vmcs_enter(v);
488 __vmwrite(GUEST_SS_SELECTOR, regs->ss);
489 __vmwrite(GUEST_DS_SELECTOR, regs->ds);
490 __vmwrite(GUEST_ES_SELECTOR, regs->es);
491 __vmwrite(GUEST_GS_SELECTOR, regs->gs);
492 __vmwrite(GUEST_FS_SELECTOR, regs->fs);
494 __vmwrite(GUEST_RSP, regs->esp);
496 /* NB. Bit 1 of RFLAGS must be set for VMENTRY to succeed. */
497 __vmwrite(GUEST_RFLAGS, regs->eflags | 2UL);
498 if (regs->eflags & EF_TF)
499 __vm_set_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
500 else
501 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
502 if (regs->eflags & EF_VM)
503 fixup_vm86_seg_bases(regs);
505 __vmwrite(GUEST_CS_SELECTOR, regs->cs);
506 __vmwrite(GUEST_RIP, regs->eip);
508 vmx_vmcs_exit(v);
509 }
511 static unsigned long vmx_get_ctrl_reg(struct vcpu *v, unsigned int num)
512 {
513 switch ( num )
514 {
515 case 0:
516 return v->arch.hvm_vmx.cpu_cr0;
517 case 2:
518 return v->arch.hvm_vmx.cpu_cr2;
519 case 3:
520 return v->arch.hvm_vmx.cpu_cr3;
521 case 4:
522 return v->arch.hvm_vmx.cpu_shadow_cr4;
523 default:
524 BUG();
525 }
526 return 0; /* dummy */
527 }
529 /* Make sure that Xen intercepts any FP accesses from the current vcpu. */
530 static void vmx_stts(struct vcpu *v)
531 {
532 /* VMX depends on operating on the current vcpu */
533 ASSERT(v == current);
535 /*
536 * If the guest does not have TS enabled then we must cause and handle an
537 * exception on first use of the FPU. If the guest *does* have TS enabled
538 * then this is not necessary: no FPU activity can occur until the guest
539 * clears CR0.TS, and we will initialise the FPU when that happens.
540 */
541 if ( !(v->arch.hvm_vmx.cpu_shadow_cr0 & X86_CR0_TS) )
542 {
543 v->arch.hvm_vmx.cpu_cr0 |= X86_CR0_TS;
544 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
545 __vm_set_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
546 }
547 }
549 static void vmx_set_tsc_offset(struct vcpu *v, u64 offset)
550 {
551 vmx_vmcs_enter(v);
552 __vmwrite(TSC_OFFSET, offset);
553 #if defined (__i386__)
554 __vmwrite(TSC_OFFSET_HIGH, offset >> 32);
555 #endif
556 vmx_vmcs_exit(v);
557 }
559 static void vmx_init_ap_context(
560 struct vcpu_guest_context *ctxt, int vcpuid, int trampoline_vector)
561 {
562 memset(ctxt, 0, sizeof(*ctxt));
563 ctxt->user_regs.eip = VMXASSIST_BASE;
564 ctxt->user_regs.edx = vcpuid;
565 ctxt->user_regs.ebx = trampoline_vector;
566 }
568 void do_nmi(struct cpu_user_regs *);
570 static void vmx_init_hypercall_page(struct domain *d, void *hypercall_page)
571 {
572 char *p;
573 int i;
575 memset(hypercall_page, 0, PAGE_SIZE);
577 for ( i = 0; i < (PAGE_SIZE / 32); i++ )
578 {
579 p = (char *)(hypercall_page + (i * 32));
580 *(u8 *)(p + 0) = 0xb8; /* mov imm32, %eax */
581 *(u32 *)(p + 1) = i;
582 *(u8 *)(p + 5) = 0x0f; /* vmcall */
583 *(u8 *)(p + 6) = 0x01;
584 *(u8 *)(p + 7) = 0xc1;
585 *(u8 *)(p + 8) = 0xc3; /* ret */
586 }
588 /* Don't support HYPERVISOR_iret at the moment */
589 *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
590 }
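/*
 * Each 32-byte slot written above thus contains the 9-byte stub
 *   b8 NN NN NN NN        mov  $hypercall_nr, %eax
 *   0f 01 c1              vmcall
 *   c3                    ret
 * with the remaining bytes of the slot left as zero.
 */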
592 static int vmx_realmode(struct vcpu *v)
593 {
594 unsigned long rflags;
596 ASSERT(v == current);
598 rflags = __vmread(GUEST_RFLAGS);
599 return rflags & X86_EFLAGS_VM;
600 }
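/*
 * In the VMCS segment access-rights format, bit 13 of GUEST_CS_AR_BYTES is
 * the 'L' (64-bit code segment) flag and bit 14 is the 'D/B' (default
 * operation size) flag; vmx_guest_x86_mode() below keys off these bits.
 */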
602 static int vmx_guest_x86_mode(struct vcpu *v)
603 {
604 unsigned long cs_ar_bytes;
606 ASSERT(v == current);
608 cs_ar_bytes = __vmread(GUEST_CS_AR_BYTES);
610 if ( vmx_long_mode_enabled(v) )
611 return ((cs_ar_bytes & (1u<<13)) ?
612 X86EMUL_MODE_PROT64 : X86EMUL_MODE_PROT32);
614 if ( vmx_realmode(v) )
615 return X86EMUL_MODE_REAL;
617 return ((cs_ar_bytes & (1u<<14)) ?
618 X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16);
619 }
621 static int vmx_pae_enabled(struct vcpu *v)
622 {
623 unsigned long cr4 = v->arch.hvm_vmx.cpu_shadow_cr4;
624 return (vmx_paging_enabled(v) && (cr4 & X86_CR4_PAE));
625 }
627 /* Set up the HVM function table. */
628 static void vmx_setup_hvm_funcs(void)
629 {
630 if ( hvm_enabled )
631 return;
633 hvm_funcs.disable = stop_vmx;
635 hvm_funcs.vcpu_initialise = vmx_vcpu_initialise;
636 hvm_funcs.vcpu_destroy = vmx_vcpu_destroy;
638 hvm_funcs.store_cpu_guest_regs = vmx_store_cpu_guest_regs;
639 hvm_funcs.load_cpu_guest_regs = vmx_load_cpu_guest_regs;
641 hvm_funcs.realmode = vmx_realmode;
642 hvm_funcs.paging_enabled = vmx_paging_enabled;
643 hvm_funcs.long_mode_enabled = vmx_long_mode_enabled;
644 hvm_funcs.pae_enabled = vmx_pae_enabled;
645 hvm_funcs.guest_x86_mode = vmx_guest_x86_mode;
646 hvm_funcs.get_guest_ctrl_reg = vmx_get_ctrl_reg;
648 hvm_funcs.update_host_cr3 = vmx_update_host_cr3;
650 hvm_funcs.stts = vmx_stts;
651 hvm_funcs.set_tsc_offset = vmx_set_tsc_offset;
653 hvm_funcs.init_ap_context = vmx_init_ap_context;
655 hvm_funcs.init_hypercall_page = vmx_init_hypercall_page;
656 }
658 int start_vmx(void)
659 {
660 u32 eax, edx;
661 struct vmcs_struct *vmcs;
663 /*
664 * Xen does not fill x86_capability words except 0.
665 */
666 boot_cpu_data.x86_capability[4] = cpuid_ecx(1);
668 if ( !test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability) )
669 return 0;
671 rdmsr(IA32_FEATURE_CONTROL_MSR, eax, edx);
673 if ( eax & IA32_FEATURE_CONTROL_MSR_LOCK )
674 {
675 if ( (eax & IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON) == 0x0 )
676 {
677 printk("VMX disabled by Feature Control MSR.\n");
678 return 0;
679 }
680 }
681 else
682 {
683 wrmsr(IA32_FEATURE_CONTROL_MSR,
684 IA32_FEATURE_CONTROL_MSR_LOCK |
685 IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON, 0);
686 }
688 set_in_cr4(X86_CR4_VMXE);
690 vmx_init_vmcs_config();
692 if ( smp_processor_id() == 0 )
693 setup_vmcs_dump();
695 if ( (vmcs = vmx_alloc_host_vmcs()) == NULL )
696 {
697 clear_in_cr4(X86_CR4_VMXE);
698 printk("Failed to allocate host VMCS\n");
699 return 0;
700 }
702 if ( __vmxon(virt_to_maddr(vmcs)) )
703 {
704 clear_in_cr4(X86_CR4_VMXE);
705 printk("VMXON failed\n");
706 vmx_free_host_vmcs(vmcs);
707 return 0;
708 }
710 printk("VMXON is done\n");
712 vmx_save_init_msrs();
714 vmx_setup_hvm_funcs();
716 hvm_enabled = 1;
718 return 1;
719 }
721 /*
722 * Not all VM exits provide a valid value in the VM-exit instruction-length field.
723 * Callers must know what they're doing!
724 */
725 static int __get_instruction_length(void)
726 {
727 int len;
728 len = __vmread(VM_EXIT_INSTRUCTION_LEN); /* Safe: callers audited */
729 if ( (len < 1) || (len > 15) )
730 __hvm_bug(guest_cpu_user_regs());
731 return len;
732 }
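/*
 * Note that __update_guest_eip() below also clears the guest
 * interruptibility-state field, dropping any blocking by STI or MOV-SS once
 * the emulated instruction has been skipped.
 */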
734 static inline void __update_guest_eip(unsigned long inst_len)
735 {
736 unsigned long current_eip;
738 current_eip = __vmread(GUEST_RIP);
739 __vmwrite(GUEST_RIP, current_eip + inst_len);
740 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
741 }
743 static int vmx_do_page_fault(unsigned long va, struct cpu_user_regs *regs)
744 {
745 int result;
747 #if 0 /* keep for debugging */
748 {
749 unsigned long eip, cs;
751 cs = __vmread(GUEST_CS_BASE);
752 eip = __vmread(GUEST_RIP);
753 HVM_DBG_LOG(DBG_LEVEL_VMMU,
754 "vmx_do_page_fault = 0x%lx, cs_base=%lx, "
755 "eip = %lx, error_code = %lx\n",
756 va, cs, eip, (unsigned long)regs->error_code);
757 }
758 #endif
760 result = shadow_fault(va, regs);
762 TRACE_VMEXIT(2, result);
763 #if 0
764 if ( !result )
765 {
766 eip = __vmread(GUEST_RIP);
767 printk("vmx pgfault to guest va=%lx eip=%lx\n", va, eip);
768 }
769 #endif
771 return result;
772 }
774 static void vmx_do_no_device_fault(void)
775 {
776 struct vcpu *v = current;
778 setup_fpu(current);
779 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
781 /* Disable TS in guest CR0 unless the guest wants the exception too. */
782 if ( !(v->arch.hvm_vmx.cpu_shadow_cr0 & X86_CR0_TS) )
783 {
784 v->arch.hvm_vmx.cpu_cr0 &= ~X86_CR0_TS;
785 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
786 }
787 }
789 #define bitmaskof(idx) (1U << ((idx)&31))
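/* bitmaskof() turns an X86_FEATURE_* bit index into a mask within its 32-bit
 * feature word, e.g. bitmaskof(X86_FEATURE_MWAIT) selects bit 3 of CPUID.1:ECX. */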
790 static void vmx_do_cpuid(struct cpu_user_regs *regs)
791 {
792 unsigned int input = (unsigned int)regs->eax;
793 unsigned int count = (unsigned int)regs->ecx;
794 unsigned int eax, ebx, ecx, edx;
795 unsigned long eip;
796 struct vcpu *v = current;
798 eip = __vmread(GUEST_RIP);
800 HVM_DBG_LOG(DBG_LEVEL_3, "(eax) 0x%08lx, (ebx) 0x%08lx, "
801 "(ecx) 0x%08lx, (edx) 0x%08lx, (esi) 0x%08lx, (edi) 0x%08lx",
802 (unsigned long)regs->eax, (unsigned long)regs->ebx,
803 (unsigned long)regs->ecx, (unsigned long)regs->edx,
804 (unsigned long)regs->esi, (unsigned long)regs->edi);
806 if ( input == CPUID_LEAF_0x4 )
807 {
808 cpuid_count(input, count, &eax, &ebx, &ecx, &edx);
809 eax &= NUM_CORES_RESET_MASK;
810 }
811 else if ( input == 0x40000003 )
812 {
813 /*
814 * NB. Unsupported interface for private use of VMXASSIST only.
815 * Note that this leaf lives at <max-hypervisor-leaf> + 1.
816 */
817 u64 value = ((u64)regs->edx << 32) | (u32)regs->ecx;
818 unsigned long mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
819 char *p;
821 gdprintk(XENLOG_INFO, "Input address is 0x%"PRIx64".\n", value);
823 /* 8-byte aligned valid pseudophys address from vmxassist, please. */
824 if ( (value & 7) || (mfn == INVALID_MFN) ||
825 !v->arch.hvm_vmx.vmxassist_enabled )
826 domain_crash_synchronous();
828 p = map_domain_page(mfn);
829 value = *((uint64_t *)(p + (value & (PAGE_SIZE - 1))));
830 unmap_domain_page(p);
832 gdprintk(XENLOG_INFO, "Output value is 0x%"PRIx64".\n", value);
833 ecx = (u32)(value >> 0);
834 edx = (u32)(value >> 32);
835 }
836 else if ( !cpuid_hypervisor_leaves(input, &eax, &ebx, &ecx, &edx) )
837 {
838 cpuid(input, &eax, &ebx, &ecx, &edx);
840 if ( input == CPUID_LEAF_0x1 )
841 {
842 /* Mask off reserved bits. */
843 ecx &= ~VMX_VCPU_CPUID_L1_ECX_RESERVED;
845 if ( !vlapic_global_enabled(vcpu_vlapic(v)) )
846 clear_bit(X86_FEATURE_APIC, &edx);
848 #if CONFIG_PAGING_LEVELS >= 3
849 if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
850 #endif
851 clear_bit(X86_FEATURE_PAE, &edx);
852 clear_bit(X86_FEATURE_PSE36, &edx);
854 ebx &= NUM_THREADS_RESET_MASK;
856 /* Unsupportable for virtualised CPUs. */
857 ecx &= ~(bitmaskof(X86_FEATURE_VMXE) |
858 bitmaskof(X86_FEATURE_EST) |
859 bitmaskof(X86_FEATURE_TM2) |
860 bitmaskof(X86_FEATURE_CID) |
861 bitmaskof(X86_FEATURE_MWAIT) );
863 edx &= ~( bitmaskof(X86_FEATURE_HT) |
864 bitmaskof(X86_FEATURE_ACPI) |
865 bitmaskof(X86_FEATURE_ACC) );
866 }
867 else if ( ( input == CPUID_LEAF_0x6 )
868 || ( input == CPUID_LEAF_0x9 )
869 || ( input == CPUID_LEAF_0xA ))
870 {
871 eax = ebx = ecx = edx = 0x0;
872 }
873 #ifdef __i386__
874 else if ( input == CPUID_LEAF_0x80000001 )
875 {
876 clear_bit(X86_FEATURE_LAHF_LM & 31, &ecx);
878 clear_bit(X86_FEATURE_LM & 31, &edx);
879 clear_bit(X86_FEATURE_SYSCALL & 31, &edx);
880 }
881 #endif
882 }
884 regs->eax = (unsigned long) eax;
885 regs->ebx = (unsigned long) ebx;
886 regs->ecx = (unsigned long) ecx;
887 regs->edx = (unsigned long) edx;
889 HVM_DBG_LOG(DBG_LEVEL_3, "eip@%lx, input: 0x%lx, "
890 "output: eax = 0x%08lx, ebx = 0x%08lx, "
891 "ecx = 0x%08lx, edx = 0x%08lx",
892 (unsigned long)eip, (unsigned long)input,
893 (unsigned long)eax, (unsigned long)ebx,
894 (unsigned long)ecx, (unsigned long)edx);
895 }
897 #define CASE_GET_REG_P(REG, reg) \
898 case REG_ ## REG: reg_p = (unsigned long *)&(regs->reg); break
900 #ifdef __i386__
901 #define CASE_EXTEND_GET_REG_P
902 #else
903 #define CASE_EXTEND_GET_REG_P \
904 CASE_GET_REG_P(R8, r8); \
905 CASE_GET_REG_P(R9, r9); \
906 CASE_GET_REG_P(R10, r10); \
907 CASE_GET_REG_P(R11, r11); \
908 CASE_GET_REG_P(R12, r12); \
909 CASE_GET_REG_P(R13, r13); \
910 CASE_GET_REG_P(R14, r14); \
911 CASE_GET_REG_P(R15, r15)
912 #endif
914 static void vmx_dr_access(unsigned long exit_qualification,
915 struct cpu_user_regs *regs)
916 {
917 struct vcpu *v = current;
919 v->arch.hvm_vcpu.flag_dr_dirty = 1;
921 /* We could probably be smarter about this */
922 __restore_debug_registers(v);
924 /* Allow guest direct access to DR registers */
925 v->arch.hvm_vcpu.u.vmx.exec_control &= ~CPU_BASED_MOV_DR_EXITING;
926 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
927 v->arch.hvm_vcpu.u.vmx.exec_control);
928 }
930 /*
931 * Invalidate the TLB entry for va, and invalidate the shadow page
932 * corresponding to the address va.
933 */
934 static void vmx_do_invlpg(unsigned long va)
935 {
936 unsigned long eip;
937 struct vcpu *v = current;
939 eip = __vmread(GUEST_RIP);
941 HVM_DBG_LOG(DBG_LEVEL_VMMU, "eip=%lx, va=%lx",
942 eip, va);
944 /*
945 * We do the safest thing first, then try to update the shadow by
946 * copying from the guest.
947 */
948 shadow_invlpg(v, va);
949 }
952 static int check_for_null_selector(unsigned long eip, int inst_len, int dir)
953 {
954 unsigned char inst[MAX_INST_LEN];
955 unsigned long sel;
956 int i;
957 int inst_copy_from_guest(unsigned char *, unsigned long, int);
959 /* INS can only use ES segment register, and it can't be overridden */
960 if ( dir == IOREQ_READ )
961 {
962 sel = __vmread(GUEST_ES_SELECTOR);
963 return sel == 0 ? 1 : 0;
964 }
966 memset(inst, 0, MAX_INST_LEN);
967 if ( inst_copy_from_guest(inst, eip, inst_len) != inst_len )
968 {
969 printk("check_for_null_selector: get guest instruction failed\n");
970 domain_crash_synchronous();
971 }
973 for ( i = 0; i < inst_len; i++ )
974 {
975 switch ( inst[i] )
976 {
977 case 0xf3: /* REPZ */
978 case 0xf2: /* REPNZ */
979 case 0xf0: /* LOCK */
980 case 0x66: /* data32 */
981 case 0x67: /* addr32 */
982 continue;
983 case 0x2e: /* CS */
984 sel = __vmread(GUEST_CS_SELECTOR);
985 break;
986 case 0x36: /* SS */
987 sel = __vmread(GUEST_SS_SELECTOR);
988 break;
989 case 0x26: /* ES */
990 sel = __vmread(GUEST_ES_SELECTOR);
991 break;
992 case 0x64: /* FS */
993 sel = __vmread(GUEST_FS_SELECTOR);
994 break;
995 case 0x65: /* GS */
996 sel = __vmread(GUEST_GS_SELECTOR);
997 break;
998 case 0x3e: /* DS */
999 /* FALLTHROUGH */
1000 default:
1001 /* DS is the default */
1002 sel = __vmread(GUEST_DS_SELECTOR);
1004 return sel == 0 ? 1 : 0;
1007 return 0;
1010 static void vmx_io_instruction(unsigned long exit_qualification,
1011 unsigned long inst_len)
1013 struct cpu_user_regs *regs;
1014 struct hvm_io_op *pio_opp;
1015 unsigned long port, size;
1016 int dir, df, vm86;
1018 pio_opp = &current->arch.hvm_vcpu.io_op;
1019 pio_opp->instr = INSTR_PIO;
1020 pio_opp->flags = 0;
1022 regs = &pio_opp->io_context;
1024 /* Copy current guest state into io instruction state structure. */
1025 memcpy(regs, guest_cpu_user_regs(), HVM_CONTEXT_STACK_BYTES);
1026 hvm_store_cpu_guest_regs(current, regs, NULL);
1028 vm86 = regs->eflags & X86_EFLAGS_VM ? 1 : 0;
1029 df = regs->eflags & X86_EFLAGS_DF ? 1 : 0;
1031 HVM_DBG_LOG(DBG_LEVEL_IO, "vm86 %d, eip=%x:%lx, "
1032 "exit_qualification = %lx",
1033 vm86, regs->cs, (unsigned long)regs->eip, exit_qualification);
1035 if ( test_bit(6, &exit_qualification) )
1036 port = (exit_qualification >> 16) & 0xFFFF;
1037 else
1038 port = regs->edx & 0xffff;
1040 TRACE_VMEXIT(1, port);
1042 size = (exit_qualification & 7) + 1;
1043 dir = test_bit(3, &exit_qualification); /* direction */
1045 if ( test_bit(4, &exit_qualification) ) { /* string instruction */
1046 unsigned long addr, count = 1;
1047 int sign = regs->eflags & X86_EFLAGS_DF ? -1 : 1;
1049 addr = __vmread(GUEST_LINEAR_ADDRESS);
1051 /*
1052 * In protected mode, guest linear address is invalid if the
1053 * selector is null.
1054 */
1055 if ( !vm86 && check_for_null_selector(regs->eip, inst_len, dir) )
1056 addr = dir == IOREQ_WRITE ? regs->esi : regs->edi;
1058 if ( test_bit(5, &exit_qualification) ) { /* "rep" prefix */
1059 pio_opp->flags |= REPZ;
1060 count = vm86 ? regs->ecx & 0xFFFF : regs->ecx;
1063 /*
1064 * Handle string pio instructions that cross pages or that
1065 * are unaligned. See the comments in hvm_domain.c/handle_mmio()
1066 */
1067 if ( (addr & PAGE_MASK) != ((addr + size - 1) & PAGE_MASK) ) {
1068 unsigned long value = 0;
1070 pio_opp->flags |= OVERLAP;
1072 if ( dir == IOREQ_WRITE ) /* OUTS */
1074 if ( hvm_paging_enabled(current) )
1075 (void)hvm_copy_from_guest_virt(&value, addr, size);
1076 else
1077 (void)hvm_copy_from_guest_phys(&value, addr, size);
1078 } else
1079 pio_opp->addr = addr;
1081 if ( count == 1 )
1082 regs->eip += inst_len;
1084 send_pio_req(port, 1, size, value, dir, df, 0);
1085 } else {
1086 unsigned long last_addr = sign > 0 ? addr + count * size - 1
1087 : addr - (count - 1) * size;
1089 if ( (addr & PAGE_MASK) != (last_addr & PAGE_MASK) )
1091 if ( sign > 0 )
1092 count = (PAGE_SIZE - (addr & ~PAGE_MASK)) / size;
1093 else
1094 count = (addr & ~PAGE_MASK) / size + 1;
1095 } else
1096 regs->eip += inst_len;
1098 send_pio_req(port, count, size, addr, dir, df, 1);
1100 } else {
1101 if ( port == 0xe9 && dir == IOREQ_WRITE && size == 1 )
1102 hvm_print_line(current, regs->eax); /* guest debug output */
1104 if ( dir == IOREQ_WRITE )
1105 TRACE_VMEXIT(2, regs->eax);
1107 regs->eip += inst_len;
1108 send_pio_req(port, 1, size, regs->eax, dir, df, 0);
1112 static void vmx_world_save(struct vcpu *v, struct vmx_assist_context *c)
1114 /* NB. Skip transition instruction. */
1115 c->eip = __vmread(GUEST_RIP);
1116 c->eip += __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
1118 c->esp = __vmread(GUEST_RSP);
1119 c->eflags = __vmread(GUEST_RFLAGS);
1121 c->cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;
1122 c->cr3 = v->arch.hvm_vmx.cpu_cr3;
1123 c->cr4 = v->arch.hvm_vmx.cpu_shadow_cr4;
1125 c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
1126 c->idtr_base = __vmread(GUEST_IDTR_BASE);
1128 c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
1129 c->gdtr_base = __vmread(GUEST_GDTR_BASE);
1131 c->cs_sel = __vmread(GUEST_CS_SELECTOR);
1132 c->cs_limit = __vmread(GUEST_CS_LIMIT);
1133 c->cs_base = __vmread(GUEST_CS_BASE);
1134 c->cs_arbytes.bytes = __vmread(GUEST_CS_AR_BYTES);
1136 c->ds_sel = __vmread(GUEST_DS_SELECTOR);
1137 c->ds_limit = __vmread(GUEST_DS_LIMIT);
1138 c->ds_base = __vmread(GUEST_DS_BASE);
1139 c->ds_arbytes.bytes = __vmread(GUEST_DS_AR_BYTES);
1141 c->es_sel = __vmread(GUEST_ES_SELECTOR);
1142 c->es_limit = __vmread(GUEST_ES_LIMIT);
1143 c->es_base = __vmread(GUEST_ES_BASE);
1144 c->es_arbytes.bytes = __vmread(GUEST_ES_AR_BYTES);
1146 c->ss_sel = __vmread(GUEST_SS_SELECTOR);
1147 c->ss_limit = __vmread(GUEST_SS_LIMIT);
1148 c->ss_base = __vmread(GUEST_SS_BASE);
1149 c->ss_arbytes.bytes = __vmread(GUEST_SS_AR_BYTES);
1151 c->fs_sel = __vmread(GUEST_FS_SELECTOR);
1152 c->fs_limit = __vmread(GUEST_FS_LIMIT);
1153 c->fs_base = __vmread(GUEST_FS_BASE);
1154 c->fs_arbytes.bytes = __vmread(GUEST_FS_AR_BYTES);
1156 c->gs_sel = __vmread(GUEST_GS_SELECTOR);
1157 c->gs_limit = __vmread(GUEST_GS_LIMIT);
1158 c->gs_base = __vmread(GUEST_GS_BASE);
1159 c->gs_arbytes.bytes = __vmread(GUEST_GS_AR_BYTES);
1161 c->tr_sel = __vmread(GUEST_TR_SELECTOR);
1162 c->tr_limit = __vmread(GUEST_TR_LIMIT);
1163 c->tr_base = __vmread(GUEST_TR_BASE);
1164 c->tr_arbytes.bytes = __vmread(GUEST_TR_AR_BYTES);
1166 c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR);
1167 c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT);
1168 c->ldtr_base = __vmread(GUEST_LDTR_BASE);
1169 c->ldtr_arbytes.bytes = __vmread(GUEST_LDTR_AR_BYTES);
1172 static void vmx_world_restore(struct vcpu *v, struct vmx_assist_context *c)
1174 unsigned long mfn, old_base_mfn;
1176 __vmwrite(GUEST_RIP, c->eip);
1177 __vmwrite(GUEST_RSP, c->esp);
1178 __vmwrite(GUEST_RFLAGS, c->eflags);
1180 v->arch.hvm_vmx.cpu_shadow_cr0 = c->cr0;
1181 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
1183 if ( !vmx_paging_enabled(v) )
1184 goto skip_cr3;
1186 if ( c->cr3 == v->arch.hvm_vmx.cpu_cr3 )
1188 /*
1189 * This is simple TLB flush, implying the guest has
1190 * removed some translation or changed page attributes.
1191 * We simply invalidate the shadow.
1192 */
1193 mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
1194 if ( mfn != pagetable_get_pfn(v->arch.guest_table) )
1196 printk("Invalid CR3 value=%x", c->cr3);
1197 domain_crash_synchronous();
1200 else
1202 /*
1203 * If different, make a shadow. Check if the PDBR is valid
1204 * first.
1205 */
1206 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %x", c->cr3);
1207 mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
1208 if ( !VALID_MFN(mfn) )
1210 printk("Invalid CR3 value=%x", c->cr3);
1211 domain_crash_synchronous();
1213 if ( !get_page(mfn_to_page(mfn), v->domain) )
1214 domain_crash_synchronous();
1215 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1216 v->arch.guest_table = pagetable_from_pfn(mfn);
1217 if (old_base_mfn)
1218 put_page(mfn_to_page(old_base_mfn));
1219 /*
1220 * arch.shadow_table should now hold the next CR3 for shadow
1221 */
1222 v->arch.hvm_vmx.cpu_cr3 = c->cr3;
1225 skip_cr3:
1226 if ( !vmx_paging_enabled(v) )
1227 HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
1228 else
1229 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3);
1231 __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
1232 v->arch.hvm_vmx.cpu_shadow_cr4 = c->cr4;
1233 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);
1235 __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
1236 __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
1238 __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
1239 __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
1241 __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
1242 __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
1243 __vmwrite(GUEST_CS_BASE, c->cs_base);
1244 __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes.bytes);
1246 __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
1247 __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
1248 __vmwrite(GUEST_DS_BASE, c->ds_base);
1249 __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes.bytes);
1251 __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
1252 __vmwrite(GUEST_ES_LIMIT, c->es_limit);
1253 __vmwrite(GUEST_ES_BASE, c->es_base);
1254 __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes.bytes);
1256 __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
1257 __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
1258 __vmwrite(GUEST_SS_BASE, c->ss_base);
1259 __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes.bytes);
1261 __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
1262 __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
1263 __vmwrite(GUEST_FS_BASE, c->fs_base);
1264 __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes.bytes);
1266 __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
1267 __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
1268 __vmwrite(GUEST_GS_BASE, c->gs_base);
1269 __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes.bytes);
1271 __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
1272 __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
1273 __vmwrite(GUEST_TR_BASE, c->tr_base);
1274 __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes.bytes);
1276 __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
1277 __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
1278 __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
1279 __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes.bytes);
1281 shadow_update_paging_modes(v);
1282 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
1285 enum { VMX_ASSIST_INVOKE = 0, VMX_ASSIST_RESTORE };
1287 static int vmx_assist(struct vcpu *v, int mode)
1289 struct vmx_assist_context c;
1290 u32 magic;
1291 u32 cp;
1293 /* make sure vmxassist exists (this is not an error) */
1294 if (hvm_copy_from_guest_phys(&magic, VMXASSIST_MAGIC_OFFSET,
1295 sizeof(magic)))
1296 return 0;
1297 if (magic != VMXASSIST_MAGIC)
1298 return 0;
1300 switch (mode) {
1301 /*
1302 * Transfer control to vmxassist.
1303 * Store the current context in VMXASSIST_OLD_CONTEXT and load
1304 * the new VMXASSIST_NEW_CONTEXT context. This context was created
1305 * by vmxassist and will transfer control to it.
1306 */
1307 case VMX_ASSIST_INVOKE:
1308 /* save the old context */
1309 if (hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)))
1310 goto error;
1311 if (cp != 0) {
1312 vmx_world_save(v, &c);
1313 if (hvm_copy_to_guest_phys(cp, &c, sizeof(c)))
1314 goto error;
1317 /* restore the new context, this should activate vmxassist */
1318 if (hvm_copy_from_guest_phys(&cp, VMXASSIST_NEW_CONTEXT, sizeof(cp)))
1319 goto error;
1320 if (cp != 0) {
1321 if (hvm_copy_from_guest_phys(&c, cp, sizeof(c)))
1322 goto error;
1323 vmx_world_restore(v, &c);
1324 v->arch.hvm_vmx.vmxassist_enabled = 1;
1325 return 1;
1327 break;
1329 /*
1330 * Restore the VMXASSIST_OLD_CONTEXT that was saved by
1331 * VMX_ASSIST_INVOKE above.
1332 */
1333 case VMX_ASSIST_RESTORE:
1334 /* save the old context */
1335 if (hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)))
1336 goto error;
1337 if (cp != 0) {
1338 if (hvm_copy_from_guest_phys(&c, cp, sizeof(c)))
1339 goto error;
1340 vmx_world_restore(v, &c);
1341 v->arch.hvm_vmx.vmxassist_enabled = 0;
1342 return 1;
1344 break;
1347 error:
1348 printk("Failed to transfer to vmxassist\n");
1349 domain_crash_synchronous();
1350 return 0;
1353 static int vmx_set_cr0(unsigned long value)
1355 struct vcpu *v = current;
1356 unsigned long mfn;
1357 unsigned long eip;
1358 int paging_enabled;
1359 unsigned long vm_entry_value;
1360 unsigned long old_cr0;
1361 unsigned long old_base_mfn;
1363 /*
1364 * CR0: We don't want to lose PE and PG.
1365 */
1366 old_cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;
1367 paging_enabled = (old_cr0 & X86_CR0_PE) && (old_cr0 & X86_CR0_PG);
1369 /* TS cleared? Then initialise FPU now. */
1370 if ( !(value & X86_CR0_TS) )
1372 setup_fpu(v);
1373 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
1376 v->arch.hvm_vmx.cpu_cr0 = value | X86_CR0_PE | X86_CR0_PG | X86_CR0_NE;
1377 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
1379 v->arch.hvm_vmx.cpu_shadow_cr0 = value;
1380 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
1382 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx\n", value);
1384 if ( (value & X86_CR0_PE) && (value & X86_CR0_PG) && !paging_enabled )
1386 /*
1387 * Trying to enable guest paging.
1388 * The guest CR3 must be pointing to the guest physical.
1389 */
1390 mfn = get_mfn_from_gpfn(v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT);
1391 if ( !VALID_MFN(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
1393 printk("Invalid CR3 value = %lx (mfn=%lx)\n",
1394 v->arch.hvm_vmx.cpu_cr3, mfn);
1395 domain_crash_synchronous(); /* need to take a clean path */
1398 #if defined(__x86_64__)
1399 if ( vmx_lme_is_set(v) )
1401 if ( !(v->arch.hvm_vmx.cpu_shadow_cr4 & X86_CR4_PAE) )
1403 HVM_DBG_LOG(DBG_LEVEL_1, "Guest enabled paging "
1404 "with EFER.LME set but not CR4.PAE\n");
1405 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
1407 else
1409 HVM_DBG_LOG(DBG_LEVEL_1, "Enabling long mode\n");
1410 v->arch.hvm_vmx.msr_content.msr_items[VMX_INDEX_MSR_EFER]
1411 |= EFER_LMA;
1412 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
1413 vm_entry_value |= VM_ENTRY_IA32E_MODE;
1414 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1417 #endif
1419 /*
1420 * Now arch.guest_table points to machine physical.
1421 */
1422 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1423 v->arch.guest_table = pagetable_from_pfn(mfn);
1424 if (old_base_mfn)
1425 put_page(mfn_to_page(old_base_mfn));
1426 shadow_update_paging_modes(v);
1428 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
1429 (unsigned long) (mfn << PAGE_SHIFT));
1431 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
1432 /*
1433 * arch->shadow_table should hold the next CR3 for shadow
1434 */
1435 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
1436 v->arch.hvm_vmx.cpu_cr3, mfn);
1439 if ( !((value & X86_CR0_PE) && (value & X86_CR0_PG)) && paging_enabled )
1440 if ( v->arch.hvm_vmx.cpu_cr3 ) {
1441 put_page(mfn_to_page(get_mfn_from_gpfn(
1442 v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)));
1443 v->arch.guest_table = pagetable_null();
1446 /*
1447 * VMX does not implement real-mode virtualization. We emulate
1448 * real-mode by performing a world switch to VMXAssist whenever
1449 * a partition disables the CR0.PE bit.
1450 */
1451 if ( (value & X86_CR0_PE) == 0 )
1453 if ( value & X86_CR0_PG ) {
1454 /* inject GP here */
1455 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
1456 return 0;
1457 } else {
1458 /*
1459 * Disable paging here.
1460 * This is the same as the PE == 1 && PG == 0 case.
1461 */
1462 if ( vmx_long_mode_enabled(v) )
1464 v->arch.hvm_vmx.msr_content.msr_items[VMX_INDEX_MSR_EFER]
1465 &= ~EFER_LMA;
1466 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
1467 vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
1468 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1472 if ( vmx_assist(v, VMX_ASSIST_INVOKE) )
1474 eip = __vmread(GUEST_RIP);
1475 HVM_DBG_LOG(DBG_LEVEL_1,
1476 "Transfering control to vmxassist %%eip 0x%lx\n", eip);
1477 return 0; /* do not update eip! */
1480 else if ( v->arch.hvm_vmx.vmxassist_enabled )
1482 eip = __vmread(GUEST_RIP);
1483 HVM_DBG_LOG(DBG_LEVEL_1,
1484 "Enabling CR0.PE at %%eip 0x%lx\n", eip);
1485 if ( vmx_assist(v, VMX_ASSIST_RESTORE) )
1487 eip = __vmread(GUEST_RIP);
1488 HVM_DBG_LOG(DBG_LEVEL_1,
1489 "Restoring to %%eip 0x%lx\n", eip);
1490 return 0; /* do not update eip! */
1493 else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
1495 if ( vmx_long_mode_enabled(v) )
1497 v->arch.hvm_vmx.msr_content.msr_items[VMX_INDEX_MSR_EFER]
1498 &= ~EFER_LMA;
1499 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
1500 vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
1501 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1503 shadow_update_paging_modes(v);
1504 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
1507 return 1;
1510 #define CASE_SET_REG(REG, reg) \
1511 case REG_ ## REG: regs->reg = value; break
1512 #define CASE_GET_REG(REG, reg) \
1513 case REG_ ## REG: value = regs->reg; break
1515 #define CASE_EXTEND_SET_REG \
1516 CASE_EXTEND_REG(S)
1517 #define CASE_EXTEND_GET_REG \
1518 CASE_EXTEND_REG(G)
1520 #ifdef __i386__
1521 #define CASE_EXTEND_REG(T)
1522 #else
1523 #define CASE_EXTEND_REG(T) \
1524 CASE_ ## T ## ET_REG(R8, r8); \
1525 CASE_ ## T ## ET_REG(R9, r9); \
1526 CASE_ ## T ## ET_REG(R10, r10); \
1527 CASE_ ## T ## ET_REG(R11, r11); \
1528 CASE_ ## T ## ET_REG(R12, r12); \
1529 CASE_ ## T ## ET_REG(R13, r13); \
1530 CASE_ ## T ## ET_REG(R14, r14); \
1531 CASE_ ## T ## ET_REG(R15, r15)
1532 #endif
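/*
 * CASE_EXTEND_REG(T) pastes T onto 'ET_REG', so CASE_EXTEND_SET_REG expands
 * to CASE_SET_REG(R8, r8) ... CASE_SET_REG(R15, r15) and CASE_EXTEND_GET_REG
 * to the corresponding CASE_GET_REG() cases (both are empty on __i386__).
 */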
1534 /*
1535 * Write to control registers
1536 */
1537 static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
1539 unsigned long value;
1540 unsigned long old_cr;
1541 struct vcpu *v = current;
1542 struct vlapic *vlapic = vcpu_vlapic(v);
1544 switch ( gp ) {
1545 CASE_GET_REG(EAX, eax);
1546 CASE_GET_REG(ECX, ecx);
1547 CASE_GET_REG(EDX, edx);
1548 CASE_GET_REG(EBX, ebx);
1549 CASE_GET_REG(EBP, ebp);
1550 CASE_GET_REG(ESI, esi);
1551 CASE_GET_REG(EDI, edi);
1552 CASE_EXTEND_GET_REG;
1553 case REG_ESP:
1554 value = __vmread(GUEST_RSP);
1555 break;
1556 default:
1557 printk("invalid gp: %d\n", gp);
1558 __hvm_bug(regs);
1561 TRACE_VMEXIT(1, TYPE_MOV_TO_CR);
1562 TRACE_VMEXIT(2, cr);
1563 TRACE_VMEXIT(3, value);
1565 HVM_DBG_LOG(DBG_LEVEL_1, "CR%d, value = %lx", cr, value);
1567 switch ( cr ) {
1568 case 0:
1569 return vmx_set_cr0(value);
1570 case 3:
1572 unsigned long old_base_mfn, mfn;
1574 /*
1575 * If paging is not enabled yet, simply copy the value to CR3.
1576 */
1577 if (!vmx_paging_enabled(v)) {
1578 v->arch.hvm_vmx.cpu_cr3 = value;
1579 break;
1582 /*
1583 * We make a new one if the shadow does not exist.
1584 */
1585 if (value == v->arch.hvm_vmx.cpu_cr3) {
1586 /*
1587 * This is simple TLB flush, implying the guest has
1588 * removed some translation or changed page attributes.
1589 * We simply invalidate the shadow.
1590 */
1591 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
1592 if (mfn != pagetable_get_pfn(v->arch.guest_table))
1593 __hvm_bug(regs);
1594 shadow_update_cr3(v);
1595 } else {
1596 /*
1597 * If different, make a shadow. Check if the PDBR is valid
1598 * first.
1599 */
1600 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
1601 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
1602 if ( !VALID_MFN(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
1604 printk("Invalid CR3 value=%lx\n", value);
1605 domain_crash_synchronous(); /* need to take a clean path */
1607 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1608 v->arch.guest_table = pagetable_from_pfn(mfn);
1609 if (old_base_mfn)
1610 put_page(mfn_to_page(old_base_mfn));
1611 /*
1612 * arch.shadow_table should now hold the next CR3 for shadow
1613 */
1614 v->arch.hvm_vmx.cpu_cr3 = value;
1615 update_cr3(v);
1616 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx",
1617 value);
1618 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
1620 break;
1622 case 4: /* CR4 */
1624 old_cr = v->arch.hvm_vmx.cpu_shadow_cr4;
1626 if ( (value & X86_CR4_PAE) && !(old_cr & X86_CR4_PAE) )
1628 if ( vmx_pgbit_test(v) )
1630 /* The guest is a 32-bit PAE guest. */
1631 #if CONFIG_PAGING_LEVELS >= 3
1632 unsigned long mfn, old_base_mfn;
1633 mfn = get_mfn_from_gpfn(v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT);
1634 if ( !VALID_MFN(mfn) ||
1635 !get_page(mfn_to_page(mfn), v->domain) )
1637 printk("Invalid CR3 value = %lx", v->arch.hvm_vmx.cpu_cr3);
1638 domain_crash_synchronous(); /* need to take a clean path */
1641 /*
1642 * Now arch.guest_table points to machine physical.
1643 */
1645 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1646 v->arch.guest_table = pagetable_from_pfn(mfn);
1647 if ( old_base_mfn )
1648 put_page(mfn_to_page(old_base_mfn));
1650 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
1651 (unsigned long) (mfn << PAGE_SHIFT));
1653 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
1655 /*
1656 * arch->shadow_table should hold the next CR3 for shadow
1657 */
1659 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
1660 v->arch.hvm_vmx.cpu_cr3, mfn);
1661 #endif
1664 else if ( !(value & X86_CR4_PAE) )
1666 if ( unlikely(vmx_long_mode_enabled(v)) )
1668 HVM_DBG_LOG(DBG_LEVEL_1, "Guest cleared CR4.PAE while "
1669 "EFER.LMA is set\n");
1670 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
1674 __vmwrite(GUEST_CR4, value | VMX_CR4_HOST_MASK);
1675 v->arch.hvm_vmx.cpu_shadow_cr4 = value;
1676 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);
1678 /*
1679 * Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates
1680 * all TLB entries except global entries.
1681 */
1682 if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
1683 shadow_update_paging_modes(v);
1684 break;
1686 case 8:
1688 vlapic_set_reg(vlapic, APIC_TASKPRI, ((value & 0x0F) << 4));
1689 break;
1691 default:
1692 printk("invalid cr: %d\n", gp);
1693 __hvm_bug(regs);
1696 return 1;
1699 /*
1700 * Read from control registers. CR0 and CR4 are read from the shadow.
1701 */
1702 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
1704 unsigned long value = 0;
1705 struct vcpu *v = current;
1706 struct vlapic *vlapic = vcpu_vlapic(v);
1708 switch ( cr )
1710 case 3:
1711 value = (unsigned long)v->arch.hvm_vmx.cpu_cr3;
1712 break;
1713 case 8:
1714 value = (unsigned long)vlapic_get_reg(vlapic, APIC_TASKPRI);
1715 value = (value & 0xF0) >> 4;
1716 break;
1717 default:
1718 __hvm_bug(regs);
1721 switch ( gp ) {
1722 CASE_SET_REG(EAX, eax);
1723 CASE_SET_REG(ECX, ecx);
1724 CASE_SET_REG(EDX, edx);
1725 CASE_SET_REG(EBX, ebx);
1726 CASE_SET_REG(EBP, ebp);
1727 CASE_SET_REG(ESI, esi);
1728 CASE_SET_REG(EDI, edi);
1729 CASE_EXTEND_SET_REG;
1730 case REG_ESP:
1731 __vmwrite(GUEST_RSP, value);
1732 regs->esp = value;
1733 break;
1734 default:
1735 printk("invalid gp: %d\n", gp);
1736 __hvm_bug(regs);
1739 TRACE_VMEXIT(1, TYPE_MOV_FROM_CR);
1740 TRACE_VMEXIT(2, cr);
1741 TRACE_VMEXIT(3, value);
1743 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR%d, value = %lx", cr, value);
1746 static int vmx_cr_access(unsigned long exit_qualification,
1747 struct cpu_user_regs *regs)
1749 unsigned int gp, cr;
1750 unsigned long value;
1751 struct vcpu *v = current;
1753 switch (exit_qualification & CONTROL_REG_ACCESS_TYPE) {
1754 case TYPE_MOV_TO_CR:
1755 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
1756 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
1757 return mov_to_cr(gp, cr, regs);
1758 case TYPE_MOV_FROM_CR:
1759 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
1760 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
1761 mov_from_cr(cr, gp, regs);
1762 break;
1763 case TYPE_CLTS:
1764 TRACE_VMEXIT(1, TYPE_CLTS);
1766 /* We initialise the FPU now, to avoid needing another vmexit. */
1767 setup_fpu(v);
1768 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
1770 v->arch.hvm_vmx.cpu_cr0 &= ~X86_CR0_TS; /* clear TS */
1771 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
1773 v->arch.hvm_vmx.cpu_shadow_cr0 &= ~X86_CR0_TS; /* clear TS */
1774 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
1775 break;
1776 case TYPE_LMSW:
1777 value = v->arch.hvm_vmx.cpu_shadow_cr0;
1778 value = (value & ~0xF) |
1779 (((exit_qualification & LMSW_SOURCE_DATA) >> 16) & 0xF);
1780 TRACE_VMEXIT(1, TYPE_LMSW);
1781 TRACE_VMEXIT(2, value);
1782 return vmx_set_cr0(value);
1783 break;
1784 default:
1785 __hvm_bug(regs);
1786 break;
1788 return 1;
1791 static inline void vmx_do_msr_read(struct cpu_user_regs *regs)
1793 u64 msr_content = 0;
1794 u32 eax, edx;
1795 struct vcpu *v = current;
1797 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%lx, eax=%lx, edx=%lx",
1798 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1799 (unsigned long)regs->edx);
1800 switch (regs->ecx) {
1801 case MSR_IA32_TIME_STAMP_COUNTER:
1802 msr_content = hvm_get_guest_time(v);
1803 break;
1804 case MSR_IA32_SYSENTER_CS:
1805 msr_content = (u32)__vmread(GUEST_SYSENTER_CS);
1806 break;
1807 case MSR_IA32_SYSENTER_ESP:
1808 msr_content = __vmread(GUEST_SYSENTER_ESP);
1809 break;
1810 case MSR_IA32_SYSENTER_EIP:
1811 msr_content = __vmread(GUEST_SYSENTER_EIP);
1812 break;
1813 case MSR_IA32_APICBASE:
1814 msr_content = vcpu_vlapic(v)->apic_base_msr;
1815 break;
1816 default:
1817 if (long_mode_do_msr_read(regs))
1818 return;
1820 if ( rdmsr_hypervisor_regs(regs->ecx, &eax, &edx) )
1822 regs->eax = eax;
1823 regs->edx = edx;
1824 return;
1827 rdmsr_safe(regs->ecx, regs->eax, regs->edx);
1828 return;
1831 regs->eax = msr_content & 0xFFFFFFFF;
1832 regs->edx = msr_content >> 32;
1834 HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%lx, eax=%lx, edx=%lx",
1835 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1836 (unsigned long)regs->edx);
1839 static inline void vmx_do_msr_write(struct cpu_user_regs *regs)
1841 u64 msr_content;
1842 struct vcpu *v = current;
1844 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%lx, eax=%lx, edx=%lx",
1845 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1846 (unsigned long)regs->edx);
1848 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
1850 switch (regs->ecx) {
1851 case MSR_IA32_TIME_STAMP_COUNTER:
1853 struct periodic_time *pt =
1854 &(v->domain->arch.hvm_domain.pl_time.periodic_tm);
1855 if ( pt->enabled && pt->first_injected
1856 && v->vcpu_id == pt->bind_vcpu )
1857 pt->first_injected = 0;
1859 hvm_set_guest_time(v, msr_content);
1860 break;
1861 case MSR_IA32_SYSENTER_CS:
1862 __vmwrite(GUEST_SYSENTER_CS, msr_content);
1863 break;
1864 case MSR_IA32_SYSENTER_ESP:
1865 __vmwrite(GUEST_SYSENTER_ESP, msr_content);
1866 break;
1867 case MSR_IA32_SYSENTER_EIP:
1868 __vmwrite(GUEST_SYSENTER_EIP, msr_content);
1869 break;
1870 case MSR_IA32_APICBASE:
1871 vlapic_msr_set(vcpu_vlapic(v), msr_content);
1872 break;
1873 default:
1874 if ( !long_mode_do_msr_write(regs) )
1875 wrmsr_hypervisor_regs(regs->ecx, regs->eax, regs->edx);
1876 break;
1879 HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%lx, eax=%lx, edx=%lx",
1880 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1881 (unsigned long)regs->edx);
1884 static void vmx_do_hlt(void)
1886 unsigned long rflags;
1887 rflags = __vmread(GUEST_RFLAGS);
1888 hvm_hlt(rflags);
1891 static inline void vmx_do_extint(struct cpu_user_regs *regs)
1893 unsigned int vector;
1895 asmlinkage void do_IRQ(struct cpu_user_regs *);
1896 fastcall void smp_apic_timer_interrupt(struct cpu_user_regs *);
1897 fastcall void smp_event_check_interrupt(void);
1898 fastcall void smp_invalidate_interrupt(void);
1899 fastcall void smp_call_function_interrupt(void);
1900 fastcall void smp_spurious_interrupt(struct cpu_user_regs *regs);
1901 fastcall void smp_error_interrupt(struct cpu_user_regs *regs);
1902 #ifdef CONFIG_X86_MCE_P4THERMAL
1903 fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs);
1904 #endif
1906 vector = __vmread(VM_EXIT_INTR_INFO);
1907 BUG_ON(!(vector & INTR_INFO_VALID_MASK));
1909 vector &= INTR_INFO_VECTOR_MASK;
1910 TRACE_VMEXIT(1, vector);
1912 switch(vector) {
1913 case LOCAL_TIMER_VECTOR:
1914 smp_apic_timer_interrupt(regs);
1915 break;
1916 case EVENT_CHECK_VECTOR:
1917 smp_event_check_interrupt();
1918 break;
1919 case INVALIDATE_TLB_VECTOR:
1920 smp_invalidate_interrupt();
1921 break;
1922 case CALL_FUNCTION_VECTOR:
1923 smp_call_function_interrupt();
1924 break;
1925 case SPURIOUS_APIC_VECTOR:
1926 smp_spurious_interrupt(regs);
1927 break;
1928 case ERROR_APIC_VECTOR:
1929 smp_error_interrupt(regs);
1930 break;
1931 #ifdef CONFIG_X86_MCE_P4THERMAL
1932 case THERMAL_APIC_VECTOR:
1933 smp_thermal_interrupt(regs);
1934 break;
1935 #endif
1936 default:
1937 regs->entry_vector = vector;
1938 do_IRQ(regs);
1939 break;
1943 #if defined (__x86_64__)
1944 void store_cpu_user_regs(struct cpu_user_regs *regs)
1946 regs->ss = __vmread(GUEST_SS_SELECTOR);
1947 regs->rsp = __vmread(GUEST_RSP);
1948 regs->rflags = __vmread(GUEST_RFLAGS);
1949 regs->cs = __vmread(GUEST_CS_SELECTOR);
1950 regs->ds = __vmread(GUEST_DS_SELECTOR);
1951 regs->es = __vmread(GUEST_ES_SELECTOR);
1952 regs->rip = __vmread(GUEST_RIP);
1954 #elif defined (__i386__)
1955 void store_cpu_user_regs(struct cpu_user_regs *regs)
1957 regs->ss = __vmread(GUEST_SS_SELECTOR);
1958 regs->esp = __vmread(GUEST_RSP);
1959 regs->eflags = __vmread(GUEST_RFLAGS);
1960 regs->cs = __vmread(GUEST_CS_SELECTOR);
1961 regs->ds = __vmread(GUEST_DS_SELECTOR);
1962 regs->es = __vmread(GUEST_ES_SELECTOR);
1963 regs->eip = __vmread(GUEST_RIP);
1965 #endif
1967 #ifdef XEN_DEBUGGER
1968 void save_cpu_user_regs(struct cpu_user_regs *regs)
1970 regs->xss = __vmread(GUEST_SS_SELECTOR);
1971 regs->esp = __vmread(GUEST_RSP);
1972 regs->eflags = __vmread(GUEST_RFLAGS);
1973 regs->xcs = __vmread(GUEST_CS_SELECTOR);
1974 regs->eip = __vmread(GUEST_RIP);
1976 regs->xgs = __vmread(GUEST_GS_SELECTOR);
1977 regs->xfs = __vmread(GUEST_FS_SELECTOR);
1978 regs->xes = __vmread(GUEST_ES_SELECTOR);
1979 regs->xds = __vmread(GUEST_DS_SELECTOR);
1982 void restore_cpu_user_regs(struct cpu_user_regs *regs)
1984 __vmwrite(GUEST_SS_SELECTOR, regs->xss);
1985 __vmwrite(GUEST_RSP, regs->esp);
1986 __vmwrite(GUEST_RFLAGS, regs->eflags);
1987 __vmwrite(GUEST_CS_SELECTOR, regs->xcs);
1988 __vmwrite(GUEST_RIP, regs->eip);
1990 __vmwrite(GUEST_GS_SELECTOR, regs->xgs);
1991 __vmwrite(GUEST_FS_SELECTOR, regs->xfs);
1992 __vmwrite(GUEST_ES_SELECTOR, regs->xes);
1993 __vmwrite(GUEST_DS_SELECTOR, regs->xds);
1995 #endif
1997 static void vmx_reflect_exception(struct vcpu *v)
1998 {
1999 int error_code, intr_info, vector;
2001 intr_info = __vmread(VM_EXIT_INTR_INFO);
2002 vector = intr_info & 0xff;
2003 if ( intr_info & INTR_INFO_DELIVER_CODE_MASK )
2004 error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
2005 else
2006 error_code = VMX_DELIVER_NO_ERROR_CODE;
2008 #ifndef NDEBUG
2009 {
2010 unsigned long rip;
2012 rip = __vmread(GUEST_RIP);
2013 HVM_DBG_LOG(DBG_LEVEL_1, "rip = %lx, error_code = %x",
2014 rip, error_code);
2015 }
2016 #endif /* NDEBUG */
2018 /*
2019 * According to the Intel Virtualization Technology Specification for
2020 * the IA-32 Intel Architecture (C97063-002, April 2005), section
2021 * 2.8.3, SW_EXCEPTION should be used for #BP and #OF, and
2022 * HW_EXCEPTION for everything else. The main difference appears
2023 * to be that for SW_EXCEPTION, the EIP/RIP is incremented by
2024 * VM_ENTER_INSTRUCTION_LEN bytes, whereas for HW_EXCEPTION,
2025 * it is not.
2026 */
2027 if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_SW_EXCEPTION )
2028 {
2029 int ilen = __get_instruction_length(); /* Safe: software exception */
2030 vmx_inject_sw_exception(v, vector, ilen);
2031 }
2032 else
2033 {
2034 vmx_inject_hw_exception(v, vector, error_code);
2035 }
2036 }
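The distinction drawn in the comment above is easiest to see with the VM-entry fields involved: for a software exception the processor uses the supplied instruction length so the guest resumes after INT3/INTO, while a hardware exception leaves RIP pointing at the faulting instruction. A standalone sketch, assuming the interruption-type encodings from the Intel SDM (3 = hardware exception, 6 = software exception); the struct and helpers are illustrative, not Xen's API:

#include <stdint.h>
#include <stdio.h>

#define INJ_VALID             (1u << 31)
#define INJ_TYPE_HW_EXCEPTION (3u << 8)
#define INJ_TYPE_SW_EXCEPTION (6u << 8)

struct fake_vmcs {
    uint32_t entry_intr_info;
    uint32_t entry_instruction_len;
};

/* Hardware exception (e.g. #PF): RIP is left pointing at the faulting insn. */
static void inject_hw(struct fake_vmcs *v, uint8_t vector)
{
    v->entry_intr_info = INJ_VALID | INJ_TYPE_HW_EXCEPTION | vector;
}

/* Software exception (#BP/#OF): the instruction length is supplied so the
 * guest continues after the INT3/INTO, as it would on bare metal. */
static void inject_sw(struct fake_vmcs *v, uint8_t vector, uint32_t ilen)
{
    v->entry_intr_info = INJ_VALID | INJ_TYPE_SW_EXCEPTION | vector;
    v->entry_instruction_len = ilen;
}

int main(void)
{
    struct fake_vmcs v = { 0 };
    inject_hw(&v, 14);     /* #PF: guest re-executes the faulting access */
    inject_sw(&v, 3, 1);   /* #BP: guest continues after the 1-byte INT3 */
    printf("intr_info=%#x ilen=%u\n",
           (unsigned)v.entry_intr_info, (unsigned)v.entry_instruction_len);
    return 0;
}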
2038 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
2039 {
2040 unsigned int exit_reason;
2041 unsigned long exit_qualification, inst_len = 0;
2042 struct vcpu *v = current;
2044 exit_reason = __vmread(VM_EXIT_REASON);
2046 perfc_incra(vmexits, exit_reason);
2048 if ( (exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT) &&
2049 (exit_reason != EXIT_REASON_VMCALL) &&
2050 (exit_reason != EXIT_REASON_IO_INSTRUCTION) )
2051 HVM_DBG_LOG(DBG_LEVEL_0, "exit reason = %x", exit_reason);
2053 if ( exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT )
2054 local_irq_enable();
2056 if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
2057 {
2058 unsigned int failed_vmentry_reason = exit_reason & 0xFFFF;
2060 exit_qualification = __vmread(EXIT_QUALIFICATION);
2061 printk("Failed vm entry (exit reason 0x%x) ", exit_reason);
2062 switch ( failed_vmentry_reason ) {
2063 case EXIT_REASON_INVALID_GUEST_STATE:
2064 printk("caused by invalid guest state (%ld).\n", exit_qualification);
2065 break;
2066 case EXIT_REASON_MSR_LOADING:
2067 printk("caused by MSR entry %ld loading.\n", exit_qualification);
2068 break;
2069 case EXIT_REASON_MACHINE_CHECK:
2070 printk("caused by machine check.\n");
2071 break;
2072 default:
2073 printk("reason not known yet!");
2074 break;
2077 printk("************* VMCS Area **************\n");
2078 vmcs_dump_vcpu();
2079 printk("**************************************\n");
2080 domain_crash_synchronous();
2081 }
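The failed-vmentry test above relies on the exit reason splitting into a basic reason in the low 16 bits and a "VM entry failed" flag in bit 31, which is what VMX_EXIT_REASONS_FAILED_VMENTRY is assumed to mask. A standalone sketch of that decode, with illustrative names:

#include <stdint.h>
#include <stdio.h>

#define FAILED_VMENTRY (1u << 31)  /* assumed bit position per the Intel SDM */

static void describe_exit(uint32_t exit_reason)
{
    unsigned basic = exit_reason & 0xFFFF;

    if ( exit_reason & FAILED_VMENTRY )
        printf("VM entry failed, basic reason %u\n", basic);
    else
        printf("normal VM exit, basic reason %u\n", basic);
}

int main(void)
{
    describe_exit(0x80000021u); /* failed entry: invalid guest state (33) */
    describe_exit(0x0000000au); /* normal exit: CPUID (10) */
    return 0;
}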
2083 TRACE_VMEXIT(0, exit_reason);
2085 switch ( exit_reason )
2086 {
2087 case EXIT_REASON_EXCEPTION_NMI:
2088 {
2089 /*
2090 * We do not enable software-interrupt (INT n) exiting, so this exit
2091 * is caused either by (1) an exception (e.g. #PF) raised in the
2092 * guest, or (2) an NMI.
2093 */
2094 unsigned int intr_info, vector;
2096 intr_info = __vmread(VM_EXIT_INTR_INFO);
2097 BUG_ON(!(intr_info & INTR_INFO_VALID_MASK));
2099 vector = intr_info & INTR_INFO_VECTOR_MASK;
2101 TRACE_VMEXIT(1, vector);
2102 perfc_incra(cause_vector, vector);
2104 switch ( vector )
2105 {
2106 #ifdef XEN_DEBUGGER
2107 case TRAP_debug:
2109 save_cpu_user_regs(regs);
2110 pdb_handle_exception(1, regs, 1);
2111 restore_cpu_user_regs(regs);
2112 break;
2114 case TRAP_int3:
2116 save_cpu_user_regs(regs);
2117 pdb_handle_exception(3, regs, 1);
2118 restore_cpu_user_regs(regs);
2119 break;
2121 #else
2122 case TRAP_debug:
2123 {
2124 void store_cpu_user_regs(struct cpu_user_regs *regs);
2126 if ( test_bit(_DOMF_debugging, &v->domain->domain_flags) )
2127 {
2128 store_cpu_user_regs(regs);
2129 domain_pause_for_debugger();
2130 __vm_clear_bit(GUEST_PENDING_DBG_EXCEPTIONS,
2131 PENDING_DEBUG_EXC_BS);
2132 }
2133 else
2134 {
2135 vmx_reflect_exception(v);
2136 __vm_clear_bit(GUEST_PENDING_DBG_EXCEPTIONS,
2137 PENDING_DEBUG_EXC_BS);
2138 }
2140 break;
2141 }
2142 case TRAP_int3:
2144 if ( test_bit(_DOMF_debugging, &v->domain->domain_flags) )
2145 domain_pause_for_debugger();
2146 else
2147 vmx_reflect_exception(v);
2148 break;
2150 #endif
2151 case TRAP_no_device:
2153 vmx_do_no_device_fault();
2154 break;
2156 case TRAP_page_fault:
2157 {
2158 exit_qualification = __vmread(EXIT_QUALIFICATION);
2159 regs->error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
2161 TRACE_VMEXIT(3, regs->error_code);
2162 TRACE_VMEXIT(4, exit_qualification);
2164 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2165 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
2166 (unsigned long)regs->eax, (unsigned long)regs->ebx,
2167 (unsigned long)regs->ecx, (unsigned long)regs->edx,
2168 (unsigned long)regs->esi, (unsigned long)regs->edi);
2170 if ( !vmx_do_page_fault(exit_qualification, regs) )
2171 {
2172 /* Inject #PF using Interruption-Information Fields. */
2173 vmx_inject_hw_exception(v, TRAP_page_fault, regs->error_code);
2174 v->arch.hvm_vmx.cpu_cr2 = exit_qualification;
2175 TRACE_3D(TRC_VMX_INTR, v->domain->domain_id,
2176 TRAP_page_fault, exit_qualification);
2177 }
2178 break;
2179 }
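For the page-fault path above, the exit qualification carries the faulting linear address and VM_EXIT_INTR_ERROR_CODE the #PF error code; when the shadow code cannot resolve the fault it is reflected back with the address stashed in the virtual CR2. A standalone sketch of that flow, with illustrative names and a placeholder "handled" policy:

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_vcpu { uint64_t virtual_cr2; };

/* Placeholder policy standing in for the shadow page-table handler. */
static bool shadow_handles_fault(uint64_t gla) { return (gla & 1) == 0; }

static void handle_pf_exit(struct fake_vcpu *v, uint64_t exit_qualification,
                           uint32_t error_code)
{
    if ( !shadow_handles_fault(exit_qualification) )
    {
        /* Reflect #PF: the guest expects CR2 = faulting address plus the
         * original error code. */
        v->virtual_cr2 = exit_qualification;
        printf("inject #PF, cr2=%#llx ec=%#x\n",
               (unsigned long long)exit_qualification, error_code);
    }
}

int main(void)
{
    struct fake_vcpu v = { 0 };
    handle_pf_exit(&v, 0xdeadb001, 0x2); /* illustrative write fault */
    return 0;
}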
2180 case TRAP_nmi:
2181 if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI )
2182 do_nmi(regs); /* Real NMI, vector 2: normal processing. */
2183 else
2184 vmx_reflect_exception(v);
2185 break;
2186 default:
2187 vmx_reflect_exception(v);
2188 break;
2189 }
2190 break;
2191 }
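The checks on intr_info in this case rely on the layout of VM_EXIT_INTR_INFO, which (per the Intel SDM) packs the vector in bits 7:0, the interruption type in bits 10:8, an "error code valid" flag in bit 11 and a "valid" flag in bit 31. A standalone sketch of the decode, with illustrative names:

#include <stdint.h>
#include <stdio.h>

static void decode_intr_info(uint32_t info)
{
    unsigned vector = info & 0xff;
    unsigned type   = (info >> 8) & 0x7;
    unsigned has_ec = (info >> 11) & 0x1;
    unsigned valid  = (info >> 31) & 0x1;

    printf("valid=%u type=%u vector=%u error_code_valid=%u\n",
           valid, type, vector, has_ec);
}

int main(void)
{
    decode_intr_info(0x80000b0eu); /* valid, HW exception (3), #PF (14), error code */
    decode_intr_info(0x80000202u); /* valid, NMI type (2), vector 2 */
    return 0;
}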
2192 case EXIT_REASON_EXTERNAL_INTERRUPT:
2193 vmx_do_extint(regs);
2194 break;
2195 case EXIT_REASON_TRIPLE_FAULT:
2196 domain_crash_synchronous();
2197 break;
2198 case EXIT_REASON_PENDING_INTERRUPT:
2199 /* Disable the interrupt window. */
2200 v->arch.hvm_vcpu.u.vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2201 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2202 v->arch.hvm_vcpu.u.vmx.exec_control);
2203 break;
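The pending-interrupt case above closes the "interrupt window": the exiting control is set when an interrupt is waiting but cannot yet be injected, and cleared again once the window exit fires. A standalone sketch of that toggle; the bit position is an assumption taken from the Intel SDM and the struct is illustrative, not Xen's:

#include <stdint.h>
#include <stdio.h>

#define INTR_WINDOW_EXITING (1u << 2)  /* primary proc-based control, assumed bit 2 */

struct fake_vmx { uint32_t exec_control; };

static void open_intr_window(struct fake_vmx *v)  { v->exec_control |=  INTR_WINDOW_EXITING; }
static void close_intr_window(struct fake_vmx *v) { v->exec_control &= ~INTR_WINDOW_EXITING; }

int main(void)
{
    struct fake_vmx v = { .exec_control = 0 };
    open_intr_window(&v);   /* pending interrupt, guest not yet interruptible */
    close_intr_window(&v);  /* window exit taken: stop asking for more exits */
    printf("exec_control=%#x\n", (unsigned)v.exec_control);
    return 0;
}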
2204 case EXIT_REASON_TASK_SWITCH:
2205 domain_crash_synchronous();
2206 break;
2207 case EXIT_REASON_CPUID:
2208 inst_len = __get_instruction_length(); /* Safe: CPUID */
2209 __update_guest_eip(inst_len);
2210 vmx_do_cpuid(regs);
2211 break;
2212 case EXIT_REASON_HLT:
2213 inst_len = __get_instruction_length(); /* Safe: HLT */
2214 __update_guest_eip(inst_len);
2215 vmx_do_hlt();
2216 break;
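CPUID, HLT and the other directly emulated instructions above all follow the same emulate-and-skip pattern: read the exiting instruction's length, emulate it, then advance the guest RIP past it so the guest does not re-execute the trapping instruction. A standalone sketch with illustrative names:

#include <stdint.h>
#include <stdio.h>

struct fake_guest { uint64_t rip; };

static uint32_t fake_exit_instruction_len(void) { return 2; /* e.g. CPUID = 0f a2 */ }
static void emulate_cpuid(struct fake_guest *g) { (void)g; /* fill eax..edx */ }

static void skip_emulated_instruction(struct fake_guest *g, uint32_t ilen)
{
    g->rip += ilen;  /* resume after the instruction just emulated */
}

int main(void)
{
    struct fake_guest g = { .rip = 0x401000 };
    uint32_t ilen = fake_exit_instruction_len();
    emulate_cpuid(&g);
    skip_emulated_instruction(&g, ilen);
    printf("rip=%#llx\n", (unsigned long long)g.rip);
    return 0;
}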
2217 case EXIT_REASON_INVLPG:
2219 inst_len = __get_instruction_length(); /* Safe: INVLPG */
2220 __update_guest_eip(inst_len);
2221 exit_qualification = __vmread(EXIT_QUALIFICATION);
2222 vmx_do_invlpg(exit_qualification);
2223 TRACE_VMEXIT(4, exit_qualification);
2224 break;
2226 case EXIT_REASON_VMCALL:
2228 inst_len = __get_instruction_length(); /* Safe: VMCALL */
2229 __update_guest_eip(inst_len);
2230 hvm_do_hypercall(regs);
2231 break;
2233 case EXIT_REASON_CR_ACCESS:
2235 exit_qualification = __vmread(EXIT_QUALIFICATION);
2236 inst_len = __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
2237 if ( vmx_cr_access(exit_qualification, regs) )
2238 __update_guest_eip(inst_len);
2239 TRACE_VMEXIT(4, exit_qualification);
2240 break;
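vmx_cr_access() above works from the exit qualification for control-register accesses, which (per the Intel SDM) holds the CR number in bits 3:0, the access type in bits 5:4 (0 MOV to CR, 1 MOV from CR, 2 CLTS, 3 LMSW) and the general-purpose register in bits 11:8. A standalone sketch of that decode:

#include <stdint.h>
#include <stdio.h>

static void decode_cr_access(uint64_t q)
{
    unsigned cr   = q & 0xf;
    unsigned type = (q >> 4) & 0x3;
    unsigned gpr  = (q >> 8) & 0xf;
    static const char *types[] = { "mov to cr", "mov from cr", "clts", "lmsw" };

    printf("cr%u, %s, gpr %u\n", cr, types[type], gpr);
}

int main(void)
{
    decode_cr_access(0x000); /* mov to cr0 from gpr 0 */
    decode_cr_access(0x014); /* mov from cr4 into gpr 0 */
    return 0;
}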
2242 case EXIT_REASON_DR_ACCESS:
2243 exit_qualification = __vmread(EXIT_QUALIFICATION);
2244 vmx_dr_access(exit_qualification, regs);
2245 break;
2246 case EXIT_REASON_IO_INSTRUCTION:
2247 exit_qualification = __vmread(EXIT_QUALIFICATION);
2248 inst_len = __get_instruction_length(); /* Safe: IN, INS, OUT, OUTS */
2249 vmx_io_instruction(exit_qualification, inst_len);
2250 TRACE_VMEXIT(4, exit_qualification);
2251 break;
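vmx_io_instruction() above consumes the I/O exit qualification, which (per the Intel SDM) encodes the access size minus one in bits 2:0, the direction in bit 3 (1 = IN), string and REP flags in bits 4 and 5, and the port number in bits 31:16. A standalone sketch of the decode, with illustrative names:

#include <stdint.h>
#include <stdio.h>

static void decode_io(uint64_t q)
{
    unsigned size   = (q & 0x7) + 1;        /* bytes: 1, 2 or 4 */
    int      in     = (q >> 3) & 1;
    int      string = (q >> 4) & 1;
    int      rep    = (q >> 5) & 1;
    unsigned port   = (q >> 16) & 0xffff;

    printf("%s%s port %#x, %u byte(s)%s\n",
           rep ? "rep " : "", in ? "in" : "out", port, size,
           string ? " (string)" : "");
}

int main(void)
{
    decode_io(0x00710008u); /* IN from port 0x71, 1 byte */
    decode_io(0x03f80001u); /* OUT to port 0x3f8, 2 bytes */
    return 0;
}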
2252 case EXIT_REASON_MSR_READ:
2253 inst_len = __get_instruction_length(); /* Safe: RDMSR */
2254 __update_guest_eip(inst_len);
2255 vmx_do_msr_read(regs);
2256 TRACE_VMEXIT(1, regs->ecx);
2257 TRACE_VMEXIT(2, regs->eax);
2258 TRACE_VMEXIT(3, regs->edx);
2259 break;
2260 case EXIT_REASON_MSR_WRITE:
2261 inst_len = __get_instruction_length(); /* Safe: WRMSR */
2262 __update_guest_eip(inst_len);
2263 vmx_do_msr_write(regs);
2264 TRACE_VMEXIT(1, regs->ecx);
2265 TRACE_VMEXIT(2, regs->eax);
2266 TRACE_VMEXIT(3, regs->edx);
2267 break;
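The RDMSR/WRMSR handlers above follow the usual register convention: ECX selects the MSR and the 64-bit value travels split across EDX (high half) and EAX (low half), which is also why all three registers are traced. A standalone sketch of assembling and splitting the value; the struct and names are illustrative:

#include <stdint.h>
#include <stdio.h>

struct fake_regs { uint64_t eax, ecx, edx; };

/* WRMSR: assemble the value the guest is writing from edx:eax. */
static uint64_t msr_write_value(const struct fake_regs *r)
{
    return ((uint64_t)(uint32_t)r->edx << 32) | (uint32_t)r->eax;
}

/* RDMSR: return a value to the guest by splitting it into edx:eax. */
static void msr_read_return(struct fake_regs *r, uint64_t value)
{
    r->eax = (uint32_t)value;
    r->edx = (uint32_t)(value >> 32);
}

int main(void)
{
    struct fake_regs r = { .ecx = 0xc0000080 /* e.g. the EFER MSR index */ };
    msr_read_return(&r, 0x0000000000000d01ull); /* illustrative value */
    printf("rdmsr %#llx -> edx:eax = %#llx:%#llx\n",
           (unsigned long long)r.ecx, (unsigned long long)r.edx,
           (unsigned long long)r.eax);
    printf("wrmsr value = %#llx\n", (unsigned long long)msr_write_value(&r));
    return 0;
}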
2268 case EXIT_REASON_MWAIT_INSTRUCTION:
2269 case EXIT_REASON_MONITOR_INSTRUCTION:
2270 case EXIT_REASON_PAUSE_INSTRUCTION:
2271 domain_crash_synchronous();
2272 break;
2273 case EXIT_REASON_VMCLEAR:
2274 case EXIT_REASON_VMLAUNCH:
2275 case EXIT_REASON_VMPTRLD:
2276 case EXIT_REASON_VMPTRST:
2277 case EXIT_REASON_VMREAD:
2278 case EXIT_REASON_VMRESUME:
2279 case EXIT_REASON_VMWRITE:
2280 case EXIT_REASON_VMXOFF:
2281 case EXIT_REASON_VMXON:
2282 /* Inject #UD when an HVM guest attempts to execute any of the
2283 VMX instructions. */
2284 vmx_inject_hw_exception(v, TRAP_invalid_op, VMX_DELIVER_NO_ERROR_CODE);
2285 break;
2287 case EXIT_REASON_TPR_BELOW_THRESHOLD:
2288 vcpu_vlapic(v)->flush_tpr_threshold = 1;
2289 break;
2291 default:
2292 domain_crash_synchronous(); /* should not happen */
2293 }
2294 }
2296 asmlinkage void vmx_trace_vmentry(void)
2297 {
2298 struct vcpu *v = current;
2299 TRACE_5D(TRC_VMX_VMENTRY + current->vcpu_id,
2300 v->arch.hvm_vcpu.hvm_trace_values[0],
2301 v->arch.hvm_vcpu.hvm_trace_values[1],
2302 v->arch.hvm_vcpu.hvm_trace_values[2],
2303 v->arch.hvm_vcpu.hvm_trace_values[3],
2304 v->arch.hvm_vcpu.hvm_trace_values[4]);
2306 TRACE_VMEXIT(0, 0);
2307 TRACE_VMEXIT(1, 0);
2308 TRACE_VMEXIT(2, 0);
2309 TRACE_VMEXIT(3, 0);
2310 TRACE_VMEXIT(4, 0);
2311 }
2313 asmlinkage void vmx_trace_vmexit (void)
2314 {
2315 TRACE_3D(TRC_VMX_VMEXIT + current->vcpu_id, 0, 0, 0);
2316 }
2318 /*
2319 * Local variables:
2320 * mode: C
2321 * c-set-style: "BSD"
2322 * c-basic-offset: 4
2323 * tab-width: 4
2324 * indent-tabs-mode: nil
2325 * End:
2326 */