xen-unstable: xen/arch/x86/hvm/vmx/vmcs.c @ 16643:35ab2bb25e09

vmx: Do not set bit 1 of FEATURE_CONTROL MSR if SMX is not supported
by the CPU. Also generally beef up robustness of VMXON instruction.
Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author: Keir Fraser <keir.fraser@citrix.com>
date:   Wed Dec 19 15:51:01 2007 +0000

/*
 * vmcs.c: VMCS management
 * Copyright (c) 2004, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 * Place - Suite 330, Boston, MA 02111-1307 USA.
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/mm.h>
#include <xen/lib.h>
#include <xen/errno.h>
#include <xen/domain_page.h>
#include <asm/current.h>
#include <asm/cpufeature.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/io.h>
#include <asm/hvm/support.h>
#include <asm/hvm/vmx/vmx.h>
#include <asm/hvm/vmx/vmcs.h>
#include <asm/flushtlb.h>
#include <xen/event.h>
#include <xen/kernel.h>
#include <xen/keyhandler.h>
#include <asm/shadow.h>
#include <asm/tboot.h>

/* Dynamic (run-time adjusted) execution control flags. */
u32 vmx_pin_based_exec_control __read_mostly;
u32 vmx_cpu_based_exec_control __read_mostly;
u32 vmx_secondary_exec_control __read_mostly;
u32 vmx_vmexit_control __read_mostly;
u32 vmx_vmentry_control __read_mostly;
bool_t cpu_has_vmx_ins_outs_instr_info __read_mostly;

static DEFINE_PER_CPU(struct vmcs_struct *, host_vmcs);
static DEFINE_PER_CPU(struct vmcs_struct *, current_vmcs);
static DEFINE_PER_CPU(struct list_head, active_vmcs_list);

static u32 vmcs_revision_id __read_mostly;
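
/*
 * Combine the required (min) and optional (opt) control bits with what the
 * given VMX capability MSR allows: a zero bit in the MSR's high word forces
 * that control to 0, a one bit in the low word forces it to 1.  We BUG() if
 * any required control cannot be enabled.
 */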
static u32 adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr)
{
    u32 vmx_msr_low, vmx_msr_high, ctl = ctl_min | ctl_opt;

    rdmsr(msr, vmx_msr_low, vmx_msr_high);

    ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
    ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */

    /* Ensure minimum (required) set of control bits are supported. */
    BUG_ON(ctl_min & ~ctl);

    return ctl;
}
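
/*
 * Probe the VMX capability MSRs and choose the pin-based, CPU-based,
 * secondary, VM-exit and VM-entry controls we will use.  The boot CPU
 * initialises the global variables above; every other CPU re-runs the
 * probe and BUG()s if its answers differ, since asymmetric VMX features
 * across CPUs are not supported.
 */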
static void vmx_init_vmcs_config(void)
{
    u32 vmx_msr_low, vmx_msr_high, min, opt;
    u32 _vmx_pin_based_exec_control;
    u32 _vmx_cpu_based_exec_control;
    u32 _vmx_secondary_exec_control = 0;
    u32 _vmx_vmexit_control;
    u32 _vmx_vmentry_control;

    min = (PIN_BASED_EXT_INTR_MASK |
           PIN_BASED_NMI_EXITING);
    opt = PIN_BASED_VIRTUAL_NMIS;
    _vmx_pin_based_exec_control = adjust_vmx_controls(
        min, opt, MSR_IA32_VMX_PINBASED_CTLS);

    min = (CPU_BASED_HLT_EXITING |
           CPU_BASED_INVLPG_EXITING |
           CPU_BASED_MONITOR_EXITING |
           CPU_BASED_MWAIT_EXITING |
           CPU_BASED_MOV_DR_EXITING |
           CPU_BASED_ACTIVATE_IO_BITMAP |
           CPU_BASED_USE_TSC_OFFSETING);
    opt = CPU_BASED_ACTIVATE_MSR_BITMAP;
    opt |= CPU_BASED_TPR_SHADOW;
    opt |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
    _vmx_cpu_based_exec_control = adjust_vmx_controls(
        min, opt, MSR_IA32_VMX_PROCBASED_CTLS);
#ifdef __x86_64__
    if ( !(_vmx_cpu_based_exec_control & CPU_BASED_TPR_SHADOW) )
    {
        min |= CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING;
        _vmx_cpu_based_exec_control = adjust_vmx_controls(
            min, opt, MSR_IA32_VMX_PROCBASED_CTLS);
    }
#endif

    if ( _vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS )
    {
        min = 0;
        opt = (SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
               SECONDARY_EXEC_WBINVD_EXITING);
        _vmx_secondary_exec_control = adjust_vmx_controls(
            min, opt, MSR_IA32_VMX_PROCBASED_CTLS2);
    }

#if defined(__i386__)
    /* If we can't virtualise APIC accesses, the TPR shadow is pointless. */
    if ( !(_vmx_secondary_exec_control &
           SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) )
        _vmx_cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
#endif

    min = VM_EXIT_ACK_INTR_ON_EXIT;
    opt = 0;
#ifdef __x86_64__
    min |= VM_EXIT_IA32E_MODE;
#endif
    _vmx_vmexit_control = adjust_vmx_controls(
        min, opt, MSR_IA32_VMX_EXIT_CTLS);

    min = opt = 0;
    _vmx_vmentry_control = adjust_vmx_controls(
        min, opt, MSR_IA32_VMX_ENTRY_CTLS);

    rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);

    if ( !vmx_pin_based_exec_control )
    {
        /* First time through. */
        vmcs_revision_id = vmx_msr_low;
        vmx_pin_based_exec_control = _vmx_pin_based_exec_control;
        vmx_cpu_based_exec_control = _vmx_cpu_based_exec_control;
        vmx_secondary_exec_control = _vmx_secondary_exec_control;
        vmx_vmexit_control = _vmx_vmexit_control;
        vmx_vmentry_control = _vmx_vmentry_control;
        cpu_has_vmx_ins_outs_instr_info = !!(vmx_msr_high & (1U<<22));
    }
    else
    {
        /* Globals are already initialised: re-check them. */
        BUG_ON(vmcs_revision_id != vmx_msr_low);
        BUG_ON(vmx_pin_based_exec_control != _vmx_pin_based_exec_control);
        BUG_ON(vmx_cpu_based_exec_control != _vmx_cpu_based_exec_control);
        BUG_ON(vmx_secondary_exec_control != _vmx_secondary_exec_control);
        BUG_ON(vmx_vmexit_control != _vmx_vmexit_control);
        BUG_ON(vmx_vmentry_control != _vmx_vmentry_control);
        BUG_ON(cpu_has_vmx_ins_outs_instr_info != !!(vmx_msr_high & (1U<<22)));
    }

    /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
    BUG_ON((vmx_msr_high & 0x1fff) > PAGE_SIZE);

#ifdef __x86_64__
    /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
    BUG_ON(vmx_msr_high & (1u<<16));
#endif

    /* Require Write-Back (WB) memory type for VMCS accesses. */
    BUG_ON(((vmx_msr_high >> 18) & 15) != 6);
}

static struct vmcs_struct *vmx_alloc_vmcs(void)
{
    struct vmcs_struct *vmcs;

    if ( (vmcs = alloc_xenheap_page()) == NULL )
    {
        gdprintk(XENLOG_WARNING, "Failed to allocate VMCS.\n");
        return NULL;
    }

    clear_page(vmcs);
    vmcs->vmcs_revision_id = vmcs_revision_id;

    return vmcs;
}

static void vmx_free_vmcs(struct vmcs_struct *vmcs)
{
    free_xenheap_page(vmcs);
}
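
/*
 * Flush @v's VMCS from the physical CPU it is currently active on.  Runs
 * with interrupts disabled, either called directly (vmx_cpu_down()) or as
 * an IPI handler invoked from vmx_clear_vmcs() on another CPU.
 */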
static void __vmx_clear_vmcs(void *info)
{
    struct vcpu *v = info;
    struct arch_vmx_struct *arch_vmx = &v->arch.hvm_vmx;

    /* Otherwise we can nest (vmx_cpu_down() vs. vmx_clear_vmcs()). */
    ASSERT(!local_irq_is_enabled());

    if ( arch_vmx->active_cpu == smp_processor_id() )
    {
        __vmpclear(virt_to_maddr(arch_vmx->vmcs));

        arch_vmx->active_cpu = -1;
        arch_vmx->launched = 0;

        list_del(&arch_vmx->active_list);

        if ( arch_vmx->vmcs == this_cpu(current_vmcs) )
            this_cpu(current_vmcs) = NULL;
    }
}

static void vmx_clear_vmcs(struct vcpu *v)
{
    int cpu = v->arch.hvm_vmx.active_cpu;

    if ( cpu != -1 )
        on_selected_cpus(cpumask_of_cpu(cpu), __vmx_clear_vmcs, v, 1, 1);
}
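
/*
 * Make @v's VMCS the current VMCS on this physical CPU, adding it to the
 * local active list if it is not already active here.
 */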
static void vmx_load_vmcs(struct vcpu *v)
{
    unsigned long flags;

    local_irq_save(flags);

    if ( v->arch.hvm_vmx.active_cpu == -1 )
    {
        list_add(&v->arch.hvm_vmx.active_list, &this_cpu(active_vmcs_list));
        v->arch.hvm_vmx.active_cpu = smp_processor_id();
    }

    ASSERT(v->arch.hvm_vmx.active_cpu == smp_processor_id());

    __vmptrld(virt_to_maddr(v->arch.hvm_vmx.vmcs));
    this_cpu(current_vmcs) = v->arch.hvm_vmx.vmcs;

    local_irq_restore(flags);
}
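
/*
 * Bring this physical CPU into VMX operation: verify that CR0 satisfies the
 * fixed-bit requirements, enable VMX in IA32_FEATURE_CONTROL if the BIOS
 * left it unlocked (setting the inside-SMX enable bit only when the CPU
 * supports SMX), allocate the per-CPU host VMXON region, and execute VMXON.
 * Returns 1 on success, 0 on failure.
 */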
int vmx_cpu_up(void)
{
    u32 eax, edx;
    int cpu = smp_processor_id();
    u64 cr0, vmx_cr0_fixed0, vmx_cr0_fixed1;

    BUG_ON(!(read_cr4() & X86_CR4_VMXE));

    /*
     * Ensure the current processor operating mode meets
     * the required CR0 fixed bits in VMX operation.
     */
    cr0 = read_cr0();
    rdmsrl(MSR_IA32_VMX_CR0_FIXED0, vmx_cr0_fixed0);
    rdmsrl(MSR_IA32_VMX_CR0_FIXED1, vmx_cr0_fixed1);
    if ( (~cr0 & vmx_cr0_fixed0) || (cr0 & ~vmx_cr0_fixed1) )
    {
        printk("CPU%d: some settings of host CR0 are "
               "not allowed in VMX operation.\n", cpu);
        return 0;
    }

    rdmsr(IA32_FEATURE_CONTROL_MSR, eax, edx);

    if ( eax & IA32_FEATURE_CONTROL_MSR_LOCK )
    {
        if ( !(eax & (IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON_OUTSIDE_SMX |
                      IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON_INSIDE_SMX)) )
        {
            printk("CPU%d: VMX disabled by BIOS.\n", cpu);
            return 0;
        }
    }
    else
    {
        eax = IA32_FEATURE_CONTROL_MSR_LOCK;
        eax |= IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON_OUTSIDE_SMX;
        if ( test_bit(X86_FEATURE_SMXE, &boot_cpu_data.x86_capability) )
            eax |= IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON_INSIDE_SMX;
        wrmsr(IA32_FEATURE_CONTROL_MSR, eax, 0);
    }

    vmx_init_vmcs_config();

    INIT_LIST_HEAD(&this_cpu(active_vmcs_list));

    if ( this_cpu(host_vmcs) == NULL )
    {
        this_cpu(host_vmcs) = vmx_alloc_vmcs();
        if ( this_cpu(host_vmcs) == NULL )
        {
            printk("CPU%d: Could not allocate host VMCS\n", cpu);
            return 0;
        }
    }

    if ( __vmxon(virt_to_maddr(this_cpu(host_vmcs))) )
    {
        printk("CPU%d: VMXON failed\n", cpu);
        return 0;
    }

    return 1;
}
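
/*
 * Take this physical CPU out of VMX operation: flush every VMCS that is
 * still active here, then execute VMXOFF.
 */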
void vmx_cpu_down(void)
{
    struct list_head *active_vmcs_list = &this_cpu(active_vmcs_list);
    unsigned long flags;

    local_irq_save(flags);

    while ( !list_empty(active_vmcs_list) )
        __vmx_clear_vmcs(list_entry(active_vmcs_list->next,
                                    struct vcpu, arch.hvm_vmx.active_list));

    BUG_ON(!(read_cr4() & X86_CR4_VMXE));
    __vmxoff();

    local_irq_restore(flags);
}

struct foreign_vmcs {
    struct vcpu *v;
    unsigned int count;
};
static DEFINE_PER_CPU(struct foreign_vmcs, foreign_vmcs);
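
/*
 * Gain safe access to another VCPU's VMCS: pause the VCPU, take its
 * vmcs_lock and load its VMCS on this CPU.  Calls nest; the per-CPU
 * foreign_vmcs.count tracks the nesting depth until vmx_vmcs_exit()
 * releases the VMCS again.
 */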
void vmx_vmcs_enter(struct vcpu *v)
{
    struct foreign_vmcs *fv;

    /*
     * NB. We must *always* run an HVM VCPU on its own VMCS, except for
     * vmx_vmcs_enter/exit critical regions.
     */
    if ( likely(v == current) )
        return;

    fv = &this_cpu(foreign_vmcs);

    if ( fv->v == v )
    {
        BUG_ON(fv->count == 0);
    }
    else
    {
        BUG_ON(fv->v != NULL);
        BUG_ON(fv->count != 0);

        vcpu_pause(v);
        spin_lock(&v->arch.hvm_vmx.vmcs_lock);

        vmx_clear_vmcs(v);
        vmx_load_vmcs(v);

        fv->v = v;
    }

    fv->count++;
}

void vmx_vmcs_exit(struct vcpu *v)
{
    struct foreign_vmcs *fv;

    if ( likely(v == current) )
        return;

    fv = &this_cpu(foreign_vmcs);
    BUG_ON(fv->v != v);
    BUG_ON(fv->count == 0);

    if ( --fv->count == 0 )
    {
        /* Don't confuse vmx_do_resume (for @v or @current!) */
        vmx_clear_vmcs(v);
        if ( is_hvm_vcpu(current) )
            vmx_load_vmcs(current);

        spin_unlock(&v->arch.hvm_vmx.vmcs_lock);
        vcpu_unpause(v);

        fv->v = NULL;
    }
}

struct xgt_desc {
    unsigned short size;
    unsigned long address __attribute__((packed));
};
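
/*
 * Write the per-physical-CPU parts of the host state area (IDTR, TR,
 * SYSENTER_ESP and host RSP).  Must be refreshed whenever a VCPU's VMCS
 * is (re)loaded on a different CPU.
 */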
static void vmx_set_host_env(struct vcpu *v)
{
    unsigned int cpu = smp_processor_id();

    __vmwrite(HOST_IDTR_BASE, (unsigned long)idt_tables[cpu]);

    __vmwrite(HOST_TR_SELECTOR, __TSS(cpu) << 3);
    __vmwrite(HOST_TR_BASE, (unsigned long)&init_tss[cpu]);

    __vmwrite(HOST_SYSENTER_ESP, get_stack_bottom());

    /*
     * Skip end of cpu_user_regs when entering the hypervisor because the
     * CPU does not save context onto the stack. SS,RSP,CS,RIP,RFLAGS,etc
     * all get saved into the VMCS instead.
     */
    __vmwrite(HOST_RSP,
              (unsigned long)&get_cpu_info()->guest_cpu_user_regs.error_code);
}

void vmx_disable_intercept_for_msr(struct vcpu *v, u32 msr)
{
    char *msr_bitmap = v->arch.hvm_vmx.msr_bitmap;

    /* VMX MSR bitmap supported? */
    if ( msr_bitmap == NULL )
        return;

    /*
     * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
     * have the write-low and read-high bitmap offsets the wrong way round.
     * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
     */
    if ( msr <= 0x1fff )
    {
        __clear_bit(msr, msr_bitmap + 0x000); /* read-low */
        __clear_bit(msr, msr_bitmap + 0x800); /* write-low */
    }
    else if ( (msr >= 0xc0000000) && (msr <= 0xc0001fff) )
    {
        msr &= 0x1fff;
        __clear_bit(msr, msr_bitmap + 0x400); /* read-high */
        __clear_bit(msr, msr_bitmap + 0xc00); /* write-high */
    }
}
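
/*
 * Populate a freshly allocated VMCS: static VMX control settings, the MSR
 * and I/O bitmaps, the invariant parts of the host state, and an initial
 * flat protected-mode guest state.  Per-CPU host state is filled in later
 * by vmx_set_host_env().
 */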
static int construct_vmcs(struct vcpu *v)
{
    uint16_t sysenter_cs;
    unsigned long sysenter_eip;

    vmx_vmcs_enter(v);

    /* VMCS controls. */
    __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control);
    __vmwrite(VM_EXIT_CONTROLS, vmx_vmexit_control);
    __vmwrite(VM_ENTRY_CONTROLS, vmx_vmentry_control);
    __vmwrite(CPU_BASED_VM_EXEC_CONTROL, vmx_cpu_based_exec_control);
    v->arch.hvm_vmx.exec_control = vmx_cpu_based_exec_control;
    if ( vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS )
        __vmwrite(SECONDARY_VM_EXEC_CONTROL, vmx_secondary_exec_control);

    /* MSR access bitmap. */
    if ( cpu_has_vmx_msr_bitmap )
    {
        char *msr_bitmap = alloc_xenheap_page();

        if ( msr_bitmap == NULL )
            return -ENOMEM;

        memset(msr_bitmap, ~0, PAGE_SIZE);
        v->arch.hvm_vmx.msr_bitmap = msr_bitmap;
        __vmwrite(MSR_BITMAP, virt_to_maddr(msr_bitmap));

        vmx_disable_intercept_for_msr(v, MSR_FS_BASE);
        vmx_disable_intercept_for_msr(v, MSR_GS_BASE);
        vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_CS);
        vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_ESP);
        vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_EIP);
    }

    /* I/O access bitmap. */
    __vmwrite(IO_BITMAP_A, virt_to_maddr(hvm_io_bitmap));
    __vmwrite(IO_BITMAP_B, virt_to_maddr(hvm_io_bitmap + PAGE_SIZE));

    /* Host GDTR base. */
    __vmwrite(HOST_GDTR_BASE, GDT_VIRT_START(v));

    /* Host data selectors. */
    __vmwrite(HOST_SS_SELECTOR, __HYPERVISOR_DS);
    __vmwrite(HOST_DS_SELECTOR, __HYPERVISOR_DS);
    __vmwrite(HOST_ES_SELECTOR, __HYPERVISOR_DS);
    __vmwrite(HOST_FS_SELECTOR, 0);
    __vmwrite(HOST_GS_SELECTOR, 0);
    __vmwrite(HOST_FS_BASE, 0);
    __vmwrite(HOST_GS_BASE, 0);

    /* Host control registers. */
    __vmwrite(HOST_CR0, read_cr0() | X86_CR0_TS);
    __vmwrite(HOST_CR4, mmu_cr4_features);

    /* Host CS:RIP. */
    __vmwrite(HOST_CS_SELECTOR, __HYPERVISOR_CS);
    __vmwrite(HOST_RIP, (unsigned long)vmx_asm_vmexit_handler);

    /* Host SYSENTER CS:RIP. */
    rdmsrl(MSR_IA32_SYSENTER_CS, sysenter_cs);
    __vmwrite(HOST_SYSENTER_CS, sysenter_cs);
    rdmsrl(MSR_IA32_SYSENTER_EIP, sysenter_eip);
    __vmwrite(HOST_SYSENTER_EIP, sysenter_eip);

    /* MSR intercepts. */
    __vmwrite(VM_EXIT_MSR_LOAD_COUNT, 0);
    __vmwrite(VM_EXIT_MSR_STORE_COUNT, 0);
    __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, 0);

    __vmwrite(VM_ENTRY_INTR_INFO, 0);

    __vmwrite(CR0_GUEST_HOST_MASK, ~0UL);
    __vmwrite(CR4_GUEST_HOST_MASK, ~0UL);

    __vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0);
    __vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, 0);

    __vmwrite(CR3_TARGET_COUNT, 0);

    __vmwrite(GUEST_ACTIVITY_STATE, 0);

    /* Guest segment bases. */
    __vmwrite(GUEST_ES_BASE, 0);
    __vmwrite(GUEST_SS_BASE, 0);
    __vmwrite(GUEST_DS_BASE, 0);
    __vmwrite(GUEST_FS_BASE, 0);
    __vmwrite(GUEST_GS_BASE, 0);
    __vmwrite(GUEST_CS_BASE, 0);

    /* Guest segment limits. */
    __vmwrite(GUEST_ES_LIMIT, ~0u);
    __vmwrite(GUEST_SS_LIMIT, ~0u);
    __vmwrite(GUEST_DS_LIMIT, ~0u);
    __vmwrite(GUEST_FS_LIMIT, ~0u);
    __vmwrite(GUEST_GS_LIMIT, ~0u);
    __vmwrite(GUEST_CS_LIMIT, ~0u);

    /* Guest segment AR bytes. */
    __vmwrite(GUEST_ES_AR_BYTES, 0xc093); /* read/write, accessed */
    __vmwrite(GUEST_SS_AR_BYTES, 0xc093);
    __vmwrite(GUEST_DS_AR_BYTES, 0xc093);
    __vmwrite(GUEST_FS_AR_BYTES, 0xc093);
    __vmwrite(GUEST_GS_AR_BYTES, 0xc093);
    __vmwrite(GUEST_CS_AR_BYTES, 0xc09b); /* exec/read, accessed */

    /* Guest IDT. */
    __vmwrite(GUEST_IDTR_BASE, 0);
    __vmwrite(GUEST_IDTR_LIMIT, 0);

    /* Guest GDT. */
    __vmwrite(GUEST_GDTR_BASE, 0);
    __vmwrite(GUEST_GDTR_LIMIT, 0);

    /* Guest LDT. */
    __vmwrite(GUEST_LDTR_AR_BYTES, 0x0082); /* LDT */
    __vmwrite(GUEST_LDTR_SELECTOR, 0);
    __vmwrite(GUEST_LDTR_BASE, 0);
    __vmwrite(GUEST_LDTR_LIMIT, 0);

    /* Guest TSS. */
    __vmwrite(GUEST_TR_AR_BYTES, 0x008b); /* 32-bit TSS (busy) */
    __vmwrite(GUEST_TR_BASE, 0);
    __vmwrite(GUEST_TR_LIMIT, 0xff);

    __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
    __vmwrite(GUEST_DR7, 0);
    __vmwrite(VMCS_LINK_POINTER, ~0UL);
#if defined(__i386__)
    __vmwrite(VMCS_LINK_POINTER_HIGH, ~0UL);
#endif

    __vmwrite(EXCEPTION_BITMAP, HVM_TRAP_MASK | (1U << TRAP_page_fault));

    v->arch.hvm_vcpu.guest_cr[0] = X86_CR0_PE | X86_CR0_ET;
    hvm_update_guest_cr(v, 0);

    v->arch.hvm_vcpu.guest_cr[4] = 0;
    hvm_update_guest_cr(v, 4);

    if ( cpu_has_vmx_tpr_shadow )
    {
        __vmwrite(VIRTUAL_APIC_PAGE_ADDR,
                  page_to_maddr(vcpu_vlapic(v)->regs_page));
        __vmwrite(TPR_THRESHOLD, 0);
    }

    vmx_vmcs_exit(v);

    paging_update_paging_modes(v); /* will update HOST & GUEST_CR3 as reqd */

    vmx_vlapic_msr_changed(v);

    return 0;
}
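
/*
 * Guest MSR save/load area helpers.  vmx_add_guest_msr() grows the list of
 * MSRs that are saved on VM exit and re-loaded on VM entry for this VCPU;
 * vmx_read/write_guest_msr() access an entry in that list.
 * vmx_add_host_load_msr() adds an MSR to the list restored for the host on
 * VM exit, capturing its current host value.
 */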
int vmx_read_guest_msr(struct vcpu *v, u32 msr, u64 *val)
{
    unsigned int i, msr_count = v->arch.hvm_vmx.msr_count;
    const struct vmx_msr_entry *msr_area = v->arch.hvm_vmx.msr_area;

    for ( i = 0; i < msr_count; i++ )
    {
        if ( msr_area[i].index == msr )
        {
            *val = msr_area[i].data;
            return 0;
        }
    }

    return -ESRCH;
}

int vmx_write_guest_msr(struct vcpu *v, u32 msr, u64 val)
{
    unsigned int i, msr_count = v->arch.hvm_vmx.msr_count;
    struct vmx_msr_entry *msr_area = v->arch.hvm_vmx.msr_area;

    for ( i = 0; i < msr_count; i++ )
    {
        if ( msr_area[i].index == msr )
        {
            msr_area[i].data = val;
            return 0;
        }
    }

    return -ESRCH;
}

int vmx_add_guest_msr(struct vcpu *v, u32 msr)
{
    unsigned int i, msr_count = v->arch.hvm_vmx.msr_count;
    struct vmx_msr_entry *msr_area = v->arch.hvm_vmx.msr_area;

    for ( i = 0; i < msr_count; i++ )
        if ( msr_area[i].index == msr )
            return 0;

    if ( msr_count == (PAGE_SIZE / sizeof(struct vmx_msr_entry)) )
        return -ENOSPC;

    if ( msr_area == NULL )
    {
        if ( (msr_area = alloc_xenheap_page()) == NULL )
            return -ENOMEM;
        v->arch.hvm_vmx.msr_area = msr_area;
        __vmwrite(VM_EXIT_MSR_STORE_ADDR, virt_to_maddr(msr_area));
        __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, virt_to_maddr(msr_area));
    }

    msr_area[msr_count].index = msr;
    msr_area[msr_count].mbz = 0;
    msr_area[msr_count].data = 0;
    v->arch.hvm_vmx.msr_count = ++msr_count;
    __vmwrite(VM_EXIT_MSR_STORE_COUNT, msr_count);
    __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, msr_count);

    return 0;
}

int vmx_add_host_load_msr(struct vcpu *v, u32 msr)
{
    unsigned int i, msr_count = v->arch.hvm_vmx.host_msr_count;
    struct vmx_msr_entry *msr_area = v->arch.hvm_vmx.host_msr_area;

    for ( i = 0; i < msr_count; i++ )
        if ( msr_area[i].index == msr )
            return 0;

    if ( msr_count == (PAGE_SIZE / sizeof(struct vmx_msr_entry)) )
        return -ENOSPC;

    if ( msr_area == NULL )
    {
        if ( (msr_area = alloc_xenheap_page()) == NULL )
            return -ENOMEM;
        v->arch.hvm_vmx.host_msr_area = msr_area;
        __vmwrite(VM_EXIT_MSR_LOAD_ADDR, virt_to_maddr(msr_area));
    }

    msr_area[msr_count].index = msr;
    msr_area[msr_count].mbz = 0;
    rdmsrl(msr, msr_area[msr_count].data);
    v->arch.hvm_vmx.host_msr_count = ++msr_count;
    __vmwrite(VM_EXIT_MSR_LOAD_COUNT, msr_count);

    return 0;
}
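
/*
 * Allocate (if necessary) and initialise @v's VMCS.  The VMCS is created in
 * the cleared, not-launched state, and is freed again if construction fails.
 */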
int vmx_create_vmcs(struct vcpu *v)
{
    struct arch_vmx_struct *arch_vmx = &v->arch.hvm_vmx;
    int rc;

    if ( arch_vmx->vmcs == NULL )
    {
        if ( (arch_vmx->vmcs = vmx_alloc_vmcs()) == NULL )
            return -ENOMEM;

        INIT_LIST_HEAD(&arch_vmx->active_list);
        __vmpclear(virt_to_maddr(arch_vmx->vmcs));
        arch_vmx->active_cpu = -1;
        arch_vmx->launched = 0;
    }

    if ( (rc = construct_vmcs(v)) != 0 )
    {
        vmx_free_vmcs(arch_vmx->vmcs);
        arch_vmx->vmcs = NULL;
        return rc;
    }

    return 0;
}

void vmx_destroy_vmcs(struct vcpu *v)
{
    struct arch_vmx_struct *arch_vmx = &v->arch.hvm_vmx;

    if ( arch_vmx->vmcs == NULL )
        return;

    vmx_clear_vmcs(v);

    vmx_free_vmcs(arch_vmx->vmcs);
    arch_vmx->vmcs = NULL;
}

void vm_launch_fail(unsigned long eflags)
{
    unsigned long error = __vmread(VM_INSTRUCTION_ERROR);
    printk("<vm_launch_fail> error code %lx\n", error);
    domain_crash_synchronous();
}

void vm_resume_fail(unsigned long eflags)
{
    unsigned long error = __vmread(VM_INSTRUCTION_ERROR);
    printk("<vm_resume_fail> error code %lx\n", error);
    domain_crash_synchronous();
}

static void wbinvd_ipi(void *info)
{
    wbinvd();
}
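
/*
 * Entered on the return path to an HVM guest.  Re-loads the VCPU's VMCS if
 * it has migrated to a new physical CPU (refreshing per-CPU host state and,
 * for pass-through domains without WBINVD exiting, flushing the old CPU's
 * cache), keeps the debug-trap intercepts in sync with the domain debugger
 * state, and finally jumps to the VM entry path.
 */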
void vmx_do_resume(struct vcpu *v)
{
    bool_t debug_state;

    if ( v->arch.hvm_vmx.active_cpu == smp_processor_id() )
    {
        if ( v->arch.hvm_vmx.vmcs != this_cpu(current_vmcs) )
            vmx_load_vmcs(v);
    }
    else
    {
        /*
         * For pass-through domain, guest PCI-E device driver may leverage the
         * "Non-Snoop" I/O, and explicitly WBINVD or CLFLUSH to a RAM space.
         * Since migration may occur before WBINVD or CLFLUSH, we need to
         * maintain data consistency either by:
         *  1: flushing cache (wbinvd) when the guest is scheduled out if
         *     there is no wbinvd exit, or
         *  2: execute wbinvd on all dirty pCPUs when guest wbinvd exits.
         */
        if ( !list_empty(&(domain_hvm_iommu(v->domain)->pdev_list)) &&
             !cpu_has_wbinvd_exiting )
        {
            int cpu = v->arch.hvm_vmx.active_cpu;
            if ( cpu != -1 )
                on_selected_cpus(cpumask_of_cpu(cpu), wbinvd_ipi, NULL, 1, 1);
        }

        vmx_clear_vmcs(v);
        vmx_load_vmcs(v);
        hvm_migrate_timers(v);
        vmx_set_host_env(v);
    }

    debug_state = v->domain->debugger_attached;
    if ( unlikely(v->arch.hvm_vcpu.debug_state_latch != debug_state) )
    {
        unsigned long intercepts = __vmread(EXCEPTION_BITMAP);
        unsigned long mask = (1U << TRAP_debug) | (1U << TRAP_int3);
        v->arch.hvm_vcpu.debug_state_latch = debug_state;
        if ( debug_state )
            intercepts |= mask;
        else
            intercepts &= ~mask;
        __vmwrite(EXCEPTION_BITMAP, intercepts);
    }

    hvm_do_resume(v);
    reset_stack_and_jump(vmx_asm_do_vmentry);
}

static void vmx_dump_sel(char *name, enum x86_segment seg)
{
    struct segment_register sreg;
    hvm_get_segment_register(current, seg, &sreg);
    printk("%s: sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016llx\n",
           name, sreg.sel, sreg.attr.bytes, sreg.limit,
           (unsigned long long)sreg.base);
}
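
/*
 * Helper for the dump code: read a VMCS field, returning 0 instead of
 * failing if the VMREAD does not succeed.
 */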
static unsigned long vmr(unsigned long field)
{
    int rc;
    unsigned long val;
    val = __vmread_safe(field, &rc);
    return rc ? 0 : val;
}

void vmcs_dump_vcpu(struct vcpu *v)
{
    struct cpu_user_regs *regs = &v->arch.guest_context.user_regs;
    unsigned long long x;

    if ( v == current )
        regs = guest_cpu_user_regs();

    vmx_vmcs_enter(v);

    printk("*** Guest State ***\n");
    printk("CR0: actual=0x%016llx, shadow=0x%016llx, gh_mask=%016llx\n",
           (unsigned long long)vmr(GUEST_CR0),
           (unsigned long long)vmr(CR0_READ_SHADOW),
           (unsigned long long)vmr(CR0_GUEST_HOST_MASK));
    printk("CR4: actual=0x%016llx, shadow=0x%016llx, gh_mask=%016llx\n",
           (unsigned long long)vmr(GUEST_CR4),
           (unsigned long long)vmr(CR4_READ_SHADOW),
           (unsigned long long)vmr(CR4_GUEST_HOST_MASK));
    printk("CR3: actual=0x%016llx, target_count=%d\n",
           (unsigned long long)vmr(GUEST_CR3),
           (int)vmr(CR3_TARGET_COUNT));
    printk(" target0=%016llx, target1=%016llx\n",
           (unsigned long long)vmr(CR3_TARGET_VALUE0),
           (unsigned long long)vmr(CR3_TARGET_VALUE1));
    printk(" target2=%016llx, target3=%016llx\n",
           (unsigned long long)vmr(CR3_TARGET_VALUE2),
           (unsigned long long)vmr(CR3_TARGET_VALUE3));
    printk("RSP = 0x%016llx (0x%016llx) RIP = 0x%016llx (0x%016llx)\n",
           (unsigned long long)vmr(GUEST_RSP),
           (unsigned long long)regs->esp,
           (unsigned long long)vmr(GUEST_RIP),
           (unsigned long long)regs->eip);
    printk("RFLAGS=0x%016llx (0x%016llx) DR7 = 0x%016llx\n",
           (unsigned long long)vmr(GUEST_RFLAGS),
           (unsigned long long)regs->eflags,
           (unsigned long long)vmr(GUEST_DR7));
    printk("Sysenter RSP=%016llx CS:RIP=%04x:%016llx\n",
           (unsigned long long)vmr(GUEST_SYSENTER_ESP),
           (int)vmr(GUEST_SYSENTER_CS),
           (unsigned long long)vmr(GUEST_SYSENTER_EIP));
    vmx_dump_sel("CS", x86_seg_cs);
    vmx_dump_sel("DS", x86_seg_ds);
    vmx_dump_sel("SS", x86_seg_ss);
    vmx_dump_sel("ES", x86_seg_es);
    vmx_dump_sel("FS", x86_seg_fs);
    vmx_dump_sel("GS", x86_seg_gs);
    vmx_dump_sel("GDTR", x86_seg_gdtr);
    vmx_dump_sel("LDTR", x86_seg_ldtr);
    vmx_dump_sel("IDTR", x86_seg_idtr);
    vmx_dump_sel("TR", x86_seg_tr);
    x = (unsigned long long)vmr(TSC_OFFSET_HIGH) << 32;
    x |= (uint32_t)vmr(TSC_OFFSET);
    printk("TSC Offset = %016llx\n", x);
    x = (unsigned long long)vmr(GUEST_IA32_DEBUGCTL_HIGH) << 32;
    x |= (uint32_t)vmr(GUEST_IA32_DEBUGCTL);
872 printk("DebugCtl=%016llx DebugExceptions=%016llx\n", x,
873 (unsigned long long)vmr(GUEST_PENDING_DBG_EXCEPTIONS));
874 printk("Interruptibility=%04x ActivityState=%04x\n",
875 (int)vmr(GUEST_INTERRUPTIBILITY_INFO),
876 (int)vmr(GUEST_ACTIVITY_STATE));
878 printk("*** Host State ***\n");
879 printk("RSP = 0x%016llx RIP = 0x%016llx\n",
880 (unsigned long long)vmr(HOST_RSP),
881 (unsigned long long)vmr(HOST_RIP));
882 printk("CS=%04x DS=%04x ES=%04x FS=%04x GS=%04x SS=%04x TR=%04x\n",
883 (uint16_t)vmr(HOST_CS_SELECTOR),
884 (uint16_t)vmr(HOST_DS_SELECTOR),
885 (uint16_t)vmr(HOST_ES_SELECTOR),
886 (uint16_t)vmr(HOST_FS_SELECTOR),
887 (uint16_t)vmr(HOST_GS_SELECTOR),
888 (uint16_t)vmr(HOST_SS_SELECTOR),
889 (uint16_t)vmr(HOST_TR_SELECTOR));
890 printk("FSBase=%016llx GSBase=%016llx TRBase=%016llx\n",
891 (unsigned long long)vmr(HOST_FS_BASE),
892 (unsigned long long)vmr(HOST_GS_BASE),
893 (unsigned long long)vmr(HOST_TR_BASE));
894 printk("GDTBase=%016llx IDTBase=%016llx\n",
895 (unsigned long long)vmr(HOST_GDTR_BASE),
896 (unsigned long long)vmr(HOST_IDTR_BASE));
897 printk("CR0=%016llx CR3=%016llx CR4=%016llx\n",
898 (unsigned long long)vmr(HOST_CR0),
899 (unsigned long long)vmr(HOST_CR3),
900 (unsigned long long)vmr(HOST_CR4));
901 printk("Sysenter RSP=%016llx CS:RIP=%04x:%016llx\n",
902 (unsigned long long)vmr(HOST_SYSENTER_ESP),
903 (int)vmr(HOST_SYSENTER_CS),
904 (unsigned long long)vmr(HOST_SYSENTER_EIP));
906 printk("*** Control State ***\n");
907 printk("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
908 (uint32_t)vmr(PIN_BASED_VM_EXEC_CONTROL),
909 (uint32_t)vmr(CPU_BASED_VM_EXEC_CONTROL),
910 (uint32_t)vmr(SECONDARY_VM_EXEC_CONTROL));
911 printk("EntryControls=%08x ExitControls=%08x\n",
912 (uint32_t)vmr(VM_ENTRY_CONTROLS),
913 (uint32_t)vmr(VM_EXIT_CONTROLS));
914 printk("ExceptionBitmap=%08x\n",
915 (uint32_t)vmr(EXCEPTION_BITMAP));
916 printk("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
917 (uint32_t)vmr(VM_ENTRY_INTR_INFO),
918 (uint32_t)vmr(VM_ENTRY_EXCEPTION_ERROR_CODE),
919 (uint32_t)vmr(VM_ENTRY_INSTRUCTION_LEN));
920 printk("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
921 (uint32_t)vmr(VM_EXIT_INTR_INFO),
922 (uint32_t)vmr(VM_EXIT_INTR_ERROR_CODE),
923 (uint32_t)vmr(VM_ENTRY_INSTRUCTION_LEN));
924 printk(" reason=%08x qualification=%08x\n",
925 (uint32_t)vmr(VM_EXIT_REASON),
926 (uint32_t)vmr(EXIT_QUALIFICATION));
927 printk("IDTVectoring: info=%08x errcode=%08x\n",
928 (uint32_t)vmr(IDT_VECTORING_INFO),
929 (uint32_t)vmr(IDT_VECTORING_ERROR_CODE));
930 printk("TPR Threshold = 0x%02x\n",
931 (uint32_t)vmr(TPR_THRESHOLD));
933 vmx_vmcs_exit(v);
934 }

static void vmcs_dump(unsigned char ch)
{
    struct domain *d;
    struct vcpu *v;

    printk("*********** VMCS Areas **************\n");

    rcu_read_lock(&domlist_read_lock);

    for_each_domain ( d )
    {
        if ( !is_hvm_domain(d) )
            continue;
        printk("\n>>> Domain %d <<<\n", d->domain_id);
        for_each_vcpu ( d, v )
        {
            printk("\tVCPU %d\n", v->vcpu_id);
            vmcs_dump_vcpu(v);
        }
    }

    rcu_read_unlock(&domlist_read_lock);

    printk("**************************************\n");
}

void setup_vmcs_dump(void)
{
    register_keyhandler('v', vmcs_dump, "dump Intel's VMCS");
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */