ia64/xen-unstable

xen/arch/x86/hvm/vmx/vmcs.c @ 17416:0553004fa328

x86, vmx: Enable VPID (Virtual Processor Identification)

This allows TLB entries to be retained across VM entry and VM exit; Xen
now identifies distinct address spaces through the new virtual-processor
ID (VPID) field of the VMCS.

Signed-off-by: Xin Li <xin.b.li@intel.com>
Signed-off-by: Jun Nakajima <jun.nakajima@intel.com>
Signed-off-by: Xiaohui Xin <Xiaohui.xin@intel.com>
Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed Apr 09 14:34:49 2008 +0100 (2008-04-09)
parents 9b635405ef90
children 8bd776540ab3
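
As an illustrative summary of the scheme the changeset adds (mirroring the assignment done in construct_vmcs() in the file below): each HVM domain gets a contiguous block of VPIDs, and vCPU i tags its TLB entries with vpid_base + i, so translations for several guest address spaces can stay cached across VM entry/exit. A minimal standalone sketch of that allocation rule; the helper name and standalone types here are hypothetical, not part of the changeset:

    #include <stdint.h>

    /*
     * Hypothetical helper: compute the 16-bit value written into the
     * VIRTUAL_PROCESSOR_ID VMCS field for a given vCPU. VPID 0 is
     * reserved for VMX root operation (the hypervisor itself).
     */
    static uint16_t vcpu_vpid(uint16_t vpid_base, unsigned int vcpu_id)
    {
        return (uint16_t)(vpid_base + vcpu_id);
    }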
/*
 * vmcs.c: VMCS management
 * Copyright (c) 2004, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 * Place - Suite 330, Boston, MA 02111-1307 USA.
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/mm.h>
#include <xen/lib.h>
#include <xen/errno.h>
#include <xen/domain_page.h>
#include <asm/current.h>
#include <asm/cpufeature.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/io.h>
#include <asm/hvm/support.h>
#include <asm/hvm/vmx/vmx.h>
#include <asm/hvm/vmx/vmcs.h>
#include <asm/flushtlb.h>
#include <xen/event.h>
#include <xen/kernel.h>
#include <xen/keyhandler.h>
#include <asm/shadow.h>
#include <asm/tboot.h>

static int opt_vpid_enabled = 1;
boolean_param("vpid", opt_vpid_enabled);

/* Dynamic (run-time adjusted) execution control flags. */
u32 vmx_pin_based_exec_control __read_mostly;
u32 vmx_cpu_based_exec_control __read_mostly;
u32 vmx_secondary_exec_control __read_mostly;
u32 vmx_vmexit_control __read_mostly;
u32 vmx_vmentry_control __read_mostly;
bool_t cpu_has_vmx_ins_outs_instr_info __read_mostly;

static DEFINE_PER_CPU(struct vmcs_struct *, host_vmcs);
static DEFINE_PER_CPU(struct vmcs_struct *, current_vmcs);
static DEFINE_PER_CPU(struct list_head, active_vmcs_list);

static u32 vmcs_revision_id __read_mostly;

static u32 adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr)
{
    u32 vmx_msr_low, vmx_msr_high, ctl = ctl_min | ctl_opt;

    rdmsr(msr, vmx_msr_low, vmx_msr_high);

    ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
    ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */

    /* Ensure minimum (required) set of control bits are supported. */
    BUG_ON(ctl_min & ~ctl);

    return ctl;
}

static void vmx_init_vmcs_config(void)
{
    u32 vmx_msr_low, vmx_msr_high, min, opt;
    u32 _vmx_pin_based_exec_control;
    u32 _vmx_cpu_based_exec_control;
    u32 _vmx_secondary_exec_control = 0;
    u32 _vmx_vmexit_control;
    u32 _vmx_vmentry_control;

    min = (PIN_BASED_EXT_INTR_MASK |
           PIN_BASED_NMI_EXITING);
    opt = PIN_BASED_VIRTUAL_NMIS;
    _vmx_pin_based_exec_control = adjust_vmx_controls(
        min, opt, MSR_IA32_VMX_PINBASED_CTLS);

    min = (CPU_BASED_HLT_EXITING |
           CPU_BASED_INVLPG_EXITING |
           CPU_BASED_CR3_LOAD_EXITING |
           CPU_BASED_CR3_STORE_EXITING |
           CPU_BASED_MONITOR_EXITING |
           CPU_BASED_MWAIT_EXITING |
           CPU_BASED_MOV_DR_EXITING |
           CPU_BASED_ACTIVATE_IO_BITMAP |
           CPU_BASED_USE_TSC_OFFSETING);
    opt = (CPU_BASED_ACTIVATE_MSR_BITMAP |
           CPU_BASED_TPR_SHADOW |
           CPU_BASED_ACTIVATE_SECONDARY_CONTROLS);
    _vmx_cpu_based_exec_control = adjust_vmx_controls(
        min, opt, MSR_IA32_VMX_PROCBASED_CTLS);
#ifdef __x86_64__
    if ( !(_vmx_cpu_based_exec_control & CPU_BASED_TPR_SHADOW) )
    {
        min |= CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING;
        _vmx_cpu_based_exec_control = adjust_vmx_controls(
            min, opt, MSR_IA32_VMX_PROCBASED_CTLS);
    }
#endif

    if ( _vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS )
    {
        min = 0;
        opt = (SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
               SECONDARY_EXEC_WBINVD_EXITING |
               SECONDARY_EXEC_ENABLE_EPT);
        if ( opt_vpid_enabled )
            opt |= SECONDARY_EXEC_ENABLE_VPID;
        _vmx_secondary_exec_control = adjust_vmx_controls(
            min, opt, MSR_IA32_VMX_PROCBASED_CTLS2);
    }

    if ( _vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT )
    {
        /* To use EPT we expect to be able to clear certain intercepts. */
        uint32_t must_be_one, must_be_zero;
        rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, must_be_one, must_be_zero);
        if ( must_be_one & (CPU_BASED_INVLPG_EXITING |
                            CPU_BASED_CR3_LOAD_EXITING |
                            CPU_BASED_CR3_STORE_EXITING) )
            _vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
    }

#if defined(__i386__)
    /* If we can't virtualise APIC accesses, the TPR shadow is pointless. */
    if ( !(_vmx_secondary_exec_control &
           SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) )
        _vmx_cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
#endif

    min = VM_EXIT_ACK_INTR_ON_EXIT;
    opt = 0;
#ifdef __x86_64__
    min |= VM_EXIT_IA32E_MODE;
#endif
    _vmx_vmexit_control = adjust_vmx_controls(
        min, opt, MSR_IA32_VMX_EXIT_CTLS);

    min = opt = 0;
    _vmx_vmentry_control = adjust_vmx_controls(
        min, opt, MSR_IA32_VMX_ENTRY_CTLS);

    rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);

    if ( !vmx_pin_based_exec_control )
    {
        /* First time through. */
        vmcs_revision_id = vmx_msr_low;
        vmx_pin_based_exec_control = _vmx_pin_based_exec_control;
        vmx_cpu_based_exec_control = _vmx_cpu_based_exec_control;
        vmx_secondary_exec_control = _vmx_secondary_exec_control;
        vmx_vmexit_control = _vmx_vmexit_control;
        vmx_vmentry_control = _vmx_vmentry_control;
        cpu_has_vmx_ins_outs_instr_info = !!(vmx_msr_high & (1U<<22));
    }
    else
    {
        /* Globals are already initialised: re-check them. */
        BUG_ON(vmcs_revision_id != vmx_msr_low);
        BUG_ON(vmx_pin_based_exec_control != _vmx_pin_based_exec_control);
        BUG_ON(vmx_cpu_based_exec_control != _vmx_cpu_based_exec_control);
        BUG_ON(vmx_secondary_exec_control != _vmx_secondary_exec_control);
        BUG_ON(vmx_vmexit_control != _vmx_vmexit_control);
        BUG_ON(vmx_vmentry_control != _vmx_vmentry_control);
        BUG_ON(cpu_has_vmx_ins_outs_instr_info != !!(vmx_msr_high & (1U<<22)));
    }

    /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
    BUG_ON((vmx_msr_high & 0x1fff) > PAGE_SIZE);

#ifdef __x86_64__
    /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
    BUG_ON(vmx_msr_high & (1u<<16));
#endif

    /* Require Write-Back (WB) memory type for VMCS accesses. */
    BUG_ON(((vmx_msr_high >> 18) & 15) != 6);
}

static struct vmcs_struct *vmx_alloc_vmcs(void)
{
    struct vmcs_struct *vmcs;

    if ( (vmcs = alloc_xenheap_page()) == NULL )
    {
        gdprintk(XENLOG_WARNING, "Failed to allocate VMCS.\n");
        return NULL;
    }

    clear_page(vmcs);
    vmcs->vmcs_revision_id = vmcs_revision_id;

    return vmcs;
}

static void vmx_free_vmcs(struct vmcs_struct *vmcs)
{
    free_xenheap_page(vmcs);
}

static void __vmx_clear_vmcs(void *info)
{
    struct vcpu *v = info;
    struct arch_vmx_struct *arch_vmx = &v->arch.hvm_vmx;

    /* Otherwise we can nest (vmx_cpu_down() vs. vmx_clear_vmcs()). */
    ASSERT(!local_irq_is_enabled());

    if ( arch_vmx->active_cpu == smp_processor_id() )
    {
        __vmpclear(virt_to_maddr(arch_vmx->vmcs));

        arch_vmx->active_cpu = -1;
        arch_vmx->launched = 0;

        list_del(&arch_vmx->active_list);

        if ( arch_vmx->vmcs == this_cpu(current_vmcs) )
            this_cpu(current_vmcs) = NULL;
    }
}

static void vmx_clear_vmcs(struct vcpu *v)
{
    int cpu = v->arch.hvm_vmx.active_cpu;

    if ( cpu != -1 )
        on_selected_cpus(cpumask_of_cpu(cpu), __vmx_clear_vmcs, v, 1, 1);
}

static void vmx_load_vmcs(struct vcpu *v)
{
    unsigned long flags;

    local_irq_save(flags);

    if ( v->arch.hvm_vmx.active_cpu == -1 )
    {
        list_add(&v->arch.hvm_vmx.active_list, &this_cpu(active_vmcs_list));
        v->arch.hvm_vmx.active_cpu = smp_processor_id();
    }

    ASSERT(v->arch.hvm_vmx.active_cpu == smp_processor_id());

    __vmptrld(virt_to_maddr(v->arch.hvm_vmx.vmcs));
    this_cpu(current_vmcs) = v->arch.hvm_vmx.vmcs;

    local_irq_restore(flags);
}

int vmx_cpu_up(void)
{
    u32 eax, edx;
    int cpu = smp_processor_id();
    u64 cr0, vmx_cr0_fixed0, vmx_cr0_fixed1;

    BUG_ON(!(read_cr4() & X86_CR4_VMXE));

    /*
     * Ensure the current processor operating mode meets
     * the required CR0 fixed bits in VMX operation.
     */
    cr0 = read_cr0();
    rdmsrl(MSR_IA32_VMX_CR0_FIXED0, vmx_cr0_fixed0);
    rdmsrl(MSR_IA32_VMX_CR0_FIXED1, vmx_cr0_fixed1);
    if ( (~cr0 & vmx_cr0_fixed0) || (cr0 & ~vmx_cr0_fixed1) )
    {
        printk("CPU%d: some settings of host CR0 are "
               "not allowed in VMX operation.\n", cpu);
        return 0;
    }

    rdmsr(IA32_FEATURE_CONTROL_MSR, eax, edx);

    if ( eax & IA32_FEATURE_CONTROL_MSR_LOCK )
    {
        if ( !(eax & (IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON_OUTSIDE_SMX |
                      IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON_INSIDE_SMX)) )
        {
            printk("CPU%d: VMX disabled by BIOS.\n", cpu);
            return 0;
        }
    }
    else
    {
        eax = IA32_FEATURE_CONTROL_MSR_LOCK;
        eax |= IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON_OUTSIDE_SMX;
        if ( test_bit(X86_FEATURE_SMXE, &boot_cpu_data.x86_capability) )
            eax |= IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON_INSIDE_SMX;
        wrmsr(IA32_FEATURE_CONTROL_MSR, eax, 0);
    }

    vmx_init_vmcs_config();

    INIT_LIST_HEAD(&this_cpu(active_vmcs_list));

    if ( this_cpu(host_vmcs) == NULL )
    {
        this_cpu(host_vmcs) = vmx_alloc_vmcs();
        if ( this_cpu(host_vmcs) == NULL )
        {
            printk("CPU%d: Could not allocate host VMCS\n", cpu);
            return 0;
        }
    }

    if ( __vmxon(virt_to_maddr(this_cpu(host_vmcs))) )
    {
        printk("CPU%d: VMXON failed\n", cpu);
        return 0;
    }

    ept_sync_all();

    vpid_sync_all();

    return 1;
}

void vmx_cpu_down(void)
{
    struct list_head *active_vmcs_list = &this_cpu(active_vmcs_list);
    unsigned long flags;

    local_irq_save(flags);

    while ( !list_empty(active_vmcs_list) )
        __vmx_clear_vmcs(list_entry(active_vmcs_list->next,
                                    struct vcpu, arch.hvm_vmx.active_list));

    BUG_ON(!(read_cr4() & X86_CR4_VMXE));
    __vmxoff();

    local_irq_restore(flags);
}

struct foreign_vmcs {
    struct vcpu *v;
    unsigned int count;
};
static DEFINE_PER_CPU(struct foreign_vmcs, foreign_vmcs);

void vmx_vmcs_enter(struct vcpu *v)
{
    struct foreign_vmcs *fv;

    /*
     * NB. We must *always* run an HVM VCPU on its own VMCS, except for
     * vmx_vmcs_enter/exit critical regions.
     */
    if ( likely(v == current) )
        return;

    fv = &this_cpu(foreign_vmcs);

    if ( fv->v == v )
    {
        BUG_ON(fv->count == 0);
    }
    else
    {
        BUG_ON(fv->v != NULL);
        BUG_ON(fv->count != 0);

        vcpu_pause(v);
        spin_lock(&v->arch.hvm_vmx.vmcs_lock);

        vmx_clear_vmcs(v);
        vmx_load_vmcs(v);

        fv->v = v;
    }

    fv->count++;
}

void vmx_vmcs_exit(struct vcpu *v)
{
    struct foreign_vmcs *fv;

    if ( likely(v == current) )
        return;

    fv = &this_cpu(foreign_vmcs);
    BUG_ON(fv->v != v);
    BUG_ON(fv->count == 0);

    if ( --fv->count == 0 )
    {
        /* Don't confuse vmx_do_resume (for @v or @current!) */
        vmx_clear_vmcs(v);
        if ( is_hvm_vcpu(current) )
            vmx_load_vmcs(current);

        spin_unlock(&v->arch.hvm_vmx.vmcs_lock);
        vcpu_unpause(v);

        fv->v = NULL;
    }
}

struct xgt_desc {
    unsigned short size;
    unsigned long address __attribute__((packed));
};

static void vmx_set_host_env(struct vcpu *v)
{
    unsigned int cpu = smp_processor_id();

    __vmwrite(HOST_IDTR_BASE, (unsigned long)idt_tables[cpu]);

    __vmwrite(HOST_TR_SELECTOR, __TSS(cpu) << 3);
    __vmwrite(HOST_TR_BASE, (unsigned long)&init_tss[cpu]);

    __vmwrite(HOST_SYSENTER_ESP, get_stack_bottom());

    /*
     * Skip end of cpu_user_regs when entering the hypervisor because the
     * CPU does not save context onto the stack. SS,RSP,CS,RIP,RFLAGS,etc
     * all get saved into the VMCS instead.
     */
    __vmwrite(HOST_RSP,
              (unsigned long)&get_cpu_info()->guest_cpu_user_regs.error_code);
}

void vmx_disable_intercept_for_msr(struct vcpu *v, u32 msr)
{
    unsigned long *msr_bitmap = v->arch.hvm_vmx.msr_bitmap;

    /* VMX MSR bitmap supported? */
    if ( msr_bitmap == NULL )
        return;

    /*
     * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
     * have the write-low and read-high bitmap offsets the wrong way round.
     * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
     */
    if ( msr <= 0x1fff )
    {
        __clear_bit(msr, msr_bitmap + 0x000/BYTES_PER_LONG); /* read-low */
        __clear_bit(msr, msr_bitmap + 0x800/BYTES_PER_LONG); /* write-low */
    }
    else if ( (msr >= 0xc0000000) && (msr <= 0xc0001fff) )
    {
        msr &= 0x1fff;
        __clear_bit(msr, msr_bitmap + 0x400/BYTES_PER_LONG); /* read-high */
        __clear_bit(msr, msr_bitmap + 0xc00/BYTES_PER_LONG); /* write-high */
    }
}

static int construct_vmcs(struct vcpu *v)
{
    struct domain *d = v->domain;
    uint16_t sysenter_cs;
    unsigned long sysenter_eip;

    vmx_vmcs_enter(v);

    /* VMCS controls. */
    __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control);
    __vmwrite(VM_EXIT_CONTROLS, vmx_vmexit_control);
    __vmwrite(VM_ENTRY_CONTROLS, vmx_vmentry_control);

    v->arch.hvm_vmx.exec_control = vmx_cpu_based_exec_control;
    v->arch.hvm_vmx.secondary_exec_control = vmx_secondary_exec_control;

    if ( paging_mode_hap(d) )
    {
        v->arch.hvm_vmx.exec_control &= ~(CPU_BASED_INVLPG_EXITING |
                                          CPU_BASED_CR3_LOAD_EXITING |
                                          CPU_BASED_CR3_STORE_EXITING);
    }
    else
    {
        v->arch.hvm_vmx.secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
    }

    __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
    if ( cpu_has_vmx_secondary_exec_control )
        __vmwrite(SECONDARY_VM_EXEC_CONTROL,
                  v->arch.hvm_vmx.secondary_exec_control);

    /* MSR access bitmap. */
    if ( cpu_has_vmx_msr_bitmap )
    {
        unsigned long *msr_bitmap = alloc_xenheap_page();

        if ( msr_bitmap == NULL )
            return -ENOMEM;

        memset(msr_bitmap, ~0, PAGE_SIZE);
        v->arch.hvm_vmx.msr_bitmap = msr_bitmap;
        __vmwrite(MSR_BITMAP, virt_to_maddr(msr_bitmap));

        vmx_disable_intercept_for_msr(v, MSR_FS_BASE);
        vmx_disable_intercept_for_msr(v, MSR_GS_BASE);
        vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_CS);
        vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_ESP);
        vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_EIP);
    }

    /* I/O access bitmap. */
    __vmwrite(IO_BITMAP_A, virt_to_maddr((char *)hvm_io_bitmap + 0));
    __vmwrite(IO_BITMAP_B, virt_to_maddr((char *)hvm_io_bitmap + PAGE_SIZE));

    /* Host GDTR base. */
    __vmwrite(HOST_GDTR_BASE, GDT_VIRT_START(v));

    /* Host data selectors. */
    __vmwrite(HOST_SS_SELECTOR, __HYPERVISOR_DS);
    __vmwrite(HOST_DS_SELECTOR, __HYPERVISOR_DS);
    __vmwrite(HOST_ES_SELECTOR, __HYPERVISOR_DS);
    __vmwrite(HOST_FS_SELECTOR, 0);
    __vmwrite(HOST_GS_SELECTOR, 0);
    __vmwrite(HOST_FS_BASE, 0);
    __vmwrite(HOST_GS_BASE, 0);

    /* Host control registers. */
    v->arch.hvm_vmx.host_cr0 = read_cr0() | X86_CR0_TS;
    __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
    __vmwrite(HOST_CR4, mmu_cr4_features);

    /* Host CS:RIP. */
    __vmwrite(HOST_CS_SELECTOR, __HYPERVISOR_CS);
    __vmwrite(HOST_RIP, (unsigned long)vmx_asm_vmexit_handler);

    /* Host SYSENTER CS:RIP. */
    rdmsrl(MSR_IA32_SYSENTER_CS, sysenter_cs);
    __vmwrite(HOST_SYSENTER_CS, sysenter_cs);
    rdmsrl(MSR_IA32_SYSENTER_EIP, sysenter_eip);
    __vmwrite(HOST_SYSENTER_EIP, sysenter_eip);

    /* MSR intercepts. */
    __vmwrite(VM_EXIT_MSR_LOAD_COUNT, 0);
    __vmwrite(VM_EXIT_MSR_STORE_COUNT, 0);
    __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, 0);

    __vmwrite(VM_ENTRY_INTR_INFO, 0);

    __vmwrite(CR0_GUEST_HOST_MASK, ~0UL);
    __vmwrite(CR4_GUEST_HOST_MASK, ~0UL);

    __vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0);
    __vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, 0);

    __vmwrite(CR3_TARGET_COUNT, 0);

    __vmwrite(GUEST_ACTIVITY_STATE, 0);

    /* Guest segment bases. */
    __vmwrite(GUEST_ES_BASE, 0);
    __vmwrite(GUEST_SS_BASE, 0);
    __vmwrite(GUEST_DS_BASE, 0);
    __vmwrite(GUEST_FS_BASE, 0);
    __vmwrite(GUEST_GS_BASE, 0);
    __vmwrite(GUEST_CS_BASE, 0);

    /* Guest segment limits. */
    __vmwrite(GUEST_ES_LIMIT, ~0u);
    __vmwrite(GUEST_SS_LIMIT, ~0u);
    __vmwrite(GUEST_DS_LIMIT, ~0u);
    __vmwrite(GUEST_FS_LIMIT, ~0u);
    __vmwrite(GUEST_GS_LIMIT, ~0u);
    __vmwrite(GUEST_CS_LIMIT, ~0u);

    /* Guest segment AR bytes. */
    __vmwrite(GUEST_ES_AR_BYTES, 0xc093); /* read/write, accessed */
    __vmwrite(GUEST_SS_AR_BYTES, 0xc093);
    __vmwrite(GUEST_DS_AR_BYTES, 0xc093);
    __vmwrite(GUEST_FS_AR_BYTES, 0xc093);
    __vmwrite(GUEST_GS_AR_BYTES, 0xc093);
    __vmwrite(GUEST_CS_AR_BYTES, 0xc09b); /* exec/read, accessed */

    /* Guest IDT. */
    __vmwrite(GUEST_IDTR_BASE, 0);
    __vmwrite(GUEST_IDTR_LIMIT, 0);

    /* Guest GDT. */
    __vmwrite(GUEST_GDTR_BASE, 0);
    __vmwrite(GUEST_GDTR_LIMIT, 0);

    /* Guest LDT. */
    __vmwrite(GUEST_LDTR_AR_BYTES, 0x0082); /* LDT */
    __vmwrite(GUEST_LDTR_SELECTOR, 0);
    __vmwrite(GUEST_LDTR_BASE, 0);
    __vmwrite(GUEST_LDTR_LIMIT, 0);

    /* Guest TSS. */
    __vmwrite(GUEST_TR_AR_BYTES, 0x008b); /* 32-bit TSS (busy) */
    __vmwrite(GUEST_TR_BASE, 0);
    __vmwrite(GUEST_TR_LIMIT, 0xff);

    __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
    __vmwrite(GUEST_DR7, 0);
    __vmwrite(VMCS_LINK_POINTER, ~0UL);
#if defined(__i386__)
    __vmwrite(VMCS_LINK_POINTER_HIGH, ~0UL);
#endif

    __vmwrite(EXCEPTION_BITMAP,
              HVM_TRAP_MASK
              | (paging_mode_hap(d) ? 0 : (1U << TRAP_page_fault))
              | (1U << TRAP_no_device));

    v->arch.hvm_vcpu.guest_cr[0] = X86_CR0_PE | X86_CR0_ET;
    hvm_update_guest_cr(v, 0);

    v->arch.hvm_vcpu.guest_cr[4] = 0;
    hvm_update_guest_cr(v, 4);

    if ( cpu_has_vmx_tpr_shadow )
    {
        __vmwrite(VIRTUAL_APIC_PAGE_ADDR,
                  page_to_maddr(vcpu_vlapic(v)->regs_page));
        __vmwrite(TPR_THRESHOLD, 0);
    }

    if ( paging_mode_hap(d) )
    {
        __vmwrite(EPT_POINTER, d->arch.hvm_domain.vmx.ept_control.eptp);
#ifdef CONFIG_X86_PAE
        __vmwrite(EPT_POINTER_HIGH,
                  d->arch.hvm_domain.vmx.ept_control.eptp >> 32);
#endif
    }

    if ( cpu_has_vmx_vpid )
    {
        v->arch.hvm_vmx.vpid =
            v->domain->arch.hvm_domain.vmx.vpid_base + v->vcpu_id;
        __vmwrite(VIRTUAL_PROCESSOR_ID, v->arch.hvm_vmx.vpid);
    }

    vmx_vmcs_exit(v);

    paging_update_paging_modes(v); /* will update HOST & GUEST_CR3 as reqd */

    vmx_vlapic_msr_changed(v);

    return 0;
}

int vmx_read_guest_msr(struct vcpu *v, u32 msr, u64 *val)
{
    unsigned int i, msr_count = v->arch.hvm_vmx.msr_count;
    const struct vmx_msr_entry *msr_area = v->arch.hvm_vmx.msr_area;

    for ( i = 0; i < msr_count; i++ )
    {
        if ( msr_area[i].index == msr )
        {
            *val = msr_area[i].data;
            return 0;
        }
    }

    return -ESRCH;
}

int vmx_write_guest_msr(struct vcpu *v, u32 msr, u64 val)
{
    unsigned int i, msr_count = v->arch.hvm_vmx.msr_count;
    struct vmx_msr_entry *msr_area = v->arch.hvm_vmx.msr_area;

    for ( i = 0; i < msr_count; i++ )
    {
        if ( msr_area[i].index == msr )
        {
            msr_area[i].data = val;
            return 0;
        }
    }

    return -ESRCH;
}

int vmx_add_guest_msr(struct vcpu *v, u32 msr)
{
    unsigned int i, msr_count = v->arch.hvm_vmx.msr_count;
    struct vmx_msr_entry *msr_area = v->arch.hvm_vmx.msr_area;

    for ( i = 0; i < msr_count; i++ )
        if ( msr_area[i].index == msr )
            return 0;

    if ( msr_count == (PAGE_SIZE / sizeof(struct vmx_msr_entry)) )
        return -ENOSPC;

    if ( msr_area == NULL )
    {
        if ( (msr_area = alloc_xenheap_page()) == NULL )
            return -ENOMEM;
        v->arch.hvm_vmx.msr_area = msr_area;
        __vmwrite(VM_EXIT_MSR_STORE_ADDR, virt_to_maddr(msr_area));
        __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, virt_to_maddr(msr_area));
    }

    msr_area[msr_count].index = msr;
    msr_area[msr_count].mbz = 0;
    msr_area[msr_count].data = 0;
    v->arch.hvm_vmx.msr_count = ++msr_count;
    __vmwrite(VM_EXIT_MSR_STORE_COUNT, msr_count);
    __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, msr_count);

    return 0;
}

int vmx_add_host_load_msr(struct vcpu *v, u32 msr)
{
    unsigned int i, msr_count = v->arch.hvm_vmx.host_msr_count;
    struct vmx_msr_entry *msr_area = v->arch.hvm_vmx.host_msr_area;

    for ( i = 0; i < msr_count; i++ )
        if ( msr_area[i].index == msr )
            return 0;

    if ( msr_count == (PAGE_SIZE / sizeof(struct vmx_msr_entry)) )
        return -ENOSPC;

    if ( msr_area == NULL )
    {
        if ( (msr_area = alloc_xenheap_page()) == NULL )
            return -ENOMEM;
        v->arch.hvm_vmx.host_msr_area = msr_area;
        __vmwrite(VM_EXIT_MSR_LOAD_ADDR, virt_to_maddr(msr_area));
    }

    msr_area[msr_count].index = msr;
    msr_area[msr_count].mbz = 0;
    rdmsrl(msr, msr_area[msr_count].data);
    v->arch.hvm_vmx.host_msr_count = ++msr_count;
    __vmwrite(VM_EXIT_MSR_LOAD_COUNT, msr_count);

    return 0;
}

int vmx_create_vmcs(struct vcpu *v)
{
    struct arch_vmx_struct *arch_vmx = &v->arch.hvm_vmx;
    int rc;

    if ( arch_vmx->vmcs == NULL )
    {
        if ( (arch_vmx->vmcs = vmx_alloc_vmcs()) == NULL )
            return -ENOMEM;

        INIT_LIST_HEAD(&arch_vmx->active_list);
        __vmpclear(virt_to_maddr(arch_vmx->vmcs));
        arch_vmx->active_cpu = -1;
        arch_vmx->launched = 0;
    }

    if ( (rc = construct_vmcs(v)) != 0 )
    {
        vmx_free_vmcs(arch_vmx->vmcs);
        arch_vmx->vmcs = NULL;
        return rc;
    }

    return 0;
}

void vmx_destroy_vmcs(struct vcpu *v)
{
    struct arch_vmx_struct *arch_vmx = &v->arch.hvm_vmx;

    if ( arch_vmx->vmcs == NULL )
        return;

    vmx_clear_vmcs(v);

    vmx_free_vmcs(arch_vmx->vmcs);
    arch_vmx->vmcs = NULL;
}

void vm_launch_fail(void)
{
    unsigned long error = __vmread(VM_INSTRUCTION_ERROR);
    printk("<vm_launch_fail> error code %lx\n", error);
    domain_crash_synchronous();
}

void vm_resume_fail(void)
{
    unsigned long error = __vmread(VM_INSTRUCTION_ERROR);
    printk("<vm_resume_fail> error code %lx\n", error);
    domain_crash_synchronous();
}

static void wbinvd_ipi(void *info)
{
    wbinvd();
}

void vmx_do_resume(struct vcpu *v)
{
    bool_t debug_state;

    if ( v->arch.hvm_vmx.active_cpu == smp_processor_id() )
    {
        if ( v->arch.hvm_vmx.vmcs != this_cpu(current_vmcs) )
            vmx_load_vmcs(v);
    }
    else
    {
        /*
         * For pass-through domain, guest PCI-E device driver may leverage the
         * "Non-Snoop" I/O, and explicitly WBINVD or CLFLUSH to a RAM space.
         * Since migration may occur before WBINVD or CLFLUSH, we need to
         * maintain data consistency either by:
         *  1: flushing cache (wbinvd) when the guest is scheduled out if
         *     there is no wbinvd exit, or
         *  2: execute wbinvd on all dirty pCPUs when guest wbinvd exits.
         */
        if ( !list_empty(&(domain_hvm_iommu(v->domain)->pdev_list)) &&
             !cpu_has_wbinvd_exiting )
        {
            int cpu = v->arch.hvm_vmx.active_cpu;
            if ( cpu != -1 )
                on_selected_cpus(cpumask_of_cpu(cpu), wbinvd_ipi, NULL, 1, 1);
        }

        vmx_clear_vmcs(v);
        vmx_load_vmcs(v);
        hvm_migrate_timers(v);
        vmx_set_host_env(v);
        vpid_sync_vcpu_all(v);
    }

    debug_state = v->domain->debugger_attached;
    if ( unlikely(v->arch.hvm_vcpu.debug_state_latch != debug_state) )
    {
        unsigned long intercepts = __vmread(EXCEPTION_BITMAP);
        unsigned long mask = (1U << TRAP_debug) | (1U << TRAP_int3);
        v->arch.hvm_vcpu.debug_state_latch = debug_state;
        if ( debug_state )
            intercepts |= mask;
        else
            intercepts &= ~mask;
        __vmwrite(EXCEPTION_BITMAP, intercepts);
    }

    hvm_do_resume(v);
    reset_stack_and_jump(vmx_asm_do_vmentry);
}

static void vmx_dump_sel(char *name, enum x86_segment seg)
{
    struct segment_register sreg;
    hvm_get_segment_register(current, seg, &sreg);
    printk("%s: sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016llx\n",
           name, sreg.sel, sreg.attr.bytes, sreg.limit,
           (unsigned long long)sreg.base);
}

static unsigned long vmr(unsigned long field)
{
    int rc;
    unsigned long val;
    val = __vmread_safe(field, &rc);
    return rc ? 0 : val;
}

void vmcs_dump_vcpu(struct vcpu *v)
{
    struct cpu_user_regs *regs = &v->arch.guest_context.user_regs;
    unsigned long long x;

    if ( v == current )
        regs = guest_cpu_user_regs();

    vmx_vmcs_enter(v);

    printk("*** Guest State ***\n");
    printk("CR0: actual=0x%016llx, shadow=0x%016llx, gh_mask=%016llx\n",
           (unsigned long long)vmr(GUEST_CR0),
           (unsigned long long)vmr(CR0_READ_SHADOW),
           (unsigned long long)vmr(CR0_GUEST_HOST_MASK));
    printk("CR4: actual=0x%016llx, shadow=0x%016llx, gh_mask=%016llx\n",
           (unsigned long long)vmr(GUEST_CR4),
           (unsigned long long)vmr(CR4_READ_SHADOW),
           (unsigned long long)vmr(CR4_GUEST_HOST_MASK));
    printk("CR3: actual=0x%016llx, target_count=%d\n",
           (unsigned long long)vmr(GUEST_CR3),
           (int)vmr(CR3_TARGET_COUNT));
    printk("     target0=%016llx, target1=%016llx\n",
           (unsigned long long)vmr(CR3_TARGET_VALUE0),
           (unsigned long long)vmr(CR3_TARGET_VALUE1));
    printk("     target2=%016llx, target3=%016llx\n",
           (unsigned long long)vmr(CR3_TARGET_VALUE2),
           (unsigned long long)vmr(CR3_TARGET_VALUE3));
    printk("RSP = 0x%016llx (0x%016llx) RIP = 0x%016llx (0x%016llx)\n",
           (unsigned long long)vmr(GUEST_RSP),
           (unsigned long long)regs->esp,
           (unsigned long long)vmr(GUEST_RIP),
           (unsigned long long)regs->eip);
    printk("RFLAGS=0x%016llx (0x%016llx) DR7 = 0x%016llx\n",
           (unsigned long long)vmr(GUEST_RFLAGS),
           (unsigned long long)regs->eflags,
           (unsigned long long)vmr(GUEST_DR7));
    printk("Sysenter RSP=%016llx CS:RIP=%04x:%016llx\n",
           (unsigned long long)vmr(GUEST_SYSENTER_ESP),
           (int)vmr(GUEST_SYSENTER_CS),
           (unsigned long long)vmr(GUEST_SYSENTER_EIP));
    vmx_dump_sel("CS", x86_seg_cs);
    vmx_dump_sel("DS", x86_seg_ds);
    vmx_dump_sel("SS", x86_seg_ss);
    vmx_dump_sel("ES", x86_seg_es);
    vmx_dump_sel("FS", x86_seg_fs);
    vmx_dump_sel("GS", x86_seg_gs);
    vmx_dump_sel("GDTR", x86_seg_gdtr);
    vmx_dump_sel("LDTR", x86_seg_ldtr);
    vmx_dump_sel("IDTR", x86_seg_idtr);
    vmx_dump_sel("TR", x86_seg_tr);
    x = (unsigned long long)vmr(TSC_OFFSET_HIGH) << 32;
    x |= (uint32_t)vmr(TSC_OFFSET);
    printk("TSC Offset = %016llx\n", x);
    x = (unsigned long long)vmr(GUEST_IA32_DEBUGCTL_HIGH) << 32;
    x |= (uint32_t)vmr(GUEST_IA32_DEBUGCTL);
    printk("DebugCtl=%016llx DebugExceptions=%016llx\n", x,
           (unsigned long long)vmr(GUEST_PENDING_DBG_EXCEPTIONS));
    printk("Interruptibility=%04x ActivityState=%04x\n",
           (int)vmr(GUEST_INTERRUPTIBILITY_INFO),
           (int)vmr(GUEST_ACTIVITY_STATE));

    printk("*** Host State ***\n");
    printk("RSP = 0x%016llx RIP = 0x%016llx\n",
           (unsigned long long)vmr(HOST_RSP),
           (unsigned long long)vmr(HOST_RIP));
    printk("CS=%04x DS=%04x ES=%04x FS=%04x GS=%04x SS=%04x TR=%04x\n",
           (uint16_t)vmr(HOST_CS_SELECTOR),
           (uint16_t)vmr(HOST_DS_SELECTOR),
           (uint16_t)vmr(HOST_ES_SELECTOR),
           (uint16_t)vmr(HOST_FS_SELECTOR),
           (uint16_t)vmr(HOST_GS_SELECTOR),
           (uint16_t)vmr(HOST_SS_SELECTOR),
           (uint16_t)vmr(HOST_TR_SELECTOR));
    printk("FSBase=%016llx GSBase=%016llx TRBase=%016llx\n",
           (unsigned long long)vmr(HOST_FS_BASE),
           (unsigned long long)vmr(HOST_GS_BASE),
           (unsigned long long)vmr(HOST_TR_BASE));
    printk("GDTBase=%016llx IDTBase=%016llx\n",
           (unsigned long long)vmr(HOST_GDTR_BASE),
           (unsigned long long)vmr(HOST_IDTR_BASE));
    printk("CR0=%016llx CR3=%016llx CR4=%016llx\n",
           (unsigned long long)vmr(HOST_CR0),
           (unsigned long long)vmr(HOST_CR3),
           (unsigned long long)vmr(HOST_CR4));
    printk("Sysenter RSP=%016llx CS:RIP=%04x:%016llx\n",
           (unsigned long long)vmr(HOST_SYSENTER_ESP),
           (int)vmr(HOST_SYSENTER_CS),
           (unsigned long long)vmr(HOST_SYSENTER_EIP));

    printk("*** Control State ***\n");
    printk("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
           (uint32_t)vmr(PIN_BASED_VM_EXEC_CONTROL),
           (uint32_t)vmr(CPU_BASED_VM_EXEC_CONTROL),
           (uint32_t)vmr(SECONDARY_VM_EXEC_CONTROL));
    printk("EntryControls=%08x ExitControls=%08x\n",
           (uint32_t)vmr(VM_ENTRY_CONTROLS),
           (uint32_t)vmr(VM_EXIT_CONTROLS));
    printk("ExceptionBitmap=%08x\n",
           (uint32_t)vmr(EXCEPTION_BITMAP));
    printk("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
           (uint32_t)vmr(VM_ENTRY_INTR_INFO),
           (uint32_t)vmr(VM_ENTRY_EXCEPTION_ERROR_CODE),
           (uint32_t)vmr(VM_ENTRY_INSTRUCTION_LEN));
    printk("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
           (uint32_t)vmr(VM_EXIT_INTR_INFO),
           (uint32_t)vmr(VM_EXIT_INTR_ERROR_CODE),
           (uint32_t)vmr(VM_ENTRY_INSTRUCTION_LEN));
    printk("        reason=%08x qualification=%08x\n",
           (uint32_t)vmr(VM_EXIT_REASON),
           (uint32_t)vmr(EXIT_QUALIFICATION));
    printk("IDTVectoring: info=%08x errcode=%08x\n",
           (uint32_t)vmr(IDT_VECTORING_INFO),
           (uint32_t)vmr(IDT_VECTORING_ERROR_CODE));
    printk("TPR Threshold = 0x%02x\n",
           (uint32_t)vmr(TPR_THRESHOLD));
    printk("EPT pointer = 0x%08x%08x\n",
           (uint32_t)vmr(EPT_POINTER_HIGH), (uint32_t)vmr(EPT_POINTER));
    printk("Virtual processor ID = 0x%04x\n",
           (uint32_t)vmr(VIRTUAL_PROCESSOR_ID));

    vmx_vmcs_exit(v);
}

static void vmcs_dump(unsigned char ch)
{
    struct domain *d;
    struct vcpu *v;

    printk("*********** VMCS Areas **************\n");

    rcu_read_lock(&domlist_read_lock);

    for_each_domain ( d )
    {
        if ( !is_hvm_domain(d) )
            continue;
        printk("\n>>> Domain %d <<<\n", d->domain_id);
        for_each_vcpu ( d, v )
        {
            printk("\tVCPU %d\n", v->vcpu_id);
            vmcs_dump_vcpu(v);
        }
    }

    rcu_read_unlock(&domlist_read_lock);

    printk("**************************************\n");
}

void setup_vmcs_dump(void)
{
    register_keyhandler('v', vmcs_dump, "dump Intel's VMCS");
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */