
view xen/arch/x86/hvm/vmx/vmx.c @ 9447:dfbf0939350c

This patch is mainly a cleanup of the VMX-related xentrace code. One minor
xentrace bug is also fixed.

Signed-off-by: Xiaowei Yang <xiaowei.yang@intel.com>
Signed-off-by: Yunfeng Zhao <yunfeng.zhao@intel.com>
author kaf24@firebug.cl.cam.ac.uk
date Fri Mar 24 10:59:31 2006 +0100 (2006-03-24)
parents c947b278a349
children beb37b340903
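
For reference, the xentrace hook this changeset cleans up is the small per-CPU
buffer defined near the top of the file. Below is a minimal sketch of the
pattern; the macro definition and the example uses are taken from the source
further down, while exactly how the collected values are finally handed to
xentrace is not shown here:

    static unsigned long trace_values[NR_CPUS][4];
    #define TRACE_VMEXIT(index, value) \
        trace_values[smp_processor_id()][index] = (value)

    /* e.g. on a CR-access VM exit: */
    TRACE_VMEXIT(1, TYPE_MOV_TO_CR);   /* kind of access         */
    TRACE_VMEXIT(2, cr);               /* control register number */
    TRACE_VMEXIT(3, gp);               /* general-purpose register */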
line source
1 /*
2 * vmx.c: handling VMX architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 *
18 */
20 #include <xen/config.h>
21 #include <xen/init.h>
22 #include <xen/lib.h>
23 #include <xen/trace.h>
24 #include <xen/sched.h>
25 #include <xen/irq.h>
26 #include <xen/softirq.h>
27 #include <xen/domain_page.h>
28 #include <asm/current.h>
29 #include <asm/io.h>
30 #include <asm/shadow.h>
31 #include <asm/regs.h>
32 #include <asm/cpufeature.h>
33 #include <asm/processor.h>
34 #include <asm/types.h>
35 #include <asm/msr.h>
36 #include <asm/spinlock.h>
37 #include <asm/hvm/hvm.h>
38 #include <asm/hvm/support.h>
39 #include <asm/hvm/vmx/vmx.h>
40 #include <asm/hvm/vmx/vmcs.h>
41 #include <asm/shadow.h>
42 #if CONFIG_PAGING_LEVELS >= 3
43 #include <asm/shadow_64.h>
44 #endif
45 #include <public/sched.h>
46 #include <public/hvm/ioreq.h>
47 #include <asm/hvm/vpic.h>
48 #include <asm/hvm/vlapic.h>
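/* Per-CPU scratch buffer for VM-exit trace data; TRACE_VMEXIT() stores up to
 * four values per exit for the xentrace record emitted on the exit path. */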
50 static unsigned long trace_values[NR_CPUS][4];
51 #define TRACE_VMEXIT(index,value) trace_values[smp_processor_id()][index]=value
53 static void vmx_ctxt_switch_from(struct vcpu *v);
54 static void vmx_ctxt_switch_to(struct vcpu *v);
56 void vmx_final_setup_guest(struct vcpu *v)
57 {
58 v->arch.schedule_tail = arch_vmx_do_launch;
59 v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
60 v->arch.ctxt_switch_to = vmx_ctxt_switch_to;
62 if ( v->vcpu_id == 0 )
63 {
64 struct domain *d = v->domain;
65 struct vcpu *vc;
67 /* Initialize monitor page table */
68 for_each_vcpu(d, vc)
69 vc->arch.monitor_table = mk_pagetable(0);
71 /*
72 * Required to do this once per domain
73 * XXX todo: add a separate function to do these.
74 */
75 memset(&d->shared_info->evtchn_mask[0], 0xff,
76 sizeof(d->shared_info->evtchn_mask));
78 /* Put the domain in shadow mode even though we're going to be using
79 * the shared 1:1 page table initially. It shouldn't hurt */
80 shadow_mode_enable(d,
81 SHM_enable|SHM_refcounts|
82 SHM_translate|SHM_external|SHM_wr_pt_pte);
83 }
84 }
86 static void vmx_relinquish_guest_resources(struct domain *d)
87 {
88 struct vcpu *v;
90 for_each_vcpu ( d, v )
91 {
92 vmx_request_clear_vmcs(v);
93 destroy_vmcs(&v->arch.hvm_vmx);
94 free_monitor_pagetable(v);
95 kill_timer(&v->arch.hvm_vmx.hlt_timer);
96 if ( hvm_apic_support(v->domain) && (VLAPIC(v) != NULL) )
97 {
98 kill_timer(&VLAPIC(v)->vlapic_timer);
99 xfree(VLAPIC(v));
100 }
101 }
103 kill_timer(&d->arch.hvm_domain.vpit.pit_timer);
105 if ( d->arch.hvm_domain.shared_page_va )
106 unmap_domain_page_global(
107 (void *)d->arch.hvm_domain.shared_page_va);
109 shadow_direct_map_clean(d);
110 }
112 #ifdef __x86_64__
114 static struct vmx_msr_state percpu_msr[NR_CPUS];
116 static u32 msr_data_index[VMX_MSR_COUNT] =
117 {
118 MSR_LSTAR, MSR_STAR, MSR_CSTAR,
119 MSR_SYSCALL_MASK, MSR_EFER,
120 };
122 static void vmx_save_segments(struct vcpu *v)
123 {
124 rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.msr_content.shadow_gs);
125 }
127 /*
128 * To avoid MSR save/restore at every VM exit/entry time, we restore
129 * the x86_64-specific MSRs at domain switch time. Since those MSRs
130 * are not modified once set for generic domains, we don't save them,
131 * but simply reset them to the values set at percpu_traps_init().
132 */
133 static void vmx_load_msrs(void)
134 {
135 struct vmx_msr_state *host_state = &percpu_msr[smp_processor_id()];
136 int i;
138 while ( host_state->flags )
139 {
140 i = find_first_set_bit(host_state->flags);
141 wrmsrl(msr_data_index[i], host_state->msr_items[i]);
142 clear_bit(i, &host_state->flags);
143 }
144 }
146 static void vmx_save_init_msrs(void)
147 {
148 struct vmx_msr_state *host_state = &percpu_msr[smp_processor_id()];
149 int i;
151 for ( i = 0; i < VMX_MSR_COUNT; i++ )
152 rdmsrl(msr_data_index[i], host_state->msr_items[i]);
153 }
155 #define CASE_READ_MSR(address) \
156 case MSR_ ## address: \
157 msr_content = msr->msr_items[VMX_INDEX_MSR_ ## address]; \
158 break
160 #define CASE_WRITE_MSR(address) \
161 case MSR_ ## address: \
162 { \
163 msr->msr_items[VMX_INDEX_MSR_ ## address] = msr_content; \
164 if (!test_bit(VMX_INDEX_MSR_ ## address, &msr->flags)) { \
165 set_bit(VMX_INDEX_MSR_ ## address, &msr->flags); \
166 } \
167 wrmsrl(MSR_ ## address, msr_content); \
168 set_bit(VMX_INDEX_MSR_ ## address, &host_state->flags); \
169 } \
170 break
172 #define IS_CANO_ADDRESS(add) 1
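/* XXX: stub -- every address is treated as canonical. */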
173 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
174 {
175 u64 msr_content = 0;
176 struct vcpu *v = current;
177 struct vmx_msr_state *msr = &v->arch.hvm_vmx.msr_content;
179 switch ( regs->ecx ) {
180 case MSR_EFER:
181 msr_content = msr->msr_items[VMX_INDEX_MSR_EFER];
182 HVM_DBG_LOG(DBG_LEVEL_2, "EFER msr_content 0x%"PRIx64, msr_content);
184 /* the following code may not be needed */
185 if ( test_bit(VMX_CPU_STATE_LME_ENABLED, &v->arch.hvm_vmx.cpu_state) )
186 msr_content |= EFER_LME;
187 else
188 msr_content &= ~EFER_LME;
190 if ( VMX_LONG_GUEST(v) )
191 msr_content |= EFER_LMA;
192 else
193 msr_content &= ~EFER_LMA;
194 break;
196 case MSR_FS_BASE:
197 if ( !(VMX_LONG_GUEST(v)) )
198 /* XXX should this be a #GP fault? */
199 domain_crash_synchronous();
201 __vmread(GUEST_FS_BASE, &msr_content);
202 break;
204 case MSR_GS_BASE:
205 if ( !(VMX_LONG_GUEST(v)) )
206 domain_crash_synchronous();
208 __vmread(GUEST_GS_BASE, &msr_content);
209 break;
211 case MSR_SHADOW_GS_BASE:
212 msr_content = msr->shadow_gs;
213 break;
215 CASE_READ_MSR(STAR);
216 CASE_READ_MSR(LSTAR);
217 CASE_READ_MSR(CSTAR);
218 CASE_READ_MSR(SYSCALL_MASK);
220 default:
221 return 0;
222 }
224 HVM_DBG_LOG(DBG_LEVEL_2, "msr_content: 0x%"PRIx64, msr_content);
226 regs->eax = msr_content & 0xffffffff;
227 regs->edx = msr_content >> 32;
229 return 1;
230 }
232 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
233 {
234 u64 msr_content = regs->eax | ((u64)regs->edx << 32);
235 struct vcpu *v = current;
236 struct vmx_msr_state *msr = &v->arch.hvm_vmx.msr_content;
237 struct vmx_msr_state *host_state = &percpu_msr[smp_processor_id()];
239 HVM_DBG_LOG(DBG_LEVEL_1, "msr 0x%lx msr_content 0x%"PRIx64"\n",
240 (unsigned long)regs->ecx, msr_content);
242 switch ( regs->ecx ) {
243 case MSR_EFER:
244 /* offending reserved bit will cause #GP */
245 if ( msr_content & ~(EFER_LME | EFER_LMA | EFER_NX | EFER_SCE) )
246 {
247 printk("trying to set reserved bit in EFER\n");
248 vmx_inject_exception(v, TRAP_gp_fault, 0);
249 return 0;
250 }
252 /* LME: 0 -> 1 */
253 if ( msr_content & EFER_LME &&
254 !test_bit(VMX_CPU_STATE_LME_ENABLED, &v->arch.hvm_vmx.cpu_state) )
255 {
256 if ( vmx_paging_enabled(v) ||
257 !test_bit(VMX_CPU_STATE_PAE_ENABLED,
258 &v->arch.hvm_vmx.cpu_state) )
259 {
260 printk("trying to set LME bit when "
261 "in paging mode or PAE bit is not set\n");
262 vmx_inject_exception(v, TRAP_gp_fault, 0);
263 return 0;
264 }
266 set_bit(VMX_CPU_STATE_LME_ENABLED, &v->arch.hvm_vmx.cpu_state);
267 }
269 msr->msr_items[VMX_INDEX_MSR_EFER] = msr_content;
270 break;
272 case MSR_FS_BASE:
273 case MSR_GS_BASE:
274 if ( !(VMX_LONG_GUEST(v)) )
275 domain_crash_synchronous();
277 if ( !IS_CANO_ADDRESS(msr_content) )
278 {
279 HVM_DBG_LOG(DBG_LEVEL_1, "Non-canonical address in MSR write\n");
280 vmx_inject_exception(v, TRAP_gp_fault, 0);
281 return 0;
282 }
284 if ( regs->ecx == MSR_FS_BASE )
285 __vmwrite(GUEST_FS_BASE, msr_content);
286 else
287 __vmwrite(GUEST_GS_BASE, msr_content);
289 break;
291 case MSR_SHADOW_GS_BASE:
292 if ( !(VMX_LONG_GUEST(v)) )
293 domain_crash_synchronous();
295 v->arch.hvm_vmx.msr_content.shadow_gs = msr_content;
296 wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
297 break;
299 CASE_WRITE_MSR(STAR);
300 CASE_WRITE_MSR(LSTAR);
301 CASE_WRITE_MSR(CSTAR);
302 CASE_WRITE_MSR(SYSCALL_MASK);
304 default:
305 return 0;
306 }
308 return 1;
309 }
311 static void vmx_restore_msrs(struct vcpu *v)
312 {
313 int i = 0;
314 struct vmx_msr_state *guest_state;
315 struct vmx_msr_state *host_state;
316 unsigned long guest_flags;
318 guest_state = &v->arch.hvm_vmx.msr_content;
319 host_state = &percpu_msr[smp_processor_id()];
321 wrmsrl(MSR_SHADOW_GS_BASE, guest_state->shadow_gs);
322 guest_flags = guest_state->flags;
323 if (!guest_flags)
324 return;
326 while (guest_flags){
327 i = find_first_set_bit(guest_flags);
329 HVM_DBG_LOG(DBG_LEVEL_2,
330 "restore guest's index %d msr %lx with %lx\n",
331 i, (unsigned long)msr_data_index[i],
332 (unsigned long)guest_state->msr_items[i]);
333 set_bit(i, &host_state->flags);
334 wrmsrl(msr_data_index[i], guest_state->msr_items[i]);
335 clear_bit(i, &guest_flags);
336 }
337 }
338 #else /* __i386__ */
340 #define vmx_save_segments(v) ((void)0)
341 #define vmx_load_msrs() ((void)0)
342 #define vmx_restore_msrs(v) ((void)0)
343 #define vmx_save_init_msrs() ((void)0)
345 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
346 {
347 return 0;
348 }
350 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
351 {
352 return 0;
353 }
355 #endif /* __i386__ */
357 static void vmx_freeze_time(struct vcpu *v)
358 {
359 struct hvm_virpit *vpit = &v->domain->arch.hvm_domain.vpit;
361 if ( vpit->first_injected && !v->domain->arch.hvm_domain.guest_time ) {
362 v->domain->arch.hvm_domain.guest_time = get_guest_time(v);
363 stop_timer(&(vpit->pit_timer));
364 }
365 }
367 static void vmx_ctxt_switch_from(struct vcpu *v)
368 {
369 vmx_freeze_time(v);
370 vmx_save_segments(v);
371 vmx_load_msrs();
372 }
374 static void vmx_ctxt_switch_to(struct vcpu *v)
375 {
376 vmx_restore_msrs(v);
377 }
379 void stop_vmx(void)
380 {
381 if (read_cr4() & X86_CR4_VMXE)
382 __vmxoff();
383 }
385 int vmx_initialize_guest_resources(struct vcpu *v)
386 {
387 vmx_final_setup_guest(v);
388 return 1;
389 }
391 void vmx_migrate_timers(struct vcpu *v)
392 {
393 struct hvm_virpit *vpit = &(v->domain->arch.hvm_domain.vpit);
395 migrate_timer(&vpit->pit_timer, v->processor);
396 migrate_timer(&v->arch.hvm_vmx.hlt_timer, v->processor);
397 if ( hvm_apic_support(v->domain) && VLAPIC(v))
398 migrate_timer(&(VLAPIC(v)->vlapic_timer), v->processor);
399 }
401 struct vmx_store_cpu_guest_regs_callback_info {
402 struct vcpu *v;
403 struct cpu_user_regs *regs;
404 unsigned long *crs;
405 };
407 static void vmx_store_cpu_guest_regs(
408 struct vcpu *v, struct cpu_user_regs *regs, unsigned long *crs);
410 static void vmx_store_cpu_guest_regs_callback(void *data)
411 {
412 struct vmx_store_cpu_guest_regs_callback_info *info = data;
413 vmx_store_cpu_guest_regs(info->v, info->regs, info->crs);
414 }
416 static void vmx_store_cpu_guest_regs(
417 struct vcpu *v, struct cpu_user_regs *regs, unsigned long *crs)
418 {
419 if ( v != current )
420 {
421 /* Non-current VCPUs must be paused to get a register snapshot. */
422 ASSERT(atomic_read(&v->pausecnt) != 0);
424 if ( v->arch.hvm_vmx.launch_cpu != smp_processor_id() )
425 {
426 /* Get register details from remote CPU. */
427 struct vmx_store_cpu_guest_regs_callback_info info = {
428 .v = v, .regs = regs, .crs = crs };
429 cpumask_t cpumask = cpumask_of_cpu(v->arch.hvm_vmx.launch_cpu);
430 on_selected_cpus(cpumask, vmx_store_cpu_guest_regs_callback,
431 &info, 1, 1);
432 return;
433 }
435 /* Register details are on this CPU. Load the correct VMCS. */
436 __vmptrld(virt_to_maddr(v->arch.hvm_vmx.vmcs));
437 }
439 ASSERT(v->arch.hvm_vmx.launch_cpu == smp_processor_id());
441 if ( regs != NULL )
442 {
443 #if defined (__x86_64__)
444 __vmread(GUEST_RFLAGS, &regs->rflags);
445 __vmread(GUEST_SS_SELECTOR, &regs->ss);
446 __vmread(GUEST_CS_SELECTOR, &regs->cs);
447 __vmread(GUEST_DS_SELECTOR, &regs->ds);
448 __vmread(GUEST_ES_SELECTOR, &regs->es);
449 __vmread(GUEST_GS_SELECTOR, &regs->gs);
450 __vmread(GUEST_FS_SELECTOR, &regs->fs);
451 __vmread(GUEST_RIP, &regs->rip);
452 __vmread(GUEST_RSP, &regs->rsp);
453 #elif defined (__i386__)
454 __vmread(GUEST_RFLAGS, &regs->eflags);
455 __vmread(GUEST_SS_SELECTOR, &regs->ss);
456 __vmread(GUEST_CS_SELECTOR, &regs->cs);
457 __vmread(GUEST_DS_SELECTOR, &regs->ds);
458 __vmread(GUEST_ES_SELECTOR, &regs->es);
459 __vmread(GUEST_GS_SELECTOR, &regs->gs);
460 __vmread(GUEST_FS_SELECTOR, &regs->fs);
461 __vmread(GUEST_RIP, &regs->eip);
462 __vmread(GUEST_RSP, &regs->esp);
463 #endif
464 }
466 if ( crs != NULL )
467 {
468 __vmread(CR0_READ_SHADOW, &crs[0]);
469 __vmread(GUEST_CR3, &crs[3]);
470 __vmread(CR4_READ_SHADOW, &crs[4]);
471 }
473 /* Reload current VCPU's VMCS if it was temporarily unloaded. */
474 if ( (v != current) && hvm_guest(current) )
475 __vmptrld(virt_to_maddr(current->arch.hvm_vmx.vmcs));
476 }
478 void vmx_load_cpu_guest_regs(struct vcpu *v, struct cpu_user_regs *regs)
479 {
480 #if defined (__x86_64__)
481 __vmwrite(GUEST_SS_SELECTOR, regs->ss);
482 __vmwrite(GUEST_RSP, regs->rsp);
484 __vmwrite(GUEST_RFLAGS, regs->rflags);
485 if (regs->rflags & EF_TF)
486 __vm_set_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
487 else
488 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
490 __vmwrite(GUEST_CS_SELECTOR, regs->cs);
491 __vmwrite(GUEST_RIP, regs->rip);
492 #elif defined (__i386__)
493 __vmwrite(GUEST_SS_SELECTOR, regs->ss);
494 __vmwrite(GUEST_RSP, regs->esp);
496 __vmwrite(GUEST_RFLAGS, regs->eflags);
497 if (regs->eflags & EF_TF)
498 __vm_set_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
499 else
500 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
502 __vmwrite(GUEST_CS_SELECTOR, regs->cs);
503 __vmwrite(GUEST_RIP, regs->eip);
504 #else
505 #error Unsupported architecture
506 #endif
507 }
509 void vmx_modify_guest_state(struct vcpu *v)
510 {
511 modify_vmcs(&v->arch.hvm_vmx, &v->arch.guest_context.user_regs);
512 }
514 int vmx_realmode(struct vcpu *v)
515 {
516 unsigned long rflags;
518 __vmread(GUEST_RFLAGS, &rflags);
519 return rflags & X86_EFLAGS_VM;
520 }
522 int vmx_instruction_length(struct vcpu *v)
523 {
524 unsigned long inst_len;
526 if (__vmread(VM_EXIT_INSTRUCTION_LEN, &inst_len))
527 return 0;
528 return inst_len;
529 }
531 unsigned long vmx_get_ctrl_reg(struct vcpu *v, unsigned int num)
532 {
533 switch ( num )
534 {
535 case 0:
536 return v->arch.hvm_vmx.cpu_cr0;
537 case 2:
538 return v->arch.hvm_vmx.cpu_cr2;
539 case 3:
540 return v->arch.hvm_vmx.cpu_cr3;
541 default:
542 BUG();
543 }
544 return 0; /* dummy */
545 }
547 /* SMP VMX guest support */
548 void vmx_init_ap_context(struct vcpu_guest_context *ctxt,
549 int vcpuid, int trampoline_vector)
550 {
551 int i;
553 memset(ctxt, 0, sizeof(*ctxt));
555 /*
556 * Initial register values:
557 */
558 ctxt->user_regs.eip = VMXASSIST_BASE;
559 ctxt->user_regs.edx = vcpuid;
560 ctxt->user_regs.ebx = trampoline_vector;
562 ctxt->flags = VGCF_HVM_GUEST;
564 /* Virtual IDT is empty at start-of-day. */
565 for ( i = 0; i < 256; i++ )
566 {
567 ctxt->trap_ctxt[i].vector = i;
568 ctxt->trap_ctxt[i].cs = FLAT_KERNEL_CS;
569 }
571 /* No callback handlers. */
572 #if defined(__i386__)
573 ctxt->event_callback_cs = FLAT_KERNEL_CS;
574 ctxt->failsafe_callback_cs = FLAT_KERNEL_CS;
575 #endif
576 }
578 void do_nmi(struct cpu_user_regs *);
580 static int check_vmx_controls(u32 ctrls, u32 msr)
581 {
582 u32 vmx_msr_low, vmx_msr_high;
584 rdmsr(msr, vmx_msr_low, vmx_msr_high);
585 if ( (ctrls < vmx_msr_low) || (ctrls > vmx_msr_high) )
586 {
587 printk("Insufficient VMX capability 0x%x, "
588 "msr=0x%x,low=0x%8x,high=0x%x\n",
589 ctrls, msr, vmx_msr_low, vmx_msr_high);
590 return 0;
591 }
592 return 1;
593 }
595 int start_vmx(void)
596 {
597 struct vmcs_struct *vmcs;
598 u32 ecx;
599 u32 eax, edx;
600 u64 phys_vmcs; /* debugging */
602 /*
603 * Xen does not fill x86_capability words except 0.
604 */
605 ecx = cpuid_ecx(1);
606 boot_cpu_data.x86_capability[4] = ecx;
608 if (!(test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability)))
609 return 0;
611 rdmsr(IA32_FEATURE_CONTROL_MSR, eax, edx);
613 if (eax & IA32_FEATURE_CONTROL_MSR_LOCK) {
614 if ((eax & IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON) == 0x0) {
615 printk("VMX disabled by Feature Control MSR.\n");
616 return 0;
617 }
618 }
619 else {
620 wrmsr(IA32_FEATURE_CONTROL_MSR,
621 IA32_FEATURE_CONTROL_MSR_LOCK |
622 IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON, 0);
623 }
625 if (!check_vmx_controls(MONITOR_PIN_BASED_EXEC_CONTROLS,
626 MSR_IA32_VMX_PINBASED_CTLS_MSR))
627 return 0;
628 if (!check_vmx_controls(MONITOR_CPU_BASED_EXEC_CONTROLS,
629 MSR_IA32_VMX_PROCBASED_CTLS_MSR))
630 return 0;
631 if (!check_vmx_controls(MONITOR_VM_EXIT_CONTROLS,
632 MSR_IA32_VMX_EXIT_CTLS_MSR))
633 return 0;
634 if (!check_vmx_controls(MONITOR_VM_ENTRY_CONTROLS,
635 MSR_IA32_VMX_ENTRY_CTLS_MSR))
636 return 0;
638 set_in_cr4(X86_CR4_VMXE); /* Enable VMXE */
640 if (!(vmcs = alloc_vmcs())) {
641 printk("Failed to allocate VMCS\n");
642 return 0;
643 }
645 phys_vmcs = (u64) virt_to_maddr(vmcs);
647 if (!(__vmxon(phys_vmcs))) {
648 printk("VMXON is done\n");
649 }
651 vmx_save_init_msrs();
653 /* Setup HVM interfaces */
654 hvm_funcs.disable = stop_vmx;
656 hvm_funcs.initialize_guest_resources = vmx_initialize_guest_resources;
657 hvm_funcs.relinquish_guest_resources = vmx_relinquish_guest_resources;
659 hvm_funcs.store_cpu_guest_regs = vmx_store_cpu_guest_regs;
660 hvm_funcs.load_cpu_guest_regs = vmx_load_cpu_guest_regs;
662 hvm_funcs.modify_guest_state = vmx_modify_guest_state;
664 hvm_funcs.realmode = vmx_realmode;
665 hvm_funcs.paging_enabled = vmx_paging_enabled;
666 hvm_funcs.instruction_length = vmx_instruction_length;
667 hvm_funcs.get_guest_ctrl_reg = vmx_get_ctrl_reg;
669 hvm_funcs.init_ap_context = vmx_init_ap_context;
671 hvm_enabled = 1;
673 return 1;
674 }
676 /*
677 * Not all cases receive a valid value in the VM-exit instruction length field.
678 */
679 #define __get_instruction_length(len) \
680 __vmread(VM_EXIT_INSTRUCTION_LEN, &(len)); \
681 if ((len) < 1 || (len) > 15) \
682 __hvm_bug(&regs);
684 static void inline __update_guest_eip(unsigned long inst_len)
685 {
686 unsigned long current_eip;
688 __vmread(GUEST_RIP, &current_eip);
689 __vmwrite(GUEST_RIP, current_eip + inst_len);
690 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
691 }
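/* Handle a guest page fault at virtual address va: build the 1:1 direct map
 * or forward MMIO accesses to the device model, otherwise let the shadow
 * code resolve the fault. */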
694 static int vmx_do_page_fault(unsigned long va, struct cpu_user_regs *regs)
695 {
696 unsigned long gpa; /* FIXME: PAE */
697 int result;
699 #if 0 /* keep for debugging */
700 {
701 unsigned long eip;
703 __vmread(GUEST_RIP, &eip);
704 HVM_DBG_LOG(DBG_LEVEL_VMMU,
705 "vmx_do_page_fault = 0x%lx, eip = %lx, error_code = %lx",
706 va, eip, (unsigned long)regs->error_code);
707 }
708 #endif
710 if ( !vmx_paging_enabled(current) )
711 {
712 /* construct 1-to-1 direct mapping */
713 if ( shadow_direct_map_fault(va, regs) )
714 return 1;
716 handle_mmio(va, va);
717 TRACE_VMEXIT (2,2);
718 return 1;
719 }
720 gpa = gva_to_gpa(va);
722 /* Use 1:1 page table to identify MMIO address space */
723 if ( mmio_space(gpa) ){
724 struct vcpu *v = current;
725 /* No support for APIC */
726 if (!hvm_apic_support(v->domain) && gpa >= 0xFEC00000) {
727 u32 inst_len;
728 __vmread(VM_EXIT_INSTRUCTION_LEN, &(inst_len));
729 __update_guest_eip(inst_len);
730 return 1;
731 }
732 TRACE_VMEXIT (2,2);
733 handle_mmio(va, gpa);
734 return 1;
735 }
737 result = shadow_fault(va, regs);
738 TRACE_VMEXIT (2,result);
739 #if 0
740 if ( !result )
741 {
742 __vmread(GUEST_RIP, &eip);
743 printk("vmx pgfault to guest va=%lx eip=%lx\n", va, eip);
744 }
745 #endif
747 return result;
748 }
750 static void vmx_do_no_device_fault(void)
751 {
752 unsigned long cr0;
753 struct vcpu *v = current;
755 setup_fpu(current);
756 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
758 /* Disable TS in guest CR0 unless the guest wants the exception too. */
759 __vmread_vcpu(v, CR0_READ_SHADOW, &cr0);
760 if ( !(cr0 & X86_CR0_TS) )
761 {
762 __vmread_vcpu(v, GUEST_CR0, &cr0);
763 cr0 &= ~X86_CR0_TS;
764 __vmwrite(GUEST_CR0, cr0);
765 }
766 }
768 /* Reserved bits: [31:15], [12:11], [9], [6], [2:1] */
769 #define VMX_VCPU_CPUID_L1_RESERVED 0xffff9a46
771 static void vmx_vmexit_do_cpuid(struct cpu_user_regs *regs)
772 {
773 unsigned int input = (unsigned int)regs->eax;
774 unsigned int count = (unsigned int)regs->ecx;
775 unsigned int eax, ebx, ecx, edx;
776 unsigned long eip;
777 struct vcpu *v = current;
779 __vmread(GUEST_RIP, &eip);
781 HVM_DBG_LOG(DBG_LEVEL_3, "(eax) 0x%08lx, (ebx) 0x%08lx, "
782 "(ecx) 0x%08lx, (edx) 0x%08lx, (esi) 0x%08lx, (edi) 0x%08lx",
783 (unsigned long)regs->eax, (unsigned long)regs->ebx,
784 (unsigned long)regs->ecx, (unsigned long)regs->edx,
785 (unsigned long)regs->esi, (unsigned long)regs->edi);
787 if ( input == 4 )
788 cpuid_count(input, count, &eax, &ebx, &ecx, &edx);
789 else
790 cpuid(input, &eax, &ebx, &ecx, &edx);
792 if ( input == 1 )
793 {
794 if ( hvm_apic_support(v->domain) &&
795 !vlapic_global_enabled((VLAPIC(v))) )
796 clear_bit(X86_FEATURE_APIC, &edx);
798 #if CONFIG_PAGING_LEVELS < 3
799 clear_bit(X86_FEATURE_PAE, &edx);
800 clear_bit(X86_FEATURE_PSE, &edx);
801 clear_bit(X86_FEATURE_PSE36, &edx);
802 #else
803 if ( v->domain->arch.ops->guest_paging_levels == PAGING_L2 )
804 {
805 if ( !v->domain->arch.hvm_domain.pae_enabled )
806 clear_bit(X86_FEATURE_PAE, &edx);
807 clear_bit(X86_FEATURE_PSE, &edx);
808 clear_bit(X86_FEATURE_PSE36, &edx);
809 }
810 #endif
812 /* Unsupportable for virtualised CPUs. */
813 ecx &= ~VMX_VCPU_CPUID_L1_RESERVED; /* mask off reserved bits */
814 clear_bit(X86_FEATURE_VMXE & 31, &ecx);
815 clear_bit(X86_FEATURE_MWAIT & 31, &ecx);
816 }
817 #ifdef __i386__
818 else if ( input == 0x80000001 )
819 {
820 /* Mask feature for Intel ia32e or AMD long mode. */
821 clear_bit(X86_FEATURE_LM & 31, &edx);
822 }
823 #endif
825 regs->eax = (unsigned long) eax;
826 regs->ebx = (unsigned long) ebx;
827 regs->ecx = (unsigned long) ecx;
828 regs->edx = (unsigned long) edx;
830 HVM_DBG_LOG(DBG_LEVEL_3, "eip@%lx, input: 0x%lx, "
831 "output: eax = 0x%08lx, ebx = 0x%08lx, "
832 "ecx = 0x%08lx, edx = 0x%08lx",
833 (unsigned long)eip, (unsigned long)input,
834 (unsigned long)eax, (unsigned long)ebx,
835 (unsigned long)ecx, (unsigned long)edx);
836 }
838 #define CASE_GET_REG_P(REG, reg) \
839 case REG_ ## REG: reg_p = (unsigned long *)&(regs->reg); break
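/* Emulate a MOV to/from a debug register, as decoded from the exit
 * qualification. */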
841 static void vmx_dr_access (unsigned long exit_qualification, struct cpu_user_regs *regs)
842 {
843 unsigned int reg;
844 unsigned long *reg_p = 0;
845 struct vcpu *v = current;
846 unsigned long eip;
848 __vmread(GUEST_RIP, &eip);
850 reg = exit_qualification & DEBUG_REG_ACCESS_NUM;
852 HVM_DBG_LOG(DBG_LEVEL_1,
853 "vmx_dr_access : eip=%lx, reg=%d, exit_qualification = %lx",
854 eip, reg, exit_qualification);
856 switch(exit_qualification & DEBUG_REG_ACCESS_REG) {
857 CASE_GET_REG_P(EAX, eax);
858 CASE_GET_REG_P(ECX, ecx);
859 CASE_GET_REG_P(EDX, edx);
860 CASE_GET_REG_P(EBX, ebx);
861 CASE_GET_REG_P(EBP, ebp);
862 CASE_GET_REG_P(ESI, esi);
863 CASE_GET_REG_P(EDI, edi);
864 case REG_ESP:
865 break;
866 default:
867 __hvm_bug(regs);
868 }
870 switch (exit_qualification & DEBUG_REG_ACCESS_TYPE) {
871 case TYPE_MOV_TO_DR:
872 /* don't need to check the range */
873 if (reg != REG_ESP)
874 v->arch.guest_context.debugreg[reg] = *reg_p;
875 else {
876 unsigned long value;
877 __vmread(GUEST_RSP, &value);
878 v->arch.guest_context.debugreg[reg] = value;
879 }
880 break;
881 case TYPE_MOV_FROM_DR:
882 if (reg != REG_ESP)
883 *reg_p = v->arch.guest_context.debugreg[reg];
884 else {
885 __vmwrite(GUEST_RSP, v->arch.guest_context.debugreg[reg]);
886 }
887 break;
888 }
889 }
891 /*
892 * Invalidate the TLB for va. Invalidate the shadow page corresponding
893 * to the address va.
894 */
895 static void vmx_vmexit_do_invlpg(unsigned long va)
896 {
897 unsigned long eip;
898 struct vcpu *v = current;
900 __vmread(GUEST_RIP, &eip);
902 HVM_DBG_LOG(DBG_LEVEL_VMMU, "vmx_vmexit_do_invlpg: eip=%lx, va=%lx",
903 eip, va);
905 /*
906 * We do the safest thing first, then try to update the shadow
907 * copy from the guest
908 */
909 shadow_invlpg(v, va);
910 }
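/* Returns 1 if the segment register used by the I/O instruction at eip
 * (taking any override prefixes into account) holds a null selector. */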
912 static int check_for_null_selector(unsigned long eip)
913 {
914 unsigned char inst[MAX_INST_LEN];
915 unsigned long sel;
916 int i, inst_len;
917 int inst_copy_from_guest(unsigned char *, unsigned long, int);
919 __vmread(VM_EXIT_INSTRUCTION_LEN, &inst_len);
920 memset(inst, 0, MAX_INST_LEN);
921 if (inst_copy_from_guest(inst, eip, inst_len) != inst_len) {
922 printf("check_for_null_selector: get guest instruction failed\n");
923 domain_crash_synchronous();
924 }
926 for (i = 0; i < inst_len; i++) {
927 switch (inst[i]) {
928 case 0xf3: /* REPZ */
929 case 0xf2: /* REPNZ */
930 case 0xf0: /* LOCK */
931 case 0x66: /* data32 */
932 case 0x67: /* addr32 */
933 continue;
934 case 0x2e: /* CS */
935 __vmread(GUEST_CS_SELECTOR, &sel);
936 break;
937 case 0x36: /* SS */
938 __vmread(GUEST_SS_SELECTOR, &sel);
939 break;
940 case 0x26: /* ES */
941 __vmread(GUEST_ES_SELECTOR, &sel);
942 break;
943 case 0x64: /* FS */
944 __vmread(GUEST_FS_SELECTOR, &sel);
945 break;
946 case 0x65: /* GS */
947 __vmread(GUEST_GS_SELECTOR, &sel);
948 break;
949 case 0x3e: /* DS */
950 /* FALLTHROUGH */
951 default:
952 /* DS is the default */
953 __vmread(GUEST_DS_SELECTOR, &sel);
954 }
955 return sel == 0 ? 1 : 0;
956 }
958 return 0;
959 }
961 extern void send_pio_req(struct cpu_user_regs *regs, unsigned long port,
962 unsigned long count, int size, long value,
963 int dir, int pvalid);
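/* Handle an I/O-instruction VM exit: decode port, size, direction and any
 * string/REP prefixes from the exit qualification, then forward the request
 * to the device model via send_pio_req(). */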
965 static void vmx_io_instruction(struct cpu_user_regs *regs,
966 unsigned long exit_qualification, unsigned long inst_len)
967 {
968 struct mmio_op *mmio_opp;
969 unsigned long eip, cs, eflags;
970 unsigned long port, size, dir;
971 int vm86;
973 mmio_opp = &current->arch.hvm_vcpu.mmio_op;
974 mmio_opp->instr = INSTR_PIO;
975 mmio_opp->flags = 0;
977 __vmread(GUEST_RIP, &eip);
978 __vmread(GUEST_CS_SELECTOR, &cs);
979 __vmread(GUEST_RFLAGS, &eflags);
980 vm86 = eflags & X86_EFLAGS_VM ? 1 : 0;
982 HVM_DBG_LOG(DBG_LEVEL_IO,
983 "vmx_io_instruction: vm86 %d, eip=%lx:%lx, "
984 "exit_qualification = %lx",
985 vm86, cs, eip, exit_qualification);
987 if (test_bit(6, &exit_qualification))
988 port = (exit_qualification >> 16) & 0xFFFF;
989 else
990 port = regs->edx & 0xffff;
991 TRACE_VMEXIT(1, port);
992 size = (exit_qualification & 7) + 1;
993 dir = test_bit(3, &exit_qualification); /* direction */
995 if (test_bit(4, &exit_qualification)) { /* string instruction */
996 unsigned long addr, count = 1;
997 int sign = regs->eflags & EF_DF ? -1 : 1;
999 __vmread(GUEST_LINEAR_ADDRESS, &addr);
1001 /*
1002 * In protected mode, guest linear address is invalid if the
1003 * selector is null.
1004 */
1005 if (!vm86 && check_for_null_selector(eip))
1006 addr = dir == IOREQ_WRITE ? regs->esi : regs->edi;
1008 if (test_bit(5, &exit_qualification)) { /* "rep" prefix */
1009 mmio_opp->flags |= REPZ;
1010 count = vm86 ? regs->ecx & 0xFFFF : regs->ecx;
1013 /*
1014 * Handle string pio instructions that cross pages or that
1015 * are unaligned. See the comments in hvm_domain.c/handle_mmio()
1016 */
1017 if ((addr & PAGE_MASK) != ((addr + size - 1) & PAGE_MASK)) {
1018 unsigned long value = 0;
1020 mmio_opp->flags |= OVERLAP;
1021 if (dir == IOREQ_WRITE)
1022 hvm_copy(&value, addr, size, HVM_COPY_IN);
1023 send_pio_req(regs, port, 1, size, value, dir, 0);
1024 } else {
1025 if ((addr & PAGE_MASK) != ((addr + count * size - 1) & PAGE_MASK)) {
1026 if (sign > 0)
1027 count = (PAGE_SIZE - (addr & ~PAGE_MASK)) / size;
1028 else
1029 count = (addr & ~PAGE_MASK) / size;
1030 } else
1031 __update_guest_eip(inst_len);
1033 send_pio_req(regs, port, count, size, addr, dir, 1);
1035 } else {
1036 if (port == 0xe9 && dir == IOREQ_WRITE && size == 1)
1037 hvm_print_line(current, regs->eax); /* guest debug output */
1039 __update_guest_eip(inst_len);
1040 send_pio_req(regs, port, 1, size, regs->eax, dir, 0);
1044 int
1045 vmx_world_save(struct vcpu *v, struct vmx_assist_context *c)
1047 unsigned long inst_len;
1048 int error = 0;
1050 error |= __vmread(VM_EXIT_INSTRUCTION_LEN, &inst_len);
1051 error |= __vmread(GUEST_RIP, &c->eip);
1052 c->eip += inst_len; /* skip transition instruction */
1053 error |= __vmread(GUEST_RSP, &c->esp);
1054 error |= __vmread(GUEST_RFLAGS, &c->eflags);
1056 error |= __vmread(CR0_READ_SHADOW, &c->cr0);
1057 c->cr3 = v->arch.hvm_vmx.cpu_cr3;
1058 error |= __vmread(CR4_READ_SHADOW, &c->cr4);
1060 error |= __vmread(GUEST_IDTR_LIMIT, &c->idtr_limit);
1061 error |= __vmread(GUEST_IDTR_BASE, &c->idtr_base);
1063 error |= __vmread(GUEST_GDTR_LIMIT, &c->gdtr_limit);
1064 error |= __vmread(GUEST_GDTR_BASE, &c->gdtr_base);
1066 error |= __vmread(GUEST_CS_SELECTOR, &c->cs_sel);
1067 error |= __vmread(GUEST_CS_LIMIT, &c->cs_limit);
1068 error |= __vmread(GUEST_CS_BASE, &c->cs_base);
1069 error |= __vmread(GUEST_CS_AR_BYTES, &c->cs_arbytes.bytes);
1071 error |= __vmread(GUEST_DS_SELECTOR, &c->ds_sel);
1072 error |= __vmread(GUEST_DS_LIMIT, &c->ds_limit);
1073 error |= __vmread(GUEST_DS_BASE, &c->ds_base);
1074 error |= __vmread(GUEST_DS_AR_BYTES, &c->ds_arbytes.bytes);
1076 error |= __vmread(GUEST_ES_SELECTOR, &c->es_sel);
1077 error |= __vmread(GUEST_ES_LIMIT, &c->es_limit);
1078 error |= __vmread(GUEST_ES_BASE, &c->es_base);
1079 error |= __vmread(GUEST_ES_AR_BYTES, &c->es_arbytes.bytes);
1081 error |= __vmread(GUEST_SS_SELECTOR, &c->ss_sel);
1082 error |= __vmread(GUEST_SS_LIMIT, &c->ss_limit);
1083 error |= __vmread(GUEST_SS_BASE, &c->ss_base);
1084 error |= __vmread(GUEST_SS_AR_BYTES, &c->ss_arbytes.bytes);
1086 error |= __vmread(GUEST_FS_SELECTOR, &c->fs_sel);
1087 error |= __vmread(GUEST_FS_LIMIT, &c->fs_limit);
1088 error |= __vmread(GUEST_FS_BASE, &c->fs_base);
1089 error |= __vmread(GUEST_FS_AR_BYTES, &c->fs_arbytes.bytes);
1091 error |= __vmread(GUEST_GS_SELECTOR, &c->gs_sel);
1092 error |= __vmread(GUEST_GS_LIMIT, &c->gs_limit);
1093 error |= __vmread(GUEST_GS_BASE, &c->gs_base);
1094 error |= __vmread(GUEST_GS_AR_BYTES, &c->gs_arbytes.bytes);
1096 error |= __vmread(GUEST_TR_SELECTOR, &c->tr_sel);
1097 error |= __vmread(GUEST_TR_LIMIT, &c->tr_limit);
1098 error |= __vmread(GUEST_TR_BASE, &c->tr_base);
1099 error |= __vmread(GUEST_TR_AR_BYTES, &c->tr_arbytes.bytes);
1101 error |= __vmread(GUEST_LDTR_SELECTOR, &c->ldtr_sel);
1102 error |= __vmread(GUEST_LDTR_LIMIT, &c->ldtr_limit);
1103 error |= __vmread(GUEST_LDTR_BASE, &c->ldtr_base);
1104 error |= __vmread(GUEST_LDTR_AR_BYTES, &c->ldtr_arbytes.bytes);
1106 return !error;
1109 int
1110 vmx_world_restore(struct vcpu *v, struct vmx_assist_context *c)
1112 unsigned long mfn, old_cr4, old_base_mfn;
1113 int error = 0;
1115 error |= __vmwrite(GUEST_RIP, c->eip);
1116 error |= __vmwrite(GUEST_RSP, c->esp);
1117 error |= __vmwrite(GUEST_RFLAGS, c->eflags);
1119 error |= __vmwrite(CR0_READ_SHADOW, c->cr0);
1121 if (!vmx_paging_enabled(v)) {
1122 HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
1123 __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table));
1124 goto skip_cr3;
1127 if (c->cr3 == v->arch.hvm_vmx.cpu_cr3) {
1128 /*
1129 * This is a simple TLB flush, implying the guest has
1130 * removed some translation or changed page attributes.
1131 * We simply invalidate the shadow.
1132 */
1133 mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
1134 if (mfn != pagetable_get_pfn(v->arch.guest_table)) {
1135 printk("Invalid CR3 value=%x", c->cr3);
1136 domain_crash_synchronous();
1137 return 0;
1139 shadow_sync_all(v->domain);
1140 } else {
1141 /*
1142 * If different, make a shadow. Check if the PDBR is valid
1143 * first.
1144 */
1145 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %x", c->cr3);
1146 if ((c->cr3 >> PAGE_SHIFT) > v->domain->max_pages) {
1147 printk("Invalid CR3 value=%x", c->cr3);
1148 domain_crash_synchronous();
1149 return 0;
1151 mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
1152 if(!get_page(mfn_to_page(mfn), v->domain))
1153 return 0;
1154 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1155 v->arch.guest_table = mk_pagetable((u64)mfn << PAGE_SHIFT);
1156 if (old_base_mfn)
1157 put_page(mfn_to_page(old_base_mfn));
1158 /*
1159 * arch.shadow_table should now hold the next CR3 for shadow
1160 */
1161 v->arch.hvm_vmx.cpu_cr3 = c->cr3;
1162 update_pagetables(v);
1163 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3);
1164 __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
1167 skip_cr3:
1169 error |= __vmread(CR4_READ_SHADOW, &old_cr4);
1170 error |= __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
1171 error |= __vmwrite(CR4_READ_SHADOW, c->cr4);
1173 error |= __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
1174 error |= __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
1176 error |= __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
1177 error |= __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
1179 error |= __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
1180 error |= __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
1181 error |= __vmwrite(GUEST_CS_BASE, c->cs_base);
1182 error |= __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes.bytes);
1184 error |= __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
1185 error |= __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
1186 error |= __vmwrite(GUEST_DS_BASE, c->ds_base);
1187 error |= __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes.bytes);
1189 error |= __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
1190 error |= __vmwrite(GUEST_ES_LIMIT, c->es_limit);
1191 error |= __vmwrite(GUEST_ES_BASE, c->es_base);
1192 error |= __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes.bytes);
1194 error |= __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
1195 error |= __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
1196 error |= __vmwrite(GUEST_SS_BASE, c->ss_base);
1197 error |= __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes.bytes);
1199 error |= __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
1200 error |= __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
1201 error |= __vmwrite(GUEST_FS_BASE, c->fs_base);
1202 error |= __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes.bytes);
1204 error |= __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
1205 error |= __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
1206 error |= __vmwrite(GUEST_GS_BASE, c->gs_base);
1207 error |= __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes.bytes);
1209 error |= __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
1210 error |= __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
1211 error |= __vmwrite(GUEST_TR_BASE, c->tr_base);
1212 error |= __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes.bytes);
1214 error |= __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
1215 error |= __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
1216 error |= __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
1217 error |= __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes.bytes);
1219 return !error;
1222 enum { VMX_ASSIST_INVOKE = 0, VMX_ASSIST_RESTORE };
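/* Switch the guest into or out of the vmxassist real-mode emulator by
 * saving/restoring the contexts it keeps in guest memory. */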
1224 int
1225 vmx_assist(struct vcpu *v, int mode)
1227 struct vmx_assist_context c;
1228 u32 magic;
1229 u32 cp;
1231 /* make sure vmxassist exists (this is not an error) */
1232 if (!hvm_copy(&magic, VMXASSIST_MAGIC_OFFSET, sizeof(magic), HVM_COPY_IN))
1233 return 0;
1234 if (magic != VMXASSIST_MAGIC)
1235 return 0;
1237 switch (mode) {
1238 /*
1239 * Transfer control to vmxassist.
1240 * Store the current context in VMXASSIST_OLD_CONTEXT and load
1241 * the new VMXASSIST_NEW_CONTEXT context. This context was created
1242 * by vmxassist and will transfer control to it.
1243 */
1244 case VMX_ASSIST_INVOKE:
1245 /* save the old context */
1246 if (!hvm_copy(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp), HVM_COPY_IN))
1247 goto error;
1248 if (cp != 0) {
1249 if (!vmx_world_save(v, &c))
1250 goto error;
1251 if (!hvm_copy(&c, cp, sizeof(c), HVM_COPY_OUT))
1252 goto error;
1255 /* restore the new context, this should activate vmxassist */
1256 if (!hvm_copy(&cp, VMXASSIST_NEW_CONTEXT, sizeof(cp), HVM_COPY_IN))
1257 goto error;
1258 if (cp != 0) {
1259 if (!hvm_copy(&c, cp, sizeof(c), HVM_COPY_IN))
1260 goto error;
1261 if (!vmx_world_restore(v, &c))
1262 goto error;
1263 return 1;
1265 break;
1267 /*
1268 * Restore the VMXASSIST_OLD_CONTEXT that was saved by VMX_ASSIST_INVOKE
1269 * above.
1270 */
1271 case VMX_ASSIST_RESTORE:
1272 /* save the old context */
1273 if (!hvm_copy(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp), HVM_COPY_IN))
1274 goto error;
1275 if (cp != 0) {
1276 if (!hvm_copy(&c, cp, sizeof(c), HVM_COPY_IN))
1277 goto error;
1278 if (!vmx_world_restore(v, &c))
1279 goto error;
1280 return 1;
1282 break;
1285 error:
1286 printf("Failed to transfer to vmxassist\n");
1287 domain_crash_synchronous();
1288 return 0;
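/* Handle a guest write to CR0: update the real and shadow CR0, enable or
 * disable paging/long mode as requested, and fall back to vmxassist when
 * the guest clears CR0.PE. */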
1291 static int vmx_set_cr0(unsigned long value)
1293 struct vcpu *v = current;
1294 unsigned long mfn;
1295 unsigned long eip;
1296 int paging_enabled;
1297 unsigned long vm_entry_value;
1298 unsigned long old_cr0;
1300 /*
1301 * CR0: We don't want to lose PE and PG.
1302 */
1303 __vmread_vcpu(v, CR0_READ_SHADOW, &old_cr0);
1304 paging_enabled = (old_cr0 & X86_CR0_PE) && (old_cr0 & X86_CR0_PG);
1306 /* TS cleared? Then initialise FPU now. */
1307 if ( !(value & X86_CR0_TS) )
1309 setup_fpu(v);
1310 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
1313 __vmwrite(GUEST_CR0, value | X86_CR0_PE | X86_CR0_PG | X86_CR0_NE);
1314 __vmwrite(CR0_READ_SHADOW, value);
1316 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx\n", value);
1318 if ( (value & X86_CR0_PE) && (value & X86_CR0_PG) && !paging_enabled )
1320 /*
1321 * Trying to enable guest paging.
1322 * The guest CR3 must point to a guest physical address.
1323 */
1324 if ( !VALID_MFN(mfn = get_mfn_from_gpfn(
1325 v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)) ||
1326 !get_page(mfn_to_page(mfn), v->domain) )
1328 printk("Invalid CR3 value = %lx", v->arch.hvm_vmx.cpu_cr3);
1329 domain_crash_synchronous(); /* need to take a clean path */
1332 #if defined(__x86_64__)
1333 if ( test_bit(VMX_CPU_STATE_LME_ENABLED,
1334 &v->arch.hvm_vmx.cpu_state) &&
1335 !test_bit(VMX_CPU_STATE_PAE_ENABLED,
1336 &v->arch.hvm_vmx.cpu_state) )
1338 HVM_DBG_LOG(DBG_LEVEL_1, "Enable paging before PAE enabled\n");
1339 vmx_inject_exception(v, TRAP_gp_fault, 0);
1342 if ( test_bit(VMX_CPU_STATE_LME_ENABLED,
1343 &v->arch.hvm_vmx.cpu_state) )
1345 /* PAE should already be enabled at this point */
1346 HVM_DBG_LOG(DBG_LEVEL_1, "Enable long mode\n");
1347 set_bit(VMX_CPU_STATE_LMA_ENABLED,
1348 &v->arch.hvm_vmx.cpu_state);
1350 __vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
1351 vm_entry_value |= VM_ENTRY_CONTROLS_IA32E_MODE;
1352 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1354 if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L4) )
1356 printk("Unsupported guest paging levels\n");
1357 domain_crash_synchronous(); /* need to take a clean path */
1360 else
1361 #endif /* __x86_64__ */
1363 #if CONFIG_PAGING_LEVELS >= 3
1364 /* seems it's a 32-bit or 32-bit PAE guest */
1366 if ( test_bit(VMX_CPU_STATE_PAE_ENABLED,
1367 &v->arch.hvm_vmx.cpu_state) )
1369 /* The guest enabled PAE before enabling PG, so it really
1370 * is a PAE guest */
1371 if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) )
1373 printk("Unsupported guest paging levels\n");
1374 domain_crash_synchronous();
1377 else
1379 if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L2) )
1381 printk("Unsupported guest paging levels\n");
1382 domain_crash_synchronous(); /* need to take a clean path */
1385 #endif
1388 /*
1389 * Now arch.guest_table points to machine physical.
1390 */
1391 v->arch.guest_table = mk_pagetable((u64)mfn << PAGE_SHIFT);
1392 update_pagetables(v);
1394 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
1395 (unsigned long) (mfn << PAGE_SHIFT));
1397 __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
1398 /*
1399 * arch->shadow_table should hold the next CR3 for shadow
1400 */
1401 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
1402 v->arch.hvm_vmx.cpu_cr3, mfn);
1405 if ( !((value & X86_CR0_PE) && (value & X86_CR0_PG)) && paging_enabled )
1406 if ( v->arch.hvm_vmx.cpu_cr3 ) {
1407 put_page(mfn_to_page(get_mfn_from_gpfn(
1408 v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)));
1409 v->arch.guest_table = mk_pagetable(0);
1412 /*
1413 * VMX does not implement real-mode virtualization. We emulate
1414 * real-mode by performing a world switch to VMXAssist whenever
1415 * a partition disables the CR0.PE bit.
1416 */
1417 if ( (value & X86_CR0_PE) == 0 )
1419 if ( value & X86_CR0_PG ) {
1420 /* inject GP here */
1421 vmx_inject_exception(v, TRAP_gp_fault, 0);
1422 return 0;
1423 } else {
1424 /*
1425 * Disable paging here.
1426 * Same as PE == 1 && PG == 0
1427 */
1428 if ( test_bit(VMX_CPU_STATE_LMA_ENABLED,
1429 &v->arch.hvm_vmx.cpu_state) )
1431 clear_bit(VMX_CPU_STATE_LMA_ENABLED,
1432 &v->arch.hvm_vmx.cpu_state);
1433 __vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
1434 vm_entry_value &= ~VM_ENTRY_CONTROLS_IA32E_MODE;
1435 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1439 clear_all_shadow_status(v->domain);
1440 if ( vmx_assist(v, VMX_ASSIST_INVOKE) ) {
1441 set_bit(VMX_CPU_STATE_ASSIST_ENABLED, &v->arch.hvm_vmx.cpu_state);
1442 __vmread(GUEST_RIP, &eip);
1443 HVM_DBG_LOG(DBG_LEVEL_1,
1444 "Transfering control to vmxassist %%eip 0x%lx\n", eip);
1445 return 0; /* do not update eip! */
1447 } else if ( test_bit(VMX_CPU_STATE_ASSIST_ENABLED,
1448 &v->arch.hvm_vmx.cpu_state) )
1450 __vmread(GUEST_RIP, &eip);
1451 HVM_DBG_LOG(DBG_LEVEL_1,
1452 "Enabling CR0.PE at %%eip 0x%lx\n", eip);
1453 if ( vmx_assist(v, VMX_ASSIST_RESTORE) )
1455 clear_bit(VMX_CPU_STATE_ASSIST_ENABLED,
1456 &v->arch.hvm_vmx.cpu_state);
1457 __vmread(GUEST_RIP, &eip);
1458 HVM_DBG_LOG(DBG_LEVEL_1,
1459 "Restoring to %%eip 0x%lx\n", eip);
1460 return 0; /* do not update eip! */
1463 else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
1465 /* The guest enabled protected mode without paging; use the 1:1 phys table */
1466 clear_all_shadow_status(v->domain);
1467 __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table));
1470 return 1;
1473 #define CASE_GET_REG(REG, reg) \
1474 case REG_ ## REG: value = regs->reg; break
1476 #define CASE_EXTEND_SET_REG \
1477 CASE_EXTEND_REG(S)
1478 #define CASE_EXTEND_GET_REG \
1479 CASE_EXTEND_REG(G)
1481 #ifdef __i386__
1482 #define CASE_EXTEND_REG(T)
1483 #else
1484 #define CASE_EXTEND_REG(T) \
1485 CASE_ ## T ## ET_REG(R8, r8); \
1486 CASE_ ## T ## ET_REG(R9, r9); \
1487 CASE_ ## T ## ET_REG(R10, r10); \
1488 CASE_ ## T ## ET_REG(R11, r11); \
1489 CASE_ ## T ## ET_REG(R12, r12); \
1490 CASE_ ## T ## ET_REG(R13, r13); \
1491 CASE_ ## T ## ET_REG(R14, r14); \
1492 CASE_ ## T ## ET_REG(R15, r15);
1493 #endif
1496 /*
1497 * Write to control registers
1498 */
1499 static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
1501 unsigned long value;
1502 unsigned long old_cr;
1503 struct vcpu *v = current;
1505 switch (gp) {
1506 CASE_GET_REG(EAX, eax);
1507 CASE_GET_REG(ECX, ecx);
1508 CASE_GET_REG(EDX, edx);
1509 CASE_GET_REG(EBX, ebx);
1510 CASE_GET_REG(EBP, ebp);
1511 CASE_GET_REG(ESI, esi);
1512 CASE_GET_REG(EDI, edi);
1513 CASE_EXTEND_GET_REG
1514 case REG_ESP:
1515 __vmread(GUEST_RSP, &value);
1516 break;
1517 default:
1518 printk("invalid gp: %d\n", gp);
1519 __hvm_bug(regs);
1522 HVM_DBG_LOG(DBG_LEVEL_1, "mov_to_cr: CR%d, value = %lx,", cr, value);
1523 HVM_DBG_LOG(DBG_LEVEL_1, "current = %lx,", (unsigned long) current);
1525 switch(cr) {
1526 case 0:
1528 return vmx_set_cr0(value);
1530 case 3:
1532 unsigned long old_base_mfn, mfn;
1534 /*
1535 * If paging is not enabled yet, simply copy the value to CR3.
1536 */
1537 if (!vmx_paging_enabled(v)) {
1538 v->arch.hvm_vmx.cpu_cr3 = value;
1539 break;
1542 /*
1543 * We make a new one if the shadow does not exist.
1544 */
1545 if (value == v->arch.hvm_vmx.cpu_cr3) {
1546 /*
1547 * This is a simple TLB flush, implying the guest has
1548 * removed some translation or changed page attributes.
1549 * We simply invalidate the shadow.
1550 */
1551 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
1552 if (mfn != pagetable_get_pfn(v->arch.guest_table))
1553 __hvm_bug(regs);
1554 shadow_sync_all(v->domain);
1555 } else {
1556 /*
1557 * If different, make a shadow. Check if the PDBR is valid
1558 * first.
1559 */
1560 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
1561 if ( ((value >> PAGE_SHIFT) > v->domain->max_pages ) ||
1562 !VALID_MFN(mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT)) ||
1563 !get_page(mfn_to_page(mfn), v->domain) )
1565 printk("Invalid CR3 value=%lx", value);
1566 domain_crash_synchronous(); /* need to take a clean path */
1568 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1569 v->arch.guest_table = mk_pagetable((u64)mfn << PAGE_SHIFT);
1570 if (old_base_mfn)
1571 put_page(mfn_to_page(old_base_mfn));
1572 /*
1573 * arch.shadow_table should now hold the next CR3 for shadow
1574 */
1575 #if CONFIG_PAGING_LEVELS >= 3
1576 if ( v->domain->arch.ops->guest_paging_levels == PAGING_L3 )
1577 shadow_sync_all(v->domain);
1578 #endif
1580 v->arch.hvm_vmx.cpu_cr3 = value;
1581 update_pagetables(v);
1582 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx",
1583 value);
1584 __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
1586 break;
1588 case 4: /* CR4 */
1590 __vmread(CR4_READ_SHADOW, &old_cr);
1592 if ( value & X86_CR4_PAE && !(old_cr & X86_CR4_PAE) )
1594 set_bit(VMX_CPU_STATE_PAE_ENABLED, &v->arch.hvm_vmx.cpu_state);
1596 if ( vmx_pgbit_test(v) )
1598 /* The guest is a 32-bit PAE guest. */
1599 #if CONFIG_PAGING_LEVELS >= 4
1600 unsigned long mfn, old_base_mfn;
1602 if( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) )
1604 printk("Unsupported guest paging levels\n");
1605 domain_crash_synchronous(); /* need to take a clean path */
1608 if ( !VALID_MFN(mfn = get_mfn_from_gpfn(
1609 v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)) ||
1610 !get_page(mfn_to_page(mfn), v->domain) )
1612 printk("Invalid CR3 value = %lx", v->arch.hvm_vmx.cpu_cr3);
1613 domain_crash_synchronous(); /* need to take a clean path */
1616 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1617 if ( old_base_mfn )
1618 put_page(mfn_to_page(old_base_mfn));
1620 /*
1621 * Now arch.guest_table points to machine physical.
1622 */
1624 v->arch.guest_table = mk_pagetable((u64)mfn << PAGE_SHIFT);
1625 update_pagetables(v);
1627 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
1628 (unsigned long) (mfn << PAGE_SHIFT));
1630 __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
1632 /*
1633 * arch->shadow_table should hold the next CR3 for shadow
1634 */
1636 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
1637 v->arch.hvm_vmx.cpu_cr3, mfn);
1638 #endif
1640 else
1642 /* The guest is a 64 bit or 32-bit PAE guest. */
1643 #if CONFIG_PAGING_LEVELS >= 4
1644 if ( (v->domain->arch.ops != NULL) &&
1645 v->domain->arch.ops->guest_paging_levels == PAGING_L2)
1647 /* The guest enabled PAE without enabling PG; it must
1648 * enable PG after that, and it will then be a 32-bit PAE
1649 * guest */
1651 if ( !shadow_set_guest_paging_levels(v->domain,
1652 PAGING_L3) )
1654 printk("Unsupported guest paging levels\n");
1655 /* need to take a clean path */
1656 domain_crash_synchronous();
1659 else
1661 if ( !shadow_set_guest_paging_levels(v->domain,
1662 PAGING_L4) )
1664 printk("Unsupported guest paging levels\n");
1665 domain_crash_synchronous();
1668 #endif
1671 else if ( value & X86_CR4_PAE )
1672 set_bit(VMX_CPU_STATE_PAE_ENABLED, &v->arch.hvm_vmx.cpu_state);
1673 else
1675 if ( test_bit(VMX_CPU_STATE_LMA_ENABLED, &v->arch.hvm_vmx.cpu_state) )
1676 vmx_inject_exception(v, TRAP_gp_fault, 0);
1678 clear_bit(VMX_CPU_STATE_PAE_ENABLED, &v->arch.hvm_vmx.cpu_state);
1681 __vmwrite(GUEST_CR4, value| VMX_CR4_HOST_MASK);
1682 __vmwrite(CR4_READ_SHADOW, value);
1684 /*
1685 * Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates
1686 * all TLB entries except global entries.
1687 */
1688 if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
1689 shadow_sync_all(v->domain);
1691 break;
1693 default:
1694 printk("invalid cr: %d\n", gp);
1695 __hvm_bug(regs);
1698 return 1;
1701 #define CASE_SET_REG(REG, reg) \
1702 case REG_ ## REG: \
1703 regs->reg = value; \
1704 break
1706 /*
1707 * Read from control registers. CR0 and CR4 are read from the shadow.
1708 */
1709 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
1711 unsigned long value;
1712 struct vcpu *v = current;
1714 if (cr != 3)
1715 __hvm_bug(regs);
1717 value = (unsigned long) v->arch.hvm_vmx.cpu_cr3;
1719 switch (gp) {
1720 CASE_SET_REG(EAX, eax);
1721 CASE_SET_REG(ECX, ecx);
1722 CASE_SET_REG(EDX, edx);
1723 CASE_SET_REG(EBX, ebx);
1724 CASE_SET_REG(EBP, ebp);
1725 CASE_SET_REG(ESI, esi);
1726 CASE_SET_REG(EDI, edi);
1727 CASE_EXTEND_SET_REG
1728 case REG_ESP:
1729 __vmwrite(GUEST_RSP, value);
1730 regs->esp = value;
1731 break;
1732 default:
1733 printk("invalid gp: %d\n", gp);
1734 __hvm_bug(regs);
1737 HVM_DBG_LOG(DBG_LEVEL_VMMU, "mov_from_cr: CR%d, value = %lx,", cr, value);
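/* Dispatch a CR-access VM exit: MOV to/from CRn, CLTS or LMSW, as encoded
 * in the exit qualification. */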
1740 static int vmx_cr_access(unsigned long exit_qualification, struct cpu_user_regs *regs)
1742 unsigned int gp, cr;
1743 unsigned long value;
1744 struct vcpu *v = current;
1746 switch (exit_qualification & CONTROL_REG_ACCESS_TYPE) {
1747 case TYPE_MOV_TO_CR:
1748 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
1749 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
1750 TRACE_VMEXIT(1,TYPE_MOV_TO_CR);
1751 TRACE_VMEXIT(2,cr);
1752 TRACE_VMEXIT(3,gp);
1753 return mov_to_cr(gp, cr, regs);
1754 case TYPE_MOV_FROM_CR:
1755 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
1756 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
1757 TRACE_VMEXIT(1,TYPE_MOV_FROM_CR);
1758 TRACE_VMEXIT(2,cr);
1759 TRACE_VMEXIT(3,gp);
1760 mov_from_cr(cr, gp, regs);
1761 break;
1762 case TYPE_CLTS:
1763 TRACE_VMEXIT(1,TYPE_CLTS);
1765 /* We initialise the FPU now, to avoid needing another vmexit. */
1766 setup_fpu(v);
1767 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
1769 __vmread_vcpu(v, GUEST_CR0, &value);
1770 value &= ~X86_CR0_TS; /* clear TS */
1771 __vmwrite(GUEST_CR0, value);
1773 __vmread_vcpu(v, CR0_READ_SHADOW, &value);
1774 value &= ~X86_CR0_TS; /* clear TS */
1775 __vmwrite(CR0_READ_SHADOW, value);
1776 break;
1777 case TYPE_LMSW:
1778 TRACE_VMEXIT(1,TYPE_LMSW);
1779 __vmread_vcpu(v, CR0_READ_SHADOW, &value);
1780 value = (value & ~0xF) |
1781 (((exit_qualification & LMSW_SOURCE_DATA) >> 16) & 0xF);
1782 return vmx_set_cr0(value);
1783 break;
1784 default:
1785 __hvm_bug(regs);
1786 break;
1788 return 1;
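/* RDMSR exit handler: satisfy reads of the MSRs virtualised here (TSC,
 * SYSENTER, APIC base), then the long-mode MSRs, and finally fall back to
 * the physical MSR via rdmsr_safe(). */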
1791 static inline void vmx_do_msr_read(struct cpu_user_regs *regs)
1793 u64 msr_content = 0;
1794 struct vcpu *v = current;
1796 HVM_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_read: ecx=%lx, eax=%lx, edx=%lx",
1797 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1798 (unsigned long)regs->edx);
1799 switch (regs->ecx) {
1800 case MSR_IA32_TIME_STAMP_COUNTER:
1802 struct hvm_virpit *vpit;
1804 rdtscll(msr_content);
1805 vpit = &(v->domain->arch.hvm_domain.vpit);
1806 msr_content += vpit->cache_tsc_offset;
1807 break;
1809 case MSR_IA32_SYSENTER_CS:
1810 __vmread(GUEST_SYSENTER_CS, (u32 *)&msr_content);
1811 break;
1812 case MSR_IA32_SYSENTER_ESP:
1813 __vmread(GUEST_SYSENTER_ESP, &msr_content);
1814 break;
1815 case MSR_IA32_SYSENTER_EIP:
1816 __vmread(GUEST_SYSENTER_EIP, &msr_content);
1817 break;
1818 case MSR_IA32_APICBASE:
1819 msr_content = VLAPIC(v) ? VLAPIC(v)->apic_base_msr : 0;
1820 break;
1821 default:
1822 if(long_mode_do_msr_read(regs))
1823 return;
1824 rdmsr_safe(regs->ecx, regs->eax, regs->edx);
1825 break;
1828 regs->eax = msr_content & 0xFFFFFFFF;
1829 regs->edx = msr_content >> 32;
1831 HVM_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_read returns: "
1832 "ecx=%lx, eax=%lx, edx=%lx",
1833 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1834 (unsigned long)regs->edx);
1837 static inline void vmx_do_msr_write(struct cpu_user_regs *regs)
1839 u64 msr_content;
1840 struct vcpu *v = current;
1842 HVM_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_write: ecx=%lx, eax=%lx, edx=%lx",
1843 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1844 (unsigned long)regs->edx);
1846 msr_content = (regs->eax & 0xFFFFFFFF) | ((u64)regs->edx << 32);
1848 switch (regs->ecx) {
1849 case MSR_IA32_TIME_STAMP_COUNTER:
1850 set_guest_time(v, msr_content);
1851 break;
1852 case MSR_IA32_SYSENTER_CS:
1853 __vmwrite(GUEST_SYSENTER_CS, msr_content);
1854 break;
1855 case MSR_IA32_SYSENTER_ESP:
1856 __vmwrite(GUEST_SYSENTER_ESP, msr_content);
1857 break;
1858 case MSR_IA32_SYSENTER_EIP:
1859 __vmwrite(GUEST_SYSENTER_EIP, msr_content);
1860 break;
1861 case MSR_IA32_APICBASE:
1862 vlapic_msr_set(VLAPIC(v), msr_content);
1863 break;
1864 default:
1865 long_mode_do_msr_write(regs);
1866 break;
1869 HVM_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_write returns: "
1870 "ecx=%lx, eax=%lx, edx=%lx",
1871 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1872 (unsigned long)regs->edx);
1875 /*
1876 * Need to use this exit to reschedule
1877 */
1878 void vmx_vmexit_do_hlt(void)
1880 struct vcpu *v=current;
1881 struct hvm_virpit *vpit = &(v->domain->arch.hvm_domain.vpit);
1882 s_time_t next_pit=-1,next_wakeup;
1884 if ( !v->vcpu_id )
1885 next_pit = get_pit_scheduled(v,vpit);
1886 next_wakeup = get_apictime_scheduled(v);
1887 if ( (next_pit != -1 && next_pit < next_wakeup) || next_wakeup == -1 )
1888 next_wakeup = next_pit;
1889 if ( next_wakeup != - 1 )
1890 set_timer(&current->arch.hvm_vmx.hlt_timer, next_wakeup);
1891 hvm_safe_block();
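/* An external (host) interrupt caused the VM exit; dispatch it to the
 * appropriate Xen handler, much as the IDT would. */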
1894 static inline void vmx_vmexit_do_extint(struct cpu_user_regs *regs)
1896 unsigned int vector;
1897 int error;
1899 asmlinkage void do_IRQ(struct cpu_user_regs *);
1900 fastcall void smp_apic_timer_interrupt(struct cpu_user_regs *);
1901 fastcall void smp_event_check_interrupt(void);
1902 fastcall void smp_invalidate_interrupt(void);
1903 fastcall void smp_call_function_interrupt(void);
1904 fastcall void smp_spurious_interrupt(struct cpu_user_regs *regs);
1905 fastcall void smp_error_interrupt(struct cpu_user_regs *regs);
1906 #ifdef CONFIG_X86_MCE_P4THERMAL
1907 fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs);
1908 #endif
1910 if ((error = __vmread(VM_EXIT_INTR_INFO, &vector))
1911 && !(vector & INTR_INFO_VALID_MASK))
1912 __hvm_bug(regs);
1914 vector &= 0xff;
1915 local_irq_disable();
1916 TRACE_VMEXIT(1,vector);
1918 switch(vector) {
1919 case LOCAL_TIMER_VECTOR:
1920 smp_apic_timer_interrupt(regs);
1921 break;
1922 case EVENT_CHECK_VECTOR:
1923 smp_event_check_interrupt();
1924 break;
1925 case INVALIDATE_TLB_VECTOR:
1926 smp_invalidate_interrupt();
1927 break;
1928 case CALL_FUNCTION_VECTOR:
1929 smp_call_function_interrupt();
1930 break;
1931 case SPURIOUS_APIC_VECTOR:
1932 smp_spurious_interrupt(regs);
1933 break;
1934 case ERROR_APIC_VECTOR:
1935 smp_error_interrupt(regs);
1936 break;
1937 #ifdef CONFIG_X86_MCE_P4THERMAL
1938 case THERMAL_APIC_VECTOR:
1939 smp_thermal_interrupt(regs);
1940 break;
1941 #endif
1942 default:
1943 regs->entry_vector = vector;
1944 do_IRQ(regs);
1945 break;
1949 #if defined (__x86_64__)
1950 void store_cpu_user_regs(struct cpu_user_regs *regs)
1952 __vmread(GUEST_SS_SELECTOR, &regs->ss);
1953 __vmread(GUEST_RSP, &regs->rsp);
1954 __vmread(GUEST_RFLAGS, &regs->rflags);
1955 __vmread(GUEST_CS_SELECTOR, &regs->cs);
1956 __vmread(GUEST_DS_SELECTOR, &regs->ds);
1957 __vmread(GUEST_ES_SELECTOR, &regs->es);
1958 __vmread(GUEST_RIP, &regs->rip);
1960 #elif defined (__i386__)
1961 void store_cpu_user_regs(struct cpu_user_regs *regs)
1963 __vmread(GUEST_SS_SELECTOR, &regs->ss);
1964 __vmread(GUEST_RSP, &regs->esp);
1965 __vmread(GUEST_RFLAGS, &regs->eflags);
1966 __vmread(GUEST_CS_SELECTOR, &regs->cs);
1967 __vmread(GUEST_DS_SELECTOR, &regs->ds);
1968 __vmread(GUEST_ES_SELECTOR, &regs->es);
1969 __vmread(GUEST_RIP, &regs->eip);
1971 #endif
1973 #ifdef XEN_DEBUGGER
1974 void save_cpu_user_regs(struct cpu_user_regs *regs)
1976 __vmread(GUEST_SS_SELECTOR, &regs->xss);
1977 __vmread(GUEST_RSP, &regs->esp);
1978 __vmread(GUEST_RFLAGS, &regs->eflags);
1979 __vmread(GUEST_CS_SELECTOR, &regs->xcs);
1980 __vmread(GUEST_RIP, &regs->eip);
1982 __vmread(GUEST_GS_SELECTOR, &regs->xgs);
1983 __vmread(GUEST_FS_SELECTOR, &regs->xfs);
1984 __vmread(GUEST_ES_SELECTOR, &regs->xes);
1985 __vmread(GUEST_DS_SELECTOR, &regs->xds);
1988 void restore_cpu_user_regs(struct cpu_user_regs *regs)
1990 __vmwrite(GUEST_SS_SELECTOR, regs->xss);
1991 __vmwrite(GUEST_RSP, regs->esp);
1992 __vmwrite(GUEST_RFLAGS, regs->eflags);
1993 __vmwrite(GUEST_CS_SELECTOR, regs->xcs);
1994 __vmwrite(GUEST_RIP, regs->eip);
1996 __vmwrite(GUEST_GS_SELECTOR, regs->xgs);
1997 __vmwrite(GUEST_FS_SELECTOR, regs->xfs);
1998 __vmwrite(GUEST_ES_SELECTOR, regs->xes);
1999 __vmwrite(GUEST_DS_SELECTOR, regs->xds);
2000 }
2001 #endif
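/*
 * Main VM-exit dispatcher, entered from the low-level exit path with the
 * guest's general-purpose registers saved in 'regs'. Any event whose
 * delivery was interrupted by the exit is re-queued, then the handler
 * switches on the exit reason; handlers that emulate an instruction advance
 * the guest RIP by the VM-exit instruction length.
 */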
2003 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs regs)
2004 {
2005 unsigned int exit_reason, idtv_info_field;
2006 unsigned long exit_qualification, eip, inst_len = 0;
2007 struct vcpu *v = current;
2008 int error;
2010 if ((error = __vmread(VM_EXIT_REASON, &exit_reason)))
2011 __hvm_bug(&regs);
2013 perfc_incra(vmexits, exit_reason);
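/* If this exit interrupted the delivery of an event through the guest IDT,
 * re-queue that event (with its instruction length and error code, if any)
 * so it is delivered on the next VM entry. */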
2015 __vmread(IDT_VECTORING_INFO_FIELD, &idtv_info_field);
2016 if (idtv_info_field & INTR_INFO_VALID_MASK) {
2017 __vmwrite(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field);
2019 __vmread(VM_EXIT_INSTRUCTION_LEN, &inst_len);
2020 if (inst_len >= 1 && inst_len <= 15)
2021 __vmwrite(VM_ENTRY_INSTRUCTION_LEN, inst_len);
2023 if (idtv_info_field & 0x800) { /* valid error code */
2024 unsigned long error_code;
2025 __vmread(IDT_VECTORING_ERROR_CODE, &error_code);
2026 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
2027 }
2029 HVM_DBG_LOG(DBG_LEVEL_1, "idtv_info_field=%x", idtv_info_field);
2030 }
2032 /* Don't log hardware-interrupt, VMCALL or I/O-instruction exits; they are too frequent. */
2033 if (exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT &&
2034 exit_reason != EXIT_REASON_VMCALL &&
2035 exit_reason != EXIT_REASON_IO_INSTRUCTION)
2036 HVM_DBG_LOG(DBG_LEVEL_0, "exit reason = %x", exit_reason);
2038 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
2039 printk("Failed vm entry (exit reason 0x%x)\n", exit_reason);
2040 domain_crash_synchronous();
2041 return;
2042 }
2045 __vmread(GUEST_RIP, &eip);
2046 TRACE_VMEXIT(0,exit_reason);
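/* Dispatch on the exit reason; exits that the configured execution controls
 * should never generate (e.g. task switch, MWAIT) are treated as bugs. */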
2049 switch (exit_reason) {
2050 case EXIT_REASON_EXCEPTION_NMI:
2051 {
2052 /*
2053  * Software-interrupt (INT n) exiting is not enabled, so this exit is
2054  * caused by either (1) an exception (e.g. #PF) raised in the guest, or
2055  * (2) an NMI.
2056  */
2057 int error;
2058 unsigned int vector;
2059 unsigned long va;
2061 if ((error = __vmread(VM_EXIT_INTR_INFO, &vector))
2062 || !(vector & INTR_INFO_VALID_MASK))
2063 __hvm_bug(&regs);
2064 vector &= 0xff;
2066 TRACE_VMEXIT(1,vector);
2067 perfc_incra(cause_vector, vector);
2069 switch (vector) {
2070 #ifdef XEN_DEBUGGER
2071 case TRAP_debug:
2072 {
2073 save_cpu_user_regs(&regs);
2074 pdb_handle_exception(1, &regs, 1);
2075 restore_cpu_user_regs(&regs);
2076 break;
2077 }
2078 case TRAP_int3:
2079 {
2080 save_cpu_user_regs(&regs);
2081 pdb_handle_exception(3, &regs, 1);
2082 restore_cpu_user_regs(&regs);
2083 break;
2084 }
2085 #else
2086 case TRAP_debug:
2087 {
2088 void store_cpu_user_regs(struct cpu_user_regs *regs);
2090 store_cpu_user_regs(&regs);
2091 __vm_clear_bit(GUEST_PENDING_DBG_EXCEPTIONS, PENDING_DEBUG_EXC_BS);
2093 domain_pause_for_debugger();
2095 break;
2096 }
2097 #endif
2098 case TRAP_no_device:
2099 {
2100 vmx_do_no_device_fault();
2101 break;
2102 }
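/* Guest page fault: let vmx_do_page_fault() (shadow page-table / MMIO
 * handling) try to resolve it first; only if it reports the fault as the
 * guest's own is #PF injected back, with the original error code and the
 * faulting address recorded as the guest's CR2. */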
2103 case TRAP_page_fault:
2104 {
2105 __vmread(EXIT_QUALIFICATION, &va);
2106 __vmread(VM_EXIT_INTR_ERROR_CODE, &regs.error_code);
2108 TRACE_VMEXIT(3,regs.error_code);
2109 TRACE_VMEXIT(4,va);
2111 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2112 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
2113 (unsigned long)regs.eax, (unsigned long)regs.ebx,
2114 (unsigned long)regs.ecx, (unsigned long)regs.edx,
2115 (unsigned long)regs.esi, (unsigned long)regs.edi);
2116 v->arch.hvm_vcpu.mmio_op.inst_decoder_regs = &regs;
2118 if (!(error = vmx_do_page_fault(va, &regs))) {
2119 /*
2120 * Inject #PG using Interruption-Information Fields
2121 */
2122 vmx_inject_exception(v, TRAP_page_fault, regs.error_code);
2123 v->arch.hvm_vmx.cpu_cr2 = va;
2124 TRACE_3D(TRC_VMX_INT, v->domain->domain_id, TRAP_page_fault, va);
2125 }
2126 break;
2127 }
2128 case TRAP_nmi:
2129 do_nmi(&regs);
2130 break;
2131 default:
2132 vmx_reflect_exception(v);
2133 break;
2134 }
2135 break;
2136 }
2137 case EXIT_REASON_EXTERNAL_INTERRUPT:
2138 vmx_vmexit_do_extint(&regs);
2139 break;
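/* An interrupt-window exit indicates the guest can now accept an interrupt;
 * drop back to the baseline execution controls so this exit stops firing. */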
2140 case EXIT_REASON_PENDING_INTERRUPT:
2141 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2142 MONITOR_CPU_BASED_EXEC_CONTROLS);
2143 v->arch.hvm_vcpu.u.vmx.exec_control = MONITOR_CPU_BASED_EXEC_CONTROLS;
2144 break;
2145 case EXIT_REASON_TASK_SWITCH:
2146 __hvm_bug(&regs);
2147 break;
2148 case EXIT_REASON_CPUID:
2149 vmx_vmexit_do_cpuid(&regs);
2150 __get_instruction_length(inst_len);
2151 __update_guest_eip(inst_len);
2152 break;
2153 case EXIT_REASON_HLT:
2154 __get_instruction_length(inst_len);
2155 __update_guest_eip(inst_len);
2156 vmx_vmexit_do_hlt();
2157 break;
2158 case EXIT_REASON_INVLPG:
2159 {
2160 unsigned long va;
2162 __vmread(EXIT_QUALIFICATION, &va);
2163 vmx_vmexit_do_invlpg(va);
2164 __get_instruction_length(inst_len);
2165 __update_guest_eip(inst_len);
2166 break;
2167 }
2168 #if 0 /* keep this for debugging */
2169 case EXIT_REASON_VMCALL:
2170 __get_instruction_length(inst_len);
2171 __vmread(GUEST_RIP, &eip);
2172 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2174 hvm_print_line(v, regs.eax); /* provides the current domain */
2175 __update_guest_eip(inst_len);
2176 break;
2177 #endif
2178 case EXIT_REASON_CR_ACCESS:
2179 {
2180 __vmread(GUEST_RIP, &eip);
2181 __get_instruction_length(inst_len);
2182 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2184 HVM_DBG_LOG(DBG_LEVEL_1, "eip = %lx, inst_len = %lx, exit_qualification = %lx",
2185 eip, inst_len, exit_qualification);
2186 if (vmx_cr_access(exit_qualification, &regs))
2187 __update_guest_eip(inst_len);
2188 TRACE_VMEXIT(3,regs.error_code);
2189 TRACE_VMEXIT(4,exit_qualification);
2190 break;
2191 }
2192 case EXIT_REASON_DR_ACCESS:
2193 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2194 vmx_dr_access(exit_qualification, &regs);
2195 __get_instruction_length(inst_len);
2196 __update_guest_eip(inst_len);
2197 break;
2198 case EXIT_REASON_IO_INSTRUCTION:
2199 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2200 __get_instruction_length(inst_len);
2201 vmx_io_instruction(&regs, exit_qualification, inst_len);
2202 TRACE_VMEXIT(4,exit_qualification);
2203 break;
2204 case EXIT_REASON_MSR_READ:
2205 __get_instruction_length(inst_len);
2206 vmx_do_msr_read(&regs);
2207 __update_guest_eip(inst_len);
2208 break;
2209 case EXIT_REASON_MSR_WRITE:
2210 __vmread(GUEST_RIP, &eip);
2211 vmx_do_msr_write(&regs);
2212 __get_instruction_length(inst_len);
2213 __update_guest_eip(inst_len);
2214 break;
2215 case EXIT_REASON_MWAIT_INSTRUCTION:
2216 __hvm_bug(&regs);
2217 break;
2218 case EXIT_REASON_VMCALL:
2219 case EXIT_REASON_VMCLEAR:
2220 case EXIT_REASON_VMLAUNCH:
2221 case EXIT_REASON_VMPTRLD:
2222 case EXIT_REASON_VMPTRST:
2223 case EXIT_REASON_VMREAD:
2224 case EXIT_REASON_VMRESUME:
2225 case EXIT_REASON_VMWRITE:
2226 case EXIT_REASON_VMOFF:
2227 case EXIT_REASON_VMON:
2228 /* Report an invalid-opcode exception when a VMX guest tries to execute
2229  * any of the VMX instructions. */
2230 vmx_inject_exception(v, TRAP_invalid_op, VMX_DELIVER_NO_ERROR_CODE);
2231 break;
2233 default:
2234 __hvm_bug(&regs); /* should not happen */
2235 }
2236 }
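/*
 * vmx_load_cr2(): called on the VM-entry path. Disable interrupts and load
 * the guest's virtual CR2 value into the real register, since VMX hardware
 * does not save or restore CR2 across VM entry/exit.
 */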
2238 asmlinkage void vmx_load_cr2(void)
2239 {
2240 struct vcpu *v = current;
2242 local_irq_disable();
2243 #ifdef __i386__
2244 asm volatile("movl %0,%%cr2": :"r" (v->arch.hvm_vmx.cpu_cr2));
2245 #else
2246 asm volatile("movq %0,%%cr2": :"r" (v->arch.hvm_vmx.cpu_cr2));
2247 #endif
2248 }
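/*
 * Emit the per-CPU values recorded by TRACE_VMEXIT during the last exit as a
 * single VMENTRY trace record, then reset the slots to a sentinel value (9).
 */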
2250 asmlinkage void vmx_trace_vmentry (void)
2251 {
2252 TRACE_5D(TRC_VMX_VMENTRY,
2253 trace_values[smp_processor_id()][0],
2254 trace_values[smp_processor_id()][1],
2255 trace_values[smp_processor_id()][2],
2256 trace_values[smp_processor_id()][3],
2257 trace_values[smp_processor_id()][4]);
2258 TRACE_VMEXIT(0,9);
2259 TRACE_VMEXIT(1,9);
2260 TRACE_VMEXIT(2,9);
2261 TRACE_VMEXIT(3,9);
2262 TRACE_VMEXIT(4,9);
2263 return;
2264 }
2266 asmlinkage void vmx_trace_vmexit (void)
2267 {
2268 TRACE_3D(TRC_VMX_VMEXIT,0,0,0);
2269 return;
2270 }
2272 /*
2273 * Local variables:
2274 * mode: C
2275 * c-set-style: "BSD"
2276 * c-basic-offset: 4
2277 * tab-width: 4
2278 * indent-tabs-mode: nil
2279 * End:
2280 */