
xen/arch/x86/hvm/vmx/vmx.c @ 10526:5610d916ad1b

[HVM][VMX] Added flag_dr_dirty to hvm vcpu struct. If this flag is set,
save the debug registers, clear the flag, and remove guest access to
debug registers.
Signed-off-by: George Dunlap <dunlapg@umich.edu>
Signed-off-by: Nitin Kamble <nitin.a.kamble@intel.com>
author kaf24@firebug.cl.cam.ac.uk
date Tue Jun 27 09:51:18 2006 +0100 (2006-06-27)
parents 81bfa15a071e
children 9158ecb9045f
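The change is easiest to follow in vmx_save_dr()/vmx_restore_dr() below: debug registers are written back only when flag_dr_dirty says the guest actually touched them, after which the MOV-DR intercept is re-armed. As a rough, standalone illustration of that policy (a toy model -- the struct, function names and printf below are invented for this sketch and are not Xen code):

#include <stdio.h>

struct toy_vcpu {
    int flag_dr_dirty;          /* set when the guest writes a debug register */
    int mov_dr_exiting;         /* 1 = a guest MOV DR causes a VM exit */
    unsigned long debugreg[8];  /* saved DR0-DR7 image */
};

/* Mirrors the shape of vmx_save_dr(): save once, then re-arm the intercept. */
static void toy_ctxt_switch_from(struct toy_vcpu *v)
{
    if (v->flag_dr_dirty) {
        /* the real code calls savedebug() here for DR0-DR3 and DR6 */
        v->flag_dr_dirty = 0;
        v->mov_dr_exiting = 1;
        printf("debug registers saved\n");
    }
}

int main(void)
{
    struct toy_vcpu v = { .flag_dr_dirty = 1 };
    toy_ctxt_switch_from(&v);   /* saves state and clears the flag ... */
    toy_ctxt_switch_from(&v);   /* ... so this call is a no-op */
    return 0;
}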
line source
1 /*
2 * vmx.c: handling VMX architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 *
18 */
20 #include <xen/config.h>
21 #include <xen/init.h>
22 #include <xen/lib.h>
23 #include <xen/trace.h>
24 #include <xen/sched.h>
25 #include <xen/irq.h>
26 #include <xen/softirq.h>
27 #include <xen/domain_page.h>
28 #include <asm/current.h>
29 #include <asm/io.h>
30 #include <asm/shadow.h>
31 #include <asm/regs.h>
32 #include <asm/cpufeature.h>
33 #include <asm/processor.h>
34 #include <asm/types.h>
35 #include <asm/msr.h>
36 #include <asm/spinlock.h>
37 #include <asm/hvm/hvm.h>
38 #include <asm/hvm/support.h>
39 #include <asm/hvm/vmx/vmx.h>
40 #include <asm/hvm/vmx/vmcs.h>
41 #include <asm/hvm/vmx/cpu.h>
42 #include <asm/shadow.h>
43 #if CONFIG_PAGING_LEVELS >= 3
44 #include <asm/shadow_64.h>
45 #endif
46 #include <public/sched.h>
47 #include <public/hvm/ioreq.h>
48 #include <asm/hvm/vpic.h>
49 #include <asm/hvm/vlapic.h>
51 static unsigned long trace_values[NR_CPUS][5];
52 #define TRACE_VMEXIT(index,value) trace_values[smp_processor_id()][index]=value
54 static void vmx_ctxt_switch_from(struct vcpu *v);
55 static void vmx_ctxt_switch_to(struct vcpu *v);
57 void vmx_final_setup_guest(struct vcpu *v)
58 {
59 v->arch.schedule_tail = arch_vmx_do_launch;
60 v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
61 v->arch.ctxt_switch_to = vmx_ctxt_switch_to;
63 if ( v->vcpu_id == 0 )
64 {
65 struct domain *d = v->domain;
66 struct vcpu *vc;
68 /* Initialize monitor page table */
69 for_each_vcpu(d, vc)
70 vc->arch.monitor_table = pagetable_null();
72 /*
73 * Required to do this once per domain
74 * XXX todo: add a separate function to do these.
75 */
76 memset(&d->shared_info->evtchn_mask[0], 0xff,
77 sizeof(d->shared_info->evtchn_mask));
79 /* Put the domain in shadow mode even though we're going to be using
80 * the shared 1:1 page table initially. It shouldn't hurt */
81 shadow_mode_enable(d,
82 SHM_enable|SHM_refcounts|
83 SHM_translate|SHM_external|SHM_wr_pt_pte);
84 }
85 }
87 static void vmx_relinquish_guest_resources(struct domain *d)
88 {
89 struct vcpu *v;
91 for_each_vcpu ( d, v )
92 {
93 if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
94 continue;
95 vmx_destroy_vmcs(v);
96 free_monitor_pagetable(v);
97 kill_timer(&v->arch.hvm_vmx.hlt_timer);
98 if ( hvm_apic_support(v->domain) && (VLAPIC(v) != NULL) )
99 {
100 kill_timer(&VLAPIC(v)->vlapic_timer);
101 xfree(VLAPIC(v));
102 }
103 }
105 kill_timer(&d->arch.hvm_domain.pl_time.periodic_tm.timer);
107 if ( d->arch.hvm_domain.shared_page_va )
108 unmap_domain_page_global(
109 (void *)d->arch.hvm_domain.shared_page_va);
111 shadow_direct_map_clean(d);
112 }
114 #ifdef __x86_64__
116 static struct vmx_msr_state percpu_msr[NR_CPUS];
118 static u32 msr_data_index[VMX_MSR_COUNT] =
119 {
120 MSR_LSTAR, MSR_STAR, MSR_CSTAR,
121 MSR_SYSCALL_MASK, MSR_EFER,
122 };
124 static void vmx_save_segments(struct vcpu *v)
125 {
126 rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.msr_content.shadow_gs);
127 }
129 /*
130 * To avoid MSR save/restore at every VM exit/entry time, we restore
131 * the x86_64-specific MSRs at domain switch time. Since those MSRs
132 * are not modified once set for generic domains, we don't save them,
133 * but simply reset them to the values set at percpu_traps_init().
134 */
135 static void vmx_load_msrs(void)
136 {
137 struct vmx_msr_state *host_state = &percpu_msr[smp_processor_id()];
138 int i;
140 while ( host_state->flags )
141 {
142 i = find_first_set_bit(host_state->flags);
143 wrmsrl(msr_data_index[i], host_state->msr_items[i]);
144 clear_bit(i, &host_state->flags);
145 }
146 }
148 static void vmx_save_init_msrs(void)
149 {
150 struct vmx_msr_state *host_state = &percpu_msr[smp_processor_id()];
151 int i;
153 for ( i = 0; i < VMX_MSR_COUNT; i++ )
154 rdmsrl(msr_data_index[i], host_state->msr_items[i]);
155 }
157 #define CASE_READ_MSR(address) \
158 case MSR_ ## address: \
159 msr_content = msr->msr_items[VMX_INDEX_MSR_ ## address]; \
160 break
162 #define CASE_WRITE_MSR(address) \
163 case MSR_ ## address: \
164 { \
165 msr->msr_items[VMX_INDEX_MSR_ ## address] = msr_content; \
166 if (!test_bit(VMX_INDEX_MSR_ ## address, &msr->flags)) { \
167 set_bit(VMX_INDEX_MSR_ ## address, &msr->flags); \
168 } \
169 wrmsrl(MSR_ ## address, msr_content); \
170 set_bit(VMX_INDEX_MSR_ ## address, &host_state->flags); \
171 } \
172 break
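/* NB: stub that accepts every address as canonical; no sign-extension check is performed. */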
174 #define IS_CANO_ADDRESS(add) 1
175 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
176 {
177 u64 msr_content = 0;
178 struct vcpu *v = current;
179 struct vmx_msr_state *msr = &v->arch.hvm_vmx.msr_content;
181 switch ( regs->ecx ) {
182 case MSR_EFER:
183 HVM_DBG_LOG(DBG_LEVEL_2, "EFER msr_content 0x%"PRIx64, msr_content);
184 msr_content = msr->msr_items[VMX_INDEX_MSR_EFER];
186 /* the following code may not be needed */
187 if ( test_bit(VMX_CPU_STATE_LME_ENABLED, &v->arch.hvm_vmx.cpu_state) )
188 msr_content |= EFER_LME;
189 else
190 msr_content &= ~EFER_LME;
192 if ( VMX_LONG_GUEST(v) )
193 msr_content |= EFER_LMA;
194 else
195 msr_content &= ~EFER_LMA;
196 break;
198 case MSR_FS_BASE:
199 if ( !(VMX_LONG_GUEST(v)) )
200 /* XXX should this be a #GP fault? */
201 domain_crash_synchronous();
203 __vmread(GUEST_FS_BASE, &msr_content);
204 break;
206 case MSR_GS_BASE:
207 if ( !(VMX_LONG_GUEST(v)) )
208 domain_crash_synchronous();
210 __vmread(GUEST_GS_BASE, &msr_content);
211 break;
213 case MSR_SHADOW_GS_BASE:
214 msr_content = msr->shadow_gs;
215 break;
217 CASE_READ_MSR(STAR);
218 CASE_READ_MSR(LSTAR);
219 CASE_READ_MSR(CSTAR);
220 CASE_READ_MSR(SYSCALL_MASK);
222 default:
223 return 0;
224 }
226 HVM_DBG_LOG(DBG_LEVEL_2, "msr_content: 0x%"PRIx64, msr_content);
228 regs->eax = msr_content & 0xffffffff;
229 regs->edx = msr_content >> 32;
231 return 1;
232 }
234 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
235 {
236 u64 msr_content = regs->eax | ((u64)regs->edx << 32);
237 struct vcpu *v = current;
238 struct vmx_msr_state *msr = &v->arch.hvm_vmx.msr_content;
239 struct vmx_msr_state *host_state = &percpu_msr[smp_processor_id()];
241 HVM_DBG_LOG(DBG_LEVEL_1, "msr 0x%lx msr_content 0x%"PRIx64"\n",
242 (unsigned long)regs->ecx, msr_content);
244 switch ( regs->ecx ) {
245 case MSR_EFER:
246 /* offending reserved bit will cause #GP */
247 if ( msr_content & ~(EFER_LME | EFER_LMA | EFER_NX | EFER_SCE) )
248 {
249 printk("trying to set reserved bit in EFER\n");
250 vmx_inject_exception(v, TRAP_gp_fault, 0);
251 return 0;
252 }
254 /* LME: 0 -> 1 */
255 if ( msr_content & EFER_LME &&
256 !test_bit(VMX_CPU_STATE_LME_ENABLED, &v->arch.hvm_vmx.cpu_state) )
257 {
258 if ( vmx_paging_enabled(v) ||
259 !test_bit(VMX_CPU_STATE_PAE_ENABLED,
260 &v->arch.hvm_vmx.cpu_state) )
261 {
262 printk("trying to set LME bit when "
263 "in paging mode or PAE bit is not set\n");
264 vmx_inject_exception(v, TRAP_gp_fault, 0);
265 return 0;
266 }
268 set_bit(VMX_CPU_STATE_LME_ENABLED, &v->arch.hvm_vmx.cpu_state);
269 }
271 msr->msr_items[VMX_INDEX_MSR_EFER] = msr_content;
272 break;
274 case MSR_FS_BASE:
275 case MSR_GS_BASE:
276 if ( !(VMX_LONG_GUEST(v)) )
277 domain_crash_synchronous();
279 if ( !IS_CANO_ADDRESS(msr_content) )
280 {
281 HVM_DBG_LOG(DBG_LEVEL_1, "Not canonical address in MSR write\n");
282 vmx_inject_exception(v, TRAP_gp_fault, 0);
283 return 0;
284 }
286 if ( regs->ecx == MSR_FS_BASE )
287 __vmwrite(GUEST_FS_BASE, msr_content);
288 else
289 __vmwrite(GUEST_GS_BASE, msr_content);
291 break;
293 case MSR_SHADOW_GS_BASE:
294 if ( !(VMX_LONG_GUEST(v)) )
295 domain_crash_synchronous();
297 v->arch.hvm_vmx.msr_content.shadow_gs = msr_content;
298 wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
299 break;
301 CASE_WRITE_MSR(STAR);
302 CASE_WRITE_MSR(LSTAR);
303 CASE_WRITE_MSR(CSTAR);
304 CASE_WRITE_MSR(SYSCALL_MASK);
306 default:
307 return 0;
308 }
310 return 1;
311 }
313 static void vmx_restore_msrs(struct vcpu *v)
314 {
315 int i = 0;
316 struct vmx_msr_state *guest_state;
317 struct vmx_msr_state *host_state;
318 unsigned long guest_flags;
320 guest_state = &v->arch.hvm_vmx.msr_content;
321 host_state = &percpu_msr[smp_processor_id()];
323 wrmsrl(MSR_SHADOW_GS_BASE, guest_state->shadow_gs);
324 guest_flags = guest_state->flags;
325 if (!guest_flags)
326 return;
328 while (guest_flags){
329 i = find_first_set_bit(guest_flags);
331 HVM_DBG_LOG(DBG_LEVEL_2,
332 "restore guest's index %d msr %lx with %lx\n",
333 i, (unsigned long)msr_data_index[i],
334 (unsigned long)guest_state->msr_items[i]);
335 set_bit(i, &host_state->flags);
336 wrmsrl(msr_data_index[i], guest_state->msr_items[i]);
337 clear_bit(i, &guest_flags);
338 }
339 }
341 #else /* __i386__ */
343 #define vmx_save_segments(v) ((void)0)
344 #define vmx_load_msrs() ((void)0)
345 #define vmx_restore_msrs(v) ((void)0)
346 #define vmx_save_init_msrs() ((void)0)
348 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
349 {
350 return 0;
351 }
353 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
354 {
355 return 0;
356 }
358 #endif /* __i386__ */
360 #define loaddebug(_v,_reg) \
361 __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
362 #define savedebug(_v,_reg) \
363 __asm__ __volatile__ ("mov %%db" #_reg ",%0" : "=r" ((_v)->debugreg[_reg]))
365 static inline void vmx_save_dr(struct vcpu *v)
366 {
367 if ( v->arch.hvm_vcpu.flag_dr_dirty )
368 {
369 savedebug(&v->arch.guest_context, 0);
370 savedebug(&v->arch.guest_context, 1);
371 savedebug(&v->arch.guest_context, 2);
372 savedebug(&v->arch.guest_context, 3);
373 savedebug(&v->arch.guest_context, 6);
375 v->arch.hvm_vcpu.flag_dr_dirty = 0;
377 v->arch.hvm_vcpu.u.vmx.exec_control |= CPU_BASED_MOV_DR_EXITING;
378 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
379 v->arch.hvm_vcpu.u.vmx.exec_control);
380 }
381 }
383 static inline void __restore_debug_registers(struct vcpu *v)
384 {
385 loaddebug(&v->arch.guest_context, 0);
386 loaddebug(&v->arch.guest_context, 1);
387 loaddebug(&v->arch.guest_context, 2);
388 loaddebug(&v->arch.guest_context, 3);
389 /* No DR4 or DR5. */
390 loaddebug(&v->arch.guest_context, 6);
391 /* DR7 is loaded from the vmcs. */
392 }
394 /*
395 * DR7 is saved and restored on every vmexit. Other debug registers only
396 * need to be restored if their value is going to affect execution -- i.e.,
397 * if one of the breakpoints is enabled. So mask out all bits that don't
398 * enable some breakpoint functionality.
399 *
400 * This is in part necessary because bit 10 of DR7 is hardwired to 1, so a
401 * simple if( guest_dr7 ) will always return true. As long as we're masking,
402 * we might as well do it right.
403 */
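/* Bits 0-7 of DR7 are the L0/G0 .. L3/G3 breakpoint-enable bits. */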
404 #define DR7_ACTIVE_MASK 0xff
406 static inline void vmx_restore_dr(struct vcpu *v)
407 {
408 unsigned long guest_dr7;
410 __vmread(GUEST_DR7, &guest_dr7);
412 /* Assumes guest does not have DR access at time of context switch. */
413 if ( unlikely(guest_dr7 & DR7_ACTIVE_MASK) )
414 __restore_debug_registers(v);
415 }
417 static void vmx_freeze_time(struct vcpu *v)
418 {
419 struct periodic_time *pt=&v->domain->arch.hvm_domain.pl_time.periodic_tm;
421 if ( pt->enabled && pt->first_injected && !v->arch.hvm_vcpu.guest_time ) {
422 v->arch.hvm_vcpu.guest_time = hvm_get_guest_time(v);
423 stop_timer(&(pt->timer));
424 }
425 }
427 static void vmx_ctxt_switch_from(struct vcpu *v)
428 {
429 vmx_freeze_time(v);
430 vmx_save_segments(v);
431 vmx_load_msrs();
432 vmx_save_dr(v);
433 }
435 static void vmx_ctxt_switch_to(struct vcpu *v)
436 {
437 vmx_restore_msrs(v);
438 vmx_restore_dr(v);
439 }
441 void stop_vmx(void)
442 {
443 if (read_cr4() & X86_CR4_VMXE)
444 __vmxoff();
445 }
447 int vmx_initialize_guest_resources(struct vcpu *v)
448 {
449 vmx_final_setup_guest(v);
450 return 1;
451 }
453 void vmx_migrate_timers(struct vcpu *v)
454 {
455 struct periodic_time *pt = &(v->domain->arch.hvm_domain.pl_time.periodic_tm);
457 if ( pt->enabled ) {
458 migrate_timer(&pt->timer, v->processor);
459 migrate_timer(&v->arch.hvm_vmx.hlt_timer, v->processor);
460 }
461 if ( hvm_apic_support(v->domain) && VLAPIC(v))
462 migrate_timer(&(VLAPIC(v)->vlapic_timer), v->processor);
463 }
465 static void vmx_store_cpu_guest_regs(
466 struct vcpu *v, struct cpu_user_regs *regs, unsigned long *crs)
467 {
468 vmx_vmcs_enter(v);
470 if ( regs != NULL )
471 {
472 __vmread(GUEST_RFLAGS, &regs->eflags);
473 __vmread(GUEST_SS_SELECTOR, &regs->ss);
474 __vmread(GUEST_CS_SELECTOR, &regs->cs);
475 __vmread(GUEST_DS_SELECTOR, &regs->ds);
476 __vmread(GUEST_ES_SELECTOR, &regs->es);
477 __vmread(GUEST_GS_SELECTOR, &regs->gs);
478 __vmread(GUEST_FS_SELECTOR, &regs->fs);
479 __vmread(GUEST_RIP, &regs->eip);
480 __vmread(GUEST_RSP, &regs->esp);
481 }
483 if ( crs != NULL )
484 {
485 __vmread(CR0_READ_SHADOW, &crs[0]);
486 __vmread(GUEST_CR3, &crs[3]);
487 __vmread(CR4_READ_SHADOW, &crs[4]);
488 }
490 vmx_vmcs_exit(v);
491 }
493 /*
494 * The VMX spec (section 4.3.1.2, Checks on Guest Segment
495 * Registers) says that virtual-8086 mode guests' segment
496 * base-address fields in the VMCS must be equal to their
497 * corresponding segment selector field shifted right by
498 * four bits upon vmentry.
499 *
500 * This function (called only for VM86-mode guests) fixes
501 * the bases to be consistent with the selectors in regs
502 * if they're not already. Without this, we can fail the
503 * vmentry check mentioned above.
504 */
505 static void fixup_vm86_seg_bases(struct cpu_user_regs *regs)
506 {
507 int err = 0;
508 unsigned long base;
510 err |= __vmread(GUEST_ES_BASE, &base);
511 if (regs->es << 4 != base)
512 err |= __vmwrite(GUEST_ES_BASE, regs->es << 4);
513 err |= __vmread(GUEST_CS_BASE, &base);
514 if (regs->cs << 4 != base)
515 err |= __vmwrite(GUEST_CS_BASE, regs->cs << 4);
516 err |= __vmread(GUEST_SS_BASE, &base);
517 if (regs->ss << 4 != base)
518 err |= __vmwrite(GUEST_SS_BASE, regs->ss << 4);
519 err |= __vmread(GUEST_DS_BASE, &base);
520 if (regs->ds << 4 != base)
521 err |= __vmwrite(GUEST_DS_BASE, regs->ds << 4);
522 err |= __vmread(GUEST_FS_BASE, &base);
523 if (regs->fs << 4 != base)
524 err |= __vmwrite(GUEST_FS_BASE, regs->fs << 4);
525 err |= __vmread(GUEST_GS_BASE, &base);
526 if (regs->gs << 4 != base)
527 err |= __vmwrite(GUEST_GS_BASE, regs->gs << 4);
529 BUG_ON(err);
530 }
532 void vmx_load_cpu_guest_regs(struct vcpu *v, struct cpu_user_regs *regs)
533 {
534 vmx_vmcs_enter(v);
536 __vmwrite(GUEST_SS_SELECTOR, regs->ss);
537 __vmwrite(GUEST_DS_SELECTOR, regs->ds);
538 __vmwrite(GUEST_ES_SELECTOR, regs->es);
539 __vmwrite(GUEST_GS_SELECTOR, regs->gs);
540 __vmwrite(GUEST_FS_SELECTOR, regs->fs);
542 __vmwrite(GUEST_RSP, regs->esp);
544 __vmwrite(GUEST_RFLAGS, regs->eflags);
545 if (regs->eflags & EF_TF)
546 __vm_set_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
547 else
548 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
549 if (regs->eflags & EF_VM)
550 fixup_vm86_seg_bases(regs);
552 __vmwrite(GUEST_CS_SELECTOR, regs->cs);
553 __vmwrite(GUEST_RIP, regs->eip);
555 vmx_vmcs_exit(v);
556 }
558 int vmx_realmode(struct vcpu *v)
559 {
560 unsigned long rflags;
562 __vmread(GUEST_RFLAGS, &rflags);
563 return rflags & X86_EFLAGS_VM;
564 }
566 int vmx_instruction_length(struct vcpu *v)
567 {
568 unsigned long inst_len;
570 if (__vmread(VM_EXIT_INSTRUCTION_LEN, &inst_len))
571 return 0;
572 return inst_len;
573 }
575 unsigned long vmx_get_ctrl_reg(struct vcpu *v, unsigned int num)
576 {
577 switch ( num )
578 {
579 case 0:
580 return v->arch.hvm_vmx.cpu_cr0;
581 case 2:
582 return v->arch.hvm_vmx.cpu_cr2;
583 case 3:
584 return v->arch.hvm_vmx.cpu_cr3;
585 default:
586 BUG();
587 }
588 return 0; /* dummy */
589 }
591 /* SMP VMX guest support */
592 void vmx_init_ap_context(struct vcpu_guest_context *ctxt,
593 int vcpuid, int trampoline_vector)
594 {
595 int i;
597 memset(ctxt, 0, sizeof(*ctxt));
599 /*
600 * Initial register values:
601 */
602 ctxt->user_regs.eip = VMXASSIST_BASE;
603 ctxt->user_regs.edx = vcpuid;
604 ctxt->user_regs.ebx = trampoline_vector;
606 ctxt->flags = VGCF_HVM_GUEST;
608 /* Virtual IDT is empty at start-of-day. */
609 for ( i = 0; i < 256; i++ )
610 {
611 ctxt->trap_ctxt[i].vector = i;
612 ctxt->trap_ctxt[i].cs = FLAT_KERNEL_CS;
613 }
615 /* No callback handlers. */
616 #if defined(__i386__)
617 ctxt->event_callback_cs = FLAT_KERNEL_CS;
618 ctxt->failsafe_callback_cs = FLAT_KERNEL_CS;
619 #endif
620 }
622 void do_nmi(struct cpu_user_regs *);
624 static int check_vmx_controls(u32 ctrls, u32 msr)
625 {
626 u32 vmx_msr_low, vmx_msr_high;
628 rdmsr(msr, vmx_msr_low, vmx_msr_high);
629 if ( (ctrls < vmx_msr_low) || (ctrls > vmx_msr_high) )
630 {
631 printk("Insufficient VMX capability 0x%x, "
632 "msr=0x%x,low=0x%8x,high=0x%x\n",
633 ctrls, msr, vmx_msr_low, vmx_msr_high);
634 return 0;
635 }
636 return 1;
637 }
639 int start_vmx(void)
640 {
641 struct vmcs_struct *vmcs;
642 u32 ecx;
643 u32 eax, edx;
644 u64 phys_vmcs; /* debugging */
646 /*
647 * Xen does not fill x86_capability words except word 0.
648 */
649 ecx = cpuid_ecx(1);
650 boot_cpu_data.x86_capability[4] = ecx;
652 if (!(test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability)))
653 return 0;
655 rdmsr(IA32_FEATURE_CONTROL_MSR, eax, edx);
657 if (eax & IA32_FEATURE_CONTROL_MSR_LOCK) {
658 if ((eax & IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON) == 0x0) {
659 printk("VMX disabled by Feature Control MSR.\n");
660 return 0;
661 }
662 }
663 else {
664 wrmsr(IA32_FEATURE_CONTROL_MSR,
665 IA32_FEATURE_CONTROL_MSR_LOCK |
666 IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON, 0);
667 }
669 if (!check_vmx_controls(MONITOR_PIN_BASED_EXEC_CONTROLS,
670 MSR_IA32_VMX_PINBASED_CTLS_MSR))
671 return 0;
672 if (!check_vmx_controls(MONITOR_CPU_BASED_EXEC_CONTROLS,
673 MSR_IA32_VMX_PROCBASED_CTLS_MSR))
674 return 0;
675 if (!check_vmx_controls(MONITOR_VM_EXIT_CONTROLS,
676 MSR_IA32_VMX_EXIT_CTLS_MSR))
677 return 0;
678 if (!check_vmx_controls(MONITOR_VM_ENTRY_CONTROLS,
679 MSR_IA32_VMX_ENTRY_CTLS_MSR))
680 return 0;
682 set_in_cr4(X86_CR4_VMXE); /* Enable VMXE */
684 if (!(vmcs = vmx_alloc_vmcs())) {
685 printk("Failed to allocate VMCS\n");
686 return 0;
687 }
689 phys_vmcs = (u64) virt_to_maddr(vmcs);
691 if (__vmxon(phys_vmcs)) {
692 printk("VMXON failed\n");
693 return 0;
694 }
696 printk("VMXON is done\n");
698 vmx_save_init_msrs();
700 /* Setup HVM interfaces */
701 hvm_funcs.disable = stop_vmx;
703 hvm_funcs.initialize_guest_resources = vmx_initialize_guest_resources;
704 hvm_funcs.relinquish_guest_resources = vmx_relinquish_guest_resources;
706 hvm_funcs.store_cpu_guest_regs = vmx_store_cpu_guest_regs;
707 hvm_funcs.load_cpu_guest_regs = vmx_load_cpu_guest_regs;
709 hvm_funcs.realmode = vmx_realmode;
710 hvm_funcs.paging_enabled = vmx_paging_enabled;
711 hvm_funcs.instruction_length = vmx_instruction_length;
712 hvm_funcs.get_guest_ctrl_reg = vmx_get_ctrl_reg;
714 hvm_funcs.init_ap_context = vmx_init_ap_context;
716 hvm_enabled = 1;
718 return 1;
719 }
721 /*
722 * Not all exit cases provide a valid value in the VM-exit instruction length field.
723 */
724 #define __get_instruction_length(len) \
725 __vmread(VM_EXIT_INSTRUCTION_LEN, &(len)); \
726 if ((len) < 1 || (len) > 15) \
727 __hvm_bug(&regs);
729 static void inline __update_guest_eip(unsigned long inst_len)
730 {
731 unsigned long current_eip;
733 __vmread(GUEST_RIP, &current_eip);
734 __vmwrite(GUEST_RIP, current_eip + inst_len);
735 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
736 }
739 static int vmx_do_page_fault(unsigned long va, struct cpu_user_regs *regs)
740 {
741 unsigned long gpa; /* FIXME: PAE */
742 int result;
744 #if 0 /* keep for debugging */
745 {
746 unsigned long eip;
748 __vmread(GUEST_RIP, &eip);
749 HVM_DBG_LOG(DBG_LEVEL_VMMU,
750 "vmx_do_page_fault = 0x%lx, eip = %lx, error_code = %lx",
751 va, eip, (unsigned long)regs->error_code);
752 }
753 #endif
755 if ( !vmx_paging_enabled(current) )
756 {
757 /* construct 1-to-1 direct mapping */
758 if ( shadow_direct_map_fault(va, regs) )
759 return 1;
761 handle_mmio(va, va);
762 TRACE_VMEXIT (2,2);
763 return 1;
764 }
765 gpa = gva_to_gpa(va);
767 /* Use 1:1 page table to identify MMIO address space */
768 if ( mmio_space(gpa) ){
769 struct vcpu *v = current;
770 /* No support for APIC */
771 if (!hvm_apic_support(v->domain) && gpa >= 0xFEC00000) {
772 u32 inst_len;
773 __vmread(VM_EXIT_INSTRUCTION_LEN, &(inst_len));
774 __update_guest_eip(inst_len);
775 return 1;
776 }
777 TRACE_VMEXIT (2,2);
778 handle_mmio(va, gpa);
779 return 1;
780 }
782 result = shadow_fault(va, regs);
783 TRACE_VMEXIT (2,result);
784 #if 0
785 if ( !result )
786 {
787 __vmread(GUEST_RIP, &eip);
788 printk("vmx pgfault to guest va=%lx eip=%lx\n", va, eip);
789 }
790 #endif
792 return result;
793 }
795 static void vmx_do_no_device_fault(void)
796 {
797 unsigned long cr0;
798 struct vcpu *v = current;
800 setup_fpu(current);
801 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
803 /* Disable TS in guest CR0 unless the guest wants the exception too. */
804 __vmread_vcpu(v, CR0_READ_SHADOW, &cr0);
805 if ( !(cr0 & X86_CR0_TS) )
806 {
807 __vmread_vcpu(v, GUEST_CR0, &cr0);
808 cr0 &= ~X86_CR0_TS;
809 __vmwrite(GUEST_CR0, cr0);
810 }
811 }
813 #define bitmaskof(idx) (1U << ((idx)&31))
814 static void vmx_vmexit_do_cpuid(struct cpu_user_regs *regs)
815 {
816 unsigned int input = (unsigned int)regs->eax;
817 unsigned int count = (unsigned int)regs->ecx;
818 unsigned int eax, ebx, ecx, edx;
819 unsigned long eip;
820 struct vcpu *v = current;
822 __vmread(GUEST_RIP, &eip);
824 HVM_DBG_LOG(DBG_LEVEL_3, "(eax) 0x%08lx, (ebx) 0x%08lx, "
825 "(ecx) 0x%08lx, (edx) 0x%08lx, (esi) 0x%08lx, (edi) 0x%08lx",
826 (unsigned long)regs->eax, (unsigned long)regs->ebx,
827 (unsigned long)regs->ecx, (unsigned long)regs->edx,
828 (unsigned long)regs->esi, (unsigned long)regs->edi);
830 if ( input == CPUID_LEAF_0x4 )
831 {
832 cpuid_count(input, count, &eax, &ebx, &ecx, &edx);
833 eax &= NUM_CORES_RESET_MASK;
834 }
835 else
836 {
837 cpuid(input, &eax, &ebx, &ecx, &edx);
839 if ( input == CPUID_LEAF_0x1 )
840 {
841 /* mask off reserved bits */
842 ecx &= ~VMX_VCPU_CPUID_L1_ECX_RESERVED;
844 if ( !hvm_apic_support(v->domain) ||
845 !vlapic_global_enabled((VLAPIC(v))) )
846 {
847 /* Since the apic is disabled, avoid any
848 confusion about SMP cpus being available */
850 clear_bit(X86_FEATURE_APIC, &edx);
851 }
853 #if CONFIG_PAGING_LEVELS < 3
854 edx &= ~(bitmaskof(X86_FEATURE_PAE) |
855 bitmaskof(X86_FEATURE_PSE) |
856 bitmaskof(X86_FEATURE_PSE36));
857 #else
858 if ( v->domain->arch.ops->guest_paging_levels == PAGING_L2 )
859 {
860 if ( !v->domain->arch.hvm_domain.pae_enabled )
861 clear_bit(X86_FEATURE_PAE, &edx);
862 clear_bit(X86_FEATURE_PSE, &edx);
863 clear_bit(X86_FEATURE_PSE36, &edx);
864 }
865 #endif
867 ebx &= NUM_THREADS_RESET_MASK;
869 /* Unsupportable for virtualised CPUs. */
870 ecx &= ~(bitmaskof(X86_FEATURE_VMXE) |
871 bitmaskof(X86_FEATURE_EST) |
872 bitmaskof(X86_FEATURE_TM2) |
873 bitmaskof(X86_FEATURE_CID) |
874 bitmaskof(X86_FEATURE_MWAIT) );
876 edx &= ~( bitmaskof(X86_FEATURE_HT) |
877 bitmaskof(X86_FEATURE_MCA) |
878 bitmaskof(X86_FEATURE_MCE) |
879 bitmaskof(X86_FEATURE_ACPI) |
880 bitmaskof(X86_FEATURE_ACC) );
881 }
882 else if ( ( input == CPUID_LEAF_0x6 )
883 || ( input == CPUID_LEAF_0x9 )
884 || ( input == CPUID_LEAF_0xA ))
885 {
886 eax = ebx = ecx = edx = 0x0;
887 }
888 #ifdef __i386__
889 else if ( input == CPUID_LEAF_0x80000001 )
890 {
891 clear_bit(X86_FEATURE_LAHF_LM & 31, &ecx);
893 clear_bit(X86_FEATURE_LM & 31, &edx);
894 clear_bit(X86_FEATURE_SYSCALL & 31, &edx);
895 }
896 #endif
897 }
899 regs->eax = (unsigned long) eax;
900 regs->ebx = (unsigned long) ebx;
901 regs->ecx = (unsigned long) ecx;
902 regs->edx = (unsigned long) edx;
904 HVM_DBG_LOG(DBG_LEVEL_3, "eip@%lx, input: 0x%lx, "
905 "output: eax = 0x%08lx, ebx = 0x%08lx, "
906 "ecx = 0x%08lx, edx = 0x%08lx",
907 (unsigned long)eip, (unsigned long)input,
908 (unsigned long)eax, (unsigned long)ebx,
909 (unsigned long)ecx, (unsigned long)edx);
910 }
912 #define CASE_GET_REG_P(REG, reg) \
913 case REG_ ## REG: reg_p = (unsigned long *)&(regs->reg); break
915 #ifdef __i386__
916 #define CASE_EXTEND_GET_REG_P
917 #else
918 #define CASE_EXTEND_GET_REG_P \
919 CASE_GET_REG_P(R8, r8); \
920 CASE_GET_REG_P(R9, r9); \
921 CASE_GET_REG_P(R10, r10); \
922 CASE_GET_REG_P(R11, r11); \
923 CASE_GET_REG_P(R12, r12); \
924 CASE_GET_REG_P(R13, r13); \
925 CASE_GET_REG_P(R14, r14); \
926 CASE_GET_REG_P(R15, r15)
927 #endif
929 static void vmx_dr_access (unsigned long exit_qualification, struct cpu_user_regs *regs)
930 {
931 unsigned int reg;
932 unsigned long *reg_p = 0;
933 struct vcpu *v = current;
934 unsigned long eip;
936 __vmread(GUEST_RIP, &eip);
938 reg = exit_qualification & DEBUG_REG_ACCESS_NUM;
940 HVM_DBG_LOG(DBG_LEVEL_1,
941 "vmx_dr_access : eip=%lx, reg=%d, exit_qualification = %lx",
942 eip, reg, exit_qualification);
944 switch ( exit_qualification & DEBUG_REG_ACCESS_REG ) {
945 CASE_GET_REG_P(EAX, eax);
946 CASE_GET_REG_P(ECX, ecx);
947 CASE_GET_REG_P(EDX, edx);
948 CASE_GET_REG_P(EBX, ebx);
949 CASE_GET_REG_P(EBP, ebp);
950 CASE_GET_REG_P(ESI, esi);
951 CASE_GET_REG_P(EDI, edi);
952 CASE_EXTEND_GET_REG_P;
953 case REG_ESP:
954 break;
955 default:
956 __hvm_bug(regs);
957 }
959 switch (exit_qualification & DEBUG_REG_ACCESS_TYPE) {
960 case TYPE_MOV_TO_DR:
961 /* don't need to check the range */
962 if (reg != REG_ESP)
963 v->arch.guest_context.debugreg[reg] = *reg_p;
964 else {
965 unsigned long value;
966 __vmread(GUEST_RSP, &value);
967 v->arch.guest_context.debugreg[reg] = value;
968 }
969 break;
970 case TYPE_MOV_FROM_DR:
971 if (reg != REG_ESP)
972 *reg_p = v->arch.guest_context.debugreg[reg];
973 else {
974 __vmwrite(GUEST_RSP, v->arch.guest_context.debugreg[reg]);
975 }
976 break;
977 }
978 }
980 /*
981 * Invalidate the TLB for va. Invalidate the shadow page corresponding
982 * to the address va.
983 */
984 static void vmx_vmexit_do_invlpg(unsigned long va)
985 {
986 unsigned long eip;
987 struct vcpu *v = current;
989 __vmread(GUEST_RIP, &eip);
991 HVM_DBG_LOG(DBG_LEVEL_VMMU, "vmx_vmexit_do_invlpg: eip=%lx, va=%lx",
992 eip, va);
994 /*
995 * We do the safest things first, then try to update the shadow
996 * by copying from the guest.
997 */
998 shadow_invlpg(v, va);
999 }
1001 static int check_for_null_selector(unsigned long eip)
1003 unsigned char inst[MAX_INST_LEN];
1004 unsigned long sel;
1005 int i, inst_len;
1006 int inst_copy_from_guest(unsigned char *, unsigned long, int);
1008 __vmread(VM_EXIT_INSTRUCTION_LEN, &inst_len);
1009 memset(inst, 0, MAX_INST_LEN);
1010 if (inst_copy_from_guest(inst, eip, inst_len) != inst_len) {
1011 printf("check_for_null_selector: get guest instruction failed\n");
1012 domain_crash_synchronous();
1015 for (i = 0; i < inst_len; i++) {
1016 switch (inst[i]) {
1017 case 0xf3: /* REPZ */
1018 case 0xf2: /* REPNZ */
1019 case 0xf0: /* LOCK */
1020 case 0x66: /* data32 */
1021 case 0x67: /* addr32 */
1022 continue;
1023 case 0x2e: /* CS */
1024 __vmread(GUEST_CS_SELECTOR, &sel);
1025 break;
1026 case 0x36: /* SS */
1027 __vmread(GUEST_SS_SELECTOR, &sel);
1028 break;
1029 case 0x26: /* ES */
1030 __vmread(GUEST_ES_SELECTOR, &sel);
1031 break;
1032 case 0x64: /* FS */
1033 __vmread(GUEST_FS_SELECTOR, &sel);
1034 break;
1035 case 0x65: /* GS */
1036 __vmread(GUEST_GS_SELECTOR, &sel);
1037 break;
1038 case 0x3e: /* DS */
1039 /* FALLTHROUGH */
1040 default:
1041 /* DS is the default */
1042 __vmread(GUEST_DS_SELECTOR, &sel);
1044 return sel == 0 ? 1 : 0;
1047 return 0;
1050 extern void send_pio_req(struct cpu_user_regs *regs, unsigned long port,
1051 unsigned long count, int size, long value,
1052 int dir, int pvalid);
1054 static void vmx_io_instruction(struct cpu_user_regs *regs,
1055 unsigned long exit_qualification, unsigned long inst_len)
1057 struct mmio_op *mmio_opp;
1058 unsigned long eip, cs, eflags;
1059 unsigned long port, size, dir;
1060 int vm86;
1062 mmio_opp = &current->arch.hvm_vcpu.mmio_op;
1063 mmio_opp->instr = INSTR_PIO;
1064 mmio_opp->flags = 0;
1066 __vmread(GUEST_RIP, &eip);
1067 __vmread(GUEST_CS_SELECTOR, &cs);
1068 __vmread(GUEST_RFLAGS, &eflags);
1069 vm86 = eflags & X86_EFLAGS_VM ? 1 : 0;
1071 HVM_DBG_LOG(DBG_LEVEL_IO,
1072 "vmx_io_instruction: vm86 %d, eip=%lx:%lx, "
1073 "exit_qualification = %lx",
1074 vm86, cs, eip, exit_qualification);
1076 if (test_bit(6, &exit_qualification))
1077 port = (exit_qualification >> 16) & 0xFFFF;
1078 else
1079 port = regs->edx & 0xffff;
1080 TRACE_VMEXIT(1, port);
1081 size = (exit_qualification & 7) + 1;
1082 dir = test_bit(3, &exit_qualification); /* direction */
1084 if (test_bit(4, &exit_qualification)) { /* string instruction */
1085 unsigned long addr, count = 1;
1086 int sign = regs->eflags & EF_DF ? -1 : 1;
1088 __vmread(GUEST_LINEAR_ADDRESS, &addr);
1090 /*
1091 * In protected mode, guest linear address is invalid if the
1092 * selector is null.
1093 */
1094 if (!vm86 && check_for_null_selector(eip))
1095 addr = dir == IOREQ_WRITE ? regs->esi : regs->edi;
1097 if (test_bit(5, &exit_qualification)) { /* "rep" prefix */
1098 mmio_opp->flags |= REPZ;
1099 count = vm86 ? regs->ecx & 0xFFFF : regs->ecx;
1102 /*
1103 * Handle string pio instructions that cross pages or that
1104 * are unaligned. See the comments in hvm_domain.c/handle_mmio()
1105 */
1106 if ((addr & PAGE_MASK) != ((addr + size - 1) & PAGE_MASK)) {
1107 unsigned long value = 0;
1109 mmio_opp->flags |= OVERLAP;
1110 if (dir == IOREQ_WRITE)
1111 hvm_copy(&value, addr, size, HVM_COPY_IN);
1112 send_pio_req(regs, port, 1, size, value, dir, 0);
1113 } else {
1114 if ((addr & PAGE_MASK) != ((addr + count * size - 1) & PAGE_MASK)) {
1115 if (sign > 0)
1116 count = (PAGE_SIZE - (addr & ~PAGE_MASK)) / size;
1117 else
1118 count = (addr & ~PAGE_MASK) / size;
1119 } else
1120 __update_guest_eip(inst_len);
1122 send_pio_req(regs, port, count, size, addr, dir, 1);
1124 } else {
1125 if (port == 0xe9 && dir == IOREQ_WRITE && size == 1)
1126 hvm_print_line(current, regs->eax); /* guest debug output */
1128 __update_guest_eip(inst_len);
1129 send_pio_req(regs, port, 1, size, regs->eax, dir, 0);
1133 int
1134 vmx_world_save(struct vcpu *v, struct vmx_assist_context *c)
1136 unsigned long inst_len;
1137 int error = 0;
1139 error |= __vmread(VM_EXIT_INSTRUCTION_LEN, &inst_len);
1140 error |= __vmread(GUEST_RIP, &c->eip);
1141 c->eip += inst_len; /* skip transition instruction */
1142 error |= __vmread(GUEST_RSP, &c->esp);
1143 error |= __vmread(GUEST_RFLAGS, &c->eflags);
1145 error |= __vmread(CR0_READ_SHADOW, &c->cr0);
1146 c->cr3 = v->arch.hvm_vmx.cpu_cr3;
1147 error |= __vmread(CR4_READ_SHADOW, &c->cr4);
1149 error |= __vmread(GUEST_IDTR_LIMIT, &c->idtr_limit);
1150 error |= __vmread(GUEST_IDTR_BASE, &c->idtr_base);
1152 error |= __vmread(GUEST_GDTR_LIMIT, &c->gdtr_limit);
1153 error |= __vmread(GUEST_GDTR_BASE, &c->gdtr_base);
1155 error |= __vmread(GUEST_CS_SELECTOR, &c->cs_sel);
1156 error |= __vmread(GUEST_CS_LIMIT, &c->cs_limit);
1157 error |= __vmread(GUEST_CS_BASE, &c->cs_base);
1158 error |= __vmread(GUEST_CS_AR_BYTES, &c->cs_arbytes.bytes);
1160 error |= __vmread(GUEST_DS_SELECTOR, &c->ds_sel);
1161 error |= __vmread(GUEST_DS_LIMIT, &c->ds_limit);
1162 error |= __vmread(GUEST_DS_BASE, &c->ds_base);
1163 error |= __vmread(GUEST_DS_AR_BYTES, &c->ds_arbytes.bytes);
1165 error |= __vmread(GUEST_ES_SELECTOR, &c->es_sel);
1166 error |= __vmread(GUEST_ES_LIMIT, &c->es_limit);
1167 error |= __vmread(GUEST_ES_BASE, &c->es_base);
1168 error |= __vmread(GUEST_ES_AR_BYTES, &c->es_arbytes.bytes);
1170 error |= __vmread(GUEST_SS_SELECTOR, &c->ss_sel);
1171 error |= __vmread(GUEST_SS_LIMIT, &c->ss_limit);
1172 error |= __vmread(GUEST_SS_BASE, &c->ss_base);
1173 error |= __vmread(GUEST_SS_AR_BYTES, &c->ss_arbytes.bytes);
1175 error |= __vmread(GUEST_FS_SELECTOR, &c->fs_sel);
1176 error |= __vmread(GUEST_FS_LIMIT, &c->fs_limit);
1177 error |= __vmread(GUEST_FS_BASE, &c->fs_base);
1178 error |= __vmread(GUEST_FS_AR_BYTES, &c->fs_arbytes.bytes);
1180 error |= __vmread(GUEST_GS_SELECTOR, &c->gs_sel);
1181 error |= __vmread(GUEST_GS_LIMIT, &c->gs_limit);
1182 error |= __vmread(GUEST_GS_BASE, &c->gs_base);
1183 error |= __vmread(GUEST_GS_AR_BYTES, &c->gs_arbytes.bytes);
1185 error |= __vmread(GUEST_TR_SELECTOR, &c->tr_sel);
1186 error |= __vmread(GUEST_TR_LIMIT, &c->tr_limit);
1187 error |= __vmread(GUEST_TR_BASE, &c->tr_base);
1188 error |= __vmread(GUEST_TR_AR_BYTES, &c->tr_arbytes.bytes);
1190 error |= __vmread(GUEST_LDTR_SELECTOR, &c->ldtr_sel);
1191 error |= __vmread(GUEST_LDTR_LIMIT, &c->ldtr_limit);
1192 error |= __vmread(GUEST_LDTR_BASE, &c->ldtr_base);
1193 error |= __vmread(GUEST_LDTR_AR_BYTES, &c->ldtr_arbytes.bytes);
1195 return !error;
1198 int
1199 vmx_world_restore(struct vcpu *v, struct vmx_assist_context *c)
1201 unsigned long mfn, old_cr4, old_base_mfn;
1202 int error = 0;
1204 error |= __vmwrite(GUEST_RIP, c->eip);
1205 error |= __vmwrite(GUEST_RSP, c->esp);
1206 error |= __vmwrite(GUEST_RFLAGS, c->eflags);
1208 error |= __vmwrite(CR0_READ_SHADOW, c->cr0);
1210 if (!vmx_paging_enabled(v)) {
1211 HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
1212 __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table));
1213 goto skip_cr3;
1216 if (c->cr3 == v->arch.hvm_vmx.cpu_cr3) {
1217 /*
1218 * This is a simple TLB flush, implying the guest has
1219 * removed some translation or changed page attributes.
1220 * We simply invalidate the shadow.
1221 */
1222 mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
1223 if (mfn != pagetable_get_pfn(v->arch.guest_table)) {
1224 printk("Invalid CR3 value=%x", c->cr3);
1225 domain_crash_synchronous();
1226 return 0;
1228 shadow_sync_all(v->domain);
1229 } else {
1230 /*
1231 * If different, make a shadow. Check if the PDBR is valid
1232 * first.
1233 */
1234 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %x", c->cr3);
1235 if ((c->cr3 >> PAGE_SHIFT) > v->domain->max_pages) {
1236 printk("Invalid CR3 value=%x", c->cr3);
1237 domain_crash_synchronous();
1238 return 0;
1240 mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
1241 if(!get_page(mfn_to_page(mfn), v->domain))
1242 return 0;
1243 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1244 v->arch.guest_table = pagetable_from_pfn(mfn);
1245 if (old_base_mfn)
1246 put_page(mfn_to_page(old_base_mfn));
1247 /*
1248 * arch.shadow_table should now hold the next CR3 for shadow
1249 */
1250 v->arch.hvm_vmx.cpu_cr3 = c->cr3;
1251 update_pagetables(v);
1252 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3);
1253 __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
1256 skip_cr3:
1258 error |= __vmread(CR4_READ_SHADOW, &old_cr4);
1259 error |= __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
1260 error |= __vmwrite(CR4_READ_SHADOW, c->cr4);
1262 error |= __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
1263 error |= __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
1265 error |= __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
1266 error |= __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
1268 error |= __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
1269 error |= __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
1270 error |= __vmwrite(GUEST_CS_BASE, c->cs_base);
1271 error |= __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes.bytes);
1273 error |= __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
1274 error |= __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
1275 error |= __vmwrite(GUEST_DS_BASE, c->ds_base);
1276 error |= __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes.bytes);
1278 error |= __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
1279 error |= __vmwrite(GUEST_ES_LIMIT, c->es_limit);
1280 error |= __vmwrite(GUEST_ES_BASE, c->es_base);
1281 error |= __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes.bytes);
1283 error |= __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
1284 error |= __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
1285 error |= __vmwrite(GUEST_SS_BASE, c->ss_base);
1286 error |= __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes.bytes);
1288 error |= __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
1289 error |= __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
1290 error |= __vmwrite(GUEST_FS_BASE, c->fs_base);
1291 error |= __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes.bytes);
1293 error |= __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
1294 error |= __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
1295 error |= __vmwrite(GUEST_GS_BASE, c->gs_base);
1296 error |= __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes.bytes);
1298 error |= __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
1299 error |= __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
1300 error |= __vmwrite(GUEST_TR_BASE, c->tr_base);
1301 error |= __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes.bytes);
1303 error |= __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
1304 error |= __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
1305 error |= __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
1306 error |= __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes.bytes);
1308 return !error;
1311 enum { VMX_ASSIST_INVOKE = 0, VMX_ASSIST_RESTORE };
1313 int
1314 vmx_assist(struct vcpu *v, int mode)
1316 struct vmx_assist_context c;
1317 u32 magic;
1318 u32 cp;
1320 /* make sure vmxassist exists (this is not an error) */
1321 if (!hvm_copy(&magic, VMXASSIST_MAGIC_OFFSET, sizeof(magic), HVM_COPY_IN))
1322 return 0;
1323 if (magic != VMXASSIST_MAGIC)
1324 return 0;
1326 switch (mode) {
1327 /*
1328 * Transfer control to vmxassist.
1329 * Store the current context in VMXASSIST_OLD_CONTEXT and load
1330 * the new VMXASSIST_NEW_CONTEXT context. This context was created
1331 * by vmxassist and will transfer control to it.
1332 */
1333 case VMX_ASSIST_INVOKE:
1334 /* save the old context */
1335 if (!hvm_copy(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp), HVM_COPY_IN))
1336 goto error;
1337 if (cp != 0) {
1338 if (!vmx_world_save(v, &c))
1339 goto error;
1340 if (!hvm_copy(&c, cp, sizeof(c), HVM_COPY_OUT))
1341 goto error;
1344 /* restore the new context, this should activate vmxassist */
1345 if (!hvm_copy(&cp, VMXASSIST_NEW_CONTEXT, sizeof(cp), HVM_COPY_IN))
1346 goto error;
1347 if (cp != 0) {
1348 if (!hvm_copy(&c, cp, sizeof(c), HVM_COPY_IN))
1349 goto error;
1350 if (!vmx_world_restore(v, &c))
1351 goto error;
1352 return 1;
1354 break;
1356 /*
1357 * Restore the VMXASSIST_OLD_CONTEXT that was saved by VMX_ASSIST_INVOKE
1358 * above.
1359 */
1360 case VMX_ASSIST_RESTORE:
1361 /* save the old context */
1362 if (!hvm_copy(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp), HVM_COPY_IN))
1363 goto error;
1364 if (cp != 0) {
1365 if (!hvm_copy(&c, cp, sizeof(c), HVM_COPY_IN))
1366 goto error;
1367 if (!vmx_world_restore(v, &c))
1368 goto error;
1369 return 1;
1371 break;
1374 error:
1375 printf("Failed to transfer to vmxassist\n");
1376 domain_crash_synchronous();
1377 return 0;
1380 static int vmx_set_cr0(unsigned long value)
1382 struct vcpu *v = current;
1383 unsigned long mfn;
1384 unsigned long eip;
1385 int paging_enabled;
1386 unsigned long vm_entry_value;
1387 unsigned long old_cr0;
1389 /*
1390 * CR0: We don't want to lose PE and PG.
1391 */
1392 __vmread_vcpu(v, CR0_READ_SHADOW, &old_cr0);
1393 paging_enabled = (old_cr0 & X86_CR0_PE) && (old_cr0 & X86_CR0_PG);
1395 /* TS cleared? Then initialise FPU now. */
1396 if ( !(value & X86_CR0_TS) )
1398 setup_fpu(v);
1399 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
1402 __vmwrite(GUEST_CR0, value | X86_CR0_PE | X86_CR0_PG | X86_CR0_NE);
1403 __vmwrite(CR0_READ_SHADOW, value);
1405 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx\n", value);
1407 if ( (value & X86_CR0_PE) && (value & X86_CR0_PG) && !paging_enabled )
1409 /*
1410 * Trying to enable guest paging.
1411 * The guest CR3 must be pointing to the guest physical.
1412 */
1413 if ( !VALID_MFN(mfn = get_mfn_from_gpfn(
1414 v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)) ||
1415 !get_page(mfn_to_page(mfn), v->domain) )
1417 printk("Invalid CR3 value = %lx", v->arch.hvm_vmx.cpu_cr3);
1418 domain_crash_synchronous(); /* need to take a clean path */
1421 #if defined(__x86_64__)
1422 if ( test_bit(VMX_CPU_STATE_LME_ENABLED,
1423 &v->arch.hvm_vmx.cpu_state) &&
1424 !test_bit(VMX_CPU_STATE_PAE_ENABLED,
1425 &v->arch.hvm_vmx.cpu_state) )
1427 HVM_DBG_LOG(DBG_LEVEL_1, "Enable paging before PAE enabled\n");
1428 vmx_inject_exception(v, TRAP_gp_fault, 0);
1431 if ( test_bit(VMX_CPU_STATE_LME_ENABLED,
1432 &v->arch.hvm_vmx.cpu_state) )
1434 /* PAE should already be enabled here. */
1435 HVM_DBG_LOG(DBG_LEVEL_1, "Enable long mode\n");
1436 set_bit(VMX_CPU_STATE_LMA_ENABLED,
1437 &v->arch.hvm_vmx.cpu_state);
1439 __vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
1440 vm_entry_value |= VM_ENTRY_CONTROLS_IA32E_MODE;
1441 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1443 if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L4) )
1445 printk("Unsupported guest paging levels\n");
1446 domain_crash_synchronous(); /* need to take a clean path */
1449 else
1450 #endif /* __x86_64__ */
1452 #if CONFIG_PAGING_LEVELS >= 3
1453 /* seems it's a 32-bit (non-PAE) or 32-bit PAE guest */
1455 if ( test_bit(VMX_CPU_STATE_PAE_ENABLED,
1456 &v->arch.hvm_vmx.cpu_state) )
1458 /* The guest enabled PAE before enabling PG, so it really is
1459 * a PAE guest */
1460 if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) )
1462 printk("Unsupported guest paging levels\n");
1463 domain_crash_synchronous();
1466 else
1468 if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L2) )
1470 printk("Unsupported guest paging levels\n");
1471 domain_crash_synchronous(); /* need to take a clean path */
1474 #endif
1477 /*
1478 * Now arch.guest_table points to machine physical.
1479 */
1480 v->arch.guest_table = pagetable_from_pfn(mfn);
1481 update_pagetables(v);
1483 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
1484 (unsigned long) (mfn << PAGE_SHIFT));
1486 __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
1487 /*
1488 * arch->shadow_table should hold the next CR3 for shadow
1489 */
1490 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
1491 v->arch.hvm_vmx.cpu_cr3, mfn);
1494 if ( !((value & X86_CR0_PE) && (value & X86_CR0_PG)) && paging_enabled )
1495 if ( v->arch.hvm_vmx.cpu_cr3 ) {
1496 put_page(mfn_to_page(get_mfn_from_gpfn(
1497 v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)));
1498 v->arch.guest_table = pagetable_null();
1501 /*
1502 * VMX does not implement real-mode virtualization. We emulate
1503 * real-mode by performing a world switch to VMXAssist whenever
1504 * a partition disables the CR0.PE bit.
1505 */
1506 if ( (value & X86_CR0_PE) == 0 )
1508 if ( value & X86_CR0_PG ) {
1509 /* inject GP here */
1510 vmx_inject_exception(v, TRAP_gp_fault, 0);
1511 return 0;
1512 } else {
1513 /*
1514 * Disable paging here.
1515 * Same as the PE == 1 && PG == 0 case.
1516 */
1517 if ( test_bit(VMX_CPU_STATE_LMA_ENABLED,
1518 &v->arch.hvm_vmx.cpu_state) )
1520 clear_bit(VMX_CPU_STATE_LMA_ENABLED,
1521 &v->arch.hvm_vmx.cpu_state);
1522 __vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
1523 vm_entry_value &= ~VM_ENTRY_CONTROLS_IA32E_MODE;
1524 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1528 clear_all_shadow_status(v->domain);
1529 if ( vmx_assist(v, VMX_ASSIST_INVOKE) ) {
1530 set_bit(VMX_CPU_STATE_ASSIST_ENABLED, &v->arch.hvm_vmx.cpu_state);
1531 __vmread(GUEST_RIP, &eip);
1532 HVM_DBG_LOG(DBG_LEVEL_1,
1533 "Transfering control to vmxassist %%eip 0x%lx\n", eip);
1534 return 0; /* do not update eip! */
1536 } else if ( test_bit(VMX_CPU_STATE_ASSIST_ENABLED,
1537 &v->arch.hvm_vmx.cpu_state) )
1539 __vmread(GUEST_RIP, &eip);
1540 HVM_DBG_LOG(DBG_LEVEL_1,
1541 "Enabling CR0.PE at %%eip 0x%lx\n", eip);
1542 if ( vmx_assist(v, VMX_ASSIST_RESTORE) )
1544 clear_bit(VMX_CPU_STATE_ASSIST_ENABLED,
1545 &v->arch.hvm_vmx.cpu_state);
1546 __vmread(GUEST_RIP, &eip);
1547 HVM_DBG_LOG(DBG_LEVEL_1,
1548 "Restoring to %%eip 0x%lx\n", eip);
1549 return 0; /* do not update eip! */
1552 else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
1554 /* PE set, PG clear: protected mode without paging; handle it here. */
1555 clear_all_shadow_status(v->domain);
1556 __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table));
1559 return 1;
1562 #define CASE_SET_REG(REG, reg) \
1563 case REG_ ## REG: regs->reg = value; break
1564 #define CASE_GET_REG(REG, reg) \
1565 case REG_ ## REG: value = regs->reg; break
1567 #define CASE_EXTEND_SET_REG \
1568 CASE_EXTEND_REG(S)
1569 #define CASE_EXTEND_GET_REG \
1570 CASE_EXTEND_REG(G)
1572 #ifdef __i386__
1573 #define CASE_EXTEND_REG(T)
1574 #else
1575 #define CASE_EXTEND_REG(T) \
1576 CASE_ ## T ## ET_REG(R8, r8); \
1577 CASE_ ## T ## ET_REG(R9, r9); \
1578 CASE_ ## T ## ET_REG(R10, r10); \
1579 CASE_ ## T ## ET_REG(R11, r11); \
1580 CASE_ ## T ## ET_REG(R12, r12); \
1581 CASE_ ## T ## ET_REG(R13, r13); \
1582 CASE_ ## T ## ET_REG(R14, r14); \
1583 CASE_ ## T ## ET_REG(R15, r15)
1584 #endif
1586 /*
1587 * Write to control registers
1588 */
1589 static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
1591 unsigned long value;
1592 unsigned long old_cr;
1593 struct vcpu *v = current;
1595 switch ( gp ) {
1596 CASE_GET_REG(EAX, eax);
1597 CASE_GET_REG(ECX, ecx);
1598 CASE_GET_REG(EDX, edx);
1599 CASE_GET_REG(EBX, ebx);
1600 CASE_GET_REG(EBP, ebp);
1601 CASE_GET_REG(ESI, esi);
1602 CASE_GET_REG(EDI, edi);
1603 CASE_EXTEND_GET_REG;
1604 case REG_ESP:
1605 __vmread(GUEST_RSP, &value);
1606 break;
1607 default:
1608 printk("invalid gp: %d\n", gp);
1609 __hvm_bug(regs);
1612 HVM_DBG_LOG(DBG_LEVEL_1, "CR%d, value = %lx", cr, value);
1614 switch ( cr ) {
1615 case 0:
1616 return vmx_set_cr0(value);
1617 case 3:
1619 unsigned long old_base_mfn, mfn;
1621 /*
1622 * If paging is not enabled yet, simply copy the value to CR3.
1623 */
1624 if (!vmx_paging_enabled(v)) {
1625 v->arch.hvm_vmx.cpu_cr3 = value;
1626 break;
1629 /*
1630 * We make a new one if the shadow does not exist.
1631 */
1632 if (value == v->arch.hvm_vmx.cpu_cr3) {
1633 /*
1634 * This is a simple TLB flush, implying the guest has
1635 * removed some translation or changed page attributes.
1636 * We simply invalidate the shadow.
1637 */
1638 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
1639 if (mfn != pagetable_get_pfn(v->arch.guest_table))
1640 __hvm_bug(regs);
1641 shadow_sync_all(v->domain);
1642 } else {
1643 /*
1644 * If different, make a shadow. Check if the PDBR is valid
1645 * first.
1646 */
1647 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
1648 if ( ((value >> PAGE_SHIFT) > v->domain->max_pages ) ||
1649 !VALID_MFN(mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT)) ||
1650 !get_page(mfn_to_page(mfn), v->domain) )
1652 printk("Invalid CR3 value=%lx", value);
1653 domain_crash_synchronous(); /* need to take a clean path */
1655 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1656 v->arch.guest_table = pagetable_from_pfn(mfn);
1657 if (old_base_mfn)
1658 put_page(mfn_to_page(old_base_mfn));
1659 /*
1660 * arch.shadow_table should now hold the next CR3 for shadow
1661 */
1662 #if CONFIG_PAGING_LEVELS >= 3
1663 if ( v->domain->arch.ops->guest_paging_levels == PAGING_L3 )
1664 shadow_sync_all(v->domain);
1665 #endif
1667 v->arch.hvm_vmx.cpu_cr3 = value;
1668 update_pagetables(v);
1669 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx",
1670 value);
1671 __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
1673 break;
1675 case 4: /* CR4 */
1677 __vmread(CR4_READ_SHADOW, &old_cr);
1679 if ( value & X86_CR4_PAE && !(old_cr & X86_CR4_PAE) )
1681 set_bit(VMX_CPU_STATE_PAE_ENABLED, &v->arch.hvm_vmx.cpu_state);
1683 if ( vmx_pgbit_test(v) )
1685 /* The guest is a 32-bit PAE guest. */
1686 #if CONFIG_PAGING_LEVELS >= 3
1687 unsigned long mfn, old_base_mfn;
1689 if( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) )
1691 printk("Unsupported guest paging levels\n");
1692 domain_crash_synchronous(); /* need to take a clean path */
1695 if ( !VALID_MFN(mfn = get_mfn_from_gpfn(
1696 v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)) ||
1697 !get_page(mfn_to_page(mfn), v->domain) )
1699 printk("Invalid CR3 value = %lx", v->arch.hvm_vmx.cpu_cr3);
1700 domain_crash_synchronous(); /* need to take a clean path */
1703 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1704 if ( old_base_mfn )
1705 put_page(mfn_to_page(old_base_mfn));
1707 /*
1708 * Now arch.guest_table points to machine physical.
1709 */
1711 v->arch.guest_table = pagetable_from_pfn(mfn);
1712 update_pagetables(v);
1714 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
1715 (unsigned long) (mfn << PAGE_SHIFT));
1717 __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
1719 /*
1720 * arch->shadow_table should hold the next CR3 for shadow
1721 */
1723 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
1724 v->arch.hvm_vmx.cpu_cr3, mfn);
1725 #endif
1727 else
1729 /* The guest is a 64 bit or 32-bit PAE guest. */
1730 #if CONFIG_PAGING_LEVELS >= 3
1731 if ( (v->domain->arch.ops != NULL) &&
1732 v->domain->arch.ops->guest_paging_levels == PAGING_L2)
1734 /* The guest enabled PAE without enabling PG; it will have to
1735 * enable PG after that, so treat it as a 32-bit PAE
1736 * guest */
1738 if ( !shadow_set_guest_paging_levels(v->domain,
1739 PAGING_L3) )
1741 printk("Unsupported guest paging levels\n");
1742 /* need to take a clean path */
1743 domain_crash_synchronous();
1746 #endif
1749 else if ( value & X86_CR4_PAE )
1750 set_bit(VMX_CPU_STATE_PAE_ENABLED, &v->arch.hvm_vmx.cpu_state);
1751 else
1753 if ( test_bit(VMX_CPU_STATE_LMA_ENABLED, &v->arch.hvm_vmx.cpu_state) )
1754 vmx_inject_exception(v, TRAP_gp_fault, 0);
1756 clear_bit(VMX_CPU_STATE_PAE_ENABLED, &v->arch.hvm_vmx.cpu_state);
1759 __vmwrite(GUEST_CR4, value | VMX_CR4_HOST_MASK);
1760 __vmwrite(CR4_READ_SHADOW, value);
1762 /*
1763 * Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates
1764 * all TLB entries except global entries.
1765 */
1766 if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
1767 shadow_sync_all(v->domain);
1769 break;
1771 default:
1772 printk("invalid cr: %d\n", gp);
1773 __hvm_bug(regs);
1776 return 1;
1779 /*
1780 * Read from control registers. CR0 and CR4 are read from the shadow.
1781 */
1782 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
1784 unsigned long value;
1785 struct vcpu *v = current;
1787 if ( cr != 3 )
1788 __hvm_bug(regs);
1790 value = (unsigned long) v->arch.hvm_vmx.cpu_cr3;
1792 switch ( gp ) {
1793 CASE_SET_REG(EAX, eax);
1794 CASE_SET_REG(ECX, ecx);
1795 CASE_SET_REG(EDX, edx);
1796 CASE_SET_REG(EBX, ebx);
1797 CASE_SET_REG(EBP, ebp);
1798 CASE_SET_REG(ESI, esi);
1799 CASE_SET_REG(EDI, edi);
1800 CASE_EXTEND_SET_REG;
1801 case REG_ESP:
1802 __vmwrite(GUEST_RSP, value);
1803 regs->esp = value;
1804 break;
1805 default:
1806 printk("invalid gp: %d\n", gp);
1807 __hvm_bug(regs);
1810 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR%d, value = %lx", cr, value);
1813 static int vmx_cr_access(unsigned long exit_qualification, struct cpu_user_regs *regs)
1815 unsigned int gp, cr;
1816 unsigned long value;
1817 struct vcpu *v = current;
1819 switch (exit_qualification & CONTROL_REG_ACCESS_TYPE) {
1820 case TYPE_MOV_TO_CR:
1821 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
1822 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
1823 TRACE_VMEXIT(1,TYPE_MOV_TO_CR);
1824 TRACE_VMEXIT(2,cr);
1825 TRACE_VMEXIT(3,gp);
1826 return mov_to_cr(gp, cr, regs);
1827 case TYPE_MOV_FROM_CR:
1828 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
1829 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
1830 TRACE_VMEXIT(1,TYPE_MOV_FROM_CR);
1831 TRACE_VMEXIT(2,cr);
1832 TRACE_VMEXIT(3,gp);
1833 mov_from_cr(cr, gp, regs);
1834 break;
1835 case TYPE_CLTS:
1836 TRACE_VMEXIT(1,TYPE_CLTS);
1838 /* We initialise the FPU now, to avoid needing another vmexit. */
1839 setup_fpu(v);
1840 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
1842 __vmread_vcpu(v, GUEST_CR0, &value);
1843 value &= ~X86_CR0_TS; /* clear TS */
1844 __vmwrite(GUEST_CR0, value);
1846 __vmread_vcpu(v, CR0_READ_SHADOW, &value);
1847 value &= ~X86_CR0_TS; /* clear TS */
1848 __vmwrite(CR0_READ_SHADOW, value);
1849 break;
1850 case TYPE_LMSW:
1851 TRACE_VMEXIT(1,TYPE_LMSW);
1852 __vmread_vcpu(v, CR0_READ_SHADOW, &value);
1853 value = (value & ~0xF) |
1854 (((exit_qualification & LMSW_SOURCE_DATA) >> 16) & 0xF);
1855 return vmx_set_cr0(value);
1856 break;
1857 default:
1858 __hvm_bug(regs);
1859 break;
1861 return 1;
1864 static inline void vmx_do_msr_read(struct cpu_user_regs *regs)
1866 u64 msr_content = 0;
1867 struct vcpu *v = current;
1869 HVM_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_read: ecx=%lx, eax=%lx, edx=%lx",
1870 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1871 (unsigned long)regs->edx);
1872 switch (regs->ecx) {
1873 case MSR_IA32_TIME_STAMP_COUNTER:
1874 msr_content = hvm_get_guest_time(v);
1875 break;
1876 case MSR_IA32_SYSENTER_CS:
1877 __vmread(GUEST_SYSENTER_CS, (u32 *)&msr_content);
1878 break;
1879 case MSR_IA32_SYSENTER_ESP:
1880 __vmread(GUEST_SYSENTER_ESP, &msr_content);
1881 break;
1882 case MSR_IA32_SYSENTER_EIP:
1883 __vmread(GUEST_SYSENTER_EIP, &msr_content);
1884 break;
1885 case MSR_IA32_APICBASE:
1886 msr_content = VLAPIC(v) ? VLAPIC(v)->apic_base_msr : 0;
1887 break;
1888 default:
1889 if(long_mode_do_msr_read(regs))
1890 return;
1891 rdmsr_safe(regs->ecx, regs->eax, regs->edx);
1892 break;
1895 regs->eax = msr_content & 0xFFFFFFFF;
1896 regs->edx = msr_content >> 32;
1898 HVM_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_read returns: "
1899 "ecx=%lx, eax=%lx, edx=%lx",
1900 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1901 (unsigned long)regs->edx);
1904 static inline void vmx_do_msr_write(struct cpu_user_regs *regs)
1906 u64 msr_content;
1907 struct vcpu *v = current;
1909 HVM_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_write: ecx=%lx, eax=%lx, edx=%lx",
1910 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1911 (unsigned long)regs->edx);
1913 msr_content = (regs->eax & 0xFFFFFFFF) | ((u64)regs->edx << 32);
1915 switch (regs->ecx) {
1916 case MSR_IA32_TIME_STAMP_COUNTER:
1917 set_guest_time(v, msr_content);
1918 break;
1919 case MSR_IA32_SYSENTER_CS:
1920 __vmwrite(GUEST_SYSENTER_CS, msr_content);
1921 break;
1922 case MSR_IA32_SYSENTER_ESP:
1923 __vmwrite(GUEST_SYSENTER_ESP, msr_content);
1924 break;
1925 case MSR_IA32_SYSENTER_EIP:
1926 __vmwrite(GUEST_SYSENTER_EIP, msr_content);
1927 break;
1928 case MSR_IA32_APICBASE:
1929 vlapic_msr_set(VLAPIC(v), msr_content);
1930 break;
1931 default:
1932 long_mode_do_msr_write(regs);
1933 break;
1936 HVM_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_write returns: "
1937 "ecx=%lx, eax=%lx, edx=%lx",
1938 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1939 (unsigned long)regs->edx);
1942 /*
1943 * Need to use this exit to reschedule
1944 */
1945 void vmx_vmexit_do_hlt(void)
1947 struct vcpu *v=current;
1948 struct periodic_time *pt = &(v->domain->arch.hvm_domain.pl_time.periodic_tm);
1949 s_time_t next_pit=-1,next_wakeup;
1951 if ( !v->vcpu_id )
1952 next_pit = get_scheduled(v, pt->irq, pt);
1953 next_wakeup = get_apictime_scheduled(v);
1954 if ( (next_pit != -1 && next_pit < next_wakeup) || next_wakeup == -1 )
1955 next_wakeup = next_pit;
1956 if ( next_wakeup != -1 )
1957 set_timer(&current->arch.hvm_vmx.hlt_timer, next_wakeup);
1958 hvm_safe_block();
1961 static inline void vmx_vmexit_do_extint(struct cpu_user_regs *regs)
1963 unsigned int vector;
1964 int error;
1966 asmlinkage void do_IRQ(struct cpu_user_regs *);
1967 fastcall void smp_apic_timer_interrupt(struct cpu_user_regs *);
1968 fastcall void smp_event_check_interrupt(void);
1969 fastcall void smp_invalidate_interrupt(void);
1970 fastcall void smp_call_function_interrupt(void);
1971 fastcall void smp_spurious_interrupt(struct cpu_user_regs *regs);
1972 fastcall void smp_error_interrupt(struct cpu_user_regs *regs);
1973 #ifdef CONFIG_X86_MCE_P4THERMAL
1974 fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs);
1975 #endif
1977 if ((error = __vmread(VM_EXIT_INTR_INFO, &vector))
1978 && !(vector & INTR_INFO_VALID_MASK))
1979 __hvm_bug(regs);
1981 vector &= INTR_INFO_VECTOR_MASK;
1982 TRACE_VMEXIT(1,vector);
1984 switch(vector) {
1985 case LOCAL_TIMER_VECTOR:
1986 smp_apic_timer_interrupt(regs);
1987 break;
1988 case EVENT_CHECK_VECTOR:
1989 smp_event_check_interrupt();
1990 break;
1991 case INVALIDATE_TLB_VECTOR:
1992 smp_invalidate_interrupt();
1993 break;
1994 case CALL_FUNCTION_VECTOR:
1995 smp_call_function_interrupt();
1996 break;
1997 case SPURIOUS_APIC_VECTOR:
1998 smp_spurious_interrupt(regs);
1999 break;
2000 case ERROR_APIC_VECTOR:
2001 smp_error_interrupt(regs);
2002 break;
2003 #ifdef CONFIG_X86_MCE_P4THERMAL
2004 case THERMAL_APIC_VECTOR:
2005 smp_thermal_interrupt(regs);
2006 break;
2007 #endif
2008 default:
2009 regs->entry_vector = vector;
2010 do_IRQ(regs);
2011 break;
2015 #if defined (__x86_64__)
2016 void store_cpu_user_regs(struct cpu_user_regs *regs)
2018 __vmread(GUEST_SS_SELECTOR, &regs->ss);
2019 __vmread(GUEST_RSP, &regs->rsp);
2020 __vmread(GUEST_RFLAGS, &regs->rflags);
2021 __vmread(GUEST_CS_SELECTOR, &regs->cs);
2022 __vmread(GUEST_DS_SELECTOR, &regs->ds);
2023 __vmread(GUEST_ES_SELECTOR, &regs->es);
2024 __vmread(GUEST_RIP, &regs->rip);
2026 #elif defined (__i386__)
2027 void store_cpu_user_regs(struct cpu_user_regs *regs)
2029 __vmread(GUEST_SS_SELECTOR, &regs->ss);
2030 __vmread(GUEST_RSP, &regs->esp);
2031 __vmread(GUEST_RFLAGS, &regs->eflags);
2032 __vmread(GUEST_CS_SELECTOR, &regs->cs);
2033 __vmread(GUEST_DS_SELECTOR, &regs->ds);
2034 __vmread(GUEST_ES_SELECTOR, &regs->es);
2035 __vmread(GUEST_RIP, &regs->eip);
2037 #endif
2039 #ifdef XEN_DEBUGGER
2040 void save_cpu_user_regs(struct cpu_user_regs *regs)
2042 __vmread(GUEST_SS_SELECTOR, &regs->xss);
2043 __vmread(GUEST_RSP, &regs->esp);
2044 __vmread(GUEST_RFLAGS, &regs->eflags);
2045 __vmread(GUEST_CS_SELECTOR, &regs->xcs);
2046 __vmread(GUEST_RIP, &regs->eip);
2048 __vmread(GUEST_GS_SELECTOR, &regs->xgs);
2049 __vmread(GUEST_FS_SELECTOR, &regs->xfs);
2050 __vmread(GUEST_ES_SELECTOR, &regs->xes);
2051 __vmread(GUEST_DS_SELECTOR, &regs->xds);
2054 void restore_cpu_user_regs(struct cpu_user_regs *regs)
2056 __vmwrite(GUEST_SS_SELECTOR, regs->xss);
2057 __vmwrite(GUEST_RSP, regs->esp);
2058 __vmwrite(GUEST_RFLAGS, regs->eflags);
2059 __vmwrite(GUEST_CS_SELECTOR, regs->xcs);
2060 __vmwrite(GUEST_RIP, regs->eip);
2062 __vmwrite(GUEST_GS_SELECTOR, regs->xgs);
2063 __vmwrite(GUEST_FS_SELECTOR, regs->xfs);
2064 __vmwrite(GUEST_ES_SELECTOR, regs->xes);
2065 __vmwrite(GUEST_DS_SELECTOR, regs->xds);
2067 #endif
2069 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs regs)
2071 unsigned int exit_reason;
2072 unsigned long exit_qualification, eip, inst_len = 0;
2073 struct vcpu *v = current;
2074 int error;
2076 error = __vmread(VM_EXIT_REASON, &exit_reason);
2077 BUG_ON(error);
2079 perfc_incra(vmexits, exit_reason);
2081 if ( (exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT) &&
2082 (exit_reason != EXIT_REASON_VMCALL) &&
2083 (exit_reason != EXIT_REASON_IO_INSTRUCTION) )
2084 HVM_DBG_LOG(DBG_LEVEL_0, "exit reason = %x", exit_reason);
2086 if ( exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT )
2087 local_irq_enable();
2089 if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
2091 printk("Failed vm entry (reason 0x%x)\n", exit_reason);
2092 printk("*********** VMCS Area **************\n");
2093 vmcs_dump_vcpu();
2094 printk("**************************************\n");
2095 domain_crash_synchronous();
2098 __vmread(GUEST_RIP, &eip);
2099 TRACE_VMEXIT(0,exit_reason);
2101 switch ( exit_reason )
2103 case EXIT_REASON_EXCEPTION_NMI:
2105 /*
2106 * Software-interrupt exiting (INT n) is not enabled, so an exit
2107 * here means either (1) an exception (e.g. #PF) in the guest, or
2108 * (2) an NMI.
2109 */
2110 int error;
2111 unsigned int vector;
2112 unsigned long va;
2114 if ((error = __vmread(VM_EXIT_INTR_INFO, &vector))
2115 || !(vector & INTR_INFO_VALID_MASK))
2116 __hvm_bug(&regs);
2117 vector &= INTR_INFO_VECTOR_MASK;
2119 TRACE_VMEXIT(1,vector);
2120 perfc_incra(cause_vector, vector);
2122 switch (vector) {
2123 #ifdef XEN_DEBUGGER
2124 case TRAP_debug:
2126 save_cpu_user_regs(&regs);
2127 pdb_handle_exception(1, &regs, 1);
2128 restore_cpu_user_regs(&regs);
2129 break;
2131 case TRAP_int3:
2133 save_cpu_user_regs(&regs);
2134 pdb_handle_exception(3, &regs, 1);
2135 restore_cpu_user_regs(&regs);
2136 break;
2138 #else
2139 case TRAP_debug:
2141 void store_cpu_user_regs(struct cpu_user_regs *regs);
2143 if ( test_bit(_DOMF_debugging, &v->domain->domain_flags) )
2145 store_cpu_user_regs(&regs);
2146 domain_pause_for_debugger();
2147 __vm_clear_bit(GUEST_PENDING_DBG_EXCEPTIONS,
2148 PENDING_DEBUG_EXC_BS);
2150 else
2152 vmx_reflect_exception(v);
2153 __vm_clear_bit(GUEST_PENDING_DBG_EXCEPTIONS,
2154 PENDING_DEBUG_EXC_BS);
2157 break;
2159 case TRAP_int3:
2161 if ( test_bit(_DOMF_debugging, &v->domain->domain_flags) )
2162 domain_pause_for_debugger();
2163 else
2164 vmx_inject_exception(v, TRAP_int3, VMX_DELIVER_NO_ERROR_CODE);
2165 break;
2167 #endif
2168 case TRAP_no_device:
2170 vmx_do_no_device_fault();
2171 break;
2173 case TRAP_page_fault:
2175 __vmread(EXIT_QUALIFICATION, &va);
2176 __vmread(VM_EXIT_INTR_ERROR_CODE, &regs.error_code);
2178 TRACE_VMEXIT(3,regs.error_code);
2179 TRACE_VMEXIT(4,va);
2181 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2182 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
2183 (unsigned long)regs.eax, (unsigned long)regs.ebx,
2184 (unsigned long)regs.ecx, (unsigned long)regs.edx,
2185 (unsigned long)regs.esi, (unsigned long)regs.edi);
2186 v->arch.hvm_vcpu.mmio_op.inst_decoder_regs = &regs;
2188 if (!(error = vmx_do_page_fault(va, &regs))) {
2189 /*
2190 * Inject #PF using the VM-entry interruption-information field
2191 */
2192 vmx_inject_exception(v, TRAP_page_fault, regs.error_code);
2193 v->arch.hvm_vmx.cpu_cr2 = va;
2194 TRACE_3D(TRC_VMX_INT, v->domain->domain_id, TRAP_page_fault, va);
2196 break;
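/*
 * Annotation (not in the original source): for a #PF exit,
 * EXIT_QUALIFICATION holds the faulting linear address. If
 * vmx_do_page_fault() cannot resolve the fault from the shadow page
 * tables, the #PF is reflected back into the guest with its original
 * error code, and the guest's CR2 value is recorded so it can be loaded
 * before the next VM entry (see vmx_load_cr2() at the end of this file).
 */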
2198 case TRAP_nmi:
2199 do_nmi(&regs);
2200 break;
2201 default:
2202 vmx_reflect_exception(v);
2203 break;
2205 break;
2207 case EXIT_REASON_EXTERNAL_INTERRUPT:
2208 vmx_vmexit_do_extint(&regs);
2209 break;
2210 case EXIT_REASON_PENDING_INTERRUPT:
2211 /*
2212 * Not entirely clear what this is for. The only bit set and
2213 * cleared at this point is CPU_BASED_VIRTUAL_INTR_PENDING
2214 * (in io.c:{enable,disable}_irq_window()), so presumably the
2215 * intent is to restore it to its original value...
2216 */
2217 v->arch.hvm_vcpu.u.vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2218 v->arch.hvm_vcpu.u.vmx.exec_control |=
2219 (MONITOR_CPU_BASED_EXEC_CONTROLS & CPU_BASED_VIRTUAL_INTR_PENDING);
2220 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2221 v->arch.hvm_vcpu.u.vmx.exec_control);
2222 break;
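/*
 * Annotation (not in the original source): CPU_BASED_VIRTUAL_INTR_PENDING
 * is the VMX "interrupt-window exiting" control: while set, the CPU exits
 * as soon as the guest is able to accept an interrupt. The code above
 * restores the bit to its compile-time default from
 * MONITOR_CPU_BASED_EXEC_CONTROLS, presumably so the window exit stops
 * firing once the pending virtual interrupt can be injected.
 */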
2223 case EXIT_REASON_TASK_SWITCH:
2224 __hvm_bug(&regs);
2225 break;
2226 case EXIT_REASON_CPUID:
2227 vmx_vmexit_do_cpuid(&regs);
2228 __get_instruction_length(inst_len);
2229 __update_guest_eip(inst_len);
2230 break;
2231 case EXIT_REASON_HLT:
2232 __get_instruction_length(inst_len);
2233 __update_guest_eip(inst_len);
2234 vmx_vmexit_do_hlt();
2235 break;
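/*
 * Annotation (not in the original source): the guest rip is advanced past
 * the HLT before blocking, so that when the vcpu is woken (by hlt_timer or
 * an interrupt) it resumes at the instruction after the HLT instead of
 * re-executing it.
 */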
2236 case EXIT_REASON_INVLPG:
2238 unsigned long va;
2240 __vmread(EXIT_QUALIFICATION, &va);
2241 vmx_vmexit_do_invlpg(va);
2242 __get_instruction_length(inst_len);
2243 __update_guest_eip(inst_len);
2244 break;
2246 #if 0 /* keep this for debugging */
2247 case EXIT_REASON_VMCALL:
2248 __get_instruction_length(inst_len);
2249 __vmread(GUEST_RIP, &eip);
2250 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2252 hvm_print_line(v, regs.eax); /* provides the current domain */
2253 __update_guest_eip(inst_len);
2254 break;
2255 #endif
2256 case EXIT_REASON_CR_ACCESS:
2258 __vmread(GUEST_RIP, &eip);
2259 __get_instruction_length(inst_len);
2260 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2262 HVM_DBG_LOG(DBG_LEVEL_1, "eip = %lx, inst_len = %lx, exit_qualification = %lx",
2263 eip, inst_len, exit_qualification);
2264 if (vmx_cr_access(exit_qualification, &regs))
2265 __update_guest_eip(inst_len);
2266 TRACE_VMEXIT(3,regs.error_code);
2267 TRACE_VMEXIT(4,exit_qualification);
2268 break;
2270 case EXIT_REASON_DR_ACCESS:
2271 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2272 vmx_dr_access(exit_qualification, &regs);
2273 __get_instruction_length(inst_len);
2274 __update_guest_eip(inst_len);
2275 break;
2276 case EXIT_REASON_IO_INSTRUCTION:
2277 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2278 __get_instruction_length(inst_len);
2279 vmx_io_instruction(&regs, exit_qualification, inst_len);
2280 TRACE_VMEXIT(4,exit_qualification);
2281 break;
2282 case EXIT_REASON_MSR_READ:
2283 __get_instruction_length(inst_len);
2284 vmx_do_msr_read(&regs);
2285 __update_guest_eip(inst_len);
2286 break;
2287 case EXIT_REASON_MSR_WRITE:
2288 __vmread(GUEST_RIP, &eip);
2289 vmx_do_msr_write(&regs);
2290 __get_instruction_length(inst_len);
2291 __update_guest_eip(inst_len);
2292 break;
2293 case EXIT_REASON_MWAIT_INSTRUCTION:
2294 __hvm_bug(&regs);
2295 break;
2296 case EXIT_REASON_VMCALL:
2297 case EXIT_REASON_VMCLEAR:
2298 case EXIT_REASON_VMLAUNCH:
2299 case EXIT_REASON_VMPTRLD:
2300 case EXIT_REASON_VMPTRST:
2301 case EXIT_REASON_VMREAD:
2302 case EXIT_REASON_VMRESUME:
2303 case EXIT_REASON_VMWRITE:
2304 case EXIT_REASON_VMOFF:
2305 case EXIT_REASON_VMON:
2306 /* Report an invalid-opcode exception when a VMX guest tries to
2307 * execute any of the VMX instructions. */
2308 vmx_inject_exception(v, TRAP_invalid_op, VMX_DELIVER_NO_ERROR_CODE);
2309 break;
2311 default:
2312 __hvm_bug(&regs); /* should not happen */
2316 asmlinkage void vmx_load_cr2(void)
2318 struct vcpu *v = current;
2320 local_irq_disable();
2321 asm volatile("mov %0,%%cr2": :"r" (v->arch.hvm_vmx.cpu_cr2));
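/*
 * Annotation (not in the original source): VM entry/exit does not save or
 * load CR2, so the guest's CR2 (recorded when a #PF was reflected in the
 * exit handler) is loaded by hand here, with interrupts disabled,
 * presumably so nothing on the host side can fault and overwrite CR2
 * before the VM entry completes.
 */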
2324 asmlinkage void vmx_trace_vmentry(void)
2326 TRACE_5D(TRC_VMX_VMENTRY,
2327 trace_values[smp_processor_id()][0],
2328 trace_values[smp_processor_id()][1],
2329 trace_values[smp_processor_id()][2],
2330 trace_values[smp_processor_id()][3],
2331 trace_values[smp_processor_id()][4]);
2332 TRACE_VMEXIT(0,9);
2333 TRACE_VMEXIT(1,9);
2334 TRACE_VMEXIT(2,9);
2335 TRACE_VMEXIT(3,9);
2336 TRACE_VMEXIT(4,9);
2337 return;
2340 asmlinkage void vmx_trace_vmexit(void)
2342 TRACE_3D(TRC_VMX_VMEXIT,0,0,0);
2343 return;
2346 /*
2347 * Local variables:
2348 * mode: C
2349 * c-set-style: "BSD"
2350 * c-basic-offset: 4
2351 * tab-width: 4
2352 * indent-tabs-mode: nil
2353 * End:
2354 */