direct-io.hg

view xen/arch/x86/hvm/vmx/vmx.c @ 11667:bcd0e18f7881

[HVM][VMX] Clear vmxe when VMX is not enabled.

The current Xen code keeps X86_CR4_VMXE set even if VMXON has not been
executed. The stop_vmx() code assumes that it is possible to call
VMXOFF whenever X86_CR4_VMXE is set, which is not always true: calling
VMXOFF without a prior VMXON raises an invalid-opcode fault (#UD). To
avoid this condition, this patch makes sure that X86_CR4_VMXE is set
only when VMXON has actually been executed.
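
As a minimal sketch of the resulting invariant (the helper name is illustrative; it simply mirrors the stop_vmx()/start_vmx() code in the file below): VMXOFF is issued only when CR4.VMXE is set, and CR4.VMXE is in turn set only after a successful VMXON.

    /* Illustrative sketch only -- mirrors stop_vmx() further down in this file. */
    static void stop_vmx_sketch(void)
    {
        if ( !(read_cr4() & X86_CR4_VMXE) )
            return;              /* VMXON never ran; VMXOFF here would #UD. */
        __vmxoff();
        clear_in_cr4(X86_CR4_VMXE);
    }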

Tested using x86_32 on a Pentium D 930.

Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
author kfraser@localhost.localdomain
date Fri Sep 29 11:26:33 2006 +0100 (2006-09-29)
parents 058f4a2a8642
children 82983c636549
line source
1 /*
2 * vmx.c: handling VMX architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 *
18 */
20 #include <xen/config.h>
21 #include <xen/init.h>
22 #include <xen/lib.h>
23 #include <xen/trace.h>
24 #include <xen/sched.h>
25 #include <xen/irq.h>
26 #include <xen/softirq.h>
27 #include <xen/domain_page.h>
28 #include <xen/hypercall.h>
29 #include <xen/perfc.h>
30 #include <asm/current.h>
31 #include <asm/io.h>
32 #include <asm/regs.h>
33 #include <asm/cpufeature.h>
34 #include <asm/processor.h>
35 #include <asm/types.h>
36 #include <asm/msr.h>
37 #include <asm/spinlock.h>
38 #include <asm/hvm/hvm.h>
39 #include <asm/hvm/support.h>
40 #include <asm/hvm/vmx/vmx.h>
41 #include <asm/hvm/vmx/vmcs.h>
42 #include <asm/hvm/vmx/cpu.h>
43 #include <asm/shadow.h>
44 #include <public/sched.h>
45 #include <public/hvm/ioreq.h>
46 #include <asm/hvm/vpic.h>
47 #include <asm/hvm/vlapic.h>
48 #include <asm/x86_emulate.h>
50 extern uint32_t vlapic_update_ppr(struct vlapic *vlapic);
52 static DEFINE_PER_CPU(unsigned long, trace_values[5]);
53 #define TRACE_VMEXIT(index,value) this_cpu(trace_values)[index]=value
55 static void vmx_ctxt_switch_from(struct vcpu *v);
56 static void vmx_ctxt_switch_to(struct vcpu *v);
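/*
 * One-time VMX setup: install the per-VCPU context-switch hooks and,
 * when called for VCPU0, create a VMCS and I/O bitmaps for every VCPU
 * in the domain.  Returns 1 on success, 0 on failure.
 */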
58 static int vmx_initialize_guest_resources(struct vcpu *v)
59 {
60 struct domain *d = v->domain;
61 struct vcpu *vc;
62 void *io_bitmap_a, *io_bitmap_b;
63 int rc;
65 v->arch.schedule_tail = arch_vmx_do_launch;
66 v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
67 v->arch.ctxt_switch_to = vmx_ctxt_switch_to;
69 if ( v->vcpu_id != 0 )
70 return 1;
72 if ( !shadow_mode_external(d) )
73 {
74 DPRINTK("Can't init HVM for dom %u vcpu %u: "
75 "not in shadow external mode\n",
76 d->domain_id, v->vcpu_id);
77 domain_crash(d);
78 }
80 for_each_vcpu ( d, vc )
81 {
82 memset(&vc->arch.hvm_vmx, 0, sizeof(struct arch_vmx_struct));
84 if ( (rc = vmx_create_vmcs(vc)) != 0 )
85 {
86 DPRINTK("Failed to create VMCS for vcpu %d: err=%d.\n",
87 vc->vcpu_id, rc);
88 return 0;
89 }
91 spin_lock_init(&vc->arch.hvm_vmx.vmcs_lock);
93 if ( (io_bitmap_a = alloc_xenheap_pages(IO_BITMAP_ORDER)) == NULL )
94 {
95 DPRINTK("Failed to allocate io bitmap b for vcpu %d.\n",
96 vc->vcpu_id);
97 return 0;
98 }
100 if ( (io_bitmap_b = alloc_xenheap_pages(IO_BITMAP_ORDER)) == NULL )
101 {
102 DPRINTK("Failed to allocate io bitmap b for vcpu %d.\n",
103 vc->vcpu_id);
104 return 0;
105 }
107 memset(io_bitmap_a, 0xff, 0x1000);
108 memset(io_bitmap_b, 0xff, 0x1000);
110 /* Don't intercept accesses to the debug port. */
111 clear_bit(PC_DEBUG_PORT, io_bitmap_a);
113 vc->arch.hvm_vmx.io_bitmap_a = io_bitmap_a;
114 vc->arch.hvm_vmx.io_bitmap_b = io_bitmap_b;
116 }
118 /*
119 * Required to do this once per domain. XXX todo: add a separate function
120 * to do these.
121 */
122 memset(&d->shared_info->evtchn_mask[0], 0xff,
123 sizeof(d->shared_info->evtchn_mask));
125 return 1;
126 }
128 static void vmx_relinquish_guest_resources(struct domain *d)
129 {
130 struct vcpu *v;
132 for_each_vcpu ( d, v )
133 {
134 vmx_destroy_vmcs(v);
135 if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
136 continue;
137 kill_timer(&v->arch.hvm_vcpu.hlt_timer);
138 if ( hvm_apic_support(v->domain) && (VLAPIC(v) != NULL) )
139 {
140 kill_timer(&VLAPIC(v)->vlapic_timer);
141 unmap_domain_page_global(VLAPIC(v)->regs);
142 free_domheap_page(VLAPIC(v)->regs_page);
143 xfree(VLAPIC(v));
144 }
145 hvm_release_assist_channel(v);
146 }
148 kill_timer(&d->arch.hvm_domain.pl_time.periodic_tm.timer);
150 if ( d->arch.hvm_domain.shared_page_va )
151 unmap_domain_page_global(
152 (void *)d->arch.hvm_domain.shared_page_va);
154 if ( d->arch.hvm_domain.buffered_io_va )
155 unmap_domain_page_global((void *)d->arch.hvm_domain.buffered_io_va);
156 }
158 #ifdef __x86_64__
160 static DEFINE_PER_CPU(struct vmx_msr_state, percpu_msr);
162 static u32 msr_data_index[VMX_MSR_COUNT] =
163 {
164 MSR_LSTAR, MSR_STAR, MSR_CSTAR,
165 MSR_SYSCALL_MASK, MSR_EFER,
166 };
168 static void vmx_save_segments(struct vcpu *v)
169 {
170 rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.msr_content.shadow_gs);
171 }
173 /*
174 * To avoid MSR save/restore at every VM exit/entry time, we restore
175 * the x86_64-specific MSRs at domain switch time. Since those MSRs
176 * are not modified once set for generic domains, we don't save them,
177 * but simply reset them to the values set at percpu_traps_init().
178 */
179 static void vmx_load_msrs(void)
180 {
181 struct vmx_msr_state *host_state = &this_cpu(percpu_msr);
182 int i;
184 while ( host_state->flags )
185 {
186 i = find_first_set_bit(host_state->flags);
187 wrmsrl(msr_data_index[i], host_state->msr_items[i]);
188 clear_bit(i, &host_state->flags);
189 }
190 }
192 static void vmx_save_init_msrs(void)
193 {
194 struct vmx_msr_state *host_state = &this_cpu(percpu_msr);
195 int i;
197 for ( i = 0; i < VMX_MSR_COUNT; i++ )
198 rdmsrl(msr_data_index[i], host_state->msr_items[i]);
199 }
201 #define CASE_READ_MSR(address) \
202 case MSR_ ## address: \
203 msr_content = msr->msr_items[VMX_INDEX_MSR_ ## address]; \
204 break
206 #define CASE_WRITE_MSR(address) \
207 case MSR_ ## address: \
208 { \
209 msr->msr_items[VMX_INDEX_MSR_ ## address] = msr_content; \
210 if (!test_bit(VMX_INDEX_MSR_ ## address, &msr->flags)) { \
211 set_bit(VMX_INDEX_MSR_ ## address, &msr->flags); \
212 } \
213 wrmsrl(MSR_ ## address, msr_content); \
214 set_bit(VMX_INDEX_MSR_ ## address, &host_state->flags); \
215 } \
216 break
218 #define IS_CANO_ADDRESS(add) 1
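/*
 * Handle RDMSR for the x86_64-specific MSRs (EFER, FS/GS bases, STAR,
 * LSTAR, CSTAR, SYSCALL_MASK).  Returns 1 if the MSR was handled here,
 * 0 to fall back to the generic MSR path.
 */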
219 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
220 {
221 u64 msr_content = 0;
222 struct vcpu *v = current;
223 struct vmx_msr_state *msr = &v->arch.hvm_vmx.msr_content;
225 switch ( regs->ecx ) {
226 case MSR_EFER:
227 HVM_DBG_LOG(DBG_LEVEL_2, "EFER msr_content 0x%"PRIx64, msr_content);
228 msr_content = msr->msr_items[VMX_INDEX_MSR_EFER];
230 /* The following code may not be needed. */
231 if ( test_bit(VMX_CPU_STATE_LME_ENABLED, &v->arch.hvm_vmx.cpu_state) )
232 msr_content |= EFER_LME;
233 else
234 msr_content &= ~EFER_LME;
236 if ( VMX_LONG_GUEST(v) )
237 msr_content |= EFER_LMA;
238 else
239 msr_content &= ~EFER_LMA;
240 break;
242 case MSR_FS_BASE:
243 if ( !(VMX_LONG_GUEST(v)) )
244 /* XXX should it be GP fault */
245 domain_crash_synchronous();
247 __vmread(GUEST_FS_BASE, &msr_content);
248 break;
250 case MSR_GS_BASE:
251 if ( !(VMX_LONG_GUEST(v)) )
252 domain_crash_synchronous();
254 __vmread(GUEST_GS_BASE, &msr_content);
255 break;
257 case MSR_SHADOW_GS_BASE:
258 msr_content = msr->shadow_gs;
259 break;
261 CASE_READ_MSR(STAR);
262 CASE_READ_MSR(LSTAR);
263 CASE_READ_MSR(CSTAR);
264 CASE_READ_MSR(SYSCALL_MASK);
266 default:
267 return 0;
268 }
270 HVM_DBG_LOG(DBG_LEVEL_2, "msr_content: 0x%"PRIx64, msr_content);
272 regs->eax = (u32)(msr_content >> 0);
273 regs->edx = (u32)(msr_content >> 32);
275 return 1;
276 }
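/*
 * Handle WRMSR for the x86_64-specific MSRs.  Checks EFER for reserved
 * bits and for illegal LME transitions, injecting #GP on bad values.
 * Returns 1 if the MSR was handled here, 0 otherwise.
 */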
278 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
279 {
280 u64 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
281 struct vcpu *v = current;
282 struct vmx_msr_state *msr = &v->arch.hvm_vmx.msr_content;
283 struct vmx_msr_state *host_state = &this_cpu(percpu_msr);
285 HVM_DBG_LOG(DBG_LEVEL_1, "msr 0x%lx msr_content 0x%"PRIx64"\n",
286 (unsigned long)regs->ecx, msr_content);
288 switch ( regs->ecx ) {
289 case MSR_EFER:
290 /* offending reserved bit will cause #GP */
291 if ( msr_content & ~(EFER_LME | EFER_LMA | EFER_NX | EFER_SCE) )
292 {
293 printk("Trying to set reserved bit in EFER: %"PRIx64"\n",
294 msr_content);
295 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
296 return 0;
297 }
299 /* LME: 0 -> 1 */
300 if ( msr_content & EFER_LME &&
301 !test_bit(VMX_CPU_STATE_LME_ENABLED, &v->arch.hvm_vmx.cpu_state) )
302 {
303 if ( vmx_paging_enabled(v) ||
304 !test_bit(VMX_CPU_STATE_PAE_ENABLED,
305 &v->arch.hvm_vmx.cpu_state) )
306 {
307 printk("Trying to set LME bit when "
308 "in paging mode or PAE bit is not set\n");
309 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
310 return 0;
311 }
313 set_bit(VMX_CPU_STATE_LME_ENABLED, &v->arch.hvm_vmx.cpu_state);
314 }
316 msr->msr_items[VMX_INDEX_MSR_EFER] = msr_content;
317 break;
319 case MSR_FS_BASE:
320 case MSR_GS_BASE:
321 if ( !(VMX_LONG_GUEST(v)) )
322 domain_crash_synchronous();
324 if ( !IS_CANO_ADDRESS(msr_content) )
325 {
326 HVM_DBG_LOG(DBG_LEVEL_1, "Not cano address of msr write\n");
327 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
328 return 0;
329 }
331 if ( regs->ecx == MSR_FS_BASE )
332 __vmwrite(GUEST_FS_BASE, msr_content);
333 else
334 __vmwrite(GUEST_GS_BASE, msr_content);
336 break;
338 case MSR_SHADOW_GS_BASE:
339 if ( !(VMX_LONG_GUEST(v)) )
340 domain_crash_synchronous();
342 v->arch.hvm_vmx.msr_content.shadow_gs = msr_content;
343 wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
344 break;
346 CASE_WRITE_MSR(STAR);
347 CASE_WRITE_MSR(LSTAR);
348 CASE_WRITE_MSR(CSTAR);
349 CASE_WRITE_MSR(SYSCALL_MASK);
351 default:
352 return 0;
353 }
355 return 1;
356 }
358 static void vmx_restore_msrs(struct vcpu *v)
359 {
360 int i = 0;
361 struct vmx_msr_state *guest_state;
362 struct vmx_msr_state *host_state;
363 unsigned long guest_flags;
365 guest_state = &v->arch.hvm_vmx.msr_content;
366 host_state = &this_cpu(percpu_msr);
368 wrmsrl(MSR_SHADOW_GS_BASE, guest_state->shadow_gs);
369 guest_flags = guest_state->flags;
370 if (!guest_flags)
371 return;
373 while (guest_flags){
374 i = find_first_set_bit(guest_flags);
376 HVM_DBG_LOG(DBG_LEVEL_2,
377 "restore guest's index %d msr %lx with %lx\n",
378 i, (unsigned long)msr_data_index[i],
379 (unsigned long)guest_state->msr_items[i]);
380 set_bit(i, &host_state->flags);
381 wrmsrl(msr_data_index[i], guest_state->msr_items[i]);
382 clear_bit(i, &guest_flags);
383 }
384 }
386 #else /* __i386__ */
388 #define vmx_save_segments(v) ((void)0)
389 #define vmx_load_msrs() ((void)0)
390 #define vmx_restore_msrs(v) ((void)0)
391 #define vmx_save_init_msrs() ((void)0)
393 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
394 {
395 return 0;
396 }
398 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
399 {
400 return 0;
401 }
403 #endif /* __i386__ */
405 #define loaddebug(_v,_reg) \
406 __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
407 #define savedebug(_v,_reg) \
408 __asm__ __volatile__ ("mov %%db" #_reg ",%0" : : "r" ((_v)->debugreg[_reg]))
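/*
 * Lazily save the guest debug registers: only if the guest actually
 * wrote to them (flag_dr_dirty), then re-enable MOV-DR intercepts so
 * the next debug-register access traps to Xen again.
 */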
410 static inline void vmx_save_dr(struct vcpu *v)
411 {
412 if ( v->arch.hvm_vcpu.flag_dr_dirty )
413 {
414 savedebug(&v->arch.guest_context, 0);
415 savedebug(&v->arch.guest_context, 1);
416 savedebug(&v->arch.guest_context, 2);
417 savedebug(&v->arch.guest_context, 3);
418 savedebug(&v->arch.guest_context, 6);
420 v->arch.hvm_vcpu.flag_dr_dirty = 0;
422 v->arch.hvm_vcpu.u.vmx.exec_control |= CPU_BASED_MOV_DR_EXITING;
423 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
424 v->arch.hvm_vcpu.u.vmx.exec_control);
425 }
426 }
428 static inline void __restore_debug_registers(struct vcpu *v)
429 {
430 loaddebug(&v->arch.guest_context, 0);
431 loaddebug(&v->arch.guest_context, 1);
432 loaddebug(&v->arch.guest_context, 2);
433 loaddebug(&v->arch.guest_context, 3);
434 /* No 4 and 5 */
435 loaddebug(&v->arch.guest_context, 6);
436 /* DR7 is loaded from the vmcs. */
437 }
439 /*
440 * DR7 is saved and restored on every vmexit. Other debug registers only
441 * need to be restored if their value is going to affect execution -- i.e.,
442 * if one of the breakpoints is enabled. So mask out all bits that don't
443 * enable some breakpoint functionality.
444 *
445 * This is in part necessary because bit 10 of DR7 is hardwired to 1, so a
446 * simple if( guest_dr7 ) will always return true. As long as we're masking,
447 * we might as well do it right.
448 */
449 #define DR7_ACTIVE_MASK 0xff
451 static inline void vmx_restore_dr(struct vcpu *v)
452 {
453 unsigned long guest_dr7;
455 __vmread(GUEST_DR7, &guest_dr7);
457 /* Assumes guest does not have DR access at time of context switch. */
458 if ( unlikely(guest_dr7 & DR7_ACTIVE_MASK) )
459 __restore_debug_registers(v);
460 }
462 static void vmx_freeze_time(struct vcpu *v)
463 {
464 struct periodic_time *pt=&v->domain->arch.hvm_domain.pl_time.periodic_tm;
466 if ( pt->enabled && pt->first_injected && !v->arch.hvm_vcpu.guest_time ) {
467 v->arch.hvm_vcpu.guest_time = hvm_get_guest_time(v);
468 stop_timer(&(pt->timer));
469 }
470 }
472 static void vmx_ctxt_switch_from(struct vcpu *v)
473 {
474 vmx_freeze_time(v);
475 vmx_save_segments(v);
476 vmx_load_msrs();
477 vmx_save_dr(v);
478 }
480 static void vmx_ctxt_switch_to(struct vcpu *v)
481 {
482 vmx_restore_msrs(v);
483 vmx_restore_dr(v);
484 }
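/*
 * Disable VMX operation on this CPU.  CR4.VMXE is set only after a
 * successful VMXON (see start_vmx() below), so if it is clear we must
 * not execute VMXOFF -- doing so would raise #UD.
 */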
486 static void stop_vmx(void)
487 {
488 if ( !(read_cr4() & X86_CR4_VMXE) )
489 return;
490 __vmxoff();
491 clear_in_cr4(X86_CR4_VMXE);
492 }
494 void vmx_migrate_timers(struct vcpu *v)
495 {
496 struct periodic_time *pt = &(v->domain->arch.hvm_domain.pl_time.periodic_tm);
498 if ( pt->enabled ) {
499 migrate_timer(&pt->timer, v->processor);
500 migrate_timer(&v->arch.hvm_vcpu.hlt_timer, v->processor);
501 }
502 if ( hvm_apic_support(v->domain) && VLAPIC(v))
503 migrate_timer(&(VLAPIC(v)->vlapic_timer), v->processor);
504 }
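/* Read the guest's general-purpose and/or control register state out of the VMCS. */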
506 static void vmx_store_cpu_guest_regs(
507 struct vcpu *v, struct cpu_user_regs *regs, unsigned long *crs)
508 {
509 vmx_vmcs_enter(v);
511 if ( regs != NULL )
512 {
513 __vmread(GUEST_RFLAGS, &regs->eflags);
514 __vmread(GUEST_SS_SELECTOR, &regs->ss);
515 __vmread(GUEST_CS_SELECTOR, &regs->cs);
516 __vmread(GUEST_DS_SELECTOR, &regs->ds);
517 __vmread(GUEST_ES_SELECTOR, &regs->es);
518 __vmread(GUEST_GS_SELECTOR, &regs->gs);
519 __vmread(GUEST_FS_SELECTOR, &regs->fs);
520 __vmread(GUEST_RIP, &regs->eip);
521 __vmread(GUEST_RSP, &regs->esp);
522 }
524 if ( crs != NULL )
525 {
526 __vmread(CR0_READ_SHADOW, &crs[0]);
527 crs[2] = v->arch.hvm_vmx.cpu_cr2;
528 __vmread(GUEST_CR3, &crs[3]);
529 __vmread(CR4_READ_SHADOW, &crs[4]);
530 }
532 vmx_vmcs_exit(v);
533 }
535 /*
536 * The VMX spec (section 4.3.1.2, Checks on Guest Segment
537 * Registers) says that virtual-8086 mode guests' segment
538 * base-address fields in the VMCS must be equal to their
539 * corresponding segment selector field shifted right by
540 * four bits upon vmentry.
541 *
542 * This function (called only for VM86-mode guests) fixes
543 * the bases to be consistent with the selectors in regs
544 * if they're not already. Without this, we can fail the
545 * vmentry check mentioned above.
546 */
547 static void fixup_vm86_seg_bases(struct cpu_user_regs *regs)
548 {
549 int err = 0;
550 unsigned long base;
552 err |= __vmread(GUEST_ES_BASE, &base);
553 if (regs->es << 4 != base)
554 err |= __vmwrite(GUEST_ES_BASE, regs->es << 4);
555 err |= __vmread(GUEST_CS_BASE, &base);
556 if (regs->cs << 4 != base)
557 err |= __vmwrite(GUEST_CS_BASE, regs->cs << 4);
558 err |= __vmread(GUEST_SS_BASE, &base);
559 if (regs->ss << 4 != base)
560 err |= __vmwrite(GUEST_SS_BASE, regs->ss << 4);
561 err |= __vmread(GUEST_DS_BASE, &base);
562 if (regs->ds << 4 != base)
563 err |= __vmwrite(GUEST_DS_BASE, regs->ds << 4);
564 err |= __vmread(GUEST_FS_BASE, &base);
565 if (regs->fs << 4 != base)
566 err |= __vmwrite(GUEST_FS_BASE, regs->fs << 4);
567 err |= __vmread(GUEST_GS_BASE, &base);
568 if (regs->gs << 4 != base)
569 err |= __vmwrite(GUEST_GS_BASE, regs->gs << 4);
571 BUG_ON(err);
572 }
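/*
 * Write the guest's register state back into the VMCS, adjusting the
 * #DB intercept for single-stepping (EFLAGS.TF) and fixing up the VM86
 * segment bases when EFLAGS.VM is set.
 */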
574 static void vmx_load_cpu_guest_regs(struct vcpu *v, struct cpu_user_regs *regs)
575 {
576 vmx_vmcs_enter(v);
578 __vmwrite(GUEST_SS_SELECTOR, regs->ss);
579 __vmwrite(GUEST_DS_SELECTOR, regs->ds);
580 __vmwrite(GUEST_ES_SELECTOR, regs->es);
581 __vmwrite(GUEST_GS_SELECTOR, regs->gs);
582 __vmwrite(GUEST_FS_SELECTOR, regs->fs);
584 __vmwrite(GUEST_RSP, regs->esp);
586 __vmwrite(GUEST_RFLAGS, regs->eflags);
587 if (regs->eflags & EF_TF)
588 __vm_set_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
589 else
590 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
591 if (regs->eflags & EF_VM)
592 fixup_vm86_seg_bases(regs);
594 __vmwrite(GUEST_CS_SELECTOR, regs->cs);
595 __vmwrite(GUEST_RIP, regs->eip);
597 vmx_vmcs_exit(v);
598 }
600 static unsigned long vmx_get_ctrl_reg(struct vcpu *v, unsigned int num)
601 {
602 switch ( num )
603 {
604 case 0:
605 return v->arch.hvm_vmx.cpu_cr0;
606 case 2:
607 return v->arch.hvm_vmx.cpu_cr2;
608 case 3:
609 return v->arch.hvm_vmx.cpu_cr3;
610 case 4:
611 return v->arch.hvm_vmx.cpu_shadow_cr4;
612 default:
613 BUG();
614 }
615 return 0; /* dummy */
616 }
620 /* Make sure that Xen intercepts any FP accesses from current. */
621 static void vmx_stts(struct vcpu *v)
622 {
623 unsigned long cr0;
625 /* VMX depends on operating on the current vcpu */
626 ASSERT(v == current);
628 /*
629 * If the guest does not have TS enabled then we must cause and handle an
630 * exception on first use of the FPU. If the guest *does* have TS enabled
631 * then this is not necessary: no FPU activity can occur until the guest
632 * clears CR0.TS, and we will initialise the FPU when that happens.
633 */
634 __vmread_vcpu(v, CR0_READ_SHADOW, &cr0);
635 if ( !(cr0 & X86_CR0_TS) )
636 {
637 __vmread_vcpu(v, GUEST_CR0, &cr0);
638 __vmwrite(GUEST_CR0, cr0 | X86_CR0_TS);
639 __vm_set_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
640 }
641 }
644 static void vmx_set_tsc_offset(struct vcpu *v, u64 offset)
645 {
646 /* VMX depends on operating on the current vcpu */
647 ASSERT(v == current);
649 __vmwrite(TSC_OFFSET, offset);
650 #if defined (__i386__)
651 __vmwrite(TSC_OFFSET_HIGH, offset >> 32);
652 #endif
653 }
657 /* SMP VMX guest support */
658 static void vmx_init_ap_context(struct vcpu_guest_context *ctxt,
659 int vcpuid, int trampoline_vector)
660 {
661 int i;
663 memset(ctxt, 0, sizeof(*ctxt));
665 /*
666 * Initial register values:
667 */
668 ctxt->user_regs.eip = VMXASSIST_BASE;
669 ctxt->user_regs.edx = vcpuid;
670 ctxt->user_regs.ebx = trampoline_vector;
672 ctxt->flags = VGCF_HVM_GUEST;
674 /* Virtual IDT is empty at start-of-day. */
675 for ( i = 0; i < 256; i++ )
676 {
677 ctxt->trap_ctxt[i].vector = i;
678 ctxt->trap_ctxt[i].cs = FLAT_KERNEL_CS;
679 }
681 /* No callback handlers. */
682 #if defined(__i386__)
683 ctxt->event_callback_cs = FLAT_KERNEL_CS;
684 ctxt->failsafe_callback_cs = FLAT_KERNEL_CS;
685 #endif
686 }
688 void do_nmi(struct cpu_user_regs *);
690 static void vmx_init_hypercall_page(struct domain *d, void *hypercall_page)
691 {
692 char *p;
693 int i;
695 memset(hypercall_page, 0, PAGE_SIZE);
697 for ( i = 0; i < (PAGE_SIZE / 32); i++ )
698 {
699 p = (char *)(hypercall_page + (i * 32));
700 *(u8 *)(p + 0) = 0xb8; /* mov imm32, %eax */
701 *(u32 *)(p + 1) = i;
702 *(u8 *)(p + 5) = 0x0f; /* vmcall */
703 *(u8 *)(p + 6) = 0x01;
704 *(u8 *)(p + 7) = 0xc1;
705 *(u8 *)(p + 8) = 0xc3; /* ret */
706 }
708 /* Don't support HYPERVISOR_iret at the moment */
709 *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
710 }
712 static int vmx_realmode(struct vcpu *v)
713 {
714 unsigned long rflags;
716 ASSERT(v == current);
718 __vmread(GUEST_RFLAGS, &rflags);
719 return rflags & X86_EFLAGS_VM;
720 }
722 static int vmx_guest_x86_mode(struct vcpu *v)
723 {
724 unsigned long cs_ar_bytes;
726 ASSERT(v == current);
728 __vmread(GUEST_CS_AR_BYTES, &cs_ar_bytes);
730 if ( vmx_long_mode_enabled(v) )
731 return ((cs_ar_bytes & (1u<<13)) ?
732 X86EMUL_MODE_PROT64 : X86EMUL_MODE_PROT32);
734 if ( vmx_realmode(v) )
735 return X86EMUL_MODE_REAL;
737 return ((cs_ar_bytes & (1u<<14)) ?
738 X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16);
739 }
741 /* Setup HVM interfaces */
742 static void vmx_setup_hvm_funcs(void)
743 {
744 if ( hvm_enabled )
745 return;
747 hvm_funcs.disable = stop_vmx;
749 hvm_funcs.initialize_guest_resources = vmx_initialize_guest_resources;
750 hvm_funcs.relinquish_guest_resources = vmx_relinquish_guest_resources;
752 hvm_funcs.store_cpu_guest_regs = vmx_store_cpu_guest_regs;
753 hvm_funcs.load_cpu_guest_regs = vmx_load_cpu_guest_regs;
755 hvm_funcs.realmode = vmx_realmode;
756 hvm_funcs.paging_enabled = vmx_paging_enabled;
757 hvm_funcs.long_mode_enabled = vmx_long_mode_enabled;
758 hvm_funcs.pae_enabled = vmx_pae_enabled;
759 hvm_funcs.guest_x86_mode = vmx_guest_x86_mode;
760 hvm_funcs.get_guest_ctrl_reg = vmx_get_ctrl_reg;
762 hvm_funcs.update_host_cr3 = vmx_update_host_cr3;
764 hvm_funcs.stts = vmx_stts;
765 hvm_funcs.set_tsc_offset = vmx_set_tsc_offset;
767 hvm_funcs.init_ap_context = vmx_init_ap_context;
769 hvm_funcs.init_hypercall_page = vmx_init_hypercall_page;
770 }
772 int start_vmx(void)
773 {
774 u32 eax, edx;
775 struct vmcs_struct *vmcs;
777 /*
778 * Xen does not fill x86_capability words except 0.
779 */
780 boot_cpu_data.x86_capability[4] = cpuid_ecx(1);
782 if ( !test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability) )
783 return 0;
785 rdmsr(IA32_FEATURE_CONTROL_MSR, eax, edx);
787 if ( eax & IA32_FEATURE_CONTROL_MSR_LOCK )
788 {
789 if ( (eax & IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON) == 0x0 )
790 {
791 printk("VMX disabled by Feature Control MSR.\n");
792 return 0;
793 }
794 }
795 else
796 {
797 wrmsr(IA32_FEATURE_CONTROL_MSR,
798 IA32_FEATURE_CONTROL_MSR_LOCK |
799 IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON, 0);
800 }
802 set_in_cr4(X86_CR4_VMXE);
804 vmx_init_vmcs_config();
806 if ( smp_processor_id() == 0 )
807 setup_vmcs_dump();
809 if ( (vmcs = vmx_alloc_host_vmcs()) == NULL )
810 {
811 clear_in_cr4(X86_CR4_VMXE);
812 printk("Failed to allocate host VMCS\n");
813 return 0;
814 }
816 if ( __vmxon(virt_to_maddr(vmcs)) )
817 {
818 clear_in_cr4(X86_CR4_VMXE);
819 printk("VMXON failed\n");
820 vmx_free_host_vmcs(vmcs);
821 return 0;
822 }
824 printk("VMXON is done\n");
826 vmx_save_init_msrs();
828 vmx_setup_hvm_funcs();
830 hvm_enabled = 1;
832 return 1;
833 }
835 /*
836 * Not all cases receive a valid value in the VM-exit instruction length field.
837 * Callers must know what they're doing!
838 */
839 static int __get_instruction_length(void)
840 {
841 int len;
842 __vmread(VM_EXIT_INSTRUCTION_LEN, &len); /* Safe: callers audited */
843 if ( (len < 1) || (len > 15) )
844 __hvm_bug(guest_cpu_user_regs());
845 return len;
846 }
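/* Advance the guest RIP past the exiting instruction and clear any interruptibility (MOV-SS/STI) blocking. */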
848 static void inline __update_guest_eip(unsigned long inst_len)
849 {
850 unsigned long current_eip;
852 __vmread(GUEST_RIP, &current_eip);
853 __vmwrite(GUEST_RIP, current_eip + inst_len);
854 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
855 }
857 static int vmx_do_page_fault(unsigned long va, struct cpu_user_regs *regs)
858 {
859 int result;
861 #if 0 /* keep for debugging */
862 {
863 unsigned long eip, cs;
865 __vmread(GUEST_CS_BASE, &cs);
866 __vmread(GUEST_RIP, &eip);
867 HVM_DBG_LOG(DBG_LEVEL_VMMU,
868 "vmx_do_page_fault = 0x%lx, cs_base=%lx, "
869 "eip = %lx, error_code = %lx\n",
870 va, cs, eip, (unsigned long)regs->error_code);
871 }
872 #endif
874 result = shadow_fault(va, regs);
876 TRACE_VMEXIT (2,result);
877 #if 0
878 if ( !result )
879 {
880 __vmread(GUEST_RIP, &eip);
881 printk("vmx pgfault to guest va=%lx eip=%lx\n", va, eip);
882 }
883 #endif
885 return result;
886 }
888 static void vmx_do_no_device_fault(void)
889 {
890 unsigned long cr0;
891 struct vcpu *v = current;
893 setup_fpu(current);
894 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
896 /* Disable TS in guest CR0 unless the guest wants the exception too. */
897 __vmread_vcpu(v, CR0_READ_SHADOW, &cr0);
898 if ( !(cr0 & X86_CR0_TS) )
899 {
900 __vmread_vcpu(v, GUEST_CR0, &cr0);
901 cr0 &= ~X86_CR0_TS;
902 __vmwrite(GUEST_CR0, cr0);
903 }
904 }
906 #define bitmaskof(idx) (1U << ((idx)&31))
907 static void vmx_vmexit_do_cpuid(struct cpu_user_regs *regs)
908 {
909 unsigned int input = (unsigned int)regs->eax;
910 unsigned int count = (unsigned int)regs->ecx;
911 unsigned int eax, ebx, ecx, edx;
912 unsigned long eip;
913 struct vcpu *v = current;
915 __vmread(GUEST_RIP, &eip);
917 HVM_DBG_LOG(DBG_LEVEL_3, "(eax) 0x%08lx, (ebx) 0x%08lx, "
918 "(ecx) 0x%08lx, (edx) 0x%08lx, (esi) 0x%08lx, (edi) 0x%08lx",
919 (unsigned long)regs->eax, (unsigned long)regs->ebx,
920 (unsigned long)regs->ecx, (unsigned long)regs->edx,
921 (unsigned long)regs->esi, (unsigned long)regs->edi);
923 if ( input == CPUID_LEAF_0x4 )
924 {
925 cpuid_count(input, count, &eax, &ebx, &ecx, &edx);
926 eax &= NUM_CORES_RESET_MASK;
927 }
928 else if ( !cpuid_hypervisor_leaves(input, &eax, &ebx, &ecx, &edx) )
929 {
930 cpuid(input, &eax, &ebx, &ecx, &edx);
932 if ( input == CPUID_LEAF_0x1 )
933 {
934 /* mask off reserved bits */
935 ecx &= ~VMX_VCPU_CPUID_L1_ECX_RESERVED;
937 if ( !hvm_apic_support(v->domain) ||
938 !vlapic_global_enabled((VLAPIC(v))) )
939 {
940 /* Since the APIC is disabled, avoid any
941 confusion about SMP CPUs being available. */
943 clear_bit(X86_FEATURE_APIC, &edx);
944 }
946 #if CONFIG_PAGING_LEVELS >= 3
947 if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
948 #endif
949 clear_bit(X86_FEATURE_PAE, &edx);
950 clear_bit(X86_FEATURE_PSE36, &edx);
952 ebx &= NUM_THREADS_RESET_MASK;
954 /* Unsupportable for virtualised CPUs. */
955 ecx &= ~(bitmaskof(X86_FEATURE_VMXE) |
956 bitmaskof(X86_FEATURE_EST) |
957 bitmaskof(X86_FEATURE_TM2) |
958 bitmaskof(X86_FEATURE_CID) |
959 bitmaskof(X86_FEATURE_MWAIT) );
961 edx &= ~( bitmaskof(X86_FEATURE_HT) |
962 bitmaskof(X86_FEATURE_ACPI) |
963 bitmaskof(X86_FEATURE_ACC) );
964 }
965 else if ( ( input == CPUID_LEAF_0x6 )
966 || ( input == CPUID_LEAF_0x9 )
967 || ( input == CPUID_LEAF_0xA ))
968 {
969 eax = ebx = ecx = edx = 0x0;
970 }
971 #ifdef __i386__
972 else if ( input == CPUID_LEAF_0x80000001 )
973 {
974 clear_bit(X86_FEATURE_LAHF_LM & 31, &ecx);
976 clear_bit(X86_FEATURE_LM & 31, &edx);
977 clear_bit(X86_FEATURE_SYSCALL & 31, &edx);
978 }
979 #endif
980 }
982 regs->eax = (unsigned long) eax;
983 regs->ebx = (unsigned long) ebx;
984 regs->ecx = (unsigned long) ecx;
985 regs->edx = (unsigned long) edx;
987 HVM_DBG_LOG(DBG_LEVEL_3, "eip@%lx, input: 0x%lx, "
988 "output: eax = 0x%08lx, ebx = 0x%08lx, "
989 "ecx = 0x%08lx, edx = 0x%08lx",
990 (unsigned long)eip, (unsigned long)input,
991 (unsigned long)eax, (unsigned long)ebx,
992 (unsigned long)ecx, (unsigned long)edx);
993 }
995 #define CASE_GET_REG_P(REG, reg) \
996 case REG_ ## REG: reg_p = (unsigned long *)&(regs->reg); break
998 #ifdef __i386__
999 #define CASE_EXTEND_GET_REG_P
1000 #else
1001 #define CASE_EXTEND_GET_REG_P \
1002 CASE_GET_REG_P(R8, r8); \
1003 CASE_GET_REG_P(R9, r9); \
1004 CASE_GET_REG_P(R10, r10); \
1005 CASE_GET_REG_P(R11, r11); \
1006 CASE_GET_REG_P(R12, r12); \
1007 CASE_GET_REG_P(R13, r13); \
1008 CASE_GET_REG_P(R14, r14); \
1009 CASE_GET_REG_P(R15, r15)
1010 #endif
1012 static void vmx_dr_access(unsigned long exit_qualification,
1013 struct cpu_user_regs *regs)
1015 struct vcpu *v = current;
1017 v->arch.hvm_vcpu.flag_dr_dirty = 1;
1019 /* We could probably be smarter about this */
1020 __restore_debug_registers(v);
1022 /* Allow guest direct access to DR registers */
1023 v->arch.hvm_vcpu.u.vmx.exec_control &= ~CPU_BASED_MOV_DR_EXITING;
1024 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
1025 v->arch.hvm_vcpu.u.vmx.exec_control);
1028 /*
1029 * Invalidate the TLB for va. Invalidate the shadow page corresponding
1030 * to the address va.
1031 */
1032 static void vmx_vmexit_do_invlpg(unsigned long va)
1034 unsigned long eip;
1035 struct vcpu *v = current;
1037 __vmread(GUEST_RIP, &eip);
1039 HVM_DBG_LOG(DBG_LEVEL_VMMU, "vmx_vmexit_do_invlpg: eip=%lx, va=%lx",
1040 eip, va);
1042 /*
1043 * We do the safest thing first, then try to update the shadow,
1044 * copying from the guest.
1045 */
1046 shadow_invlpg(v, va);
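/*
 * Scan the segment-override prefixes of the faulting INS/OUTS
 * instruction to find which selector it uses, and return 1 if that
 * selector is null (the linear address is then invalid).
 */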
1050 static int check_for_null_selector(unsigned long eip)
1052 unsigned char inst[MAX_INST_LEN];
1053 unsigned long sel;
1054 int i, inst_len;
1055 int inst_copy_from_guest(unsigned char *, unsigned long, int);
1057 inst_len = __get_instruction_length(); /* Safe: INS/OUTS */
1058 memset(inst, 0, MAX_INST_LEN);
1059 if ( inst_copy_from_guest(inst, eip, inst_len) != inst_len )
1061 printf("check_for_null_selector: get guest instruction failed\n");
1062 domain_crash_synchronous();
1065 for ( i = 0; i < inst_len; i++ )
1067 switch ( inst[i] )
1069 case 0xf3: /* REPZ */
1070 case 0xf2: /* REPNZ */
1071 case 0xf0: /* LOCK */
1072 case 0x66: /* data32 */
1073 case 0x67: /* addr32 */
1074 continue;
1075 case 0x2e: /* CS */
1076 __vmread(GUEST_CS_SELECTOR, &sel);
1077 break;
1078 case 0x36: /* SS */
1079 __vmread(GUEST_SS_SELECTOR, &sel);
1080 break;
1081 case 0x26: /* ES */
1082 __vmread(GUEST_ES_SELECTOR, &sel);
1083 break;
1084 case 0x64: /* FS */
1085 __vmread(GUEST_FS_SELECTOR, &sel);
1086 break;
1087 case 0x65: /* GS */
1088 __vmread(GUEST_GS_SELECTOR, &sel);
1089 break;
1090 case 0x3e: /* DS */
1091 /* FALLTHROUGH */
1092 default:
1093 /* DS is the default */
1094 __vmread(GUEST_DS_SELECTOR, &sel);
1096 return sel == 0 ? 1 : 0;
1099 return 0;
1102 extern void send_pio_req(struct cpu_user_regs *regs, unsigned long port,
1103 unsigned long count, int size, long value,
1104 int dir, int pvalid);
1106 static void vmx_io_instruction(unsigned long exit_qualification,
1107 unsigned long inst_len)
1109 struct cpu_user_regs *regs;
1110 struct hvm_io_op *pio_opp;
1111 unsigned long eip, cs, eflags;
1112 unsigned long port, size, dir;
1113 int vm86;
1115 pio_opp = &current->arch.hvm_vcpu.io_op;
1116 pio_opp->instr = INSTR_PIO;
1117 pio_opp->flags = 0;
1119 regs = &pio_opp->io_context;
1121 /* Copy current guest state into io instruction state structure. */
1122 memcpy(regs, guest_cpu_user_regs(), HVM_CONTEXT_STACK_BYTES);
1123 hvm_store_cpu_guest_regs(current, regs, NULL);
1125 eip = regs->eip;
1126 cs = regs->cs;
1127 eflags = regs->eflags;
1129 vm86 = eflags & X86_EFLAGS_VM ? 1 : 0;
1131 HVM_DBG_LOG(DBG_LEVEL_IO,
1132 "vmx_io_instruction: vm86 %d, eip=%lx:%lx, "
1133 "exit_qualification = %lx",
1134 vm86, cs, eip, exit_qualification);
1136 if (test_bit(6, &exit_qualification))
1137 port = (exit_qualification >> 16) & 0xFFFF;
1138 else
1139 port = regs->edx & 0xffff;
1140 TRACE_VMEXIT(1, port);
1141 size = (exit_qualification & 7) + 1;
1142 dir = test_bit(3, &exit_qualification); /* direction */
1144 if (test_bit(4, &exit_qualification)) { /* string instruction */
1145 unsigned long addr, count = 1;
1146 int sign = regs->eflags & EF_DF ? -1 : 1;
1148 __vmread(GUEST_LINEAR_ADDRESS, &addr);
1150 /*
1151 * In protected mode, guest linear address is invalid if the
1152 * selector is null.
1153 */
1154 if (!vm86 && check_for_null_selector(eip))
1155 addr = dir == IOREQ_WRITE ? regs->esi : regs->edi;
1157 if (test_bit(5, &exit_qualification)) { /* "rep" prefix */
1158 pio_opp->flags |= REPZ;
1159 count = vm86 ? regs->ecx & 0xFFFF : regs->ecx;
1162 /*
1163 * Handle string pio instructions that cross pages or that
1164 * are unaligned. See the comments in hvm_domain.c/handle_mmio()
1165 */
1166 if ((addr & PAGE_MASK) != ((addr + size - 1) & PAGE_MASK)) {
1167 unsigned long value = 0;
1169 pio_opp->flags |= OVERLAP;
1170 if (dir == IOREQ_WRITE)
1171 (void)hvm_copy_from_guest_virt(&value, addr, size);
1172 send_pio_req(regs, port, 1, size, value, dir, 0);
1173 } else {
1174 if ((addr & PAGE_MASK) != ((addr + count * size - 1) & PAGE_MASK)) {
1175 if (sign > 0)
1176 count = (PAGE_SIZE - (addr & ~PAGE_MASK)) / size;
1177 else
1178 count = (addr & ~PAGE_MASK) / size;
1179 } else
1180 regs->eip += inst_len;
1182 send_pio_req(regs, port, count, size, addr, dir, 1);
1184 } else {
1185 if (port == 0xe9 && dir == IOREQ_WRITE && size == 1)
1186 hvm_print_line(current, regs->eax); /* guest debug output */
1188 regs->eip += inst_len;
1189 send_pio_req(regs, port, 1, size, regs->eax, dir, 0);
1193 static int vmx_world_save(struct vcpu *v, struct vmx_assist_context *c)
1195 int error = 0;
1197 /* NB. Skip transition instruction. */
1198 error |= __vmread(GUEST_RIP, &c->eip);
1199 c->eip += __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
1201 error |= __vmread(GUEST_RSP, &c->esp);
1202 error |= __vmread(GUEST_RFLAGS, &c->eflags);
1204 error |= __vmread(CR0_READ_SHADOW, &c->cr0);
1205 c->cr3 = v->arch.hvm_vmx.cpu_cr3;
1206 error |= __vmread(CR4_READ_SHADOW, &c->cr4);
1208 error |= __vmread(GUEST_IDTR_LIMIT, &c->idtr_limit);
1209 error |= __vmread(GUEST_IDTR_BASE, &c->idtr_base);
1211 error |= __vmread(GUEST_GDTR_LIMIT, &c->gdtr_limit);
1212 error |= __vmread(GUEST_GDTR_BASE, &c->gdtr_base);
1214 error |= __vmread(GUEST_CS_SELECTOR, &c->cs_sel);
1215 error |= __vmread(GUEST_CS_LIMIT, &c->cs_limit);
1216 error |= __vmread(GUEST_CS_BASE, &c->cs_base);
1217 error |= __vmread(GUEST_CS_AR_BYTES, &c->cs_arbytes.bytes);
1219 error |= __vmread(GUEST_DS_SELECTOR, &c->ds_sel);
1220 error |= __vmread(GUEST_DS_LIMIT, &c->ds_limit);
1221 error |= __vmread(GUEST_DS_BASE, &c->ds_base);
1222 error |= __vmread(GUEST_DS_AR_BYTES, &c->ds_arbytes.bytes);
1224 error |= __vmread(GUEST_ES_SELECTOR, &c->es_sel);
1225 error |= __vmread(GUEST_ES_LIMIT, &c->es_limit);
1226 error |= __vmread(GUEST_ES_BASE, &c->es_base);
1227 error |= __vmread(GUEST_ES_AR_BYTES, &c->es_arbytes.bytes);
1229 error |= __vmread(GUEST_SS_SELECTOR, &c->ss_sel);
1230 error |= __vmread(GUEST_SS_LIMIT, &c->ss_limit);
1231 error |= __vmread(GUEST_SS_BASE, &c->ss_base);
1232 error |= __vmread(GUEST_SS_AR_BYTES, &c->ss_arbytes.bytes);
1234 error |= __vmread(GUEST_FS_SELECTOR, &c->fs_sel);
1235 error |= __vmread(GUEST_FS_LIMIT, &c->fs_limit);
1236 error |= __vmread(GUEST_FS_BASE, &c->fs_base);
1237 error |= __vmread(GUEST_FS_AR_BYTES, &c->fs_arbytes.bytes);
1239 error |= __vmread(GUEST_GS_SELECTOR, &c->gs_sel);
1240 error |= __vmread(GUEST_GS_LIMIT, &c->gs_limit);
1241 error |= __vmread(GUEST_GS_BASE, &c->gs_base);
1242 error |= __vmread(GUEST_GS_AR_BYTES, &c->gs_arbytes.bytes);
1244 error |= __vmread(GUEST_TR_SELECTOR, &c->tr_sel);
1245 error |= __vmread(GUEST_TR_LIMIT, &c->tr_limit);
1246 error |= __vmread(GUEST_TR_BASE, &c->tr_base);
1247 error |= __vmread(GUEST_TR_AR_BYTES, &c->tr_arbytes.bytes);
1249 error |= __vmread(GUEST_LDTR_SELECTOR, &c->ldtr_sel);
1250 error |= __vmread(GUEST_LDTR_LIMIT, &c->ldtr_limit);
1251 error |= __vmread(GUEST_LDTR_BASE, &c->ldtr_base);
1252 error |= __vmread(GUEST_LDTR_AR_BYTES, &c->ldtr_arbytes.bytes);
1254 return !error;
1257 static int vmx_world_restore(struct vcpu *v, struct vmx_assist_context *c)
1259 unsigned long mfn, old_cr4, old_base_mfn;
1260 int error = 0;
1262 error |= __vmwrite(GUEST_RIP, c->eip);
1263 error |= __vmwrite(GUEST_RSP, c->esp);
1264 error |= __vmwrite(GUEST_RFLAGS, c->eflags);
1266 error |= __vmwrite(CR0_READ_SHADOW, c->cr0);
1268 if (!vmx_paging_enabled(v))
1269 goto skip_cr3;
1271 if (c->cr3 == v->arch.hvm_vmx.cpu_cr3) {
1272 /*
1273 * This is a simple TLB flush, implying the guest has
1274 * removed some translation or changed page attributes.
1275 * We simply invalidate the shadow.
1276 */
1277 mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
1278 if (mfn != pagetable_get_pfn(v->arch.guest_table)) {
1279 printk("Invalid CR3 value=%x", c->cr3);
1280 domain_crash_synchronous();
1281 return 0;
1283 } else {
1284 /*
1285 * If different, make a shadow. Check if the PDBR is valid
1286 * first.
1287 */
1288 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %x", c->cr3);
1289 if ((c->cr3 >> PAGE_SHIFT) > v->domain->max_pages) {
1290 printk("Invalid CR3 value=%x", c->cr3);
1291 domain_crash_synchronous();
1292 return 0;
1294 mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
1295 if(!get_page(mfn_to_page(mfn), v->domain))
1296 return 0;
1297 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1298 v->arch.guest_table = pagetable_from_pfn(mfn);
1299 if (old_base_mfn)
1300 put_page(mfn_to_page(old_base_mfn));
1301 /*
1302 * arch.shadow_table should now hold the next CR3 for shadow
1303 */
1304 v->arch.hvm_vmx.cpu_cr3 = c->cr3;
1307 skip_cr3:
1309 shadow_update_paging_modes(v);
1310 if (!vmx_paging_enabled(v))
1311 HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
1312 else
1313 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3);
1314 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
1316 error |= __vmread(CR4_READ_SHADOW, &old_cr4);
1317 error |= __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
1318 error |= __vmwrite(CR4_READ_SHADOW, c->cr4);
1320 error |= __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
1321 error |= __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
1323 error |= __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
1324 error |= __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
1326 error |= __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
1327 error |= __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
1328 error |= __vmwrite(GUEST_CS_BASE, c->cs_base);
1329 error |= __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes.bytes);
1331 error |= __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
1332 error |= __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
1333 error |= __vmwrite(GUEST_DS_BASE, c->ds_base);
1334 error |= __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes.bytes);
1336 error |= __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
1337 error |= __vmwrite(GUEST_ES_LIMIT, c->es_limit);
1338 error |= __vmwrite(GUEST_ES_BASE, c->es_base);
1339 error |= __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes.bytes);
1341 error |= __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
1342 error |= __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
1343 error |= __vmwrite(GUEST_SS_BASE, c->ss_base);
1344 error |= __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes.bytes);
1346 error |= __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
1347 error |= __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
1348 error |= __vmwrite(GUEST_FS_BASE, c->fs_base);
1349 error |= __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes.bytes);
1351 error |= __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
1352 error |= __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
1353 error |= __vmwrite(GUEST_GS_BASE, c->gs_base);
1354 error |= __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes.bytes);
1356 error |= __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
1357 error |= __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
1358 error |= __vmwrite(GUEST_TR_BASE, c->tr_base);
1359 error |= __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes.bytes);
1361 error |= __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
1362 error |= __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
1363 error |= __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
1364 error |= __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes.bytes);
1366 return !error;
1369 enum { VMX_ASSIST_INVOKE = 0, VMX_ASSIST_RESTORE };
1371 static int vmx_assist(struct vcpu *v, int mode)
1373 struct vmx_assist_context c;
1374 u32 magic;
1375 u32 cp;
1377 /* make sure vmxassist exists (this is not an error) */
1378 if (hvm_copy_from_guest_phys(&magic, VMXASSIST_MAGIC_OFFSET,
1379 sizeof(magic)))
1380 return 0;
1381 if (magic != VMXASSIST_MAGIC)
1382 return 0;
1384 switch (mode) {
1385 /*
1386 * Transfer control to vmxassist.
1387 * Store the current context in VMXASSIST_OLD_CONTEXT and load
1388 * the new VMXASSIST_NEW_CONTEXT context. This context was created
1389 * by vmxassist and will transfer control to it.
1390 */
1391 case VMX_ASSIST_INVOKE:
1392 /* save the old context */
1393 if (hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)))
1394 goto error;
1395 if (cp != 0) {
1396 if (!vmx_world_save(v, &c))
1397 goto error;
1398 if (hvm_copy_to_guest_phys(cp, &c, sizeof(c)))
1399 goto error;
1402 /* restore the new context, this should activate vmxassist */
1403 if (hvm_copy_from_guest_phys(&cp, VMXASSIST_NEW_CONTEXT, sizeof(cp)))
1404 goto error;
1405 if (cp != 0) {
1406 if (hvm_copy_from_guest_phys(&c, cp, sizeof(c)))
1407 goto error;
1408 if (!vmx_world_restore(v, &c))
1409 goto error;
1410 return 1;
1412 break;
1414 /*
1415 * Restore the VMXASSIST_OLD_CONTEXT that was saved by
1416 * VMX_ASSIST_INVOKE above.
1417 */
1418 case VMX_ASSIST_RESTORE:
1419 /* fetch the context saved by VMX_ASSIST_INVOKE so it can be restored */
1420 if (hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)))
1421 goto error;
1422 if (cp != 0) {
1423 if (hvm_copy_from_guest_phys(&c, cp, sizeof(c)))
1424 goto error;
1425 if (!vmx_world_restore(v, &c))
1426 goto error;
1427 return 1;
1429 break;
1432 error:
1433 printf("Failed to transfer to vmxassist\n");
1434 domain_crash_synchronous();
1435 return 0;
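/*
 * Emulate a guest write to CR0: keep PE/PG/NE set in the hardware CR0,
 * track guest paging being enabled or disabled (building or dropping
 * the shadow page tables), and world-switch to vmxassist when the
 * guest clears CR0.PE.
 */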
1438 static int vmx_set_cr0(unsigned long value)
1440 struct vcpu *v = current;
1441 unsigned long mfn;
1442 unsigned long eip;
1443 int paging_enabled;
1444 unsigned long vm_entry_value;
1445 unsigned long old_cr0;
1446 unsigned long old_base_mfn;
1448 /*
1449 * CR0: We don't want to lose PE and PG.
1450 */
1451 __vmread_vcpu(v, CR0_READ_SHADOW, &old_cr0);
1452 paging_enabled = (old_cr0 & X86_CR0_PE) && (old_cr0 & X86_CR0_PG);
1454 /* TS cleared? Then initialise FPU now. */
1455 if ( !(value & X86_CR0_TS) )
1457 setup_fpu(v);
1458 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
1461 __vmwrite(GUEST_CR0, value | X86_CR0_PE | X86_CR0_PG | X86_CR0_NE);
1462 __vmwrite(CR0_READ_SHADOW, value);
1464 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx\n", value);
1466 if ( (value & X86_CR0_PE) && (value & X86_CR0_PG) && !paging_enabled )
1468 /*
1469 * Trying to enable guest paging.
1470 * The guest CR3 must be pointing to the guest physical.
1471 */
1472 if ( !VALID_MFN(mfn = get_mfn_from_gpfn(
1473 v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)) ||
1474 !get_page(mfn_to_page(mfn), v->domain) )
1476 printk("Invalid CR3 value = %lx (mfn=%lx)\n",
1477 v->arch.hvm_vmx.cpu_cr3, mfn);
1478 domain_crash_synchronous(); /* need to take a clean path */
1481 #if defined(__x86_64__)
1482 if ( test_bit(VMX_CPU_STATE_LME_ENABLED,
1483 &v->arch.hvm_vmx.cpu_state) &&
1484 !test_bit(VMX_CPU_STATE_PAE_ENABLED,
1485 &v->arch.hvm_vmx.cpu_state) )
1487 HVM_DBG_LOG(DBG_LEVEL_1, "Enable paging before PAE enabled\n");
1488 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
1491 if ( test_bit(VMX_CPU_STATE_LME_ENABLED,
1492 &v->arch.hvm_vmx.cpu_state) )
1494 /* At this point PAE should already be enabled. */
1495 HVM_DBG_LOG(DBG_LEVEL_1, "Enable long mode\n");
1496 set_bit(VMX_CPU_STATE_LMA_ENABLED,
1497 &v->arch.hvm_vmx.cpu_state);
1499 __vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
1500 vm_entry_value |= VM_ENTRY_IA32E_MODE;
1501 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1503 #endif
1505 /*
1506 * Now arch.guest_table points to machine physical.
1507 */
1508 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1509 v->arch.guest_table = pagetable_from_pfn(mfn);
1510 if (old_base_mfn)
1511 put_page(mfn_to_page(old_base_mfn));
1512 shadow_update_paging_modes(v);
1514 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
1515 (unsigned long) (mfn << PAGE_SHIFT));
1517 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
1518 /*
1519 * arch->shadow_table should hold the next CR3 for shadow
1520 */
1521 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
1522 v->arch.hvm_vmx.cpu_cr3, mfn);
1525 if ( !((value & X86_CR0_PE) && (value & X86_CR0_PG)) && paging_enabled )
1526 if ( v->arch.hvm_vmx.cpu_cr3 ) {
1527 put_page(mfn_to_page(get_mfn_from_gpfn(
1528 v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)));
1529 v->arch.guest_table = pagetable_null();
1532 /*
1533 * VMX does not implement real-mode virtualization. We emulate
1534 * real-mode by performing a world switch to VMXAssist whenever
1535 * a partition disables the CR0.PE bit.
1536 */
1537 if ( (value & X86_CR0_PE) == 0 )
1539 if ( value & X86_CR0_PG ) {
1540 /* inject GP here */
1541 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
1542 return 0;
1543 } else {
1544 /*
1545 * Disable paging here.
1546 * Same to PE == 1 && PG == 0
1547 */
1548 if ( test_bit(VMX_CPU_STATE_LMA_ENABLED,
1549 &v->arch.hvm_vmx.cpu_state) )
1551 clear_bit(VMX_CPU_STATE_LMA_ENABLED,
1552 &v->arch.hvm_vmx.cpu_state);
1553 __vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
1554 vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
1555 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1559 if ( vmx_assist(v, VMX_ASSIST_INVOKE) )
1561 set_bit(VMX_CPU_STATE_ASSIST_ENABLED, &v->arch.hvm_vmx.cpu_state);
1562 __vmread(GUEST_RIP, &eip);
1563 HVM_DBG_LOG(DBG_LEVEL_1,
1564 "Transfering control to vmxassist %%eip 0x%lx\n", eip);
1565 return 0; /* do not update eip! */
1567 } else if ( test_bit(VMX_CPU_STATE_ASSIST_ENABLED,
1568 &v->arch.hvm_vmx.cpu_state) )
1570 __vmread(GUEST_RIP, &eip);
1571 HVM_DBG_LOG(DBG_LEVEL_1,
1572 "Enabling CR0.PE at %%eip 0x%lx\n", eip);
1573 if ( vmx_assist(v, VMX_ASSIST_RESTORE) )
1575 clear_bit(VMX_CPU_STATE_ASSIST_ENABLED,
1576 &v->arch.hvm_vmx.cpu_state);
1577 __vmread(GUEST_RIP, &eip);
1578 HVM_DBG_LOG(DBG_LEVEL_1,
1579 "Restoring to %%eip 0x%lx\n", eip);
1580 return 0; /* do not update eip! */
1583 else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
1585 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
1586 shadow_update_paging_modes(v);
1589 return 1;
1592 #define CASE_SET_REG(REG, reg) \
1593 case REG_ ## REG: regs->reg = value; break
1594 #define CASE_GET_REG(REG, reg) \
1595 case REG_ ## REG: value = regs->reg; break
1597 #define CASE_EXTEND_SET_REG \
1598 CASE_EXTEND_REG(S)
1599 #define CASE_EXTEND_GET_REG \
1600 CASE_EXTEND_REG(G)
1602 #ifdef __i386__
1603 #define CASE_EXTEND_REG(T)
1604 #else
1605 #define CASE_EXTEND_REG(T) \
1606 CASE_ ## T ## ET_REG(R8, r8); \
1607 CASE_ ## T ## ET_REG(R9, r9); \
1608 CASE_ ## T ## ET_REG(R10, r10); \
1609 CASE_ ## T ## ET_REG(R11, r11); \
1610 CASE_ ## T ## ET_REG(R12, r12); \
1611 CASE_ ## T ## ET_REG(R13, r13); \
1612 CASE_ ## T ## ET_REG(R14, r14); \
1613 CASE_ ## T ## ET_REG(R15, r15)
1614 #endif
1616 /*
1617 * Write to control registers
1618 */
1619 static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
1621 unsigned long value;
1622 unsigned long old_cr;
1623 struct vcpu *v = current;
1624 struct vlapic *vlapic = VLAPIC(v);
1626 switch ( gp ) {
1627 CASE_GET_REG(EAX, eax);
1628 CASE_GET_REG(ECX, ecx);
1629 CASE_GET_REG(EDX, edx);
1630 CASE_GET_REG(EBX, ebx);
1631 CASE_GET_REG(EBP, ebp);
1632 CASE_GET_REG(ESI, esi);
1633 CASE_GET_REG(EDI, edi);
1634 CASE_EXTEND_GET_REG;
1635 case REG_ESP:
1636 __vmread(GUEST_RSP, &value);
1637 break;
1638 default:
1639 printk("invalid gp: %d\n", gp);
1640 __hvm_bug(regs);
1643 HVM_DBG_LOG(DBG_LEVEL_1, "CR%d, value = %lx", cr, value);
1645 switch ( cr ) {
1646 case 0:
1647 return vmx_set_cr0(value);
1648 case 3:
1650 unsigned long old_base_mfn, mfn;
1652 /*
1653 * If paging is not enabled yet, simply copy the value to CR3.
1654 */
1655 if (!vmx_paging_enabled(v)) {
1656 v->arch.hvm_vmx.cpu_cr3 = value;
1657 break;
1660 /*
1661 * We make a new one if the shadow does not exist.
1662 */
1663 if (value == v->arch.hvm_vmx.cpu_cr3) {
1664 /*
1665 * This is a simple TLB flush, implying the guest has
1666 * removed some translation or changed page attributes.
1667 * We simply invalidate the shadow.
1668 */
1669 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
1670 if (mfn != pagetable_get_pfn(v->arch.guest_table))
1671 __hvm_bug(regs);
1672 shadow_update_cr3(v);
1673 } else {
1674 /*
1675 * If different, make a shadow. Check if the PDBR is valid
1676 * first.
1677 */
1678 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
1679 if ( ((value >> PAGE_SHIFT) > v->domain->max_pages ) ||
1680 !VALID_MFN(mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT)) ||
1681 !get_page(mfn_to_page(mfn), v->domain) )
1683 printk("Invalid CR3 value=%lx", value);
1684 domain_crash_synchronous(); /* need to take a clean path */
1686 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1687 v->arch.guest_table = pagetable_from_pfn(mfn);
1688 if (old_base_mfn)
1689 put_page(mfn_to_page(old_base_mfn));
1690 /*
1691 * arch.shadow_table should now hold the next CR3 for shadow
1692 */
1693 v->arch.hvm_vmx.cpu_cr3 = value;
1694 update_cr3(v);
1695 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx",
1696 value);
1697 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
1699 break;
1701 case 4: /* CR4 */
1703 __vmread(CR4_READ_SHADOW, &old_cr);
1705 if ( value & X86_CR4_PAE && !(old_cr & X86_CR4_PAE) )
1707 set_bit(VMX_CPU_STATE_PAE_ENABLED, &v->arch.hvm_vmx.cpu_state);
1709 if ( vmx_pgbit_test(v) )
1711 /* The guest is a 32-bit PAE guest. */
1712 #if CONFIG_PAGING_LEVELS >= 3
1713 unsigned long mfn, old_base_mfn;
1715 if ( !VALID_MFN(mfn = get_mfn_from_gpfn(
1716 v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)) ||
1717 !get_page(mfn_to_page(mfn), v->domain) )
1719 printk("Invalid CR3 value = %lx", v->arch.hvm_vmx.cpu_cr3);
1720 domain_crash_synchronous(); /* need to take a clean path */
1724 /*
1725 * Now arch.guest_table points to machine physical.
1726 */
1728 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1729 v->arch.guest_table = pagetable_from_pfn(mfn);
1730 if ( old_base_mfn )
1731 put_page(mfn_to_page(old_base_mfn));
1733 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
1734 (unsigned long) (mfn << PAGE_SHIFT));
1736 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
1738 /*
1739 * arch->shadow_table should hold the next CR3 for shadow
1740 */
1742 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
1743 v->arch.hvm_vmx.cpu_cr3, mfn);
1744 #endif
1747 else if ( value & X86_CR4_PAE )
1748 set_bit(VMX_CPU_STATE_PAE_ENABLED, &v->arch.hvm_vmx.cpu_state);
1749 else
1751 if ( test_bit(VMX_CPU_STATE_LMA_ENABLED, &v->arch.hvm_vmx.cpu_state) )
1752 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
1754 clear_bit(VMX_CPU_STATE_PAE_ENABLED, &v->arch.hvm_vmx.cpu_state);
1757 __vmwrite(GUEST_CR4, value| VMX_CR4_HOST_MASK);
1758 __vmwrite(CR4_READ_SHADOW, value);
1760 /*
1761 * Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates
1762 * all TLB entries except global entries.
1763 */
1764 if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
1765 shadow_update_paging_modes(v);
1766 break;
1768 case 8:
1770 vlapic_set_reg(vlapic, APIC_TASKPRI, ((value & 0x0F) << 4));
1771 vlapic_update_ppr(vlapic);
1772 break;
1774 default:
1775 printk("invalid cr: %d\n", gp);
1776 __hvm_bug(regs);
1779 return 1;
1782 /*
1783 * Read from control registers. CR0 and CR4 are read from the shadow.
1784 */
1785 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
1787 unsigned long value = 0;
1788 struct vcpu *v = current;
1789 struct vlapic *vlapic = VLAPIC(v);
1791 if ( cr != 3 && cr != 8)
1792 __hvm_bug(regs);
1794 if ( cr == 3 )
1795 value = (unsigned long) v->arch.hvm_vmx.cpu_cr3;
1796 else if ( cr == 8 )
1798 value = (unsigned long)vlapic_get_reg(vlapic, APIC_TASKPRI);
1799 value = (value & 0xF0) >> 4;
1802 switch ( gp ) {
1803 CASE_SET_REG(EAX, eax);
1804 CASE_SET_REG(ECX, ecx);
1805 CASE_SET_REG(EDX, edx);
1806 CASE_SET_REG(EBX, ebx);
1807 CASE_SET_REG(EBP, ebp);
1808 CASE_SET_REG(ESI, esi);
1809 CASE_SET_REG(EDI, edi);
1810 CASE_EXTEND_SET_REG;
1811 case REG_ESP:
1812 __vmwrite(GUEST_RSP, value);
1813 regs->esp = value;
1814 break;
1815 default:
1816 printk("invalid gp: %d\n", gp);
1817 __hvm_bug(regs);
1820 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR%d, value = %lx", cr, value);
1823 static int vmx_cr_access(unsigned long exit_qualification,
1824 struct cpu_user_regs *regs)
1826 unsigned int gp, cr;
1827 unsigned long value;
1828 struct vcpu *v = current;
1830 switch (exit_qualification & CONTROL_REG_ACCESS_TYPE) {
1831 case TYPE_MOV_TO_CR:
1832 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
1833 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
1834 TRACE_VMEXIT(1,TYPE_MOV_TO_CR);
1835 TRACE_VMEXIT(2,cr);
1836 TRACE_VMEXIT(3,gp);
1837 return mov_to_cr(gp, cr, regs);
1838 case TYPE_MOV_FROM_CR:
1839 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
1840 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
1841 TRACE_VMEXIT(1,TYPE_MOV_FROM_CR);
1842 TRACE_VMEXIT(2,cr);
1843 TRACE_VMEXIT(3,gp);
1844 mov_from_cr(cr, gp, regs);
1845 break;
1846 case TYPE_CLTS:
1847 TRACE_VMEXIT(1,TYPE_CLTS);
1849 /* We initialise the FPU now, to avoid needing another vmexit. */
1850 setup_fpu(v);
1851 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
1853 __vmread_vcpu(v, GUEST_CR0, &value);
1854 value &= ~X86_CR0_TS; /* clear TS */
1855 __vmwrite(GUEST_CR0, value);
1857 __vmread_vcpu(v, CR0_READ_SHADOW, &value);
1858 value &= ~X86_CR0_TS; /* clear TS */
1859 __vmwrite(CR0_READ_SHADOW, value);
1860 break;
1861 case TYPE_LMSW:
1862 TRACE_VMEXIT(1,TYPE_LMSW);
1863 __vmread_vcpu(v, CR0_READ_SHADOW, &value);
1864 value = (value & ~0xF) |
1865 (((exit_qualification & LMSW_SOURCE_DATA) >> 16) & 0xF);
1866 return vmx_set_cr0(value);
1867 break;
1868 default:
1869 __hvm_bug(regs);
1870 break;
1872 return 1;
1875 static inline void vmx_do_msr_read(struct cpu_user_regs *regs)
1877 u64 msr_content = 0;
1878 u32 eax, edx;
1879 struct vcpu *v = current;
1881 HVM_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_read: ecx=%lx, eax=%lx, edx=%lx",
1882 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1883 (unsigned long)regs->edx);
1884 switch (regs->ecx) {
1885 case MSR_IA32_TIME_STAMP_COUNTER:
1886 msr_content = hvm_get_guest_time(v);
1887 break;
1888 case MSR_IA32_SYSENTER_CS:
1889 __vmread(GUEST_SYSENTER_CS, (u32 *)&msr_content);
1890 break;
1891 case MSR_IA32_SYSENTER_ESP:
1892 __vmread(GUEST_SYSENTER_ESP, &msr_content);
1893 break;
1894 case MSR_IA32_SYSENTER_EIP:
1895 __vmread(GUEST_SYSENTER_EIP, &msr_content);
1896 break;
1897 case MSR_IA32_APICBASE:
1898 msr_content = VLAPIC(v) ? VLAPIC(v)->apic_base_msr : 0;
1899 break;
1900 default:
1901 if (long_mode_do_msr_read(regs))
1902 return;
1904 if ( rdmsr_hypervisor_regs(regs->ecx, &eax, &edx) )
1906 regs->eax = eax;
1907 regs->edx = edx;
1908 return;
1911 rdmsr_safe(regs->ecx, regs->eax, regs->edx);
1912 return;
1915 regs->eax = msr_content & 0xFFFFFFFF;
1916 regs->edx = msr_content >> 32;
1918 HVM_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_read returns: "
1919 "ecx=%lx, eax=%lx, edx=%lx",
1920 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1921 (unsigned long)regs->edx);
1924 static inline void vmx_do_msr_write(struct cpu_user_regs *regs)
1926 u64 msr_content;
1927 struct vcpu *v = current;
1929 HVM_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_write: ecx=%lx, eax=%lx, edx=%lx",
1930 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1931 (unsigned long)regs->edx);
1933 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
1935 switch (regs->ecx) {
1936 case MSR_IA32_TIME_STAMP_COUNTER:
1937 hvm_set_guest_time(v, msr_content);
1938 break;
1939 case MSR_IA32_SYSENTER_CS:
1940 __vmwrite(GUEST_SYSENTER_CS, msr_content);
1941 break;
1942 case MSR_IA32_SYSENTER_ESP:
1943 __vmwrite(GUEST_SYSENTER_ESP, msr_content);
1944 break;
1945 case MSR_IA32_SYSENTER_EIP:
1946 __vmwrite(GUEST_SYSENTER_EIP, msr_content);
1947 break;
1948 case MSR_IA32_APICBASE:
1949 vlapic_msr_set(VLAPIC(v), msr_content);
1950 break;
1951 default:
1952 if ( !long_mode_do_msr_write(regs) )
1953 wrmsr_hypervisor_regs(regs->ecx, regs->eax, regs->edx);
1954 break;
1957 HVM_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_write returns: "
1958 "ecx=%lx, eax=%lx, edx=%lx",
1959 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1960 (unsigned long)regs->edx);
1963 void vmx_vmexit_do_hlt(void)
1965 unsigned long rflags;
1966 __vmread(GUEST_RFLAGS, &rflags);
1967 hvm_hlt(rflags);
1970 static inline void vmx_vmexit_do_extint(struct cpu_user_regs *regs)
1972 unsigned int vector;
1973 int error;
1975 asmlinkage void do_IRQ(struct cpu_user_regs *);
1976 fastcall void smp_apic_timer_interrupt(struct cpu_user_regs *);
1977 fastcall void smp_event_check_interrupt(void);
1978 fastcall void smp_invalidate_interrupt(void);
1979 fastcall void smp_call_function_interrupt(void);
1980 fastcall void smp_spurious_interrupt(struct cpu_user_regs *regs);
1981 fastcall void smp_error_interrupt(struct cpu_user_regs *regs);
1982 #ifdef CONFIG_X86_MCE_P4THERMAL
1983 fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs);
1984 #endif
1986 if ((error = __vmread(VM_EXIT_INTR_INFO, &vector))
1987 && !(vector & INTR_INFO_VALID_MASK))
1988 __hvm_bug(regs);
1990 vector &= INTR_INFO_VECTOR_MASK;
1991 TRACE_VMEXIT(1,vector);
1993 switch(vector) {
1994 case LOCAL_TIMER_VECTOR:
1995 smp_apic_timer_interrupt(regs);
1996 break;
1997 case EVENT_CHECK_VECTOR:
1998 smp_event_check_interrupt();
1999 break;
2000 case INVALIDATE_TLB_VECTOR:
2001 smp_invalidate_interrupt();
2002 break;
2003 case CALL_FUNCTION_VECTOR:
2004 smp_call_function_interrupt();
2005 break;
2006 case SPURIOUS_APIC_VECTOR:
2007 smp_spurious_interrupt(regs);
2008 break;
2009 case ERROR_APIC_VECTOR:
2010 smp_error_interrupt(regs);
2011 break;
2012 #ifdef CONFIG_X86_MCE_P4THERMAL
2013 case THERMAL_APIC_VECTOR:
2014 smp_thermal_interrupt(regs);
2015 break;
2016 #endif
2017 default:
2018 regs->entry_vector = vector;
2019 do_IRQ(regs);
2020 break;
2021 }
2022 }
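/*
 * Copy the guest's selector, stack, flags and instruction-pointer state
 * out of the VMCS into a cpu_user_regs structure; separate 64-bit and
 * 32-bit variants follow.
 */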
2024 #if defined (__x86_64__)
2025 void store_cpu_user_regs(struct cpu_user_regs *regs)
2026 {
2027 __vmread(GUEST_SS_SELECTOR, &regs->ss);
2028 __vmread(GUEST_RSP, &regs->rsp);
2029 __vmread(GUEST_RFLAGS, &regs->rflags);
2030 __vmread(GUEST_CS_SELECTOR, &regs->cs);
2031 __vmread(GUEST_DS_SELECTOR, &regs->ds);
2032 __vmread(GUEST_ES_SELECTOR, &regs->es);
2033 __vmread(GUEST_RIP, &regs->rip);
2034 }
2035 #elif defined (__i386__)
2036 void store_cpu_user_regs(struct cpu_user_regs *regs)
2037 {
2038 __vmread(GUEST_SS_SELECTOR, &regs->ss);
2039 __vmread(GUEST_RSP, &regs->esp);
2040 __vmread(GUEST_RFLAGS, &regs->eflags);
2041 __vmread(GUEST_CS_SELECTOR, &regs->cs);
2042 __vmread(GUEST_DS_SELECTOR, &regs->ds);
2043 __vmread(GUEST_ES_SELECTOR, &regs->es);
2044 __vmread(GUEST_RIP, &regs->eip);
2045 }
2046 #endif
2048 #ifdef XEN_DEBUGGER
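/*
 * Debugger-only helpers: snapshot the guest register state from the VMCS
 * before calling into pdb, and write any modifications back afterwards.
 */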
2049 void save_cpu_user_regs(struct cpu_user_regs *regs)
2050 {
2051 __vmread(GUEST_SS_SELECTOR, &regs->xss);
2052 __vmread(GUEST_RSP, &regs->esp);
2053 __vmread(GUEST_RFLAGS, &regs->eflags);
2054 __vmread(GUEST_CS_SELECTOR, &regs->xcs);
2055 __vmread(GUEST_RIP, &regs->eip);
2057 __vmread(GUEST_GS_SELECTOR, &regs->xgs);
2058 __vmread(GUEST_FS_SELECTOR, &regs->xfs);
2059 __vmread(GUEST_ES_SELECTOR, &regs->xes);
2060 __vmread(GUEST_DS_SELECTOR, &regs->xds);
2061 }
2063 void restore_cpu_user_regs(struct cpu_user_regs *regs)
2064 {
2065 __vmwrite(GUEST_SS_SELECTOR, regs->xss);
2066 __vmwrite(GUEST_RSP, regs->esp);
2067 __vmwrite(GUEST_RFLAGS, regs->eflags);
2068 __vmwrite(GUEST_CS_SELECTOR, regs->xcs);
2069 __vmwrite(GUEST_RIP, regs->eip);
2071 __vmwrite(GUEST_GS_SELECTOR, regs->xgs);
2072 __vmwrite(GUEST_FS_SELECTOR, regs->xfs);
2073 __vmwrite(GUEST_ES_SELECTOR, regs->xes);
2074 __vmwrite(GUEST_DS_SELECTOR, regs->xds);
2075 }
2076 #endif
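/*
 * Re-inject into the guest the exception that caused this vmexit,
 * preserving its error code if one was delivered.
 */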
2078 static void vmx_reflect_exception(struct vcpu *v)
2079 {
2080 int error_code, intr_info, vector;
2082 __vmread(VM_EXIT_INTR_INFO, &intr_info);
2083 vector = intr_info & 0xff;
2084 if ( intr_info & INTR_INFO_DELIVER_CODE_MASK )
2085 __vmread(VM_EXIT_INTR_ERROR_CODE, &error_code);
2086 else
2087 error_code = VMX_DELIVER_NO_ERROR_CODE;
2089 #ifndef NDEBUG
2090 {
2091 unsigned long rip;
2093 __vmread(GUEST_RIP, &rip);
2094 HVM_DBG_LOG(DBG_LEVEL_1, "rip = %lx, error_code = %x",
2095 rip, error_code);
2096 }
2097 #endif /* NDEBUG */
2099 /*
2100 * According to Intel Virtualization Technology Specification for
2101 * the IA-32 Intel Architecture (C97063-002 April 2005), section
2102 * 2.8.3, SW_EXCEPTION should be used for #BP and #OF, and
2103 * HW_EXCEPTION used for everything else. The main difference
2104 * appears to be that for SW_EXCEPTION, the EIP/RIP is incremented
2105 * by VM_ENTER_INSTRUCTION_LEN bytes, whereas for HW_EXCEPTION,
2106 * it is not.
2107 */
2108 if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_SW_EXCEPTION )
2109 {
2110 int ilen = __get_instruction_length(); /* Safe: software exception */
2111 vmx_inject_sw_exception(v, vector, ilen);
2112 }
2113 else
2114 {
2115 vmx_inject_hw_exception(v, vector, error_code);
2116 }
2117 }
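/*
 * Main VM-exit dispatcher, called from the vmexit assembly stub. A failed
 * vmentry is fatal to the domain (the VMCS is dumped first); every other
 * exit reason is handled, or reflected back to the guest, by the switch
 * below.
 */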
2119 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
2120 {
2121 unsigned int exit_reason;
2122 unsigned long exit_qualification, rip, inst_len = 0;
2123 struct vcpu *v = current;
2125 __vmread(VM_EXIT_REASON, &exit_reason);
2127 perfc_incra(vmexits, exit_reason);
2129 if ( (exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT) &&
2130 (exit_reason != EXIT_REASON_VMCALL) &&
2131 (exit_reason != EXIT_REASON_IO_INSTRUCTION) )
2132 HVM_DBG_LOG(DBG_LEVEL_0, "exit reason = %x", exit_reason);
2134 if ( exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT )
2135 local_irq_enable();
2137 if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
2138 {
2139 unsigned int failed_vmentry_reason = exit_reason & 0xFFFF;
2141 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2142 printk("Failed vm entry (exit reason 0x%x) ", exit_reason);
2143 switch ( failed_vmentry_reason ) {
2144 case EXIT_REASON_INVALID_GUEST_STATE:
2145 printk("caused by invalid guest state (%ld).\n", exit_qualification);
2146 break;
2147 case EXIT_REASON_MSR_LOADING:
2148 printk("caused by MSR entry %ld loading.\n", exit_qualification);
2149 break;
2150 case EXIT_REASON_MACHINE_CHECK:
2151 printk("caused by machine check.\n");
2152 break;
2153 default:
2154 printk("reason not known yet!\n");
2155 break;
2156 }
2158 printk("************* VMCS Area **************\n");
2159 vmcs_dump_vcpu();
2160 printk("**************************************\n");
2161 domain_crash_synchronous();
2162 }
2164 TRACE_VMEXIT(0,exit_reason);
2166 switch ( exit_reason )
2167 {
2168 case EXIT_REASON_EXCEPTION_NMI:
2169 {
2170 /*
2171 * We do not enable software-interrupt exiting (INT n), so an
2172 * exit here is caused either by (1) an exception raised in the
2173 * guest (e.g. #PF), or (2) an NMI.
2174 */
2175 unsigned int vector;
2176 unsigned long va;
2178 if ( __vmread(VM_EXIT_INTR_INFO, &vector) ||
2179 !(vector & INTR_INFO_VALID_MASK) )
2180 domain_crash_synchronous();
2181 vector &= INTR_INFO_VECTOR_MASK;
2183 TRACE_VMEXIT(1,vector);
2184 perfc_incra(cause_vector, vector);
2186 switch ( vector ) {
2187 #ifdef XEN_DEBUGGER
2188 case TRAP_debug:
2189 {
2190 save_cpu_user_regs(regs);
2191 pdb_handle_exception(1, regs, 1);
2192 restore_cpu_user_regs(regs);
2193 break;
2194 }
2195 case TRAP_int3:
2196 {
2197 save_cpu_user_regs(regs);
2198 pdb_handle_exception(3, regs, 1);
2199 restore_cpu_user_regs(regs);
2200 break;
2201 }
2202 #else
2203 case TRAP_debug:
2204 {
2205 void store_cpu_user_regs(struct cpu_user_regs *regs);
2207 if ( test_bit(_DOMF_debugging, &v->domain->domain_flags) )
2208 {
2209 store_cpu_user_regs(regs);
2210 domain_pause_for_debugger();
2211 __vm_clear_bit(GUEST_PENDING_DBG_EXCEPTIONS,
2212 PENDING_DEBUG_EXC_BS);
2213 }
2214 else
2215 {
2216 vmx_reflect_exception(v);
2217 __vm_clear_bit(GUEST_PENDING_DBG_EXCEPTIONS,
2218 PENDING_DEBUG_EXC_BS);
2219 }
2220 }
2221 break;
2223 case TRAP_int3:
2224 {
2225 if ( test_bit(_DOMF_debugging, &v->domain->domain_flags) )
2226 domain_pause_for_debugger();
2227 else
2228 vmx_reflect_exception(v);
2229 break;
2230 }
2231 #endif
2232 case TRAP_no_device:
2233 {
2234 vmx_do_no_device_fault();
2235 break;
2236 }
2237 case TRAP_page_fault:
2238 {
2239 __vmread(EXIT_QUALIFICATION, &va);
2240 __vmread(VM_EXIT_INTR_ERROR_CODE, &regs->error_code);
2242 TRACE_VMEXIT(3, regs->error_code);
2243 TRACE_VMEXIT(4, va);
2245 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2246 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
2247 (unsigned long)regs->eax, (unsigned long)regs->ebx,
2248 (unsigned long)regs->ecx, (unsigned long)regs->edx,
2249 (unsigned long)regs->esi, (unsigned long)regs->edi);
2251 if ( !vmx_do_page_fault(va, regs) )
2252 {
2253 /* Inject #PF using the interruption-information fields. */
2254 vmx_inject_hw_exception(v, TRAP_page_fault, regs->error_code);
2255 v->arch.hvm_vmx.cpu_cr2 = va;
2256 TRACE_3D(TRC_VMX_INT, v->domain->domain_id,
2257 TRAP_page_fault, va);
2258 }
2259 break;
2260 }
2261 case TRAP_nmi:
2262 do_nmi(regs);
2263 break;
2264 default:
2265 vmx_reflect_exception(v);
2266 break;
2267 }
2268 break;
2269 }
2270 case EXIT_REASON_EXTERNAL_INTERRUPT:
2271 vmx_vmexit_do_extint(regs);
2272 break;
2273 case EXIT_REASON_TRIPLE_FAULT:
2274 domain_crash_synchronous();
2275 break;
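/*
 * An interrupt-window exit: the guest has just become able to accept a
 * pending virtual interrupt. All that is needed here is to stop asking
 * for the exit; the interrupt itself is injected on the vmentry path.
 */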
2276 case EXIT_REASON_PENDING_INTERRUPT:
2277 /* Disable the interrupt window. */
2278 v->arch.hvm_vcpu.u.vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2279 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2280 v->arch.hvm_vcpu.u.vmx.exec_control);
2281 break;
2282 case EXIT_REASON_TASK_SWITCH:
2283 domain_crash_synchronous();
2284 break;
2285 case EXIT_REASON_CPUID:
2286 inst_len = __get_instruction_length(); /* Safe: CPUID */
2287 __update_guest_eip(inst_len);
2288 vmx_vmexit_do_cpuid(regs);
2289 break;
2290 case EXIT_REASON_HLT:
2291 inst_len = __get_instruction_length(); /* Safe: HLT */
2292 __update_guest_eip(inst_len);
2293 vmx_vmexit_do_hlt();
2294 break;
2295 case EXIT_REASON_INVLPG:
2296 {
2297 unsigned long va;
2298 inst_len = __get_instruction_length(); /* Safe: INVLPG */
2299 __update_guest_eip(inst_len);
2300 __vmread(EXIT_QUALIFICATION, &va);
2301 vmx_vmexit_do_invlpg(va);
2302 break;
2303 }
2304 case EXIT_REASON_VMCALL:
2305 {
2306 inst_len = __get_instruction_length(); /* Safe: VMCALL */
2307 __update_guest_eip(inst_len);
2308 __vmread(GUEST_RIP, &rip);
2309 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2310 hvm_do_hypercall(regs);
2311 break;
2312 }
2313 case EXIT_REASON_CR_ACCESS:
2314 {
2315 __vmread(GUEST_RIP, &rip);
2316 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2317 inst_len = __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
2318 if ( vmx_cr_access(exit_qualification, regs) )
2319 __update_guest_eip(inst_len);
2320 TRACE_VMEXIT(3, regs->error_code);
2321 TRACE_VMEXIT(4, exit_qualification);
2322 break;
2323 }
2324 case EXIT_REASON_DR_ACCESS:
2325 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2326 vmx_dr_access(exit_qualification, regs);
2327 break;
2328 case EXIT_REASON_IO_INSTRUCTION:
2329 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2330 inst_len = __get_instruction_length(); /* Safe: IN, INS, OUT, OUTS */
2331 vmx_io_instruction(exit_qualification, inst_len);
2332 TRACE_VMEXIT(4,exit_qualification);
2333 break;
2334 case EXIT_REASON_MSR_READ:
2335 inst_len = __get_instruction_length(); /* Safe: RDMSR */
2336 __update_guest_eip(inst_len);
2337 vmx_do_msr_read(regs);
2338 break;
2339 case EXIT_REASON_MSR_WRITE:
2340 inst_len = __get_instruction_length(); /* Safe: WRMSR */
2341 __update_guest_eip(inst_len);
2342 vmx_do_msr_write(regs);
2343 break;
2344 case EXIT_REASON_MWAIT_INSTRUCTION:
2345 case EXIT_REASON_MONITOR_INSTRUCTION:
2346 case EXIT_REASON_PAUSE_INSTRUCTION:
2347 domain_crash_synchronous();
2348 break;
2349 case EXIT_REASON_VMCLEAR:
2350 case EXIT_REASON_VMLAUNCH:
2351 case EXIT_REASON_VMPTRLD:
2352 case EXIT_REASON_VMPTRST:
2353 case EXIT_REASON_VMREAD:
2354 case EXIT_REASON_VMRESUME:
2355 case EXIT_REASON_VMWRITE:
2356 case EXIT_REASON_VMXOFF:
2357 case EXIT_REASON_VMXON:
2358 /* Report invalid opcode exception when a VMX guest tries to execute
2359 any of the VMX instructions */
2360 vmx_inject_hw_exception(v, TRAP_invalid_op, VMX_DELIVER_NO_ERROR_CODE);
2361 break;
2363 default:
2364 domain_crash_synchronous(); /* should not happen */
2365 }
2366 }
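/*
 * Called on the vmentry path: VMX does not context-switch %cr2, so the
 * guest's saved value must be reloaded by hand before resuming it.
 */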
2368 asmlinkage void vmx_load_cr2(void)
2369 {
2370 struct vcpu *v = current;
2372 local_irq_disable();
2373 asm volatile("mov %0,%%cr2": :"r" (v->arch.hvm_vmx.cpu_cr2));
2374 }
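/*
 * Trace helpers called around vmentry/vmexit: flush the per-cpu
 * trace_values gathered by TRACE_VMEXIT() into the trace buffer and reset
 * the slots to a recognisable dummy value.
 */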
2376 asmlinkage void vmx_trace_vmentry (void)
2377 {
2378 TRACE_5D(TRC_VMX_VMENTRY,
2379 this_cpu(trace_values)[0],
2380 this_cpu(trace_values)[1],
2381 this_cpu(trace_values)[2],
2382 this_cpu(trace_values)[3],
2383 this_cpu(trace_values)[4]);
2384 TRACE_VMEXIT(0,9);
2385 TRACE_VMEXIT(1,9);
2386 TRACE_VMEXIT(2,9);
2387 TRACE_VMEXIT(3,9);
2388 TRACE_VMEXIT(4,9);
2389 return;
2390 }
2392 asmlinkage void vmx_trace_vmexit (void)
2393 {
2394 TRACE_3D(TRC_VMX_VMEXIT,0,0,0);
2395 return;
2396 }
2398 /*
2399 * Local variables:
2400 * mode: C
2401 * c-set-style: "BSD"
2402 * c-basic-offset: 4
2403 * tab-width: 4
2404 * indent-tabs-mode: nil
2405 * End:
2406 */