xen/arch/x86/hvm/vmx/vmx.c @ 12226:45e34f00a78f

[HVM] Clean up VCPU initialisation in Xen. No longer parse HVM e820
tables in Xen (add some extra HVM parameters as a cleaner alternative).
Lots of code removal.
Signed-off-by: Keir Fraser <keir@xensource.com>
author: kfraser@localhost.localdomain
date:   Thu Nov 02 15:55:51 2006 +0000 (2006-11-02)
1 /*
2 * vmx.c: handling VMX architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 *
18 */
20 #include <xen/config.h>
21 #include <xen/init.h>
22 #include <xen/lib.h>
23 #include <xen/trace.h>
24 #include <xen/sched.h>
25 #include <xen/irq.h>
26 #include <xen/softirq.h>
27 #include <xen/domain_page.h>
28 #include <xen/hypercall.h>
29 #include <xen/perfc.h>
30 #include <asm/current.h>
31 #include <asm/io.h>
32 #include <asm/regs.h>
33 #include <asm/cpufeature.h>
34 #include <asm/processor.h>
35 #include <asm/types.h>
36 #include <asm/msr.h>
37 #include <asm/spinlock.h>
38 #include <asm/hvm/hvm.h>
39 #include <asm/hvm/support.h>
40 #include <asm/hvm/vmx/vmx.h>
41 #include <asm/hvm/vmx/vmcs.h>
42 #include <asm/hvm/vmx/cpu.h>
43 #include <asm/shadow.h>
44 #include <public/sched.h>
45 #include <public/hvm/ioreq.h>
46 #include <asm/hvm/vpic.h>
47 #include <asm/hvm/vlapic.h>
48 #include <asm/x86_emulate.h>
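/*
 * Per-CPU scratch values recorded by TRACE_VMEXIT() during VM-exit
 * handling; the exit handlers below store the exit reason and a few
 * exit-specific values here for the tracing code.
 */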
50 static DEFINE_PER_CPU(unsigned long, trace_values[5]);
51 #define TRACE_VMEXIT(index,value) this_cpu(trace_values)[index]=value
53 static void vmx_ctxt_switch_from(struct vcpu *v);
54 static void vmx_ctxt_switch_to(struct vcpu *v);
56 static int vmx_vcpu_initialise(struct vcpu *v)
57 {
58 int rc;
60 v->arch.schedule_tail = arch_vmx_do_launch;
61 v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
62 v->arch.ctxt_switch_to = vmx_ctxt_switch_to;
64 if ( (rc = vmx_create_vmcs(v)) != 0 )
65 {
66 dprintk(XENLOG_WARNING,
67 "Failed to create VMCS for vcpu %d: err=%d.\n",
68 v->vcpu_id, rc);
69 return rc;
70 }
72 spin_lock_init(&v->arch.hvm_vmx.vmcs_lock);
74 return 0;
75 }
77 static void vmx_relinquish_guest_resources(struct domain *d)
78 {
79 struct vcpu *v;
81 for_each_vcpu ( d, v )
82 {
83 vmx_destroy_vmcs(v);
84 if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
85 continue;
86 kill_timer(&v->arch.hvm_vcpu.hlt_timer);
87 if ( VLAPIC(v) != NULL )
88 {
89 kill_timer(&VLAPIC(v)->vlapic_timer);
90 unmap_domain_page_global(VLAPIC(v)->regs);
91 free_domheap_page(VLAPIC(v)->regs_page);
92 xfree(VLAPIC(v));
93 }
94 hvm_release_assist_channel(v);
95 }
97 kill_timer(&d->arch.hvm_domain.pl_time.periodic_tm.timer);
98 rtc_deinit(d);
99 pmtimer_deinit(d);
101 if ( d->arch.hvm_domain.shared_page_va )
102 unmap_domain_page_global(
103 (void *)d->arch.hvm_domain.shared_page_va);
105 if ( d->arch.hvm_domain.buffered_io_va )
106 unmap_domain_page_global((void *)d->arch.hvm_domain.buffered_io_va);
107 }
109 #ifdef __x86_64__
111 static DEFINE_PER_CPU(struct vmx_msr_state, percpu_msr);
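/* Hardware MSR address for each VMX_INDEX_MSR_* slot in struct vmx_msr_state. */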
113 static u32 msr_data_index[VMX_MSR_COUNT] =
114 {
115 MSR_LSTAR, MSR_STAR, MSR_CSTAR,
116 MSR_SYSCALL_MASK, MSR_EFER,
117 };
119 static void vmx_save_segments(struct vcpu *v)
120 {
121 rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.msr_content.shadow_gs);
122 }
124 /*
125 * To avoid MSR save/restore at every VM exit/entry time, we restore
126 * the x86_64-specific MSRs at domain switch time. Since those MSRs
127 * are not modified once set for generic domains, we don't save them,
128 * but simply reset them to the values set at percpu_traps_init().
129 */
130 static void vmx_load_msrs(void)
131 {
132 struct vmx_msr_state *host_state = &this_cpu(percpu_msr);
133 int i;
135 while ( host_state->flags )
136 {
137 i = find_first_set_bit(host_state->flags);
138 wrmsrl(msr_data_index[i], host_state->msr_items[i]);
139 clear_bit(i, &host_state->flags);
140 }
141 }
143 static void vmx_save_init_msrs(void)
144 {
145 struct vmx_msr_state *host_state = &this_cpu(percpu_msr);
146 int i;
148 for ( i = 0; i < VMX_MSR_COUNT; i++ )
149 rdmsrl(msr_data_index[i], host_state->msr_items[i]);
150 }
152 #define CASE_READ_MSR(address) \
153 case MSR_ ## address: \
154 msr_content = msr->msr_items[VMX_INDEX_MSR_ ## address]; \
155 break
157 #define CASE_WRITE_MSR(address) \
158 case MSR_ ## address: \
159 { \
160 msr->msr_items[VMX_INDEX_MSR_ ## address] = msr_content; \
161 if (!test_bit(VMX_INDEX_MSR_ ## address, &msr->flags)) { \
162 set_bit(VMX_INDEX_MSR_ ## address, &msr->flags); \
163 } \
164 wrmsrl(MSR_ ## address, msr_content); \
165 set_bit(VMX_INDEX_MSR_ ## address, &host_state->flags); \
166 } \
167 break
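/*
 * Canonical-address check for FS_BASE/GS_BASE writes: not implemented yet,
 * so every address is currently treated as canonical.
 */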
169 #define IS_CANO_ADDRESS(add) 1
170 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
171 {
172 u64 msr_content = 0;
173 struct vcpu *v = current;
174 struct vmx_msr_state *msr = &v->arch.hvm_vmx.msr_content;
176 switch ( regs->ecx ) {
177 case MSR_EFER:
178 HVM_DBG_LOG(DBG_LEVEL_2, "EFER msr_content 0x%"PRIx64, msr_content);
179 msr_content = msr->msr_items[VMX_INDEX_MSR_EFER];
180 break;
182 case MSR_FS_BASE:
183 if ( !(vmx_long_mode_enabled(v)) )
184 /* XXX should it be GP fault */
185 domain_crash_synchronous();
187 __vmread(GUEST_FS_BASE, &msr_content);
188 break;
190 case MSR_GS_BASE:
191 if ( !(vmx_long_mode_enabled(v)) )
192 domain_crash_synchronous();
194 __vmread(GUEST_GS_BASE, &msr_content);
195 break;
197 case MSR_SHADOW_GS_BASE:
198 msr_content = msr->shadow_gs;
199 break;
201 CASE_READ_MSR(STAR);
202 CASE_READ_MSR(LSTAR);
203 CASE_READ_MSR(CSTAR);
204 CASE_READ_MSR(SYSCALL_MASK);
206 default:
207 return 0;
208 }
210 HVM_DBG_LOG(DBG_LEVEL_2, "msr_content: 0x%"PRIx64, msr_content);
212 regs->eax = (u32)(msr_content >> 0);
213 regs->edx = (u32)(msr_content >> 32);
215 return 1;
216 }
218 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
219 {
220 u64 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
221 struct vcpu *v = current;
222 struct vmx_msr_state *msr = &v->arch.hvm_vmx.msr_content;
223 struct vmx_msr_state *host_state = &this_cpu(percpu_msr);
225 HVM_DBG_LOG(DBG_LEVEL_1, "msr 0x%lx msr_content 0x%"PRIx64"\n",
226 (unsigned long)regs->ecx, msr_content);
228 switch ( regs->ecx ) {
229 case MSR_EFER:
230 /* offending reserved bit will cause #GP */
231 if ( msr_content & ~(EFER_LME | EFER_LMA | EFER_NX | EFER_SCE) )
232 {
233 printk("Trying to set reserved bit in EFER: %"PRIx64"\n",
234 msr_content);
235 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
236 return 0;
237 }
239 if ( (msr_content & EFER_LME)
240 && !(msr->msr_items[VMX_INDEX_MSR_EFER] & EFER_LME) )
241 {
242 if ( unlikely(vmx_paging_enabled(v)) )
243 {
244 printk("Trying to set EFER.LME with paging enabled\n");
245 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
246 return 0;
247 }
248 }
249 else if ( !(msr_content & EFER_LME)
250 && (msr->msr_items[VMX_INDEX_MSR_EFER] & EFER_LME) )
251 {
252 if ( unlikely(vmx_paging_enabled(v)) )
253 {
254 printk("Trying to clear EFER.LME with paging enabled\n");
255 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
256 return 0;
257 }
258 }
260 msr->msr_items[VMX_INDEX_MSR_EFER] = msr_content;
261 break;
263 case MSR_FS_BASE:
264 case MSR_GS_BASE:
265 if ( !(vmx_long_mode_enabled(v)) )
266 domain_crash_synchronous();
268 if ( !IS_CANO_ADDRESS(msr_content) )
269 {
270 HVM_DBG_LOG(DBG_LEVEL_1, "Not cano address of msr write\n");
271 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
272 return 0;
273 }
275 if ( regs->ecx == MSR_FS_BASE )
276 __vmwrite(GUEST_FS_BASE, msr_content);
277 else
278 __vmwrite(GUEST_GS_BASE, msr_content);
280 break;
282 case MSR_SHADOW_GS_BASE:
283 if ( !(vmx_long_mode_enabled(v)) )
284 domain_crash_synchronous();
286 v->arch.hvm_vmx.msr_content.shadow_gs = msr_content;
287 wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
288 break;
290 CASE_WRITE_MSR(STAR);
291 CASE_WRITE_MSR(LSTAR);
292 CASE_WRITE_MSR(CSTAR);
293 CASE_WRITE_MSR(SYSCALL_MASK);
295 default:
296 return 0;
297 }
299 return 1;
300 }
302 static void vmx_restore_msrs(struct vcpu *v)
303 {
304 int i = 0;
305 struct vmx_msr_state *guest_state;
306 struct vmx_msr_state *host_state;
307 unsigned long guest_flags;
309 guest_state = &v->arch.hvm_vmx.msr_content;
310 host_state = &this_cpu(percpu_msr);
312 wrmsrl(MSR_SHADOW_GS_BASE, guest_state->shadow_gs);
313 guest_flags = guest_state->flags;
314 if (!guest_flags)
315 return;
317 while (guest_flags){
318 i = find_first_set_bit(guest_flags);
320 HVM_DBG_LOG(DBG_LEVEL_2,
321 "restore guest's index %d msr %lx with %lx\n",
322 i, (unsigned long)msr_data_index[i],
323 (unsigned long)guest_state->msr_items[i]);
324 set_bit(i, &host_state->flags);
325 wrmsrl(msr_data_index[i], guest_state->msr_items[i]);
326 clear_bit(i, &guest_flags);
327 }
328 }
330 #else /* __i386__ */
332 #define vmx_save_segments(v) ((void)0)
333 #define vmx_load_msrs() ((void)0)
334 #define vmx_restore_msrs(v) ((void)0)
335 #define vmx_save_init_msrs() ((void)0)
337 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
338 {
339 return 0;
340 }
342 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
343 {
344 return 0;
345 }
347 #endif /* __i386__ */
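/*
 * Move a vcpu's saved %db<N> value between arch.guest_context.debugreg[]
 * and the corresponding hardware debug register.
 */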
349 #define loaddebug(_v,_reg) \
350 __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
351 #define savedebug(_v,_reg) \
352 __asm__ __volatile__ ("mov %%db" #_reg ",%0" : "=r" ((_v)->debugreg[_reg]))
354 static inline void vmx_save_dr(struct vcpu *v)
355 {
356 if ( v->arch.hvm_vcpu.flag_dr_dirty )
357 {
358 savedebug(&v->arch.guest_context, 0);
359 savedebug(&v->arch.guest_context, 1);
360 savedebug(&v->arch.guest_context, 2);
361 savedebug(&v->arch.guest_context, 3);
362 savedebug(&v->arch.guest_context, 6);
364 v->arch.hvm_vcpu.flag_dr_dirty = 0;
366 v->arch.hvm_vcpu.u.vmx.exec_control |= CPU_BASED_MOV_DR_EXITING;
367 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
368 v->arch.hvm_vcpu.u.vmx.exec_control);
369 }
370 }
372 static inline void __restore_debug_registers(struct vcpu *v)
373 {
374 loaddebug(&v->arch.guest_context, 0);
375 loaddebug(&v->arch.guest_context, 1);
376 loaddebug(&v->arch.guest_context, 2);
377 loaddebug(&v->arch.guest_context, 3);
378 /* No 4 and 5 */
379 loaddebug(&v->arch.guest_context, 6);
380 /* DR7 is loaded from the vmcs. */
381 }
383 /*
384 * DR7 is saved and restored on every vmexit. Other debug registers only
385 * need to be restored if their value is going to affect execution -- i.e.,
386 * if one of the breakpoints is enabled. So mask out all bits that don't
387 * enable some breakpoint functionality.
388 *
389 * This is in part necessary because bit 10 of DR7 is hardwired to 1, so a
390 * simple if( guest_dr7 ) will always return true. As long as we're masking,
391 * we might as well do it right.
392 */
393 #define DR7_ACTIVE_MASK 0xff
395 static inline void vmx_restore_dr(struct vcpu *v)
396 {
397 unsigned long guest_dr7;
399 __vmread(GUEST_DR7, &guest_dr7);
401 /* Assumes guest does not have DR access at time of context switch. */
402 if ( unlikely(guest_dr7 & DR7_ACTIVE_MASK) )
403 __restore_debug_registers(v);
404 }
406 static void vmx_freeze_time(struct vcpu *v)
407 {
408 struct periodic_time *pt=&v->domain->arch.hvm_domain.pl_time.periodic_tm;
410 if ( pt->enabled && pt->first_injected && v->vcpu_id == pt->bind_vcpu
411 && !v->arch.hvm_vcpu.guest_time ) {
412 v->arch.hvm_vcpu.guest_time = hvm_get_guest_time(v);
413 stop_timer(&(pt->timer));
414 }
415 }
417 static void vmx_ctxt_switch_from(struct vcpu *v)
418 {
419 vmx_freeze_time(v);
420 vmx_save_segments(v);
421 vmx_load_msrs();
422 vmx_save_dr(v);
423 }
425 static void vmx_ctxt_switch_to(struct vcpu *v)
426 {
427 vmx_restore_msrs(v);
428 vmx_restore_dr(v);
429 }
431 static void stop_vmx(void)
432 {
433 if ( !(read_cr4() & X86_CR4_VMXE) )
434 return;
435 __vmxoff();
436 clear_in_cr4(X86_CR4_VMXE);
437 }
439 void vmx_migrate_timers(struct vcpu *v)
440 {
441 struct periodic_time *pt = &(v->domain->arch.hvm_domain.pl_time.periodic_tm);
442 struct RTCState *vrtc = &v->domain->arch.hvm_domain.pl_time.vrtc;
443 struct PMTState *vpmt = &v->domain->arch.hvm_domain.pl_time.vpmt;
445 if ( pt->enabled )
446 {
447 migrate_timer(&pt->timer, v->processor);
448 migrate_timer(&v->arch.hvm_vcpu.hlt_timer, v->processor);
449 }
450 if ( VLAPIC(v) != NULL )
451 migrate_timer(&VLAPIC(v)->vlapic_timer, v->processor);
452 migrate_timer(&vrtc->second_timer, v->processor);
453 migrate_timer(&vrtc->second_timer2, v->processor);
454 migrate_timer(&vpmt->timer, v->processor);
455 }
457 static void vmx_store_cpu_guest_regs(
458 struct vcpu *v, struct cpu_user_regs *regs, unsigned long *crs)
459 {
460 vmx_vmcs_enter(v);
462 if ( regs != NULL )
463 {
464 __vmread(GUEST_RFLAGS, &regs->eflags);
465 __vmread(GUEST_SS_SELECTOR, &regs->ss);
466 __vmread(GUEST_CS_SELECTOR, &regs->cs);
467 __vmread(GUEST_DS_SELECTOR, &regs->ds);
468 __vmread(GUEST_ES_SELECTOR, &regs->es);
469 __vmread(GUEST_GS_SELECTOR, &regs->gs);
470 __vmread(GUEST_FS_SELECTOR, &regs->fs);
471 __vmread(GUEST_RIP, &regs->eip);
472 __vmread(GUEST_RSP, &regs->esp);
473 }
475 if ( crs != NULL )
476 {
477 __vmread(CR0_READ_SHADOW, &crs[0]);
478 crs[2] = v->arch.hvm_vmx.cpu_cr2;
479 __vmread(GUEST_CR3, &crs[3]);
480 __vmread(CR4_READ_SHADOW, &crs[4]);
481 }
483 vmx_vmcs_exit(v);
484 }
486 /*
487 * The VMX spec (section 4.3.1.2, Checks on Guest Segment
488 * Registers) says that virtual-8086 mode guests' segment
489 * base-address fields in the VMCS must be equal to their
490 * corresponding segment selector field shifted right by
491 * four bits upon vmentry.
492 *
493 * This function (called only for VM86-mode guests) fixes
494 * the bases to be consistent with the selectors in regs
495 * if they're not already. Without this, we can fail the
496 * vmentry check mentioned above.
497 */
498 static void fixup_vm86_seg_bases(struct cpu_user_regs *regs)
499 {
500 int err = 0;
501 unsigned long base;
503 err |= __vmread(GUEST_ES_BASE, &base);
504 if (regs->es << 4 != base)
505 err |= __vmwrite(GUEST_ES_BASE, regs->es << 4);
506 err |= __vmread(GUEST_CS_BASE, &base);
507 if (regs->cs << 4 != base)
508 err |= __vmwrite(GUEST_CS_BASE, regs->cs << 4);
509 err |= __vmread(GUEST_SS_BASE, &base);
510 if (regs->ss << 4 != base)
511 err |= __vmwrite(GUEST_SS_BASE, regs->ss << 4);
512 err |= __vmread(GUEST_DS_BASE, &base);
513 if (regs->ds << 4 != base)
514 err |= __vmwrite(GUEST_DS_BASE, regs->ds << 4);
515 err |= __vmread(GUEST_FS_BASE, &base);
516 if (regs->fs << 4 != base)
517 err |= __vmwrite(GUEST_FS_BASE, regs->fs << 4);
518 err |= __vmread(GUEST_GS_BASE, &base);
519 if (regs->gs << 4 != base)
520 err |= __vmwrite(GUEST_GS_BASE, regs->gs << 4);
522 BUG_ON(err);
523 }
525 static void vmx_load_cpu_guest_regs(struct vcpu *v, struct cpu_user_regs *regs)
526 {
527 vmx_vmcs_enter(v);
529 __vmwrite(GUEST_SS_SELECTOR, regs->ss);
530 __vmwrite(GUEST_DS_SELECTOR, regs->ds);
531 __vmwrite(GUEST_ES_SELECTOR, regs->es);
532 __vmwrite(GUEST_GS_SELECTOR, regs->gs);
533 __vmwrite(GUEST_FS_SELECTOR, regs->fs);
535 __vmwrite(GUEST_RSP, regs->esp);
537 __vmwrite(GUEST_RFLAGS, regs->eflags);
538 if (regs->eflags & EF_TF)
539 __vm_set_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
540 else
541 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
542 if (regs->eflags & EF_VM)
543 fixup_vm86_seg_bases(regs);
545 __vmwrite(GUEST_CS_SELECTOR, regs->cs);
546 __vmwrite(GUEST_RIP, regs->eip);
548 vmx_vmcs_exit(v);
549 }
551 static unsigned long vmx_get_ctrl_reg(struct vcpu *v, unsigned int num)
552 {
553 switch ( num )
554 {
555 case 0:
556 return v->arch.hvm_vmx.cpu_cr0;
557 case 2:
558 return v->arch.hvm_vmx.cpu_cr2;
559 case 3:
560 return v->arch.hvm_vmx.cpu_cr3;
561 case 4:
562 return v->arch.hvm_vmx.cpu_shadow_cr4;
563 default:
564 BUG();
565 }
566 return 0; /* dummy */
567 }
571 /* Make sure that xen intercepts any FP accesses from current */
572 static void vmx_stts(struct vcpu *v)
573 {
574 unsigned long cr0;
576 /* VMX depends on operating on the current vcpu */
577 ASSERT(v == current);
579 /*
580 * If the guest does not have TS enabled then we must cause and handle an
581 * exception on first use of the FPU. If the guest *does* have TS enabled
582 * then this is not necessary: no FPU activity can occur until the guest
583 * clears CR0.TS, and we will initialise the FPU when that happens.
584 */
585 __vmread_vcpu(v, CR0_READ_SHADOW, &cr0);
586 if ( !(cr0 & X86_CR0_TS) )
587 {
588 __vmread_vcpu(v, GUEST_CR0, &cr0);
589 __vmwrite(GUEST_CR0, cr0 | X86_CR0_TS);
590 __vm_set_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
591 }
592 }
595 static void vmx_set_tsc_offset(struct vcpu *v, u64 offset)
596 {
597 /* VMX depends on operating on the current vcpu */
598 ASSERT(v == current);
600 __vmwrite(TSC_OFFSET, offset);
601 #if defined (__i386__)
602 __vmwrite(TSC_OFFSET_HIGH, offset >> 32);
603 #endif
604 }
608 /* SMP VMX guest support */
609 static void vmx_init_ap_context(struct vcpu_guest_context *ctxt,
610 int vcpuid, int trampoline_vector)
611 {
612 int i;
614 memset(ctxt, 0, sizeof(*ctxt));
616 /*
617 * Initial register values:
618 */
619 ctxt->user_regs.eip = VMXASSIST_BASE;
620 ctxt->user_regs.edx = vcpuid;
621 ctxt->user_regs.ebx = trampoline_vector;
623 /* Virtual IDT is empty at start-of-day. */
624 for ( i = 0; i < 256; i++ )
625 {
626 ctxt->trap_ctxt[i].vector = i;
627 ctxt->trap_ctxt[i].cs = FLAT_KERNEL_CS;
628 }
630 /* No callback handlers. */
631 #if defined(__i386__)
632 ctxt->event_callback_cs = FLAT_KERNEL_CS;
633 ctxt->failsafe_callback_cs = FLAT_KERNEL_CS;
634 #endif
635 }
637 void do_nmi(struct cpu_user_regs *);
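/*
 * Populate the guest's hypercall transfer page: each hypercall gets a
 * 32-byte stub that loads the hypercall number into %eax, executes
 * VMCALL (0f 01 c1) to trap into Xen, and returns.
 */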
639 static void vmx_init_hypercall_page(struct domain *d, void *hypercall_page)
640 {
641 char *p;
642 int i;
644 memset(hypercall_page, 0, PAGE_SIZE);
646 for ( i = 0; i < (PAGE_SIZE / 32); i++ )
647 {
648 p = (char *)(hypercall_page + (i * 32));
649 *(u8 *)(p + 0) = 0xb8; /* mov imm32, %eax */
650 *(u32 *)(p + 1) = i;
651 *(u8 *)(p + 5) = 0x0f; /* vmcall */
652 *(u8 *)(p + 6) = 0x01;
653 *(u8 *)(p + 7) = 0xc1;
654 *(u8 *)(p + 8) = 0xc3; /* ret */
655 }
657 /* Don't support HYPERVISOR_iret at the moment */
658 *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
659 }
661 static int vmx_realmode(struct vcpu *v)
662 {
663 unsigned long rflags;
665 ASSERT(v == current);
667 __vmread(GUEST_RFLAGS, &rflags);
668 return rflags & X86_EFLAGS_VM;
669 }
671 static int vmx_guest_x86_mode(struct vcpu *v)
672 {
673 unsigned long cs_ar_bytes;
675 ASSERT(v == current);
677 __vmread(GUEST_CS_AR_BYTES, &cs_ar_bytes);
679 if ( vmx_long_mode_enabled(v) )
680 return ((cs_ar_bytes & (1u<<13)) ?
681 X86EMUL_MODE_PROT64 : X86EMUL_MODE_PROT32);
683 if ( vmx_realmode(v) )
684 return X86EMUL_MODE_REAL;
686 return ((cs_ar_bytes & (1u<<14)) ?
687 X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16);
688 }
690 /* Setup HVM interfaces */
691 static void vmx_setup_hvm_funcs(void)
692 {
693 if ( hvm_enabled )
694 return;
696 hvm_funcs.disable = stop_vmx;
698 hvm_funcs.vcpu_initialise = vmx_vcpu_initialise;
699 hvm_funcs.relinquish_guest_resources = vmx_relinquish_guest_resources;
701 hvm_funcs.store_cpu_guest_regs = vmx_store_cpu_guest_regs;
702 hvm_funcs.load_cpu_guest_regs = vmx_load_cpu_guest_regs;
704 hvm_funcs.realmode = vmx_realmode;
705 hvm_funcs.paging_enabled = vmx_paging_enabled;
706 hvm_funcs.long_mode_enabled = vmx_long_mode_enabled;
707 hvm_funcs.pae_enabled = vmx_pae_enabled;
708 hvm_funcs.guest_x86_mode = vmx_guest_x86_mode;
709 hvm_funcs.get_guest_ctrl_reg = vmx_get_ctrl_reg;
711 hvm_funcs.update_host_cr3 = vmx_update_host_cr3;
713 hvm_funcs.stts = vmx_stts;
714 hvm_funcs.set_tsc_offset = vmx_set_tsc_offset;
716 hvm_funcs.init_ap_context = vmx_init_ap_context;
718 hvm_funcs.init_hypercall_page = vmx_init_hypercall_page;
719 }
721 int start_vmx(void)
722 {
723 u32 eax, edx;
724 struct vmcs_struct *vmcs;
726 /*
727 * Xen does not fill x86_capability words except 0.
728 */
729 boot_cpu_data.x86_capability[4] = cpuid_ecx(1);
731 if ( !test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability) )
732 return 0;
734 rdmsr(IA32_FEATURE_CONTROL_MSR, eax, edx);
736 if ( eax & IA32_FEATURE_CONTROL_MSR_LOCK )
737 {
738 if ( (eax & IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON) == 0x0 )
739 {
740 printk("VMX disabled by Feature Control MSR.\n");
741 return 0;
742 }
743 }
744 else
745 {
746 wrmsr(IA32_FEATURE_CONTROL_MSR,
747 IA32_FEATURE_CONTROL_MSR_LOCK |
748 IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON, 0);
749 }
751 set_in_cr4(X86_CR4_VMXE);
753 vmx_init_vmcs_config();
755 if ( smp_processor_id() == 0 )
756 setup_vmcs_dump();
758 if ( (vmcs = vmx_alloc_host_vmcs()) == NULL )
759 {
760 clear_in_cr4(X86_CR4_VMXE);
761 printk("Failed to allocate host VMCS\n");
762 return 0;
763 }
765 if ( __vmxon(virt_to_maddr(vmcs)) )
766 {
767 clear_in_cr4(X86_CR4_VMXE);
768 printk("VMXON failed\n");
769 vmx_free_host_vmcs(vmcs);
770 return 0;
771 }
773 printk("VMXON is done\n");
775 vmx_save_init_msrs();
777 vmx_setup_hvm_funcs();
779 hvm_enabled = 1;
781 return 1;
782 }
784 /*
785 * Not all cases receive a valid value in the VM-exit instruction length field.
786 * Callers must know what they're doing!
787 */
788 static int __get_instruction_length(void)
789 {
790 int len;
791 __vmread(VM_EXIT_INSTRUCTION_LEN, &len); /* Safe: callers audited */
792 if ( (len < 1) || (len > 15) )
793 __hvm_bug(guest_cpu_user_regs());
794 return len;
795 }
797 static inline void __update_guest_eip(unsigned long inst_len)
798 {
799 unsigned long current_eip;
801 __vmread(GUEST_RIP, &current_eip);
802 __vmwrite(GUEST_RIP, current_eip + inst_len);
803 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
804 }
806 static int vmx_do_page_fault(unsigned long va, struct cpu_user_regs *regs)
807 {
808 int result;
810 #if 0 /* keep for debugging */
811 {
812 unsigned long eip, cs;
814 __vmread(GUEST_CS_BASE, &cs);
815 __vmread(GUEST_RIP, &eip);
816 HVM_DBG_LOG(DBG_LEVEL_VMMU,
817 "vmx_do_page_fault = 0x%lx, cs_base=%lx, "
818 "eip = %lx, error_code = %lx\n",
819 va, cs, eip, (unsigned long)regs->error_code);
820 }
821 #endif
823 result = shadow_fault(va, regs);
825 TRACE_VMEXIT(2, result);
826 #if 0
827 if ( !result )
828 {
829 __vmread(GUEST_RIP, &eip);
830 printk("vmx pgfault to guest va=%lx eip=%lx\n", va, eip);
831 }
832 #endif
834 return result;
835 }
837 static void vmx_do_no_device_fault(void)
838 {
839 unsigned long cr0;
840 struct vcpu *v = current;
842 setup_fpu(current);
843 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
845 /* Disable TS in guest CR0 unless the guest wants the exception too. */
846 __vmread_vcpu(v, CR0_READ_SHADOW, &cr0);
847 if ( !(cr0 & X86_CR0_TS) )
848 {
849 __vmread_vcpu(v, GUEST_CR0, &cr0);
850 cr0 &= ~X86_CR0_TS;
851 __vmwrite(GUEST_CR0, cr0);
852 }
853 }
855 #define bitmaskof(idx) (1U << ((idx)&31))
856 static void vmx_do_cpuid(struct cpu_user_regs *regs)
857 {
858 unsigned int input = (unsigned int)regs->eax;
859 unsigned int count = (unsigned int)regs->ecx;
860 unsigned int eax, ebx, ecx, edx;
861 unsigned long eip;
862 struct vcpu *v = current;
864 __vmread(GUEST_RIP, &eip);
866 HVM_DBG_LOG(DBG_LEVEL_3, "(eax) 0x%08lx, (ebx) 0x%08lx, "
867 "(ecx) 0x%08lx, (edx) 0x%08lx, (esi) 0x%08lx, (edi) 0x%08lx",
868 (unsigned long)regs->eax, (unsigned long)regs->ebx,
869 (unsigned long)regs->ecx, (unsigned long)regs->edx,
870 (unsigned long)regs->esi, (unsigned long)regs->edi);
872 if ( input == CPUID_LEAF_0x4 )
873 {
874 cpuid_count(input, count, &eax, &ebx, &ecx, &edx);
875 eax &= NUM_CORES_RESET_MASK;
876 }
877 else if ( input == 0x40000003 )
878 {
879 /*
880 * NB. Unsupported interface for private use of VMXASSIST only.
881 * Note that this leaf lives at <max-hypervisor-leaf> + 1.
882 */
883 u64 value = ((u64)regs->edx << 32) | (u32)regs->ecx;
884 unsigned long mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
885 char *p;
887 gdprintk(XENLOG_INFO, "Input address is 0x%"PRIx64".\n", value);
889 /* 8-byte aligned valid pseudophys address from vmxassist, please. */
890 if ( (value & 7) || (mfn == INVALID_MFN) ||
891 !v->arch.hvm_vmx.vmxassist_enabled )
892 domain_crash_synchronous();
894 p = map_domain_page(mfn);
895 value = *((uint64_t *)(p + (value & (PAGE_SIZE - 1))));
896 unmap_domain_page(p);
898 gdprintk(XENLOG_INFO, "Output value is 0x%"PRIx64".\n", value);
899 ecx = (u32)(value >> 0);
900 edx = (u32)(value >> 32);
901 }
902 else if ( !cpuid_hypervisor_leaves(input, &eax, &ebx, &ecx, &edx) )
903 {
904 cpuid(input, &eax, &ebx, &ecx, &edx);
906 if ( input == CPUID_LEAF_0x1 )
907 {
908 /* mask off reserved bits */
909 ecx &= ~VMX_VCPU_CPUID_L1_ECX_RESERVED;
911 if ( !hvm_apic_support(v->domain) ||
912 !vlapic_global_enabled((VLAPIC(v))) )
913 {
914 /* Since the APIC is disabled, avoid any
915 confusion about SMP CPUs being available. */
917 clear_bit(X86_FEATURE_APIC, &edx);
918 }
920 #if CONFIG_PAGING_LEVELS >= 3
921 if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
922 #endif
923 clear_bit(X86_FEATURE_PAE, &edx);
924 clear_bit(X86_FEATURE_PSE36, &edx);
926 ebx &= NUM_THREADS_RESET_MASK;
928 /* Unsupportable for virtualised CPUs. */
929 ecx &= ~(bitmaskof(X86_FEATURE_VMXE) |
930 bitmaskof(X86_FEATURE_EST) |
931 bitmaskof(X86_FEATURE_TM2) |
932 bitmaskof(X86_FEATURE_CID) |
933 bitmaskof(X86_FEATURE_MWAIT) );
935 edx &= ~( bitmaskof(X86_FEATURE_HT) |
936 bitmaskof(X86_FEATURE_ACPI) |
937 bitmaskof(X86_FEATURE_ACC) );
938 }
939 else if ( ( input == CPUID_LEAF_0x6 )
940 || ( input == CPUID_LEAF_0x9 )
941 || ( input == CPUID_LEAF_0xA ))
942 {
943 eax = ebx = ecx = edx = 0x0;
944 }
945 #ifdef __i386__
946 else if ( input == CPUID_LEAF_0x80000001 )
947 {
948 clear_bit(X86_FEATURE_LAHF_LM & 31, &ecx);
950 clear_bit(X86_FEATURE_LM & 31, &edx);
951 clear_bit(X86_FEATURE_SYSCALL & 31, &edx);
952 }
953 #endif
954 }
956 regs->eax = (unsigned long) eax;
957 regs->ebx = (unsigned long) ebx;
958 regs->ecx = (unsigned long) ecx;
959 regs->edx = (unsigned long) edx;
961 HVM_DBG_LOG(DBG_LEVEL_3, "eip@%lx, input: 0x%lx, "
962 "output: eax = 0x%08lx, ebx = 0x%08lx, "
963 "ecx = 0x%08lx, edx = 0x%08lx",
964 (unsigned long)eip, (unsigned long)input,
965 (unsigned long)eax, (unsigned long)ebx,
966 (unsigned long)ecx, (unsigned long)edx);
967 }
969 #define CASE_GET_REG_P(REG, reg) \
970 case REG_ ## REG: reg_p = (unsigned long *)&(regs->reg); break
972 #ifdef __i386__
973 #define CASE_EXTEND_GET_REG_P
974 #else
975 #define CASE_EXTEND_GET_REG_P \
976 CASE_GET_REG_P(R8, r8); \
977 CASE_GET_REG_P(R9, r9); \
978 CASE_GET_REG_P(R10, r10); \
979 CASE_GET_REG_P(R11, r11); \
980 CASE_GET_REG_P(R12, r12); \
981 CASE_GET_REG_P(R13, r13); \
982 CASE_GET_REG_P(R14, r14); \
983 CASE_GET_REG_P(R15, r15)
984 #endif
986 static void vmx_dr_access(unsigned long exit_qualification,
987 struct cpu_user_regs *regs)
988 {
989 struct vcpu *v = current;
991 v->arch.hvm_vcpu.flag_dr_dirty = 1;
993 /* We could probably be smarter about this */
994 __restore_debug_registers(v);
996 /* Allow guest direct access to DR registers */
997 v->arch.hvm_vcpu.u.vmx.exec_control &= ~CPU_BASED_MOV_DR_EXITING;
998 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
999 v->arch.hvm_vcpu.u.vmx.exec_control);
1002 /*
1003 * Invalidate the TLB for va. Invalidate the shadow page corresponding
1004 * to the address va.
1005 */
1006 static void vmx_do_invlpg(unsigned long va)
1008 unsigned long eip;
1009 struct vcpu *v = current;
1011 __vmread(GUEST_RIP, &eip);
1013 HVM_DBG_LOG(DBG_LEVEL_VMMU, "eip=%lx, va=%lx",
1014 eip, va);
1016 /*
1017 * We do the safest things first, then try to update the shadow,
1018 * copying from the guest.
1019 */
1020 shadow_invlpg(v, va);
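/*
 * Inspect the string I/O instruction at eip and return 1 if the segment it
 * references (ES for INS, the prefixed or default segment for OUTS) has a
 * null selector, 0 otherwise.
 */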
1024 static int check_for_null_selector(unsigned long eip, int inst_len, int dir)
1026 unsigned char inst[MAX_INST_LEN];
1027 unsigned long sel;
1028 int i;
1029 int inst_copy_from_guest(unsigned char *, unsigned long, int);
1031 /* INS can only use ES segment register, and it can't be overridden */
1032 if ( dir == IOREQ_READ )
1034 __vmread(GUEST_ES_SELECTOR, &sel);
1035 return sel == 0 ? 1 : 0;
1038 memset(inst, 0, MAX_INST_LEN);
1039 if ( inst_copy_from_guest(inst, eip, inst_len) != inst_len )
1041 printk("check_for_null_selector: get guest instruction failed\n");
1042 domain_crash_synchronous();
1045 for ( i = 0; i < inst_len; i++ )
1047 switch ( inst[i] )
1049 case 0xf3: /* REPZ */
1050 case 0xf2: /* REPNZ */
1051 case 0xf0: /* LOCK */
1052 case 0x66: /* data32 */
1053 case 0x67: /* addr32 */
1054 continue;
1055 case 0x2e: /* CS */
1056 __vmread(GUEST_CS_SELECTOR, &sel);
1057 break;
1058 case 0x36: /* SS */
1059 __vmread(GUEST_SS_SELECTOR, &sel);
1060 break;
1061 case 0x26: /* ES */
1062 __vmread(GUEST_ES_SELECTOR, &sel);
1063 break;
1064 case 0x64: /* FS */
1065 __vmread(GUEST_FS_SELECTOR, &sel);
1066 break;
1067 case 0x65: /* GS */
1068 __vmread(GUEST_GS_SELECTOR, &sel);
1069 break;
1070 case 0x3e: /* DS */
1071 /* FALLTHROUGH */
1072 default:
1073 /* DS is the default */
1074 __vmread(GUEST_DS_SELECTOR, &sel);
1076 return sel == 0 ? 1 : 0;
1079 return 0;
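/*
 * Handle an I/O-instruction VM exit: decode port, size, direction and any
 * REP/string semantics from the exit qualification, then forward the
 * request to the device model via send_pio_req().
 */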
1082 static void vmx_io_instruction(unsigned long exit_qualification,
1083 unsigned long inst_len)
1085 struct cpu_user_regs *regs;
1086 struct hvm_io_op *pio_opp;
1087 unsigned long port, size;
1088 int dir, df, vm86;
1090 pio_opp = &current->arch.hvm_vcpu.io_op;
1091 pio_opp->instr = INSTR_PIO;
1092 pio_opp->flags = 0;
1094 regs = &pio_opp->io_context;
1096 /* Copy current guest state into io instruction state structure. */
1097 memcpy(regs, guest_cpu_user_regs(), HVM_CONTEXT_STACK_BYTES);
1098 hvm_store_cpu_guest_regs(current, regs, NULL);
1100 vm86 = regs->eflags & X86_EFLAGS_VM ? 1 : 0;
1101 df = regs->eflags & X86_EFLAGS_DF ? 1 : 0;
1103 HVM_DBG_LOG(DBG_LEVEL_IO, "vm86 %d, eip=%x:%lx, "
1104 "exit_qualification = %lx",
1105 vm86, regs->cs, (unsigned long)regs->eip, exit_qualification);
1107 if ( test_bit(6, &exit_qualification) )
1108 port = (exit_qualification >> 16) & 0xFFFF;
1109 else
1110 port = regs->edx & 0xffff;
1112 TRACE_VMEXIT(1,port);
1114 size = (exit_qualification & 7) + 1;
1115 dir = test_bit(3, &exit_qualification); /* direction */
1117 if ( test_bit(4, &exit_qualification) ) { /* string instruction */
1118 unsigned long addr, count = 1;
1119 int sign = regs->eflags & X86_EFLAGS_DF ? -1 : 1;
1121 __vmread(GUEST_LINEAR_ADDRESS, &addr);
1123 /*
1124 * In protected mode, guest linear address is invalid if the
1125 * selector is null.
1126 */
1127 if ( !vm86 && check_for_null_selector(regs->eip, inst_len, dir) )
1128 addr = dir == IOREQ_WRITE ? regs->esi : regs->edi;
1130 if ( test_bit(5, &exit_qualification) ) { /* "rep" prefix */
1131 pio_opp->flags |= REPZ;
1132 count = vm86 ? regs->ecx & 0xFFFF : regs->ecx;
1135 /*
1136 * Handle string pio instructions that cross pages or that
1137 * are unaligned. See the comments in hvm_domain.c/handle_mmio()
1138 */
1139 if ( (addr & PAGE_MASK) != ((addr + size - 1) & PAGE_MASK) ) {
1140 unsigned long value = 0;
1142 pio_opp->flags |= OVERLAP;
1144 if ( dir == IOREQ_WRITE ) /* OUTS */
1146 if ( hvm_paging_enabled(current) )
1147 (void)hvm_copy_from_guest_virt(&value, addr, size);
1148 else
1149 (void)hvm_copy_from_guest_phys(&value, addr, size);
1150 } else
1151 pio_opp->addr = addr;
1153 if ( count == 1 )
1154 regs->eip += inst_len;
1156 send_pio_req(port, 1, size, value, dir, df, 0);
1157 } else {
1158 unsigned long last_addr = sign > 0 ? addr + count * size - 1
1159 : addr - (count - 1) * size;
1161 if ( (addr & PAGE_MASK) != (last_addr & PAGE_MASK) )
1163 if ( sign > 0 )
1164 count = (PAGE_SIZE - (addr & ~PAGE_MASK)) / size;
1165 else
1166 count = (addr & ~PAGE_MASK) / size + 1;
1167 } else
1168 regs->eip += inst_len;
1170 send_pio_req(port, count, size, addr, dir, df, 1);
1172 } else {
1173 if ( port == 0xe9 && dir == IOREQ_WRITE && size == 1 )
1174 hvm_print_line(current, regs->eax); /* guest debug output */
1176 regs->eip += inst_len;
1177 send_pio_req(port, 1, size, regs->eax, dir, df, 0);
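/* Capture the guest state needed for a world switch into a vmxassist context. */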
1181 static int vmx_world_save(struct vcpu *v, struct vmx_assist_context *c)
1183 int error = 0;
1185 /* NB. Skip transition instruction. */
1186 error |= __vmread(GUEST_RIP, &c->eip);
1187 c->eip += __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
1189 error |= __vmread(GUEST_RSP, &c->esp);
1190 error |= __vmread(GUEST_RFLAGS, &c->eflags);
1192 error |= __vmread(CR0_READ_SHADOW, &c->cr0);
1193 c->cr3 = v->arch.hvm_vmx.cpu_cr3;
1194 error |= __vmread(CR4_READ_SHADOW, &c->cr4);
1196 error |= __vmread(GUEST_IDTR_LIMIT, &c->idtr_limit);
1197 error |= __vmread(GUEST_IDTR_BASE, &c->idtr_base);
1199 error |= __vmread(GUEST_GDTR_LIMIT, &c->gdtr_limit);
1200 error |= __vmread(GUEST_GDTR_BASE, &c->gdtr_base);
1202 error |= __vmread(GUEST_CS_SELECTOR, &c->cs_sel);
1203 error |= __vmread(GUEST_CS_LIMIT, &c->cs_limit);
1204 error |= __vmread(GUEST_CS_BASE, &c->cs_base);
1205 error |= __vmread(GUEST_CS_AR_BYTES, &c->cs_arbytes.bytes);
1207 error |= __vmread(GUEST_DS_SELECTOR, &c->ds_sel);
1208 error |= __vmread(GUEST_DS_LIMIT, &c->ds_limit);
1209 error |= __vmread(GUEST_DS_BASE, &c->ds_base);
1210 error |= __vmread(GUEST_DS_AR_BYTES, &c->ds_arbytes.bytes);
1212 error |= __vmread(GUEST_ES_SELECTOR, &c->es_sel);
1213 error |= __vmread(GUEST_ES_LIMIT, &c->es_limit);
1214 error |= __vmread(GUEST_ES_BASE, &c->es_base);
1215 error |= __vmread(GUEST_ES_AR_BYTES, &c->es_arbytes.bytes);
1217 error |= __vmread(GUEST_SS_SELECTOR, &c->ss_sel);
1218 error |= __vmread(GUEST_SS_LIMIT, &c->ss_limit);
1219 error |= __vmread(GUEST_SS_BASE, &c->ss_base);
1220 error |= __vmread(GUEST_SS_AR_BYTES, &c->ss_arbytes.bytes);
1222 error |= __vmread(GUEST_FS_SELECTOR, &c->fs_sel);
1223 error |= __vmread(GUEST_FS_LIMIT, &c->fs_limit);
1224 error |= __vmread(GUEST_FS_BASE, &c->fs_base);
1225 error |= __vmread(GUEST_FS_AR_BYTES, &c->fs_arbytes.bytes);
1227 error |= __vmread(GUEST_GS_SELECTOR, &c->gs_sel);
1228 error |= __vmread(GUEST_GS_LIMIT, &c->gs_limit);
1229 error |= __vmread(GUEST_GS_BASE, &c->gs_base);
1230 error |= __vmread(GUEST_GS_AR_BYTES, &c->gs_arbytes.bytes);
1232 error |= __vmread(GUEST_TR_SELECTOR, &c->tr_sel);
1233 error |= __vmread(GUEST_TR_LIMIT, &c->tr_limit);
1234 error |= __vmread(GUEST_TR_BASE, &c->tr_base);
1235 error |= __vmread(GUEST_TR_AR_BYTES, &c->tr_arbytes.bytes);
1237 error |= __vmread(GUEST_LDTR_SELECTOR, &c->ldtr_sel);
1238 error |= __vmread(GUEST_LDTR_LIMIT, &c->ldtr_limit);
1239 error |= __vmread(GUEST_LDTR_BASE, &c->ldtr_base);
1240 error |= __vmread(GUEST_LDTR_AR_BYTES, &c->ldtr_arbytes.bytes);
1242 return !error;
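/*
 * Reload guest state from a vmxassist context, revalidating CR3 and
 * refreshing the shadow pagetables if paging is enabled.
 */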
1245 static int vmx_world_restore(struct vcpu *v, struct vmx_assist_context *c)
1247 unsigned long mfn, old_base_mfn;
1248 int error = 0;
1250 error |= __vmwrite(GUEST_RIP, c->eip);
1251 error |= __vmwrite(GUEST_RSP, c->esp);
1252 error |= __vmwrite(GUEST_RFLAGS, c->eflags);
1254 error |= __vmwrite(CR0_READ_SHADOW, c->cr0);
1256 if (!vmx_paging_enabled(v))
1257 goto skip_cr3;
1259 if (c->cr3 == v->arch.hvm_vmx.cpu_cr3) {
1260 /*
1261 * This is a simple TLB flush, implying the guest has
1262 * removed some translation or changed page attributes.
1263 * We simply invalidate the shadow.
1264 */
1265 mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
1266 if (mfn != pagetable_get_pfn(v->arch.guest_table)) {
1267 printk("Invalid CR3 value=%x", c->cr3);
1268 domain_crash_synchronous();
1269 return 0;
1271 } else {
1272 /*
1273 * If different, make a shadow. Check if the PDBR is valid
1274 * first.
1275 */
1276 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %x", c->cr3);
1277 mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
1278 if ( !VALID_MFN(mfn) )
1280 printk("Invalid CR3 value=%x", c->cr3);
1281 domain_crash_synchronous();
1282 return 0;
1284 if(!get_page(mfn_to_page(mfn), v->domain))
1285 return 0;
1286 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1287 v->arch.guest_table = pagetable_from_pfn(mfn);
1288 if (old_base_mfn)
1289 put_page(mfn_to_page(old_base_mfn));
1290 /*
1291 * arch.shadow_table should now hold the next CR3 for shadow
1292 */
1293 v->arch.hvm_vmx.cpu_cr3 = c->cr3;
1296 skip_cr3:
1298 if (!vmx_paging_enabled(v))
1299 HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
1300 else
1301 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3);
1303 error |= __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
1304 error |= __vmwrite(CR4_READ_SHADOW, c->cr4);
1306 error |= __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
1307 error |= __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
1309 error |= __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
1310 error |= __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
1312 error |= __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
1313 error |= __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
1314 error |= __vmwrite(GUEST_CS_BASE, c->cs_base);
1315 error |= __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes.bytes);
1317 error |= __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
1318 error |= __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
1319 error |= __vmwrite(GUEST_DS_BASE, c->ds_base);
1320 error |= __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes.bytes);
1322 error |= __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
1323 error |= __vmwrite(GUEST_ES_LIMIT, c->es_limit);
1324 error |= __vmwrite(GUEST_ES_BASE, c->es_base);
1325 error |= __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes.bytes);
1327 error |= __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
1328 error |= __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
1329 error |= __vmwrite(GUEST_SS_BASE, c->ss_base);
1330 error |= __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes.bytes);
1332 error |= __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
1333 error |= __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
1334 error |= __vmwrite(GUEST_FS_BASE, c->fs_base);
1335 error |= __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes.bytes);
1337 error |= __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
1338 error |= __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
1339 error |= __vmwrite(GUEST_GS_BASE, c->gs_base);
1340 error |= __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes.bytes);
1342 error |= __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
1343 error |= __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
1344 error |= __vmwrite(GUEST_TR_BASE, c->tr_base);
1345 error |= __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes.bytes);
1347 error |= __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
1348 error |= __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
1349 error |= __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
1350 error |= __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes.bytes);
1352 shadow_update_paging_modes(v);
1353 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
1355 return !error;
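/*
 * World-switch into (VMX_ASSIST_INVOKE) or out of (VMX_ASSIST_RESTORE) the
 * vmxassist real-mode emulator running inside the guest.
 */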
1358 enum { VMX_ASSIST_INVOKE = 0, VMX_ASSIST_RESTORE };
1360 static int vmx_assist(struct vcpu *v, int mode)
1362 struct vmx_assist_context c;
1363 u32 magic;
1364 u32 cp;
1366 /* make sure vmxassist exists (this is not an error) */
1367 if (hvm_copy_from_guest_phys(&magic, VMXASSIST_MAGIC_OFFSET,
1368 sizeof(magic)))
1369 return 0;
1370 if (magic != VMXASSIST_MAGIC)
1371 return 0;
1373 switch (mode) {
1374 /*
1375 * Transfer control to vmxassist.
1376 * Store the current context in VMXASSIST_OLD_CONTEXT and load
1377 * the new VMXASSIST_NEW_CONTEXT context. This context was created
1378 * by vmxassist and will transfer control to it.
1379 */
1380 case VMX_ASSIST_INVOKE:
1381 /* save the old context */
1382 if (hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)))
1383 goto error;
1384 if (cp != 0) {
1385 if (!vmx_world_save(v, &c))
1386 goto error;
1387 if (hvm_copy_to_guest_phys(cp, &c, sizeof(c)))
1388 goto error;
1391 /* restore the new context, this should activate vmxassist */
1392 if (hvm_copy_from_guest_phys(&cp, VMXASSIST_NEW_CONTEXT, sizeof(cp)))
1393 goto error;
1394 if (cp != 0) {
1395 if (hvm_copy_from_guest_phys(&c, cp, sizeof(c)))
1396 goto error;
1397 if (!vmx_world_restore(v, &c))
1398 goto error;
1399 v->arch.hvm_vmx.vmxassist_enabled = 1;
1400 return 1;
1402 break;
1404 /*
1405 * Restore the VMXASSIST_OLD_CONTEXT that was saved by
1406 * VMX_ASSIST_INVOKE above.
1407 */
1408 case VMX_ASSIST_RESTORE:
1409 /* save the old context */
1410 if (hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)))
1411 goto error;
1412 if (cp != 0) {
1413 if (hvm_copy_from_guest_phys(&c, cp, sizeof(c)))
1414 goto error;
1415 if (!vmx_world_restore(v, &c))
1416 goto error;
1417 v->arch.hvm_vmx.vmxassist_enabled = 0;
1418 return 1;
1420 break;
1423 error:
1424 printk("Failed to transfer to vmxassist\n");
1425 domain_crash_synchronous();
1426 return 0;
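/*
 * Handle a guest write to CR0: update the read shadow, enable or disable
 * paging and long mode as required, and world-switch to/from vmxassist
 * when the guest toggles CR0.PE.
 */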
1429 static int vmx_set_cr0(unsigned long value)
1431 struct vcpu *v = current;
1432 unsigned long mfn;
1433 unsigned long eip;
1434 int paging_enabled;
1435 unsigned long vm_entry_value;
1436 unsigned long old_cr0;
1437 unsigned long old_base_mfn;
1439 /*
1440 * CR0: We don't want to lose PE and PG.
1441 */
1442 __vmread_vcpu(v, CR0_READ_SHADOW, &old_cr0);
1443 paging_enabled = (old_cr0 & X86_CR0_PE) && (old_cr0 & X86_CR0_PG);
1445 /* TS cleared? Then initialise FPU now. */
1446 if ( !(value & X86_CR0_TS) )
1448 setup_fpu(v);
1449 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
1452 __vmwrite(GUEST_CR0, value | X86_CR0_PE | X86_CR0_PG | X86_CR0_NE);
1453 __vmwrite(CR0_READ_SHADOW, value);
1455 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx\n", value);
1457 if ( (value & X86_CR0_PE) && (value & X86_CR0_PG) && !paging_enabled )
1459 /*
1460 * Trying to enable guest paging.
1461 * The guest CR3 must be pointing to the guest physical.
1462 */
1463 mfn = get_mfn_from_gpfn(v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT);
1464 if ( !VALID_MFN(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
1466 printk("Invalid CR3 value = %lx (mfn=%lx)\n",
1467 v->arch.hvm_vmx.cpu_cr3, mfn);
1468 domain_crash_synchronous(); /* need to take a clean path */
1471 #if defined(__x86_64__)
1472 if ( vmx_lme_is_set(v) )
1474 if ( !(v->arch.hvm_vmx.cpu_shadow_cr4 & X86_CR4_PAE) )
1476 HVM_DBG_LOG(DBG_LEVEL_1, "Guest enabled paging "
1477 "with EFER.LME set but not CR4.PAE\n");
1478 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
1480 else
1482 HVM_DBG_LOG(DBG_LEVEL_1, "Enabling long mode\n");
1483 v->arch.hvm_vmx.msr_content.msr_items[VMX_INDEX_MSR_EFER]
1484 |= EFER_LMA;
1485 __vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
1486 vm_entry_value |= VM_ENTRY_IA32E_MODE;
1487 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1490 #endif
1492 /*
1493 * Now arch.guest_table points to machine physical.
1494 */
1495 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1496 v->arch.guest_table = pagetable_from_pfn(mfn);
1497 if (old_base_mfn)
1498 put_page(mfn_to_page(old_base_mfn));
1499 shadow_update_paging_modes(v);
1501 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
1502 (unsigned long) (mfn << PAGE_SHIFT));
1504 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
1505 /*
1506 * arch->shadow_table should hold the next CR3 for shadow
1507 */
1508 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
1509 v->arch.hvm_vmx.cpu_cr3, mfn);
1512 if ( !((value & X86_CR0_PE) && (value & X86_CR0_PG)) && paging_enabled )
1513 if ( v->arch.hvm_vmx.cpu_cr3 ) {
1514 put_page(mfn_to_page(get_mfn_from_gpfn(
1515 v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)));
1516 v->arch.guest_table = pagetable_null();
1519 /*
1520 * VMX does not implement real-mode virtualization. We emulate
1521 * real-mode by performing a world switch to VMXAssist whenever
1522 * a partition disables the CR0.PE bit.
1523 */
1524 if ( (value & X86_CR0_PE) == 0 )
1526 if ( value & X86_CR0_PG ) {
1527 /* inject GP here */
1528 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
1529 return 0;
1530 } else {
1531 /*
1532 * Disable paging here.
1533 * Same as PE == 1 && PG == 0.
1534 */
1535 if ( vmx_long_mode_enabled(v) )
1537 v->arch.hvm_vmx.msr_content.msr_items[VMX_INDEX_MSR_EFER]
1538 &= ~EFER_LMA;
1539 __vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
1540 vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
1541 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1545 if ( vmx_assist(v, VMX_ASSIST_INVOKE) )
1547 __vmread(GUEST_RIP, &eip);
1548 HVM_DBG_LOG(DBG_LEVEL_1,
1549 "Transfering control to vmxassist %%eip 0x%lx\n", eip);
1550 return 0; /* do not update eip! */
1553 else if ( v->arch.hvm_vmx.vmxassist_enabled )
1555 __vmread(GUEST_RIP, &eip);
1556 HVM_DBG_LOG(DBG_LEVEL_1,
1557 "Enabling CR0.PE at %%eip 0x%lx\n", eip);
1558 if ( vmx_assist(v, VMX_ASSIST_RESTORE) )
1560 __vmread(GUEST_RIP, &eip);
1561 HVM_DBG_LOG(DBG_LEVEL_1,
1562 "Restoring to %%eip 0x%lx\n", eip);
1563 return 0; /* do not update eip! */
1566 else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
1568 shadow_update_paging_modes(v);
1569 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
1572 return 1;
1575 #define CASE_SET_REG(REG, reg) \
1576 case REG_ ## REG: regs->reg = value; break
1577 #define CASE_GET_REG(REG, reg) \
1578 case REG_ ## REG: value = regs->reg; break
1580 #define CASE_EXTEND_SET_REG \
1581 CASE_EXTEND_REG(S)
1582 #define CASE_EXTEND_GET_REG \
1583 CASE_EXTEND_REG(G)
1585 #ifdef __i386__
1586 #define CASE_EXTEND_REG(T)
1587 #else
1588 #define CASE_EXTEND_REG(T) \
1589 CASE_ ## T ## ET_REG(R8, r8); \
1590 CASE_ ## T ## ET_REG(R9, r9); \
1591 CASE_ ## T ## ET_REG(R10, r10); \
1592 CASE_ ## T ## ET_REG(R11, r11); \
1593 CASE_ ## T ## ET_REG(R12, r12); \
1594 CASE_ ## T ## ET_REG(R13, r13); \
1595 CASE_ ## T ## ET_REG(R14, r14); \
1596 CASE_ ## T ## ET_REG(R15, r15)
1597 #endif
1599 /*
1600 * Write to control registers
1601 */
1602 static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
1604 unsigned long value;
1605 unsigned long old_cr;
1606 struct vcpu *v = current;
1607 struct vlapic *vlapic = VLAPIC(v);
1609 switch ( gp ) {
1610 CASE_GET_REG(EAX, eax);
1611 CASE_GET_REG(ECX, ecx);
1612 CASE_GET_REG(EDX, edx);
1613 CASE_GET_REG(EBX, ebx);
1614 CASE_GET_REG(EBP, ebp);
1615 CASE_GET_REG(ESI, esi);
1616 CASE_GET_REG(EDI, edi);
1617 CASE_EXTEND_GET_REG;
1618 case REG_ESP:
1619 __vmread(GUEST_RSP, &value);
1620 break;
1621 default:
1622 printk("invalid gp: %d\n", gp);
1623 __hvm_bug(regs);
1626 TRACE_VMEXIT(1, TYPE_MOV_TO_CR);
1627 TRACE_VMEXIT(2, cr);
1628 TRACE_VMEXIT(3, value);
1630 HVM_DBG_LOG(DBG_LEVEL_1, "CR%d, value = %lx", cr, value);
1632 switch ( cr ) {
1633 case 0:
1634 return vmx_set_cr0(value);
1635 case 3:
1637 unsigned long old_base_mfn, mfn;
1639 /*
1640 * If paging is not enabled yet, simply copy the value to CR3.
1641 */
1642 if (!vmx_paging_enabled(v)) {
1643 v->arch.hvm_vmx.cpu_cr3 = value;
1644 break;
1647 /*
1648 * We make a new one if the shadow does not exist.
1649 */
1650 if (value == v->arch.hvm_vmx.cpu_cr3) {
1651 /*
1652 * This is a simple TLB flush, implying the guest has
1653 * removed some translation or changed page attributes.
1654 * We simply invalidate the shadow.
1655 */
1656 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
1657 if (mfn != pagetable_get_pfn(v->arch.guest_table))
1658 __hvm_bug(regs);
1659 shadow_update_cr3(v);
1660 } else {
1661 /*
1662 * If different, make a shadow. Check if the PDBR is valid
1663 * first.
1664 */
1665 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
1666 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
1667 if ( !VALID_MFN(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
1669 printk("Invalid CR3 value=%lx\n", value);
1670 domain_crash_synchronous(); /* need to take a clean path */
1672 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1673 v->arch.guest_table = pagetable_from_pfn(mfn);
1674 if (old_base_mfn)
1675 put_page(mfn_to_page(old_base_mfn));
1676 /*
1677 * arch.shadow_table should now hold the next CR3 for shadow
1678 */
1679 v->arch.hvm_vmx.cpu_cr3 = value;
1680 update_cr3(v);
1681 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx",
1682 value);
1683 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
1685 break;
1687 case 4: /* CR4 */
1689 __vmread(CR4_READ_SHADOW, &old_cr);
1691 if ( value & X86_CR4_PAE && !(old_cr & X86_CR4_PAE) )
1693 if ( vmx_pgbit_test(v) )
1695 /* The guest is a 32-bit PAE guest. */
1696 #if CONFIG_PAGING_LEVELS >= 3
1697 unsigned long mfn, old_base_mfn;
1698 mfn = get_mfn_from_gpfn(v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT);
1699 if ( !VALID_MFN(mfn) ||
1700 !get_page(mfn_to_page(mfn), v->domain) )
1702 printk("Invalid CR3 value = %lx", v->arch.hvm_vmx.cpu_cr3);
1703 domain_crash_synchronous(); /* need to take a clean path */
1706 /*
1707 * Now arch.guest_table points to machine physical.
1708 */
1710 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1711 v->arch.guest_table = pagetable_from_pfn(mfn);
1712 if ( old_base_mfn )
1713 put_page(mfn_to_page(old_base_mfn));
1715 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
1716 (unsigned long) (mfn << PAGE_SHIFT));
1718 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
1720 /*
1721 * arch->shadow_table should hold the next CR3 for shadow
1722 */
1724 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
1725 v->arch.hvm_vmx.cpu_cr3, mfn);
1726 #endif
1729 else if ( !(value & X86_CR4_PAE) )
1731 if ( unlikely(vmx_long_mode_enabled(v)) )
1733 HVM_DBG_LOG(DBG_LEVEL_1, "Guest cleared CR4.PAE while "
1734 "EFER.LMA is set\n");
1735 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
1739 __vmwrite(GUEST_CR4, value| VMX_CR4_HOST_MASK);
1740 __vmwrite(CR4_READ_SHADOW, value);
1742 /*
1743 * Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates
1744 * all TLB entries except global entries.
1745 */
1746 if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
1747 shadow_update_paging_modes(v);
1748 break;
1750 case 8:
1752 if ( vlapic == NULL )
1753 break;
1754 vlapic_set_reg(vlapic, APIC_TASKPRI, ((value & 0x0F) << 4));
1755 break;
1757 default:
1758 printk("invalid cr: %d\n", gp);
1759 __hvm_bug(regs);
1762 return 1;
1765 /*
1766 * Read from control registers. CR0 and CR4 are read from the shadow.
1767 */
1768 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
1770 unsigned long value = 0;
1771 struct vcpu *v = current;
1772 struct vlapic *vlapic = VLAPIC(v);
1774 switch ( cr )
1776 case 3:
1777 value = (unsigned long)v->arch.hvm_vmx.cpu_cr3;
1778 break;
1779 case 8:
1780 if ( vlapic == NULL )
1781 break;
1782 value = (unsigned long)vlapic_get_reg(vlapic, APIC_TASKPRI);
1783 value = (value & 0xF0) >> 4;
1784 break;
1785 default:
1786 __hvm_bug(regs);
1789 switch ( gp ) {
1790 CASE_SET_REG(EAX, eax);
1791 CASE_SET_REG(ECX, ecx);
1792 CASE_SET_REG(EDX, edx);
1793 CASE_SET_REG(EBX, ebx);
1794 CASE_SET_REG(EBP, ebp);
1795 CASE_SET_REG(ESI, esi);
1796 CASE_SET_REG(EDI, edi);
1797 CASE_EXTEND_SET_REG;
1798 case REG_ESP:
1799 __vmwrite(GUEST_RSP, value);
1800 regs->esp = value;
1801 break;
1802 default:
1803 printk("invalid gp: %d\n", gp);
1804 __hvm_bug(regs);
1807 TRACE_VMEXIT(1, TYPE_MOV_FROM_CR);
1808 TRACE_VMEXIT(2, cr);
1809 TRACE_VMEXIT(3, value);
1811 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR%d, value = %lx", cr, value);
1814 static int vmx_cr_access(unsigned long exit_qualification,
1815 struct cpu_user_regs *regs)
1817 unsigned int gp, cr;
1818 unsigned long value;
1819 struct vcpu *v = current;
1821 switch (exit_qualification & CONTROL_REG_ACCESS_TYPE) {
1822 case TYPE_MOV_TO_CR:
1823 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
1824 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
1825 return mov_to_cr(gp, cr, regs);
1826 case TYPE_MOV_FROM_CR:
1827 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
1828 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
1829 mov_from_cr(cr, gp, regs);
1830 break;
1831 case TYPE_CLTS:
1832 TRACE_VMEXIT(1, TYPE_CLTS);
1834 /* We initialise the FPU now, to avoid needing another vmexit. */
1835 setup_fpu(v);
1836 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
1838 __vmread_vcpu(v, GUEST_CR0, &value);
1839 value &= ~X86_CR0_TS; /* clear TS */
1840 __vmwrite(GUEST_CR0, value);
1842 __vmread_vcpu(v, CR0_READ_SHADOW, &value);
1843 value &= ~X86_CR0_TS; /* clear TS */
1844 __vmwrite(CR0_READ_SHADOW, value);
1845 break;
1846 case TYPE_LMSW:
1847 __vmread_vcpu(v, CR0_READ_SHADOW, &value);
1848 value = (value & ~0xF) |
1849 (((exit_qualification & LMSW_SOURCE_DATA) >> 16) & 0xF);
1850 TRACE_VMEXIT(1, TYPE_LMSW);
1851 TRACE_VMEXIT(2, value);
1852 return vmx_set_cr0(value);
1853 break;
1854 default:
1855 __hvm_bug(regs);
1856 break;
1858 return 1;
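/*
 * RDMSR exit handler: satisfy reads of the MSRs Xen virtualises directly
 * (TSC, SYSENTER, APIC base, long-mode MSRs) and fall back to the
 * hypervisor leaves or rdmsr_safe() otherwise.
 */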
1861 static inline void vmx_do_msr_read(struct cpu_user_regs *regs)
1863 u64 msr_content = 0;
1864 u32 eax, edx;
1865 struct vcpu *v = current;
1867 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%lx, eax=%lx, edx=%lx",
1868 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1869 (unsigned long)regs->edx);
1870 switch (regs->ecx) {
1871 case MSR_IA32_TIME_STAMP_COUNTER:
1872 msr_content = hvm_get_guest_time(v);
1873 break;
1874 case MSR_IA32_SYSENTER_CS:
1875 __vmread(GUEST_SYSENTER_CS, (u32 *)&msr_content);
1876 break;
1877 case MSR_IA32_SYSENTER_ESP:
1878 __vmread(GUEST_SYSENTER_ESP, &msr_content);
1879 break;
1880 case MSR_IA32_SYSENTER_EIP:
1881 __vmread(GUEST_SYSENTER_EIP, &msr_content);
1882 break;
1883 case MSR_IA32_APICBASE:
1884 msr_content = VLAPIC(v) ? VLAPIC(v)->apic_base_msr : 0;
1885 break;
1886 default:
1887 if (long_mode_do_msr_read(regs))
1888 return;
1890 if ( rdmsr_hypervisor_regs(regs->ecx, &eax, &edx) )
1892 regs->eax = eax;
1893 regs->edx = edx;
1894 return;
1897 rdmsr_safe(regs->ecx, regs->eax, regs->edx);
1898 return;
1901 regs->eax = msr_content & 0xFFFFFFFF;
1902 regs->edx = msr_content >> 32;
1904 HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%lx, eax=%lx, edx=%lx",
1905 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1906 (unsigned long)regs->edx);
1909 static inline void vmx_do_msr_write(struct cpu_user_regs *regs)
1911 u64 msr_content;
1912 struct vcpu *v = current;
1914 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%lx, eax=%lx, edx=%lx",
1915 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1916 (unsigned long)regs->edx);
1918 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
1920 switch (regs->ecx) {
1921 case MSR_IA32_TIME_STAMP_COUNTER:
1923 struct periodic_time *pt =
1924 &(v->domain->arch.hvm_domain.pl_time.periodic_tm);
1925 if ( pt->enabled && pt->first_injected
1926 && v->vcpu_id == pt->bind_vcpu )
1927 pt->first_injected = 0;
1929 hvm_set_guest_time(v, msr_content);
1930 break;
1931 case MSR_IA32_SYSENTER_CS:
1932 __vmwrite(GUEST_SYSENTER_CS, msr_content);
1933 break;
1934 case MSR_IA32_SYSENTER_ESP:
1935 __vmwrite(GUEST_SYSENTER_ESP, msr_content);
1936 break;
1937 case MSR_IA32_SYSENTER_EIP:
1938 __vmwrite(GUEST_SYSENTER_EIP, msr_content);
1939 break;
1940 case MSR_IA32_APICBASE:
1941 vlapic_msr_set(VLAPIC(v), msr_content);
1942 break;
1943 default:
1944 if ( !long_mode_do_msr_write(regs) )
1945 wrmsr_hypervisor_regs(regs->ecx, regs->eax, regs->edx);
1946 break;
1949 HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%lx, eax=%lx, edx=%lx",
1950 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1951 (unsigned long)regs->edx);
1954 static void vmx_do_hlt(void)
1956 unsigned long rflags;
1957 __vmread(GUEST_RFLAGS, &rflags);
1958 hvm_hlt(rflags);
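/*
 * A physical interrupt arrived while the guest was running: dispatch it to
 * the appropriate Xen handler on this CPU.
 */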
1961 static inline void vmx_do_extint(struct cpu_user_regs *regs)
1963 unsigned int vector;
1964 int error;
1966 asmlinkage void do_IRQ(struct cpu_user_regs *);
1967 fastcall void smp_apic_timer_interrupt(struct cpu_user_regs *);
1968 fastcall void smp_event_check_interrupt(void);
1969 fastcall void smp_invalidate_interrupt(void);
1970 fastcall void smp_call_function_interrupt(void);
1971 fastcall void smp_spurious_interrupt(struct cpu_user_regs *regs);
1972 fastcall void smp_error_interrupt(struct cpu_user_regs *regs);
1973 #ifdef CONFIG_X86_MCE_P4THERMAL
1974 fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs);
1975 #endif
1977 if ((error = __vmread(VM_EXIT_INTR_INFO, &vector))
1978 && !(vector & INTR_INFO_VALID_MASK))
1979 __hvm_bug(regs);
1981 vector &= INTR_INFO_VECTOR_MASK;
1982 TRACE_VMEXIT(1, vector);
1984 switch(vector) {
1985 case LOCAL_TIMER_VECTOR:
1986 smp_apic_timer_interrupt(regs);
1987 break;
1988 case EVENT_CHECK_VECTOR:
1989 smp_event_check_interrupt();
1990 break;
1991 case INVALIDATE_TLB_VECTOR:
1992 smp_invalidate_interrupt();
1993 break;
1994 case CALL_FUNCTION_VECTOR:
1995 smp_call_function_interrupt();
1996 break;
1997 case SPURIOUS_APIC_VECTOR:
1998 smp_spurious_interrupt(regs);
1999 break;
2000 case ERROR_APIC_VECTOR:
2001 smp_error_interrupt(regs);
2002 break;
2003 #ifdef CONFIG_X86_MCE_P4THERMAL
2004 case THERMAL_APIC_VECTOR:
2005 smp_thermal_interrupt(regs);
2006 break;
2007 #endif
2008 default:
2009 regs->entry_vector = vector;
2010 do_IRQ(regs);
2011 break;
2015 #if defined (__x86_64__)
2016 void store_cpu_user_regs(struct cpu_user_regs *regs)
2018 __vmread(GUEST_SS_SELECTOR, &regs->ss);
2019 __vmread(GUEST_RSP, &regs->rsp);
2020 __vmread(GUEST_RFLAGS, &regs->rflags);
2021 __vmread(GUEST_CS_SELECTOR, &regs->cs);
2022 __vmread(GUEST_DS_SELECTOR, &regs->ds);
2023 __vmread(GUEST_ES_SELECTOR, &regs->es);
2024 __vmread(GUEST_RIP, &regs->rip);
2025 }
2026 #elif defined (__i386__)
2027 void store_cpu_user_regs(struct cpu_user_regs *regs)
2028 {
2029 __vmread(GUEST_SS_SELECTOR, &regs->ss);
2030 __vmread(GUEST_RSP, &regs->esp);
2031 __vmread(GUEST_RFLAGS, &regs->eflags);
2032 __vmread(GUEST_CS_SELECTOR, &regs->cs);
2033 __vmread(GUEST_DS_SELECTOR, &regs->ds);
2034 __vmread(GUEST_ES_SELECTOR, &regs->es);
2035 __vmread(GUEST_RIP, &regs->eip);
2036 }
2037 #endif
2039 #ifdef XEN_DEBUGGER
2040 void save_cpu_user_regs(struct cpu_user_regs *regs)
2041 {
2042 __vmread(GUEST_SS_SELECTOR, &regs->xss);
2043 __vmread(GUEST_RSP, &regs->esp);
2044 __vmread(GUEST_RFLAGS, &regs->eflags);
2045 __vmread(GUEST_CS_SELECTOR, &regs->xcs);
2046 __vmread(GUEST_RIP, &regs->eip);
2048 __vmread(GUEST_GS_SELECTOR, &regs->xgs);
2049 __vmread(GUEST_FS_SELECTOR, &regs->xfs);
2050 __vmread(GUEST_ES_SELECTOR, &regs->xes);
2051 __vmread(GUEST_DS_SELECTOR, &regs->xds);
2052 }
2054 void restore_cpu_user_regs(struct cpu_user_regs *regs)
2055 {
2056 __vmwrite(GUEST_SS_SELECTOR, regs->xss);
2057 __vmwrite(GUEST_RSP, regs->esp);
2058 __vmwrite(GUEST_RFLAGS, regs->eflags);
2059 __vmwrite(GUEST_CS_SELECTOR, regs->xcs);
2060 __vmwrite(GUEST_RIP, regs->eip);
2062 __vmwrite(GUEST_GS_SELECTOR, regs->xgs);
2063 __vmwrite(GUEST_FS_SELECTOR, regs->xfs);
2064 __vmwrite(GUEST_ES_SELECTOR, regs->xes);
2065 __vmwrite(GUEST_DS_SELECTOR, regs->xds);
2066 }
2067 #endif
2069 static void vmx_reflect_exception(struct vcpu *v)
2070 {
2071 int error_code, intr_info, vector;
2073 __vmread(VM_EXIT_INTR_INFO, &intr_info);
2074 vector = intr_info & 0xff;
2075 if ( intr_info & INTR_INFO_DELIVER_CODE_MASK )
2076 __vmread(VM_EXIT_INTR_ERROR_CODE, &error_code);
2077 else
2078 error_code = VMX_DELIVER_NO_ERROR_CODE;
2080 #ifndef NDEBUG
2081 {
2082 unsigned long rip;
2084 __vmread(GUEST_RIP, &rip);
2085 HVM_DBG_LOG(DBG_LEVEL_1, "rip = %lx, error_code = %x",
2086 rip, error_code);
2087 }
2088 #endif /* NDEBUG */
2090 /*
2091 * According to Intel Virtualization Technology Specification for
2092 * the IA-32 Intel Architecture (C97063-002 April 2005), section
2093 * 2.8.3, SW_EXCEPTION should be used for #BP and #OF, and
2094 * HW_EXCEPTION used for everything else. The main difference
2095 * appears to be that for SW_EXCEPTION, the EIP/RIP is incremented
2096 * by VM_ENTRY_INSTRUCTION_LEN bytes, whereas for HW_EXCEPTION,
2097 * it is not.
2098 */
2099 if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_SW_EXCEPTION )
2100 {
2101 int ilen = __get_instruction_length(); /* Safe: software exception */
2102 vmx_inject_sw_exception(v, vector, ilen);
2103 }
2104 else
2105 {
2106 vmx_inject_hw_exception(v, vector, error_code);
2107 }
2108 }
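/*
 * Editorial sketch, not part of the original file: as the comment in
 * vmx_reflect_exception() explains, the practical difference between the
 * SW_EXCEPTION and HW_EXCEPTION delivery types is whether VM entry advances
 * the guest RIP by the supplied instruction length.  A hedged restatement
 * of that rule for the two vectors involved (hypothetical helper):
 */
#if 0 /* illustrative only -- not compiled */
static inline int example_injection_advances_rip(int vector)
{
    /* #BP (INT3, vector 3) and #OF (INTO, vector 4) are delivered as
     * software exceptions, so RIP is advanced past the instruction;
     * hardware exceptions leave RIP at the faulting instruction. */
    return (vector == 3) || (vector == 4);
}
#endif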
2110 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
2111 {
2112 unsigned int exit_reason;
2113 unsigned long exit_qualification, inst_len = 0;
2114 struct vcpu *v = current;
2116 __vmread(VM_EXIT_REASON, &exit_reason);
2118 perfc_incra(vmexits, exit_reason);
2120 if ( (exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT) &&
2121 (exit_reason != EXIT_REASON_VMCALL) &&
2122 (exit_reason != EXIT_REASON_IO_INSTRUCTION) )
2123 HVM_DBG_LOG(DBG_LEVEL_0, "exit reason = %x", exit_reason);
2125 if ( exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT )
2126 local_irq_enable();
2128 if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
2129 {
2130 unsigned int failed_vmentry_reason = exit_reason & 0xFFFF;
2132 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2133 printk("Failed vm entry (exit reason 0x%x) ", exit_reason);
2134 switch ( failed_vmentry_reason ) {
2135 case EXIT_REASON_INVALID_GUEST_STATE:
2136 printk("caused by invalid guest state (%ld).\n", exit_qualification);
2137 break;
2138 case EXIT_REASON_MSR_LOADING:
2139 printk("caused by MSR entry %ld loading.\n", exit_qualification);
2140 break;
2141 case EXIT_REASON_MACHINE_CHECK:
2142 printk("caused by machine check.\n");
2143 break;
2144 default:
2145 printk("reason not known yet!");
2146 break;
2149 printk("************* VMCS Area **************\n");
2150 vmcs_dump_vcpu();
2151 printk("**************************************\n");
2152 domain_crash_synchronous();
2153 }
2155 TRACE_VMEXIT(0, exit_reason);
2157 switch ( exit_reason )
2158 {
2159 case EXIT_REASON_EXCEPTION_NMI:
2160 {
2161 /*
2162 * We do not enable software-interrupt exiting (INT n), so this
2163 * exit means either (1) an exception (e.g. #PF) was raised in
2164 * the guest, or (2) an NMI was delivered.
2165 */
2166 unsigned int vector;
2168 if ( __vmread(VM_EXIT_INTR_INFO, &vector) ||
2169 !(vector & INTR_INFO_VALID_MASK) )
2170 domain_crash_synchronous();
2171 vector &= INTR_INFO_VECTOR_MASK;
2173 TRACE_VMEXIT(1, vector);
2174 perfc_incra(cause_vector, vector);
2176 switch ( vector ) {
2177 #ifdef XEN_DEBUGGER
2178 case TRAP_debug:
2179 {
2180 save_cpu_user_regs(regs);
2181 pdb_handle_exception(1, regs, 1);
2182 restore_cpu_user_regs(regs);
2183 break;
2184 }
2185 case TRAP_int3:
2186 {
2187 save_cpu_user_regs(regs);
2188 pdb_handle_exception(3, regs, 1);
2189 restore_cpu_user_regs(regs);
2190 break;
2191 }
2192 #else
2193 case TRAP_debug:
2194 {
2195 void store_cpu_user_regs(struct cpu_user_regs *regs);
2197 if ( test_bit(_DOMF_debugging, &v->domain->domain_flags) )
2198 {
2199 store_cpu_user_regs(regs);
2200 domain_pause_for_debugger();
2201 __vm_clear_bit(GUEST_PENDING_DBG_EXCEPTIONS,
2202 PENDING_DEBUG_EXC_BS);
2203 }
2204 else
2205 {
2206 vmx_reflect_exception(v);
2207 __vm_clear_bit(GUEST_PENDING_DBG_EXCEPTIONS,
2208 PENDING_DEBUG_EXC_BS);
2209 }
2211 break;
2212 }
2213 case TRAP_int3:
2214 {
2215 if ( test_bit(_DOMF_debugging, &v->domain->domain_flags) )
2216 domain_pause_for_debugger();
2217 else
2218 vmx_reflect_exception(v);
2219 break;
2220 }
2221 #endif
2222 case TRAP_no_device:
2223 {
2224 vmx_do_no_device_fault();
2225 break;
2226 }
2227 case TRAP_page_fault:
2228 {
2229 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2230 __vmread(VM_EXIT_INTR_ERROR_CODE, &regs->error_code);
2232 TRACE_VMEXIT(3, regs->error_code);
2233 TRACE_VMEXIT(4, exit_qualification);
2235 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2236 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
2237 (unsigned long)regs->eax, (unsigned long)regs->ebx,
2238 (unsigned long)regs->ecx, (unsigned long)regs->edx,
2239 (unsigned long)regs->esi, (unsigned long)regs->edi);
2241 if ( !vmx_do_page_fault(exit_qualification, regs) )
2242 {
2243 /* Inject #PF using Interruption-Information Fields. */
2244 vmx_inject_hw_exception(v, TRAP_page_fault, regs->error_code);
2245 v->arch.hvm_vmx.cpu_cr2 = exit_qualification;
2246 TRACE_3D(TRC_VMX_INTR, v->domain->domain_id,
2247 TRAP_page_fault, exit_qualification);
2248 }
2249 break;
2250 }
2251 case TRAP_nmi:
2252 do_nmi(regs);
2253 break;
2254 default:
2255 vmx_reflect_exception(v);
2256 break;
2257 }
2258 break;
2259 }
2260 case EXIT_REASON_EXTERNAL_INTERRUPT:
2261 vmx_do_extint(regs);
2262 break;
2263 case EXIT_REASON_TRIPLE_FAULT:
2264 domain_crash_synchronous();
2265 break;
2266 case EXIT_REASON_PENDING_INTERRUPT:
2267 /* Disable the interrupt window. */
2268 v->arch.hvm_vcpu.u.vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2269 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2270 v->arch.hvm_vcpu.u.vmx.exec_control);
2271 break;
2272 case EXIT_REASON_TASK_SWITCH:
2273 domain_crash_synchronous();
2274 break;
2275 case EXIT_REASON_CPUID:
2276 inst_len = __get_instruction_length(); /* Safe: CPUID */
2277 __update_guest_eip(inst_len);
2278 vmx_do_cpuid(regs);
2279 break;
2280 case EXIT_REASON_HLT:
2281 inst_len = __get_instruction_length(); /* Safe: HLT */
2282 __update_guest_eip(inst_len);
2283 vmx_do_hlt();
2284 break;
2285 case EXIT_REASON_INVLPG:
2286 {
2287 inst_len = __get_instruction_length(); /* Safe: INVLPG */
2288 __update_guest_eip(inst_len);
2289 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2290 vmx_do_invlpg(exit_qualification);
2291 TRACE_VMEXIT(4, exit_qualification);
2292 break;
2293 }
2294 case EXIT_REASON_VMCALL:
2295 {
2296 inst_len = __get_instruction_length(); /* Safe: VMCALL */
2297 __update_guest_eip(inst_len);
2298 hvm_do_hypercall(regs);
2299 break;
2300 }
2301 case EXIT_REASON_CR_ACCESS:
2302 {
2303 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2304 inst_len = __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
2305 if ( vmx_cr_access(exit_qualification, regs) )
2306 __update_guest_eip(inst_len);
2307 TRACE_VMEXIT(4, exit_qualification);
2308 break;
2309 }
2310 case EXIT_REASON_DR_ACCESS:
2311 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2312 vmx_dr_access(exit_qualification, regs);
2313 break;
2314 case EXIT_REASON_IO_INSTRUCTION:
2315 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2316 inst_len = __get_instruction_length(); /* Safe: IN, INS, OUT, OUTS */
2317 vmx_io_instruction(exit_qualification, inst_len);
2318 TRACE_VMEXIT(4, exit_qualification);
2319 break;
2320 case EXIT_REASON_MSR_READ:
2321 inst_len = __get_instruction_length(); /* Safe: RDMSR */
2322 __update_guest_eip(inst_len);
2323 vmx_do_msr_read(regs);
2324 TRACE_VMEXIT(1, regs->ecx);
2325 TRACE_VMEXIT(2, regs->eax);
2326 TRACE_VMEXIT(3, regs->edx);
2327 break;
2328 case EXIT_REASON_MSR_WRITE:
2329 inst_len = __get_instruction_length(); /* Safe: WRMSR */
2330 __update_guest_eip(inst_len);
2331 vmx_do_msr_write(regs);
2332 TRACE_VMEXIT(1, regs->ecx);
2333 TRACE_VMEXIT(2, regs->eax);
2334 TRACE_VMEXIT(3, regs->edx);
2335 break;
2336 case EXIT_REASON_MWAIT_INSTRUCTION:
2337 case EXIT_REASON_MONITOR_INSTRUCTION:
2338 case EXIT_REASON_PAUSE_INSTRUCTION:
2339 domain_crash_synchronous();
2340 break;
2341 case EXIT_REASON_VMCLEAR:
2342 case EXIT_REASON_VMLAUNCH:
2343 case EXIT_REASON_VMPTRLD:
2344 case EXIT_REASON_VMPTRST:
2345 case EXIT_REASON_VMREAD:
2346 case EXIT_REASON_VMRESUME:
2347 case EXIT_REASON_VMWRITE:
2348 case EXIT_REASON_VMXOFF:
2349 case EXIT_REASON_VMXON:
2350 /* Report invalid opcode exception when a VMX guest tries to execute
2351 any of the VMX instructions */
2352 vmx_inject_hw_exception(v, TRAP_invalid_op, VMX_DELIVER_NO_ERROR_CODE);
2353 break;
2355 case EXIT_REASON_TPR_BELOW_THRESHOLD:
2356 VLAPIC(v)->flush_tpr_threshold = 1;
2357 break;
2359 default:
2360 domain_crash_synchronous(); /* should not happen */
2361 }
2362 }
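/*
 * Editorial sketch, not part of the original file: the failed-VM-entry test
 * near the top of vmx_vmexit_handler() relies on the exit-reason encoding --
 * the basic reason sits in the low 16 bits and bit 31 flags a failed VM
 * entry (VMX_EXIT_REASONS_FAILED_VMENTRY).  A standalone restatement with
 * hypothetical names:
 */
#if 0 /* illustrative only -- not compiled */
static inline int example_is_failed_vmentry(unsigned int exit_reason)
{
    return (exit_reason & (1u << 31)) != 0; /* VM-entry failure flag */
}

static inline unsigned int example_basic_exit_reason(unsigned int exit_reason)
{
    return exit_reason & 0xffff;            /* basic exit reason */
}
#endif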
2364 asmlinkage void vmx_trace_vmentry(void)
2365 {
2366 TRACE_5D(TRC_VMX_VMENTRY + current->vcpu_id,
2367 this_cpu(trace_values)[0],
2368 this_cpu(trace_values)[1],
2369 this_cpu(trace_values)[2],
2370 this_cpu(trace_values)[3],
2371 this_cpu(trace_values)[4]);
2373 TRACE_VMEXIT(0, 0);
2374 TRACE_VMEXIT(1, 0);
2375 TRACE_VMEXIT(2, 0);
2376 TRACE_VMEXIT(3, 0);
2377 TRACE_VMEXIT(4, 0);
2378 }
2380 asmlinkage void vmx_trace_vmexit(void)
2381 {
2382 TRACE_3D(TRC_VMX_VMEXIT + current->vcpu_id, 0, 0, 0);
2383 }
2385 /*
2386 * Local variables:
2387 * mode: C
2388 * c-set-style: "BSD"
2389 * c-basic-offset: 4
2390 * tab-width: 4
2391 * indent-tabs-mode: nil
2392 * End:
2393 */