ia64/xen-unstable: xen/arch/x86/hvm/vmx/vmx.c @ 11222:a71c265924d2

[HVM][VMX] Don't update the EIP on debug register accesses

On debug register accesses, the EIP should not be updated.
Because of the way that lazy save/restore of the debug
registers is implemented, this initial debug register access
is lost.

Signed-off-by: Travis Betak <travis.betak@amd.com>
author:   kaf24@localhost.localdomain
date:     Sat Aug 19 12:07:07 2006 +0100 (2006-08-19)
parents:  043a4aa24781
children: fab84f9c0ce6
1 /*
2 * vmx.c: handling VMX architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 *
18 */
20 #include <xen/config.h>
21 #include <xen/init.h>
22 #include <xen/lib.h>
23 #include <xen/trace.h>
24 #include <xen/sched.h>
25 #include <xen/irq.h>
26 #include <xen/softirq.h>
27 #include <xen/domain_page.h>
28 #include <xen/hypercall.h>
29 #include <xen/perfc.h>
30 #include <asm/current.h>
31 #include <asm/io.h>
32 #include <asm/regs.h>
33 #include <asm/cpufeature.h>
34 #include <asm/processor.h>
35 #include <asm/types.h>
36 #include <asm/msr.h>
37 #include <asm/spinlock.h>
38 #include <asm/hvm/hvm.h>
39 #include <asm/hvm/support.h>
40 #include <asm/hvm/vmx/vmx.h>
41 #include <asm/hvm/vmx/vmcs.h>
42 #include <asm/hvm/vmx/cpu.h>
43 #include <asm/shadow2.h>
44 #include <public/sched.h>
45 #include <public/hvm/ioreq.h>
46 #include <asm/hvm/vpic.h>
47 #include <asm/hvm/vlapic.h>
49 static DEFINE_PER_CPU(unsigned long, trace_values[5]);
50 #define TRACE_VMEXIT(index,value) this_cpu(trace_values)[index]=value
52 static void vmx_ctxt_switch_from(struct vcpu *v);
53 static void vmx_ctxt_switch_to(struct vcpu *v);
55 static int vmx_initialize_guest_resources(struct vcpu *v)
56 {
57 struct domain *d = v->domain;
58 struct vcpu *vc;
59 void *io_bitmap_a, *io_bitmap_b;
60 int rc;
62 v->arch.schedule_tail = arch_vmx_do_launch;
63 v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
64 v->arch.ctxt_switch_to = vmx_ctxt_switch_to;
66 if ( v->vcpu_id != 0 )
67 return 1;
69 if ( !shadow2_mode_external(d) )
70 {
71 DPRINTK("Can't init HVM for dom %u vcpu %u: "
72 "not in shadow2 external mode\n",
73 d->domain_id, v->vcpu_id);
74 domain_crash(d);
75 }
77 for_each_vcpu ( d, vc )
78 {
79 memset(&vc->arch.hvm_vmx, 0, sizeof(struct arch_vmx_struct));
81 if ( (rc = vmx_create_vmcs(vc)) != 0 )
82 {
83 DPRINTK("Failed to create VMCS for vcpu %d: err=%d.\n",
84 vc->vcpu_id, rc);
85 return 0;
86 }
88 spin_lock_init(&vc->arch.hvm_vmx.vmcs_lock);
90 if ( (io_bitmap_a = alloc_xenheap_pages(IO_BITMAP_ORDER)) == NULL )
91 {
92 DPRINTK("Failed to allocate io bitmap a for vcpu %d.\n",
93 vc->vcpu_id);
94 return 0;
95 }
97 if ( (io_bitmap_b = alloc_xenheap_pages(IO_BITMAP_ORDER)) == NULL )
98 {
99 DPRINTK("Failed to allocate io bitmap b for vcpu %d.\n",
100 vc->vcpu_id);
101 return 0;
102 }
104 memset(io_bitmap_a, 0xff, 0x1000);
105 memset(io_bitmap_b, 0xff, 0x1000);
107 /* Don't intercept accesses to the debug port. */
108 clear_bit(PC_DEBUG_PORT, io_bitmap_a);
110 vc->arch.hvm_vmx.io_bitmap_a = io_bitmap_a;
111 vc->arch.hvm_vmx.io_bitmap_b = io_bitmap_b;
113 }
115 /*
116 * Required to do this once per domain. XXX todo: add a separate function
117 * to do these.
118 */
119 memset(&d->shared_info->evtchn_mask[0], 0xff,
120 sizeof(d->shared_info->evtchn_mask));
122 return 1;
123 }
125 static void vmx_relinquish_guest_resources(struct domain *d)
126 {
127 struct vcpu *v;
129 for_each_vcpu ( d, v )
130 {
131 vmx_destroy_vmcs(v);
132 if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
133 continue;
134 kill_timer(&v->arch.hvm_vcpu.hlt_timer);
135 if ( hvm_apic_support(v->domain) && (VLAPIC(v) != NULL) )
136 {
137 kill_timer(&VLAPIC(v)->vlapic_timer);
138 unmap_domain_page_global(VLAPIC(v)->regs);
139 free_domheap_page(VLAPIC(v)->regs_page);
140 xfree(VLAPIC(v));
141 }
142 hvm_release_assist_channel(v);
143 }
145 kill_timer(&d->arch.hvm_domain.pl_time.periodic_tm.timer);
147 if ( d->arch.hvm_domain.shared_page_va )
148 unmap_domain_page_global(
149 (void *)d->arch.hvm_domain.shared_page_va);
151 if ( d->arch.hvm_domain.buffered_io_va )
152 unmap_domain_page_global((void *)d->arch.hvm_domain.buffered_io_va);
153 }
155 #ifdef __x86_64__
157 static DEFINE_PER_CPU(struct vmx_msr_state, percpu_msr);
159 static u32 msr_data_index[VMX_MSR_COUNT] =
160 {
161 MSR_LSTAR, MSR_STAR, MSR_CSTAR,
162 MSR_SYSCALL_MASK, MSR_EFER,
163 };
165 static void vmx_save_segments(struct vcpu *v)
166 {
167 rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.msr_content.shadow_gs);
168 }
170 /*
171 * To avoid MSR save/restore at every VM exit/entry time, we restore
172 * the x86_64 specific MSRs at domain switch time. Since those MSRs
173 * are not modified once set for generic domains, we don't save them,
174 * but simply reset them to the values set at percpu_traps_init().
175 */
176 static void vmx_load_msrs(void)
177 {
178 struct vmx_msr_state *host_state = &this_cpu(percpu_msr);
179 int i;
181 while ( host_state->flags )
182 {
183 i = find_first_set_bit(host_state->flags);
184 wrmsrl(msr_data_index[i], host_state->msr_items[i]);
185 clear_bit(i, &host_state->flags);
186 }
187 }
189 static void vmx_save_init_msrs(void)
190 {
191 struct vmx_msr_state *host_state = &this_cpu(percpu_msr);
192 int i;
194 for ( i = 0; i < VMX_MSR_COUNT; i++ )
195 rdmsrl(msr_data_index[i], host_state->msr_items[i]);
196 }
198 #define CASE_READ_MSR(address) \
199 case MSR_ ## address: \
200 msr_content = msr->msr_items[VMX_INDEX_MSR_ ## address]; \
201 break
203 #define CASE_WRITE_MSR(address) \
204 case MSR_ ## address: \
205 { \
206 msr->msr_items[VMX_INDEX_MSR_ ## address] = msr_content; \
207 if (!test_bit(VMX_INDEX_MSR_ ## address, &msr->flags)) { \
208 set_bit(VMX_INDEX_MSR_ ## address, &msr->flags); \
209 } \
210 wrmsrl(MSR_ ## address, msr_content); \
211 set_bit(VMX_INDEX_MSR_ ## address, &host_state->flags); \
212 } \
213 break
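/* Canonical-address checking is not implemented yet: every address is accepted. */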
215 #define IS_CANO_ADDRESS(add) 1
216 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
217 {
218 u64 msr_content = 0;
219 struct vcpu *v = current;
220 struct vmx_msr_state *msr = &v->arch.hvm_vmx.msr_content;
222 switch ( regs->ecx ) {
223 case MSR_EFER:
224 HVM_DBG_LOG(DBG_LEVEL_2, "EFER msr_content 0x%"PRIx64, msr_content);
225 msr_content = msr->msr_items[VMX_INDEX_MSR_EFER];
227 /* the following code may not be needed */
228 if ( test_bit(VMX_CPU_STATE_LME_ENABLED, &v->arch.hvm_vmx.cpu_state) )
229 msr_content |= EFER_LME;
230 else
231 msr_content &= ~EFER_LME;
233 if ( VMX_LONG_GUEST(v) )
234 msr_content |= EFER_LMA;
235 else
236 msr_content &= ~EFER_LMA;
237 break;
239 case MSR_FS_BASE:
240 if ( !(VMX_LONG_GUEST(v)) )
241 /* XXX should this be a #GP fault? */
242 domain_crash_synchronous();
244 __vmread(GUEST_FS_BASE, &msr_content);
245 break;
247 case MSR_GS_BASE:
248 if ( !(VMX_LONG_GUEST(v)) )
249 domain_crash_synchronous();
251 __vmread(GUEST_GS_BASE, &msr_content);
252 break;
254 case MSR_SHADOW_GS_BASE:
255 msr_content = msr->shadow_gs;
256 break;
258 CASE_READ_MSR(STAR);
259 CASE_READ_MSR(LSTAR);
260 CASE_READ_MSR(CSTAR);
261 CASE_READ_MSR(SYSCALL_MASK);
263 default:
264 return 0;
265 }
267 HVM_DBG_LOG(DBG_LEVEL_2, "msr_content: 0x%"PRIx64, msr_content);
269 regs->eax = msr_content & 0xffffffff;
270 regs->edx = msr_content >> 32;
272 return 1;
273 }
275 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
276 {
277 u64 msr_content = regs->eax | ((u64)regs->edx << 32);
278 struct vcpu *v = current;
279 struct vmx_msr_state *msr = &v->arch.hvm_vmx.msr_content;
280 struct vmx_msr_state *host_state = &this_cpu(percpu_msr);
282 HVM_DBG_LOG(DBG_LEVEL_1, "msr 0x%lx msr_content 0x%"PRIx64"\n",
283 (unsigned long)regs->ecx, msr_content);
285 switch ( regs->ecx ) {
286 case MSR_EFER:
287 /* offending reserved bit will cause #GP */
288 if ( msr_content & ~(EFER_LME | EFER_LMA | EFER_NX | EFER_SCE) )
289 {
290 printk("trying to set reserved bit in EFER\n");
291 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
292 return 0;
293 }
295 /* LME: 0 -> 1 */
296 if ( msr_content & EFER_LME &&
297 !test_bit(VMX_CPU_STATE_LME_ENABLED, &v->arch.hvm_vmx.cpu_state) )
298 {
299 if ( vmx_paging_enabled(v) ||
300 !test_bit(VMX_CPU_STATE_PAE_ENABLED,
301 &v->arch.hvm_vmx.cpu_state) )
302 {
303 printk("trying to set LME bit when "
304 "in paging mode or PAE bit is not set\n");
305 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
306 return 0;
307 }
309 set_bit(VMX_CPU_STATE_LME_ENABLED, &v->arch.hvm_vmx.cpu_state);
310 }
312 msr->msr_items[VMX_INDEX_MSR_EFER] = msr_content;
313 break;
315 case MSR_FS_BASE:
316 case MSR_GS_BASE:
317 if ( !(VMX_LONG_GUEST(v)) )
318 domain_crash_synchronous();
320 if ( !IS_CANO_ADDRESS(msr_content) )
321 {
322 HVM_DBG_LOG(DBG_LEVEL_1, "Non-canonical address in MSR write\n");
323 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
324 return 0;
325 }
327 if ( regs->ecx == MSR_FS_BASE )
328 __vmwrite(GUEST_FS_BASE, msr_content);
329 else
330 __vmwrite(GUEST_GS_BASE, msr_content);
332 break;
334 case MSR_SHADOW_GS_BASE:
335 if ( !(VMX_LONG_GUEST(v)) )
336 domain_crash_synchronous();
338 v->arch.hvm_vmx.msr_content.shadow_gs = msr_content;
339 wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
340 break;
342 CASE_WRITE_MSR(STAR);
343 CASE_WRITE_MSR(LSTAR);
344 CASE_WRITE_MSR(CSTAR);
345 CASE_WRITE_MSR(SYSCALL_MASK);
347 default:
348 return 0;
349 }
351 return 1;
352 }
354 static void vmx_restore_msrs(struct vcpu *v)
355 {
356 int i = 0;
357 struct vmx_msr_state *guest_state;
358 struct vmx_msr_state *host_state;
359 unsigned long guest_flags;
361 guest_state = &v->arch.hvm_vmx.msr_content;
362 host_state = &this_cpu(percpu_msr);
364 wrmsrl(MSR_SHADOW_GS_BASE, guest_state->shadow_gs);
365 guest_flags = guest_state->flags;
366 if (!guest_flags)
367 return;
369 while (guest_flags){
370 i = find_first_set_bit(guest_flags);
372 HVM_DBG_LOG(DBG_LEVEL_2,
373 "restore guest's index %d msr %lx with %lx\n",
374 i, (unsigned long)msr_data_index[i],
375 (unsigned long)guest_state->msr_items[i]);
376 set_bit(i, &host_state->flags);
377 wrmsrl(msr_data_index[i], guest_state->msr_items[i]);
378 clear_bit(i, &guest_flags);
379 }
380 }
382 #else /* __i386__ */
384 #define vmx_save_segments(v) ((void)0)
385 #define vmx_load_msrs() ((void)0)
386 #define vmx_restore_msrs(v) ((void)0)
387 #define vmx_save_init_msrs() ((void)0)
389 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
390 {
391 return 0;
392 }
394 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
395 {
396 return 0;
397 }
399 #endif /* __i386__ */
401 #define loaddebug(_v,_reg) \
402 __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
403 #define savedebug(_v,_reg) \
404 __asm__ __volatile__ ("mov %%db" #_reg ",%0" : "=r" ((_v)->debugreg[_reg]))
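/*
 * Debug registers are handled lazily. While the guest has not touched
 * them, MOV-DR exiting stays enabled and the registers are left alone.
 * Once the guest does touch them (see vmx_dr_access() below), the
 * intercept is turned off and flag_dr_dirty is set, so vmx_save_dr()
 * must save the registers and re-arm the intercept when the vcpu is
 * switched out.
 */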
406 static inline void vmx_save_dr(struct vcpu *v)
407 {
408 if ( v->arch.hvm_vcpu.flag_dr_dirty )
409 {
410 savedebug(&v->arch.guest_context, 0);
411 savedebug(&v->arch.guest_context, 1);
412 savedebug(&v->arch.guest_context, 2);
413 savedebug(&v->arch.guest_context, 3);
414 savedebug(&v->arch.guest_context, 6);
416 v->arch.hvm_vcpu.flag_dr_dirty = 0;
418 v->arch.hvm_vcpu.u.vmx.exec_control |= CPU_BASED_MOV_DR_EXITING;
419 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
420 v->arch.hvm_vcpu.u.vmx.exec_control);
421 }
422 }
424 static inline void __restore_debug_registers(struct vcpu *v)
425 {
426 loaddebug(&v->arch.guest_context, 0);
427 loaddebug(&v->arch.guest_context, 1);
428 loaddebug(&v->arch.guest_context, 2);
429 loaddebug(&v->arch.guest_context, 3);
430 /* No 4 and 5 */
431 loaddebug(&v->arch.guest_context, 6);
432 /* DR7 is loaded from the vmcs. */
433 }
435 /*
436 * DR7 is saved and restored on every vmexit. Other debug registers only
437 * need to be restored if their value is going to affect execution -- i.e.,
438 * if one of the breakpoints is enabled. So mask out all bits that don't
439 * enable some breakpoint functionality.
440 *
441 * This is in part necessary because bit 10 of DR7 is hardwired to 1, so a
442 * simple if( guest_dr7 ) will always return true. As long as we're masking,
443 * we might as well do it right.
444 */
445 #define DR7_ACTIVE_MASK 0xff
447 static inline void vmx_restore_dr(struct vcpu *v)
448 {
449 unsigned long guest_dr7;
451 __vmread(GUEST_DR7, &guest_dr7);
453 /* Assumes guest does not have DR access at time of context switch. */
454 if ( unlikely(guest_dr7 & DR7_ACTIVE_MASK) )
455 __restore_debug_registers(v);
456 }
458 static void vmx_freeze_time(struct vcpu *v)
459 {
460 struct periodic_time *pt=&v->domain->arch.hvm_domain.pl_time.periodic_tm;
462 if ( pt->enabled && pt->first_injected && !v->arch.hvm_vcpu.guest_time ) {
463 v->arch.hvm_vcpu.guest_time = hvm_get_guest_time(v);
464 stop_timer(&(pt->timer));
465 }
466 }
468 static void vmx_ctxt_switch_from(struct vcpu *v)
469 {
470 vmx_freeze_time(v);
471 vmx_save_segments(v);
472 vmx_load_msrs();
473 vmx_save_dr(v);
474 }
476 static void vmx_ctxt_switch_to(struct vcpu *v)
477 {
478 vmx_restore_msrs(v);
479 vmx_restore_dr(v);
480 }
482 static void stop_vmx(void)
483 {
484 if (read_cr4() & X86_CR4_VMXE)
485 __vmxoff();
486 }
488 void vmx_migrate_timers(struct vcpu *v)
489 {
490 struct periodic_time *pt = &(v->domain->arch.hvm_domain.pl_time.periodic_tm);
492 if ( pt->enabled ) {
493 migrate_timer(&pt->timer, v->processor);
494 migrate_timer(&v->arch.hvm_vcpu.hlt_timer, v->processor);
495 }
496 if ( hvm_apic_support(v->domain) && VLAPIC(v))
497 migrate_timer(&(VLAPIC(v)->vlapic_timer), v->processor);
498 }
500 static void vmx_store_cpu_guest_regs(
501 struct vcpu *v, struct cpu_user_regs *regs, unsigned long *crs)
502 {
503 vmx_vmcs_enter(v);
505 if ( regs != NULL )
506 {
507 __vmread(GUEST_RFLAGS, &regs->eflags);
508 __vmread(GUEST_SS_SELECTOR, &regs->ss);
509 __vmread(GUEST_CS_SELECTOR, &regs->cs);
510 __vmread(GUEST_DS_SELECTOR, &regs->ds);
511 __vmread(GUEST_ES_SELECTOR, &regs->es);
512 __vmread(GUEST_GS_SELECTOR, &regs->gs);
513 __vmread(GUEST_FS_SELECTOR, &regs->fs);
514 __vmread(GUEST_RIP, &regs->eip);
515 __vmread(GUEST_RSP, &regs->esp);
516 }
518 if ( crs != NULL )
519 {
520 __vmread(CR0_READ_SHADOW, &crs[0]);
521 __vmread(GUEST_CR3, &crs[3]);
522 __vmread(CR4_READ_SHADOW, &crs[4]);
523 }
525 vmx_vmcs_exit(v);
526 }
528 /*
529 * The VMX spec (section 4.3.1.2, Checks on Guest Segment
530 * Registers) says that virtual-8086 mode guests' segment
531 * base-address fields in the VMCS must be equal to their
532 * corresponding segment selector field shifted left by
533 * four bits upon vmentry.
534 *
535 * This function (called only for VM86-mode guests) fixes
536 * the bases to be consistent with the selectors in regs
537 * if they're not already. Without this, we can fail the
538 * vmentry check mentioned above.
539 */
540 static void fixup_vm86_seg_bases(struct cpu_user_regs *regs)
541 {
542 int err = 0;
543 unsigned long base;
545 err |= __vmread(GUEST_ES_BASE, &base);
546 if (regs->es << 4 != base)
547 err |= __vmwrite(GUEST_ES_BASE, regs->es << 4);
548 err |= __vmread(GUEST_CS_BASE, &base);
549 if (regs->cs << 4 != base)
550 err |= __vmwrite(GUEST_CS_BASE, regs->cs << 4);
551 err |= __vmread(GUEST_SS_BASE, &base);
552 if (regs->ss << 4 != base)
553 err |= __vmwrite(GUEST_SS_BASE, regs->ss << 4);
554 err |= __vmread(GUEST_DS_BASE, &base);
555 if (regs->ds << 4 != base)
556 err |= __vmwrite(GUEST_DS_BASE, regs->ds << 4);
557 err |= __vmread(GUEST_FS_BASE, &base);
558 if (regs->fs << 4 != base)
559 err |= __vmwrite(GUEST_FS_BASE, regs->fs << 4);
560 err |= __vmread(GUEST_GS_BASE, &base);
561 if (regs->gs << 4 != base)
562 err |= __vmwrite(GUEST_GS_BASE, regs->gs << 4);
564 BUG_ON(err);
565 }
567 static void vmx_load_cpu_guest_regs(struct vcpu *v, struct cpu_user_regs *regs)
568 {
569 vmx_vmcs_enter(v);
571 __vmwrite(GUEST_SS_SELECTOR, regs->ss);
572 __vmwrite(GUEST_DS_SELECTOR, regs->ds);
573 __vmwrite(GUEST_ES_SELECTOR, regs->es);
574 __vmwrite(GUEST_GS_SELECTOR, regs->gs);
575 __vmwrite(GUEST_FS_SELECTOR, regs->fs);
577 __vmwrite(GUEST_RSP, regs->esp);
579 __vmwrite(GUEST_RFLAGS, regs->eflags);
580 if (regs->eflags & EF_TF)
581 __vm_set_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
582 else
583 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
584 if (regs->eflags & EF_VM)
585 fixup_vm86_seg_bases(regs);
587 __vmwrite(GUEST_CS_SELECTOR, regs->cs);
588 __vmwrite(GUEST_RIP, regs->eip);
590 vmx_vmcs_exit(v);
591 }
593 static int vmx_instruction_length(struct vcpu *v)
594 {
595 unsigned long inst_len;
597 if (__vmread(VM_EXIT_INSTRUCTION_LEN, &inst_len))
598 return 0;
599 return inst_len;
600 }
602 static unsigned long vmx_get_ctrl_reg(struct vcpu *v, unsigned int num)
603 {
604 switch ( num )
605 {
606 case 0:
607 return v->arch.hvm_vmx.cpu_cr0;
608 case 2:
609 return v->arch.hvm_vmx.cpu_cr2;
610 case 3:
611 return v->arch.hvm_vmx.cpu_cr3;
612 case 4:
613 return v->arch.hvm_vmx.cpu_shadow_cr4;
614 default:
615 BUG();
616 }
617 return 0; /* dummy */
618 }
622 /* Make sure that xen intercepts any FP accesses from current */
623 static void vmx_stts(struct vcpu *v)
624 {
625 unsigned long cr0;
627 /* VMX depends on operating on the current vcpu */
628 ASSERT(v == current);
630 /*
631 * If the guest does not have TS enabled then we must cause and handle an
632 * exception on first use of the FPU. If the guest *does* have TS enabled
633 * then this is not necessary: no FPU activity can occur until the guest
634 * clears CR0.TS, and we will initialise the FPU when that happens.
635 */
636 __vmread_vcpu(v, CR0_READ_SHADOW, &cr0);
637 if ( !(cr0 & X86_CR0_TS) )
638 {
639 __vmread_vcpu(v, GUEST_CR0, &cr0);
640 __vmwrite(GUEST_CR0, cr0 | X86_CR0_TS);
641 __vm_set_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
642 }
643 }
646 static void vmx_set_tsc_offset(struct vcpu *v, u64 offset)
647 {
648 /* VMX depends on operating on the current vcpu */
649 ASSERT(v == current);
651 __vmwrite(TSC_OFFSET, offset);
652 #if defined (__i386__)
653 __vmwrite(TSC_OFFSET_HIGH, offset >> 32);
654 #endif
655 }
659 /* SMP VMX guest support */
660 static void vmx_init_ap_context(struct vcpu_guest_context *ctxt,
661 int vcpuid, int trampoline_vector)
662 {
663 int i;
665 memset(ctxt, 0, sizeof(*ctxt));
667 /*
668 * Initial register values:
669 */
670 ctxt->user_regs.eip = VMXASSIST_BASE;
671 ctxt->user_regs.edx = vcpuid;
672 ctxt->user_regs.ebx = trampoline_vector;
674 ctxt->flags = VGCF_HVM_GUEST;
676 /* Virtual IDT is empty at start-of-day. */
677 for ( i = 0; i < 256; i++ )
678 {
679 ctxt->trap_ctxt[i].vector = i;
680 ctxt->trap_ctxt[i].cs = FLAT_KERNEL_CS;
681 }
683 /* No callback handlers. */
684 #if defined(__i386__)
685 ctxt->event_callback_cs = FLAT_KERNEL_CS;
686 ctxt->failsafe_callback_cs = FLAT_KERNEL_CS;
687 #endif
688 }
690 void do_nmi(struct cpu_user_regs *);
692 static int check_vmx_controls(u32 ctrls, u32 msr)
693 {
694 u32 vmx_msr_low, vmx_msr_high;
696 rdmsr(msr, vmx_msr_low, vmx_msr_high);
697 if ( (ctrls < vmx_msr_low) || (ctrls > vmx_msr_high) )
698 {
699 printk("Insufficient VMX capability 0x%x, "
700 "msr=0x%x,low=0x%8x,high=0x%x\n",
701 ctrls, msr, vmx_msr_low, vmx_msr_high);
702 return 0;
703 }
704 return 1;
705 }
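/*
 * Each hypercall gets a 32-byte stub in the transfer page:
 *   mov $<hypercall-number>,%eax ; vmcall ; ret
 */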
707 static void vmx_init_hypercall_page(struct domain *d, void *hypercall_page)
708 {
709 char *p;
710 int i;
712 memset(hypercall_page, 0, PAGE_SIZE);
714 for ( i = 0; i < (PAGE_SIZE / 32); i++ )
715 {
716 p = (char *)(hypercall_page + (i * 32));
717 *(u8 *)(p + 0) = 0xb8; /* mov imm32, %eax */
718 *(u32 *)(p + 1) = i;
719 *(u8 *)(p + 5) = 0x0f; /* vmcall */
720 *(u8 *)(p + 6) = 0x01;
721 *(u8 *)(p + 7) = 0xc1;
722 *(u8 *)(p + 8) = 0xc3; /* ret */
723 }
725 /* Don't support HYPERVISOR_iret at the moment */
726 *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
727 }
729 /* Setup HVM interfaces */
730 static void vmx_setup_hvm_funcs(void)
731 {
732 if ( hvm_enabled )
733 return;
735 hvm_funcs.disable = stop_vmx;
737 hvm_funcs.initialize_guest_resources = vmx_initialize_guest_resources;
738 hvm_funcs.relinquish_guest_resources = vmx_relinquish_guest_resources;
740 hvm_funcs.store_cpu_guest_regs = vmx_store_cpu_guest_regs;
741 hvm_funcs.load_cpu_guest_regs = vmx_load_cpu_guest_regs;
743 hvm_funcs.realmode = vmx_realmode;
744 hvm_funcs.paging_enabled = vmx_paging_enabled;
745 hvm_funcs.long_mode_enabled = vmx_long_mode_enabled;
746 hvm_funcs.guest_x86_mode = vmx_guest_x86_mode;
747 hvm_funcs.instruction_length = vmx_instruction_length;
748 hvm_funcs.get_guest_ctrl_reg = vmx_get_ctrl_reg;
750 hvm_funcs.update_host_cr3 = vmx_update_host_cr3;
752 hvm_funcs.stts = vmx_stts;
753 hvm_funcs.set_tsc_offset = vmx_set_tsc_offset;
755 hvm_funcs.init_ap_context = vmx_init_ap_context;
757 hvm_funcs.init_hypercall_page = vmx_init_hypercall_page;
758 }
760 int start_vmx(void)
761 {
762 u32 eax, edx;
763 struct vmcs_struct *vmcs;
765 /*
766 * Xen does not fill x86_capability words except 0.
767 */
768 boot_cpu_data.x86_capability[4] = cpuid_ecx(1);
770 if (!(test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability)))
771 return 0;
773 rdmsr(IA32_FEATURE_CONTROL_MSR, eax, edx);
775 if ( eax & IA32_FEATURE_CONTROL_MSR_LOCK )
776 {
777 if ( (eax & IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON) == 0x0 )
778 {
779 printk("VMX disabled by Feature Control MSR.\n");
780 return 0;
781 }
782 }
783 else
784 {
785 wrmsr(IA32_FEATURE_CONTROL_MSR,
786 IA32_FEATURE_CONTROL_MSR_LOCK |
787 IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON, 0);
788 }
790 if ( !check_vmx_controls(MONITOR_PIN_BASED_EXEC_CONTROLS,
791 MSR_IA32_VMX_PINBASED_CTLS_MSR) )
792 return 0;
793 if ( !check_vmx_controls(MONITOR_CPU_BASED_EXEC_CONTROLS,
794 MSR_IA32_VMX_PROCBASED_CTLS_MSR) )
795 return 0;
796 if ( !check_vmx_controls(MONITOR_VM_EXIT_CONTROLS,
797 MSR_IA32_VMX_EXIT_CTLS_MSR) )
798 return 0;
799 if ( !check_vmx_controls(MONITOR_VM_ENTRY_CONTROLS,
800 MSR_IA32_VMX_ENTRY_CTLS_MSR) )
801 return 0;
803 set_in_cr4(X86_CR4_VMXE);
805 vmx_init_vmcs_config();
807 if(!smp_processor_id())
808 setup_vmcs_dump();
810 if ( (vmcs = vmx_alloc_host_vmcs()) == NULL )
811 {
812 printk("Failed to allocate host VMCS\n");
813 return 0;
814 }
816 if ( __vmxon(virt_to_maddr(vmcs)) )
817 {
818 printk("VMXON failed\n");
819 vmx_free_host_vmcs(vmcs);
820 return 0;
821 }
823 printk("VMXON is done\n");
825 vmx_save_init_msrs();
827 vmx_setup_hvm_funcs();
829 hvm_enabled = 1;
831 return 1;
832 }
834 /*
835 * Not all cases receive valid value in the VM-exit instruction length field.
836 */
837 #define __get_instruction_length(len) \
838 __vmread(VM_EXIT_INSTRUCTION_LEN, &(len)); \
839 if ((len) < 1 || (len) > 15) \
840 __hvm_bug(&regs);
842 static void inline __update_guest_eip(unsigned long inst_len)
843 {
844 unsigned long current_eip;
846 __vmread(GUEST_RIP, &current_eip);
847 __vmwrite(GUEST_RIP, current_eip + inst_len);
848 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
849 }
851 static int vmx_do_page_fault(unsigned long va, struct cpu_user_regs *regs)
852 {
853 int result;
855 #if 0 /* keep for debugging */
856 {
857 unsigned long eip, cs;
859 __vmread(GUEST_CS_BASE, &cs);
860 __vmread(GUEST_RIP, &eip);
861 HVM_DBG_LOG(DBG_LEVEL_VMMU,
862 "vmx_do_page_fault = 0x%lx, cs_base=%lx, "
863 "eip = %lx, error_code = %lx\n",
864 va, cs, eip, (unsigned long)regs->error_code);
865 }
866 #endif
868 result = shadow2_fault(va, regs);
870 TRACE_VMEXIT (2,result);
871 #if 0
872 if ( !result )
873 {
874 __vmread(GUEST_RIP, &eip);
875 printk("vmx pgfault to guest va=%lx eip=%lx\n", va, eip);
876 }
877 #endif
879 return result;
880 }
882 static void vmx_do_no_device_fault(void)
883 {
884 unsigned long cr0;
885 struct vcpu *v = current;
887 setup_fpu(current);
888 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
890 /* Disable TS in guest CR0 unless the guest wants the exception too. */
891 __vmread_vcpu(v, CR0_READ_SHADOW, &cr0);
892 if ( !(cr0 & X86_CR0_TS) )
893 {
894 __vmread_vcpu(v, GUEST_CR0, &cr0);
895 cr0 &= ~X86_CR0_TS;
896 __vmwrite(GUEST_CR0, cr0);
897 }
898 }
900 #define bitmaskof(idx) (1U << ((idx)&31))
901 static void vmx_vmexit_do_cpuid(struct cpu_user_regs *regs)
902 {
903 unsigned int input = (unsigned int)regs->eax;
904 unsigned int count = (unsigned int)regs->ecx;
905 unsigned int eax, ebx, ecx, edx;
906 unsigned long eip;
907 struct vcpu *v = current;
909 __vmread(GUEST_RIP, &eip);
911 HVM_DBG_LOG(DBG_LEVEL_3, "(eax) 0x%08lx, (ebx) 0x%08lx, "
912 "(ecx) 0x%08lx, (edx) 0x%08lx, (esi) 0x%08lx, (edi) 0x%08lx",
913 (unsigned long)regs->eax, (unsigned long)regs->ebx,
914 (unsigned long)regs->ecx, (unsigned long)regs->edx,
915 (unsigned long)regs->esi, (unsigned long)regs->edi);
917 if ( input == CPUID_LEAF_0x4 )
918 {
919 cpuid_count(input, count, &eax, &ebx, &ecx, &edx);
920 eax &= NUM_CORES_RESET_MASK;
921 }
922 else if ( !cpuid_hypervisor_leaves(input, &eax, &ebx, &ecx, &edx) )
923 {
924 cpuid(input, &eax, &ebx, &ecx, &edx);
926 if ( input == CPUID_LEAF_0x1 )
927 {
928 /* mask off reserved bits */
929 ecx &= ~VMX_VCPU_CPUID_L1_ECX_RESERVED;
931 if ( !hvm_apic_support(v->domain) ||
932 !vlapic_global_enabled((VLAPIC(v))) )
933 {
934 /* Since the apic is disabled, avoid any
935 confusion about SMP cpus being available */
937 clear_bit(X86_FEATURE_APIC, &edx);
938 }
940 #if CONFIG_PAGING_LEVELS >= 3
941 if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
942 #endif
943 clear_bit(X86_FEATURE_PAE, &edx);
944 clear_bit(X86_FEATURE_PSE36, &edx);
946 ebx &= NUM_THREADS_RESET_MASK;
948 /* Unsupportable for virtualised CPUs. */
949 ecx &= ~(bitmaskof(X86_FEATURE_VMXE) |
950 bitmaskof(X86_FEATURE_EST) |
951 bitmaskof(X86_FEATURE_TM2) |
952 bitmaskof(X86_FEATURE_CID) |
953 bitmaskof(X86_FEATURE_MWAIT) );
955 edx &= ~( bitmaskof(X86_FEATURE_HT) |
956 bitmaskof(X86_FEATURE_MCA) |
957 bitmaskof(X86_FEATURE_MCE) |
958 bitmaskof(X86_FEATURE_ACPI) |
959 bitmaskof(X86_FEATURE_ACC) );
960 }
961 else if ( ( input == CPUID_LEAF_0x6 )
962 || ( input == CPUID_LEAF_0x9 )
963 || ( input == CPUID_LEAF_0xA ))
964 {
965 eax = ebx = ecx = edx = 0x0;
966 }
967 #ifdef __i386__
968 else if ( input == CPUID_LEAF_0x80000001 )
969 {
970 clear_bit(X86_FEATURE_LAHF_LM & 31, &ecx);
972 clear_bit(X86_FEATURE_LM & 31, &edx);
973 clear_bit(X86_FEATURE_SYSCALL & 31, &edx);
974 }
975 #endif
976 }
978 regs->eax = (unsigned long) eax;
979 regs->ebx = (unsigned long) ebx;
980 regs->ecx = (unsigned long) ecx;
981 regs->edx = (unsigned long) edx;
983 HVM_DBG_LOG(DBG_LEVEL_3, "eip@%lx, input: 0x%lx, "
984 "output: eax = 0x%08lx, ebx = 0x%08lx, "
985 "ecx = 0x%08lx, edx = 0x%08lx",
986 (unsigned long)eip, (unsigned long)input,
987 (unsigned long)eax, (unsigned long)ebx,
988 (unsigned long)ecx, (unsigned long)edx);
989 }
991 #define CASE_GET_REG_P(REG, reg) \
992 case REG_ ## REG: reg_p = (unsigned long *)&(regs->reg); break
994 #ifdef __i386__
995 #define CASE_EXTEND_GET_REG_P
996 #else
997 #define CASE_EXTEND_GET_REG_P \
998 CASE_GET_REG_P(R8, r8); \
999 CASE_GET_REG_P(R9, r9); \
1000 CASE_GET_REG_P(R10, r10); \
1001 CASE_GET_REG_P(R11, r11); \
1002 CASE_GET_REG_P(R12, r12); \
1003 CASE_GET_REG_P(R13, r13); \
1004 CASE_GET_REG_P(R14, r14); \
1005 CASE_GET_REG_P(R15, r15)
1006 #endif
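/*
 * The guest touched a debug register while MOV-DR exiting was enabled.
 * Restore the real debug registers, mark them dirty, and turn the
 * intercept off so later accesses go straight to hardware. The EIP is
 * deliberately not advanced: the guest re-executes the instruction,
 * this time without a vmexit, so the initial access is not lost.
 */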
1008 static void vmx_dr_access(unsigned long exit_qualification,
1009 struct cpu_user_regs *regs)
1010 {
1011 struct vcpu *v = current;
1013 v->arch.hvm_vcpu.flag_dr_dirty = 1;
1015 /* We could probably be smarter about this */
1016 __restore_debug_registers(v);
1018 /* Allow guest direct access to DR registers */
1019 v->arch.hvm_vcpu.u.vmx.exec_control &= ~CPU_BASED_MOV_DR_EXITING;
1020 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
1021 v->arch.hvm_vcpu.u.vmx.exec_control);
1022 }
1024 /*
1025 * Invalidate the TLB entry for va, and invalidate the shadow page
1026 * corresponding to the address va.
1027 */
1028 static void vmx_vmexit_do_invlpg(unsigned long va)
1029 {
1030 unsigned long eip;
1031 struct vcpu *v = current;
1033 __vmread(GUEST_RIP, &eip);
1035 HVM_DBG_LOG(DBG_LEVEL_VMMU, "vmx_vmexit_do_invlpg: eip=%lx, va=%lx",
1036 eip, va);
1038 /*
1039 * We do the safest thing first, then try to update the shadow
1040 * by copying from the guest.
1041 */
1042 shadow2_invlpg(v, va);
1043 }
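/*
 * For a string I/O instruction, fetch its prefix bytes to work out which
 * segment register it uses, and return 1 if that selector is null (in
 * which case the guest-linear address reported in the VMCS is unusable).
 */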
1046 static int check_for_null_selector(unsigned long eip)
1047 {
1048 unsigned char inst[MAX_INST_LEN];
1049 unsigned long sel;
1050 int i, inst_len;
1051 int inst_copy_from_guest(unsigned char *, unsigned long, int);
1053 __vmread(VM_EXIT_INSTRUCTION_LEN, &inst_len);
1054 memset(inst, 0, MAX_INST_LEN);
1055 if (inst_copy_from_guest(inst, eip, inst_len) != inst_len) {
1056 printf("check_for_null_selector: get guest instruction failed\n");
1057 domain_crash_synchronous();
1058 }
1060 for (i = 0; i < inst_len; i++) {
1061 switch (inst[i]) {
1062 case 0xf3: /* REPZ */
1063 case 0xf2: /* REPNZ */
1064 case 0xf0: /* LOCK */
1065 case 0x66: /* data32 */
1066 case 0x67: /* addr32 */
1067 continue;
1068 case 0x2e: /* CS */
1069 __vmread(GUEST_CS_SELECTOR, &sel);
1070 break;
1071 case 0x36: /* SS */
1072 __vmread(GUEST_SS_SELECTOR, &sel);
1073 break;
1074 case 0x26: /* ES */
1075 __vmread(GUEST_ES_SELECTOR, &sel);
1076 break;
1077 case 0x64: /* FS */
1078 __vmread(GUEST_FS_SELECTOR, &sel);
1079 break;
1080 case 0x65: /* GS */
1081 __vmread(GUEST_GS_SELECTOR, &sel);
1082 break;
1083 case 0x3e: /* DS */
1084 /* FALLTHROUGH */
1085 default:
1086 /* DS is the default */
1087 __vmread(GUEST_DS_SELECTOR, &sel);
1088 }
1089 return sel == 0 ? 1 : 0;
1090 }
1092 return 0;
1093 }
1095 extern void send_pio_req(struct cpu_user_regs *regs, unsigned long port,
1096 unsigned long count, int size, long value,
1097 int dir, int pvalid);
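/*
 * Handle an I/O-instruction vmexit: decode port, size, direction and any
 * REP/string semantics from the exit qualification, then forward the
 * request to the device model via send_pio_req().
 */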
1099 static void vmx_io_instruction(unsigned long exit_qualification,
1100 unsigned long inst_len)
1101 {
1102 struct cpu_user_regs *regs;
1103 struct hvm_io_op *pio_opp;
1104 unsigned long eip, cs, eflags;
1105 unsigned long port, size, dir;
1106 int vm86;
1108 pio_opp = &current->arch.hvm_vcpu.io_op;
1109 pio_opp->instr = INSTR_PIO;
1110 pio_opp->flags = 0;
1112 regs = &pio_opp->io_context;
1114 /* Copy current guest state into io instruction state structure. */
1115 memcpy(regs, guest_cpu_user_regs(), HVM_CONTEXT_STACK_BYTES);
1116 hvm_store_cpu_guest_regs(current, regs, NULL);
1118 eip = regs->eip;
1119 cs = regs->cs;
1120 eflags = regs->eflags;
1122 vm86 = eflags & X86_EFLAGS_VM ? 1 : 0;
1124 HVM_DBG_LOG(DBG_LEVEL_IO,
1125 "vmx_io_instruction: vm86 %d, eip=%lx:%lx, "
1126 "exit_qualification = %lx",
1127 vm86, cs, eip, exit_qualification);
1129 if (test_bit(6, &exit_qualification))
1130 port = (exit_qualification >> 16) & 0xFFFF;
1131 else
1132 port = regs->edx & 0xffff;
1133 TRACE_VMEXIT(1, port);
1134 size = (exit_qualification & 7) + 1;
1135 dir = test_bit(3, &exit_qualification); /* direction */
1137 if (test_bit(4, &exit_qualification)) { /* string instruction */
1138 unsigned long addr, count = 1;
1139 int sign = regs->eflags & EF_DF ? -1 : 1;
1141 __vmread(GUEST_LINEAR_ADDRESS, &addr);
1143 /*
1144 * In protected mode, guest linear address is invalid if the
1145 * selector is null.
1146 */
1147 if (!vm86 && check_for_null_selector(eip))
1148 addr = dir == IOREQ_WRITE ? regs->esi : regs->edi;
1150 if (test_bit(5, &exit_qualification)) { /* "rep" prefix */
1151 pio_opp->flags |= REPZ;
1152 count = vm86 ? regs->ecx & 0xFFFF : regs->ecx;
1153 }
1155 /*
1156 * Handle string pio instructions that cross pages or that
1157 * are unaligned. See the comments in hvm_domain.c/handle_mmio()
1158 */
1159 if ((addr & PAGE_MASK) != ((addr + size - 1) & PAGE_MASK)) {
1160 unsigned long value = 0;
1162 pio_opp->flags |= OVERLAP;
1163 if (dir == IOREQ_WRITE)
1164 hvm_copy(&value, addr, size, HVM_COPY_IN);
1165 send_pio_req(regs, port, 1, size, value, dir, 0);
1166 } else {
1167 if ((addr & PAGE_MASK) != ((addr + count * size - 1) & PAGE_MASK)) {
1168 if (sign > 0)
1169 count = (PAGE_SIZE - (addr & ~PAGE_MASK)) / size;
1170 else
1171 count = (addr & ~PAGE_MASK) / size;
1172 } else
1173 regs->eip += inst_len;
1175 send_pio_req(regs, port, count, size, addr, dir, 1);
1176 }
1177 } else {
1178 if (port == 0xe9 && dir == IOREQ_WRITE && size == 1)
1179 hvm_print_line(current, regs->eax); /* guest debug output */
1181 regs->eip += inst_len;
1182 send_pio_req(regs, port, 1, size, regs->eax, dir, 0);
1183 }
1184 }
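/*
 * Save the guest's control, segment and descriptor-table state (plus
 * EIP/ESP/EFLAGS) into a vmx_assist_context, ready for a world switch
 * to vmxassist.
 */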
1186 int
1187 vmx_world_save(struct vcpu *v, struct vmx_assist_context *c)
1189 unsigned long inst_len;
1190 int error = 0;
1192 error |= __vmread(VM_EXIT_INSTRUCTION_LEN, &inst_len);
1193 error |= __vmread(GUEST_RIP, &c->eip);
1194 c->eip += inst_len; /* skip transition instruction */
1195 error |= __vmread(GUEST_RSP, &c->esp);
1196 error |= __vmread(GUEST_RFLAGS, &c->eflags);
1198 error |= __vmread(CR0_READ_SHADOW, &c->cr0);
1199 c->cr3 = v->arch.hvm_vmx.cpu_cr3;
1200 error |= __vmread(CR4_READ_SHADOW, &c->cr4);
1202 error |= __vmread(GUEST_IDTR_LIMIT, &c->idtr_limit);
1203 error |= __vmread(GUEST_IDTR_BASE, &c->idtr_base);
1205 error |= __vmread(GUEST_GDTR_LIMIT, &c->gdtr_limit);
1206 error |= __vmread(GUEST_GDTR_BASE, &c->gdtr_base);
1208 error |= __vmread(GUEST_CS_SELECTOR, &c->cs_sel);
1209 error |= __vmread(GUEST_CS_LIMIT, &c->cs_limit);
1210 error |= __vmread(GUEST_CS_BASE, &c->cs_base);
1211 error |= __vmread(GUEST_CS_AR_BYTES, &c->cs_arbytes.bytes);
1213 error |= __vmread(GUEST_DS_SELECTOR, &c->ds_sel);
1214 error |= __vmread(GUEST_DS_LIMIT, &c->ds_limit);
1215 error |= __vmread(GUEST_DS_BASE, &c->ds_base);
1216 error |= __vmread(GUEST_DS_AR_BYTES, &c->ds_arbytes.bytes);
1218 error |= __vmread(GUEST_ES_SELECTOR, &c->es_sel);
1219 error |= __vmread(GUEST_ES_LIMIT, &c->es_limit);
1220 error |= __vmread(GUEST_ES_BASE, &c->es_base);
1221 error |= __vmread(GUEST_ES_AR_BYTES, &c->es_arbytes.bytes);
1223 error |= __vmread(GUEST_SS_SELECTOR, &c->ss_sel);
1224 error |= __vmread(GUEST_SS_LIMIT, &c->ss_limit);
1225 error |= __vmread(GUEST_SS_BASE, &c->ss_base);
1226 error |= __vmread(GUEST_SS_AR_BYTES, &c->ss_arbytes.bytes);
1228 error |= __vmread(GUEST_FS_SELECTOR, &c->fs_sel);
1229 error |= __vmread(GUEST_FS_LIMIT, &c->fs_limit);
1230 error |= __vmread(GUEST_FS_BASE, &c->fs_base);
1231 error |= __vmread(GUEST_FS_AR_BYTES, &c->fs_arbytes.bytes);
1233 error |= __vmread(GUEST_GS_SELECTOR, &c->gs_sel);
1234 error |= __vmread(GUEST_GS_LIMIT, &c->gs_limit);
1235 error |= __vmread(GUEST_GS_BASE, &c->gs_base);
1236 error |= __vmread(GUEST_GS_AR_BYTES, &c->gs_arbytes.bytes);
1238 error |= __vmread(GUEST_TR_SELECTOR, &c->tr_sel);
1239 error |= __vmread(GUEST_TR_LIMIT, &c->tr_limit);
1240 error |= __vmread(GUEST_TR_BASE, &c->tr_base);
1241 error |= __vmread(GUEST_TR_AR_BYTES, &c->tr_arbytes.bytes);
1243 error |= __vmread(GUEST_LDTR_SELECTOR, &c->ldtr_sel);
1244 error |= __vmread(GUEST_LDTR_LIMIT, &c->ldtr_limit);
1245 error |= __vmread(GUEST_LDTR_BASE, &c->ldtr_base);
1246 error |= __vmread(GUEST_LDTR_AR_BYTES, &c->ldtr_arbytes.bytes);
1248 return !error;
1251 int
1252 vmx_world_restore(struct vcpu *v, struct vmx_assist_context *c)
1254 unsigned long mfn, old_cr4, old_base_mfn;
1255 int error = 0;
1257 error |= __vmwrite(GUEST_RIP, c->eip);
1258 error |= __vmwrite(GUEST_RSP, c->esp);
1259 error |= __vmwrite(GUEST_RFLAGS, c->eflags);
1261 error |= __vmwrite(CR0_READ_SHADOW, c->cr0);
1263 if (!vmx_paging_enabled(v))
1264 goto skip_cr3;
1266 if (c->cr3 == v->arch.hvm_vmx.cpu_cr3) {
1267 /*
1268 * This is a simple TLB flush, implying the guest has
1269 * removed some translation or changed page attributes.
1270 * We simply invalidate the shadow.
1271 */
1272 mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
1273 if (mfn != pagetable_get_pfn(v->arch.guest_table)) {
1274 printk("Invalid CR3 value=%x", c->cr3);
1275 domain_crash_synchronous();
1276 return 0;
1278 } else {
1279 /*
1280 * If different, make a shadow. Check if the PDBR is valid
1281 * first.
1282 */
1283 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %x", c->cr3);
1284 if ((c->cr3 >> PAGE_SHIFT) > v->domain->max_pages) {
1285 printk("Invalid CR3 value=%x", c->cr3);
1286 domain_crash_synchronous();
1287 return 0;
1289 mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
1290 if(!get_page(mfn_to_page(mfn), v->domain))
1291 return 0;
1292 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1293 v->arch.guest_table = pagetable_from_pfn(mfn);
1294 if (old_base_mfn)
1295 put_page(mfn_to_page(old_base_mfn));
1296 /*
1297 * arch.shadow_table should now hold the next CR3 for shadow
1298 */
1299 v->arch.hvm_vmx.cpu_cr3 = c->cr3;
1302 skip_cr3:
1304 shadow2_update_paging_modes(v);
1305 if (!vmx_paging_enabled(v))
1306 HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
1307 else
1308 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3);
1309 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
1311 error |= __vmread(CR4_READ_SHADOW, &old_cr4);
1312 error |= __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
1313 error |= __vmwrite(CR4_READ_SHADOW, c->cr4);
1315 error |= __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
1316 error |= __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
1318 error |= __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
1319 error |= __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
1321 error |= __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
1322 error |= __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
1323 error |= __vmwrite(GUEST_CS_BASE, c->cs_base);
1324 error |= __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes.bytes);
1326 error |= __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
1327 error |= __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
1328 error |= __vmwrite(GUEST_DS_BASE, c->ds_base);
1329 error |= __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes.bytes);
1331 error |= __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
1332 error |= __vmwrite(GUEST_ES_LIMIT, c->es_limit);
1333 error |= __vmwrite(GUEST_ES_BASE, c->es_base);
1334 error |= __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes.bytes);
1336 error |= __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
1337 error |= __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
1338 error |= __vmwrite(GUEST_SS_BASE, c->ss_base);
1339 error |= __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes.bytes);
1341 error |= __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
1342 error |= __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
1343 error |= __vmwrite(GUEST_FS_BASE, c->fs_base);
1344 error |= __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes.bytes);
1346 error |= __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
1347 error |= __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
1348 error |= __vmwrite(GUEST_GS_BASE, c->gs_base);
1349 error |= __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes.bytes);
1351 error |= __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
1352 error |= __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
1353 error |= __vmwrite(GUEST_TR_BASE, c->tr_base);
1354 error |= __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes.bytes);
1356 error |= __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
1357 error |= __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
1358 error |= __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
1359 error |= __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes.bytes);
1361 return !error;
1364 enum { VMX_ASSIST_INVOKE = 0, VMX_ASSIST_RESTORE };
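/*
 * Perform the world switch to or from the vmxassist real-mode emulator:
 * VMX_ASSIST_INVOKE saves the current context and loads the one prepared
 * by vmxassist; VMX_ASSIST_RESTORE switches back to the saved context.
 */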
1366 int
1367 vmx_assist(struct vcpu *v, int mode)
1369 struct vmx_assist_context c;
1370 u32 magic;
1371 u32 cp;
1373 /* Make sure vmxassist exists (it is not an error if it does not). */
1374 if (!hvm_copy(&magic, VMXASSIST_MAGIC_OFFSET, sizeof(magic), HVM_COPY_IN))
1375 return 0;
1376 if (magic != VMXASSIST_MAGIC)
1377 return 0;
1379 switch (mode) {
1380 /*
1381 * Transfer control to vmxassist.
1382 * Store the current context in VMXASSIST_OLD_CONTEXT and load
1383 * the new VMXASSIST_NEW_CONTEXT context. This context was created
1384 * by vmxassist and will transfer control to it.
1385 */
1386 case VMX_ASSIST_INVOKE:
1387 /* save the old context */
1388 if (!hvm_copy(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp), HVM_COPY_IN))
1389 goto error;
1390 if (cp != 0) {
1391 if (!vmx_world_save(v, &c))
1392 goto error;
1393 if (!hvm_copy(&c, cp, sizeof(c), HVM_COPY_OUT))
1394 goto error;
1397 /* restore the new context, this should activate vmxassist */
1398 if (!hvm_copy(&cp, VMXASSIST_NEW_CONTEXT, sizeof(cp), HVM_COPY_IN))
1399 goto error;
1400 if (cp != 0) {
1401 if (!hvm_copy(&c, cp, sizeof(c), HVM_COPY_IN))
1402 goto error;
1403 if (!vmx_world_restore(v, &c))
1404 goto error;
1405 return 1;
1407 break;
1409 /*
1410 * Restore the VMXASSIST_OLD_CONTEXT that was saved by VMX_ASSIST_INVOKE
1411 * above.
1412 */
1413 case VMX_ASSIST_RESTORE:
1414 /* save the old context */
1415 if (!hvm_copy(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp), HVM_COPY_IN))
1416 goto error;
1417 if (cp != 0) {
1418 if (!hvm_copy(&c, cp, sizeof(c), HVM_COPY_IN))
1419 goto error;
1420 if (!vmx_world_restore(v, &c))
1421 goto error;
1422 return 1;
1424 break;
1427 error:
1428 printf("Failed to transfer to vmxassist\n");
1429 domain_crash_synchronous();
1430 return 0;
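/*
 * Handle a guest write to CR0. Enabling paging points arch.guest_table
 * at the new guest page tables; clearing CR0.PE triggers the world
 * switch to vmxassist, since VMX cannot execute real-mode code directly.
 */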
1433 static int vmx_set_cr0(unsigned long value)
1435 struct vcpu *v = current;
1436 unsigned long mfn;
1437 unsigned long eip;
1438 int paging_enabled;
1439 unsigned long vm_entry_value;
1440 unsigned long old_cr0;
1441 unsigned long old_base_mfn;
1443 /*
1444 * CR0: We don't want to lose PE and PG.
1445 */
1446 __vmread_vcpu(v, CR0_READ_SHADOW, &old_cr0);
1447 paging_enabled = (old_cr0 & X86_CR0_PE) && (old_cr0 & X86_CR0_PG);
1449 /* TS cleared? Then initialise FPU now. */
1450 if ( !(value & X86_CR0_TS) )
1452 setup_fpu(v);
1453 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
1456 __vmwrite(GUEST_CR0, value | X86_CR0_PE | X86_CR0_PG | X86_CR0_NE);
1457 __vmwrite(CR0_READ_SHADOW, value);
1459 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx\n", value);
1461 if ( (value & X86_CR0_PE) && (value & X86_CR0_PG) && !paging_enabled )
1463 /*
1464 * Trying to enable guest paging.
1465 * The guest CR3 must be pointing to the guest physical.
1466 */
1467 if ( !VALID_MFN(mfn = get_mfn_from_gpfn(
1468 v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)) ||
1469 !get_page(mfn_to_page(mfn), v->domain) )
1471 printk("Invalid CR3 value = %lx (mfn=%lx)\n",
1472 v->arch.hvm_vmx.cpu_cr3, mfn);
1473 domain_crash_synchronous(); /* need to take a clean path */
1476 #if defined(__x86_64__)
1477 if ( test_bit(VMX_CPU_STATE_LME_ENABLED,
1478 &v->arch.hvm_vmx.cpu_state) &&
1479 !test_bit(VMX_CPU_STATE_PAE_ENABLED,
1480 &v->arch.hvm_vmx.cpu_state) )
1482 HVM_DBG_LOG(DBG_LEVEL_1, "Enable paging before PAE enabled\n");
1483 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
1486 if ( test_bit(VMX_CPU_STATE_LME_ENABLED,
1487 &v->arch.hvm_vmx.cpu_state) )
1489 /* At this point PAE must already be enabled. */
1490 HVM_DBG_LOG(DBG_LEVEL_1, "Enable long mode\n");
1491 set_bit(VMX_CPU_STATE_LMA_ENABLED,
1492 &v->arch.hvm_vmx.cpu_state);
1494 __vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
1495 vm_entry_value |= VM_ENTRY_CONTROLS_IA32E_MODE;
1496 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1498 #endif
1500 /*
1501 * Now arch.guest_table points to machine physical.
1502 */
1503 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1504 v->arch.guest_table = pagetable_from_pfn(mfn);
1505 if (old_base_mfn)
1506 put_page(mfn_to_page(old_base_mfn));
1507 shadow2_update_paging_modes(v);
1509 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
1510 (unsigned long) (mfn << PAGE_SHIFT));
1512 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
1513 /*
1514 * arch->shadow_table should hold the next CR3 for shadow
1515 */
1516 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
1517 v->arch.hvm_vmx.cpu_cr3, mfn);
1520 if ( !((value & X86_CR0_PE) && (value & X86_CR0_PG)) && paging_enabled )
1521 if ( v->arch.hvm_vmx.cpu_cr3 ) {
1522 put_page(mfn_to_page(get_mfn_from_gpfn(
1523 v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)));
1524 v->arch.guest_table = pagetable_null();
1527 /*
1528 * VMX does not implement real-mode virtualization. We emulate
1529 * real-mode by performing a world switch to VMXAssist whenever
1530 * a partition disables the CR0.PE bit.
1531 */
1532 if ( (value & X86_CR0_PE) == 0 )
1534 if ( value & X86_CR0_PG ) {
1535 /* inject GP here */
1536 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
1537 return 0;
1538 } else {
1539 /*
1540 * Disable paging here.
1541 * Same as the PE == 1 && PG == 0 case.
1542 */
1543 if ( test_bit(VMX_CPU_STATE_LMA_ENABLED,
1544 &v->arch.hvm_vmx.cpu_state) )
1546 clear_bit(VMX_CPU_STATE_LMA_ENABLED,
1547 &v->arch.hvm_vmx.cpu_state);
1548 __vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
1549 vm_entry_value &= ~VM_ENTRY_CONTROLS_IA32E_MODE;
1550 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1554 if ( vmx_assist(v, VMX_ASSIST_INVOKE) ) {
1555 set_bit(VMX_CPU_STATE_ASSIST_ENABLED, &v->arch.hvm_vmx.cpu_state);
1556 __vmread(GUEST_RIP, &eip);
1557 HVM_DBG_LOG(DBG_LEVEL_1,
1558 "Transfering control to vmxassist %%eip 0x%lx\n", eip);
1559 return 0; /* do not update eip! */
1561 } else if ( test_bit(VMX_CPU_STATE_ASSIST_ENABLED,
1562 &v->arch.hvm_vmx.cpu_state) )
1564 __vmread(GUEST_RIP, &eip);
1565 HVM_DBG_LOG(DBG_LEVEL_1,
1566 "Enabling CR0.PE at %%eip 0x%lx\n", eip);
1567 if ( vmx_assist(v, VMX_ASSIST_RESTORE) )
1569 clear_bit(VMX_CPU_STATE_ASSIST_ENABLED,
1570 &v->arch.hvm_vmx.cpu_state);
1571 __vmread(GUEST_RIP, &eip);
1572 HVM_DBG_LOG(DBG_LEVEL_1,
1573 "Restoring to %%eip 0x%lx\n", eip);
1574 return 0; /* do not update eip! */
1577 else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
1579 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
1580 shadow2_update_paging_modes(v);
1583 return 1;
1586 #define CASE_SET_REG(REG, reg) \
1587 case REG_ ## REG: regs->reg = value; break
1588 #define CASE_GET_REG(REG, reg) \
1589 case REG_ ## REG: value = regs->reg; break
1591 #define CASE_EXTEND_SET_REG \
1592 CASE_EXTEND_REG(S)
1593 #define CASE_EXTEND_GET_REG \
1594 CASE_EXTEND_REG(G)
1596 #ifdef __i386__
1597 #define CASE_EXTEND_REG(T)
1598 #else
1599 #define CASE_EXTEND_REG(T) \
1600 CASE_ ## T ## ET_REG(R8, r8); \
1601 CASE_ ## T ## ET_REG(R9, r9); \
1602 CASE_ ## T ## ET_REG(R10, r10); \
1603 CASE_ ## T ## ET_REG(R11, r11); \
1604 CASE_ ## T ## ET_REG(R12, r12); \
1605 CASE_ ## T ## ET_REG(R13, r13); \
1606 CASE_ ## T ## ET_REG(R14, r14); \
1607 CASE_ ## T ## ET_REG(R15, r15)
1608 #endif
1610 /*
1611 * Write to control registers
1612 */
1613 static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
1615 unsigned long value;
1616 unsigned long old_cr;
1617 struct vcpu *v = current;
1619 switch ( gp ) {
1620 CASE_GET_REG(EAX, eax);
1621 CASE_GET_REG(ECX, ecx);
1622 CASE_GET_REG(EDX, edx);
1623 CASE_GET_REG(EBX, ebx);
1624 CASE_GET_REG(EBP, ebp);
1625 CASE_GET_REG(ESI, esi);
1626 CASE_GET_REG(EDI, edi);
1627 CASE_EXTEND_GET_REG;
1628 case REG_ESP:
1629 __vmread(GUEST_RSP, &value);
1630 break;
1631 default:
1632 printk("invalid gp: %d\n", gp);
1633 __hvm_bug(regs);
1636 HVM_DBG_LOG(DBG_LEVEL_1, "CR%d, value = %lx", cr, value);
1638 switch ( cr ) {
1639 case 0:
1640 return vmx_set_cr0(value);
1641 case 3:
1643 unsigned long old_base_mfn, mfn;
1645 /*
1646 * If paging is not enabled yet, simply copy the value to CR3.
1647 */
1648 if (!vmx_paging_enabled(v)) {
1649 v->arch.hvm_vmx.cpu_cr3 = value;
1650 break;
1653 /*
1654 * We make a new one if the shadow does not exist.
1655 */
1656 if (value == v->arch.hvm_vmx.cpu_cr3) {
1657 /*
1658 * This is a simple TLB flush, implying the guest has
1659 * removed some translation or changed page attributes.
1660 * We simply invalidate the shadow.
1661 */
1662 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
1663 if (mfn != pagetable_get_pfn(v->arch.guest_table))
1664 __hvm_bug(regs);
1665 shadow2_update_cr3(v);
1666 } else {
1667 /*
1668 * If different, make a shadow. Check if the PDBR is valid
1669 * first.
1670 */
1671 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
1672 if ( ((value >> PAGE_SHIFT) > v->domain->max_pages ) ||
1673 !VALID_MFN(mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT)) ||
1674 !get_page(mfn_to_page(mfn), v->domain) )
1676 printk("Invalid CR3 value=%lx", value);
1677 domain_crash_synchronous(); /* need to take a clean path */
1679 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1680 v->arch.guest_table = pagetable_from_pfn(mfn);
1681 if (old_base_mfn)
1682 put_page(mfn_to_page(old_base_mfn));
1683 /*
1684 * arch.shadow_table should now hold the next CR3 for shadow
1685 */
1686 v->arch.hvm_vmx.cpu_cr3 = value;
1687 update_cr3(v);
1688 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx",
1689 value);
1690 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
1692 break;
1694 case 4: /* CR4 */
1696 __vmread(CR4_READ_SHADOW, &old_cr);
1698 if ( value & X86_CR4_PAE && !(old_cr & X86_CR4_PAE) )
1700 set_bit(VMX_CPU_STATE_PAE_ENABLED, &v->arch.hvm_vmx.cpu_state);
1702 if ( vmx_pgbit_test(v) )
1704 /* The guest is a 32-bit PAE guest. */
1705 #if CONFIG_PAGING_LEVELS >= 3
1706 unsigned long mfn, old_base_mfn;
1708 if ( !VALID_MFN(mfn = get_mfn_from_gpfn(
1709 v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)) ||
1710 !get_page(mfn_to_page(mfn), v->domain) )
1712 printk("Invalid CR3 value = %lx", v->arch.hvm_vmx.cpu_cr3);
1713 domain_crash_synchronous(); /* need to take a clean path */
1717 /*
1718 * Now arch.guest_table points to machine physical.
1719 */
1721 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1722 v->arch.guest_table = pagetable_from_pfn(mfn);
1723 if ( old_base_mfn )
1724 put_page(mfn_to_page(old_base_mfn));
1726 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
1727 (unsigned long) (mfn << PAGE_SHIFT));
1729 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
1731 /*
1732 * arch->shadow_table should hold the next CR3 for shadow
1733 */
1735 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
1736 v->arch.hvm_vmx.cpu_cr3, mfn);
1737 #endif
1740 else if ( value & X86_CR4_PAE )
1741 set_bit(VMX_CPU_STATE_PAE_ENABLED, &v->arch.hvm_vmx.cpu_state);
1742 else
1744 if ( test_bit(VMX_CPU_STATE_LMA_ENABLED, &v->arch.hvm_vmx.cpu_state) )
1745 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
1747 clear_bit(VMX_CPU_STATE_PAE_ENABLED, &v->arch.hvm_vmx.cpu_state);
1750 __vmwrite(GUEST_CR4, value| VMX_CR4_HOST_MASK);
1751 __vmwrite(CR4_READ_SHADOW, value);
1753 /*
1754 * Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates
1755 * all TLB entries except global entries.
1756 */
1757 if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
1758 shadow2_update_paging_modes(v);
1759 break;
1761 default:
1762 printk("invalid cr: %d\n", gp);
1763 __hvm_bug(regs);
1766 return 1;
1769 /*
1770 * Read from control registers. CR0 and CR4 are read from the shadow.
1771 */
1772 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
1774 unsigned long value;
1775 struct vcpu *v = current;
1777 if ( cr != 3 )
1778 __hvm_bug(regs);
1780 value = (unsigned long) v->arch.hvm_vmx.cpu_cr3;
1782 switch ( gp ) {
1783 CASE_SET_REG(EAX, eax);
1784 CASE_SET_REG(ECX, ecx);
1785 CASE_SET_REG(EDX, edx);
1786 CASE_SET_REG(EBX, ebx);
1787 CASE_SET_REG(EBP, ebp);
1788 CASE_SET_REG(ESI, esi);
1789 CASE_SET_REG(EDI, edi);
1790 CASE_EXTEND_SET_REG;
1791 case REG_ESP:
1792 __vmwrite(GUEST_RSP, value);
1793 regs->esp = value;
1794 break;
1795 default:
1796 printk("invalid gp: %d\n", gp);
1797 __hvm_bug(regs);
1800 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR%d, value = %lx", cr, value);
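/*
 * Dispatch a control-register-access vmexit: MOV to/from CRn, CLTS or LMSW.
 */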
1803 static int vmx_cr_access(unsigned long exit_qualification, struct cpu_user_regs *regs)
1805 unsigned int gp, cr;
1806 unsigned long value;
1807 struct vcpu *v = current;
1809 switch (exit_qualification & CONTROL_REG_ACCESS_TYPE) {
1810 case TYPE_MOV_TO_CR:
1811 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
1812 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
1813 TRACE_VMEXIT(1,TYPE_MOV_TO_CR);
1814 TRACE_VMEXIT(2,cr);
1815 TRACE_VMEXIT(3,gp);
1816 return mov_to_cr(gp, cr, regs);
1817 case TYPE_MOV_FROM_CR:
1818 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
1819 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
1820 TRACE_VMEXIT(1,TYPE_MOV_FROM_CR);
1821 TRACE_VMEXIT(2,cr);
1822 TRACE_VMEXIT(3,gp);
1823 mov_from_cr(cr, gp, regs);
1824 break;
1825 case TYPE_CLTS:
1826 TRACE_VMEXIT(1,TYPE_CLTS);
1828 /* We initialise the FPU now, to avoid needing another vmexit. */
1829 setup_fpu(v);
1830 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
1832 __vmread_vcpu(v, GUEST_CR0, &value);
1833 value &= ~X86_CR0_TS; /* clear TS */
1834 __vmwrite(GUEST_CR0, value);
1836 __vmread_vcpu(v, CR0_READ_SHADOW, &value);
1837 value &= ~X86_CR0_TS; /* clear TS */
1838 __vmwrite(CR0_READ_SHADOW, value);
1839 break;
1840 case TYPE_LMSW:
1841 TRACE_VMEXIT(1,TYPE_LMSW);
1842 __vmread_vcpu(v, CR0_READ_SHADOW, &value);
1843 value = (value & ~0xF) |
1844 (((exit_qualification & LMSW_SOURCE_DATA) >> 16) & 0xF);
1845 return vmx_set_cr0(value);
1846 break;
1847 default:
1848 __hvm_bug(regs);
1849 break;
1851 return 1;
1854 static inline void vmx_do_msr_read(struct cpu_user_regs *regs)
1856 u64 msr_content = 0;
1857 u32 eax, edx;
1858 struct vcpu *v = current;
1860 HVM_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_read: ecx=%lx, eax=%lx, edx=%lx",
1861 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1862 (unsigned long)regs->edx);
1863 switch (regs->ecx) {
1864 case MSR_IA32_TIME_STAMP_COUNTER:
1865 msr_content = hvm_get_guest_time(v);
1866 break;
1867 case MSR_IA32_SYSENTER_CS:
1868 __vmread(GUEST_SYSENTER_CS, (u32 *)&msr_content);
1869 break;
1870 case MSR_IA32_SYSENTER_ESP:
1871 __vmread(GUEST_SYSENTER_ESP, &msr_content);
1872 break;
1873 case MSR_IA32_SYSENTER_EIP:
1874 __vmread(GUEST_SYSENTER_EIP, &msr_content);
1875 break;
1876 case MSR_IA32_APICBASE:
1877 msr_content = VLAPIC(v) ? VLAPIC(v)->apic_base_msr : 0;
1878 break;
1879 default:
1880 if (long_mode_do_msr_read(regs))
1881 return;
1883 if ( rdmsr_hypervisor_regs(regs->ecx, &eax, &edx) )
1885 regs->eax = eax;
1886 regs->edx = edx;
1887 return;
1890 rdmsr_safe(regs->ecx, regs->eax, regs->edx);
1891 break;
1894 regs->eax = msr_content & 0xFFFFFFFF;
1895 regs->edx = msr_content >> 32;
1897 HVM_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_read returns: "
1898 "ecx=%lx, eax=%lx, edx=%lx",
1899 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1900 (unsigned long)regs->edx);
1903 static inline void vmx_do_msr_write(struct cpu_user_regs *regs)
1905 u64 msr_content;
1906 struct vcpu *v = current;
1908 HVM_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_write: ecx=%lx, eax=%lx, edx=%lx",
1909 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1910 (unsigned long)regs->edx);
1912 msr_content = (regs->eax & 0xFFFFFFFF) | ((u64)regs->edx << 32);
1914 switch (regs->ecx) {
1915 case MSR_IA32_TIME_STAMP_COUNTER:
1916 hvm_set_guest_time(v, msr_content);
1917 break;
1918 case MSR_IA32_SYSENTER_CS:
1919 __vmwrite(GUEST_SYSENTER_CS, msr_content);
1920 break;
1921 case MSR_IA32_SYSENTER_ESP:
1922 __vmwrite(GUEST_SYSENTER_ESP, msr_content);
1923 break;
1924 case MSR_IA32_SYSENTER_EIP:
1925 __vmwrite(GUEST_SYSENTER_EIP, msr_content);
1926 break;
1927 case MSR_IA32_APICBASE:
1928 vlapic_msr_set(VLAPIC(v), msr_content);
1929 break;
1930 default:
1931 if ( !long_mode_do_msr_write(regs) )
1932 wrmsr_hypervisor_regs(regs->ecx, regs->eax, regs->edx);
1933 break;
1936 HVM_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_write returns: "
1937 "ecx=%lx, eax=%lx, edx=%lx",
1938 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1939 (unsigned long)regs->edx);
1942 void vmx_vmexit_do_hlt(void)
1944 unsigned long rflags;
1945 __vmread(GUEST_RFLAGS, &rflags);
1946 hvm_hlt(rflags);
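/*
 * A physical interrupt arrived while the guest was running. Dispatch it
 * to Xen's own handler for that vector; nothing is injected into the
 * guest here.
 */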
1949 static inline void vmx_vmexit_do_extint(struct cpu_user_regs *regs)
1951 unsigned int vector;
1952 int error;
1954 asmlinkage void do_IRQ(struct cpu_user_regs *);
1955 fastcall void smp_apic_timer_interrupt(struct cpu_user_regs *);
1956 fastcall void smp_event_check_interrupt(void);
1957 fastcall void smp_invalidate_interrupt(void);
1958 fastcall void smp_call_function_interrupt(void);
1959 fastcall void smp_spurious_interrupt(struct cpu_user_regs *regs);
1960 fastcall void smp_error_interrupt(struct cpu_user_regs *regs);
1961 #ifdef CONFIG_X86_MCE_P4THERMAL
1962 fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs);
1963 #endif
1965 if ((error = __vmread(VM_EXIT_INTR_INFO, &vector))
1966 && !(vector & INTR_INFO_VALID_MASK))
1967 __hvm_bug(regs);
1969 vector &= INTR_INFO_VECTOR_MASK;
1970 TRACE_VMEXIT(1,vector);
1972 switch(vector) {
1973 case LOCAL_TIMER_VECTOR:
1974 smp_apic_timer_interrupt(regs);
1975 break;
1976 case EVENT_CHECK_VECTOR:
1977 smp_event_check_interrupt();
1978 break;
1979 case INVALIDATE_TLB_VECTOR:
1980 smp_invalidate_interrupt();
1981 break;
1982 case CALL_FUNCTION_VECTOR:
1983 smp_call_function_interrupt();
1984 break;
1985 case SPURIOUS_APIC_VECTOR:
1986 smp_spurious_interrupt(regs);
1987 break;
1988 case ERROR_APIC_VECTOR:
1989 smp_error_interrupt(regs);
1990 break;
1991 #ifdef CONFIG_X86_MCE_P4THERMAL
1992 case THERMAL_APIC_VECTOR:
1993 smp_thermal_interrupt(regs);
1994 break;
1995 #endif
1996 default:
1997 regs->entry_vector = vector;
1998 do_IRQ(regs);
1999 break;
2000 }
2001 }
2003 #if defined (__x86_64__)
2004 void store_cpu_user_regs(struct cpu_user_regs *regs)
2005 {
2006 __vmread(GUEST_SS_SELECTOR, &regs->ss);
2007 __vmread(GUEST_RSP, &regs->rsp);
2008 __vmread(GUEST_RFLAGS, &regs->rflags);
2009 __vmread(GUEST_CS_SELECTOR, &regs->cs);
2010 __vmread(GUEST_DS_SELECTOR, &regs->ds);
2011 __vmread(GUEST_ES_SELECTOR, &regs->es);
2012 __vmread(GUEST_RIP, &regs->rip);
2013 }
2014 #elif defined (__i386__)
2015 void store_cpu_user_regs(struct cpu_user_regs *regs)
2016 {
2017 __vmread(GUEST_SS_SELECTOR, &regs->ss);
2018 __vmread(GUEST_RSP, &regs->esp);
2019 __vmread(GUEST_RFLAGS, &regs->eflags);
2020 __vmread(GUEST_CS_SELECTOR, &regs->cs);
2021 __vmread(GUEST_DS_SELECTOR, &regs->ds);
2022 __vmread(GUEST_ES_SELECTOR, &regs->es);
2023 __vmread(GUEST_RIP, &regs->eip);
2024 }
2025 #endif
2027 #ifdef XEN_DEBUGGER
2028 void save_cpu_user_regs(struct cpu_user_regs *regs)
2029 {
2030 __vmread(GUEST_SS_SELECTOR, &regs->xss);
2031 __vmread(GUEST_RSP, &regs->esp);
2032 __vmread(GUEST_RFLAGS, &regs->eflags);
2033 __vmread(GUEST_CS_SELECTOR, &regs->xcs);
2034 __vmread(GUEST_RIP, &regs->eip);
2036 __vmread(GUEST_GS_SELECTOR, &regs->xgs);
2037 __vmread(GUEST_FS_SELECTOR, &regs->xfs);
2038 __vmread(GUEST_ES_SELECTOR, &regs->xes);
2039 __vmread(GUEST_DS_SELECTOR, &regs->xds);
2040 }
2042 void restore_cpu_user_regs(struct cpu_user_regs *regs)
2043 {
2044 __vmwrite(GUEST_SS_SELECTOR, regs->xss);
2045 __vmwrite(GUEST_RSP, regs->esp);
2046 __vmwrite(GUEST_RFLAGS, regs->eflags);
2047 __vmwrite(GUEST_CS_SELECTOR, regs->xcs);
2048 __vmwrite(GUEST_RIP, regs->eip);
2050 __vmwrite(GUEST_GS_SELECTOR, regs->xgs);
2051 __vmwrite(GUEST_FS_SELECTOR, regs->xfs);
2052 __vmwrite(GUEST_ES_SELECTOR, regs->xes);
2053 __vmwrite(GUEST_DS_SELECTOR, regs->xds);
2054 }
2055 #endif
2057 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs regs)
2058 {
2059 unsigned int exit_reason;
2060 unsigned long exit_qualification, rip, inst_len = 0;
2061 struct vcpu *v = current;
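/*
 * Main VM exit dispatcher: read the exit reason, bail out on failed
 * VM entries, then handle each exit reason in the switch below.
 */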
2063 __vmread(VM_EXIT_REASON, &exit_reason);
2065 perfc_incra(vmexits, exit_reason);
2067 if ( (exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT) &&
2068 (exit_reason != EXIT_REASON_VMCALL) &&
2069 (exit_reason != EXIT_REASON_IO_INSTRUCTION) )
2070 HVM_DBG_LOG(DBG_LEVEL_0, "exit reason = %x", exit_reason);
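/*
 * For an external-interrupt exit the interrupt is dispatched explicitly
 * (in vmx_vmexit_do_extint) with interrupts still disabled; for every
 * other exit reason it is safe to re-enable interrupts here.
 */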
2072 if ( exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT )
2073 local_irq_enable();
2075 if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
2076 {
2077 unsigned int failed_vmentry_reason = exit_reason & 0xFFFF;
2079 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2080 printk("Failed vm entry (exit reason 0x%x) ", exit_reason);
2081 switch ( failed_vmentry_reason ) {
2082 case EXIT_REASON_INVALID_GUEST_STATE:
2083 printk("caused by invalid guest state (%ld).\n", exit_qualification);
2084 break;
2085 case EXIT_REASON_MSR_LOADING:
2086 printk("caused by MSR entry %ld loading.\n", exit_qualification);
2087 break;
2088 case EXIT_REASON_MACHINE_CHECK:
2089 printk("caused by machine check.\n");
2090 break;
2091 default:
2092 printk("reason not known yet!");
2093 break;
2096 printk("************* VMCS Area **************\n");
2097 vmcs_dump_vcpu();
2098 printk("**************************************\n");
2099 domain_crash_synchronous();
2100 }
2102 TRACE_VMEXIT(0,exit_reason);
2104 switch ( exit_reason ) {
2105 case EXIT_REASON_EXCEPTION_NMI:
2107 /*
2108 * We do not enable software-interrupt (INT n) exiting, so this exit
2109 * means either:
2110 * (1) the guest raised a hardware exception (e.g. #PF), or (2) an NMI.
2111 */
2112 unsigned int vector;
2113 unsigned long va;
2115 if ( __vmread(VM_EXIT_INTR_INFO, &vector) ||
2116 !(vector & INTR_INFO_VALID_MASK) )
2117 domain_crash_synchronous();
2118 vector &= INTR_INFO_VECTOR_MASK;
2120 TRACE_VMEXIT(1,vector);
2121 perfc_incra(cause_vector, vector);
2123 switch ( vector ) {
2124 #ifdef XEN_DEBUGGER
2125 case TRAP_debug:
2127 save_cpu_user_regs(&regs);
2128 pdb_handle_exception(1, &regs, 1);
2129 restore_cpu_user_regs(&regs);
2130 break;
2132 case TRAP_int3:
2134 save_cpu_user_regs(&regs);
2135 pdb_handle_exception(3, &regs, 1);
2136 restore_cpu_user_regs(&regs);
2137 break;
2139 #else
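/*
 * #DB: if the domain is being debugged, hand control to the external
 * debugger; otherwise reflect the exception back into the guest. In both
 * cases the pending single-step (BS) debug exception is cleared.
 */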
2140 case TRAP_debug:
2142 void store_cpu_user_regs(struct cpu_user_regs *regs);
2144 if ( test_bit(_DOMF_debugging, &v->domain->domain_flags) )
2146 store_cpu_user_regs(&regs);
2147 domain_pause_for_debugger();
2148 __vm_clear_bit(GUEST_PENDING_DBG_EXCEPTIONS,
2149 PENDING_DEBUG_EXC_BS);
2151 else
2153 vmx_reflect_exception(v);
2154 __vm_clear_bit(GUEST_PENDING_DBG_EXCEPTIONS,
2155 PENDING_DEBUG_EXC_BS);
2158 break;
2160 case TRAP_int3:
2162 if ( test_bit(_DOMF_debugging, &v->domain->domain_flags) )
2163 domain_pause_for_debugger();
2164 else
2165 vmx_reflect_exception(v);
2166 break;
2168 #endif
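/*
 * #NM (device not available): raised on first FPU use after a lazy FPU
 * switch; vmx_do_no_device_fault() restores the guest's FPU state.
 */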
2169 case TRAP_no_device:
2171 vmx_do_no_device_fault();
2172 break;
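/*
 * #PF: the exit qualification holds the faulting linear address and
 * VM_EXIT_INTR_ERROR_CODE holds the page-fault error code. If the shadow
 * code cannot resolve the fault, the #PF is injected back into the guest
 * and the guest's virtual CR2 is recorded.
 */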
2174 case TRAP_page_fault:
2176 __vmread(EXIT_QUALIFICATION, &va);
2177 __vmread(VM_EXIT_INTR_ERROR_CODE, &regs.error_code);
2179 TRACE_VMEXIT(3,regs.error_code);
2180 TRACE_VMEXIT(4,va);
2182 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2183 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
2184 (unsigned long)regs.eax, (unsigned long)regs.ebx,
2185 (unsigned long)regs.ecx, (unsigned long)regs.edx,
2186 (unsigned long)regs.esi, (unsigned long)regs.edi);
2188 if ( !vmx_do_page_fault(va, &regs) ) {
2189 /*
2190 * Inject #PF into the guest using the interruption-information fields.
2191 */
2192 vmx_inject_hw_exception(v, TRAP_page_fault, regs.error_code);
2193 v->arch.hvm_vmx.cpu_cr2 = va;
2194 TRACE_3D(TRC_VMX_INT, v->domain->domain_id, TRAP_page_fault, va);
2196 break;
2198 case TRAP_nmi:
2199 do_nmi(&regs);
2200 break;
2201 default:
2202 vmx_reflect_exception(v);
2203 break;
2205 break;
2207 case EXIT_REASON_EXTERNAL_INTERRUPT:
2208 vmx_vmexit_do_extint(&regs);
2209 break;
2210 case EXIT_REASON_TRIPLE_FAULT:
2211 domain_crash_synchronous();
2212 break;
2213 case EXIT_REASON_PENDING_INTERRUPT:
2214 /*
2215 * The only bit set and cleared at this point is
2216 * CPU_BASED_VIRTUAL_INTR_PENDING (toggled in
2217 * io.c:{enable,disable}_irq_window()), so restore it to its value in
2218 * MONITOR_CPU_BASED_EXEC_CONTROLS now that the interrupt-window exit
2219 * has fired.
2220 v->arch.hvm_vcpu.u.vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2221 v->arch.hvm_vcpu.u.vmx.exec_control |=
2222 (MONITOR_CPU_BASED_EXEC_CONTROLS & CPU_BASED_VIRTUAL_INTR_PENDING);
2223 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2224 v->arch.hvm_vcpu.u.vmx.exec_control);
2225 break;
2226 case EXIT_REASON_TASK_SWITCH:
2227 domain_crash_synchronous();
2228 break;
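/*
 * For straightforward instruction intercepts (CPUID, HLT, INVLPG, VMCALL,
 * RDMSR/WRMSR, ...), the handler emulates the instruction and then
 * advances the guest EIP past it using the VM-exit instruction length
 * (__get_instruction_length / __update_guest_eip).
 */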
2229 case EXIT_REASON_CPUID:
2230 vmx_vmexit_do_cpuid(&regs);
2231 __get_instruction_length(inst_len);
2232 __update_guest_eip(inst_len);
2233 break;
2234 case EXIT_REASON_HLT:
2235 __get_instruction_length(inst_len);
2236 __update_guest_eip(inst_len);
2237 vmx_vmexit_do_hlt();
2238 break;
2239 case EXIT_REASON_INVLPG:
2241 unsigned long va;
2243 __vmread(EXIT_QUALIFICATION, &va);
2244 vmx_vmexit_do_invlpg(va);
2245 __get_instruction_length(inst_len);
2246 __update_guest_eip(inst_len);
2247 break;
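/*
 * VMCALL is the HVM hypercall entry point: hvm_do_hypercall() decodes the
 * hypercall number and arguments from the guest register state and writes
 * the return value back into regs.
 */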
2249 case EXIT_REASON_VMCALL:
2251 __get_instruction_length(inst_len);
2252 __vmread(GUEST_RIP, &rip);
2253 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2255 hvm_do_hypercall(&regs);
2256 __update_guest_eip(inst_len);
2257 break;
2259 case EXIT_REASON_CR_ACCESS:
2261 __vmread(GUEST_RIP, &rip);
2262 __get_instruction_length(inst_len);
2263 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2265 HVM_DBG_LOG(DBG_LEVEL_1, "rip = %lx, inst_len = %lx, exit_qualification = %lx",
2266 rip, inst_len, exit_qualification);
2267 if ( vmx_cr_access(exit_qualification, &regs) )
2268 __update_guest_eip(inst_len);
2269 TRACE_VMEXIT(3,regs.error_code);
2270 TRACE_VMEXIT(4,exit_qualification);
2271 break;
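/*
 * Note: unlike the intercepts above, the guest EIP is not advanced here
 * for debug-register accesses; vmx_dr_access() decides how the access
 * (and any re-execution of the instruction) is handled.
 */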
2273 case EXIT_REASON_DR_ACCESS:
2274 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2275 vmx_dr_access(exit_qualification, &regs);
2276 break;
2277 case EXIT_REASON_IO_INSTRUCTION:
2278 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2279 __get_instruction_length(inst_len);
2280 vmx_io_instruction(exit_qualification, inst_len);
2281 TRACE_VMEXIT(4,exit_qualification);
2282 break;
2283 case EXIT_REASON_MSR_READ:
2284 __get_instruction_length(inst_len);
2285 vmx_do_msr_read(&regs);
2286 __update_guest_eip(inst_len);
2287 break;
2288 case EXIT_REASON_MSR_WRITE:
2289 vmx_do_msr_write(&regs);
2290 __get_instruction_length(inst_len);
2291 __update_guest_eip(inst_len);
2292 break;
2293 case EXIT_REASON_MWAIT_INSTRUCTION:
2294 case EXIT_REASON_MONITOR_INSTRUCTION:
2295 case EXIT_REASON_PAUSE_INSTRUCTION:
2296 domain_crash_synchronous();
2297 break;
2298 case EXIT_REASON_VMCLEAR:
2299 case EXIT_REASON_VMLAUNCH:
2300 case EXIT_REASON_VMPTRLD:
2301 case EXIT_REASON_VMPTRST:
2302 case EXIT_REASON_VMREAD:
2303 case EXIT_REASON_VMRESUME:
2304 case EXIT_REASON_VMWRITE:
2305 case EXIT_REASON_VMXOFF:
2306 case EXIT_REASON_VMXON:
2307 /* Report an invalid-opcode exception when a VMX guest tries to execute
2308 any of the VMX instructions. */
2309 vmx_inject_hw_exception(v, TRAP_invalid_op, VMX_DELIVER_NO_ERROR_CODE);
2310 break;
2312 default:
2313 domain_crash_synchronous(); /* should not happen */
2314 }
2315 }
2317 asmlinkage void vmx_load_cr2(void)
2318 {
2319 struct vcpu *v = current;
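/*
 * Called on the VM entry path: CR2 is not loaded from the VMCS, so the
 * guest's virtual CR2 (recorded when a page fault was injected) must be
 * placed in the physical CR2, with interrupts disabled so nothing can
 * clobber it before the VM entry completes.
 */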
2321 local_irq_disable();
2322 asm volatile("mov %0,%%cr2": :"r" (v->arch.hvm_vmx.cpu_cr2));
2323 }
2325 asmlinkage void vmx_trace_vmentry (void)
2326 {
2327 TRACE_5D(TRC_VMX_VMENTRY,
2328 this_cpu(trace_values)[0],
2329 this_cpu(trace_values)[1],
2330 this_cpu(trace_values)[2],
2331 this_cpu(trace_values)[3],
2332 this_cpu(trace_values)[4]);
2333 TRACE_VMEXIT(0,9);
2334 TRACE_VMEXIT(1,9);
2335 TRACE_VMEXIT(2,9);
2336 TRACE_VMEXIT(3,9);
2337 TRACE_VMEXIT(4,9);
2338 return;
2339 }
2341 asmlinkage void vmx_trace_vmexit (void)
2342 {
2343 TRACE_3D(TRC_VMX_VMEXIT,0,0,0);
2344 return;
2345 }
2347 /*
2348 * Local variables:
2349 * mode: C
2350 * c-set-style: "BSD"
2351 * c-basic-offset: 4
2352 * tab-width: 4
2353 * indent-tabs-mode: nil
2354 * End:
2355 */