ia64/xen-unstable

view xen/arch/x86/hvm/vmx/vmx.c @ 10892:0d2ba35c0cf2

[XEN] Add hypercall support for HVM guests. This is
of limited use at the moment: all of the hypercalls
fail, because copy_from_user does not yet work correctly in HVM
domains.

Signed-off-by: Steven Smith <ssmith@xensource.com>

Add a CPUID hypervisor platform interface at leaf
0x40000000. Allow the hypercall transfer page to be filled
in via MSR 0x40000000 (see the guest-side sketch below).

Signed-off-by: Keir Fraser <keir@xensource.com>
author kfraser@localhost.localdomain
date Tue Aug 01 17:18:05 2006 +0100 (2006-08-01)
parents f42039dcdc81
children 022f29d4d2b8
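For the CPUID/MSR interface described in the changeset comment above, the
following is a minimal guest-side sketch (not part of this changeset) of how a
guest might probe the hypervisor leaf and install the hypercall page. It
assumes the 0x40000000 leaf carries a hypervisor signature and that the value
written to MSR 0x40000000 identifies the guest-physical page to be filled with
hypercall stubs; the exact payload encoding is defined by the hypervisor side
of this changeset and is not reproduced here. The helper names are
illustrative only.

    #include <stdint.h>

    static char hypercall_page[4096] __attribute__((aligned(4096)));

    static void cpuid(uint32_t leaf, uint32_t *a, uint32_t *b,
                      uint32_t *c, uint32_t *d)
    {
        /* Plain CPUID; ECX is zeroed for determinism. */
        __asm__ __volatile__ ("cpuid"
                              : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)
                              : "0" (leaf), "2" (0));
    }

    static void wrmsr(uint32_t msr, uint64_t val)
    {
        __asm__ __volatile__ ("wrmsr" : : "c" (msr),
                              "a" ((uint32_t)val),
                              "d" ((uint32_t)(val >> 32)));
    }

    void setup_hypercall_page(void)
    {
        uint32_t eax, ebx, ecx, edx;

        /* Leaf 0x40000000: assumed to return a hypervisor signature in
         * ebx/ecx/edx and the maximum hypervisor leaf in eax. */
        cpuid(0x40000000, &eax, &ebx, &ecx, &edx);

        /* Assumption: the MSR takes the guest-physical address of the page.
         * A real guest must pass the physical address of hypercall_page,
         * not the virtual address used here for illustration. */
        wrmsr(0x40000000, (uint64_t)(uintptr_t)hypercall_page);

        /* Once filled in, hypercall <nr> is invoked by calling into the
         * page at offset nr * 32 (see vmx_init_hypercall_page() below). */
    }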
line source
1 /*
2 * vmx.c: handling VMX architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 *
18 */
20 #include <xen/config.h>
21 #include <xen/init.h>
22 #include <xen/lib.h>
23 #include <xen/trace.h>
24 #include <xen/sched.h>
25 #include <xen/irq.h>
26 #include <xen/softirq.h>
27 #include <xen/domain_page.h>
28 #include <asm/current.h>
29 #include <asm/io.h>
30 #include <asm/shadow.h>
31 #include <asm/regs.h>
32 #include <asm/cpufeature.h>
33 #include <asm/processor.h>
34 #include <asm/types.h>
35 #include <asm/msr.h>
36 #include <asm/spinlock.h>
37 #include <asm/hvm/hvm.h>
38 #include <asm/hvm/support.h>
39 #include <asm/hvm/vmx/vmx.h>
40 #include <asm/hvm/vmx/vmcs.h>
41 #include <asm/hvm/vmx/cpu.h>
42 #include <asm/shadow.h>
43 #if CONFIG_PAGING_LEVELS >= 3
44 #include <asm/shadow_64.h>
45 #endif
46 #include <public/sched.h>
47 #include <public/hvm/ioreq.h>
48 #include <asm/hvm/vpic.h>
49 #include <asm/hvm/vlapic.h>
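/* Per-CPU scratch slots for values of interest recorded at VM-exit time
 * (up to five per exit); TRACE_VMEXIT() stores into the slot belonging to
 * the current CPU, presumably for consumption by the exit-tracing code. */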
51 static unsigned long trace_values[NR_CPUS][5];
52 #define TRACE_VMEXIT(index,value) trace_values[smp_processor_id()][index]=value
54 static void vmx_ctxt_switch_from(struct vcpu *v);
55 static void vmx_ctxt_switch_to(struct vcpu *v);
57 static int vmx_initialize_guest_resources(struct vcpu *v)
58 {
59 struct domain *d = v->domain;
60 struct vcpu *vc;
61 void *io_bitmap_a, *io_bitmap_b;
62 int rc;
64 v->arch.schedule_tail = arch_vmx_do_launch;
65 v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
66 v->arch.ctxt_switch_to = vmx_ctxt_switch_to;
68 if ( v->vcpu_id != 0 )
69 return 1;
71 for_each_vcpu ( d, vc )
72 {
73 /* Initialize monitor page table */
74 vc->arch.monitor_table = pagetable_null();
76 memset(&vc->arch.hvm_vmx, 0, sizeof(struct arch_vmx_struct));
78 if ( (rc = vmx_create_vmcs(vc)) != 0 )
79 {
80 DPRINTK("Failed to create VMCS for vcpu %d: err=%d.\n",
81 vc->vcpu_id, rc);
82 return 0;
83 }
85 spin_lock_init(&vc->arch.hvm_vmx.vmcs_lock);
87 if ( (io_bitmap_a = alloc_xenheap_pages(IO_BITMAP_ORDER)) == NULL )
88 {
89 DPRINTK("Failed to allocate io bitmap a for vcpu %d.\n",
90 vc->vcpu_id);
91 return 0;
92 }
94 if ( (io_bitmap_b = alloc_xenheap_pages(IO_BITMAP_ORDER)) == NULL )
95 {
96 DPRINTK("Failed to allocate io bitmap b for vcpu %d.\n",
97 vc->vcpu_id);
98 return 0;
99 }
101 memset(io_bitmap_a, 0xff, 0x1000);
102 memset(io_bitmap_b, 0xff, 0x1000);
104 /* Don't trap guest accesses to the debug port. */
105 clear_bit(PC_DEBUG_PORT, io_bitmap_a);
107 vc->arch.hvm_vmx.io_bitmap_a = io_bitmap_a;
108 vc->arch.hvm_vmx.io_bitmap_b = io_bitmap_b;
109 }
111 /*
112 * Required to do this once per domain. XXX todo: add a separate function
113 * to do these.
114 */
115 memset(&d->shared_info->evtchn_mask[0], 0xff,
116 sizeof(d->shared_info->evtchn_mask));
118 /* Put the domain in shadow mode even though we're going to be using
119 * the shared 1:1 page table initially. It shouldn't hurt */
120 shadow_mode_enable(
121 d, SHM_enable|SHM_refcounts|SHM_translate|SHM_external|SHM_wr_pt_pte);
123 return 1;
124 }
126 static void vmx_relinquish_guest_resources(struct domain *d)
127 {
128 struct vcpu *v;
130 for_each_vcpu ( d, v )
131 {
132 vmx_destroy_vmcs(v);
133 if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
134 continue;
135 free_monitor_pagetable(v);
136 kill_timer(&v->arch.hvm_vmx.hlt_timer);
137 if ( hvm_apic_support(v->domain) && (VLAPIC(v) != NULL) )
138 {
139 kill_timer(&VLAPIC(v)->vlapic_timer);
140 xfree(VLAPIC(v));
141 }
142 }
144 kill_timer(&d->arch.hvm_domain.pl_time.periodic_tm.timer);
146 if ( d->arch.hvm_domain.shared_page_va )
147 unmap_domain_page_global(
148 (void *)d->arch.hvm_domain.shared_page_va);
150 shadow_direct_map_clean(d);
151 }
153 #ifdef __x86_64__
155 static struct vmx_msr_state percpu_msr[NR_CPUS];
157 static u32 msr_data_index[VMX_MSR_COUNT] =
158 {
159 MSR_LSTAR, MSR_STAR, MSR_CSTAR,
160 MSR_SYSCALL_MASK, MSR_EFER,
161 };
163 static void vmx_save_segments(struct vcpu *v)
164 {
165 rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.msr_content.shadow_gs);
166 }
168 /*
169 * To avoid MSR save/restore at every VM exit/entry time, we restore
170 * the x86_64 specific MSRs at domain switch time. Since those MSRs are
171 * not modified once set for generic domains, we don't save them,
172 * but simply reset them to the values set at percpu_traps_init().
173 */
174 static void vmx_load_msrs(void)
175 {
176 struct vmx_msr_state *host_state = &percpu_msr[smp_processor_id()];
177 int i;
179 while ( host_state->flags )
180 {
181 i = find_first_set_bit(host_state->flags);
182 wrmsrl(msr_data_index[i], host_state->msr_items[i]);
183 clear_bit(i, &host_state->flags);
184 }
185 }
187 static void vmx_save_init_msrs(void)
188 {
189 struct vmx_msr_state *host_state = &percpu_msr[smp_processor_id()];
190 int i;
192 for ( i = 0; i < VMX_MSR_COUNT; i++ )
193 rdmsrl(msr_data_index[i], host_state->msr_items[i]);
194 }
196 #define CASE_READ_MSR(address) \
197 case MSR_ ## address: \
198 msr_content = msr->msr_items[VMX_INDEX_MSR_ ## address]; \
199 break
201 #define CASE_WRITE_MSR(address) \
202 case MSR_ ## address: \
203 { \
204 msr->msr_items[VMX_INDEX_MSR_ ## address] = msr_content; \
205 if (!test_bit(VMX_INDEX_MSR_ ## address, &msr->flags)) { \
206 set_bit(VMX_INDEX_MSR_ ## address, &msr->flags); \
207 } \
208 wrmsrl(MSR_ ## address, msr_content); \
209 set_bit(VMX_INDEX_MSR_ ## address, &host_state->flags); \
210 } \
211 break
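/* The CASE_READ_MSR/CASE_WRITE_MSR helpers above operate on the guest's
 * shadow copy of the long-mode MSRs. Writes also hit the real MSR and mark
 * the entry dirty in both the guest and host flag words, so that
 * vmx_restore_msrs() and vmx_load_msrs() know what to reload or reset at
 * context-switch time. */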
213 #define IS_CANO_ADDRESS(add) 1
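/* NB: IS_CANO_ADDRESS() is a placeholder that accepts every value; no real
 * canonical-address check is performed on FS/GS base writes yet. */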
214 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
215 {
216 u64 msr_content = 0;
217 struct vcpu *v = current;
218 struct vmx_msr_state *msr = &v->arch.hvm_vmx.msr_content;
220 switch ( regs->ecx ) {
221 case MSR_EFER:
222 HVM_DBG_LOG(DBG_LEVEL_2, "EFER msr_content 0x%"PRIx64, msr_content);
223 msr_content = msr->msr_items[VMX_INDEX_MSR_EFER];
225 /* The following code may not be needed. */
226 if ( test_bit(VMX_CPU_STATE_LME_ENABLED, &v->arch.hvm_vmx.cpu_state) )
227 msr_content |= EFER_LME;
228 else
229 msr_content &= ~EFER_LME;
231 if ( VMX_LONG_GUEST(v) )
232 msr_content |= EFER_LMA;
233 else
234 msr_content &= ~EFER_LMA;
235 break;
237 case MSR_FS_BASE:
238 if ( !(VMX_LONG_GUEST(v)) )
240 /* XXX should this be a #GP fault? */
240 domain_crash_synchronous();
242 __vmread(GUEST_FS_BASE, &msr_content);
243 break;
245 case MSR_GS_BASE:
246 if ( !(VMX_LONG_GUEST(v)) )
247 domain_crash_synchronous();
249 __vmread(GUEST_GS_BASE, &msr_content);
250 break;
252 case MSR_SHADOW_GS_BASE:
253 msr_content = msr->shadow_gs;
254 break;
256 CASE_READ_MSR(STAR);
257 CASE_READ_MSR(LSTAR);
258 CASE_READ_MSR(CSTAR);
259 CASE_READ_MSR(SYSCALL_MASK);
261 default:
262 return 0;
263 }
265 HVM_DBG_LOG(DBG_LEVEL_2, "msr_content: 0x%"PRIx64, msr_content);
267 regs->eax = msr_content & 0xffffffff;
268 regs->edx = msr_content >> 32;
270 return 1;
271 }
273 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
274 {
275 u64 msr_content = regs->eax | ((u64)regs->edx << 32);
276 struct vcpu *v = current;
277 struct vmx_msr_state *msr = &v->arch.hvm_vmx.msr_content;
278 struct vmx_msr_state *host_state = &percpu_msr[smp_processor_id()];
280 HVM_DBG_LOG(DBG_LEVEL_1, "msr 0x%lx msr_content 0x%"PRIx64"\n",
281 (unsigned long)regs->ecx, msr_content);
283 switch ( regs->ecx ) {
284 case MSR_EFER:
285 /* offending reserved bit will cause #GP */
286 if ( msr_content & ~(EFER_LME | EFER_LMA | EFER_NX | EFER_SCE) )
287 {
288 printk("trying to set reserved bit in EFER\n");
289 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
290 return 0;
291 }
293 /* LME: 0 -> 1 */
294 if ( msr_content & EFER_LME &&
295 !test_bit(VMX_CPU_STATE_LME_ENABLED, &v->arch.hvm_vmx.cpu_state) )
296 {
297 if ( vmx_paging_enabled(v) ||
298 !test_bit(VMX_CPU_STATE_PAE_ENABLED,
299 &v->arch.hvm_vmx.cpu_state) )
300 {
301 printk("trying to set LME bit when "
302 "in paging mode or PAE bit is not set\n");
303 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
304 return 0;
305 }
307 set_bit(VMX_CPU_STATE_LME_ENABLED, &v->arch.hvm_vmx.cpu_state);
308 }
310 msr->msr_items[VMX_INDEX_MSR_EFER] = msr_content;
311 break;
313 case MSR_FS_BASE:
314 case MSR_GS_BASE:
315 if ( !(VMX_LONG_GUEST(v)) )
316 domain_crash_synchronous();
318 if ( !IS_CANO_ADDRESS(msr_content) )
319 {
320 HVM_DBG_LOG(DBG_LEVEL_1, "Non-canonical address in MSR write\n");
321 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
322 return 0;
323 }
325 if ( regs->ecx == MSR_FS_BASE )
326 __vmwrite(GUEST_FS_BASE, msr_content);
327 else
328 __vmwrite(GUEST_GS_BASE, msr_content);
330 break;
332 case MSR_SHADOW_GS_BASE:
333 if ( !(VMX_LONG_GUEST(v)) )
334 domain_crash_synchronous();
336 v->arch.hvm_vmx.msr_content.shadow_gs = msr_content;
337 wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
338 break;
340 CASE_WRITE_MSR(STAR);
341 CASE_WRITE_MSR(LSTAR);
342 CASE_WRITE_MSR(CSTAR);
343 CASE_WRITE_MSR(SYSCALL_MASK);
345 default:
346 return 0;
347 }
349 return 1;
350 }
352 static void vmx_restore_msrs(struct vcpu *v)
353 {
354 int i = 0;
355 struct vmx_msr_state *guest_state;
356 struct vmx_msr_state *host_state;
357 unsigned long guest_flags;
359 guest_state = &v->arch.hvm_vmx.msr_content;
360 host_state = &percpu_msr[smp_processor_id()];
362 wrmsrl(MSR_SHADOW_GS_BASE, guest_state->shadow_gs);
363 guest_flags = guest_state->flags;
364 if (!guest_flags)
365 return;
367 while (guest_flags){
368 i = find_first_set_bit(guest_flags);
370 HVM_DBG_LOG(DBG_LEVEL_2,
371 "restore guest's index %d msr %lx with %lx\n",
372 i, (unsigned long)msr_data_index[i],
373 (unsigned long)guest_state->msr_items[i]);
374 set_bit(i, &host_state->flags);
375 wrmsrl(msr_data_index[i], guest_state->msr_items[i]);
376 clear_bit(i, &guest_flags);
377 }
378 }
380 #else /* __i386__ */
382 #define vmx_save_segments(v) ((void)0)
383 #define vmx_load_msrs() ((void)0)
384 #define vmx_restore_msrs(v) ((void)0)
385 #define vmx_save_init_msrs() ((void)0)
387 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
388 {
389 return 0;
390 }
392 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
393 {
394 return 0;
395 }
397 #endif /* __i386__ */
399 #define loaddebug(_v,_reg) \
400 __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
401 #define savedebug(_v,_reg) \
402 __asm__ __volatile__ ("mov %%db" #_reg ",%0" : "=r" ((_v)->debugreg[_reg]))
404 static inline void vmx_save_dr(struct vcpu *v)
405 {
406 if ( v->arch.hvm_vcpu.flag_dr_dirty )
407 {
408 savedebug(&v->arch.guest_context, 0);
409 savedebug(&v->arch.guest_context, 1);
410 savedebug(&v->arch.guest_context, 2);
411 savedebug(&v->arch.guest_context, 3);
412 savedebug(&v->arch.guest_context, 6);
414 v->arch.hvm_vcpu.flag_dr_dirty = 0;
416 v->arch.hvm_vcpu.u.vmx.exec_control |= CPU_BASED_MOV_DR_EXITING;
417 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
418 v->arch.hvm_vcpu.u.vmx.exec_control);
419 }
420 }
422 static inline void __restore_debug_registers(struct vcpu *v)
423 {
424 loaddebug(&v->arch.guest_context, 0);
425 loaddebug(&v->arch.guest_context, 1);
426 loaddebug(&v->arch.guest_context, 2);
427 loaddebug(&v->arch.guest_context, 3);
428 /* No 4 and 5 */
429 loaddebug(&v->arch.guest_context, 6);
430 /* DR7 is loaded from the vmcs. */
431 }
433 /*
434 * DR7 is saved and restored on every vmexit. Other debug registers only
435 * need to be restored if their value is going to affect execution -- i.e.,
436 * if one of the breakpoints is enabled. So mask out all bits that don't
437 * enable some breakpoint functionality.
438 *
439 * This is in part necessary because bit 10 of DR7 is hardwired to 1, so a
440 * simple if( guest_dr7 ) will always return true. As long as we're masking,
441 * we might as well do it right.
442 */
443 #define DR7_ACTIVE_MASK 0xff
445 static inline void vmx_restore_dr(struct vcpu *v)
446 {
447 unsigned long guest_dr7;
449 __vmread(GUEST_DR7, &guest_dr7);
451 /* Assumes guest does not have DR access at time of context switch. */
452 if ( unlikely(guest_dr7 & DR7_ACTIVE_MASK) )
453 __restore_debug_registers(v);
454 }
456 static void vmx_freeze_time(struct vcpu *v)
457 {
458 struct periodic_time *pt=&v->domain->arch.hvm_domain.pl_time.periodic_tm;
460 if ( pt->enabled && pt->first_injected && !v->arch.hvm_vcpu.guest_time ) {
461 v->arch.hvm_vcpu.guest_time = hvm_get_guest_time(v);
462 stop_timer(&(pt->timer));
463 }
464 }
466 static void vmx_ctxt_switch_from(struct vcpu *v)
467 {
468 vmx_freeze_time(v);
469 vmx_save_segments(v);
470 vmx_load_msrs();
471 vmx_save_dr(v);
472 }
474 static void vmx_ctxt_switch_to(struct vcpu *v)
475 {
476 vmx_restore_msrs(v);
477 vmx_restore_dr(v);
478 }
480 void stop_vmx(void)
481 {
482 if (read_cr4() & X86_CR4_VMXE)
483 __vmxoff();
484 }
486 void vmx_migrate_timers(struct vcpu *v)
487 {
488 struct periodic_time *pt = &(v->domain->arch.hvm_domain.pl_time.periodic_tm);
490 if ( pt->enabled ) {
491 migrate_timer(&pt->timer, v->processor);
492 migrate_timer(&v->arch.hvm_vmx.hlt_timer, v->processor);
493 }
494 if ( hvm_apic_support(v->domain) && VLAPIC(v))
495 migrate_timer(&(VLAPIC(v)->vlapic_timer), v->processor);
496 }
498 static void vmx_store_cpu_guest_regs(
499 struct vcpu *v, struct cpu_user_regs *regs, unsigned long *crs)
500 {
501 vmx_vmcs_enter(v);
503 if ( regs != NULL )
504 {
505 __vmread(GUEST_RFLAGS, &regs->eflags);
506 __vmread(GUEST_SS_SELECTOR, &regs->ss);
507 __vmread(GUEST_CS_SELECTOR, &regs->cs);
508 __vmread(GUEST_DS_SELECTOR, &regs->ds);
509 __vmread(GUEST_ES_SELECTOR, &regs->es);
510 __vmread(GUEST_GS_SELECTOR, &regs->gs);
511 __vmread(GUEST_FS_SELECTOR, &regs->fs);
512 __vmread(GUEST_RIP, &regs->eip);
513 __vmread(GUEST_RSP, &regs->esp);
514 }
516 if ( crs != NULL )
517 {
518 __vmread(CR0_READ_SHADOW, &crs[0]);
519 __vmread(GUEST_CR3, &crs[3]);
520 __vmread(CR4_READ_SHADOW, &crs[4]);
521 }
523 vmx_vmcs_exit(v);
524 }
526 /*
527 * The VMX spec (section 4.3.1.2, Checks on Guest Segment
528 * Registers) says that virtual-8086 mode guests' segment
529 * base-address fields in the VMCS must be equal to their
530 * corresponding segment selector field shifted right by
531 * four bits upon vmentry.
532 *
533 * This function (called only for VM86-mode guests) fixes
534 * the bases to be consistent with the selectors in regs
535 * if they're not already. Without this, we can fail the
536 * vmentry check mentioned above.
537 */
538 static void fixup_vm86_seg_bases(struct cpu_user_regs *regs)
539 {
540 int err = 0;
541 unsigned long base;
543 err |= __vmread(GUEST_ES_BASE, &base);
544 if (regs->es << 4 != base)
545 err |= __vmwrite(GUEST_ES_BASE, regs->es << 4);
546 err |= __vmread(GUEST_CS_BASE, &base);
547 if (regs->cs << 4 != base)
548 err |= __vmwrite(GUEST_CS_BASE, regs->cs << 4);
549 err |= __vmread(GUEST_SS_BASE, &base);
550 if (regs->ss << 4 != base)
551 err |= __vmwrite(GUEST_SS_BASE, regs->ss << 4);
552 err |= __vmread(GUEST_DS_BASE, &base);
553 if (regs->ds << 4 != base)
554 err |= __vmwrite(GUEST_DS_BASE, regs->ds << 4);
555 err |= __vmread(GUEST_FS_BASE, &base);
556 if (regs->fs << 4 != base)
557 err |= __vmwrite(GUEST_FS_BASE, regs->fs << 4);
558 err |= __vmread(GUEST_GS_BASE, &base);
559 if (regs->gs << 4 != base)
560 err |= __vmwrite(GUEST_GS_BASE, regs->gs << 4);
562 BUG_ON(err);
563 }
565 void vmx_load_cpu_guest_regs(struct vcpu *v, struct cpu_user_regs *regs)
566 {
567 vmx_vmcs_enter(v);
569 __vmwrite(GUEST_SS_SELECTOR, regs->ss);
570 __vmwrite(GUEST_DS_SELECTOR, regs->ds);
571 __vmwrite(GUEST_ES_SELECTOR, regs->es);
572 __vmwrite(GUEST_GS_SELECTOR, regs->gs);
573 __vmwrite(GUEST_FS_SELECTOR, regs->fs);
575 __vmwrite(GUEST_RSP, regs->esp);
577 __vmwrite(GUEST_RFLAGS, regs->eflags);
578 if (regs->eflags & EF_TF)
579 __vm_set_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
580 else
581 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
582 if (regs->eflags & EF_VM)
583 fixup_vm86_seg_bases(regs);
585 __vmwrite(GUEST_CS_SELECTOR, regs->cs);
586 __vmwrite(GUEST_RIP, regs->eip);
588 vmx_vmcs_exit(v);
589 }
591 int vmx_realmode(struct vcpu *v)
592 {
593 unsigned long rflags;
595 __vmread(GUEST_RFLAGS, &rflags);
596 return rflags & X86_EFLAGS_VM;
597 }
599 int vmx_instruction_length(struct vcpu *v)
600 {
601 unsigned long inst_len;
603 if (__vmread(VM_EXIT_INSTRUCTION_LEN, &inst_len))
604 return 0;
605 return inst_len;
606 }
608 unsigned long vmx_get_ctrl_reg(struct vcpu *v, unsigned int num)
609 {
610 switch ( num )
611 {
612 case 0:
613 return v->arch.hvm_vmx.cpu_cr0;
614 case 2:
615 return v->arch.hvm_vmx.cpu_cr2;
616 case 3:
617 return v->arch.hvm_vmx.cpu_cr3;
618 default:
619 BUG();
620 }
621 return 0; /* dummy */
622 }
624 /* SMP VMX guest support */
625 void vmx_init_ap_context(struct vcpu_guest_context *ctxt,
626 int vcpuid, int trampoline_vector)
627 {
628 int i;
630 memset(ctxt, 0, sizeof(*ctxt));
632 /*
633 * Initial register values:
634 */
635 ctxt->user_regs.eip = VMXASSIST_BASE;
636 ctxt->user_regs.edx = vcpuid;
637 ctxt->user_regs.ebx = trampoline_vector;
639 ctxt->flags = VGCF_HVM_GUEST;
641 /* Virtual IDT is empty at start-of-day. */
642 for ( i = 0; i < 256; i++ )
643 {
644 ctxt->trap_ctxt[i].vector = i;
645 ctxt->trap_ctxt[i].cs = FLAT_KERNEL_CS;
646 }
648 /* No callback handlers. */
649 #if defined(__i386__)
650 ctxt->event_callback_cs = FLAT_KERNEL_CS;
651 ctxt->failsafe_callback_cs = FLAT_KERNEL_CS;
652 #endif
653 }
655 void do_nmi(struct cpu_user_regs *);
657 static int check_vmx_controls(u32 ctrls, u32 msr)
658 {
659 u32 vmx_msr_low, vmx_msr_high;
661 rdmsr(msr, vmx_msr_low, vmx_msr_high);
662 if ( (ctrls < vmx_msr_low) || (ctrls > vmx_msr_high) )
663 {
664 printk("Insufficient VMX capability 0x%x, "
665 "msr=0x%x,low=0x%8x,high=0x%x\n",
666 ctrls, msr, vmx_msr_low, vmx_msr_high);
667 return 0;
668 }
669 return 1;
670 }
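/*
 * Each hypercall gets a 32-byte stub in the transfer page:
 *     mov  $<hypercall-number>, %eax
 *     vmcall                          (encoded as 0f 01 c1)
 *     ret
 * A guest invokes hypercall <nr> by calling into the page at offset nr*32.
 */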
672 static void vmx_init_hypercall_page(struct domain *d, void *hypercall_page)
673 {
674 char *p;
675 int i;
677 memset(hypercall_page, 0, PAGE_SIZE);
679 for ( i = 0; i < (PAGE_SIZE / 32); i++ )
680 {
681 p = (char *)(hypercall_page + (i * 32));
682 *(u8 *)(p + 0) = 0xb8; /* mov imm32, %eax */
683 *(u32 *)(p + 1) = i;
684 *(u8 *)(p + 5) = 0x0f; /* vmcall */
685 *(u8 *)(p + 6) = 0x01;
686 *(u8 *)(p + 7) = 0xc1;
687 *(u8 *)(p + 8) = 0xc3; /* ret */
688 }
690 /* Don't support HYPERVISOR_iret at the moment */
691 *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
692 }
694 int start_vmx(void)
695 {
696 u32 eax, edx;
697 struct vmcs_struct *vmcs;
699 /*
700 * Xen does not fill x86_capability words except 0.
701 */
702 boot_cpu_data.x86_capability[4] = cpuid_ecx(1);
704 if (!(test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability)))
705 return 0;
707 rdmsr(IA32_FEATURE_CONTROL_MSR, eax, edx);
709 if ( eax & IA32_FEATURE_CONTROL_MSR_LOCK )
710 {
711 if ( (eax & IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON) == 0x0 )
712 {
713 printk("VMX disabled by Feature Control MSR.\n");
714 return 0;
715 }
716 }
717 else
718 {
719 wrmsr(IA32_FEATURE_CONTROL_MSR,
720 IA32_FEATURE_CONTROL_MSR_LOCK |
721 IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON, 0);
722 }
724 if ( !check_vmx_controls(MONITOR_PIN_BASED_EXEC_CONTROLS,
725 MSR_IA32_VMX_PINBASED_CTLS_MSR) )
726 return 0;
727 if ( !check_vmx_controls(MONITOR_CPU_BASED_EXEC_CONTROLS,
728 MSR_IA32_VMX_PROCBASED_CTLS_MSR) )
729 return 0;
730 if ( !check_vmx_controls(MONITOR_VM_EXIT_CONTROLS,
731 MSR_IA32_VMX_EXIT_CTLS_MSR) )
732 return 0;
733 if ( !check_vmx_controls(MONITOR_VM_ENTRY_CONTROLS,
734 MSR_IA32_VMX_ENTRY_CTLS_MSR) )
735 return 0;
737 set_in_cr4(X86_CR4_VMXE);
739 vmx_init_vmcs_config();
741 if ( (vmcs = vmx_alloc_host_vmcs()) == NULL )
742 {
743 printk("Failed to allocate host VMCS\n");
744 return 0;
745 }
747 if ( __vmxon(virt_to_maddr(vmcs)) )
748 {
749 printk("VMXON failed\n");
750 vmx_free_host_vmcs(vmcs);
751 return 0;
752 }
754 printk("VMXON is done\n");
756 vmx_save_init_msrs();
758 /* Setup HVM interfaces */
759 hvm_funcs.disable = stop_vmx;
761 hvm_funcs.initialize_guest_resources = vmx_initialize_guest_resources;
762 hvm_funcs.relinquish_guest_resources = vmx_relinquish_guest_resources;
764 hvm_funcs.store_cpu_guest_regs = vmx_store_cpu_guest_regs;
765 hvm_funcs.load_cpu_guest_regs = vmx_load_cpu_guest_regs;
767 hvm_funcs.realmode = vmx_realmode;
768 hvm_funcs.paging_enabled = vmx_paging_enabled;
769 hvm_funcs.instruction_length = vmx_instruction_length;
770 hvm_funcs.get_guest_ctrl_reg = vmx_get_ctrl_reg;
772 hvm_funcs.init_ap_context = vmx_init_ap_context;
774 hvm_funcs.init_hypercall_page = vmx_init_hypercall_page;
776 hvm_enabled = 1;
778 return 1;
779 }
781 /*
782 * Not all VM exits provide a valid value in the VM-exit instruction-length field.
783 */
784 #define __get_instruction_length(len) \
785 __vmread(VM_EXIT_INSTRUCTION_LEN, &(len)); \
786 if ((len) < 1 || (len) > 15) \
787 __hvm_bug(&regs);
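/* Note: the macro above expects a local 'regs' variable in scope and crashes
 * the guest via __hvm_bug() if the reported length falls outside the
 * architectural 1..15 byte range. */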
789 static inline void __update_guest_eip(unsigned long inst_len)
790 {
791 unsigned long current_eip;
793 __vmread(GUEST_RIP, &current_eip);
794 __vmwrite(GUEST_RIP, current_eip + inst_len);
795 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
796 }
799 static int vmx_do_page_fault(unsigned long va, struct cpu_user_regs *regs)
800 {
801 unsigned long gpa; /* FIXME: PAE */
802 int result;
804 #if 0 /* keep for debugging */
805 {
806 unsigned long eip;
808 __vmread(GUEST_RIP, &eip);
809 HVM_DBG_LOG(DBG_LEVEL_VMMU,
810 "vmx_do_page_fault = 0x%lx, eip = %lx, error_code = %lx",
811 va, eip, (unsigned long)regs->error_code);
812 }
813 #endif
815 if ( !vmx_paging_enabled(current) )
816 {
817 /* construct 1-to-1 direct mapping */
818 if ( shadow_direct_map_fault(va, regs) )
819 return 1;
821 handle_mmio(va, va);
822 TRACE_VMEXIT (2,2);
823 return 1;
824 }
825 gpa = gva_to_gpa(va);
827 /* Use 1:1 page table to identify MMIO address space */
828 if ( mmio_space(gpa) ){
829 struct vcpu *v = current;
830 /* No support for APIC */
831 if (!hvm_apic_support(v->domain) && gpa >= 0xFEC00000) {
832 u32 inst_len;
833 __vmread(VM_EXIT_INSTRUCTION_LEN, &(inst_len));
834 __update_guest_eip(inst_len);
835 return 1;
836 }
837 TRACE_VMEXIT (2,2);
838 /* in the case of MMIO, we are more interested in gpa than in va */
839 TRACE_VMEXIT (4,gpa);
840 handle_mmio(va, gpa);
841 return 1;
842 }
844 result = shadow_fault(va, regs);
845 TRACE_VMEXIT (2,result);
846 #if 0
847 if ( !result )
848 {
849 __vmread(GUEST_RIP, &eip);
850 printk("vmx pgfault to guest va=%lx eip=%lx\n", va, eip);
851 }
852 #endif
854 return result;
855 }
857 static void vmx_do_no_device_fault(void)
858 {
859 unsigned long cr0;
860 struct vcpu *v = current;
862 setup_fpu(current);
863 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
865 /* Disable TS in guest CR0 unless the guest wants the exception too. */
866 __vmread_vcpu(v, CR0_READ_SHADOW, &cr0);
867 if ( !(cr0 & X86_CR0_TS) )
868 {
869 __vmread_vcpu(v, GUEST_CR0, &cr0);
870 cr0 &= ~X86_CR0_TS;
871 __vmwrite(GUEST_CR0, cr0);
872 }
873 }
875 #define bitmaskof(idx) (1U << ((idx)&31))
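/* bitmaskof() maps an X86_FEATURE_* index to its bit position within the
 * 32-bit CPUID feature word that holds it. */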
876 static void vmx_vmexit_do_cpuid(struct cpu_user_regs *regs)
877 {
878 unsigned int input = (unsigned int)regs->eax;
879 unsigned int count = (unsigned int)regs->ecx;
880 unsigned int eax, ebx, ecx, edx;
881 unsigned long eip;
882 struct vcpu *v = current;
884 __vmread(GUEST_RIP, &eip);
886 HVM_DBG_LOG(DBG_LEVEL_3, "(eax) 0x%08lx, (ebx) 0x%08lx, "
887 "(ecx) 0x%08lx, (edx) 0x%08lx, (esi) 0x%08lx, (edi) 0x%08lx",
888 (unsigned long)regs->eax, (unsigned long)regs->ebx,
889 (unsigned long)regs->ecx, (unsigned long)regs->edx,
890 (unsigned long)regs->esi, (unsigned long)regs->edi);
892 if ( input == CPUID_LEAF_0x4 )
893 {
894 cpuid_count(input, count, &eax, &ebx, &ecx, &edx);
895 eax &= NUM_CORES_RESET_MASK;
896 }
897 else if ( !cpuid_hypervisor_leaves(input, &eax, &ebx, &ecx, &edx) )
898 {
899 cpuid(input, &eax, &ebx, &ecx, &edx);
901 if ( input == CPUID_LEAF_0x1 )
902 {
903 /* mask off reserved bits */
904 ecx &= ~VMX_VCPU_CPUID_L1_ECX_RESERVED;
906 if ( !hvm_apic_support(v->domain) ||
907 !vlapic_global_enabled((VLAPIC(v))) )
908 {
909 /* Since the apic is disabled, avoid any
910 confusion about SMP cpus being available */
912 clear_bit(X86_FEATURE_APIC, &edx);
913 }
915 #if CONFIG_PAGING_LEVELS < 3
916 edx &= ~(bitmaskof(X86_FEATURE_PAE) |
917 bitmaskof(X86_FEATURE_PSE) |
918 bitmaskof(X86_FEATURE_PSE36));
919 #else
920 if ( v->domain->arch.ops->guest_paging_levels == PAGING_L2 )
921 {
922 if ( v->domain->arch.hvm_domain.pae_enabled )
923 clear_bit(X86_FEATURE_PSE36, &edx);
924 else
925 {
926 clear_bit(X86_FEATURE_PAE, &edx);
927 clear_bit(X86_FEATURE_PSE, &edx);
928 clear_bit(X86_FEATURE_PSE36, &edx);
929 }
930 }
931 #endif
933 ebx &= NUM_THREADS_RESET_MASK;
935 /* Unsupportable for virtualised CPUs. */
936 ecx &= ~(bitmaskof(X86_FEATURE_VMXE) |
937 bitmaskof(X86_FEATURE_EST) |
938 bitmaskof(X86_FEATURE_TM2) |
939 bitmaskof(X86_FEATURE_CID) |
940 bitmaskof(X86_FEATURE_MWAIT) );
942 edx &= ~( bitmaskof(X86_FEATURE_HT) |
943 bitmaskof(X86_FEATURE_MCA) |
944 bitmaskof(X86_FEATURE_MCE) |
945 bitmaskof(X86_FEATURE_ACPI) |
946 bitmaskof(X86_FEATURE_ACC) );
947 }
948 else if ( ( input == CPUID_LEAF_0x6 )
949 || ( input == CPUID_LEAF_0x9 )
950 || ( input == CPUID_LEAF_0xA ))
951 {
952 eax = ebx = ecx = edx = 0x0;
953 }
954 #ifdef __i386__
955 else if ( input == CPUID_LEAF_0x80000001 )
956 {
957 clear_bit(X86_FEATURE_LAHF_LM & 31, &ecx);
959 clear_bit(X86_FEATURE_LM & 31, &edx);
960 clear_bit(X86_FEATURE_SYSCALL & 31, &edx);
961 }
962 #endif
963 }
965 regs->eax = (unsigned long) eax;
966 regs->ebx = (unsigned long) ebx;
967 regs->ecx = (unsigned long) ecx;
968 regs->edx = (unsigned long) edx;
970 HVM_DBG_LOG(DBG_LEVEL_3, "eip@%lx, input: 0x%lx, "
971 "output: eax = 0x%08lx, ebx = 0x%08lx, "
972 "ecx = 0x%08lx, edx = 0x%08lx",
973 (unsigned long)eip, (unsigned long)input,
974 (unsigned long)eax, (unsigned long)ebx,
975 (unsigned long)ecx, (unsigned long)edx);
976 }
978 #define CASE_GET_REG_P(REG, reg) \
979 case REG_ ## REG: reg_p = (unsigned long *)&(regs->reg); break
981 #ifdef __i386__
982 #define CASE_EXTEND_GET_REG_P
983 #else
984 #define CASE_EXTEND_GET_REG_P \
985 CASE_GET_REG_P(R8, r8); \
986 CASE_GET_REG_P(R9, r9); \
987 CASE_GET_REG_P(R10, r10); \
988 CASE_GET_REG_P(R11, r11); \
989 CASE_GET_REG_P(R12, r12); \
990 CASE_GET_REG_P(R13, r13); \
991 CASE_GET_REG_P(R14, r14); \
992 CASE_GET_REG_P(R15, r15)
993 #endif
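/* CASE_EXTEND_GET_REG_P adds the x86-64-only registers r8-r15 to the
 * register-decode switches below; on i386 it expands to nothing. */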
995 static void vmx_dr_access(unsigned long exit_qualification,
996 struct cpu_user_regs *regs)
997 {
998 struct vcpu *v = current;
1000 v->arch.hvm_vcpu.flag_dr_dirty = 1;
1002 /* We could probably be smarter about this */
1003 __restore_debug_registers(v);
1005 /* Allow guest direct access to DR registers */
1006 v->arch.hvm_vcpu.u.vmx.exec_control &= ~CPU_BASED_MOV_DR_EXITING;
1007 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
1008 v->arch.hvm_vcpu.u.vmx.exec_control);
1009 }
1011 /*
1012 * Invalidate the TLB for va. Invalidate the shadow page corresponding
1013 * to the address va.
1014 */
1015 static void vmx_vmexit_do_invlpg(unsigned long va)
1016 {
1017 unsigned long eip;
1018 struct vcpu *v = current;
1020 __vmread(GUEST_RIP, &eip);
1022 HVM_DBG_LOG(DBG_LEVEL_VMMU, "vmx_vmexit_do_invlpg: eip=%lx, va=%lx",
1023 eip, va);
1025 /*
1026 * We do the safest things first, then try to update the shadow
1027 * by copying from the guest.
1028 */
1029 shadow_invlpg(v, va);
1030 }
1032 static int check_for_null_selector(unsigned long eip)
1033 {
1034 unsigned char inst[MAX_INST_LEN];
1035 unsigned long sel;
1036 int i, inst_len;
1037 int inst_copy_from_guest(unsigned char *, unsigned long, int);
1039 __vmread(VM_EXIT_INSTRUCTION_LEN, &inst_len);
1040 memset(inst, 0, MAX_INST_LEN);
1041 if (inst_copy_from_guest(inst, eip, inst_len) != inst_len) {
1042 printf("check_for_null_selector: get guest instruction failed\n");
1043 domain_crash_synchronous();
1044 }
1046 for (i = 0; i < inst_len; i++) {
1047 switch (inst[i]) {
1048 case 0xf3: /* REPZ */
1049 case 0xf2: /* REPNZ */
1050 case 0xf0: /* LOCK */
1051 case 0x66: /* data32 */
1052 case 0x67: /* addr32 */
1053 continue;
1054 case 0x2e: /* CS */
1055 __vmread(GUEST_CS_SELECTOR, &sel);
1056 break;
1057 case 0x36: /* SS */
1058 __vmread(GUEST_SS_SELECTOR, &sel);
1059 break;
1060 case 0x26: /* ES */
1061 __vmread(GUEST_ES_SELECTOR, &sel);
1062 break;
1063 case 0x64: /* FS */
1064 __vmread(GUEST_FS_SELECTOR, &sel);
1065 break;
1066 case 0x65: /* GS */
1067 __vmread(GUEST_GS_SELECTOR, &sel);
1068 break;
1069 case 0x3e: /* DS */
1070 /* FALLTHROUGH */
1071 default:
1072 /* DS is the default */
1073 __vmread(GUEST_DS_SELECTOR, &sel);
1074 }
1075 return sel == 0 ? 1 : 0;
1076 }
1078 return 0;
1079 }
1081 extern void send_pio_req(struct cpu_user_regs *regs, unsigned long port,
1082 unsigned long count, int size, long value,
1083 int dir, int pvalid);
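/* For string I/O (INS/OUTS) 'pvalid' is non-zero and 'value' carries a guest
 * linear address to copy from/to; for ordinary IN/OUT it is zero and 'value'
 * carries the data, as the calls in vmx_io_instruction() below illustrate. */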
1085 static void vmx_io_instruction(unsigned long exit_qualification,
1086 unsigned long inst_len)
1088 struct cpu_user_regs *regs;
1089 struct hvm_io_op *pio_opp;
1090 unsigned long eip, cs, eflags;
1091 unsigned long port, size, dir;
1092 int vm86;
1094 pio_opp = &current->arch.hvm_vcpu.io_op;
1095 pio_opp->instr = INSTR_PIO;
1096 pio_opp->flags = 0;
1098 regs = &pio_opp->io_context;
1100 /* Copy current guest state into io instruction state structure. */
1101 memcpy(regs, guest_cpu_user_regs(), HVM_CONTEXT_STACK_BYTES);
1103 __vmread(GUEST_RIP, &eip);
1104 __vmread(GUEST_CS_SELECTOR, &cs);
1105 __vmread(GUEST_RFLAGS, &eflags);
1106 vm86 = eflags & X86_EFLAGS_VM ? 1 : 0;
1108 HVM_DBG_LOG(DBG_LEVEL_IO,
1109 "vmx_io_instruction: vm86 %d, eip=%lx:%lx, "
1110 "exit_qualification = %lx",
1111 vm86, cs, eip, exit_qualification);
1113 if (test_bit(6, &exit_qualification))
1114 port = (exit_qualification >> 16) & 0xFFFF;
1115 else
1116 port = regs->edx & 0xffff;
1117 TRACE_VMEXIT(1, port);
1118 size = (exit_qualification & 7) + 1;
1119 dir = test_bit(3, &exit_qualification); /* direction */
1121 if (test_bit(4, &exit_qualification)) { /* string instruction */
1122 unsigned long addr, count = 1;
1123 int sign = regs->eflags & EF_DF ? -1 : 1;
1125 __vmread(GUEST_LINEAR_ADDRESS, &addr);
1127 /*
1128 * In protected mode, guest linear address is invalid if the
1129 * selector is null.
1130 */
1131 if (!vm86 && check_for_null_selector(eip))
1132 addr = dir == IOREQ_WRITE ? regs->esi : regs->edi;
1134 if (test_bit(5, &exit_qualification)) { /* "rep" prefix */
1135 pio_opp->flags |= REPZ;
1136 count = vm86 ? regs->ecx & 0xFFFF : regs->ecx;
1139 /*
1140 * Handle string pio instructions that cross pages or that
1141 * are unaligned. See the comments in hvm_domain.c/handle_mmio()
1142 */
1143 if ((addr & PAGE_MASK) != ((addr + size - 1) & PAGE_MASK)) {
1144 unsigned long value = 0;
1146 pio_opp->flags |= OVERLAP;
1147 if (dir == IOREQ_WRITE)
1148 hvm_copy(&value, addr, size, HVM_COPY_IN);
1149 send_pio_req(regs, port, 1, size, value, dir, 0);
1150 } else {
1151 if ((addr & PAGE_MASK) != ((addr + count * size - 1) & PAGE_MASK)) {
1152 if (sign > 0)
1153 count = (PAGE_SIZE - (addr & ~PAGE_MASK)) / size;
1154 else
1155 count = (addr & ~PAGE_MASK) / size;
1156 } else
1157 __update_guest_eip(inst_len);
1159 send_pio_req(regs, port, count, size, addr, dir, 1);
1161 } else {
1162 if (port == 0xe9 && dir == IOREQ_WRITE && size == 1)
1163 hvm_print_line(current, regs->eax); /* guest debug output */
1165 __update_guest_eip(inst_len);
1166 send_pio_req(regs, port, 1, size, regs->eax, dir, 0);
1170 int
1171 vmx_world_save(struct vcpu *v, struct vmx_assist_context *c)
1173 unsigned long inst_len;
1174 int error = 0;
1176 error |= __vmread(VM_EXIT_INSTRUCTION_LEN, &inst_len);
1177 error |= __vmread(GUEST_RIP, &c->eip);
1178 c->eip += inst_len; /* skip transition instruction */
1179 error |= __vmread(GUEST_RSP, &c->esp);
1180 error |= __vmread(GUEST_RFLAGS, &c->eflags);
1182 error |= __vmread(CR0_READ_SHADOW, &c->cr0);
1183 c->cr3 = v->arch.hvm_vmx.cpu_cr3;
1184 error |= __vmread(CR4_READ_SHADOW, &c->cr4);
1186 error |= __vmread(GUEST_IDTR_LIMIT, &c->idtr_limit);
1187 error |= __vmread(GUEST_IDTR_BASE, &c->idtr_base);
1189 error |= __vmread(GUEST_GDTR_LIMIT, &c->gdtr_limit);
1190 error |= __vmread(GUEST_GDTR_BASE, &c->gdtr_base);
1192 error |= __vmread(GUEST_CS_SELECTOR, &c->cs_sel);
1193 error |= __vmread(GUEST_CS_LIMIT, &c->cs_limit);
1194 error |= __vmread(GUEST_CS_BASE, &c->cs_base);
1195 error |= __vmread(GUEST_CS_AR_BYTES, &c->cs_arbytes.bytes);
1197 error |= __vmread(GUEST_DS_SELECTOR, &c->ds_sel);
1198 error |= __vmread(GUEST_DS_LIMIT, &c->ds_limit);
1199 error |= __vmread(GUEST_DS_BASE, &c->ds_base);
1200 error |= __vmread(GUEST_DS_AR_BYTES, &c->ds_arbytes.bytes);
1202 error |= __vmread(GUEST_ES_SELECTOR, &c->es_sel);
1203 error |= __vmread(GUEST_ES_LIMIT, &c->es_limit);
1204 error |= __vmread(GUEST_ES_BASE, &c->es_base);
1205 error |= __vmread(GUEST_ES_AR_BYTES, &c->es_arbytes.bytes);
1207 error |= __vmread(GUEST_SS_SELECTOR, &c->ss_sel);
1208 error |= __vmread(GUEST_SS_LIMIT, &c->ss_limit);
1209 error |= __vmread(GUEST_SS_BASE, &c->ss_base);
1210 error |= __vmread(GUEST_SS_AR_BYTES, &c->ss_arbytes.bytes);
1212 error |= __vmread(GUEST_FS_SELECTOR, &c->fs_sel);
1213 error |= __vmread(GUEST_FS_LIMIT, &c->fs_limit);
1214 error |= __vmread(GUEST_FS_BASE, &c->fs_base);
1215 error |= __vmread(GUEST_FS_AR_BYTES, &c->fs_arbytes.bytes);
1217 error |= __vmread(GUEST_GS_SELECTOR, &c->gs_sel);
1218 error |= __vmread(GUEST_GS_LIMIT, &c->gs_limit);
1219 error |= __vmread(GUEST_GS_BASE, &c->gs_base);
1220 error |= __vmread(GUEST_GS_AR_BYTES, &c->gs_arbytes.bytes);
1222 error |= __vmread(GUEST_TR_SELECTOR, &c->tr_sel);
1223 error |= __vmread(GUEST_TR_LIMIT, &c->tr_limit);
1224 error |= __vmread(GUEST_TR_BASE, &c->tr_base);
1225 error |= __vmread(GUEST_TR_AR_BYTES, &c->tr_arbytes.bytes);
1227 error |= __vmread(GUEST_LDTR_SELECTOR, &c->ldtr_sel);
1228 error |= __vmread(GUEST_LDTR_LIMIT, &c->ldtr_limit);
1229 error |= __vmread(GUEST_LDTR_BASE, &c->ldtr_base);
1230 error |= __vmread(GUEST_LDTR_AR_BYTES, &c->ldtr_arbytes.bytes);
1232 return !error;
1235 int
1236 vmx_world_restore(struct vcpu *v, struct vmx_assist_context *c)
1238 unsigned long mfn, old_cr4, old_base_mfn;
1239 int error = 0;
1241 error |= __vmwrite(GUEST_RIP, c->eip);
1242 error |= __vmwrite(GUEST_RSP, c->esp);
1243 error |= __vmwrite(GUEST_RFLAGS, c->eflags);
1245 error |= __vmwrite(CR0_READ_SHADOW, c->cr0);
1247 if (!vmx_paging_enabled(v)) {
1248 HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
1249 __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table));
1250 goto skip_cr3;
1253 if (c->cr3 == v->arch.hvm_vmx.cpu_cr3) {
1254 /*
1255 * This is a simple TLB flush, implying the guest has
1256 * removed some translation or changed page attributes.
1257 * We simply invalidate the shadow.
1258 */
1259 mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
1260 if (mfn != pagetable_get_pfn(v->arch.guest_table)) {
1261 printk("Invalid CR3 value=%x", c->cr3);
1262 domain_crash_synchronous();
1263 return 0;
1265 shadow_sync_all(v->domain);
1266 } else {
1267 /*
1268 * If different, make a shadow. Check if the PDBR is valid
1269 * first.
1270 */
1271 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %x", c->cr3);
1272 if ((c->cr3 >> PAGE_SHIFT) > v->domain->max_pages) {
1273 printk("Invalid CR3 value=%x", c->cr3);
1274 domain_crash_synchronous();
1275 return 0;
1277 mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
1278 if(!get_page(mfn_to_page(mfn), v->domain))
1279 return 0;
1280 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1281 v->arch.guest_table = pagetable_from_pfn(mfn);
1282 if (old_base_mfn)
1283 put_page(mfn_to_page(old_base_mfn));
1284 /*
1285 * arch.shadow_table should now hold the next CR3 for shadow
1286 */
1287 v->arch.hvm_vmx.cpu_cr3 = c->cr3;
1288 update_pagetables(v);
1289 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3);
1290 __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
1293 skip_cr3:
1295 error |= __vmread(CR4_READ_SHADOW, &old_cr4);
1296 error |= __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
1297 error |= __vmwrite(CR4_READ_SHADOW, c->cr4);
1299 error |= __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
1300 error |= __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
1302 error |= __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
1303 error |= __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
1305 error |= __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
1306 error |= __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
1307 error |= __vmwrite(GUEST_CS_BASE, c->cs_base);
1308 error |= __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes.bytes);
1310 error |= __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
1311 error |= __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
1312 error |= __vmwrite(GUEST_DS_BASE, c->ds_base);
1313 error |= __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes.bytes);
1315 error |= __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
1316 error |= __vmwrite(GUEST_ES_LIMIT, c->es_limit);
1317 error |= __vmwrite(GUEST_ES_BASE, c->es_base);
1318 error |= __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes.bytes);
1320 error |= __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
1321 error |= __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
1322 error |= __vmwrite(GUEST_SS_BASE, c->ss_base);
1323 error |= __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes.bytes);
1325 error |= __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
1326 error |= __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
1327 error |= __vmwrite(GUEST_FS_BASE, c->fs_base);
1328 error |= __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes.bytes);
1330 error |= __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
1331 error |= __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
1332 error |= __vmwrite(GUEST_GS_BASE, c->gs_base);
1333 error |= __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes.bytes);
1335 error |= __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
1336 error |= __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
1337 error |= __vmwrite(GUEST_TR_BASE, c->tr_base);
1338 error |= __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes.bytes);
1340 error |= __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
1341 error |= __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
1342 error |= __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
1343 error |= __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes.bytes);
1345 return !error;
1348 enum { VMX_ASSIST_INVOKE = 0, VMX_ASSIST_RESTORE };
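/* VMX_ASSIST_INVOKE hands the vcpu over to the vmxassist real-mode emulator
 * (VMX cannot execute real-mode code directly); VMX_ASSIST_RESTORE returns to
 * the context that was saved when vmxassist was invoked. */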
1350 int
1351 vmx_assist(struct vcpu *v, int mode)
1353 struct vmx_assist_context c;
1354 u32 magic;
1355 u32 cp;
1357 /* make sure vmxassist exists (this is not an error) */
1358 if (!hvm_copy(&magic, VMXASSIST_MAGIC_OFFSET, sizeof(magic), HVM_COPY_IN))
1359 return 0;
1360 if (magic != VMXASSIST_MAGIC)
1361 return 0;
1363 switch (mode) {
1364 /*
1365 * Transfer control to vmxassist.
1366 * Store the current context in VMXASSIST_OLD_CONTEXT and load
1367 * the new VMXASSIST_NEW_CONTEXT context. This context was created
1368 * by vmxassist and will transfer control to it.
1369 */
1370 case VMX_ASSIST_INVOKE:
1371 /* save the old context */
1372 if (!hvm_copy(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp), HVM_COPY_IN))
1373 goto error;
1374 if (cp != 0) {
1375 if (!vmx_world_save(v, &c))
1376 goto error;
1377 if (!hvm_copy(&c, cp, sizeof(c), HVM_COPY_OUT))
1378 goto error;
1381 /* restore the new context, this should activate vmxassist */
1382 if (!hvm_copy(&cp, VMXASSIST_NEW_CONTEXT, sizeof(cp), HVM_COPY_IN))
1383 goto error;
1384 if (cp != 0) {
1385 if (!hvm_copy(&c, cp, sizeof(c), HVM_COPY_IN))
1386 goto error;
1387 if (!vmx_world_restore(v, &c))
1388 goto error;
1389 return 1;
1391 break;
1393 /*
1394 * Restore the VMXASSIST_OLD_CONTEXT that was saved by VMX_ASSIST_INVOKE
1395 * above.
1396 */
1397 case VMX_ASSIST_RESTORE:
1398 /* restore the old context */
1399 if (!hvm_copy(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp), HVM_COPY_IN))
1400 goto error;
1401 if (cp != 0) {
1402 if (!hvm_copy(&c, cp, sizeof(c), HVM_COPY_IN))
1403 goto error;
1404 if (!vmx_world_restore(v, &c))
1405 goto error;
1406 return 1;
1408 break;
1411 error:
1412 printf("Failed to transfer to vmxassist\n");
1413 domain_crash_synchronous();
1414 return 0;
1417 static int vmx_set_cr0(unsigned long value)
1419 struct vcpu *v = current;
1420 unsigned long mfn;
1421 unsigned long eip;
1422 int paging_enabled;
1423 unsigned long vm_entry_value;
1424 unsigned long old_cr0;
1426 /*
1427 * CR0: We don't want to lose PE and PG.
1428 */
1429 __vmread_vcpu(v, CR0_READ_SHADOW, &old_cr0);
1430 paging_enabled = (old_cr0 & X86_CR0_PE) && (old_cr0 & X86_CR0_PG);
1432 /* TS cleared? Then initialise FPU now. */
1433 if ( !(value & X86_CR0_TS) )
1435 setup_fpu(v);
1436 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
1439 __vmwrite(GUEST_CR0, value | X86_CR0_PE | X86_CR0_PG | X86_CR0_NE);
1440 __vmwrite(CR0_READ_SHADOW, value);
1442 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx\n", value);
1444 if ( (value & X86_CR0_PE) && (value & X86_CR0_PG) && !paging_enabled )
1446 /*
1447 * Trying to enable guest paging.
1448 * The guest CR3 must point to a valid guest-physical page.
1449 */
1450 if ( !VALID_MFN(mfn = get_mfn_from_gpfn(
1451 v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)) ||
1452 !get_page(mfn_to_page(mfn), v->domain) )
1454 printk("Invalid CR3 value = %lx", v->arch.hvm_vmx.cpu_cr3);
1455 domain_crash_synchronous(); /* need to take a clean path */
1458 #if defined(__x86_64__)
1459 if ( test_bit(VMX_CPU_STATE_LME_ENABLED,
1460 &v->arch.hvm_vmx.cpu_state) &&
1461 !test_bit(VMX_CPU_STATE_PAE_ENABLED,
1462 &v->arch.hvm_vmx.cpu_state) )
1464 HVM_DBG_LOG(DBG_LEVEL_1, "Enable paging before PAE enabled\n");
1465 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
1468 if ( test_bit(VMX_CPU_STATE_LME_ENABLED,
1469 &v->arch.hvm_vmx.cpu_state) )
1471 /* PAE should already be enabled at this point. */
1472 HVM_DBG_LOG(DBG_LEVEL_1, "Enable long mode\n");
1473 set_bit(VMX_CPU_STATE_LMA_ENABLED,
1474 &v->arch.hvm_vmx.cpu_state);
1476 __vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
1477 vm_entry_value |= VM_ENTRY_CONTROLS_IA32E_MODE;
1478 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1480 if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L4) )
1482 printk("Unsupported guest paging levels\n");
1483 domain_crash_synchronous(); /* need to take a clean path */
1486 else
1487 #endif /* __x86_64__ */
1489 #if CONFIG_PAGING_LEVELS >= 3
1490 /* Seems it's a plain 32-bit or a 32-bit PAE guest. */
1492 if ( test_bit(VMX_CPU_STATE_PAE_ENABLED,
1493 &v->arch.hvm_vmx.cpu_state) )
1495 /* The guest enabled PAE first and then enabled PG, so it really
1496 * is a PAE guest. */
1497 if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) )
1499 printk("Unsupported guest paging levels\n");
1500 domain_crash_synchronous();
1503 else
1505 if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L2) )
1507 printk("Unsupported guest paging levels\n");
1508 domain_crash_synchronous(); /* need to take a clean path */
1511 #endif
1514 /*
1515 * Now arch.guest_table points to machine physical.
1516 */
1517 v->arch.guest_table = pagetable_from_pfn(mfn);
1518 update_pagetables(v);
1520 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
1521 (unsigned long) (mfn << PAGE_SHIFT));
1523 __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
1524 /*
1525 * arch->shadow_table should hold the next CR3 for shadow
1526 */
1527 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
1528 v->arch.hvm_vmx.cpu_cr3, mfn);
1531 if ( !((value & X86_CR0_PE) && (value & X86_CR0_PG)) && paging_enabled )
1532 if ( v->arch.hvm_vmx.cpu_cr3 ) {
1533 put_page(mfn_to_page(get_mfn_from_gpfn(
1534 v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)));
1535 v->arch.guest_table = pagetable_null();
1538 /*
1539 * VMX does not implement real-mode virtualization. We emulate
1540 * real-mode by performing a world switch to VMXAssist whenever
1541 * a partition disables the CR0.PE bit.
1542 */
1543 if ( (value & X86_CR0_PE) == 0 )
1545 if ( value & X86_CR0_PG ) {
1546 /* inject GP here */
1547 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
1548 return 0;
1549 } else {
1550 /*
1551 * Disable paging here.
1552 * Same as PE == 1 && PG == 0.
1553 */
1554 if ( test_bit(VMX_CPU_STATE_LMA_ENABLED,
1555 &v->arch.hvm_vmx.cpu_state) )
1557 clear_bit(VMX_CPU_STATE_LMA_ENABLED,
1558 &v->arch.hvm_vmx.cpu_state);
1559 __vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
1560 vm_entry_value &= ~VM_ENTRY_CONTROLS_IA32E_MODE;
1561 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1565 clear_all_shadow_status(v->domain);
1566 if ( vmx_assist(v, VMX_ASSIST_INVOKE) ) {
1567 set_bit(VMX_CPU_STATE_ASSIST_ENABLED, &v->arch.hvm_vmx.cpu_state);
1568 __vmread(GUEST_RIP, &eip);
1569 HVM_DBG_LOG(DBG_LEVEL_1,
1570 "Transfering control to vmxassist %%eip 0x%lx\n", eip);
1571 return 0; /* do not update eip! */
1573 } else if ( test_bit(VMX_CPU_STATE_ASSIST_ENABLED,
1574 &v->arch.hvm_vmx.cpu_state) )
1576 __vmread(GUEST_RIP, &eip);
1577 HVM_DBG_LOG(DBG_LEVEL_1,
1578 "Enabling CR0.PE at %%eip 0x%lx\n", eip);
1579 if ( vmx_assist(v, VMX_ASSIST_RESTORE) )
1581 clear_bit(VMX_CPU_STATE_ASSIST_ENABLED,
1582 &v->arch.hvm_vmx.cpu_state);
1583 __vmread(GUEST_RIP, &eip);
1584 HVM_DBG_LOG(DBG_LEVEL_1,
1585 "Restoring to %%eip 0x%lx\n", eip);
1586 return 0; /* do not update eip! */
1589 else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
1591 /* we should take care of this kind of situation */
1592 clear_all_shadow_status(v->domain);
1593 __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table));
1596 return 1;
1599 #define CASE_SET_REG(REG, reg) \
1600 case REG_ ## REG: regs->reg = value; break
1601 #define CASE_GET_REG(REG, reg) \
1602 case REG_ ## REG: value = regs->reg; break
1604 #define CASE_EXTEND_SET_REG \
1605 CASE_EXTEND_REG(S)
1606 #define CASE_EXTEND_GET_REG \
1607 CASE_EXTEND_REG(G)
1609 #ifdef __i386__
1610 #define CASE_EXTEND_REG(T)
1611 #else
1612 #define CASE_EXTEND_REG(T) \
1613 CASE_ ## T ## ET_REG(R8, r8); \
1614 CASE_ ## T ## ET_REG(R9, r9); \
1615 CASE_ ## T ## ET_REG(R10, r10); \
1616 CASE_ ## T ## ET_REG(R11, r11); \
1617 CASE_ ## T ## ET_REG(R12, r12); \
1618 CASE_ ## T ## ET_REG(R13, r13); \
1619 CASE_ ## T ## ET_REG(R14, r14); \
1620 CASE_ ## T ## ET_REG(R15, r15)
1621 #endif
1623 /*
1624 * Write to control registers
1625 */
1626 static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
1628 unsigned long value;
1629 unsigned long old_cr;
1630 struct vcpu *v = current;
1632 switch ( gp ) {
1633 CASE_GET_REG(EAX, eax);
1634 CASE_GET_REG(ECX, ecx);
1635 CASE_GET_REG(EDX, edx);
1636 CASE_GET_REG(EBX, ebx);
1637 CASE_GET_REG(EBP, ebp);
1638 CASE_GET_REG(ESI, esi);
1639 CASE_GET_REG(EDI, edi);
1640 CASE_EXTEND_GET_REG;
1641 case REG_ESP:
1642 __vmread(GUEST_RSP, &value);
1643 break;
1644 default:
1645 printk("invalid gp: %d\n", gp);
1646 __hvm_bug(regs);
1649 HVM_DBG_LOG(DBG_LEVEL_1, "CR%d, value = %lx", cr, value);
1651 switch ( cr ) {
1652 case 0:
1653 return vmx_set_cr0(value);
1654 case 3:
1656 unsigned long old_base_mfn, mfn;
1658 /*
1659 * If paging is not enabled yet, simply copy the value to CR3.
1660 */
1661 if (!vmx_paging_enabled(v)) {
1662 v->arch.hvm_vmx.cpu_cr3 = value;
1663 break;
1666 /*
1667 * We make a new one if the shadow does not exist.
1668 */
1669 if (value == v->arch.hvm_vmx.cpu_cr3) {
1670 /*
1671 * This is a simple TLB flush, implying the guest has
1672 * removed some translation or changed page attributes.
1673 * We simply invalidate the shadow.
1674 */
1675 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
1676 if (mfn != pagetable_get_pfn(v->arch.guest_table))
1677 __hvm_bug(regs);
1678 shadow_sync_all(v->domain);
1679 } else {
1680 /*
1681 * If different, make a shadow. Check if the PDBR is valid
1682 * first.
1683 */
1684 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
1685 if ( ((value >> PAGE_SHIFT) > v->domain->max_pages ) ||
1686 !VALID_MFN(mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT)) ||
1687 !get_page(mfn_to_page(mfn), v->domain) )
1689 printk("Invalid CR3 value=%lx", value);
1690 domain_crash_synchronous(); /* need to take a clean path */
1692 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1693 v->arch.guest_table = pagetable_from_pfn(mfn);
1694 if (old_base_mfn)
1695 put_page(mfn_to_page(old_base_mfn));
1696 /*
1697 * arch.shadow_table should now hold the next CR3 for shadow
1698 */
1699 #if CONFIG_PAGING_LEVELS >= 3
1700 if ( v->domain->arch.ops->guest_paging_levels == PAGING_L3 )
1701 shadow_sync_all(v->domain);
1702 #endif
1704 v->arch.hvm_vmx.cpu_cr3 = value;
1705 update_pagetables(v);
1706 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx",
1707 value);
1708 __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
1710 break;
1712 case 4: /* CR4 */
1714 __vmread(CR4_READ_SHADOW, &old_cr);
1716 if ( value & X86_CR4_PAE && !(old_cr & X86_CR4_PAE) )
1718 set_bit(VMX_CPU_STATE_PAE_ENABLED, &v->arch.hvm_vmx.cpu_state);
1720 if ( vmx_pgbit_test(v) )
1722 /* The guest is a 32-bit PAE guest. */
1723 #if CONFIG_PAGING_LEVELS >= 3
1724 unsigned long mfn, old_base_mfn;
1726 if( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) )
1728 printk("Unsupported guest paging levels\n");
1729 domain_crash_synchronous(); /* need to take a clean path */
1732 if ( !VALID_MFN(mfn = get_mfn_from_gpfn(
1733 v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)) ||
1734 !get_page(mfn_to_page(mfn), v->domain) )
1736 printk("Invalid CR3 value = %lx", v->arch.hvm_vmx.cpu_cr3);
1737 domain_crash_synchronous(); /* need to take a clean path */
1740 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1741 if ( old_base_mfn )
1742 put_page(mfn_to_page(old_base_mfn));
1744 /*
1745 * Now arch.guest_table points to machine physical.
1746 */
1748 v->arch.guest_table = pagetable_from_pfn(mfn);
1749 update_pagetables(v);
1751 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
1752 (unsigned long) (mfn << PAGE_SHIFT));
1754 __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
1756 /*
1757 * arch->shadow_table should hold the next CR3 for shadow
1758 */
1760 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
1761 v->arch.hvm_vmx.cpu_cr3, mfn);
1762 #endif
1764 else
1766 /* The guest is a 64 bit or 32-bit PAE guest. */
1767 #if CONFIG_PAGING_LEVELS >= 3
1768 if ( (v->domain->arch.ops != NULL) &&
1769 v->domain->arch.ops->guest_paging_levels == PAGING_L2)
1771 /* Seems the guest first enables PAE without enabling PG;
1772 * it must enable PG after that, so it is a 32-bit PAE
1773 * guest. */
1775 if ( !shadow_set_guest_paging_levels(v->domain,
1776 PAGING_L3) )
1778 printk("Unsupported guest paging levels\n");
1779 /* need to take a clean path */
1780 domain_crash_synchronous();
1783 #endif
1786 else if ( value & X86_CR4_PAE )
1787 set_bit(VMX_CPU_STATE_PAE_ENABLED, &v->arch.hvm_vmx.cpu_state);
1788 else
1790 if ( test_bit(VMX_CPU_STATE_LMA_ENABLED, &v->arch.hvm_vmx.cpu_state) )
1791 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
1793 clear_bit(VMX_CPU_STATE_PAE_ENABLED, &v->arch.hvm_vmx.cpu_state);
1796 __vmwrite(GUEST_CR4, value| VMX_CR4_HOST_MASK);
1797 __vmwrite(CR4_READ_SHADOW, value);
1799 /*
1800 * Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates
1801 * all TLB entries except global entries.
1802 */
1803 if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
1804 shadow_sync_all(v->domain);
1806 break;
1808 default:
1809 printk("invalid cr: %d\n", gp);
1810 __hvm_bug(regs);
1813 return 1;
1816 /*
1817 * Read from control registers. CR0 and CR4 are read from the shadow.
1818 */
1819 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
1821 unsigned long value;
1822 struct vcpu *v = current;
1824 if ( cr != 3 )
1825 __hvm_bug(regs);
1827 value = (unsigned long) v->arch.hvm_vmx.cpu_cr3;
1829 switch ( gp ) {
1830 CASE_SET_REG(EAX, eax);
1831 CASE_SET_REG(ECX, ecx);
1832 CASE_SET_REG(EDX, edx);
1833 CASE_SET_REG(EBX, ebx);
1834 CASE_SET_REG(EBP, ebp);
1835 CASE_SET_REG(ESI, esi);
1836 CASE_SET_REG(EDI, edi);
1837 CASE_EXTEND_SET_REG;
1838 case REG_ESP:
1839 __vmwrite(GUEST_RSP, value);
1840 regs->esp = value;
1841 break;
1842 default:
1843 printk("invalid gp: %d\n", gp);
1844 __hvm_bug(regs);
1847 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR%d, value = %lx", cr, value);
1850 static int vmx_cr_access(unsigned long exit_qualification, struct cpu_user_regs *regs)
1852 unsigned int gp, cr;
1853 unsigned long value;
1854 struct vcpu *v = current;
1856 switch (exit_qualification & CONTROL_REG_ACCESS_TYPE) {
1857 case TYPE_MOV_TO_CR:
1858 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
1859 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
1860 TRACE_VMEXIT(1,TYPE_MOV_TO_CR);
1861 TRACE_VMEXIT(2,cr);
1862 TRACE_VMEXIT(3,gp);
1863 return mov_to_cr(gp, cr, regs);
1864 case TYPE_MOV_FROM_CR:
1865 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
1866 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
1867 TRACE_VMEXIT(1,TYPE_MOV_FROM_CR);
1868 TRACE_VMEXIT(2,cr);
1869 TRACE_VMEXIT(3,gp);
1870 mov_from_cr(cr, gp, regs);
1871 break;
1872 case TYPE_CLTS:
1873 TRACE_VMEXIT(1,TYPE_CLTS);
1875 /* We initialise the FPU now, to avoid needing another vmexit. */
1876 setup_fpu(v);
1877 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
1879 __vmread_vcpu(v, GUEST_CR0, &value);
1880 value &= ~X86_CR0_TS; /* clear TS */
1881 __vmwrite(GUEST_CR0, value);
1883 __vmread_vcpu(v, CR0_READ_SHADOW, &value);
1884 value &= ~X86_CR0_TS; /* clear TS */
1885 __vmwrite(CR0_READ_SHADOW, value);
1886 break;
1887 case TYPE_LMSW:
1888 TRACE_VMEXIT(1,TYPE_LMSW);
1889 __vmread_vcpu(v, CR0_READ_SHADOW, &value);
1890 value = (value & ~0xF) |
1891 (((exit_qualification & LMSW_SOURCE_DATA) >> 16) & 0xF);
1892 return vmx_set_cr0(value);
1893 break;
1894 default:
1895 __hvm_bug(regs);
1896 break;
1898 return 1;
1901 static inline void vmx_do_msr_read(struct cpu_user_regs *regs)
1903 u64 msr_content = 0;
1904 u32 eax, edx;
1905 struct vcpu *v = current;
1907 HVM_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_read: ecx=%lx, eax=%lx, edx=%lx",
1908 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1909 (unsigned long)regs->edx);
1910 switch (regs->ecx) {
1911 case MSR_IA32_TIME_STAMP_COUNTER:
1912 msr_content = hvm_get_guest_time(v);
1913 break;
1914 case MSR_IA32_SYSENTER_CS:
1915 __vmread(GUEST_SYSENTER_CS, (u32 *)&msr_content);
1916 break;
1917 case MSR_IA32_SYSENTER_ESP:
1918 __vmread(GUEST_SYSENTER_ESP, &msr_content);
1919 break;
1920 case MSR_IA32_SYSENTER_EIP:
1921 __vmread(GUEST_SYSENTER_EIP, &msr_content);
1922 break;
1923 case MSR_IA32_APICBASE:
1924 msr_content = VLAPIC(v) ? VLAPIC(v)->apic_base_msr : 0;
1925 break;
1926 default:
1927 if (long_mode_do_msr_read(regs))
1928 return;
1930 if ( rdmsr_hypervisor_regs(regs->ecx, &eax, &edx) )
1932 regs->eax = eax;
1933 regs->edx = edx;
1934 return;
1937 rdmsr_safe(regs->ecx, regs->eax, regs->edx);
1938 break;
1941 regs->eax = msr_content & 0xFFFFFFFF;
1942 regs->edx = msr_content >> 32;
1944 HVM_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_read returns: "
1945 "ecx=%lx, eax=%lx, edx=%lx",
1946 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1947 (unsigned long)regs->edx);
1950 static inline void vmx_do_msr_write(struct cpu_user_regs *regs)
1952 u64 msr_content;
1953 struct vcpu *v = current;
1955 HVM_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_write: ecx=%lx, eax=%lx, edx=%lx",
1956 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1957 (unsigned long)regs->edx);
1959 msr_content = (regs->eax & 0xFFFFFFFF) | ((u64)regs->edx << 32);
1961 switch (regs->ecx) {
1962 case MSR_IA32_TIME_STAMP_COUNTER:
1963 set_guest_time(v, msr_content);
1964 break;
1965 case MSR_IA32_SYSENTER_CS:
1966 __vmwrite(GUEST_SYSENTER_CS, msr_content);
1967 break;
1968 case MSR_IA32_SYSENTER_ESP:
1969 __vmwrite(GUEST_SYSENTER_ESP, msr_content);
1970 break;
1971 case MSR_IA32_SYSENTER_EIP:
1972 __vmwrite(GUEST_SYSENTER_EIP, msr_content);
1973 break;
1974 case MSR_IA32_APICBASE:
1975 vlapic_msr_set(VLAPIC(v), msr_content);
1976 break;
1977 default:
1978 if ( !long_mode_do_msr_write(regs) )
1979 wrmsr_hypervisor_regs(regs->ecx, regs->eax, regs->edx);
1980 break;
1983 HVM_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_write returns: "
1984 "ecx=%lx, eax=%lx, edx=%lx",
1985 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1986 (unsigned long)regs->edx);
1987 }
1989 /*
1990 * Need to use this exit to reschedule
1991 */
1992 void vmx_vmexit_do_hlt(void)
1993 {
1994 struct vcpu *v=current;
1995 struct periodic_time *pt = &(v->domain->arch.hvm_domain.pl_time.periodic_tm);
1996 s_time_t next_pit=-1,next_wakeup;
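/* Block until the earliest pending deadline: the domain's periodic (PIT)
 * timer, tracked for vcpu 0 only, or this vcpu's APIC timer; a value of
 * -1 means nothing is scheduled. */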
1998 if ( !v->vcpu_id )
1999 next_pit = get_scheduled(v, pt->irq, pt);
2000 next_wakeup = get_apictime_scheduled(v);
2001 if ( (next_pit != -1 && next_pit < next_wakeup) || next_wakeup == -1 )
2002 next_wakeup = next_pit;
2003 if ( next_wakeup != -1 )
2004 set_timer(&current->arch.hvm_vmx.hlt_timer, next_wakeup);
2005 hvm_safe_block();
2006 }
2008 static inline void vmx_vmexit_do_extint(struct cpu_user_regs *regs)
2009 {
2010 unsigned int vector;
2011 int error;
2013 asmlinkage void do_IRQ(struct cpu_user_regs *);
2014 fastcall void smp_apic_timer_interrupt(struct cpu_user_regs *);
2015 fastcall void smp_event_check_interrupt(void);
2016 fastcall void smp_invalidate_interrupt(void);
2017 fastcall void smp_call_function_interrupt(void);
2018 fastcall void smp_spurious_interrupt(struct cpu_user_regs *regs);
2019 fastcall void smp_error_interrupt(struct cpu_user_regs *regs);
2020 #ifdef CONFIG_X86_MCE_P4THERMAL
2021 fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs);
2022 #endif
2024 if ((error = __vmread(VM_EXIT_INTR_INFO, &vector))
2025 || !(vector & INTR_INFO_VALID_MASK))
2026 __hvm_bug(regs);
2028 vector &= INTR_INFO_VECTOR_MASK;
2029 TRACE_VMEXIT(1,vector);
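/* The physical interrupt was taken while the guest was running;
 * dispatch it to the appropriate host handler. */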
2031 switch(vector) {
2032 case LOCAL_TIMER_VECTOR:
2033 smp_apic_timer_interrupt(regs);
2034 break;
2035 case EVENT_CHECK_VECTOR:
2036 smp_event_check_interrupt();
2037 break;
2038 case INVALIDATE_TLB_VECTOR:
2039 smp_invalidate_interrupt();
2040 break;
2041 case CALL_FUNCTION_VECTOR:
2042 smp_call_function_interrupt();
2043 break;
2044 case SPURIOUS_APIC_VECTOR:
2045 smp_spurious_interrupt(regs);
2046 break;
2047 case ERROR_APIC_VECTOR:
2048 smp_error_interrupt(regs);
2049 break;
2050 #ifdef CONFIG_X86_MCE_P4THERMAL
2051 case THERMAL_APIC_VECTOR:
2052 smp_thermal_interrupt(regs);
2053 break;
2054 #endif
2055 default:
2056 regs->entry_vector = vector;
2057 do_IRQ(regs);
2058 break;
2059 }
2060 }
2062 #if defined (__x86_64__)
2063 void store_cpu_user_regs(struct cpu_user_regs *regs)
2064 {
2065 __vmread(GUEST_SS_SELECTOR, &regs->ss);
2066 __vmread(GUEST_RSP, &regs->rsp);
2067 __vmread(GUEST_RFLAGS, &regs->rflags);
2068 __vmread(GUEST_CS_SELECTOR, &regs->cs);
2069 __vmread(GUEST_DS_SELECTOR, &regs->ds);
2070 __vmread(GUEST_ES_SELECTOR, &regs->es);
2071 __vmread(GUEST_RIP, &regs->rip);
2072 }
2073 #elif defined (__i386__)
2074 void store_cpu_user_regs(struct cpu_user_regs *regs)
2075 {
2076 __vmread(GUEST_SS_SELECTOR, &regs->ss);
2077 __vmread(GUEST_RSP, &regs->esp);
2078 __vmread(GUEST_RFLAGS, &regs->eflags);
2079 __vmread(GUEST_CS_SELECTOR, &regs->cs);
2080 __vmread(GUEST_DS_SELECTOR, &regs->ds);
2081 __vmread(GUEST_ES_SELECTOR, &regs->es);
2082 __vmread(GUEST_RIP, &regs->eip);
2083 }
2084 #endif
2086 #ifdef XEN_DEBUGGER
2087 void save_cpu_user_regs(struct cpu_user_regs *regs)
2088 {
2089 __vmread(GUEST_SS_SELECTOR, &regs->xss);
2090 __vmread(GUEST_RSP, &regs->esp);
2091 __vmread(GUEST_RFLAGS, &regs->eflags);
2092 __vmread(GUEST_CS_SELECTOR, &regs->xcs);
2093 __vmread(GUEST_RIP, &regs->eip);
2095 __vmread(GUEST_GS_SELECTOR, &regs->xgs);
2096 __vmread(GUEST_FS_SELECTOR, &regs->xfs);
2097 __vmread(GUEST_ES_SELECTOR, &regs->xes);
2098 __vmread(GUEST_DS_SELECTOR, &regs->xds);
2099 }
2101 void restore_cpu_user_regs(struct cpu_user_regs *regs)
2102 {
2103 __vmwrite(GUEST_SS_SELECTOR, regs->xss);
2104 __vmwrite(GUEST_RSP, regs->esp);
2105 __vmwrite(GUEST_RFLAGS, regs->eflags);
2106 __vmwrite(GUEST_CS_SELECTOR, regs->xcs);
2107 __vmwrite(GUEST_RIP, regs->eip);
2109 __vmwrite(GUEST_GS_SELECTOR, regs->xgs);
2110 __vmwrite(GUEST_FS_SELECTOR, regs->xfs);
2111 __vmwrite(GUEST_ES_SELECTOR, regs->xes);
2112 __vmwrite(GUEST_DS_SELECTOR, regs->xds);
2113 }
2114 #endif
2116 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs regs)
2117 {
2118 unsigned int exit_reason;
2119 unsigned long exit_qualification, eip, inst_len = 0;
2120 struct vcpu *v = current;
2121 int error;
2123 error = __vmread(VM_EXIT_REASON, &exit_reason);
2124 BUG_ON(error);
2126 perfc_incra(vmexits, exit_reason);
2128 if ( (exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT) &&
2129 (exit_reason != EXIT_REASON_VMCALL) &&
2130 (exit_reason != EXIT_REASON_IO_INSTRUCTION) )
2131 HVM_DBG_LOG(DBG_LEVEL_0, "exit reason = %x", exit_reason);
2133 if ( exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT )
2134 local_irq_enable();
2136 if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
2137 {
2138 unsigned int failed_vmentry_reason = exit_reason & 0xFFFF;
2140 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2141 printk("Failed vm entry (exit reason 0x%x) ", exit_reason);
2142 switch ( failed_vmentry_reason ) {
2143 case EXIT_REASON_INVALID_GUEST_STATE:
2144 printk("caused by invalid guest state (%ld).\n", exit_qualification);
2145 break;
2146 case EXIT_REASON_MSR_LOADING:
2147 printk("caused by MSR entry %ld loading.\n", exit_qualification);
2148 break;
2149 case EXIT_REASON_MACHINE_CHECK:
2150 printk("caused by machine check.\n");
2151 break;
2152 default:
2153 printk("reason not known yet!");
2154 break;
2155 }
2157 printk("************* VMCS Area **************\n");
2158 vmcs_dump_vcpu();
2159 printk("**************************************\n");
2160 domain_crash_synchronous();
2161 }
2163 __vmread(GUEST_RIP, &eip);
2164 TRACE_VMEXIT(0,exit_reason);
2166 switch ( exit_reason )
2167 {
2168 case EXIT_REASON_EXCEPTION_NMI:
2169 {
2170 /*
2171 * We don't set software-interrupt exiting (INT n), so we get here
2172 * only when (1) the guest takes an exception (e.g. #PF), or
2173 * (2) an NMI is delivered.
2174 */
2175 int error;
2176 unsigned int vector;
2177 unsigned long va;
2179 if ((error = __vmread(VM_EXIT_INTR_INFO, &vector))
2180 || !(vector & INTR_INFO_VALID_MASK))
2181 __hvm_bug(&regs);
2182 vector &= INTR_INFO_VECTOR_MASK;
2184 TRACE_VMEXIT(1,vector);
2185 perfc_incra(cause_vector, vector);
2187 switch (vector) {
2188 #ifdef XEN_DEBUGGER
2189 case TRAP_debug:
2190 {
2191 save_cpu_user_regs(&regs);
2192 pdb_handle_exception(1, &regs, 1);
2193 restore_cpu_user_regs(&regs);
2194 break;
2195 }
2196 case TRAP_int3:
2197 {
2198 save_cpu_user_regs(&regs);
2199 pdb_handle_exception(3, &regs, 1);
2200 restore_cpu_user_regs(&regs);
2201 break;
2202 }
2203 #else
2204 case TRAP_debug:
2205 {
2206 void store_cpu_user_regs(struct cpu_user_regs *regs);
2208 if ( test_bit(_DOMF_debugging, &v->domain->domain_flags) )
2209 {
2210 store_cpu_user_regs(&regs);
2211 domain_pause_for_debugger();
2212 __vm_clear_bit(GUEST_PENDING_DBG_EXCEPTIONS,
2213 PENDING_DEBUG_EXC_BS);
2214 }
2215 else
2216 {
2217 vmx_reflect_exception(v);
2218 __vm_clear_bit(GUEST_PENDING_DBG_EXCEPTIONS,
2219 PENDING_DEBUG_EXC_BS);
2220 }
2222 break;
2223 }
2224 case TRAP_int3:
2225 {
2226 if ( test_bit(_DOMF_debugging, &v->domain->domain_flags) )
2227 domain_pause_for_debugger();
2228 else
2229 vmx_reflect_exception(v);
2230 break;
2231 }
2232 #endif
2233 case TRAP_no_device:
2234 {
2235 vmx_do_no_device_fault();
2236 break;
2237 }
2238 case TRAP_page_fault:
2239 {
2240 __vmread(EXIT_QUALIFICATION, &va);
2241 __vmread(VM_EXIT_INTR_ERROR_CODE, &regs.error_code);
2243 TRACE_VMEXIT(3,regs.error_code);
2244 TRACE_VMEXIT(4,va);
2246 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2247 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
2248 (unsigned long)regs.eax, (unsigned long)regs.ebx,
2249 (unsigned long)regs.ecx, (unsigned long)regs.edx,
2250 (unsigned long)regs.esi, (unsigned long)regs.edi);
2252 if (!(error = vmx_do_page_fault(va, &regs))) {
2253 /*
2254 * Inject #PF using the interruption-information fields.
2255 */
2256 vmx_inject_hw_exception(v, TRAP_page_fault, regs.error_code);
2257 v->arch.hvm_vmx.cpu_cr2 = va;
2258 TRACE_3D(TRC_VMX_INT, v->domain->domain_id, TRAP_page_fault, va);
2259 }
2260 break;
2261 }
2262 case TRAP_nmi:
2263 do_nmi(&regs);
2264 break;
2265 default:
2266 vmx_reflect_exception(v);
2267 break;
2268 }
2269 break;
2270 }
2271 case EXIT_REASON_EXTERNAL_INTERRUPT:
2272 vmx_vmexit_do_extint(&regs);
2273 break;
2274 case EXIT_REASON_PENDING_INTERRUPT:
2275 /*
2276 * Not sure exactly what the purpose of this is. The only bits set
2277 * and cleared at this point are CPU_BASED_VIRTUAL_INTR_PENDING
2278 * (in io.c:{enable,disable}_irq_window()), so presumably we want to
2279 * set it back to the original value...
2280 */
2281 v->arch.hvm_vcpu.u.vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2282 v->arch.hvm_vcpu.u.vmx.exec_control |=
2283 (MONITOR_CPU_BASED_EXEC_CONTROLS & CPU_BASED_VIRTUAL_INTR_PENDING);
2284 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2285 v->arch.hvm_vcpu.u.vmx.exec_control);
2286 break;
2287 case EXIT_REASON_TASK_SWITCH:
2288 __hvm_bug(&regs);
2289 break;
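/* For exits caused by instructions that are emulated below, the guest
 * RIP must be advanced past the instruction by the length reported in
 * the VMCS. */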
2290 case EXIT_REASON_CPUID:
2291 vmx_vmexit_do_cpuid(&regs);
2292 __get_instruction_length(inst_len);
2293 __update_guest_eip(inst_len);
2294 break;
2295 case EXIT_REASON_HLT:
2296 __get_instruction_length(inst_len);
2297 __update_guest_eip(inst_len);
2298 vmx_vmexit_do_hlt();
2299 break;
2300 case EXIT_REASON_INVLPG:
2301 {
2302 unsigned long va;
2304 __vmread(EXIT_QUALIFICATION, &va);
2305 vmx_vmexit_do_invlpg(va);
2306 __get_instruction_length(inst_len);
2307 __update_guest_eip(inst_len);
2308 break;
2309 }
2310 case EXIT_REASON_VMCALL:
2311 {
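/* A VMCALL from the guest is handled as an HVM hypercall;
 * hvm_do_hypercall() picks up the call number and arguments from the
 * saved guest registers. */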
2312 __get_instruction_length(inst_len);
2313 __vmread(GUEST_RIP, &eip);
2314 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2316 hvm_do_hypercall(&regs);
2317 __update_guest_eip(inst_len);
2318 break;
2319 }
2320 case EXIT_REASON_CR_ACCESS:
2321 {
2322 __vmread(GUEST_RIP, &eip);
2323 __get_instruction_length(inst_len);
2324 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2326 HVM_DBG_LOG(DBG_LEVEL_1, "eip = %lx, inst_len =%lx, exit_qualification = %lx",
2327 eip, inst_len, exit_qualification);
2328 if (vmx_cr_access(exit_qualification, &regs))
2329 __update_guest_eip(inst_len);
2330 TRACE_VMEXIT(3,regs.error_code);
2331 TRACE_VMEXIT(4,exit_qualification);
2332 break;
2333 }
2334 case EXIT_REASON_DR_ACCESS:
2335 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2336 vmx_dr_access(exit_qualification, &regs);
2337 __get_instruction_length(inst_len);
2338 __update_guest_eip(inst_len);
2339 break;
2340 case EXIT_REASON_IO_INSTRUCTION:
2341 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2342 __get_instruction_length(inst_len);
2343 vmx_io_instruction(exit_qualification, inst_len);
2344 TRACE_VMEXIT(4,exit_qualification);
2345 break;
2346 case EXIT_REASON_MSR_READ:
2347 __get_instruction_length(inst_len);
2348 vmx_do_msr_read(&regs);
2349 __update_guest_eip(inst_len);
2350 break;
2351 case EXIT_REASON_MSR_WRITE:
2352 __vmread(GUEST_RIP, &eip);
2353 vmx_do_msr_write(&regs);
2354 __get_instruction_length(inst_len);
2355 __update_guest_eip(inst_len);
2356 break;
2357 case EXIT_REASON_MWAIT_INSTRUCTION:
2358 __hvm_bug(&regs);
2359 break;
2360 case EXIT_REASON_VMCLEAR:
2361 case EXIT_REASON_VMLAUNCH:
2362 case EXIT_REASON_VMPTRLD:
2363 case EXIT_REASON_VMPTRST:
2364 case EXIT_REASON_VMREAD:
2365 case EXIT_REASON_VMRESUME:
2366 case EXIT_REASON_VMWRITE:
2367 case EXIT_REASON_VMOFF:
2368 case EXIT_REASON_VMON:
2369 /* Report invalid opcode exception when a VMX guest tries to execute
2370 any of the VMX instructions */
2371 vmx_inject_hw_exception(v, TRAP_invalid_op, VMX_DELIVER_NO_ERROR_CODE);
2372 break;
2374 default:
2375 __hvm_bug(&regs); /* should not happen */
2376 }
2377 }
2379 asmlinkage void vmx_load_cr2(void)
2380 {
2381 struct vcpu *v = current;
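/* Load the guest's virtual %cr2 into the real %cr2 shortly before VM
 * entry; interrupts are disabled so that a host fault cannot clobber
 * the value first. */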
2383 local_irq_disable();
2384 asm volatile("mov %0,%%cr2": :"r" (v->arch.hvm_vmx.cpu_cr2));
2387 asmlinkage void vmx_trace_vmentry (void)
2389 TRACE_5D(TRC_VMX_VMENTRY,
2390 trace_values[smp_processor_id()][0],
2391 trace_values[smp_processor_id()][1],
2392 trace_values[smp_processor_id()][2],
2393 trace_values[smp_processor_id()][3],
2394 trace_values[smp_processor_id()][4]);
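/* Reset the per-cpu trace slots for the next exit; the value 9 appears
 * to serve as a 'no value recorded' marker. */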
2395 TRACE_VMEXIT(0,9);
2396 TRACE_VMEXIT(1,9);
2397 TRACE_VMEXIT(2,9);
2398 TRACE_VMEXIT(3,9);
2399 TRACE_VMEXIT(4,9);
2400 return;
2401 }
2403 asmlinkage void vmx_trace_vmexit (void)
2404 {
2405 TRACE_3D(TRC_VMX_VMEXIT,0,0,0);
2406 return;
2407 }
2409 /*
2410 * Local variables:
2411 * mode: C
2412 * c-set-style: "BSD"
2413 * c-basic-offset: 4
2414 * tab-width: 4
2415 * indent-tabs-mode: nil
2416 * End:
2417 */