ia64/xen-unstable

xen/arch/x86/hvm/vmx/vmx.c @ 14090:cdc765772f69

hvm: Clean up initialisation of hvm_funcs.
Signed-off-by: Keir Fraser <keir@xensource.com>
author kfraser@localhost.localdomain
date Fri Feb 23 11:32:25 2007 +0000 (2007-02-23)
parents e8470a1a01af
children d2a91b73899a
1 /*
2 * vmx.c: handling VMX architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 *
18 */
20 #include <xen/config.h>
21 #include <xen/init.h>
22 #include <xen/lib.h>
23 #include <xen/trace.h>
24 #include <xen/sched.h>
25 #include <xen/irq.h>
26 #include <xen/softirq.h>
27 #include <xen/domain_page.h>
28 #include <xen/hypercall.h>
29 #include <xen/perfc.h>
30 #include <asm/current.h>
31 #include <asm/io.h>
32 #include <asm/regs.h>
33 #include <asm/cpufeature.h>
34 #include <asm/processor.h>
35 #include <asm/types.h>
36 #include <asm/msr.h>
37 #include <asm/spinlock.h>
38 #include <asm/paging.h>
39 #include <asm/p2m.h>
40 #include <asm/hvm/hvm.h>
41 #include <asm/hvm/support.h>
42 #include <asm/hvm/vmx/vmx.h>
43 #include <asm/hvm/vmx/vmcs.h>
44 #include <asm/hvm/vmx/cpu.h>
45 #include <public/sched.h>
46 #include <public/hvm/ioreq.h>
47 #include <asm/hvm/vpic.h>
48 #include <asm/hvm/vlapic.h>
49 #include <asm/x86_emulate.h>
50 #include <asm/hvm/vpt.h>
51 #include <public/hvm/save.h>
53 static void vmx_ctxt_switch_from(struct vcpu *v);
54 static void vmx_ctxt_switch_to(struct vcpu *v);
56 static int vmx_vcpu_initialise(struct vcpu *v)
57 {
58 int rc;
60 spin_lock_init(&v->arch.hvm_vmx.vmcs_lock);
62 v->arch.schedule_tail = arch_vmx_do_resume;
63 v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
64 v->arch.ctxt_switch_to = vmx_ctxt_switch_to;
66 if ( (rc = vmx_create_vmcs(v)) != 0 )
67 {
68 dprintk(XENLOG_WARNING,
69 "Failed to create VMCS for vcpu %d: err=%d.\n",
70 v->vcpu_id, rc);
71 return rc;
72 }
74 return 0;
75 }
77 static void vmx_vcpu_destroy(struct vcpu *v)
78 {
79 vmx_destroy_vmcs(v);
80 }
82 #ifdef __x86_64__
84 static DEFINE_PER_CPU(struct vmx_msr_state, host_msr_state);
86 static u32 msr_index[VMX_MSR_COUNT] =
87 {
88 MSR_LSTAR, MSR_STAR, MSR_CSTAR,
89 MSR_SYSCALL_MASK, MSR_EFER,
90 };
92 static void vmx_save_host_msrs(void)
93 {
94 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
95 int i;
97 for ( i = 0; i < VMX_MSR_COUNT; i++ )
98 rdmsrl(msr_index[i], host_msr_state->msrs[i]);
99 }
101 #define WRITE_MSR(address) \
102 guest_msr_state->msrs[VMX_INDEX_MSR_ ## address] = msr_content; \
103 if ( !test_bit(VMX_INDEX_MSR_ ## address, &guest_msr_state->flags) )\
104 set_bit(VMX_INDEX_MSR_ ## address, &guest_msr_state->flags); \
105 wrmsrl(MSR_ ## address, msr_content); \
106 set_bit(VMX_INDEX_MSR_ ## address, &host_msr_state->flags); \
107 break
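/*
 * WRITE_MSR(X) caches the new value in the vcpu's guest MSR state, marks the
 * entry dirty in both the guest and host flags bitmaps, and writes the
 * hardware MSR; the trailing 'break' lets each use stand in for a complete
 * switch case. For example, WRITE_MSR(STAR) expands (roughly) to:
 *
 *     guest_msr_state->msrs[VMX_INDEX_MSR_STAR] = msr_content;
 *     set_bit(VMX_INDEX_MSR_STAR, &guest_msr_state->flags);
 *     wrmsrl(MSR_STAR, msr_content);
 *     set_bit(VMX_INDEX_MSR_STAR, &host_msr_state->flags);
 *     break;
 */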
109 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
110 {
111 u64 msr_content = 0;
112 struct vcpu *v = current;
113 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
115 switch ( (u32)regs->ecx ) {
116 case MSR_EFER:
117 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_EFER];
118 HVM_DBG_LOG(DBG_LEVEL_2, "EFER msr_content 0x%"PRIx64, msr_content);
119 break;
121 case MSR_FS_BASE:
122 msr_content = __vmread(GUEST_FS_BASE);
123 goto check_long_mode;
125 case MSR_GS_BASE:
126 msr_content = __vmread(GUEST_GS_BASE);
127 goto check_long_mode;
129 case MSR_SHADOW_GS_BASE:
130 msr_content = guest_msr_state->shadow_gs;
131 check_long_mode:
132 if ( !(vmx_long_mode_enabled(v)) )
133 {
134 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
135 return 0;
136 }
137 break;
139 case MSR_STAR:
140 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_STAR];
141 break;
143 case MSR_LSTAR:
144 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_LSTAR];
145 break;
147 case MSR_CSTAR:
148 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_CSTAR];
149 break;
151 case MSR_SYSCALL_MASK:
152 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
153 break;
155 default:
156 return 0;
157 }
159 HVM_DBG_LOG(DBG_LEVEL_2, "msr_content: 0x%"PRIx64, msr_content);
161 regs->eax = (u32)(msr_content >> 0);
162 regs->edx = (u32)(msr_content >> 32);
164 return 1;
165 }
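/*
 * A return value of 1 means the MSR read was handled here and the result is
 * in EDX:EAX; 0 means the access was not recognised (or faulted) and the
 * caller falls back to the generic MSR path. FS_BASE, GS_BASE and
 * SHADOW_GS_BASE are only readable while long mode is enabled; otherwise a
 * #GP is injected above.
 */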
167 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
168 {
169 u64 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
170 u32 ecx = regs->ecx;
171 struct vcpu *v = current;
172 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
173 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
175 HVM_DBG_LOG(DBG_LEVEL_1, "msr 0x%x msr_content 0x%"PRIx64"\n",
176 ecx, msr_content);
178 switch ( ecx )
179 {
180 case MSR_EFER:
181 /* offending reserved bit will cause #GP */
182 if ( msr_content & ~(EFER_LME | EFER_LMA | EFER_NX | EFER_SCE) )
183 {
184 gdprintk(XENLOG_WARNING, "Trying to set reserved bit in "
185 "EFER: %"PRIx64"\n", msr_content);
186 goto gp_fault;
187 }
189 if ( (msr_content & EFER_LME)
190 && !(guest_msr_state->msrs[VMX_INDEX_MSR_EFER] & EFER_LME) )
191 {
192 if ( unlikely(vmx_paging_enabled(v)) )
193 {
194 gdprintk(XENLOG_WARNING,
195 "Trying to set EFER.LME with paging enabled\n");
196 goto gp_fault;
197 }
198 }
199 else if ( !(msr_content & EFER_LME)
200 && (guest_msr_state->msrs[VMX_INDEX_MSR_EFER] & EFER_LME) )
201 {
202 if ( unlikely(vmx_paging_enabled(v)) )
203 {
204 gdprintk(XENLOG_WARNING,
205 "Trying to clear EFER.LME with paging enabled\n");
206 goto gp_fault;
207 }
208 }
210 guest_msr_state->msrs[VMX_INDEX_MSR_EFER] = msr_content;
211 break;
213 case MSR_FS_BASE:
214 case MSR_GS_BASE:
215 case MSR_SHADOW_GS_BASE:
216 if ( !vmx_long_mode_enabled(v) )
217 goto gp_fault;
219 if ( !is_canonical_address(msr_content) )
220 goto uncanonical_address;
222 if ( ecx == MSR_FS_BASE )
223 __vmwrite(GUEST_FS_BASE, msr_content);
224 else if ( ecx == MSR_GS_BASE )
225 __vmwrite(GUEST_GS_BASE, msr_content);
226 else
227 {
228 v->arch.hvm_vmx.msr_state.shadow_gs = msr_content;
229 wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
230 }
232 break;
234 case MSR_STAR:
235 WRITE_MSR(STAR);
237 case MSR_LSTAR:
238 if ( !is_canonical_address(msr_content) )
239 goto uncanonical_address;
240 WRITE_MSR(LSTAR);
242 case MSR_CSTAR:
243 if ( !is_canonical_address(msr_content) )
244 goto uncanonical_address;
245 WRITE_MSR(CSTAR);
247 case MSR_SYSCALL_MASK:
248 WRITE_MSR(SYSCALL_MASK);
250 default:
251 return 0;
252 }
254 return 1;
256 uncanonical_address:
257 HVM_DBG_LOG(DBG_LEVEL_1, "Not cano address of msr write %x\n", ecx);
258 gp_fault:
259 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
260 return 0;
261 }
263 /*
264 * To avoid MSR save/restore at every VM exit/entry time, we restore
265 * the x86_64 specific MSRs at domain switch time. Since these MSRs
266 * are not modified once set for para domains, we don't save them,
267 * but simply reset them to values set in percpu_traps_init().
268 */
269 static void vmx_restore_host_msrs(void)
270 {
271 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
272 int i;
274 while ( host_msr_state->flags )
275 {
276 i = find_first_set_bit(host_msr_state->flags);
277 wrmsrl(msr_index[i], host_msr_state->msrs[i]);
278 clear_bit(i, &host_msr_state->flags);
279 }
280 }
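/*
 * host_msr_state->flags is a dirty bitmap: a bit is set only when the guest
 * value was loaded into the corresponding hardware MSR, so the loop above
 * rewrites just those MSRs with the values captured by vmx_save_host_msrs().
 */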
282 static void vmx_save_guest_msrs(struct vcpu *v)
283 {
284 /* MSR_SHADOW_GS_BASE may have been changed by swapgs instruction. */
285 rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.msr_state.shadow_gs);
286 }
288 static void vmx_restore_guest_msrs(struct vcpu *v)
289 {
290 struct vmx_msr_state *guest_msr_state, *host_msr_state;
291 unsigned long guest_flags;
292 int i;
294 guest_msr_state = &v->arch.hvm_vmx.msr_state;
295 host_msr_state = &this_cpu(host_msr_state);
297 wrmsrl(MSR_SHADOW_GS_BASE, guest_msr_state->shadow_gs);
299 guest_flags = guest_msr_state->flags;
300 if ( !guest_flags )
301 return;
303 while ( guest_flags ) {
304 i = find_first_set_bit(guest_flags);
306 HVM_DBG_LOG(DBG_LEVEL_2,
307 "restore guest's index %d msr %x with value %lx",
308 i, msr_index[i], guest_msr_state->msrs[i]);
309 set_bit(i, &host_msr_state->flags);
310 wrmsrl(msr_index[i], guest_msr_state->msrs[i]);
311 clear_bit(i, &guest_flags);
312 }
313 }
315 #else /* __i386__ */
317 #define vmx_save_host_msrs() ((void)0)
318 #define vmx_restore_host_msrs() ((void)0)
319 #define vmx_save_guest_msrs(v) ((void)0)
320 #define vmx_restore_guest_msrs(v) ((void)0)
322 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
323 {
324 return 0;
325 }
327 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
328 {
329 return 0;
330 }
332 #endif /* __i386__ */
334 #define loaddebug(_v,_reg) \
335 __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
336 #define savedebug(_v,_reg) \
337 __asm__ __volatile__ ("mov %%db" #_reg ",%0" : : "r" ((_v)->debugreg[_reg]))
339 static inline void vmx_save_dr(struct vcpu *v)
340 {
341 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
342 return;
344 /* Clear the DR dirty flag and re-enable intercepts for DR accesses. */
345 v->arch.hvm_vcpu.flag_dr_dirty = 0;
346 v->arch.hvm_vcpu.u.vmx.exec_control |= CPU_BASED_MOV_DR_EXITING;
347 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vcpu.u.vmx.exec_control);
349 savedebug(&v->arch.guest_context, 0);
350 savedebug(&v->arch.guest_context, 1);
351 savedebug(&v->arch.guest_context, 2);
352 savedebug(&v->arch.guest_context, 3);
353 savedebug(&v->arch.guest_context, 6);
354 v->arch.guest_context.debugreg[7] = __vmread(GUEST_DR7);
355 }
357 static inline void __restore_debug_registers(struct vcpu *v)
358 {
359 loaddebug(&v->arch.guest_context, 0);
360 loaddebug(&v->arch.guest_context, 1);
361 loaddebug(&v->arch.guest_context, 2);
362 loaddebug(&v->arch.guest_context, 3);
363 /* No 4 and 5 */
364 loaddebug(&v->arch.guest_context, 6);
365 /* DR7 is loaded from the VMCS. */
366 }
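/*
 * Debug registers are switched lazily: DR accesses trap until the guest
 * actually touches one (vmx_dr_access() below sets flag_dr_dirty and drops
 * the intercept), and vmx_save_dr() above re-arms the intercept and saves
 * DR0-DR6 when the vcpu is scheduled out. DR7 itself lives in the VMCS.
 */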
368 int vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
369 {
370 c->eip = __vmread(GUEST_RIP);
371 c->esp = __vmread(GUEST_RSP);
372 c->eflags = __vmread(GUEST_RFLAGS);
374 c->cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;
375 c->cr3 = v->arch.hvm_vmx.cpu_cr3;
376 c->cr4 = v->arch.hvm_vmx.cpu_shadow_cr4;
378 #ifdef HVM_DEBUG_SUSPEND
379 printk("vmx_vmcs_save: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
380 c->cr3,
381 c->cr0,
382 c->cr4);
383 #endif
385 c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
386 c->idtr_base = __vmread(GUEST_IDTR_BASE);
388 c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
389 c->gdtr_base = __vmread(GUEST_GDTR_BASE);
391 c->cs_sel = __vmread(GUEST_CS_SELECTOR);
392 c->cs_limit = __vmread(GUEST_CS_LIMIT);
393 c->cs_base = __vmread(GUEST_CS_BASE);
394 c->cs_arbytes = __vmread(GUEST_CS_AR_BYTES);
396 c->ds_sel = __vmread(GUEST_DS_SELECTOR);
397 c->ds_limit = __vmread(GUEST_DS_LIMIT);
398 c->ds_base = __vmread(GUEST_DS_BASE);
399 c->ds_arbytes = __vmread(GUEST_DS_AR_BYTES);
401 c->es_sel = __vmread(GUEST_ES_SELECTOR);
402 c->es_limit = __vmread(GUEST_ES_LIMIT);
403 c->es_base = __vmread(GUEST_ES_BASE);
404 c->es_arbytes = __vmread(GUEST_ES_AR_BYTES);
406 c->ss_sel = __vmread(GUEST_SS_SELECTOR);
407 c->ss_limit = __vmread(GUEST_SS_LIMIT);
408 c->ss_base = __vmread(GUEST_SS_BASE);
409 c->ss_arbytes = __vmread(GUEST_SS_AR_BYTES);
411 c->fs_sel = __vmread(GUEST_FS_SELECTOR);
412 c->fs_limit = __vmread(GUEST_FS_LIMIT);
413 c->fs_base = __vmread(GUEST_FS_BASE);
414 c->fs_arbytes = __vmread(GUEST_FS_AR_BYTES);
416 c->gs_sel = __vmread(GUEST_GS_SELECTOR);
417 c->gs_limit = __vmread(GUEST_GS_LIMIT);
418 c->gs_base = __vmread(GUEST_GS_BASE);
419 c->gs_arbytes = __vmread(GUEST_GS_AR_BYTES);
421 c->tr_sel = __vmread(GUEST_TR_SELECTOR);
422 c->tr_limit = __vmread(GUEST_TR_LIMIT);
423 c->tr_base = __vmread(GUEST_TR_BASE);
424 c->tr_arbytes = __vmread(GUEST_TR_AR_BYTES);
426 c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR);
427 c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT);
428 c->ldtr_base = __vmread(GUEST_LDTR_BASE);
429 c->ldtr_arbytes = __vmread(GUEST_LDTR_AR_BYTES);
431 c->sysenter_cs = __vmread(GUEST_SYSENTER_CS);
432 c->sysenter_esp = __vmread(GUEST_SYSENTER_ESP);
433 c->sysenter_eip = __vmread(GUEST_SYSENTER_EIP);
435 return 1;
436 }
438 int vmx_vmcs_restore(struct vcpu *v, struct hvm_hw_cpu *c)
439 {
440 unsigned long mfn, old_base_mfn;
442 vmx_vmcs_enter(v);
444 __vmwrite(GUEST_RIP, c->eip);
445 __vmwrite(GUEST_RSP, c->esp);
446 __vmwrite(GUEST_RFLAGS, c->eflags);
448 v->arch.hvm_vmx.cpu_shadow_cr0 = c->cr0;
449 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
451 #ifdef HVM_DEBUG_SUSPEND
452 printk("vmx_vmcs_restore: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
453 c->cr3,
454 c->cr0,
455 c->cr4);
456 #endif
458 if (!vmx_paging_enabled(v)) {
459 printk("vmx_vmcs_restore: paging not enabled.");
460 goto skip_cr3;
461 }
463 if (c->cr3 == v->arch.hvm_vmx.cpu_cr3) {
464 /*
465 * This is simple TLB flush, implying the guest has
466 * removed some translation or changed page attributes.
467 * We simply invalidate the shadow.
468 */
469 mfn = gmfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT);
470 if (mfn != pagetable_get_pfn(v->arch.guest_table)) {
471 goto bad_cr3;
472 }
473 } else {
474 /*
475 * If different, make a shadow. Check if the PDBR is valid
476 * first.
477 */
478 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %"PRIx64"", c->cr3);
479 /* current != v here, as this is not called from arch_vmx_do_launch. */
480 mfn = gmfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT);
481 if( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain)) {
482 goto bad_cr3;
483 }
484 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
485 v->arch.guest_table = pagetable_from_pfn(mfn);
486 if (old_base_mfn)
487 put_page(mfn_to_page(old_base_mfn));
488 v->arch.hvm_vmx.cpu_cr3 = c->cr3;
489 }
491 skip_cr3:
492 #if defined(__x86_64__)
493 if (vmx_long_mode_enabled(v)) {
494 unsigned long vm_entry_value;
495 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
496 vm_entry_value |= VM_ENTRY_IA32E_MODE;
497 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
498 }
499 #endif
501 __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
502 v->arch.hvm_vmx.cpu_shadow_cr4 = c->cr4;
503 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);
505 __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
506 __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
508 __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
509 __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
511 __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
512 __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
513 __vmwrite(GUEST_CS_BASE, c->cs_base);
514 __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes);
516 __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
517 __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
518 __vmwrite(GUEST_DS_BASE, c->ds_base);
519 __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes);
521 __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
522 __vmwrite(GUEST_ES_LIMIT, c->es_limit);
523 __vmwrite(GUEST_ES_BASE, c->es_base);
524 __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes);
526 __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
527 __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
528 __vmwrite(GUEST_SS_BASE, c->ss_base);
529 __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes);
531 __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
532 __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
533 __vmwrite(GUEST_FS_BASE, c->fs_base);
534 __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes);
536 __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
537 __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
538 __vmwrite(GUEST_GS_BASE, c->gs_base);
539 __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes);
541 __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
542 __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
543 __vmwrite(GUEST_TR_BASE, c->tr_base);
544 __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes);
546 __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
547 __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
548 __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
549 __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes);
551 __vmwrite(GUEST_SYSENTER_CS, c->sysenter_cs);
552 __vmwrite(GUEST_SYSENTER_ESP, c->sysenter_esp);
553 __vmwrite(GUEST_SYSENTER_EIP, c->sysenter_eip);
555 vmx_vmcs_exit(v);
557 paging_update_paging_modes(v);
558 return 0;
560 bad_cr3:
561 gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%"PRIx64"", c->cr3);
562 vmx_vmcs_exit(v);
563 return -EINVAL;
564 }
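/*
 * vmx_vmcs_save()/vmx_vmcs_restore() implement the VMCS half of HVM
 * save/restore. On restore, an unchanged CR3 is treated as a TLB flush,
 * while a new CR3 requires validating the frame and taking a reference on
 * the new top-level page table before releasing the old one.
 */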
566 #ifdef HVM_DEBUG_SUSPEND
567 static void dump_msr_state(struct vmx_msr_state *m)
568 {
569 int i = 0;
570 printk("**** msr state ****\n");
571 printk("shadow_gs=0x%lx, flags=0x%lx, msr_items:", m->shadow_gs, m->flags);
572 for (i = 0; i < VMX_MSR_COUNT; i++)
573 printk("0x%lx,", m->msrs[i]);
574 printk("\n");
575 }
576 #else
577 static void dump_msr_state(struct vmx_msr_state *m)
578 {
579 }
580 #endif
582 void vmx_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
583 {
584 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
585 unsigned long guest_flags = guest_state->flags;
587 data->shadow_gs = guest_state->shadow_gs;
589 /* save msrs */
590 data->flags = guest_flags;
591 data->msr_lstar = guest_state->msrs[VMX_INDEX_MSR_LSTAR];
592 data->msr_star = guest_state->msrs[VMX_INDEX_MSR_STAR];
593 data->msr_cstar = guest_state->msrs[VMX_INDEX_MSR_CSTAR];
594 data->msr_syscall_mask = guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
595 data->msr_efer = guest_state->msrs[VMX_INDEX_MSR_EFER];
597 data->tsc = hvm_get_guest_time(v);
599 dump_msr_state(guest_state);
600 }
602 void vmx_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
603 {
604 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
606 /* restore msrs */
607 guest_state->flags = data->flags;
608 guest_state->msrs[VMX_INDEX_MSR_LSTAR] = data->msr_lstar;
609 guest_state->msrs[VMX_INDEX_MSR_STAR] = data->msr_star;
610 guest_state->msrs[VMX_INDEX_MSR_CSTAR] = data->msr_cstar;
611 guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK] = data->msr_syscall_mask;
612 guest_state->msrs[VMX_INDEX_MSR_EFER] = data->msr_efer;
614 guest_state->shadow_gs = data->shadow_gs;
616 v->arch.hvm_vmx.vmxassist_enabled = !(data->cr0 & X86_CR0_PE);
618 hvm_set_guest_time(v, data->tsc);
620 dump_msr_state(guest_state);
621 }
624 void vmx_save_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
625 {
626 vmx_save_cpu_state(v, ctxt);
627 vmx_vmcs_enter(v);
628 vmx_vmcs_save(v, ctxt);
629 vmx_vmcs_exit(v);
630 }
632 int vmx_load_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
633 {
634 vmx_load_cpu_state(v, ctxt);
635 if (vmx_vmcs_restore(v, ctxt)) {
636 printk("vmx_vmcs restore failed!\n");
637 domain_crash(v->domain);
638 return -EINVAL;
639 }
641 return 0;
642 }
644 /*
645 * DR7 is saved and restored on every vmexit. Other debug registers only
646 * need to be restored if their value is going to affect execution -- i.e.,
647 * if one of the breakpoints is enabled. So mask out all bits that don't
648 * enable some breakpoint functionality.
649 */
650 #define DR7_ACTIVE_MASK 0xff
652 static inline void vmx_restore_dr(struct vcpu *v)
653 {
654 /* NB. __vmread() is not usable here, so we cannot read from the VMCS. */
655 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
656 __restore_debug_registers(v);
657 }
659 static void vmx_ctxt_switch_from(struct vcpu *v)
660 {
661 vmx_save_guest_msrs(v);
662 vmx_restore_host_msrs();
663 vmx_save_dr(v);
664 }
666 static void vmx_ctxt_switch_to(struct vcpu *v)
667 {
668 vmx_restore_guest_msrs(v);
669 vmx_restore_dr(v);
670 }
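/*
 * Context-switch hooks: switching away from a VMX vcpu saves its shadow GS
 * base, restores the host's syscall MSRs and re-arms the DR intercepts;
 * switching back reloads the guest MSRs and any active debug registers.
 */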
672 static void stop_vmx(void)
673 {
674 if ( !(read_cr4() & X86_CR4_VMXE) )
675 return;
677 __vmxoff();
678 clear_in_cr4(X86_CR4_VMXE);
679 }
681 static void vmx_store_cpu_guest_regs(
682 struct vcpu *v, struct cpu_user_regs *regs, unsigned long *crs)
683 {
684 vmx_vmcs_enter(v);
686 if ( regs != NULL )
687 {
688 regs->eflags = __vmread(GUEST_RFLAGS);
689 regs->ss = __vmread(GUEST_SS_SELECTOR);
690 regs->cs = __vmread(GUEST_CS_SELECTOR);
691 regs->eip = __vmread(GUEST_RIP);
692 regs->esp = __vmread(GUEST_RSP);
693 }
695 if ( crs != NULL )
696 {
697 crs[0] = v->arch.hvm_vmx.cpu_shadow_cr0;
698 crs[2] = v->arch.hvm_vmx.cpu_cr2;
699 crs[3] = v->arch.hvm_vmx.cpu_cr3;
700 crs[4] = v->arch.hvm_vmx.cpu_shadow_cr4;
701 }
703 vmx_vmcs_exit(v);
704 }
706 static void vmx_load_cpu_guest_regs(struct vcpu *v, struct cpu_user_regs *regs)
707 {
708 unsigned long base;
710 vmx_vmcs_enter(v);
712 __vmwrite(GUEST_SS_SELECTOR, regs->ss);
713 __vmwrite(GUEST_RSP, regs->esp);
715 /* NB. Bit 1 of RFLAGS must be set for VMENTRY to succeed. */
716 __vmwrite(GUEST_RFLAGS, regs->eflags | 2UL);
718 if ( regs->eflags & EF_TF )
719 __vm_set_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
720 else
721 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
723 if ( regs->eflags & EF_VM )
724 {
725 /*
726 * The VMX spec (section 4.3.1.2, Checks on Guest Segment
727 * Registers) says that virtual-8086 mode guests' segment
728 * base-address fields in the VMCS must be equal to their
729 * corresponding segment selector field shifted right by
730 * four bits upon vmentry.
731 */
732 base = __vmread(GUEST_CS_BASE);
733 if ( (regs->cs << 4) != base )
734 __vmwrite(GUEST_CS_BASE, regs->cs << 4);
735 base = __vmread(GUEST_SS_BASE);
736 if ( (regs->ss << 4) != base )
737 __vmwrite(GUEST_SS_BASE, regs->ss << 4);
738 }
740 __vmwrite(GUEST_CS_SELECTOR, regs->cs);
741 __vmwrite(GUEST_RIP, regs->eip);
743 vmx_vmcs_exit(v);
744 }
746 static unsigned long vmx_get_ctrl_reg(struct vcpu *v, unsigned int num)
747 {
748 switch ( num )
749 {
750 case 0:
751 return v->arch.hvm_vmx.cpu_cr0;
752 case 2:
753 return v->arch.hvm_vmx.cpu_cr2;
754 case 3:
755 return v->arch.hvm_vmx.cpu_cr3;
756 case 4:
757 return v->arch.hvm_vmx.cpu_shadow_cr4;
758 default:
759 BUG();
760 }
761 return 0; /* dummy */
762 }
764 static unsigned long vmx_get_segment_base(struct vcpu *v, enum x86_segment seg)
765 {
766 unsigned long base = 0;
767 int long_mode = 0;
769 ASSERT(v == current);
771 #ifdef __x86_64__
772 if ( vmx_long_mode_enabled(v) && (__vmread(GUEST_CS_AR_BYTES) & (1u<<13)) )
773 long_mode = 1;
774 #endif
776 switch ( seg )
777 {
778 case x86_seg_cs: if ( !long_mode ) base = __vmread(GUEST_CS_BASE); break;
779 case x86_seg_ds: if ( !long_mode ) base = __vmread(GUEST_DS_BASE); break;
780 case x86_seg_es: if ( !long_mode ) base = __vmread(GUEST_ES_BASE); break;
781 case x86_seg_fs: base = __vmread(GUEST_FS_BASE); break;
782 case x86_seg_gs: base = __vmread(GUEST_GS_BASE); break;
783 case x86_seg_ss: if ( !long_mode ) base = __vmread(GUEST_SS_BASE); break;
784 case x86_seg_tr: base = __vmread(GUEST_TR_BASE); break;
785 case x86_seg_gdtr: base = __vmread(GUEST_GDTR_BASE); break;
786 case x86_seg_idtr: base = __vmread(GUEST_IDTR_BASE); break;
787 case x86_seg_ldtr: base = __vmread(GUEST_LDTR_BASE); break;
788 default: BUG(); break;
789 }
791 return base;
792 }
794 static void vmx_get_segment_register(struct vcpu *v, enum x86_segment seg,
795 struct segment_register *reg)
796 {
797 u16 attr = 0;
799 ASSERT(v == current);
801 switch ( seg )
802 {
803 case x86_seg_cs:
804 reg->sel = __vmread(GUEST_CS_SELECTOR);
805 reg->limit = __vmread(GUEST_CS_LIMIT);
806 reg->base = __vmread(GUEST_CS_BASE);
807 attr = __vmread(GUEST_CS_AR_BYTES);
808 break;
809 case x86_seg_ds:
810 reg->sel = __vmread(GUEST_DS_SELECTOR);
811 reg->limit = __vmread(GUEST_DS_LIMIT);
812 reg->base = __vmread(GUEST_DS_BASE);
813 attr = __vmread(GUEST_DS_AR_BYTES);
814 break;
815 case x86_seg_es:
816 reg->sel = __vmread(GUEST_ES_SELECTOR);
817 reg->limit = __vmread(GUEST_ES_LIMIT);
818 reg->base = __vmread(GUEST_ES_BASE);
819 attr = __vmread(GUEST_ES_AR_BYTES);
820 break;
821 case x86_seg_fs:
822 reg->sel = __vmread(GUEST_FS_SELECTOR);
823 reg->limit = __vmread(GUEST_FS_LIMIT);
824 reg->base = __vmread(GUEST_FS_BASE);
825 attr = __vmread(GUEST_FS_AR_BYTES);
826 break;
827 case x86_seg_gs:
828 reg->sel = __vmread(GUEST_GS_SELECTOR);
829 reg->limit = __vmread(GUEST_GS_LIMIT);
830 reg->base = __vmread(GUEST_GS_BASE);
831 attr = __vmread(GUEST_GS_AR_BYTES);
832 break;
833 case x86_seg_ss:
834 reg->sel = __vmread(GUEST_SS_SELECTOR);
835 reg->limit = __vmread(GUEST_SS_LIMIT);
836 reg->base = __vmread(GUEST_SS_BASE);
837 attr = __vmread(GUEST_SS_AR_BYTES);
838 break;
839 case x86_seg_tr:
840 reg->sel = __vmread(GUEST_TR_SELECTOR);
841 reg->limit = __vmread(GUEST_TR_LIMIT);
842 reg->base = __vmread(GUEST_TR_BASE);
843 attr = __vmread(GUEST_TR_AR_BYTES);
844 break;
845 case x86_seg_gdtr:
846 reg->limit = __vmread(GUEST_GDTR_LIMIT);
847 reg->base = __vmread(GUEST_GDTR_BASE);
848 break;
849 case x86_seg_idtr:
850 reg->limit = __vmread(GUEST_IDTR_LIMIT);
851 reg->base = __vmread(GUEST_IDTR_BASE);
852 break;
853 case x86_seg_ldtr:
854 reg->sel = __vmread(GUEST_LDTR_SELECTOR);
855 reg->limit = __vmread(GUEST_LDTR_LIMIT);
856 reg->base = __vmread(GUEST_LDTR_BASE);
857 attr = __vmread(GUEST_LDTR_AR_BYTES);
858 break;
859 default:
860 BUG();
861 }
863 reg->attr.bytes = (attr & 0xff) | ((attr >> 4) & 0xf00);
864 }
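/*
 * The VMX access-rights (AR) encoding keeps type/S/DPL/P in bits 0-7 and
 * AVL/L/D/G in bits 12-15, with a gap in between; the expression above folds
 * it into the packed 12-bit attr.bytes layout of struct segment_register.
 */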
866 /* Make sure that xen intercepts any FP accesses from current */
867 static void vmx_stts(struct vcpu *v)
868 {
869 /* VMX depends on operating on the current vcpu */
870 ASSERT(v == current);
872 /*
873 * If the guest does not have TS enabled then we must cause and handle an
874 * exception on first use of the FPU. If the guest *does* have TS enabled
875 * then this is not necessary: no FPU activity can occur until the guest
876 * clears CR0.TS, and we will initialise the FPU when that happens.
877 */
878 if ( !(v->arch.hvm_vmx.cpu_shadow_cr0 & X86_CR0_TS) )
879 {
880 v->arch.hvm_vmx.cpu_cr0 |= X86_CR0_TS;
881 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
882 __vm_set_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
883 }
884 }
886 static void vmx_set_tsc_offset(struct vcpu *v, u64 offset)
887 {
888 vmx_vmcs_enter(v);
889 __vmwrite(TSC_OFFSET, offset);
890 #if defined (__i386__)
891 __vmwrite(TSC_OFFSET_HIGH, offset >> 32);
892 #endif
893 vmx_vmcs_exit(v);
894 }
896 static void vmx_init_ap_context(
897 struct vcpu_guest_context *ctxt, int vcpuid, int trampoline_vector)
898 {
899 memset(ctxt, 0, sizeof(*ctxt));
900 ctxt->user_regs.eip = VMXASSIST_BASE;
901 ctxt->user_regs.edx = vcpuid;
902 ctxt->user_regs.ebx = trampoline_vector;
903 }
905 void do_nmi(struct cpu_user_regs *);
907 static void vmx_init_hypercall_page(struct domain *d, void *hypercall_page)
908 {
909 char *p;
910 int i;
912 memset(hypercall_page, 0, PAGE_SIZE);
914 for ( i = 0; i < (PAGE_SIZE / 32); i++ )
915 {
916 p = (char *)(hypercall_page + (i * 32));
917 *(u8 *)(p + 0) = 0xb8; /* mov imm32, %eax */
918 *(u32 *)(p + 1) = i;
919 *(u8 *)(p + 5) = 0x0f; /* vmcall */
920 *(u8 *)(p + 6) = 0x01;
921 *(u8 *)(p + 7) = 0xc1;
922 *(u8 *)(p + 8) = 0xc3; /* ret */
923 }
925 /* Don't support HYPERVISOR_iret at the moment */
926 *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
927 }
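/*
 * Each 32-byte hypercall stub loads its hypercall number into %eax and then
 * executes VMCALL (opcode 0f 01 c1) followed by RET. The HYPERVISOR_iret
 * slot is deliberately filled with UD2 since it is not supported here.
 */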
929 static int vmx_realmode(struct vcpu *v)
930 {
931 unsigned long rflags;
933 ASSERT(v == current);
935 rflags = __vmread(GUEST_RFLAGS);
936 return rflags & X86_EFLAGS_VM;
937 }
939 static int vmx_guest_x86_mode(struct vcpu *v)
940 {
941 unsigned long cs_ar_bytes;
943 ASSERT(v == current);
945 cs_ar_bytes = __vmread(GUEST_CS_AR_BYTES);
947 if ( vmx_long_mode_enabled(v) && (cs_ar_bytes & (1u<<13)) )
948 return 8;
950 if ( vmx_realmode(v) )
951 return 2;
953 return ((cs_ar_bytes & (1u<<14)) ? 4 : 2);
954 }
956 static int vmx_pae_enabled(struct vcpu *v)
957 {
958 unsigned long cr4 = v->arch.hvm_vmx.cpu_shadow_cr4;
959 return (vmx_paging_enabled(v) && (cr4 & X86_CR4_PAE));
960 }
962 static void vmx_update_host_cr3(struct vcpu *v)
963 {
964 ASSERT( (v == current) || !vcpu_runnable(v) );
965 vmx_vmcs_enter(v);
966 __vmwrite(HOST_CR3, v->arch.cr3);
967 vmx_vmcs_exit(v);
968 }
970 static void vmx_update_guest_cr3(struct vcpu *v)
971 {
972 ASSERT( (v == current) || !vcpu_runnable(v) );
973 vmx_vmcs_enter(v);
974 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
975 vmx_vmcs_exit(v);
976 }
979 static void vmx_inject_exception(
980 unsigned int trapnr, int errcode, unsigned long cr2)
981 {
982 struct vcpu *v = current;
983 vmx_inject_hw_exception(v, trapnr, errcode);
984 if ( trapnr == TRAP_page_fault )
985 v->arch.hvm_vmx.cpu_cr2 = cr2;
986 }
988 static void vmx_update_vtpr(struct vcpu *v, unsigned long value)
989 {
990 /* VMX doesn't have a V_TPR field */
991 }
993 static int vmx_event_injection_faulted(struct vcpu *v)
994 {
994 {
995 unsigned int idtv_info_field;
997 ASSERT(v == current);
999 idtv_info_field = __vmread(IDT_VECTORING_INFO_FIELD);
1000 return (idtv_info_field & INTR_INFO_VALID_MASK);
1001 }
1003 static struct hvm_function_table vmx_function_table = {
1004 .disable = stop_vmx,
1005 .vcpu_initialise = vmx_vcpu_initialise,
1006 .vcpu_destroy = vmx_vcpu_destroy,
1007 .store_cpu_guest_regs = vmx_store_cpu_guest_regs,
1008 .load_cpu_guest_regs = vmx_load_cpu_guest_regs,
1009 .save_cpu_ctxt = vmx_save_vmcs_ctxt,
1010 .load_cpu_ctxt = vmx_load_vmcs_ctxt,
1011 .paging_enabled = vmx_paging_enabled,
1012 .long_mode_enabled = vmx_long_mode_enabled,
1013 .pae_enabled = vmx_pae_enabled,
1014 .guest_x86_mode = vmx_guest_x86_mode,
1015 .get_guest_ctrl_reg = vmx_get_ctrl_reg,
1016 .get_segment_base = vmx_get_segment_base,
1017 .get_segment_register = vmx_get_segment_register,
1018 .update_host_cr3 = vmx_update_host_cr3,
1019 .update_guest_cr3 = vmx_update_guest_cr3,
1020 .update_vtpr = vmx_update_vtpr,
1021 .stts = vmx_stts,
1022 .set_tsc_offset = vmx_set_tsc_offset,
1023 .inject_exception = vmx_inject_exception,
1024 .init_ap_context = vmx_init_ap_context,
1025 .init_hypercall_page = vmx_init_hypercall_page,
1026 .event_injection_faulted = vmx_event_injection_faulted
1027 };
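/*
 * This table is handed to the generic HVM layer by start_vmx() below via
 * hvm_enable(), which is what initialises hvm_funcs for VMX hardware.
 */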
1029 int start_vmx(void)
1031 u32 eax, edx;
1032 struct vmcs_struct *vmcs;
1034 /*
1035 * Xen does not fill x86_capability words except 0.
1036 */
1037 boot_cpu_data.x86_capability[4] = cpuid_ecx(1);
1039 if ( !test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability) )
1040 return 0;
1042 rdmsr(IA32_FEATURE_CONTROL_MSR, eax, edx);
1044 if ( eax & IA32_FEATURE_CONTROL_MSR_LOCK )
1046 if ( (eax & IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON) == 0x0 )
1048 printk("VMX disabled by Feature Control MSR.\n");
1049 return 0;
1052 else
1054 wrmsr(IA32_FEATURE_CONTROL_MSR,
1055 IA32_FEATURE_CONTROL_MSR_LOCK |
1056 IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON, 0);
1059 set_in_cr4(X86_CR4_VMXE);
1061 vmx_init_vmcs_config();
1063 if ( smp_processor_id() == 0 )
1064 setup_vmcs_dump();
1066 if ( (vmcs = vmx_alloc_host_vmcs()) == NULL )
1068 clear_in_cr4(X86_CR4_VMXE);
1069 printk("Failed to allocate host VMCS\n");
1070 return 0;
1073 if ( __vmxon(virt_to_maddr(vmcs)) )
1075 clear_in_cr4(X86_CR4_VMXE);
1076 printk("VMXON failed\n");
1077 vmx_free_host_vmcs(vmcs);
1078 return 0;
1081 printk("VMXON is done\n");
1083 vmx_save_host_msrs();
1085 hvm_enable(&vmx_function_table);
1087 return 1;
1090 /*
1091 * Not all VM exits provide a valid value in the VM-exit instruction length field.
1092 * Callers must know what they're doing!
1093 */
1094 static int __get_instruction_length(void)
1096 int len;
1097 len = __vmread(VM_EXIT_INSTRUCTION_LEN); /* Safe: callers audited */
1098 BUG_ON((len < 1) || (len > 15));
1099 return len;
1102 static void inline __update_guest_eip(unsigned long inst_len)
1104 unsigned long current_eip;
1106 current_eip = __vmread(GUEST_RIP);
1107 __vmwrite(GUEST_RIP, current_eip + inst_len);
1108 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
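/*
 * After emulating an instruction we advance RIP by the VM-exit instruction
 * length and clear the interruptibility state, so any STI/MOV-SS blocking
 * from the emulated instruction does not linger into the next vmentry.
 */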
1111 static int vmx_do_page_fault(unsigned long va, struct cpu_user_regs *regs)
1113 int result;
1115 #if 0 /* keep for debugging */
1117 unsigned long eip, cs;
1119 cs = __vmread(GUEST_CS_BASE);
1120 eip = __vmread(GUEST_RIP);
1121 HVM_DBG_LOG(DBG_LEVEL_VMMU,
1122 "vmx_do_page_fault = 0x%lx, cs_base=%lx, "
1123 "eip = %lx, error_code = %lx\n",
1124 va, cs, eip, (unsigned long)regs->error_code);
1126 #endif
1128 result = paging_fault(va, regs);
1130 TRACE_VMEXIT(2, result);
1131 #if 0
1132 if ( !result )
1134 eip = __vmread(GUEST_RIP);
1135 printk("vmx pgfault to guest va=%lx eip=%lx\n", va, eip);
1137 #endif
1139 return result;
1142 static void vmx_do_no_device_fault(void)
1144 struct vcpu *v = current;
1146 setup_fpu(current);
1147 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
1149 /* Disable TS in guest CR0 unless the guest wants the exception too. */
1150 if ( !(v->arch.hvm_vmx.cpu_shadow_cr0 & X86_CR0_TS) )
1152 v->arch.hvm_vmx.cpu_cr0 &= ~X86_CR0_TS;
1153 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
1157 #define bitmaskof(idx) (1U << ((idx) & 31))
1158 static void vmx_do_cpuid(struct cpu_user_regs *regs)
1160 unsigned int input = (unsigned int)regs->eax;
1161 unsigned int count = (unsigned int)regs->ecx;
1162 unsigned int eax, ebx, ecx, edx;
1164 if ( input == 0x00000004 )
1166 cpuid_count(input, count, &eax, &ebx, &ecx, &edx);
1167 eax &= NUM_CORES_RESET_MASK;
1169 else if ( input == 0x40000003 )
1171 /*
1172 * NB. Unsupported interface for private use of VMXASSIST only.
1173 * Note that this leaf lives at <max-hypervisor-leaf> + 1.
1174 */
1175 u64 value = ((u64)regs->edx << 32) | (u32)regs->ecx;
1176 unsigned long mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
1177 struct vcpu *v = current;
1178 char *p;
1180 gdprintk(XENLOG_INFO, "Input address is 0x%"PRIx64".\n", value);
1182 /* 8-byte aligned valid pseudophys address from vmxassist, please. */
1183 if ( (value & 7) || (mfn == INVALID_MFN) ||
1184 !v->arch.hvm_vmx.vmxassist_enabled )
1186 domain_crash(v->domain);
1187 return;
1190 p = map_domain_page(mfn);
1191 value = *((uint64_t *)(p + (value & (PAGE_SIZE - 1))));
1192 unmap_domain_page(p);
1194 gdprintk(XENLOG_INFO, "Output value is 0x%"PRIx64".\n", value);
1195 ecx = (u32)value;
1196 edx = (u32)(value >> 32);
1197 } else {
1198 hvm_cpuid(input, &eax, &ebx, &ecx, &edx);
1200 if ( input == 0x00000001 )
1202 /* Mask off reserved bits. */
1203 ecx &= ~VMX_VCPU_CPUID_L1_ECX_RESERVED;
1205 ebx &= NUM_THREADS_RESET_MASK;
1207 /* Unsupportable for virtualised CPUs. */
1208 ecx &= ~(bitmaskof(X86_FEATURE_VMXE) |
1209 bitmaskof(X86_FEATURE_EST) |
1210 bitmaskof(X86_FEATURE_TM2) |
1211 bitmaskof(X86_FEATURE_CID));
1213 edx &= ~(bitmaskof(X86_FEATURE_HT) |
1214 bitmaskof(X86_FEATURE_ACPI) |
1215 bitmaskof(X86_FEATURE_ACC));
1218 if ( input == 0x00000006 || input == 0x00000009 || input == 0x0000000A )
1219 eax = ebx = ecx = edx = 0x0;
1222 regs->eax = (unsigned long)eax;
1223 regs->ebx = (unsigned long)ebx;
1224 regs->ecx = (unsigned long)ecx;
1225 regs->edx = (unsigned long)edx;
1228 #define CASE_GET_REG_P(REG, reg) \
1229 case REG_ ## REG: reg_p = (unsigned long *)&(regs->reg); break
1231 #ifdef __i386__
1232 #define CASE_EXTEND_GET_REG_P
1233 #else
1234 #define CASE_EXTEND_GET_REG_P \
1235 CASE_GET_REG_P(R8, r8); \
1236 CASE_GET_REG_P(R9, r9); \
1237 CASE_GET_REG_P(R10, r10); \
1238 CASE_GET_REG_P(R11, r11); \
1239 CASE_GET_REG_P(R12, r12); \
1240 CASE_GET_REG_P(R13, r13); \
1241 CASE_GET_REG_P(R14, r14); \
1242 CASE_GET_REG_P(R15, r15)
1243 #endif
1245 static void vmx_dr_access(unsigned long exit_qualification,
1246 struct cpu_user_regs *regs)
1248 struct vcpu *v = current;
1250 v->arch.hvm_vcpu.flag_dr_dirty = 1;
1252 /* We could probably be smarter about this */
1253 __restore_debug_registers(v);
1255 /* Allow guest direct access to DR registers */
1256 v->arch.hvm_vcpu.u.vmx.exec_control &= ~CPU_BASED_MOV_DR_EXITING;
1257 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
1258 v->arch.hvm_vcpu.u.vmx.exec_control);
1261 /*
1262 * Invalidate the TLB for va. Invalidate the shadow page corresponding
1263 * to the address va.
1264 */
1265 static void vmx_do_invlpg(unsigned long va)
1267 unsigned long eip;
1268 struct vcpu *v = current;
1270 eip = __vmread(GUEST_RIP);
1272 HVM_DBG_LOG(DBG_LEVEL_VMMU, "eip=%lx, va=%lx",
1273 eip, va);
1275 /*
1276 * We do the safest thing first, then try to update the shadow
1277 * by copying from the guest.
1278 */
1279 paging_invlpg(v, va);
1283 static int vmx_check_descriptor(int long_mode, unsigned long eip, int inst_len,
1284 enum x86_segment seg, unsigned long *base,
1285 u32 *limit, u32 *ar_bytes)
1287 enum vmcs_field ar_field, base_field, limit_field;
1289 *base = 0;
1290 *limit = 0;
1291 if ( seg != x86_seg_es )
1293 unsigned char inst[MAX_INST_LEN];
1294 int i;
1295 extern int inst_copy_from_guest(unsigned char *, unsigned long, int);
1297 if ( !long_mode )
1298 eip += __vmread(GUEST_CS_BASE);
1299 memset(inst, 0, MAX_INST_LEN);
1300 if ( inst_copy_from_guest(inst, eip, inst_len) != inst_len )
1302 gdprintk(XENLOG_ERR, "Get guest instruction failed\n");
1303 domain_crash(current->domain);
1304 return 0;
1307 for ( i = 0; i < inst_len; i++ )
1309 switch ( inst[i] )
1311 case 0xf3: /* REPZ */
1312 case 0xf2: /* REPNZ */
1313 case 0xf0: /* LOCK */
1314 case 0x66: /* data32 */
1315 case 0x67: /* addr32 */
1316 #ifdef __x86_64__
1317 case 0x40 ... 0x4f: /* REX */
1318 #endif
1319 continue;
1320 case 0x2e: /* CS */
1321 seg = x86_seg_cs;
1322 continue;
1323 case 0x36: /* SS */
1324 seg = x86_seg_ss;
1325 continue;
1326 case 0x26: /* ES */
1327 seg = x86_seg_es;
1328 continue;
1329 case 0x64: /* FS */
1330 seg = x86_seg_fs;
1331 continue;
1332 case 0x65: /* GS */
1333 seg = x86_seg_gs;
1334 continue;
1335 case 0x3e: /* DS */
1336 seg = x86_seg_ds;
1337 continue;
1342 switch ( seg )
1344 case x86_seg_cs:
1345 ar_field = GUEST_CS_AR_BYTES;
1346 base_field = GUEST_CS_BASE;
1347 limit_field = GUEST_CS_LIMIT;
1348 break;
1349 case x86_seg_ds:
1350 ar_field = GUEST_DS_AR_BYTES;
1351 base_field = GUEST_DS_BASE;
1352 limit_field = GUEST_DS_LIMIT;
1353 break;
1354 case x86_seg_es:
1355 ar_field = GUEST_ES_AR_BYTES;
1356 base_field = GUEST_ES_BASE;
1357 limit_field = GUEST_ES_LIMIT;
1358 break;
1359 case x86_seg_fs:
1360 ar_field = GUEST_FS_AR_BYTES;
1361 base_field = GUEST_FS_BASE;
1362 limit_field = GUEST_FS_LIMIT;
1363 break;
1364 case x86_seg_gs:
1365 ar_field = GUEST_GS_AR_BYTES;
1366 base_field = GUEST_GS_BASE;
1367 limit_field = GUEST_GS_LIMIT;
1368 break;
1369 case x86_seg_ss:
1370 ar_field = GUEST_SS_AR_BYTES;
1371 base_field = GUEST_SS_BASE;
1372 limit_field = GUEST_SS_LIMIT;
1373 break;
1374 default:
1375 BUG();
1376 return 0;
1379 if ( !long_mode || seg == x86_seg_fs || seg == x86_seg_gs )
1381 *base = __vmread(base_field);
1382 *limit = __vmread(limit_field);
1384 *ar_bytes = __vmread(ar_field);
1386 return !(*ar_bytes & 0x10000);
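/*
 * Bit 16 of the VMX access-rights field is the "segment unusable" flag, so
 * the descriptor check succeeds only for a usable segment. The prefix scan
 * above exists to honour segment overrides on the string I/O instruction
 * when choosing which descriptor to validate.
 */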
1389 static void vmx_io_instruction(unsigned long exit_qualification,
1390 unsigned long inst_len)
1392 struct cpu_user_regs *regs;
1393 struct hvm_io_op *pio_opp;
1394 unsigned int port, size;
1395 int dir, df, vm86;
1397 pio_opp = &current->arch.hvm_vcpu.io_op;
1398 pio_opp->instr = INSTR_PIO;
1399 pio_opp->flags = 0;
1401 regs = &pio_opp->io_context;
1403 /* Copy current guest state into io instruction state structure. */
1404 memcpy(regs, guest_cpu_user_regs(), HVM_CONTEXT_STACK_BYTES);
1405 hvm_store_cpu_guest_regs(current, regs, NULL);
1407 vm86 = regs->eflags & X86_EFLAGS_VM ? 1 : 0;
1408 df = regs->eflags & X86_EFLAGS_DF ? 1 : 0;
1410 HVM_DBG_LOG(DBG_LEVEL_IO, "vm86 %d, eip=%x:%lx, "
1411 "exit_qualification = %lx",
1412 vm86, regs->cs, (unsigned long)regs->eip, exit_qualification);
1414 if ( test_bit(6, &exit_qualification) )
1415 port = (exit_qualification >> 16) & 0xFFFF;
1416 else
1417 port = regs->edx & 0xffff;
1419 TRACE_VMEXIT(1, port);
1421 size = (exit_qualification & 7) + 1;
1422 dir = test_bit(3, &exit_qualification); /* direction */
1424 if ( test_bit(4, &exit_qualification) ) { /* string instruction */
1425 unsigned long addr, count = 1, base;
1426 paddr_t paddr;
1427 unsigned long gfn;
1428 u32 ar_bytes, limit;
1429 int sign = regs->eflags & X86_EFLAGS_DF ? -1 : 1;
1430 int long_mode = 0;
1432 ar_bytes = __vmread(GUEST_CS_AR_BYTES);
1433 #ifdef __x86_64__
1434 if ( vmx_long_mode_enabled(current) && (ar_bytes & (1u<<13)) )
1435 long_mode = 1;
1436 #endif
1437 addr = __vmread(GUEST_LINEAR_ADDRESS);
1439 if ( test_bit(5, &exit_qualification) ) { /* "rep" prefix */
1440 pio_opp->flags |= REPZ;
1441 count = regs->ecx;
1442 if ( !long_mode && (vm86 || !(ar_bytes & (1u<<14))) )
1443 count &= 0xFFFF;
1446 /*
1447 * In protected mode, guest linear address is invalid if the
1448 * selector is null.
1449 */
1450 if ( !vmx_check_descriptor(long_mode, regs->eip, inst_len,
1451 dir==IOREQ_WRITE ? x86_seg_ds : x86_seg_es,
1452 &base, &limit, &ar_bytes) ) {
1453 if ( !long_mode ) {
1454 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1455 return;
1457 addr = dir == IOREQ_WRITE ? base + regs->esi : regs->edi;
1460 if ( !long_mode ) {
1461 unsigned long ea = addr - base;
1463 /* Segment must be readable for outs and writeable for ins. */
1464 if ( dir == IOREQ_WRITE ? (ar_bytes & 0xa) == 0x8
1465 : (ar_bytes & 0xa) != 0x2 ) {
1466 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1467 return;
1470 /* Offset must be within limits. */
1471 ASSERT(ea == (u32)ea);
1472 if ( (u32)(ea + size - 1) < (u32)ea ||
1473 (ar_bytes & 0xc) != 0x4 ? ea + size - 1 > limit
1474 : ea <= limit )
1476 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1477 return;
1480 /* Check the limit for repeated instructions, as above we checked
1481 only the first instance. Truncate the count if a limit violation
1482 would occur. Note that the checking is not necessary for page
1483 granular segments as transfers crossing page boundaries will be
1484 broken up anyway. */
1485 if ( !(ar_bytes & (1u<<15)) && count > 1 )
1487 if ( (ar_bytes & 0xc) != 0x4 )
1489 /* expand-up */
1490 if ( !df )
1492 if ( ea + count * size - 1 < ea ||
1493 ea + count * size - 1 > limit )
1494 count = (limit + 1UL - ea) / size;
1496 else
1498 if ( count - 1 > ea / size )
1499 count = ea / size + 1;
1502 else
1504 /* expand-down */
1505 if ( !df )
1507 if ( count - 1 > -(s32)ea / size )
1508 count = -(s32)ea / size + 1UL;
1510 else
1512 if ( ea < (count - 1) * size ||
1513 ea - (count - 1) * size <= limit )
1514 count = (ea - limit - 1) / size + 1;
1517 ASSERT(count);
1520 #ifdef __x86_64__
1521 else
1523 if ( !is_canonical_address(addr) ||
1524 !is_canonical_address(addr + size - 1) )
1526 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1527 return;
1529 if ( count > (1UL << 48) / size )
1530 count = (1UL << 48) / size;
1531 if ( !(regs->eflags & EF_DF) )
1533 if ( addr + count * size - 1 < addr ||
1534 !is_canonical_address(addr + count * size - 1) )
1535 count = (addr & ~((1UL << 48) - 1)) / size;
1537 else
1539 if ( (count - 1) * size > addr ||
1540 !is_canonical_address(addr + (count - 1) * size) )
1541 count = (addr & ~((1UL << 48) - 1)) / size + 1;
1543 ASSERT(count);
1545 #endif
1547 /* Translate the address to a physical address */
1548 gfn = paging_gva_to_gfn(current, addr);
1549 if ( gfn == INVALID_GFN )
1551 /* The guest does not have the RAM address mapped.
1552 * Need to send in a page fault */
1553 int errcode = 0;
1554 /* IO read --> memory write */
1555 if ( dir == IOREQ_READ ) errcode |= PFEC_write_access;
1556 vmx_inject_exception(TRAP_page_fault, errcode, addr);
1557 return;
1559 paddr = (paddr_t)gfn << PAGE_SHIFT | (addr & ~PAGE_MASK);
1561 /*
1562 * Handle string pio instructions that cross pages or that
1563 * are unaligned. See the comments in hvm_domain.c/handle_mmio()
1564 */
1565 if ( (addr & PAGE_MASK) != ((addr + size - 1) & PAGE_MASK) ) {
1566 unsigned long value = 0;
1568 pio_opp->flags |= OVERLAP;
1570 if ( dir == IOREQ_WRITE ) /* OUTS */
1572 if ( hvm_paging_enabled(current) )
1574 int rv = hvm_copy_from_guest_virt(&value, addr, size);
1575 if ( rv != 0 )
1577 /* Failed on the page-spanning copy. Inject PF into
1578 * the guest for the address where we failed. */
1579 addr += size - rv;
1580 gdprintk(XENLOG_DEBUG, "Pagefault reading non-io side "
1581 "of a page-spanning PIO: va=%#lx\n", addr);
1582 vmx_inject_exception(TRAP_page_fault, 0, addr);
1583 return;
1586 else
1587 (void) hvm_copy_from_guest_phys(&value, addr, size);
1588 } else /* dir != IOREQ_WRITE */
1589 /* Remember where to write the result, as a *VA*.
1590 * Must be a VA so we can handle the page overlap
1591 * correctly in hvm_pio_assist() */
1592 pio_opp->addr = addr;
1594 if ( count == 1 )
1595 regs->eip += inst_len;
1597 send_pio_req(port, 1, size, value, dir, df, 0);
1598 } else {
1599 unsigned long last_addr = sign > 0 ? addr + count * size - 1
1600 : addr - (count - 1) * size;
1602 if ( (addr & PAGE_MASK) != (last_addr & PAGE_MASK) )
1604 if ( sign > 0 )
1605 count = (PAGE_SIZE - (addr & ~PAGE_MASK)) / size;
1606 else
1607 count = (addr & ~PAGE_MASK) / size + 1;
1608 } else
1609 regs->eip += inst_len;
1611 send_pio_req(port, count, size, paddr, dir, df, 1);
1613 } else {
1614 if ( port == 0xe9 && dir == IOREQ_WRITE && size == 1 )
1615 hvm_print_line(current, regs->eax); /* guest debug output */
1617 if ( dir == IOREQ_WRITE )
1618 TRACE_VMEXIT(2, regs->eax);
1620 regs->eip += inst_len;
1621 send_pio_req(port, 1, size, regs->eax, dir, df, 0);
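/*
 * The I/O exit qualification is decoded as: bits 0-2 = access size - 1,
 * bit 3 = direction (set for IN), bit 4 = string instruction, bit 5 = REP
 * prefix, bit 6 = immediate port operand, bits 16-31 = port number,
 * matching the tests on exit_qualification above.
 */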
1625 static void vmx_world_save(struct vcpu *v, struct vmx_assist_context *c)
1627 /* NB. Skip transition instruction. */
1628 c->eip = __vmread(GUEST_RIP);
1629 c->eip += __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
1631 c->esp = __vmread(GUEST_RSP);
1632 c->eflags = __vmread(GUEST_RFLAGS);
1634 c->cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;
1635 c->cr3 = v->arch.hvm_vmx.cpu_cr3;
1636 c->cr4 = v->arch.hvm_vmx.cpu_shadow_cr4;
1638 c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
1639 c->idtr_base = __vmread(GUEST_IDTR_BASE);
1641 c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
1642 c->gdtr_base = __vmread(GUEST_GDTR_BASE);
1644 c->cs_sel = __vmread(GUEST_CS_SELECTOR);
1645 c->cs_limit = __vmread(GUEST_CS_LIMIT);
1646 c->cs_base = __vmread(GUEST_CS_BASE);
1647 c->cs_arbytes.bytes = __vmread(GUEST_CS_AR_BYTES);
1649 c->ds_sel = __vmread(GUEST_DS_SELECTOR);
1650 c->ds_limit = __vmread(GUEST_DS_LIMIT);
1651 c->ds_base = __vmread(GUEST_DS_BASE);
1652 c->ds_arbytes.bytes = __vmread(GUEST_DS_AR_BYTES);
1654 c->es_sel = __vmread(GUEST_ES_SELECTOR);
1655 c->es_limit = __vmread(GUEST_ES_LIMIT);
1656 c->es_base = __vmread(GUEST_ES_BASE);
1657 c->es_arbytes.bytes = __vmread(GUEST_ES_AR_BYTES);
1659 c->ss_sel = __vmread(GUEST_SS_SELECTOR);
1660 c->ss_limit = __vmread(GUEST_SS_LIMIT);
1661 c->ss_base = __vmread(GUEST_SS_BASE);
1662 c->ss_arbytes.bytes = __vmread(GUEST_SS_AR_BYTES);
1664 c->fs_sel = __vmread(GUEST_FS_SELECTOR);
1665 c->fs_limit = __vmread(GUEST_FS_LIMIT);
1666 c->fs_base = __vmread(GUEST_FS_BASE);
1667 c->fs_arbytes.bytes = __vmread(GUEST_FS_AR_BYTES);
1669 c->gs_sel = __vmread(GUEST_GS_SELECTOR);
1670 c->gs_limit = __vmread(GUEST_GS_LIMIT);
1671 c->gs_base = __vmread(GUEST_GS_BASE);
1672 c->gs_arbytes.bytes = __vmread(GUEST_GS_AR_BYTES);
1674 c->tr_sel = __vmread(GUEST_TR_SELECTOR);
1675 c->tr_limit = __vmread(GUEST_TR_LIMIT);
1676 c->tr_base = __vmread(GUEST_TR_BASE);
1677 c->tr_arbytes.bytes = __vmread(GUEST_TR_AR_BYTES);
1679 c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR);
1680 c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT);
1681 c->ldtr_base = __vmread(GUEST_LDTR_BASE);
1682 c->ldtr_arbytes.bytes = __vmread(GUEST_LDTR_AR_BYTES);
1685 static int vmx_world_restore(struct vcpu *v, struct vmx_assist_context *c)
1687 unsigned long mfn, old_base_mfn;
1689 __vmwrite(GUEST_RIP, c->eip);
1690 __vmwrite(GUEST_RSP, c->esp);
1691 __vmwrite(GUEST_RFLAGS, c->eflags);
1693 v->arch.hvm_vmx.cpu_shadow_cr0 = c->cr0;
1694 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
1696 if ( !vmx_paging_enabled(v) )
1697 goto skip_cr3;
1699 if ( c->cr3 == v->arch.hvm_vmx.cpu_cr3 )
1701 /*
1702 * This is simple TLB flush, implying the guest has
1703 * removed some translation or changed page attributes.
1704 * We simply invalidate the shadow.
1705 */
1706 mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
1707 if ( mfn != pagetable_get_pfn(v->arch.guest_table) )
1708 goto bad_cr3;
1710 else
1712 /*
1713 * If different, make a shadow. Check if the PDBR is valid
1714 * first.
1715 */
1716 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %x", c->cr3);
1717 mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
1718 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
1719 goto bad_cr3;
1720 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1721 v->arch.guest_table = pagetable_from_pfn(mfn);
1722 if (old_base_mfn)
1723 put_page(mfn_to_page(old_base_mfn));
1724 v->arch.hvm_vmx.cpu_cr3 = c->cr3;
1727 skip_cr3:
1728 if ( !vmx_paging_enabled(v) )
1729 HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
1730 else
1731 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3);
1733 __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
1734 v->arch.hvm_vmx.cpu_shadow_cr4 = c->cr4;
1735 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);
1737 __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
1738 __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
1740 __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
1741 __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
1743 __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
1744 __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
1745 __vmwrite(GUEST_CS_BASE, c->cs_base);
1746 __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes.bytes);
1748 __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
1749 __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
1750 __vmwrite(GUEST_DS_BASE, c->ds_base);
1751 __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes.bytes);
1753 __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
1754 __vmwrite(GUEST_ES_LIMIT, c->es_limit);
1755 __vmwrite(GUEST_ES_BASE, c->es_base);
1756 __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes.bytes);
1758 __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
1759 __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
1760 __vmwrite(GUEST_SS_BASE, c->ss_base);
1761 __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes.bytes);
1763 __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
1764 __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
1765 __vmwrite(GUEST_FS_BASE, c->fs_base);
1766 __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes.bytes);
1768 __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
1769 __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
1770 __vmwrite(GUEST_GS_BASE, c->gs_base);
1771 __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes.bytes);
1773 __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
1774 __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
1775 __vmwrite(GUEST_TR_BASE, c->tr_base);
1776 __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes.bytes);
1778 __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
1779 __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
1780 __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
1781 __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes.bytes);
1783 paging_update_paging_modes(v);
1784 return 0;
1786 bad_cr3:
1787 gdprintk(XENLOG_ERR, "Invalid CR3 value=%x", c->cr3);
1788 return -EINVAL;
1791 enum { VMX_ASSIST_INVOKE = 0, VMX_ASSIST_RESTORE };
1793 static int vmx_assist(struct vcpu *v, int mode)
1795 struct vmx_assist_context c;
1796 u32 magic;
1797 u32 cp;
1799 /* make sure vmxassist exists (this is not an error) */
1800 if (hvm_copy_from_guest_phys(&magic, VMXASSIST_MAGIC_OFFSET,
1801 sizeof(magic)))
1802 return 0;
1803 if (magic != VMXASSIST_MAGIC)
1804 return 0;
1806 switch (mode) {
1807 /*
1808 * Transfer control to vmxassist.
1809 * Store the current context in VMXASSIST_OLD_CONTEXT and load
1810 * the new VMXASSIST_NEW_CONTEXT context. This context was created
1811 * by vmxassist and will transfer control to it.
1812 */
1813 case VMX_ASSIST_INVOKE:
1814 /* save the old context */
1815 if (hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)))
1816 goto error;
1817 if (cp != 0) {
1818 vmx_world_save(v, &c);
1819 if (hvm_copy_to_guest_phys(cp, &c, sizeof(c)))
1820 goto error;
1823 /* restore the new context, this should activate vmxassist */
1824 if (hvm_copy_from_guest_phys(&cp, VMXASSIST_NEW_CONTEXT, sizeof(cp)))
1825 goto error;
1826 if (cp != 0) {
1827 if (hvm_copy_from_guest_phys(&c, cp, sizeof(c)))
1828 goto error;
1829 if ( vmx_world_restore(v, &c) != 0 )
1830 goto error;
1831 v->arch.hvm_vmx.vmxassist_enabled = 1;
1832 return 1;
1834 break;
1836 /*
1837 * Restore the VMXASSIST_OLD_CONTEXT that was saved by
1838 * VMX_ASSIST_INVOKE above.
1839 */
1840 case VMX_ASSIST_RESTORE:
1841 /* save the old context */
1842 if (hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)))
1843 goto error;
1844 if (cp != 0) {
1845 if (hvm_copy_from_guest_phys(&c, cp, sizeof(c)))
1846 goto error;
1847 if ( vmx_world_restore(v, &c) != 0 )
1848 goto error;
1849 v->arch.hvm_vmx.vmxassist_enabled = 0;
1850 return 1;
1852 break;
1855 error:
1856 gdprintk(XENLOG_ERR, "Failed to transfer to vmxassist\n");
1857 domain_crash(v->domain);
1858 return 0;
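/*
 * vmx_assist() performs the world switch used to emulate real mode:
 * VMX_ASSIST_INVOKE stashes the current guest state in VMXASSIST_OLD_CONTEXT
 * and loads the VMXASSIST_NEW_CONTEXT prepared by vmxassist, while
 * VMX_ASSIST_RESTORE undoes it once the guest re-enters protected mode.
 */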
1861 static int vmx_set_cr0(unsigned long value)
1863 struct vcpu *v = current;
1864 unsigned long mfn;
1865 unsigned long eip;
1866 int paging_enabled;
1867 unsigned long vm_entry_value;
1868 unsigned long old_cr0;
1869 unsigned long old_base_mfn;
1871 /*
1872 * CR0: We don't want to lose PE and PG.
1873 */
1874 old_cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;
1875 paging_enabled = (old_cr0 & X86_CR0_PE) && (old_cr0 & X86_CR0_PG);
1877 /* TS cleared? Then initialise FPU now. */
1878 if ( !(value & X86_CR0_TS) )
1880 setup_fpu(v);
1881 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
1884 v->arch.hvm_vmx.cpu_cr0 = (value | X86_CR0_PE | X86_CR0_PG
1885 | X86_CR0_NE | X86_CR0_WP);
1886 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
1888 v->arch.hvm_vmx.cpu_shadow_cr0 = value;
1889 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
1891 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx\n", value);
1893 if ( (value & X86_CR0_PE) && (value & X86_CR0_PG) && !paging_enabled )
1895 /*
1896 * Trying to enable guest paging.
1897 * The guest CR3 must be pointing to the guest physical.
1898 */
1899 mfn = get_mfn_from_gpfn(v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT);
1900 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
1902 gdprintk(XENLOG_ERR, "Invalid CR3 value = %lx (mfn=%lx)\n",
1903 v->arch.hvm_vmx.cpu_cr3, mfn);
1904 domain_crash(v->domain);
1905 return 0;
1908 #if defined(__x86_64__)
1909 if ( vmx_lme_is_set(v) )
1911 if ( !(v->arch.hvm_vmx.cpu_shadow_cr4 & X86_CR4_PAE) )
1913 HVM_DBG_LOG(DBG_LEVEL_1, "Guest enabled paging "
1914 "with EFER.LME set but not CR4.PAE\n");
1915 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
1917 else
1919 HVM_DBG_LOG(DBG_LEVEL_1, "Enabling long mode\n");
1920 v->arch.hvm_vmx.msr_state.msrs[VMX_INDEX_MSR_EFER]
1921 |= EFER_LMA;
1922 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
1923 vm_entry_value |= VM_ENTRY_IA32E_MODE;
1924 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1927 #endif
1929 /*
1930 * Now arch.guest_table points to machine physical.
1931 */
1932 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1933 v->arch.guest_table = pagetable_from_pfn(mfn);
1934 if (old_base_mfn)
1935 put_page(mfn_to_page(old_base_mfn));
1936 paging_update_paging_modes(v);
1938 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
1939 (unsigned long) (mfn << PAGE_SHIFT));
1941 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
1942 v->arch.hvm_vmx.cpu_cr3, mfn);
1945 if ( !((value & X86_CR0_PE) && (value & X86_CR0_PG)) && paging_enabled )
1946 if ( v->arch.hvm_vmx.cpu_cr3 ) {
1947 put_page(mfn_to_page(get_mfn_from_gpfn(
1948 v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)));
1949 v->arch.guest_table = pagetable_null();
1952 /*
1953 * VMX does not implement real-mode virtualization. We emulate
1954 * real-mode by performing a world switch to VMXAssist whenever
1955 * a partition disables the CR0.PE bit.
1956 */
1957 if ( (value & X86_CR0_PE) == 0 )
1959 if ( value & X86_CR0_PG ) {
1960 /* inject GP here */
1961 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
1962 return 0;
1963 } else {
1964 /*
1965 * Disable paging here.
1966 * Same as the PE == 1 && PG == 0 case.
1967 */
1968 if ( vmx_long_mode_enabled(v) )
1970 v->arch.hvm_vmx.msr_state.msrs[VMX_INDEX_MSR_EFER]
1971 &= ~EFER_LMA;
1972 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
1973 vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
1974 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1978 if ( vmx_assist(v, VMX_ASSIST_INVOKE) )
1980 eip = __vmread(GUEST_RIP);
1981 HVM_DBG_LOG(DBG_LEVEL_1,
1982 "Transfering control to vmxassist %%eip 0x%lx\n", eip);
1983 return 0; /* do not update eip! */
1986 else if ( v->arch.hvm_vmx.vmxassist_enabled )
1988 eip = __vmread(GUEST_RIP);
1989 HVM_DBG_LOG(DBG_LEVEL_1,
1990 "Enabling CR0.PE at %%eip 0x%lx\n", eip);
1991 if ( vmx_assist(v, VMX_ASSIST_RESTORE) )
1993 eip = __vmread(GUEST_RIP);
1994 HVM_DBG_LOG(DBG_LEVEL_1,
1995 "Restoring to %%eip 0x%lx\n", eip);
1996 return 0; /* do not update eip! */
1999 else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
2001 if ( vmx_long_mode_enabled(v) )
2003 v->arch.hvm_vmx.msr_state.msrs[VMX_INDEX_MSR_EFER] &= ~EFER_LMA;
2004 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
2005 vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
2006 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
2008 paging_update_paging_modes(v);
2011 return 1;
2014 #define CASE_SET_REG(REG, reg) \
2015 case REG_ ## REG: regs->reg = value; break
2016 #define CASE_GET_REG(REG, reg) \
2017 case REG_ ## REG: value = regs->reg; break
2019 #define CASE_EXTEND_SET_REG \
2020 CASE_EXTEND_REG(S)
2021 #define CASE_EXTEND_GET_REG \
2022 CASE_EXTEND_REG(G)
2024 #ifdef __i386__
2025 #define CASE_EXTEND_REG(T)
2026 #else
2027 #define CASE_EXTEND_REG(T) \
2028 CASE_ ## T ## ET_REG(R8, r8); \
2029 CASE_ ## T ## ET_REG(R9, r9); \
2030 CASE_ ## T ## ET_REG(R10, r10); \
2031 CASE_ ## T ## ET_REG(R11, r11); \
2032 CASE_ ## T ## ET_REG(R12, r12); \
2033 CASE_ ## T ## ET_REG(R13, r13); \
2034 CASE_ ## T ## ET_REG(R14, r14); \
2035 CASE_ ## T ## ET_REG(R15, r15)
2036 #endif
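/*
 * Illustrative expansion of the helpers above: on x86_64,
 *     CASE_EXTEND_GET_REG
 * becomes, via CASE_EXTEND_REG(G),
 *     case REG_R8:  value = regs->r8;  break;
 *     ...
 *     case REG_R15: value = regs->r15; break;
 * and CASE_EXTEND_SET_REG produces the matching "regs->rN = value" cases.
 * On i386 both expand to nothing.
 */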
2038 /*
2039 * Write to control registers
2040 */
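/*
 * Convention used by mov_to_cr()/mov_from_cr(): 'gp' is the general-purpose
 * register index decoded from the exit qualification, 'cr' the control
 * register number.  Returning 1 tells vmx_cr_access() that the instruction
 * completed and the guest RIP may be advanced; returning 0 means a fault was
 * injected, control went to VMXAssist, or the domain was crashed, so RIP
 * must be left untouched.
 */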
2041 static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
2043 unsigned long value, old_cr, old_base_mfn, mfn;
2044 struct vcpu *v = current;
2045 struct vlapic *vlapic = vcpu_vlapic(v);
2047 switch ( gp )
2049 CASE_GET_REG(EAX, eax);
2050 CASE_GET_REG(ECX, ecx);
2051 CASE_GET_REG(EDX, edx);
2052 CASE_GET_REG(EBX, ebx);
2053 CASE_GET_REG(EBP, ebp);
2054 CASE_GET_REG(ESI, esi);
2055 CASE_GET_REG(EDI, edi);
2056 CASE_EXTEND_GET_REG;
2057 case REG_ESP:
2058 value = __vmread(GUEST_RSP);
2059 break;
2060 default:
2061 gdprintk(XENLOG_ERR, "invalid gp: %d\n", gp);
2062 goto exit_and_crash;
2065 TRACE_VMEXIT(1, TYPE_MOV_TO_CR);
2066 TRACE_VMEXIT(2, cr);
2067 TRACE_VMEXIT(3, value);
2069 HVM_DBG_LOG(DBG_LEVEL_1, "CR%d, value = %lx", cr, value);
2071 switch ( cr )
2073 case 0:
2074 return vmx_set_cr0(value);
2076 case 3:
2077 /*
2078 * If paging is not enabled yet, simply copy the value to CR3.
2079 */
2080 if (!vmx_paging_enabled(v)) {
2081 v->arch.hvm_vmx.cpu_cr3 = value;
2082 break;
2085 /*
2086 * Make a new shadow unless the guest is simply reloading its current CR3.
2087 */
2088 if (value == v->arch.hvm_vmx.cpu_cr3) {
2089 /*
2090 * This is a simple TLB flush, implying the guest has
2091 * removed some translation or changed page attributes.
2092 * We simply invalidate the shadow.
2093 */
2094 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
2095 if (mfn != pagetable_get_pfn(v->arch.guest_table))
2096 goto bad_cr3;
2097 paging_update_cr3(v);
2098 } else {
2099 /*
2100 * If different, make a shadow. Check if the PDBR is valid
2101 * first.
2102 */
2103 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
2104 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
2105 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
2106 goto bad_cr3;
2107 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
2108 v->arch.guest_table = pagetable_from_pfn(mfn);
2109 if (old_base_mfn)
2110 put_page(mfn_to_page(old_base_mfn));
2111 v->arch.hvm_vmx.cpu_cr3 = value;
2112 update_cr3(v);
2113 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
2115 break;
2117 case 4: /* CR4 */
2118 old_cr = v->arch.hvm_vmx.cpu_shadow_cr4;
2120 if ( (value & X86_CR4_PAE) && !(old_cr & X86_CR4_PAE) )
2122 if ( vmx_pgbit_test(v) )
2124 /* The guest is a 32-bit PAE guest. */
2125 #if CONFIG_PAGING_LEVELS >= 3
2126 unsigned long mfn, old_base_mfn;
2127 mfn = get_mfn_from_gpfn(v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT);
2128 if ( !mfn_valid(mfn) ||
2129 !get_page(mfn_to_page(mfn), v->domain) )
2130 goto bad_cr3;
2132 /*
2133 * arch.guest_table now refers to the machine frame of the guest's top-level page table.
2134 */
2136 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
2137 v->arch.guest_table = pagetable_from_pfn(mfn);
2138 if ( old_base_mfn )
2139 put_page(mfn_to_page(old_base_mfn));
2141 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
2142 (unsigned long) (mfn << PAGE_SHIFT));
2144 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2145 "Update CR3 value = %lx, mfn = %lx",
2146 v->arch.hvm_vmx.cpu_cr3, mfn);
2147 #endif
2150 else if ( !(value & X86_CR4_PAE) )
2152 if ( unlikely(vmx_long_mode_enabled(v)) )
2154 HVM_DBG_LOG(DBG_LEVEL_1, "Guest cleared CR4.PAE while "
2155 "EFER.LMA is set\n");
2156 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2160 __vmwrite(GUEST_CR4, value | VMX_CR4_HOST_MASK);
2161 v->arch.hvm_vmx.cpu_shadow_cr4 = value;
2162 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);
2164 /*
2165 * Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates
2166 * all TLB entries except global entries.
2167 */
2168 if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
2169 paging_update_paging_modes(v);
2170 break;
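/*
 * Note: CR8 carries the task priority in its low 4 bits, whereas the
 * local APIC TPR keeps the priority class in bits 7:4; the << 4 below and
 * the matching >> 4 in mov_from_cr() convert between the two encodings.
 */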
2172 case 8:
2173 vlapic_set_reg(vlapic, APIC_TASKPRI, ((value & 0x0F) << 4));
2174 break;
2176 default:
2177 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
2178 domain_crash(v->domain);
2179 return 0;
2182 return 1;
2184 bad_cr3:
2185 gdprintk(XENLOG_ERR, "Invalid CR3\n");
2186 exit_and_crash:
2187 domain_crash(v->domain);
2188 return 0;
2191 /*
2192 * Read from control registers. CR0 and CR4 are read from their shadows by hardware; only CR3 and CR8 reach this handler.
2193 */
2194 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
2196 unsigned long value = 0;
2197 struct vcpu *v = current;
2198 struct vlapic *vlapic = vcpu_vlapic(v);
2200 switch ( cr )
2202 case 3:
2203 value = (unsigned long)v->arch.hvm_vmx.cpu_cr3;
2204 break;
2205 case 8:
2206 value = (unsigned long)vlapic_get_reg(vlapic, APIC_TASKPRI);
2207 value = (value & 0xF0) >> 4;
2208 break;
2209 default:
2210 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
2211 domain_crash(v->domain);
2212 break;
2215 switch ( gp ) {
2216 CASE_SET_REG(EAX, eax);
2217 CASE_SET_REG(ECX, ecx);
2218 CASE_SET_REG(EDX, edx);
2219 CASE_SET_REG(EBX, ebx);
2220 CASE_SET_REG(EBP, ebp);
2221 CASE_SET_REG(ESI, esi);
2222 CASE_SET_REG(EDI, edi);
2223 CASE_EXTEND_SET_REG;
2224 case REG_ESP:
2225 __vmwrite(GUEST_RSP, value);
2226 regs->esp = value;
2227 break;
2228 default:
2229 gdprintk(XENLOG_ERR, "invalid gp: %d\n", gp);
2230 domain_crash(v->domain);
2231 break;
2234 TRACE_VMEXIT(1, TYPE_MOV_FROM_CR);
2235 TRACE_VMEXIT(2, cr);
2236 TRACE_VMEXIT(3, value);
2238 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR%d, value = %lx", cr, value);
2241 static int vmx_cr_access(unsigned long exit_qualification,
2242 struct cpu_user_regs *regs)
2244 unsigned int gp, cr;
2245 unsigned long value;
2246 struct vcpu *v = current;
2248 switch (exit_qualification & CONTROL_REG_ACCESS_TYPE) {
2249 case TYPE_MOV_TO_CR:
2250 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
2251 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
2252 return mov_to_cr(gp, cr, regs);
2253 case TYPE_MOV_FROM_CR:
2254 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
2255 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
2256 mov_from_cr(cr, gp, regs);
2257 break;
2258 case TYPE_CLTS:
2259 TRACE_VMEXIT(1, TYPE_CLTS);
2261 /* We initialise the FPU now, to avoid needing another vmexit. */
2262 setup_fpu(v);
2263 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
2265 v->arch.hvm_vmx.cpu_cr0 &= ~X86_CR0_TS; /* clear TS */
2266 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
2268 v->arch.hvm_vmx.cpu_shadow_cr0 &= ~X86_CR0_TS; /* clear TS */
2269 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
2270 break;
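/*
 * Note: LMSW only loads the low four CR0 bits (PE, MP, EM, TS).  The
 * 16-bit source operand arrives in the upper half of the exit
 * qualification (hence the >> 16 below) and its low nibble is merged
 * into the shadow CR0, i.e.
 *     value = (shadow_cr0 & ~0xF) | (lmsw_operand & 0xF);
 * before going through the normal vmx_set_cr0() path.
 */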
2271 case TYPE_LMSW:
2272 value = v->arch.hvm_vmx.cpu_shadow_cr0;
2273 value = (value & ~0xF) |
2274 (((exit_qualification & LMSW_SOURCE_DATA) >> 16) & 0xF);
2275 TRACE_VMEXIT(1, TYPE_LMSW);
2276 TRACE_VMEXIT(2, value);
2277 return vmx_set_cr0(value);
2278 break;
2279 default:
2280 BUG();
2281 }
2283 return 1;
2284 }
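/*
 * Note on the RDMSR/WRMSR handlers below: ECX selects the MSR and the
 * 64-bit value travels through EDX:EAX (EDX = high 32 bits, EAX = low
 * 32 bits), i.e. conceptually
 *     msr_content = ((u64)regs->edx << 32) | (u32)regs->eax;
 * which is exactly the splitting/joining these handlers perform.
 */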
2286 static inline int vmx_do_msr_read(struct cpu_user_regs *regs)
2288 u64 msr_content = 0;
2289 u32 ecx = regs->ecx, eax, edx;
2290 struct vcpu *v = current;
2292 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x, eax=%x, edx=%x",
2293 ecx, (u32)regs->eax, (u32)regs->edx);
2295 switch (ecx) {
2296 case MSR_IA32_TIME_STAMP_COUNTER:
2297 msr_content = hvm_get_guest_time(v);
2298 break;
2299 case MSR_IA32_SYSENTER_CS:
2300 msr_content = (u32)__vmread(GUEST_SYSENTER_CS);
2301 break;
2302 case MSR_IA32_SYSENTER_ESP:
2303 msr_content = __vmread(GUEST_SYSENTER_ESP);
2304 break;
2305 case MSR_IA32_SYSENTER_EIP:
2306 msr_content = __vmread(GUEST_SYSENTER_EIP);
2307 break;
2308 case MSR_IA32_APICBASE:
2309 msr_content = vcpu_vlapic(v)->hw.apic_base_msr;
2310 break;
2311 default:
2312 if ( long_mode_do_msr_read(regs) )
2313 goto done;
2315 if ( rdmsr_hypervisor_regs(ecx, &eax, &edx) ||
2316 rdmsr_safe(ecx, eax, edx) == 0 )
2318 regs->eax = eax;
2319 regs->edx = edx;
2320 goto done;
2322 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2323 return 0;
2326 regs->eax = msr_content & 0xFFFFFFFF;
2327 regs->edx = msr_content >> 32;
2329 done:
2330 HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx",
2331 ecx, (unsigned long)regs->eax,
2332 (unsigned long)regs->edx);
2333 return 1;
2334 }
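/*
 * Note: a guest write to the TSC below updates guest time via
 * hvm_set_guest_time() and then calls pt_reset(), presumably so the
 * virtual platform timers are realigned with the newly written counter
 * value.
 */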
2336 static inline int vmx_do_msr_write(struct cpu_user_regs *regs)
2338 u32 ecx = regs->ecx;
2339 u64 msr_content;
2340 struct vcpu *v = current;
2342 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x, eax=%x, edx=%x",
2343 ecx, (u32)regs->eax, (u32)regs->edx);
2345 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
2347 switch (ecx) {
2348 case MSR_IA32_TIME_STAMP_COUNTER:
2349 hvm_set_guest_time(v, msr_content);
2350 pt_reset(v);
2351 break;
2352 case MSR_IA32_SYSENTER_CS:
2353 __vmwrite(GUEST_SYSENTER_CS, msr_content);
2354 break;
2355 case MSR_IA32_SYSENTER_ESP:
2356 __vmwrite(GUEST_SYSENTER_ESP, msr_content);
2357 break;
2358 case MSR_IA32_SYSENTER_EIP:
2359 __vmwrite(GUEST_SYSENTER_EIP, msr_content);
2360 break;
2361 case MSR_IA32_APICBASE:
2362 vlapic_msr_set(vcpu_vlapic(v), msr_content);
2363 break;
2364 default:
2365 if ( !long_mode_do_msr_write(regs) )
2366 wrmsr_hypervisor_regs(ecx, regs->eax, regs->edx);
2367 break;
2370 return 1;
2373 static void vmx_do_hlt(void)
2375 unsigned long rflags;
2376 rflags = __vmread(GUEST_RFLAGS);
2377 hvm_hlt(rflags);
2378 }
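/*
 * Note: the guest's RFLAGS is passed to hvm_hlt() above so the common HVM
 * code can see whether interrupts were enabled at the time of the HLT; a
 * HLT executed with IF clear can never be woken by an interrupt and has
 * to be handled specially there.
 */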
2380 static inline void vmx_do_extint(struct cpu_user_regs *regs)
2382 unsigned int vector;
2384 asmlinkage void do_IRQ(struct cpu_user_regs *);
2385 fastcall void smp_apic_timer_interrupt(struct cpu_user_regs *);
2386 fastcall void smp_event_check_interrupt(void);
2387 fastcall void smp_invalidate_interrupt(void);
2388 fastcall void smp_call_function_interrupt(void);
2389 fastcall void smp_spurious_interrupt(struct cpu_user_regs *regs);
2390 fastcall void smp_error_interrupt(struct cpu_user_regs *regs);
2391 #ifdef CONFIG_X86_MCE_P4THERMAL
2392 fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs);
2393 #endif
2395 vector = __vmread(VM_EXIT_INTR_INFO);
2396 BUG_ON(!(vector & INTR_INFO_VALID_MASK));
2398 vector &= INTR_INFO_VECTOR_MASK;
2399 TRACE_VMEXIT(1, vector);
2401 switch(vector) {
2402 case LOCAL_TIMER_VECTOR:
2403 smp_apic_timer_interrupt(regs);
2404 break;
2405 case EVENT_CHECK_VECTOR:
2406 smp_event_check_interrupt();
2407 break;
2408 case INVALIDATE_TLB_VECTOR:
2409 smp_invalidate_interrupt();
2410 break;
2411 case CALL_FUNCTION_VECTOR:
2412 smp_call_function_interrupt();
2413 break;
2414 case SPURIOUS_APIC_VECTOR:
2415 smp_spurious_interrupt(regs);
2416 break;
2417 case ERROR_APIC_VECTOR:
2418 smp_error_interrupt(regs);
2419 break;
2420 #ifdef CONFIG_X86_MCE_P4THERMAL
2421 case THERMAL_APIC_VECTOR:
2422 smp_thermal_interrupt(regs);
2423 break;
2424 #endif
2425 default:
2426 regs->entry_vector = vector;
2427 do_IRQ(regs);
2428 break;
2432 #if defined (__x86_64__)
2433 void store_cpu_user_regs(struct cpu_user_regs *regs)
2435 regs->ss = __vmread(GUEST_SS_SELECTOR);
2436 regs->rsp = __vmread(GUEST_RSP);
2437 regs->rflags = __vmread(GUEST_RFLAGS);
2438 regs->cs = __vmread(GUEST_CS_SELECTOR);
2439 regs->ds = __vmread(GUEST_DS_SELECTOR);
2440 regs->es = __vmread(GUEST_ES_SELECTOR);
2441 regs->rip = __vmread(GUEST_RIP);
2443 #elif defined (__i386__)
2444 void store_cpu_user_regs(struct cpu_user_regs *regs)
2446 regs->ss = __vmread(GUEST_SS_SELECTOR);
2447 regs->esp = __vmread(GUEST_RSP);
2448 regs->eflags = __vmread(GUEST_RFLAGS);
2449 regs->cs = __vmread(GUEST_CS_SELECTOR);
2450 regs->ds = __vmread(GUEST_DS_SELECTOR);
2451 regs->es = __vmread(GUEST_ES_SELECTOR);
2452 regs->eip = __vmread(GUEST_RIP);
2454 #endif
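/*
 * Note: only the selectors, stack pointer, flags and instruction pointer
 * are fetched from the VMCS above; the general-purpose registers are
 * assumed to be present in cpu_user_regs already, saved by the VM-exit
 * assembly stub.
 */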
2456 #ifdef XEN_DEBUGGER
2457 void save_cpu_user_regs(struct cpu_user_regs *regs)
2459 regs->xss = __vmread(GUEST_SS_SELECTOR);
2460 regs->esp = __vmread(GUEST_RSP);
2461 regs->eflags = __vmread(GUEST_RFLAGS);
2462 regs->xcs = __vmread(GUEST_CS_SELECTOR);
2463 regs->eip = __vmread(GUEST_RIP);
2465 regs->xgs = __vmread(GUEST_GS_SELECTOR);
2466 regs->xfs = __vmread(GUEST_FS_SELECTOR);
2467 regs->xes = __vmread(GUEST_ES_SELECTOR);
2468 regs->xds = __vmread(GUEST_DS_SELECTOR);
2471 void restore_cpu_user_regs(struct cpu_user_regs *regs)
2473 __vmwrite(GUEST_SS_SELECTOR, regs->xss);
2474 __vmwrite(GUEST_RSP, regs->esp);
2475 __vmwrite(GUEST_RFLAGS, regs->eflags);
2476 __vmwrite(GUEST_CS_SELECTOR, regs->xcs);
2477 __vmwrite(GUEST_RIP, regs->eip);
2479 __vmwrite(GUEST_GS_SELECTOR, regs->xgs);
2480 __vmwrite(GUEST_FS_SELECTOR, regs->xfs);
2481 __vmwrite(GUEST_ES_SELECTOR, regs->xes);
2482 __vmwrite(GUEST_DS_SELECTOR, regs->xds);
2484 #endif
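/*
 * vmx_reflect_exception() re-injects into the guest the exception that
 * caused the VM exit, reusing the vector and error code reported in the
 * exit interruption-information fields.
 */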
2486 static void vmx_reflect_exception(struct vcpu *v)
2488 int error_code, intr_info, vector;
2490 intr_info = __vmread(VM_EXIT_INTR_INFO);
2491 vector = intr_info & 0xff;
2492 if ( intr_info & INTR_INFO_DELIVER_CODE_MASK )
2493 error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
2494 else
2495 error_code = VMX_DELIVER_NO_ERROR_CODE;
2497 #ifndef NDEBUG
2499 unsigned long rip;
2501 rip = __vmread(GUEST_RIP);
2502 HVM_DBG_LOG(DBG_LEVEL_1, "rip = %lx, error_code = %x",
2503 rip, error_code);
2505 #endif /* NDEBUG */
2507 /*
2508 * According to Intel Virtualization Technology Specification for
2509 * the IA-32 Intel Architecture (C97063-002 April 2005), section
2510 * 2.8.3, SW_EXCEPTION should be used for #BP and #OF, and
2511 * HW_EXCEPTION used for everything else. The main difference
2512 * appears to be that for SW_EXCEPTION, the EIP/RIP is incremented
2513 * by VM_ENTER_INSTRUCTION_LEN bytes, whereas for HW_EXCEPTION,
2514 * it is not.
2515 */
2516 if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_SW_EXCEPTION )
2518 int ilen = __get_instruction_length(); /* Safe: software exception */
2519 vmx_inject_sw_exception(v, vector, ilen);
2521 else
2523 vmx_inject_hw_exception(v, vector, error_code);
2527 static void vmx_failed_vmentry(unsigned int exit_reason)
2529 unsigned int failed_vmentry_reason = (uint16_t)exit_reason;
2530 unsigned long exit_qualification;
2532 exit_qualification = __vmread(EXIT_QUALIFICATION);
2533 printk("Failed vm entry (exit reason 0x%x) ", exit_reason);
2534 switch ( failed_vmentry_reason )
2536 case EXIT_REASON_INVALID_GUEST_STATE:
2537 printk("caused by invalid guest state (%ld).\n", exit_qualification);
2538 break;
2539 case EXIT_REASON_MSR_LOADING:
2540 printk("caused by MSR entry %ld loading.\n", exit_qualification);
2541 break;
2542 case EXIT_REASON_MACHINE_CHECK:
2543 printk("caused by machine check.\n");
2544 break;
2545 default:
2546 printk("reason not known yet!\n");
2547 break;
2550 printk("************* VMCS Area **************\n");
2551 vmcs_dump_vcpu();
2552 printk("**************************************\n");
2554 domain_crash(current->domain);
2557 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
2559 unsigned int exit_reason;
2560 unsigned long exit_qualification, inst_len = 0;
2561 struct vcpu *v = current;
2563 TRACE_3D(TRC_VMX_VMEXIT + v->vcpu_id, 0, 0, 0);
2565 exit_reason = __vmread(VM_EXIT_REASON);
2567 perfc_incra(vmexits, exit_reason);
2568 TRACE_VMEXIT(0, exit_reason);
2570 if ( exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT )
2571 local_irq_enable();
2573 if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
2574 return vmx_failed_vmentry(exit_reason);
2576 switch ( exit_reason )
2578 case EXIT_REASON_EXCEPTION_NMI:
2579 {
2580 /*
2581 * Software-interrupt exiting (INT n) is not enabled, so an exit
2582 * here is either (1) a hardware exception (e.g. #PF) raised in the
2583 * guest, or (2) an NMI.
2584 */
2585 unsigned int intr_info, vector;
2587 intr_info = __vmread(VM_EXIT_INTR_INFO);
2588 BUG_ON(!(intr_info & INTR_INFO_VALID_MASK));
2590 vector = intr_info & INTR_INFO_VECTOR_MASK;
2592 TRACE_VMEXIT(1, vector);
2593 perfc_incra(cause_vector, vector);
2595 switch ( vector )
2597 #ifdef XEN_DEBUGGER
2598 case TRAP_debug:
2600 save_cpu_user_regs(regs);
2601 pdb_handle_exception(1, regs, 1);
2602 restore_cpu_user_regs(regs);
2603 break;
2605 case TRAP_int3:
2607 save_cpu_user_regs(regs);
2608 pdb_handle_exception(3, regs, 1);
2609 restore_cpu_user_regs(regs);
2610 break;
2612 #else
2613 case TRAP_debug:
2615 if ( test_bit(_DOMF_debugging, &v->domain->domain_flags) )
2617 store_cpu_user_regs(regs);
2618 domain_pause_for_debugger();
2619 __vm_clear_bit(GUEST_PENDING_DBG_EXCEPTIONS,
2620 PENDING_DEBUG_EXC_BS);
2622 else
2624 vmx_reflect_exception(v);
2625 __vm_clear_bit(GUEST_PENDING_DBG_EXCEPTIONS,
2626 PENDING_DEBUG_EXC_BS);
2629 break;
2631 case TRAP_int3:
2633 if ( test_bit(_DOMF_debugging, &v->domain->domain_flags) )
2634 domain_pause_for_debugger();
2635 else
2636 vmx_reflect_exception(v);
2637 break;
2639 #endif
2640 case TRAP_no_device:
2642 vmx_do_no_device_fault();
2643 break;
2645 case TRAP_page_fault:
2647 exit_qualification = __vmread(EXIT_QUALIFICATION);
2648 regs->error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
2650 TRACE_VMEXIT(3, regs->error_code);
2651 TRACE_VMEXIT(4, exit_qualification);
2653 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2654 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
2655 (unsigned long)regs->eax, (unsigned long)regs->ebx,
2656 (unsigned long)regs->ecx, (unsigned long)regs->edx,
2657 (unsigned long)regs->esi, (unsigned long)regs->edi);
2659 if ( !vmx_do_page_fault(exit_qualification, regs) )
2661 /* Inject #PG using Interruption-Information Fields. */
2662 vmx_inject_hw_exception(v, TRAP_page_fault, regs->error_code);
2663 v->arch.hvm_vmx.cpu_cr2 = exit_qualification;
2664 TRACE_3D(TRC_VMX_INTR, v->domain->domain_id,
2665 TRAP_page_fault, exit_qualification);
2667 break;
2669 case TRAP_nmi:
2670 if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI )
2671 do_nmi(regs); /* Real NMI, vector 2: normal processing. */
2672 else
2673 vmx_reflect_exception(v);
2674 break;
2675 default:
2676 vmx_reflect_exception(v);
2677 break;
2679 break;
2681 case EXIT_REASON_EXTERNAL_INTERRUPT:
2682 vmx_do_extint(regs);
2683 break;
2684 case EXIT_REASON_TRIPLE_FAULT:
2685 hvm_triple_fault();
2686 break;
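/*
 * Note: the "pending interrupt" exit below fires once the guest becomes
 * able to accept an interrupt (the interrupt window), after the
 * VIRTUAL_INTR_PENDING execution control was turned on earlier by the
 * interrupt-injection path when an interrupt could not be delivered
 * immediately.  All that is needed here is to switch the control off
 * again; the interrupt itself is injected on the next VM entry.
 */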
2687 case EXIT_REASON_PENDING_INTERRUPT:
2688 /* Disable the interrupt window. */
2689 v->arch.hvm_vcpu.u.vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2690 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2691 v->arch.hvm_vcpu.u.vmx.exec_control);
2692 break;
2693 case EXIT_REASON_TASK_SWITCH:
2694 goto exit_and_crash;
2695 case EXIT_REASON_CPUID:
2696 inst_len = __get_instruction_length(); /* Safe: CPUID */
2697 __update_guest_eip(inst_len);
2698 vmx_do_cpuid(regs);
2699 break;
2700 case EXIT_REASON_HLT:
2701 inst_len = __get_instruction_length(); /* Safe: HLT */
2702 __update_guest_eip(inst_len);
2703 vmx_do_hlt();
2704 break;
2705 case EXIT_REASON_INVLPG:
2707 inst_len = __get_instruction_length(); /* Safe: INVLPG */
2708 __update_guest_eip(inst_len);
2709 exit_qualification = __vmread(EXIT_QUALIFICATION);
2710 vmx_do_invlpg(exit_qualification);
2711 TRACE_VMEXIT(4, exit_qualification);
2712 break;
2714 case EXIT_REASON_VMCALL:
2716 inst_len = __get_instruction_length(); /* Safe: VMCALL */
2717 __update_guest_eip(inst_len);
2718 hvm_do_hypercall(regs);
2719 break;
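/*
 * Note: for CR accesses the guest RIP is advanced only when
 * vmx_cr_access() reports success; vmx_set_cr0() deliberately returns 0
 * when it hands control to VMXAssist or injects a fault, so the
 * redirected/faulting instruction is not skipped.
 */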
2721 case EXIT_REASON_CR_ACCESS:
2723 exit_qualification = __vmread(EXIT_QUALIFICATION);
2724 inst_len = __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
2725 if ( vmx_cr_access(exit_qualification, regs) )
2726 __update_guest_eip(inst_len);
2727 TRACE_VMEXIT(4, exit_qualification);
2728 break;
2730 case EXIT_REASON_DR_ACCESS:
2731 exit_qualification = __vmread(EXIT_QUALIFICATION);
2732 vmx_dr_access(exit_qualification, regs);
2733 break;
2734 case EXIT_REASON_IO_INSTRUCTION:
2735 exit_qualification = __vmread(EXIT_QUALIFICATION);
2736 inst_len = __get_instruction_length(); /* Safe: IN, INS, OUT, OUTS */
2737 vmx_io_instruction(exit_qualification, inst_len);
2738 TRACE_VMEXIT(4, exit_qualification);
2739 break;
2740 case EXIT_REASON_MSR_READ:
2741 inst_len = __get_instruction_length(); /* Safe: RDMSR */
2742 if ( vmx_do_msr_read(regs) )
2743 __update_guest_eip(inst_len);
2744 TRACE_VMEXIT(1, regs->ecx);
2745 TRACE_VMEXIT(2, regs->eax);
2746 TRACE_VMEXIT(3, regs->edx);
2747 break;
2748 case EXIT_REASON_MSR_WRITE:
2749 inst_len = __get_instruction_length(); /* Safe: WRMSR */
2750 if ( vmx_do_msr_write(regs) )
2751 __update_guest_eip(inst_len);
2752 TRACE_VMEXIT(1, regs->ecx);
2753 TRACE_VMEXIT(2, regs->eax);
2754 TRACE_VMEXIT(3, regs->edx);
2755 break;
2756 case EXIT_REASON_MWAIT_INSTRUCTION:
2757 case EXIT_REASON_MONITOR_INSTRUCTION:
2758 case EXIT_REASON_PAUSE_INSTRUCTION:
2759 goto exit_and_crash;
2760 case EXIT_REASON_VMCLEAR:
2761 case EXIT_REASON_VMLAUNCH:
2762 case EXIT_REASON_VMPTRLD:
2763 case EXIT_REASON_VMPTRST:
2764 case EXIT_REASON_VMREAD:
2765 case EXIT_REASON_VMRESUME:
2766 case EXIT_REASON_VMWRITE:
2767 case EXIT_REASON_VMXOFF:
2768 case EXIT_REASON_VMXON:
2769 /* Report invalid opcode exception when a VMX guest tries to execute
2770 any of the VMX instructions */
2771 vmx_inject_hw_exception(v, TRAP_invalid_op, VMX_DELIVER_NO_ERROR_CODE);
2772 break;
2774 case EXIT_REASON_TPR_BELOW_THRESHOLD:
2775 break;
2777 default:
2778 exit_and_crash:
2779 gdprintk(XENLOG_ERR, "Bad vmexit (reason %x)\n", exit_reason);
2780 domain_crash(v->domain);
2781 break;
2785 asmlinkage void vmx_trace_vmentry(void)
2787 struct vcpu *v = current;
2788 TRACE_5D(TRC_VMX_VMENTRY + current->vcpu_id,
2789 v->arch.hvm_vcpu.hvm_trace_values[0],
2790 v->arch.hvm_vcpu.hvm_trace_values[1],
2791 v->arch.hvm_vcpu.hvm_trace_values[2],
2792 v->arch.hvm_vcpu.hvm_trace_values[3],
2793 v->arch.hvm_vcpu.hvm_trace_values[4]);
2795 TRACE_VMEXIT(0, 0);
2796 TRACE_VMEXIT(1, 0);
2797 TRACE_VMEXIT(2, 0);
2798 TRACE_VMEXIT(3, 0);
2799 TRACE_VMEXIT(4, 0);
2802 /*
2803 * Local variables:
2804 * mode: C
2805 * c-set-style: "BSD"
2806 * c-basic-offset: 4
2807 * tab-width: 4
2808 * indent-tabs-mode: nil
2809 * End:
2810 */