ia64/xen-unstable

view xen/arch/x86/hvm/vmx/vmx.c @ 14635:5c52e5ca8459

hvm: Clean up handling of exception intercepts.
Only intercept #DB/#BP if a debugger is attached.
Signed-off-by: Keir Fraser <keir@xensource.com>

author    Keir Fraser <keir@xensource.com>
date      Wed Mar 28 18:47:17 2007 +0100
parents   d2a91b73899a
children  98b049ed2540

line source
1 /*
2 * vmx.c: handling VMX architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 *
18 */
20 #include <xen/config.h>
21 #include <xen/init.h>
22 #include <xen/lib.h>
23 #include <xen/trace.h>
24 #include <xen/sched.h>
25 #include <xen/irq.h>
26 #include <xen/softirq.h>
27 #include <xen/domain_page.h>
28 #include <xen/hypercall.h>
29 #include <xen/perfc.h>
30 #include <asm/current.h>
31 #include <asm/io.h>
32 #include <asm/regs.h>
33 #include <asm/cpufeature.h>
34 #include <asm/processor.h>
35 #include <asm/types.h>
36 #include <asm/msr.h>
37 #include <asm/spinlock.h>
38 #include <asm/paging.h>
39 #include <asm/p2m.h>
40 #include <asm/hvm/hvm.h>
41 #include <asm/hvm/support.h>
42 #include <asm/hvm/vmx/vmx.h>
43 #include <asm/hvm/vmx/vmcs.h>
44 #include <asm/hvm/vmx/cpu.h>
45 #include <public/sched.h>
46 #include <public/hvm/ioreq.h>
47 #include <asm/hvm/vpic.h>
48 #include <asm/hvm/vlapic.h>
49 #include <asm/x86_emulate.h>
50 #include <asm/hvm/vpt.h>
51 #include <public/hvm/save.h>
52 #include <asm/hvm/trace.h>
54 static void vmx_ctxt_switch_from(struct vcpu *v);
55 static void vmx_ctxt_switch_to(struct vcpu *v);
57 static int vmx_vcpu_initialise(struct vcpu *v)
58 {
59 int rc;
61 spin_lock_init(&v->arch.hvm_vmx.vmcs_lock);
63 v->arch.schedule_tail = vmx_do_resume;
64 v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
65 v->arch.ctxt_switch_to = vmx_ctxt_switch_to;
67 if ( (rc = vmx_create_vmcs(v)) != 0 )
68 {
69 dprintk(XENLOG_WARNING,
70 "Failed to create VMCS for vcpu %d: err=%d.\n",
71 v->vcpu_id, rc);
72 return rc;
73 }
75 return 0;
76 }
78 static void vmx_vcpu_destroy(struct vcpu *v)
79 {
80 vmx_destroy_vmcs(v);
81 }
83 #ifdef __x86_64__
85 static DEFINE_PER_CPU(struct vmx_msr_state, host_msr_state);
87 static u32 msr_index[VMX_MSR_COUNT] =
88 {
89 MSR_LSTAR, MSR_STAR, MSR_CSTAR,
90 MSR_SYSCALL_MASK, MSR_EFER,
91 };
93 static void vmx_save_host_msrs(void)
94 {
95 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
96 int i;
98 for ( i = 0; i < VMX_MSR_COUNT; i++ )
99 rdmsrl(msr_index[i], host_msr_state->msrs[i]);
100 }
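/* WRITE_MSR(<reg>) below records the new value in the guest's shadow MSR
 * array, marks that MSR dirty in guest_msr_state->flags (so that
 * vmx_restore_guest_msrs() reloads it on the next context switch), writes the
 * hardware MSR, and flags the host copy for restoration in
 * vmx_restore_host_msrs(). */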
102 #define WRITE_MSR(address) \
103 guest_msr_state->msrs[VMX_INDEX_MSR_ ## address] = msr_content; \
104 if ( !test_bit(VMX_INDEX_MSR_ ## address, &guest_msr_state->flags) )\
105 set_bit(VMX_INDEX_MSR_ ## address, &guest_msr_state->flags); \
106 wrmsrl(MSR_ ## address, msr_content); \
107 set_bit(VMX_INDEX_MSR_ ## address, &host_msr_state->flags); \
108 break
110 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
111 {
112 u64 msr_content = 0;
113 struct vcpu *v = current;
114 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
116 switch ( (u32)regs->ecx ) {
117 case MSR_EFER:
118 HVM_DBG_LOG(DBG_LEVEL_2, "EFER msr_content 0x%"PRIx64, msr_content);
119 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_EFER];
120 break;
122 case MSR_FS_BASE:
123 msr_content = __vmread(GUEST_FS_BASE);
124 goto check_long_mode;
126 case MSR_GS_BASE:
127 msr_content = __vmread(GUEST_GS_BASE);
128 goto check_long_mode;
130 case MSR_SHADOW_GS_BASE:
131 msr_content = guest_msr_state->shadow_gs;
132 check_long_mode:
133 if ( !(vmx_long_mode_enabled(v)) )
134 {
135 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
136 return 0;
137 }
138 break;
140 case MSR_STAR:
141 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_STAR];
142 break;
144 case MSR_LSTAR:
145 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_LSTAR];
146 break;
148 case MSR_CSTAR:
149 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_CSTAR];
150 break;
152 case MSR_SYSCALL_MASK:
153 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
154 break;
156 default:
157 return 0;
158 }
160 HVM_DBG_LOG(DBG_LEVEL_2, "msr_content: 0x%"PRIx64, msr_content);
162 regs->eax = (u32)(msr_content >> 0);
163 regs->edx = (u32)(msr_content >> 32);
165 return 1;
166 }
168 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
169 {
170 u64 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
171 u32 ecx = regs->ecx;
172 struct vcpu *v = current;
173 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
174 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
176 HVM_DBG_LOG(DBG_LEVEL_1, "msr 0x%x msr_content 0x%"PRIx64"\n",
177 ecx, msr_content);
179 switch ( ecx )
180 {
181 case MSR_EFER:
182 /* offending reserved bit will cause #GP */
183 if ( msr_content & ~(EFER_LME | EFER_LMA | EFER_NX | EFER_SCE) )
184 {
185 gdprintk(XENLOG_WARNING, "Trying to set reserved bit in "
186 "EFER: %"PRIx64"\n", msr_content);
187 goto gp_fault;
188 }
190 if ( (msr_content & EFER_LME)
191 && !(guest_msr_state->msrs[VMX_INDEX_MSR_EFER] & EFER_LME) )
192 {
193 if ( unlikely(vmx_paging_enabled(v)) )
194 {
195 gdprintk(XENLOG_WARNING,
196 "Trying to set EFER.LME with paging enabled\n");
197 goto gp_fault;
198 }
199 }
200 else if ( !(msr_content & EFER_LME)
201 && (guest_msr_state->msrs[VMX_INDEX_MSR_EFER] & EFER_LME) )
202 {
203 if ( unlikely(vmx_paging_enabled(v)) )
204 {
205 gdprintk(XENLOG_WARNING,
206 "Trying to clear EFER.LME with paging enabled\n");
207 goto gp_fault;
208 }
209 }
211 guest_msr_state->msrs[VMX_INDEX_MSR_EFER] = msr_content;
212 break;
214 case MSR_FS_BASE:
215 case MSR_GS_BASE:
216 case MSR_SHADOW_GS_BASE:
217 if ( !vmx_long_mode_enabled(v) )
218 goto gp_fault;
220 if ( !is_canonical_address(msr_content) )
221 goto uncanonical_address;
223 if ( ecx == MSR_FS_BASE )
224 __vmwrite(GUEST_FS_BASE, msr_content);
225 else if ( ecx == MSR_GS_BASE )
226 __vmwrite(GUEST_GS_BASE, msr_content);
227 else
228 {
229 v->arch.hvm_vmx.msr_state.shadow_gs = msr_content;
230 wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
231 }
233 break;
235 case MSR_STAR:
236 WRITE_MSR(STAR);
238 case MSR_LSTAR:
239 if ( !is_canonical_address(msr_content) )
240 goto uncanonical_address;
241 WRITE_MSR(LSTAR);
243 case MSR_CSTAR:
244 if ( !is_canonical_address(msr_content) )
245 goto uncanonical_address;
246 WRITE_MSR(CSTAR);
248 case MSR_SYSCALL_MASK:
249 WRITE_MSR(SYSCALL_MASK);
251 default:
252 return 0;
253 }
255 return 1;
257 uncanonical_address:
258 HVM_DBG_LOG(DBG_LEVEL_1, "Not canonical address of msr write %x\n", ecx);
259 gp_fault:
260 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
261 return 0;
262 }
264 /*
265 * To avoid MSR save/restore at every VM exit/entry time, we restore
266 * the x86_64 specific MSRs at domain switch time. Since these MSRs
267 * are not modified once set for para domains, we don't save them,
268 * but simply reset them to values set in percpu_traps_init().
269 */
270 static void vmx_restore_host_msrs(void)
271 {
272 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
273 int i;
275 while ( host_msr_state->flags )
276 {
277 i = find_first_set_bit(host_msr_state->flags);
278 wrmsrl(msr_index[i], host_msr_state->msrs[i]);
279 clear_bit(i, &host_msr_state->flags);
280 }
281 }
283 static void vmx_save_guest_msrs(struct vcpu *v)
284 {
285 /* MSR_SHADOW_GS_BASE may have been changed by swapgs instruction. */
286 rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.msr_state.shadow_gs);
287 }
289 static void vmx_restore_guest_msrs(struct vcpu *v)
290 {
291 struct vmx_msr_state *guest_msr_state, *host_msr_state;
292 unsigned long guest_flags;
293 int i;
295 guest_msr_state = &v->arch.hvm_vmx.msr_state;
296 host_msr_state = &this_cpu(host_msr_state);
298 wrmsrl(MSR_SHADOW_GS_BASE, guest_msr_state->shadow_gs);
300 guest_flags = guest_msr_state->flags;
301 if ( !guest_flags )
302 return;
304 while ( guest_flags ) {
305 i = find_first_set_bit(guest_flags);
307 HVM_DBG_LOG(DBG_LEVEL_2,
308 "restore guest's index %d msr %x with value %lx",
309 i, msr_index[i], guest_msr_state->msrs[i]);
310 set_bit(i, &host_msr_state->flags);
311 wrmsrl(msr_index[i], guest_msr_state->msrs[i]);
312 clear_bit(i, &guest_flags);
313 }
314 }
316 #else /* __i386__ */
318 #define vmx_save_host_msrs() ((void)0)
319 #define vmx_restore_host_msrs() ((void)0)
320 #define vmx_save_guest_msrs(v) ((void)0)
321 #define vmx_restore_guest_msrs(v) ((void)0)
323 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
324 {
325 return 0;
326 }
328 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
329 {
330 return 0;
331 }
333 #endif /* __i386__ */
335 #define loaddebug(_v,_reg) \
336 __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
337 #define savedebug(_v,_reg) \
338 __asm__ __volatile__ ("mov %%db" #_reg ",%0" : : "r" ((_v)->debugreg[_reg]))
340 static inline void vmx_save_dr(struct vcpu *v)
341 {
342 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
343 return;
345 /* Clear the DR dirty flag and re-enable intercepts for DR accesses. */
346 v->arch.hvm_vcpu.flag_dr_dirty = 0;
347 v->arch.hvm_vcpu.u.vmx.exec_control |= CPU_BASED_MOV_DR_EXITING;
348 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vcpu.u.vmx.exec_control);
350 savedebug(&v->arch.guest_context, 0);
351 savedebug(&v->arch.guest_context, 1);
352 savedebug(&v->arch.guest_context, 2);
353 savedebug(&v->arch.guest_context, 3);
354 savedebug(&v->arch.guest_context, 6);
355 v->arch.guest_context.debugreg[7] = __vmread(GUEST_DR7);
356 }
358 static inline void __restore_debug_registers(struct vcpu *v)
359 {
360 loaddebug(&v->arch.guest_context, 0);
361 loaddebug(&v->arch.guest_context, 1);
362 loaddebug(&v->arch.guest_context, 2);
363 loaddebug(&v->arch.guest_context, 3);
364 /* No 4 and 5 */
365 loaddebug(&v->arch.guest_context, 6);
366 /* DR7 is loaded from the VMCS. */
367 }
369 int vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
370 {
371 c->eip = __vmread(GUEST_RIP);
372 c->esp = __vmread(GUEST_RSP);
373 c->eflags = __vmread(GUEST_RFLAGS);
375 c->cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;
376 c->cr3 = v->arch.hvm_vmx.cpu_cr3;
377 c->cr4 = v->arch.hvm_vmx.cpu_shadow_cr4;
379 #ifdef HVM_DEBUG_SUSPEND
380 printk("vmx_vmcs_save: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
381 c->cr3,
382 c->cr0,
383 c->cr4);
384 #endif
386 c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
387 c->idtr_base = __vmread(GUEST_IDTR_BASE);
389 c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
390 c->gdtr_base = __vmread(GUEST_GDTR_BASE);
392 c->cs_sel = __vmread(GUEST_CS_SELECTOR);
393 c->cs_limit = __vmread(GUEST_CS_LIMIT);
394 c->cs_base = __vmread(GUEST_CS_BASE);
395 c->cs_arbytes = __vmread(GUEST_CS_AR_BYTES);
397 c->ds_sel = __vmread(GUEST_DS_SELECTOR);
398 c->ds_limit = __vmread(GUEST_DS_LIMIT);
399 c->ds_base = __vmread(GUEST_DS_BASE);
400 c->ds_arbytes = __vmread(GUEST_DS_AR_BYTES);
402 c->es_sel = __vmread(GUEST_ES_SELECTOR);
403 c->es_limit = __vmread(GUEST_ES_LIMIT);
404 c->es_base = __vmread(GUEST_ES_BASE);
405 c->es_arbytes = __vmread(GUEST_ES_AR_BYTES);
407 c->ss_sel = __vmread(GUEST_SS_SELECTOR);
408 c->ss_limit = __vmread(GUEST_SS_LIMIT);
409 c->ss_base = __vmread(GUEST_SS_BASE);
410 c->ss_arbytes = __vmread(GUEST_SS_AR_BYTES);
412 c->fs_sel = __vmread(GUEST_FS_SELECTOR);
413 c->fs_limit = __vmread(GUEST_FS_LIMIT);
414 c->fs_base = __vmread(GUEST_FS_BASE);
415 c->fs_arbytes = __vmread(GUEST_FS_AR_BYTES);
417 c->gs_sel = __vmread(GUEST_GS_SELECTOR);
418 c->gs_limit = __vmread(GUEST_GS_LIMIT);
419 c->gs_base = __vmread(GUEST_GS_BASE);
420 c->gs_arbytes = __vmread(GUEST_GS_AR_BYTES);
422 c->tr_sel = __vmread(GUEST_TR_SELECTOR);
423 c->tr_limit = __vmread(GUEST_TR_LIMIT);
424 c->tr_base = __vmread(GUEST_TR_BASE);
425 c->tr_arbytes = __vmread(GUEST_TR_AR_BYTES);
427 c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR);
428 c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT);
429 c->ldtr_base = __vmread(GUEST_LDTR_BASE);
430 c->ldtr_arbytes = __vmread(GUEST_LDTR_AR_BYTES);
432 c->sysenter_cs = __vmread(GUEST_SYSENTER_CS);
433 c->sysenter_esp = __vmread(GUEST_SYSENTER_ESP);
434 c->sysenter_eip = __vmread(GUEST_SYSENTER_EIP);
436 return 1;
437 }
439 int vmx_vmcs_restore(struct vcpu *v, struct hvm_hw_cpu *c)
440 {
441 unsigned long mfn, old_base_mfn;
443 vmx_vmcs_enter(v);
445 __vmwrite(GUEST_RIP, c->eip);
446 __vmwrite(GUEST_RSP, c->esp);
447 __vmwrite(GUEST_RFLAGS, c->eflags);
449 v->arch.hvm_vmx.cpu_shadow_cr0 = c->cr0;
450 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
452 #ifdef HVM_DEBUG_SUSPEND
453 printk("vmx_vmcs_restore: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
454 c->cr3,
455 c->cr0,
456 c->cr4);
457 #endif
459 if (!vmx_paging_enabled(v)) {
460 printk("vmx_vmcs_restore: paging not enabled.");
461 goto skip_cr3;
462 }
464 if (c->cr3 == v->arch.hvm_vmx.cpu_cr3) {
465 /*
466 * This is simple TLB flush, implying the guest has
467 * removed some translation or changed page attributes.
468 * We simply invalidate the shadow.
469 */
470 mfn = gmfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT);
471 if (mfn != pagetable_get_pfn(v->arch.guest_table)) {
472 goto bad_cr3;
473 }
474 } else {
475 /*
476 * If different, make a shadow. Check if the PDBR is valid
477 * first.
478 */
479 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %"PRIx64"", c->cr3);
480 /* current!=vcpu as not called by arch_vmx_do_launch */
481 mfn = gmfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT);
482 if( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain)) {
483 goto bad_cr3;
484 }
485 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
486 v->arch.guest_table = pagetable_from_pfn(mfn);
487 if (old_base_mfn)
488 put_page(mfn_to_page(old_base_mfn));
489 v->arch.hvm_vmx.cpu_cr3 = c->cr3;
490 }
492 skip_cr3:
493 #if defined(__x86_64__)
494 if (vmx_long_mode_enabled(v)) {
495 unsigned long vm_entry_value;
496 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
497 vm_entry_value |= VM_ENTRY_IA32E_MODE;
498 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
499 }
500 #endif
502 __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
503 v->arch.hvm_vmx.cpu_shadow_cr4 = c->cr4;
504 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);
506 __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
507 __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
509 __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
510 __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
512 __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
513 __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
514 __vmwrite(GUEST_CS_BASE, c->cs_base);
515 __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes);
517 __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
518 __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
519 __vmwrite(GUEST_DS_BASE, c->ds_base);
520 __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes);
522 __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
523 __vmwrite(GUEST_ES_LIMIT, c->es_limit);
524 __vmwrite(GUEST_ES_BASE, c->es_base);
525 __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes);
527 __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
528 __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
529 __vmwrite(GUEST_SS_BASE, c->ss_base);
530 __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes);
532 __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
533 __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
534 __vmwrite(GUEST_FS_BASE, c->fs_base);
535 __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes);
537 __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
538 __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
539 __vmwrite(GUEST_GS_BASE, c->gs_base);
540 __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes);
542 __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
543 __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
544 __vmwrite(GUEST_TR_BASE, c->tr_base);
545 __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes);
547 __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
548 __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
549 __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
550 __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes);
552 __vmwrite(GUEST_SYSENTER_CS, c->sysenter_cs);
553 __vmwrite(GUEST_SYSENTER_ESP, c->sysenter_esp);
554 __vmwrite(GUEST_SYSENTER_EIP, c->sysenter_eip);
556 vmx_vmcs_exit(v);
558 paging_update_paging_modes(v);
559 return 0;
561 bad_cr3:
562 gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%"PRIx64"", c->cr3);
563 vmx_vmcs_exit(v);
564 return -EINVAL;
565 }
567 #ifdef HVM_DEBUG_SUSPEND
568 static void dump_msr_state(struct vmx_msr_state *m)
569 {
570 int i = 0;
571 printk("**** msr state ****\n");
572 printk("shadow_gs=0x%lx, flags=0x%lx, msr_items:", m->shadow_gs, m->flags);
573 for (i = 0; i < VMX_MSR_COUNT; i++)
574 printk("0x%lx,", m->msrs[i]);
575 printk("\n");
576 }
577 #else
578 static void dump_msr_state(struct vmx_msr_state *m)
579 {
580 }
581 #endif
583 void vmx_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
584 {
585 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
586 unsigned long guest_flags = guest_state->flags;
588 data->shadow_gs = guest_state->shadow_gs;
590 /* save msrs */
591 data->flags = guest_flags;
592 data->msr_lstar = guest_state->msrs[VMX_INDEX_MSR_LSTAR];
593 data->msr_star = guest_state->msrs[VMX_INDEX_MSR_STAR];
594 data->msr_cstar = guest_state->msrs[VMX_INDEX_MSR_CSTAR];
595 data->msr_syscall_mask = guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
596 data->msr_efer = guest_state->msrs[VMX_INDEX_MSR_EFER];
598 data->tsc = hvm_get_guest_time(v);
600 dump_msr_state(guest_state);
601 }
603 void vmx_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
604 {
605 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
607 /* restore msrs */
608 guest_state->flags = data->flags;
609 guest_state->msrs[VMX_INDEX_MSR_LSTAR] = data->msr_lstar;
610 guest_state->msrs[VMX_INDEX_MSR_STAR] = data->msr_star;
611 guest_state->msrs[VMX_INDEX_MSR_CSTAR] = data->msr_cstar;
612 guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK] = data->msr_syscall_mask;
613 guest_state->msrs[VMX_INDEX_MSR_EFER] = data->msr_efer;
615 guest_state->shadow_gs = data->shadow_gs;
617 v->arch.hvm_vmx.vmxassist_enabled = !(data->cr0 & X86_CR0_PE);
619 hvm_set_guest_time(v, data->tsc);
621 dump_msr_state(guest_state);
622 }
625 void vmx_save_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
626 {
627 vmx_save_cpu_state(v, ctxt);
628 vmx_vmcs_enter(v);
629 vmx_vmcs_save(v, ctxt);
630 vmx_vmcs_exit(v);
631 }
633 int vmx_load_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
634 {
635 vmx_load_cpu_state(v, ctxt);
636 if (vmx_vmcs_restore(v, ctxt)) {
637 printk("vmx_vmcs restore failed!\n");
638 domain_crash(v->domain);
639 return -EINVAL;
640 }
642 return 0;
643 }
645 /*
646 * DR7 is saved and restored on every vmexit. Other debug registers only
647 * need to be restored if their value is going to affect execution -- i.e.,
648 * if one of the breakpoints is enabled. So mask out all bits that don't
649 * enable some breakpoint functionality.
650 */
651 #define DR7_ACTIVE_MASK 0xff
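/* Bits 0-7 of DR7 are the local/global breakpoint-enable bits (L0/G0..L3/G3),
 * hence the 0xff mask above. */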
653 static inline void vmx_restore_dr(struct vcpu *v)
654 {
655 /* NB. __vmread() is not usable here, so we cannot read from the VMCS. */
656 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
657 __restore_debug_registers(v);
658 }
660 static void vmx_ctxt_switch_from(struct vcpu *v)
661 {
662 vmx_save_guest_msrs(v);
663 vmx_restore_host_msrs();
664 vmx_save_dr(v);
665 }
667 static void vmx_ctxt_switch_to(struct vcpu *v)
668 {
669 vmx_restore_guest_msrs(v);
670 vmx_restore_dr(v);
671 }
673 static void stop_vmx(void)
674 {
675 if ( !(read_cr4() & X86_CR4_VMXE) )
676 return;
678 __vmxoff();
679 clear_in_cr4(X86_CR4_VMXE);
680 }
682 static void vmx_store_cpu_guest_regs(
683 struct vcpu *v, struct cpu_user_regs *regs, unsigned long *crs)
684 {
685 vmx_vmcs_enter(v);
687 if ( regs != NULL )
688 {
689 regs->eflags = __vmread(GUEST_RFLAGS);
690 regs->ss = __vmread(GUEST_SS_SELECTOR);
691 regs->cs = __vmread(GUEST_CS_SELECTOR);
692 regs->eip = __vmread(GUEST_RIP);
693 regs->esp = __vmread(GUEST_RSP);
694 }
696 if ( crs != NULL )
697 {
698 crs[0] = v->arch.hvm_vmx.cpu_shadow_cr0;
699 crs[2] = v->arch.hvm_vmx.cpu_cr2;
700 crs[3] = v->arch.hvm_vmx.cpu_cr3;
701 crs[4] = v->arch.hvm_vmx.cpu_shadow_cr4;
702 }
704 vmx_vmcs_exit(v);
705 }
707 static void vmx_load_cpu_guest_regs(struct vcpu *v, struct cpu_user_regs *regs)
708 {
709 unsigned long base;
711 vmx_vmcs_enter(v);
713 __vmwrite(GUEST_SS_SELECTOR, regs->ss);
714 __vmwrite(GUEST_RSP, regs->esp);
716 /* NB. Bit 1 of RFLAGS must be set for VMENTRY to succeed. */
717 __vmwrite(GUEST_RFLAGS, regs->eflags | 2UL);
719 if ( regs->eflags & EF_VM )
720 {
721 /*
722 * The VMX spec (section 4.3.1.2, Checks on Guest Segment
723 * Registers) says that virtual-8086 mode guests' segment
724 * base-address fields in the VMCS must be equal to their
725 * corresponding segment selector field shifted right by
726 * four bits upon vmentry.
727 */
728 base = __vmread(GUEST_CS_BASE);
729 if ( (regs->cs << 4) != base )
730 __vmwrite(GUEST_CS_BASE, regs->cs << 4);
731 base = __vmread(GUEST_SS_BASE);
732 if ( (regs->ss << 4) != base )
733 __vmwrite(GUEST_SS_BASE, regs->ss << 4);
734 }
736 __vmwrite(GUEST_CS_SELECTOR, regs->cs);
737 __vmwrite(GUEST_RIP, regs->eip);
739 vmx_vmcs_exit(v);
740 }
742 static unsigned long vmx_get_ctrl_reg(struct vcpu *v, unsigned int num)
743 {
744 switch ( num )
745 {
746 case 0:
747 return v->arch.hvm_vmx.cpu_cr0;
748 case 2:
749 return v->arch.hvm_vmx.cpu_cr2;
750 case 3:
751 return v->arch.hvm_vmx.cpu_cr3;
752 case 4:
753 return v->arch.hvm_vmx.cpu_shadow_cr4;
754 default:
755 BUG();
756 }
757 return 0; /* dummy */
758 }
760 static unsigned long vmx_get_segment_base(struct vcpu *v, enum x86_segment seg)
761 {
762 unsigned long base = 0;
763 int long_mode = 0;
765 ASSERT(v == current);
767 #ifdef __x86_64__
768 if ( vmx_long_mode_enabled(v) && (__vmread(GUEST_CS_AR_BYTES) & (1u<<13)) )
769 long_mode = 1;
770 #endif
772 switch ( seg )
773 {
774 case x86_seg_cs: if ( !long_mode ) base = __vmread(GUEST_CS_BASE); break;
775 case x86_seg_ds: if ( !long_mode ) base = __vmread(GUEST_DS_BASE); break;
776 case x86_seg_es: if ( !long_mode ) base = __vmread(GUEST_ES_BASE); break;
777 case x86_seg_fs: base = __vmread(GUEST_FS_BASE); break;
778 case x86_seg_gs: base = __vmread(GUEST_GS_BASE); break;
779 case x86_seg_ss: if ( !long_mode ) base = __vmread(GUEST_SS_BASE); break;
780 case x86_seg_tr: base = __vmread(GUEST_TR_BASE); break;
781 case x86_seg_gdtr: base = __vmread(GUEST_GDTR_BASE); break;
782 case x86_seg_idtr: base = __vmread(GUEST_IDTR_BASE); break;
783 case x86_seg_ldtr: base = __vmread(GUEST_LDTR_BASE); break;
784 default: BUG(); break;
785 }
787 return base;
788 }
790 static void vmx_get_segment_register(struct vcpu *v, enum x86_segment seg,
791 struct segment_register *reg)
792 {
793 u16 attr = 0;
795 ASSERT(v == current);
797 switch ( seg )
798 {
799 case x86_seg_cs:
800 reg->sel = __vmread(GUEST_CS_SELECTOR);
801 reg->limit = __vmread(GUEST_CS_LIMIT);
802 reg->base = __vmread(GUEST_CS_BASE);
803 attr = __vmread(GUEST_CS_AR_BYTES);
804 break;
805 case x86_seg_ds:
806 reg->sel = __vmread(GUEST_DS_SELECTOR);
807 reg->limit = __vmread(GUEST_DS_LIMIT);
808 reg->base = __vmread(GUEST_DS_BASE);
809 attr = __vmread(GUEST_DS_AR_BYTES);
810 break;
811 case x86_seg_es:
812 reg->sel = __vmread(GUEST_ES_SELECTOR);
813 reg->limit = __vmread(GUEST_ES_LIMIT);
814 reg->base = __vmread(GUEST_ES_BASE);
815 attr = __vmread(GUEST_ES_AR_BYTES);
816 break;
817 case x86_seg_fs:
818 reg->sel = __vmread(GUEST_FS_SELECTOR);
819 reg->limit = __vmread(GUEST_FS_LIMIT);
820 reg->base = __vmread(GUEST_FS_BASE);
821 attr = __vmread(GUEST_FS_AR_BYTES);
822 break;
823 case x86_seg_gs:
824 reg->sel = __vmread(GUEST_GS_SELECTOR);
825 reg->limit = __vmread(GUEST_GS_LIMIT);
826 reg->base = __vmread(GUEST_GS_BASE);
827 attr = __vmread(GUEST_GS_AR_BYTES);
828 break;
829 case x86_seg_ss:
830 reg->sel = __vmread(GUEST_SS_SELECTOR);
831 reg->limit = __vmread(GUEST_SS_LIMIT);
832 reg->base = __vmread(GUEST_SS_BASE);
833 attr = __vmread(GUEST_SS_AR_BYTES);
834 break;
835 case x86_seg_tr:
836 reg->sel = __vmread(GUEST_TR_SELECTOR);
837 reg->limit = __vmread(GUEST_TR_LIMIT);
838 reg->base = __vmread(GUEST_TR_BASE);
839 attr = __vmread(GUEST_TR_AR_BYTES);
840 break;
841 case x86_seg_gdtr:
842 reg->limit = __vmread(GUEST_GDTR_LIMIT);
843 reg->base = __vmread(GUEST_GDTR_BASE);
844 break;
845 case x86_seg_idtr:
846 reg->limit = __vmread(GUEST_IDTR_LIMIT);
847 reg->base = __vmread(GUEST_IDTR_BASE);
848 break;
849 case x86_seg_ldtr:
850 reg->sel = __vmread(GUEST_LDTR_SELECTOR);
851 reg->limit = __vmread(GUEST_LDTR_LIMIT);
852 reg->base = __vmread(GUEST_LDTR_BASE);
853 attr = __vmread(GUEST_LDTR_AR_BYTES);
854 break;
855 default:
856 BUG();
857 }
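/* Fold the VMX access-rights encoding into the packed segment_register
 * attribute format: keep the low type/S/DPL/P byte and move the AVL/L/D/G
 * bits down from positions 12-15 to positions 8-11. */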
859 reg->attr.bytes = (attr & 0xff) | ((attr >> 4) & 0xf00);
860 }
862 /* Make sure that xen intercepts any FP accesses from current */
863 static void vmx_stts(struct vcpu *v)
864 {
865 /* VMX depends on operating on the current vcpu */
866 ASSERT(v == current);
868 /*
869 * If the guest does not have TS enabled then we must cause and handle an
870 * exception on first use of the FPU. If the guest *does* have TS enabled
871 * then this is not necessary: no FPU activity can occur until the guest
872 * clears CR0.TS, and we will initialise the FPU when that happens.
873 */
874 if ( !(v->arch.hvm_vmx.cpu_shadow_cr0 & X86_CR0_TS) )
875 {
876 v->arch.hvm_vmx.cpu_cr0 |= X86_CR0_TS;
877 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
878 __vm_set_bit(EXCEPTION_BITMAP, TRAP_no_device);
879 }
880 }
882 static void vmx_set_tsc_offset(struct vcpu *v, u64 offset)
883 {
884 vmx_vmcs_enter(v);
885 __vmwrite(TSC_OFFSET, offset);
886 #if defined (__i386__)
887 __vmwrite(TSC_OFFSET_HIGH, offset >> 32);
888 #endif
889 vmx_vmcs_exit(v);
890 }
892 static void vmx_init_ap_context(
893 struct vcpu_guest_context *ctxt, int vcpuid, int trampoline_vector)
894 {
895 memset(ctxt, 0, sizeof(*ctxt));
896 ctxt->user_regs.eip = VMXASSIST_BASE;
897 ctxt->user_regs.edx = vcpuid;
898 ctxt->user_regs.ebx = trampoline_vector;
899 }
901 void do_nmi(struct cpu_user_regs *);
903 static void vmx_init_hypercall_page(struct domain *d, void *hypercall_page)
904 {
905 char *p;
906 int i;
908 memset(hypercall_page, 0, PAGE_SIZE);
910 for ( i = 0; i < (PAGE_SIZE / 32); i++ )
911 {
912 p = (char *)(hypercall_page + (i * 32));
913 *(u8 *)(p + 0) = 0xb8; /* mov imm32, %eax */
914 *(u32 *)(p + 1) = i;
915 *(u8 *)(p + 5) = 0x0f; /* vmcall */
916 *(u8 *)(p + 6) = 0x01;
917 *(u8 *)(p + 7) = 0xc1;
918 *(u8 *)(p + 8) = 0xc3; /* ret */
919 }
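/* Each 32-byte slot now contains "mov $<hypercall-number>,%eax; vmcall; ret"
 * (opcode bytes b8 imm32, 0f 01 c1, c3). */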
921 /* Don't support HYPERVISOR_iret at the moment */
922 *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
923 }
925 static int vmx_realmode(struct vcpu *v)
926 {
927 unsigned long rflags;
929 ASSERT(v == current);
931 rflags = __vmread(GUEST_RFLAGS);
932 return rflags & X86_EFLAGS_VM;
933 }
935 static int vmx_guest_x86_mode(struct vcpu *v)
936 {
937 unsigned long cs_ar_bytes;
939 ASSERT(v == current);
941 cs_ar_bytes = __vmread(GUEST_CS_AR_BYTES);
943 if ( vmx_long_mode_enabled(v) && (cs_ar_bytes & (1u<<13)) )
944 return 8;
946 if ( vmx_realmode(v) )
947 return 2;
949 return ((cs_ar_bytes & (1u<<14)) ? 4 : 2);
950 }
952 static int vmx_pae_enabled(struct vcpu *v)
953 {
954 unsigned long cr4 = v->arch.hvm_vmx.cpu_shadow_cr4;
955 return (vmx_paging_enabled(v) && (cr4 & X86_CR4_PAE));
956 }
958 static void vmx_update_host_cr3(struct vcpu *v)
959 {
960 ASSERT( (v == current) || !vcpu_runnable(v) );
961 vmx_vmcs_enter(v);
962 __vmwrite(HOST_CR3, v->arch.cr3);
963 vmx_vmcs_exit(v);
964 }
966 static void vmx_update_guest_cr3(struct vcpu *v)
967 {
968 ASSERT( (v == current) || !vcpu_runnable(v) );
969 vmx_vmcs_enter(v);
970 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
971 vmx_vmcs_exit(v);
972 }
975 static void vmx_inject_exception(
976 unsigned int trapnr, int errcode, unsigned long cr2)
977 {
978 struct vcpu *v = current;
979 vmx_inject_hw_exception(v, trapnr, errcode);
980 if ( trapnr == TRAP_page_fault )
981 v->arch.hvm_vmx.cpu_cr2 = cr2;
982 }
984 static void vmx_update_vtpr(struct vcpu *v, unsigned long value)
985 {
986 /* VMX doesn't have a V_TPR field */
987 }
989 static int vmx_event_injection_faulted(struct vcpu *v)
990 {
991 unsigned int idtv_info_field;
993 ASSERT(v == current);
995 idtv_info_field = __vmread(IDT_VECTORING_INFO_FIELD);
996 return (idtv_info_field & INTR_INFO_VALID_MASK);
997 }
999 static struct hvm_function_table vmx_function_table = {
1000 .disable = stop_vmx,
1001 .vcpu_initialise = vmx_vcpu_initialise,
1002 .vcpu_destroy = vmx_vcpu_destroy,
1003 .store_cpu_guest_regs = vmx_store_cpu_guest_regs,
1004 .load_cpu_guest_regs = vmx_load_cpu_guest_regs,
1005 .save_cpu_ctxt = vmx_save_vmcs_ctxt,
1006 .load_cpu_ctxt = vmx_load_vmcs_ctxt,
1007 .paging_enabled = vmx_paging_enabled,
1008 .long_mode_enabled = vmx_long_mode_enabled,
1009 .pae_enabled = vmx_pae_enabled,
1010 .guest_x86_mode = vmx_guest_x86_mode,
1011 .get_guest_ctrl_reg = vmx_get_ctrl_reg,
1012 .get_segment_base = vmx_get_segment_base,
1013 .get_segment_register = vmx_get_segment_register,
1014 .update_host_cr3 = vmx_update_host_cr3,
1015 .update_guest_cr3 = vmx_update_guest_cr3,
1016 .update_vtpr = vmx_update_vtpr,
1017 .stts = vmx_stts,
1018 .set_tsc_offset = vmx_set_tsc_offset,
1019 .inject_exception = vmx_inject_exception,
1020 .init_ap_context = vmx_init_ap_context,
1021 .init_hypercall_page = vmx_init_hypercall_page,
1022 .event_injection_faulted = vmx_event_injection_faulted
1023 };
1025 int start_vmx(void)
1026 {
1027 u32 eax, edx;
1028 struct vmcs_struct *vmcs;
1030 /*
1031 * Xen does not fill x86_capability words except 0.
1032 */
1033 boot_cpu_data.x86_capability[4] = cpuid_ecx(1);
1035 if ( !test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability) )
1036 return 0;
1038 rdmsr(IA32_FEATURE_CONTROL_MSR, eax, edx);
1040 if ( eax & IA32_FEATURE_CONTROL_MSR_LOCK )
1041 {
1042 if ( (eax & IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON) == 0x0 )
1043 {
1044 printk("VMX disabled by Feature Control MSR.\n");
1045 return 0;
1046 }
1047 }
1048 else
1049 {
1050 wrmsr(IA32_FEATURE_CONTROL_MSR,
1051 IA32_FEATURE_CONTROL_MSR_LOCK |
1052 IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON, 0);
1053 }
1055 set_in_cr4(X86_CR4_VMXE);
1057 vmx_init_vmcs_config();
1059 if ( smp_processor_id() == 0 )
1060 setup_vmcs_dump();
1062 if ( (vmcs = vmx_alloc_host_vmcs()) == NULL )
1063 {
1064 clear_in_cr4(X86_CR4_VMXE);
1065 printk("Failed to allocate host VMCS\n");
1066 return 0;
1067 }
1069 if ( __vmxon(virt_to_maddr(vmcs)) )
1070 {
1071 clear_in_cr4(X86_CR4_VMXE);
1072 printk("VMXON failed\n");
1073 vmx_free_host_vmcs(vmcs);
1074 return 0;
1075 }
1077 printk("VMXON is done\n");
1079 vmx_save_host_msrs();
1081 hvm_enable(&vmx_function_table);
1083 return 1;
1084 }
1086 /*
1087 * Not all cases receive valid value in the VM-exit instruction length field.
1088 * Callers must know what they're doing!
1089 */
1090 static int __get_instruction_length(void)
1091 {
1092 int len;
1093 len = __vmread(VM_EXIT_INSTRUCTION_LEN); /* Safe: callers audited */
1094 BUG_ON((len < 1) || (len > 15));
1095 return len;
1096 }
1098 static void inline __update_guest_eip(unsigned long inst_len)
1099 {
1100 unsigned long current_eip;
1102 current_eip = __vmread(GUEST_RIP);
1103 __vmwrite(GUEST_RIP, current_eip + inst_len);
1104 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
1105 }
1107 static int vmx_do_page_fault(unsigned long va, struct cpu_user_regs *regs)
1108 {
1109 int result;
1111 #if 0 /* keep for debugging */
1112 {
1113 unsigned long eip, cs;
1115 cs = __vmread(GUEST_CS_BASE);
1116 eip = __vmread(GUEST_RIP);
1117 HVM_DBG_LOG(DBG_LEVEL_VMMU,
1118 "vmx_do_page_fault = 0x%lx, cs_base=%lx, "
1119 "eip = %lx, error_code = %lx\n",
1120 va, cs, eip, (unsigned long)regs->error_code);
1121 }
1122 #endif
1124 result = paging_fault(va, regs);
1126 #if 0
1127 if ( !result )
1128 {
1129 eip = __vmread(GUEST_RIP);
1130 printk("vmx pgfault to guest va=%lx eip=%lx\n", va, eip);
1131 }
1132 #endif
1134 return result;
1135 }
1137 static void vmx_do_no_device_fault(void)
1138 {
1139 struct vcpu *v = current;
1141 setup_fpu(current);
1142 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
1144 /* Disable TS in guest CR0 unless the guest wants the exception too. */
1145 if ( !(v->arch.hvm_vmx.cpu_shadow_cr0 & X86_CR0_TS) )
1146 {
1147 v->arch.hvm_vmx.cpu_cr0 &= ~X86_CR0_TS;
1148 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
1149 }
1150 }
1152 #define bitmaskof(idx) (1U << ((idx) & 31))
1153 static void vmx_do_cpuid(struct cpu_user_regs *regs)
1154 {
1155 unsigned int input = (unsigned int)regs->eax;
1156 unsigned int count = (unsigned int)regs->ecx;
1157 unsigned int eax, ebx, ecx, edx;
1159 if ( input == 0x00000004 )
1160 {
1161 cpuid_count(input, count, &eax, &ebx, &ecx, &edx);
1162 eax &= NUM_CORES_RESET_MASK;
1163 }
1164 else if ( input == 0x40000003 )
1165 {
1166 /*
1167 * NB. Unsupported interface for private use of VMXASSIST only.
1168 * Note that this leaf lives at <max-hypervisor-leaf> + 1.
1169 */
1170 u64 value = ((u64)regs->edx << 32) | (u32)regs->ecx;
1171 unsigned long mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
1172 struct vcpu *v = current;
1173 char *p;
1175 gdprintk(XENLOG_INFO, "Input address is 0x%"PRIx64".\n", value);
1177 /* 8-byte aligned valid pseudophys address from vmxassist, please. */
1178 if ( (value & 7) || (mfn == INVALID_MFN) ||
1179 !v->arch.hvm_vmx.vmxassist_enabled )
1180 {
1181 domain_crash(v->domain);
1182 return;
1183 }
1185 p = map_domain_page(mfn);
1186 value = *((uint64_t *)(p + (value & (PAGE_SIZE - 1))));
1187 unmap_domain_page(p);
1189 gdprintk(XENLOG_INFO, "Output value is 0x%"PRIx64".\n", value);
1190 ecx = (u32)value;
1191 edx = (u32)(value >> 32);
1192 } else {
1193 hvm_cpuid(input, &eax, &ebx, &ecx, &edx);
1195 if ( input == 0x00000001 )
1196 {
1197 /* Mask off reserved bits. */
1198 ecx &= ~VMX_VCPU_CPUID_L1_ECX_RESERVED;
1200 ebx &= NUM_THREADS_RESET_MASK;
1202 /* Unsupportable for virtualised CPUs. */
1203 ecx &= ~(bitmaskof(X86_FEATURE_VMXE) |
1204 bitmaskof(X86_FEATURE_EST) |
1205 bitmaskof(X86_FEATURE_TM2) |
1206 bitmaskof(X86_FEATURE_CID));
1208 edx &= ~(bitmaskof(X86_FEATURE_HT) |
1209 bitmaskof(X86_FEATURE_ACPI) |
1210 bitmaskof(X86_FEATURE_ACC));
1211 }
1213 if ( input == 0x00000006 || input == 0x00000009 || input == 0x0000000A )
1214 eax = ebx = ecx = edx = 0x0;
1215 }
1217 regs->eax = (unsigned long)eax;
1218 regs->ebx = (unsigned long)ebx;
1219 regs->ecx = (unsigned long)ecx;
1220 regs->edx = (unsigned long)edx;
1222 HVMTRACE_3D(CPUID, current, input,
1223 ((uint64_t)eax << 32) | ebx, ((uint64_t)ecx << 32) | edx);
1224 }
1226 #define CASE_GET_REG_P(REG, reg) \
1227 case REG_ ## REG: reg_p = (unsigned long *)&(regs->reg); break
1229 #ifdef __i386__
1230 #define CASE_EXTEND_GET_REG_P
1231 #else
1232 #define CASE_EXTEND_GET_REG_P \
1233 CASE_GET_REG_P(R8, r8); \
1234 CASE_GET_REG_P(R9, r9); \
1235 CASE_GET_REG_P(R10, r10); \
1236 CASE_GET_REG_P(R11, r11); \
1237 CASE_GET_REG_P(R12, r12); \
1238 CASE_GET_REG_P(R13, r13); \
1239 CASE_GET_REG_P(R14, r14); \
1240 CASE_GET_REG_P(R15, r15)
1241 #endif
1243 static void vmx_dr_access(unsigned long exit_qualification,
1244 struct cpu_user_regs *regs)
1245 {
1246 struct vcpu *v = current;
1248 HVMTRACE_0D(DR_WRITE, v);
1250 v->arch.hvm_vcpu.flag_dr_dirty = 1;
1252 /* We could probably be smarter about this */
1253 __restore_debug_registers(v);
1255 /* Allow guest direct access to DR registers */
1256 v->arch.hvm_vcpu.u.vmx.exec_control &= ~CPU_BASED_MOV_DR_EXITING;
1257 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
1258 v->arch.hvm_vcpu.u.vmx.exec_control);
1259 }
1261 /*
1262 * Invalidate the TLB for va. Invalidate the shadow page corresponding
1263 * the address va.
1264 */
1265 static void vmx_do_invlpg(unsigned long va)
1266 {
1267 unsigned long eip;
1268 struct vcpu *v = current;
1270 HVMTRACE_2D(INVLPG, v, /*invlpga=*/ 0, va);
1272 eip = __vmread(GUEST_RIP);
1274 HVM_DBG_LOG(DBG_LEVEL_VMMU, "eip=%lx, va=%lx",
1275 eip, va);
1277 /*
1278 * We do the safest things first, then try to update the shadow
1279 * copying from guest
1280 */
1281 paging_invlpg(v, va);
1282 }
1285 static int vmx_check_descriptor(int long_mode, unsigned long eip, int inst_len,
1286 enum x86_segment seg, unsigned long *base,
1287 u32 *limit, u32 *ar_bytes)
1288 {
1289 enum vmcs_field ar_field, base_field, limit_field;
1291 *base = 0;
1292 *limit = 0;
1293 if ( seg != x86_seg_es )
1294 {
1295 unsigned char inst[MAX_INST_LEN];
1296 int i;
1297 extern int inst_copy_from_guest(unsigned char *, unsigned long, int);
1299 if ( !long_mode )
1300 eip += __vmread(GUEST_CS_BASE);
1301 memset(inst, 0, MAX_INST_LEN);
1302 if ( inst_copy_from_guest(inst, eip, inst_len) != inst_len )
1303 {
1304 gdprintk(XENLOG_ERR, "Get guest instruction failed\n");
1305 domain_crash(current->domain);
1306 return 0;
1307 }
1309 for ( i = 0; i < inst_len; i++ )
1310 {
1311 switch ( inst[i] )
1312 {
1313 case 0xf3: /* REPZ */
1314 case 0xf2: /* REPNZ */
1315 case 0xf0: /* LOCK */
1316 case 0x66: /* data32 */
1317 case 0x67: /* addr32 */
1318 #ifdef __x86_64__
1319 case 0x40 ... 0x4f: /* REX */
1320 #endif
1321 continue;
1322 case 0x2e: /* CS */
1323 seg = x86_seg_cs;
1324 continue;
1325 case 0x36: /* SS */
1326 seg = x86_seg_ss;
1327 continue;
1328 case 0x26: /* ES */
1329 seg = x86_seg_es;
1330 continue;
1331 case 0x64: /* FS */
1332 seg = x86_seg_fs;
1333 continue;
1334 case 0x65: /* GS */
1335 seg = x86_seg_gs;
1336 continue;
1337 case 0x3e: /* DS */
1338 seg = x86_seg_ds;
1339 continue;
1340 }
1341 }
1342 }
1344 switch ( seg )
1345 {
1346 case x86_seg_cs:
1347 ar_field = GUEST_CS_AR_BYTES;
1348 base_field = GUEST_CS_BASE;
1349 limit_field = GUEST_CS_LIMIT;
1350 break;
1351 case x86_seg_ds:
1352 ar_field = GUEST_DS_AR_BYTES;
1353 base_field = GUEST_DS_BASE;
1354 limit_field = GUEST_DS_LIMIT;
1355 break;
1356 case x86_seg_es:
1357 ar_field = GUEST_ES_AR_BYTES;
1358 base_field = GUEST_ES_BASE;
1359 limit_field = GUEST_ES_LIMIT;
1360 break;
1361 case x86_seg_fs:
1362 ar_field = GUEST_FS_AR_BYTES;
1363 base_field = GUEST_FS_BASE;
1364 limit_field = GUEST_FS_LIMIT;
1365 break;
1366 case x86_seg_gs:
1367 ar_field = GUEST_GS_AR_BYTES;
1368 base_field = GUEST_GS_BASE;
1369 limit_field = GUEST_GS_LIMIT;
1370 break;
1371 case x86_seg_ss:
1372 ar_field = GUEST_SS_AR_BYTES;
1373 base_field = GUEST_SS_BASE;
1374 limit_field = GUEST_SS_LIMIT;
1375 break;
1376 default:
1377 BUG();
1378 return 0;
1379 }
1381 if ( !long_mode || seg == x86_seg_fs || seg == x86_seg_gs )
1382 {
1383 *base = __vmread(base_field);
1384 *limit = __vmread(limit_field);
1385 }
1386 *ar_bytes = __vmread(ar_field);
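/* Bit 16 of the VMX access-rights field is the "segment unusable" flag, so a
 * non-zero return below means the segment is loaded and the values read above
 * are valid. */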
1388 return !(*ar_bytes & 0x10000);
1389 }
1391 static void vmx_io_instruction(unsigned long exit_qualification,
1392 unsigned long inst_len)
1393 {
1394 struct cpu_user_regs *regs;
1395 struct hvm_io_op *pio_opp;
1396 unsigned int port, size;
1397 int dir, df, vm86;
1399 pio_opp = &current->arch.hvm_vcpu.io_op;
1400 pio_opp->instr = INSTR_PIO;
1401 pio_opp->flags = 0;
1403 regs = &pio_opp->io_context;
1405 /* Copy current guest state into io instruction state structure. */
1406 memcpy(regs, guest_cpu_user_regs(), HVM_CONTEXT_STACK_BYTES);
1407 hvm_store_cpu_guest_regs(current, regs, NULL);
1409 vm86 = regs->eflags & X86_EFLAGS_VM ? 1 : 0;
1410 df = regs->eflags & X86_EFLAGS_DF ? 1 : 0;
1412 HVM_DBG_LOG(DBG_LEVEL_IO, "vm86 %d, eip=%x:%lx, "
1413 "exit_qualification = %lx",
1414 vm86, regs->cs, (unsigned long)regs->eip, exit_qualification);
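/* Decode the I/O exit qualification: bits 2:0 = access size - 1, bit 3 =
 * direction (1 = IN), bit 4 = string instruction, bit 5 = REP prefix,
 * bit 6 = immediate port operand (else port in DX), bits 31:16 = port. */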
1416 if ( test_bit(6, &exit_qualification) )
1417 port = (exit_qualification >> 16) & 0xFFFF;
1418 else
1419 port = regs->edx & 0xffff;
1421 size = (exit_qualification & 7) + 1;
1422 dir = test_bit(3, &exit_qualification); /* direction */
1424 if (dir==IOREQ_READ)
1425 HVMTRACE_2D(IO_READ, current, port, size);
1426 else
1427 HVMTRACE_2D(IO_WRITE, current, port, size);
1429 if ( test_bit(4, &exit_qualification) ) { /* string instruction */
1430 unsigned long addr, count = 1, base;
1431 paddr_t paddr;
1432 unsigned long gfn;
1433 u32 ar_bytes, limit;
1434 int sign = regs->eflags & X86_EFLAGS_DF ? -1 : 1;
1435 int long_mode = 0;
1437 ar_bytes = __vmread(GUEST_CS_AR_BYTES);
1438 #ifdef __x86_64__
1439 if ( vmx_long_mode_enabled(current) && (ar_bytes & (1u<<13)) )
1440 long_mode = 1;
1441 #endif
1442 addr = __vmread(GUEST_LINEAR_ADDRESS);
1444 if ( test_bit(5, &exit_qualification) ) { /* "rep" prefix */
1445 pio_opp->flags |= REPZ;
1446 count = regs->ecx;
1447 if ( !long_mode && (vm86 || !(ar_bytes & (1u<<14))) )
1448 count &= 0xFFFF;
1449 }
1451 /*
1452 * In protected mode, guest linear address is invalid if the
1453 * selector is null.
1454 */
1455 if ( !vmx_check_descriptor(long_mode, regs->eip, inst_len,
1456 dir==IOREQ_WRITE ? x86_seg_ds : x86_seg_es,
1457 &base, &limit, &ar_bytes) ) {
1458 if ( !long_mode ) {
1459 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1460 return;
1461 }
1462 addr = dir == IOREQ_WRITE ? base + regs->esi : regs->edi;
1463 }
1465 if ( !long_mode ) {
1466 unsigned long ea = addr - base;
1468 /* Segment must be readable for outs and writeable for ins. */
1469 if ( dir == IOREQ_WRITE ? (ar_bytes & 0xa) == 0x8
1470 : (ar_bytes & 0xa) != 0x2 ) {
1471 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1472 return;
1473 }
1475 /* Offset must be within limits. */
1476 ASSERT(ea == (u32)ea);
1477 if ( (u32)(ea + size - 1) < (u32)ea ||
1478 (ar_bytes & 0xc) != 0x4 ? ea + size - 1 > limit
1479 : ea <= limit )
1480 {
1481 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1482 return;
1483 }
1485 /* Check the limit for repeated instructions, as above we checked
1486 only the first instance. Truncate the count if a limit violation
1487 would occur. Note that the checking is not necessary for page
1488 granular segments as transfers crossing page boundaries will be
1489 broken up anyway. */
1490 if ( !(ar_bytes & (1u<<15)) && count > 1 )
1491 {
1492 if ( (ar_bytes & 0xc) != 0x4 )
1493 {
1494 /* expand-up */
1495 if ( !df )
1496 {
1497 if ( ea + count * size - 1 < ea ||
1498 ea + count * size - 1 > limit )
1499 count = (limit + 1UL - ea) / size;
1500 }
1501 else
1502 {
1503 if ( count - 1 > ea / size )
1504 count = ea / size + 1;
1505 }
1506 }
1507 else
1508 {
1509 /* expand-down */
1510 if ( !df )
1511 {
1512 if ( count - 1 > -(s32)ea / size )
1513 count = -(s32)ea / size + 1UL;
1514 }
1515 else
1516 {
1517 if ( ea < (count - 1) * size ||
1518 ea - (count - 1) * size <= limit )
1519 count = (ea - limit - 1) / size + 1;
1520 }
1521 }
1522 ASSERT(count);
1523 }
1524 }
1525 #ifdef __x86_64__
1526 else
1527 {
1528 if ( !is_canonical_address(addr) ||
1529 !is_canonical_address(addr + size - 1) )
1530 {
1531 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1532 return;
1533 }
1534 if ( count > (1UL << 48) / size )
1535 count = (1UL << 48) / size;
1536 if ( !(regs->eflags & EF_DF) )
1537 {
1538 if ( addr + count * size - 1 < addr ||
1539 !is_canonical_address(addr + count * size - 1) )
1540 count = (addr & ~((1UL << 48) - 1)) / size;
1541 }
1542 else
1543 {
1544 if ( (count - 1) * size > addr ||
1545 !is_canonical_address(addr + (count - 1) * size) )
1546 count = (addr & ~((1UL << 48) - 1)) / size + 1;
1547 }
1548 ASSERT(count);
1549 }
1550 #endif
1552 /* Translate the address to a physical address */
1553 gfn = paging_gva_to_gfn(current, addr);
1554 if ( gfn == INVALID_GFN )
1555 {
1556 /* The guest does not have the RAM address mapped.
1557 * Need to send in a page fault */
1558 int errcode = 0;
1559 /* IO read --> memory write */
1560 if ( dir == IOREQ_READ ) errcode |= PFEC_write_access;
1561 vmx_inject_exception(TRAP_page_fault, errcode, addr);
1562 return;
1563 }
1564 paddr = (paddr_t)gfn << PAGE_SHIFT | (addr & ~PAGE_MASK);
1566 /*
1567 * Handle string pio instructions that cross pages or that
1568 * are unaligned. See the comments in hvm_domain.c/handle_mmio()
1569 */
1570 if ( (addr & PAGE_MASK) != ((addr + size - 1) & PAGE_MASK) ) {
1571 unsigned long value = 0;
1573 pio_opp->flags |= OVERLAP;
1575 if ( dir == IOREQ_WRITE ) /* OUTS */
1576 {
1577 if ( hvm_paging_enabled(current) )
1578 {
1579 int rv = hvm_copy_from_guest_virt(&value, addr, size);
1580 if ( rv != 0 )
1581 {
1582 /* Failed on the page-spanning copy. Inject PF into
1583 * the guest for the address where we failed. */
1584 addr += size - rv;
1585 gdprintk(XENLOG_DEBUG, "Pagefault reading non-io side "
1586 "of a page-spanning PIO: va=%#lx\n", addr);
1587 vmx_inject_exception(TRAP_page_fault, 0, addr);
1588 return;
1589 }
1590 }
1591 else
1592 (void) hvm_copy_from_guest_phys(&value, addr, size);
1593 } else /* dir != IOREQ_WRITE */
1594 /* Remember where to write the result, as a *VA*.
1595 * Must be a VA so we can handle the page overlap
1596 * correctly in hvm_pio_assist() */
1597 pio_opp->addr = addr;
1599 if ( count == 1 )
1600 regs->eip += inst_len;
1602 send_pio_req(port, 1, size, value, dir, df, 0);
1603 } else {
1604 unsigned long last_addr = sign > 0 ? addr + count * size - 1
1605 : addr - (count - 1) * size;
1607 if ( (addr & PAGE_MASK) != (last_addr & PAGE_MASK) )
1608 {
1609 if ( sign > 0 )
1610 count = (PAGE_SIZE - (addr & ~PAGE_MASK)) / size;
1611 else
1612 count = (addr & ~PAGE_MASK) / size + 1;
1613 } else
1614 regs->eip += inst_len;
1616 send_pio_req(port, count, size, paddr, dir, df, 1);
1617 }
1618 } else {
1619 if ( port == 0xe9 && dir == IOREQ_WRITE && size == 1 )
1620 hvm_print_line(current, regs->eax); /* guest debug output */
1622 regs->eip += inst_len;
1623 send_pio_req(port, 1, size, regs->eax, dir, df, 0);
1624 }
1625 }
1627 static void vmx_world_save(struct vcpu *v, struct vmx_assist_context *c)
1628 {
1629 /* NB. Skip transition instruction. */
1630 c->eip = __vmread(GUEST_RIP);
1631 c->eip += __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
1633 c->esp = __vmread(GUEST_RSP);
1634 c->eflags = __vmread(GUEST_RFLAGS);
1636 c->cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;
1637 c->cr3 = v->arch.hvm_vmx.cpu_cr3;
1638 c->cr4 = v->arch.hvm_vmx.cpu_shadow_cr4;
1640 c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
1641 c->idtr_base = __vmread(GUEST_IDTR_BASE);
1643 c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
1644 c->gdtr_base = __vmread(GUEST_GDTR_BASE);
1646 c->cs_sel = __vmread(GUEST_CS_SELECTOR);
1647 c->cs_limit = __vmread(GUEST_CS_LIMIT);
1648 c->cs_base = __vmread(GUEST_CS_BASE);
1649 c->cs_arbytes.bytes = __vmread(GUEST_CS_AR_BYTES);
1651 c->ds_sel = __vmread(GUEST_DS_SELECTOR);
1652 c->ds_limit = __vmread(GUEST_DS_LIMIT);
1653 c->ds_base = __vmread(GUEST_DS_BASE);
1654 c->ds_arbytes.bytes = __vmread(GUEST_DS_AR_BYTES);
1656 c->es_sel = __vmread(GUEST_ES_SELECTOR);
1657 c->es_limit = __vmread(GUEST_ES_LIMIT);
1658 c->es_base = __vmread(GUEST_ES_BASE);
1659 c->es_arbytes.bytes = __vmread(GUEST_ES_AR_BYTES);
1661 c->ss_sel = __vmread(GUEST_SS_SELECTOR);
1662 c->ss_limit = __vmread(GUEST_SS_LIMIT);
1663 c->ss_base = __vmread(GUEST_SS_BASE);
1664 c->ss_arbytes.bytes = __vmread(GUEST_SS_AR_BYTES);
1666 c->fs_sel = __vmread(GUEST_FS_SELECTOR);
1667 c->fs_limit = __vmread(GUEST_FS_LIMIT);
1668 c->fs_base = __vmread(GUEST_FS_BASE);
1669 c->fs_arbytes.bytes = __vmread(GUEST_FS_AR_BYTES);
1671 c->gs_sel = __vmread(GUEST_GS_SELECTOR);
1672 c->gs_limit = __vmread(GUEST_GS_LIMIT);
1673 c->gs_base = __vmread(GUEST_GS_BASE);
1674 c->gs_arbytes.bytes = __vmread(GUEST_GS_AR_BYTES);
1676 c->tr_sel = __vmread(GUEST_TR_SELECTOR);
1677 c->tr_limit = __vmread(GUEST_TR_LIMIT);
1678 c->tr_base = __vmread(GUEST_TR_BASE);
1679 c->tr_arbytes.bytes = __vmread(GUEST_TR_AR_BYTES);
1681 c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR);
1682 c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT);
1683 c->ldtr_base = __vmread(GUEST_LDTR_BASE);
1684 c->ldtr_arbytes.bytes = __vmread(GUEST_LDTR_AR_BYTES);
1685 }
1687 static int vmx_world_restore(struct vcpu *v, struct vmx_assist_context *c)
1688 {
1689 unsigned long mfn, old_base_mfn;
1691 __vmwrite(GUEST_RIP, c->eip);
1692 __vmwrite(GUEST_RSP, c->esp);
1693 __vmwrite(GUEST_RFLAGS, c->eflags);
1695 v->arch.hvm_vmx.cpu_shadow_cr0 = c->cr0;
1696 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
1698 if ( !vmx_paging_enabled(v) )
1699 goto skip_cr3;
1701 if ( c->cr3 == v->arch.hvm_vmx.cpu_cr3 )
1702 {
1703 /*
1704 * This is simple TLB flush, implying the guest has
1705 * removed some translation or changed page attributes.
1706 * We simply invalidate the shadow.
1707 */
1708 mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
1709 if ( mfn != pagetable_get_pfn(v->arch.guest_table) )
1710 goto bad_cr3;
1711 }
1712 else
1713 {
1714 /*
1715 * If different, make a shadow. Check if the PDBR is valid
1716 * first.
1717 */
1718 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %x", c->cr3);
1719 mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
1720 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
1721 goto bad_cr3;
1722 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1723 v->arch.guest_table = pagetable_from_pfn(mfn);
1724 if (old_base_mfn)
1725 put_page(mfn_to_page(old_base_mfn));
1726 v->arch.hvm_vmx.cpu_cr3 = c->cr3;
1727 }
1729 skip_cr3:
1730 if ( !vmx_paging_enabled(v) )
1731 HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
1732 else
1733 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3);
1735 __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
1736 v->arch.hvm_vmx.cpu_shadow_cr4 = c->cr4;
1737 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);
1739 __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
1740 __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
1742 __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
1743 __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
1745 __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
1746 __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
1747 __vmwrite(GUEST_CS_BASE, c->cs_base);
1748 __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes.bytes);
1750 __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
1751 __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
1752 __vmwrite(GUEST_DS_BASE, c->ds_base);
1753 __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes.bytes);
1755 __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
1756 __vmwrite(GUEST_ES_LIMIT, c->es_limit);
1757 __vmwrite(GUEST_ES_BASE, c->es_base);
1758 __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes.bytes);
1760 __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
1761 __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
1762 __vmwrite(GUEST_SS_BASE, c->ss_base);
1763 __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes.bytes);
1765 __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
1766 __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
1767 __vmwrite(GUEST_FS_BASE, c->fs_base);
1768 __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes.bytes);
1770 __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
1771 __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
1772 __vmwrite(GUEST_GS_BASE, c->gs_base);
1773 __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes.bytes);
1775 __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
1776 __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
1777 __vmwrite(GUEST_TR_BASE, c->tr_base);
1778 __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes.bytes);
1780 __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
1781 __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
1782 __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
1783 __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes.bytes);
1785 paging_update_paging_modes(v);
1786 return 0;
1788 bad_cr3:
1789 gdprintk(XENLOG_ERR, "Invalid CR3 value=%x", c->cr3);
1790 return -EINVAL;
1791 }
1793 enum { VMX_ASSIST_INVOKE = 0, VMX_ASSIST_RESTORE };
1795 static int vmx_assist(struct vcpu *v, int mode)
1796 {
1797 struct vmx_assist_context c;
1798 u32 magic;
1799 u32 cp;
1801 /* make sure vmxassist exists (this is not an error) */
1802 if (hvm_copy_from_guest_phys(&magic, VMXASSIST_MAGIC_OFFSET,
1803 sizeof(magic)))
1804 return 0;
1805 if (magic != VMXASSIST_MAGIC)
1806 return 0;
1808 switch (mode) {
1809 /*
1810 * Transfer control to vmxassist.
1811 * Store the current context in VMXASSIST_OLD_CONTEXT and load
1812 * the new VMXASSIST_NEW_CONTEXT context. This context was created
1813 * by vmxassist and will transfer control to it.
1814 */
1815 case VMX_ASSIST_INVOKE:
1816 /* save the old context */
1817 if (hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)))
1818 goto error;
1819 if (cp != 0) {
1820 vmx_world_save(v, &c);
1821 if (hvm_copy_to_guest_phys(cp, &c, sizeof(c)))
1822 goto error;
1823 }
1825 /* restore the new context, this should activate vmxassist */
1826 if (hvm_copy_from_guest_phys(&cp, VMXASSIST_NEW_CONTEXT, sizeof(cp)))
1827 goto error;
1828 if (cp != 0) {
1829 if (hvm_copy_from_guest_phys(&c, cp, sizeof(c)))
1830 goto error;
1831 if ( vmx_world_restore(v, &c) != 0 )
1832 goto error;
1833 v->arch.hvm_vmx.vmxassist_enabled = 1;
1834 return 1;
1835 }
1836 break;
1838 /*
1839 * Restore the VMXASSIST_OLD_CONTEXT that was saved by
1840 * VMX_ASSIST_INVOKE above.
1841 */
1842 case VMX_ASSIST_RESTORE:
1843 /* save the old context */
1844 if (hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)))
1845 goto error;
1846 if (cp != 0) {
1847 if (hvm_copy_from_guest_phys(&c, cp, sizeof(c)))
1848 goto error;
1849 if ( vmx_world_restore(v, &c) != 0 )
1850 goto error;
1851 v->arch.hvm_vmx.vmxassist_enabled = 0;
1852 return 1;
1853 }
1854 break;
1855 }
1857 error:
1858 gdprintk(XENLOG_ERR, "Failed to transfer to vmxassist\n");
1859 domain_crash(v->domain);
1860 return 0;
1861 }
1863 static int vmx_set_cr0(unsigned long value)
1864 {
1865 struct vcpu *v = current;
1866 unsigned long mfn;
1867 unsigned long eip;
1868 int paging_enabled;
1869 unsigned long vm_entry_value;
1870 unsigned long old_cr0;
1871 unsigned long old_base_mfn;
1873 /*
1874 * CR0: We don't want to lose PE and PG.
1875 */
1876 old_cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;
1877 paging_enabled = (old_cr0 & X86_CR0_PE) && (old_cr0 & X86_CR0_PG);
1879 /* TS cleared? Then initialise FPU now. */
1880 if ( !(value & X86_CR0_TS) )
1881 {
1882 setup_fpu(v);
1883 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
1884 }
1886 v->arch.hvm_vmx.cpu_cr0 = (value | X86_CR0_PE | X86_CR0_PG
1887 | X86_CR0_NE | X86_CR0_WP);
1888 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
1890 v->arch.hvm_vmx.cpu_shadow_cr0 = value;
1891 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
1893 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx\n", value);
1895 if ( (value & X86_CR0_PE) && (value & X86_CR0_PG) && !paging_enabled )
1896 {
1897 /*
1898 * Trying to enable guest paging.
1899 * The guest CR3 must be pointing to the guest physical.
1900 */
1901 mfn = get_mfn_from_gpfn(v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT);
1902 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
1903 {
1904 gdprintk(XENLOG_ERR, "Invalid CR3 value = %lx (mfn=%lx)\n",
1905 v->arch.hvm_vmx.cpu_cr3, mfn);
1906 domain_crash(v->domain);
1907 return 0;
1908 }
1910 #if defined(__x86_64__)
1911 if ( vmx_lme_is_set(v) )
1912 {
1913 if ( !(v->arch.hvm_vmx.cpu_shadow_cr4 & X86_CR4_PAE) )
1914 {
1915 HVM_DBG_LOG(DBG_LEVEL_1, "Guest enabled paging "
1916 "with EFER.LME set but not CR4.PAE\n");
1917 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
1918 }
1919 else
1920 {
1921 HVM_DBG_LOG(DBG_LEVEL_1, "Enabling long mode\n");
1922 v->arch.hvm_vmx.msr_state.msrs[VMX_INDEX_MSR_EFER]
1923 |= EFER_LMA;
1924 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
1925 vm_entry_value |= VM_ENTRY_IA32E_MODE;
1926 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1927 }
1928 }
1929 #endif
1931 /*
1932 * Now arch.guest_table points to machine physical.
1933 */
1934 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1935 v->arch.guest_table = pagetable_from_pfn(mfn);
1936 if (old_base_mfn)
1937 put_page(mfn_to_page(old_base_mfn));
1938 paging_update_paging_modes(v);
1940 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
1941 (unsigned long) (mfn << PAGE_SHIFT));
1943 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
1944 v->arch.hvm_vmx.cpu_cr3, mfn);
1945 }
1947 if ( !((value & X86_CR0_PE) && (value & X86_CR0_PG)) && paging_enabled )
1948 if ( v->arch.hvm_vmx.cpu_cr3 ) {
1949 put_page(mfn_to_page(get_mfn_from_gpfn(
1950 v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)));
1951 v->arch.guest_table = pagetable_null();
1952 }
1954 /*
1955 * VMX does not implement real-mode virtualization. We emulate
1956 * real-mode by performing a world switch to VMXAssist whenever
1957 * a partition disables the CR0.PE bit.
1958 */
1959 if ( (value & X86_CR0_PE) == 0 )
1960 {
1961 if ( value & X86_CR0_PG ) {
1962 /* inject GP here */
1963 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
1964 return 0;
1965 } else {
1966 /*
1967 * Disable paging here.
1968 * Same to PE == 1 && PG == 0
1969 */
1970 if ( vmx_long_mode_enabled(v) )
1971 {
1972 v->arch.hvm_vmx.msr_state.msrs[VMX_INDEX_MSR_EFER]
1973 &= ~EFER_LMA;
1974 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
1975 vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
1976 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1977 }
1978 }
1980 if ( vmx_assist(v, VMX_ASSIST_INVOKE) )
1981 {
1982 eip = __vmread(GUEST_RIP);
1983 HVM_DBG_LOG(DBG_LEVEL_1,
1984 "Transfering control to vmxassist %%eip 0x%lx\n", eip);
1985 return 0; /* do not update eip! */
1986 }
1987 }
1988 else if ( v->arch.hvm_vmx.vmxassist_enabled )
1989 {
1990 eip = __vmread(GUEST_RIP);
1991 HVM_DBG_LOG(DBG_LEVEL_1,
1992 "Enabling CR0.PE at %%eip 0x%lx\n", eip);
1993 if ( vmx_assist(v, VMX_ASSIST_RESTORE) )
1994 {
1995 eip = __vmread(GUEST_RIP);
1996 HVM_DBG_LOG(DBG_LEVEL_1,
1997 "Restoring to %%eip 0x%lx\n", eip);
1998 return 0; /* do not update eip! */
1999 }
2000 }
2001 else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
2002 {
2003 if ( vmx_long_mode_enabled(v) )
2004 {
2005 v->arch.hvm_vmx.msr_state.msrs[VMX_INDEX_MSR_EFER] &= ~EFER_LMA;
2006 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
2007 vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
2008 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
2010 paging_update_paging_modes(v);
2013 return 1;
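/*
 * Accessor macros used below: they map the general-purpose register number
 * decoded from the exit qualification onto the matching field of
 * struct cpu_user_regs. The R8-R15 cases exist only on 64-bit builds.
 */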
2016 #define CASE_SET_REG(REG, reg) \
2017 case REG_ ## REG: regs->reg = value; break
2018 #define CASE_GET_REG(REG, reg) \
2019 case REG_ ## REG: value = regs->reg; break
2021 #define CASE_EXTEND_SET_REG \
2022 CASE_EXTEND_REG(S)
2023 #define CASE_EXTEND_GET_REG \
2024 CASE_EXTEND_REG(G)
2026 #ifdef __i386__
2027 #define CASE_EXTEND_REG(T)
2028 #else
2029 #define CASE_EXTEND_REG(T) \
2030 CASE_ ## T ## ET_REG(R8, r8); \
2031 CASE_ ## T ## ET_REG(R9, r9); \
2032 CASE_ ## T ## ET_REG(R10, r10); \
2033 CASE_ ## T ## ET_REG(R11, r11); \
2034 CASE_ ## T ## ET_REG(R12, r12); \
2035 CASE_ ## T ## ET_REG(R13, r13); \
2036 CASE_ ## T ## ET_REG(R14, r14); \
2037 CASE_ ## T ## ET_REG(R15, r15)
2038 #endif
2040 /*
2041 * Write to control registers
2042 */
2043 static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
2044 {
2045 unsigned long value, old_cr, old_base_mfn, mfn;
2046 struct vcpu *v = current;
2047 struct vlapic *vlapic = vcpu_vlapic(v);
2049 switch ( gp )
2050 {
2051 CASE_GET_REG(EAX, eax);
2052 CASE_GET_REG(ECX, ecx);
2053 CASE_GET_REG(EDX, edx);
2054 CASE_GET_REG(EBX, ebx);
2055 CASE_GET_REG(EBP, ebp);
2056 CASE_GET_REG(ESI, esi);
2057 CASE_GET_REG(EDI, edi);
2058 CASE_EXTEND_GET_REG;
2059 case REG_ESP:
2060 value = __vmread(GUEST_RSP);
2061 break;
2062 default:
2063 gdprintk(XENLOG_ERR, "invalid gp: %d\n", gp);
2064 goto exit_and_crash;
2065 }
2067 HVMTRACE_2D(CR_WRITE, v, cr, value);
2069 HVM_DBG_LOG(DBG_LEVEL_1, "CR%d, value = %lx", cr, value);
2071 switch ( cr )
2072 {
2073 case 0:
2074 return vmx_set_cr0(value);
2076 case 3:
2077 /*
2078 * If paging is not enabled yet, simply copy the value to CR3.
2079 */
2080 if (!vmx_paging_enabled(v)) {
2081 v->arch.hvm_vmx.cpu_cr3 = value;
2082 break;
2083 }
2085 /*
2086 * We make a new one if the shadow does not exist.
2087 */
2088 if (value == v->arch.hvm_vmx.cpu_cr3) {
2089 /*
2090 * This is simple TLB flush, implying the guest has
2091 * removed some translation or changed page attributes.
2092 * We simply invalidate the shadow.
2093 */
2094 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
2095 if (mfn != pagetable_get_pfn(v->arch.guest_table))
2096 goto bad_cr3;
2097 paging_update_cr3(v);
2098 } else {
2099 /*
2100 * If different, make a shadow. Check if the PDBR is valid
2101 * first.
2102 */
2103 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
2104 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
2105 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
2106 goto bad_cr3;
2107 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
2108 v->arch.guest_table = pagetable_from_pfn(mfn);
2109 if (old_base_mfn)
2110 put_page(mfn_to_page(old_base_mfn));
2111 v->arch.hvm_vmx.cpu_cr3 = value;
2112 update_cr3(v);
2113 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
2114 }
2115 break;
2117 case 4: /* CR4 */
2118 old_cr = v->arch.hvm_vmx.cpu_shadow_cr4;
2120 if ( (value & X86_CR4_PAE) && !(old_cr & X86_CR4_PAE) )
2121 {
2122 if ( vmx_pgbit_test(v) )
2123 {
2124 /* The guest is a 32-bit PAE guest. */
2125 #if CONFIG_PAGING_LEVELS >= 3
2126 unsigned long mfn, old_base_mfn;
2127 mfn = get_mfn_from_gpfn(v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT);
2128 if ( !mfn_valid(mfn) ||
2129 !get_page(mfn_to_page(mfn), v->domain) )
2130 goto bad_cr3;
2132 /*
2133 * Now arch.guest_table points to machine physical.
2134 */
2136 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
2137 v->arch.guest_table = pagetable_from_pfn(mfn);
2138 if ( old_base_mfn )
2139 put_page(mfn_to_page(old_base_mfn));
2141 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
2142 (unsigned long) (mfn << PAGE_SHIFT));
2144 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2145 "Update CR3 value = %lx, mfn = %lx",
2146 v->arch.hvm_vmx.cpu_cr3, mfn);
2147 #endif
2148 }
2149 }
2150 else if ( !(value & X86_CR4_PAE) )
2151 {
2152 if ( unlikely(vmx_long_mode_enabled(v)) )
2153 {
2154 HVM_DBG_LOG(DBG_LEVEL_1, "Guest cleared CR4.PAE while "
2155 "EFER.LMA is set\n");
2156 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2157 }
2158 }
2160 __vmwrite(GUEST_CR4, value | VMX_CR4_HOST_MASK);
2161 v->arch.hvm_vmx.cpu_shadow_cr4 = value;
2162 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);
2164 /*
2165 * Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates
2166 * all TLB entries except global entries.
2167 */
2168 if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
2169 paging_update_paging_modes(v);
2170 break;
2172 case 8:
2173 vlapic_set_reg(vlapic, APIC_TASKPRI, ((value & 0x0F) << 4));
2174 break;
2176 default:
2177 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
2178 domain_crash(v->domain);
2179 return 0;
2180 }
2182 return 1;
2184 bad_cr3:
2185 gdprintk(XENLOG_ERR, "Invalid CR3\n");
2186 exit_and_crash:
2187 domain_crash(v->domain);
2188 return 0;
2189 }
2191 /*
2192 * Read from control registers. CR0 and CR4 are read from the shadow.
2193 */
2194 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
2195 {
2196 unsigned long value = 0;
2197 struct vcpu *v = current;
2198 struct vlapic *vlapic = vcpu_vlapic(v);
2200 switch ( cr )
2201 {
2202 case 3:
2203 value = (unsigned long)v->arch.hvm_vmx.cpu_cr3;
2204 break;
2205 case 8:
2206 value = (unsigned long)vlapic_get_reg(vlapic, APIC_TASKPRI);
2207 value = (value & 0xF0) >> 4;
2208 break;
2209 default:
2210 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
2211 domain_crash(v->domain);
2212 break;
2213 }
2215 switch ( gp ) {
2216 CASE_SET_REG(EAX, eax);
2217 CASE_SET_REG(ECX, ecx);
2218 CASE_SET_REG(EDX, edx);
2219 CASE_SET_REG(EBX, ebx);
2220 CASE_SET_REG(EBP, ebp);
2221 CASE_SET_REG(ESI, esi);
2222 CASE_SET_REG(EDI, edi);
2223 CASE_EXTEND_SET_REG;
2224 case REG_ESP:
2225 __vmwrite(GUEST_RSP, value);
2226 regs->esp = value;
2227 break;
2228 default:
2229 printk("invalid gp: %d\n", gp);
2230 domain_crash(v->domain);
2231 break;
2232 }
2234 HVMTRACE_2D(CR_READ, v, cr, value);
2236 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR%d, value = %lx", cr, value);
2237 }
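/*
 * Dispatch a CR-access vmexit (MOV to/from CRn, CLTS, LMSW). Returns 1 if
 * the guest instruction was handled and the caller should advance RIP,
 * 0 otherwise (e.g. an exception was injected or the domain was crashed).
 */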
2239 static int vmx_cr_access(unsigned long exit_qualification,
2240 struct cpu_user_regs *regs)
2241 {
2242 unsigned int gp, cr;
2243 unsigned long value;
2244 struct vcpu *v = current;
2246 switch (exit_qualification & CONTROL_REG_ACCESS_TYPE) {
2247 case TYPE_MOV_TO_CR:
2248 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
2249 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
2250 return mov_to_cr(gp, cr, regs);
2251 case TYPE_MOV_FROM_CR:
2252 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
2253 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
2254 mov_from_cr(cr, gp, regs);
2255 break;
2256 case TYPE_CLTS:
2257 /* We initialise the FPU now, to avoid needing another vmexit. */
2258 setup_fpu(v);
2259 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
2261 v->arch.hvm_vmx.cpu_cr0 &= ~X86_CR0_TS; /* clear TS */
2262 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
2264 v->arch.hvm_vmx.cpu_shadow_cr0 &= ~X86_CR0_TS; /* clear TS */
2265 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
2266 break;
2267 case TYPE_LMSW:
2268 value = v->arch.hvm_vmx.cpu_shadow_cr0;
2269 value = (value & ~0xF) |
2270 (((exit_qualification & LMSW_SOURCE_DATA) >> 16) & 0xF);
2271 return vmx_set_cr0(value);
2272 default:
2273 BUG();
2274 }
2276 return 1;
2277 }
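/*
 * Handle an RDMSR vmexit. TSC, SYSENTER and APIC-base reads are satisfied
 * from virtual state; other MSRs are offered to the long-mode handler,
 * the hypervisor register space, and finally a real (safe) RDMSR.
 * Returns 1 on success, 0 if #GP was injected into the guest.
 */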
2279 static inline int vmx_do_msr_read(struct cpu_user_regs *regs)
2280 {
2281 u64 msr_content = 0;
2282 u32 ecx = regs->ecx, eax, edx;
2283 struct vcpu *v = current;
2285 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x, eax=%x, edx=%x",
2286 ecx, (u32)regs->eax, (u32)regs->edx);
2288 switch (ecx) {
2289 case MSR_IA32_TIME_STAMP_COUNTER:
2290 msr_content = hvm_get_guest_time(v);
2291 break;
2292 case MSR_IA32_SYSENTER_CS:
2293 msr_content = (u32)__vmread(GUEST_SYSENTER_CS);
2294 break;
2295 case MSR_IA32_SYSENTER_ESP:
2296 msr_content = __vmread(GUEST_SYSENTER_ESP);
2297 break;
2298 case MSR_IA32_SYSENTER_EIP:
2299 msr_content = __vmread(GUEST_SYSENTER_EIP);
2300 break;
2301 case MSR_IA32_APICBASE:
2302 msr_content = vcpu_vlapic(v)->hw.apic_base_msr;
2303 break;
2304 default:
2305 if ( long_mode_do_msr_read(regs) )
2306 goto done;
2308 if ( rdmsr_hypervisor_regs(ecx, &eax, &edx) ||
2309 rdmsr_safe(ecx, eax, edx) == 0 )
2310 {
2311 regs->eax = eax;
2312 regs->edx = edx;
2313 goto done;
2314 }
2315 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2316 return 0;
2317 }
2319 regs->eax = msr_content & 0xFFFFFFFF;
2320 regs->edx = msr_content >> 32;
2322 done:
2323 HVMTRACE_2D(MSR_READ, v, ecx, msr_content);
2324 HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx",
2325 ecx, (unsigned long)regs->eax,
2326 (unsigned long)regs->edx);
2327 return 1;
2328 }
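/*
 * Handle a WRMSR vmexit. Guest TSC, SYSENTER and APIC-base writes are
 * virtualised here; other MSRs go to the long-mode handler or the
 * hypervisor register space. Always returns 1, so RIP is advanced.
 */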
2330 static inline int vmx_do_msr_write(struct cpu_user_regs *regs)
2331 {
2332 u32 ecx = regs->ecx;
2333 u64 msr_content;
2334 struct vcpu *v = current;
2336 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x, eax=%x, edx=%x",
2337 ecx, (u32)regs->eax, (u32)regs->edx);
2339 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
2340 HVMTRACE_2D(MSR_WRITE, v, ecx, msr_content);
2342 switch (ecx) {
2343 case MSR_IA32_TIME_STAMP_COUNTER:
2344 hvm_set_guest_time(v, msr_content);
2345 pt_reset(v);
2346 break;
2347 case MSR_IA32_SYSENTER_CS:
2348 __vmwrite(GUEST_SYSENTER_CS, msr_content);
2349 break;
2350 case MSR_IA32_SYSENTER_ESP:
2351 __vmwrite(GUEST_SYSENTER_ESP, msr_content);
2352 break;
2353 case MSR_IA32_SYSENTER_EIP:
2354 __vmwrite(GUEST_SYSENTER_EIP, msr_content);
2355 break;
2356 case MSR_IA32_APICBASE:
2357 vlapic_msr_set(vcpu_vlapic(v), msr_content);
2358 break;
2359 default:
2360 if ( !long_mode_do_msr_write(regs) )
2361 wrmsr_hypervisor_regs(ecx, regs->eax, regs->edx);
2362 break;
2363 }
2365 return 1;
2366 }
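/* HLT vmexit: pass the guest's RFLAGS to the common HVM halt handler,
 * which normally blocks the vcpu until an event is pending. */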
2368 static void vmx_do_hlt(void)
2369 {
2370 unsigned long rflags;
2371 HVMTRACE_0D(HLT, current);
2372 rflags = __vmread(GUEST_RFLAGS);
2373 hvm_hlt(rflags);
2374 }
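/*
 * A physical interrupt arrived while the vcpu was in non-root mode. Decode
 * the vector from the exit interruption information and dispatch to Xen's
 * own handler, much as the host IDT stubs would.
 */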
2376 static inline void vmx_do_extint(struct cpu_user_regs *regs)
2377 {
2378 unsigned int vector;
2380 asmlinkage void do_IRQ(struct cpu_user_regs *);
2381 fastcall void smp_apic_timer_interrupt(struct cpu_user_regs *);
2382 fastcall void smp_event_check_interrupt(void);
2383 fastcall void smp_invalidate_interrupt(void);
2384 fastcall void smp_call_function_interrupt(void);
2385 fastcall void smp_spurious_interrupt(struct cpu_user_regs *regs);
2386 fastcall void smp_error_interrupt(struct cpu_user_regs *regs);
2387 #ifdef CONFIG_X86_MCE_P4THERMAL
2388 fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs);
2389 #endif
2391 vector = __vmread(VM_EXIT_INTR_INFO);
2392 BUG_ON(!(vector & INTR_INFO_VALID_MASK));
2394 vector &= INTR_INFO_VECTOR_MASK;
2395 HVMTRACE_1D(INTR, current, vector);
2397 switch(vector) {
2398 case LOCAL_TIMER_VECTOR:
2399 smp_apic_timer_interrupt(regs);
2400 break;
2401 case EVENT_CHECK_VECTOR:
2402 smp_event_check_interrupt();
2403 break;
2404 case INVALIDATE_TLB_VECTOR:
2405 smp_invalidate_interrupt();
2406 break;
2407 case CALL_FUNCTION_VECTOR:
2408 smp_call_function_interrupt();
2409 break;
2410 case SPURIOUS_APIC_VECTOR:
2411 smp_spurious_interrupt(regs);
2412 break;
2413 case ERROR_APIC_VECTOR:
2414 smp_error_interrupt(regs);
2415 break;
2416 #ifdef CONFIG_X86_MCE_P4THERMAL
2417 case THERMAL_APIC_VECTOR:
2418 smp_thermal_interrupt(regs);
2419 break;
2420 #endif
2421 default:
2422 regs->entry_vector = vector;
2423 do_IRQ(regs);
2424 break;
2425 }
2426 }
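/* Re-inject into the guest an exception that was intercepted but is not
 * handled by Xen itself. */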
2428 static void vmx_reflect_exception(struct vcpu *v)
2429 {
2430 int error_code, intr_info, vector;
2432 intr_info = __vmread(VM_EXIT_INTR_INFO);
2433 vector = intr_info & 0xff;
2434 if ( intr_info & INTR_INFO_DELIVER_CODE_MASK )
2435 error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
2436 else
2437 error_code = VMX_DELIVER_NO_ERROR_CODE;
2439 #ifndef NDEBUG
2440 {
2441 unsigned long rip;
2443 rip = __vmread(GUEST_RIP);
2444 HVM_DBG_LOG(DBG_LEVEL_1, "rip = %lx, error_code = %x",
2445 rip, error_code);
2446 }
2447 #endif /* NDEBUG */
2449 /*
2450 * According to Intel Virtualization Technology Specification for
2451 * the IA-32 Intel Architecture (C97063-002 April 2005), section
2452 * 2.8.3, SW_EXCEPTION should be used for #BP and #OF, and
2453 * HW_EXCEPTION used for everything else. The main difference
2454 * appears to be that for SW_EXCEPTION, the EIP/RIP is incremented
2455 * by VM_ENTRY_INSTRUCTION_LEN bytes, whereas for HW_EXCEPTION,
2456 * it is not.
2457 */
2458 if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_SW_EXCEPTION )
2459 {
2460 int ilen = __get_instruction_length(); /* Safe: software exception */
2461 vmx_inject_sw_exception(v, vector, ilen);
2462 }
2463 else
2464 {
2465 vmx_inject_hw_exception(v, vector, error_code);
2466 }
2467 }
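/* Diagnose a failed VM entry: decode the failure reason, dump the VMCS
 * for post-mortem analysis, and crash the offending domain. */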
2469 static void vmx_failed_vmentry(unsigned int exit_reason)
2470 {
2471 unsigned int failed_vmentry_reason = (uint16_t)exit_reason;
2472 unsigned long exit_qualification;
2474 exit_qualification = __vmread(EXIT_QUALIFICATION);
2475 printk("Failed vm entry (exit reason 0x%x) ", exit_reason);
2476 switch ( failed_vmentry_reason )
2477 {
2478 case EXIT_REASON_INVALID_GUEST_STATE:
2479 printk("caused by invalid guest state (%ld).\n", exit_qualification);
2480 break;
2481 case EXIT_REASON_MSR_LOADING:
2482 printk("caused by MSR entry %ld loading.\n", exit_qualification);
2483 break;
2484 case EXIT_REASON_MACHINE_CHECK:
2485 printk("caused by machine check.\n");
2486 break;
2487 default:
2488 printk("reason not known yet!\n");
2489 break;
2490 }
2492 printk("************* VMCS Area **************\n");
2493 vmcs_dump_vcpu();
2494 printk("**************************************\n");
2496 domain_crash(current->domain);
2497 }
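/*
 * Top-level vmexit dispatcher, invoked from the low-level exit path with
 * the guest's general-purpose registers saved in *regs. Failed VM entries
 * are diagnosed first; all other exits are switched on the exit reason.
 */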
2499 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
2500 {
2501 unsigned int exit_reason;
2502 unsigned long exit_qualification, inst_len = 0;
2503 struct vcpu *v = current;
2505 exit_reason = __vmread(VM_EXIT_REASON);
2507 HVMTRACE_2D(VMEXIT, v, __vmread(GUEST_RIP), exit_reason);
2509 perfc_incra(vmexits, exit_reason);
2511 if ( exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT )
2512 local_irq_enable();
2514 if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
2515 return vmx_failed_vmentry(exit_reason);
2517 switch ( exit_reason )
2518 {
2519 case EXIT_REASON_EXCEPTION_NMI:
2520 {
2521 /*
2522 * We don't set the software-interrupt exiting (INT n).
2523 * (1) We can get an exception (e.g. #PG) in the guest, or
2524 * (2) NMI
2525 */
2526 unsigned int intr_info, vector;
2528 intr_info = __vmread(VM_EXIT_INTR_INFO);
2529 BUG_ON(!(intr_info & INTR_INFO_VALID_MASK));
2531 vector = intr_info & INTR_INFO_VECTOR_MASK;
2533 perfc_incra(cause_vector, vector);
2535 switch ( vector )
2536 {
2537 case TRAP_debug:
2538 if ( v->domain->debugger_attached )
2539 domain_pause_for_debugger();
2540 else
2541 vmx_reflect_exception(v);
2542 break;
2543 case TRAP_int3:
2544 if ( v->domain->debugger_attached )
2545 domain_pause_for_debugger();
2546 else
2547 vmx_reflect_exception(v);
2548 break;
2549 case TRAP_no_device:
2550 vmx_do_no_device_fault();
2551 break;
2552 case TRAP_page_fault:
2553 exit_qualification = __vmread(EXIT_QUALIFICATION);
2554 regs->error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
2556 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2557 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
2558 (unsigned long)regs->eax, (unsigned long)regs->ebx,
2559 (unsigned long)regs->ecx, (unsigned long)regs->edx,
2560 (unsigned long)regs->esi, (unsigned long)regs->edi);
2562 if ( vmx_do_page_fault(exit_qualification, regs) )
2563 {
2564 HVMTRACE_2D(PF_XEN, v, exit_qualification, regs->error_code);
2565 break;
2566 }
2568 v->arch.hvm_vmx.cpu_cr2 = exit_qualification;
2569 vmx_inject_hw_exception(v, TRAP_page_fault, regs->error_code);
2570 break;
2571 case TRAP_nmi:
2572 HVMTRACE_0D(NMI, v);
2573 if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI )
2574 do_nmi(regs); /* Real NMI, vector 2: normal processing. */
2575 else
2576 vmx_reflect_exception(v);
2577 break;
2578 default:
2579 vmx_reflect_exception(v);
2580 break;
2581 }
2582 break;
2583 }
2584 case EXIT_REASON_EXTERNAL_INTERRUPT:
2585 vmx_do_extint(regs);
2586 break;
2587 case EXIT_REASON_TRIPLE_FAULT:
2588 hvm_triple_fault();
2589 break;
2590 case EXIT_REASON_PENDING_INTERRUPT:
2591 /* Disable the interrupt window. */
2592 v->arch.hvm_vcpu.u.vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2593 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2594 v->arch.hvm_vcpu.u.vmx.exec_control);
2595 break;
2596 case EXIT_REASON_TASK_SWITCH:
2597 goto exit_and_crash;
2598 case EXIT_REASON_CPUID:
2599 inst_len = __get_instruction_length(); /* Safe: CPUID */
2600 __update_guest_eip(inst_len);
2601 vmx_do_cpuid(regs);
2602 break;
2603 case EXIT_REASON_HLT:
2604 inst_len = __get_instruction_length(); /* Safe: HLT */
2605 __update_guest_eip(inst_len);
2606 vmx_do_hlt();
2607 break;
2608 case EXIT_REASON_INVLPG:
2609 {
2610 inst_len = __get_instruction_length(); /* Safe: INVLPG */
2611 __update_guest_eip(inst_len);
2612 exit_qualification = __vmread(EXIT_QUALIFICATION);
2613 vmx_do_invlpg(exit_qualification);
2614 break;
2615 }
2616 case EXIT_REASON_VMCALL:
2617 {
2618 HVMTRACE_1D(VMMCALL, v, regs->eax);
2619 inst_len = __get_instruction_length(); /* Safe: VMCALL */
2620 __update_guest_eip(inst_len);
2621 hvm_do_hypercall(regs);
2622 break;
2623 }
2624 case EXIT_REASON_CR_ACCESS:
2625 {
2626 exit_qualification = __vmread(EXIT_QUALIFICATION);
2627 inst_len = __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
2628 if ( vmx_cr_access(exit_qualification, regs) )
2629 __update_guest_eip(inst_len);
2630 break;
2631 }
2632 case EXIT_REASON_DR_ACCESS:
2633 exit_qualification = __vmread(EXIT_QUALIFICATION);
2634 vmx_dr_access(exit_qualification, regs);
2635 break;
2636 case EXIT_REASON_IO_INSTRUCTION:
2637 exit_qualification = __vmread(EXIT_QUALIFICATION);
2638 inst_len = __get_instruction_length(); /* Safe: IN, INS, OUT, OUTS */
2639 vmx_io_instruction(exit_qualification, inst_len);
2640 break;
2641 case EXIT_REASON_MSR_READ:
2642 inst_len = __get_instruction_length(); /* Safe: RDMSR */
2643 if ( vmx_do_msr_read(regs) )
2644 __update_guest_eip(inst_len);
2645 break;
2646 case EXIT_REASON_MSR_WRITE:
2647 inst_len = __get_instruction_length(); /* Safe: WRMSR */
2648 if ( vmx_do_msr_write(regs) )
2649 __update_guest_eip(inst_len);
2650 break;
2651 case EXIT_REASON_MWAIT_INSTRUCTION:
2652 case EXIT_REASON_MONITOR_INSTRUCTION:
2653 case EXIT_REASON_PAUSE_INSTRUCTION:
2654 goto exit_and_crash;
2655 case EXIT_REASON_VMCLEAR:
2656 case EXIT_REASON_VMLAUNCH:
2657 case EXIT_REASON_VMPTRLD:
2658 case EXIT_REASON_VMPTRST:
2659 case EXIT_REASON_VMREAD:
2660 case EXIT_REASON_VMRESUME:
2661 case EXIT_REASON_VMWRITE:
2662 case EXIT_REASON_VMXOFF:
2663 case EXIT_REASON_VMXON:
2664 /* Report invalid opcode exception when a VMX guest tries to execute
2665 any of the VMX instructions */
2666 vmx_inject_hw_exception(v, TRAP_invalid_op, VMX_DELIVER_NO_ERROR_CODE);
2667 break;
2669 case EXIT_REASON_TPR_BELOW_THRESHOLD:
2670 break;
2672 default:
2673 exit_and_crash:
2674 gdprintk(XENLOG_ERR, "Bad vmexit (reason %x)\n", exit_reason);
2675 domain_crash(v->domain);
2676 break;
2677 }
2678 }
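/* Trace hook run on the VM entry path: records a VMENTRY event for the
 * current vcpu. */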
2680 asmlinkage void vmx_trace_vmentry(void)
2681 {
2682 struct vcpu *v = current;
2683 HVMTRACE_0D(VMENTRY, v);
2684 }
2686 /*
2687 * Local variables:
2688 * mode: C
2689 * c-set-style: "BSD"
2690 * c-basic-offset: 4
2691 * tab-width: 4
2692 * indent-tabs-mode: nil
2693 * End:
2694 */