ia64/xen-unstable: xen/arch/x86/hvm/vmx/vmx.c @ 15203:7ff65f888804

vmxassist: Fix some copy-and-paste mistakes in vmx_check_descriptor()
Signed-off-by: Dexuan Cui <dexuan.cui@intel.com>
Author:   kfraser@localhost.localdomain
Date:     Thu May 24 13:42:41 2007 +0100
Parents:  cb006eecd6f5
Children: 1f8fb764f843
1 /*
2 * vmx.c: handling VMX architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 *
18 */
20 #include <xen/config.h>
21 #include <xen/init.h>
22 #include <xen/lib.h>
23 #include <xen/trace.h>
24 #include <xen/sched.h>
25 #include <xen/irq.h>
26 #include <xen/softirq.h>
27 #include <xen/domain_page.h>
28 #include <xen/hypercall.h>
29 #include <xen/perfc.h>
30 #include <asm/current.h>
31 #include <asm/io.h>
32 #include <asm/regs.h>
33 #include <asm/cpufeature.h>
34 #include <asm/processor.h>
35 #include <asm/types.h>
36 #include <asm/msr.h>
37 #include <asm/spinlock.h>
38 #include <asm/paging.h>
39 #include <asm/p2m.h>
40 #include <asm/hvm/hvm.h>
41 #include <asm/hvm/support.h>
42 #include <asm/hvm/vmx/vmx.h>
43 #include <asm/hvm/vmx/vmcs.h>
44 #include <asm/hvm/vmx/cpu.h>
45 #include <public/sched.h>
46 #include <public/hvm/ioreq.h>
47 #include <asm/hvm/vpic.h>
48 #include <asm/hvm/vlapic.h>
49 #include <asm/x86_emulate.h>
50 #include <asm/hvm/vpt.h>
51 #include <public/hvm/save.h>
52 #include <asm/hvm/trace.h>
54 char *vmx_msr_bitmap;
56 static void vmx_ctxt_switch_from(struct vcpu *v);
57 static void vmx_ctxt_switch_to(struct vcpu *v);
59 static int vmx_vcpu_initialise(struct vcpu *v)
60 {
61 int rc;
63 spin_lock_init(&v->arch.hvm_vmx.vmcs_lock);
65 v->arch.schedule_tail = vmx_do_resume;
66 v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
67 v->arch.ctxt_switch_to = vmx_ctxt_switch_to;
69 if ( (rc = vmx_create_vmcs(v)) != 0 )
70 {
71 dprintk(XENLOG_WARNING,
72 "Failed to create VMCS for vcpu %d: err=%d.\n",
73 v->vcpu_id, rc);
74 return rc;
75 }
77 return 0;
78 }
80 static void vmx_vcpu_destroy(struct vcpu *v)
81 {
82 vmx_destroy_vmcs(v);
83 }
85 #ifdef __x86_64__
87 static DEFINE_PER_CPU(struct vmx_msr_state, host_msr_state);
89 static u32 msr_index[VMX_MSR_COUNT] =
90 {
91 MSR_LSTAR, MSR_STAR, MSR_SYSCALL_MASK
92 };
94 static void vmx_save_host_msrs(void)
95 {
96 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
97 int i;
99 for ( i = 0; i < VMX_MSR_COUNT; i++ )
100 rdmsrl(msr_index[i], host_msr_state->msrs[i]);
101 }
103 #define WRITE_MSR(address) \
104 guest_msr_state->msrs[VMX_INDEX_MSR_ ## address] = msr_content; \
105 set_bit(VMX_INDEX_MSR_ ## address, &guest_msr_state->flags); \
106 wrmsrl(MSR_ ## address, msr_content); \
107 set_bit(VMX_INDEX_MSR_ ## address, &host_msr_state->flags); \
108 break
110 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
111 {
112 u64 msr_content = 0;
113 u32 ecx = regs->ecx;
114 struct vcpu *v = current;
115 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
117 switch ( ecx ) {
118 case MSR_EFER:
119 msr_content = v->arch.hvm_vmx.efer;
120 break;
122 case MSR_FS_BASE:
123 msr_content = __vmread(GUEST_FS_BASE);
124 goto check_long_mode;
126 case MSR_GS_BASE:
127 msr_content = __vmread(GUEST_GS_BASE);
128 goto check_long_mode;
130 case MSR_SHADOW_GS_BASE:
131 msr_content = v->arch.hvm_vmx.shadow_gs;
132 check_long_mode:
133 if ( !(vmx_long_mode_enabled(v)) )
134 {
135 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
136 return 0;
137 }
138 break;
140 case MSR_STAR:
141 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_STAR];
142 break;
144 case MSR_LSTAR:
145 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_LSTAR];
146 break;
148 case MSR_CSTAR:
149 msr_content = v->arch.hvm_vmx.cstar;
150 break;
152 case MSR_SYSCALL_MASK:
153 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
154 break;
156 default:
157 return 0;
158 }
160 HVM_DBG_LOG(DBG_LEVEL_0, "msr 0x%x content 0x%"PRIx64, ecx, msr_content);
162 regs->eax = (u32)(msr_content >> 0);
163 regs->edx = (u32)(msr_content >> 32);
165 return 1;
166 }
168 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
169 {
170 u64 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
171 u32 ecx = regs->ecx;
172 struct vcpu *v = current;
173 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
174 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
176 HVM_DBG_LOG(DBG_LEVEL_0, "msr 0x%x content 0x%"PRIx64, ecx, msr_content);
178 switch ( ecx )
179 {
180 case MSR_EFER:
181 /* offending reserved bit will cause #GP */
182 if ( (msr_content & ~(EFER_LME | EFER_LMA | EFER_NX | EFER_SCE)) ||
183 (!cpu_has_nx && (msr_content & EFER_NX)) ||
184 (!cpu_has_syscall && (msr_content & EFER_SCE)) )
185 {
186 gdprintk(XENLOG_WARNING, "Trying to set reserved bit in "
187 "EFER: %"PRIx64"\n", msr_content);
188 goto gp_fault;
189 }
191 if ( (msr_content & EFER_LME)
192 && !(v->arch.hvm_vmx.efer & EFER_LME) )
193 {
194 if ( unlikely(vmx_paging_enabled(v)) )
195 {
196 gdprintk(XENLOG_WARNING,
197 "Trying to set EFER.LME with paging enabled\n");
198 goto gp_fault;
199 }
200 }
201 else if ( !(msr_content & EFER_LME)
202 && (v->arch.hvm_vmx.efer & EFER_LME) )
203 {
204 if ( unlikely(vmx_paging_enabled(v)) )
205 {
206 gdprintk(XENLOG_WARNING,
207 "Trying to clear EFER.LME with paging enabled\n");
208 goto gp_fault;
209 }
210 }
212 if ( (msr_content ^ v->arch.hvm_vmx.efer) & (EFER_NX|EFER_SCE) )
213 write_efer((read_efer() & ~(EFER_NX|EFER_SCE)) |
214 (msr_content & (EFER_NX|EFER_SCE)));
216 v->arch.hvm_vmx.efer = msr_content;
217 break;
219 case MSR_FS_BASE:
220 case MSR_GS_BASE:
221 case MSR_SHADOW_GS_BASE:
222 if ( !vmx_long_mode_enabled(v) )
223 goto gp_fault;
225 if ( !is_canonical_address(msr_content) )
226 goto uncanonical_address;
228 if ( ecx == MSR_FS_BASE )
229 __vmwrite(GUEST_FS_BASE, msr_content);
230 else if ( ecx == MSR_GS_BASE )
231 __vmwrite(GUEST_GS_BASE, msr_content);
232 else
233 {
234 v->arch.hvm_vmx.shadow_gs = msr_content;
235 wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
236 }
238 break;
240 case MSR_STAR:
241 WRITE_MSR(STAR);
243 case MSR_LSTAR:
244 if ( !is_canonical_address(msr_content) )
245 goto uncanonical_address;
246 WRITE_MSR(LSTAR);
248 case MSR_CSTAR:
249 if ( !is_canonical_address(msr_content) )
250 goto uncanonical_address;
251 v->arch.hvm_vmx.cstar = msr_content;
252 break;
254 case MSR_SYSCALL_MASK:
255 WRITE_MSR(SYSCALL_MASK);
257 default:
258 return 0;
259 }
261 return 1;
263 uncanonical_address:
264 HVM_DBG_LOG(DBG_LEVEL_0, "Not cano address of msr write %x", ecx);
265 gp_fault:
266 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
267 return 0;
268 }
270 /*
271 * To avoid MSR save/restore at every VM exit/entry time, we restore
272 * the x86_64 specific MSRs at domain switch time. Since these MSRs
273 * are not modified once set for para domains, we don't save them,
274 * but simply reset them to values set in percpu_traps_init().
275 */
276 static void vmx_restore_host_msrs(void)
277 {
278 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
279 int i;
281 while ( host_msr_state->flags )
282 {
283 i = find_first_set_bit(host_msr_state->flags);
284 wrmsrl(msr_index[i], host_msr_state->msrs[i]);
285 clear_bit(i, &host_msr_state->flags);
286 }
287 if ( cpu_has_nx && !(read_efer() & EFER_NX) )
288 write_efer(read_efer() | EFER_NX);
289 }
291 static void vmx_save_guest_msrs(struct vcpu *v)
292 {
293 /* MSR_SHADOW_GS_BASE may have been changed by swapgs instruction. */
294 rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
295 }
297 static void vmx_restore_guest_msrs(struct vcpu *v)
298 {
299 struct vmx_msr_state *guest_msr_state, *host_msr_state;
300 unsigned long guest_flags;
301 int i;
303 guest_msr_state = &v->arch.hvm_vmx.msr_state;
304 host_msr_state = &this_cpu(host_msr_state);
306 wrmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
308 guest_flags = guest_msr_state->flags;
310 while ( guest_flags ) {
311 i = find_first_set_bit(guest_flags);
313 HVM_DBG_LOG(DBG_LEVEL_2,
314 "restore guest's index %d msr %x with value %lx",
315 i, msr_index[i], guest_msr_state->msrs[i]);
316 set_bit(i, &host_msr_state->flags);
317 wrmsrl(msr_index[i], guest_msr_state->msrs[i]);
318 clear_bit(i, &guest_flags);
319 }
321 if ( (v->arch.hvm_vmx.efer ^ read_efer()) & (EFER_NX|EFER_SCE) )
322 {
323 HVM_DBG_LOG(DBG_LEVEL_2,
324 "restore guest's EFER with value %lx",
325 v->arch.hvm_vmx.efer);
326 write_efer((read_efer() & ~(EFER_NX|EFER_SCE)) |
327 (v->arch.hvm_vmx.efer & (EFER_NX|EFER_SCE)));
328 }
329 }
331 #else /* __i386__ */
333 #define vmx_save_host_msrs() ((void)0)
335 static void vmx_restore_host_msrs(void)
336 {
337 if ( cpu_has_nx && !(read_efer() & EFER_NX) )
338 write_efer(read_efer() | EFER_NX);
339 }
341 #define vmx_save_guest_msrs(v) ((void)0)
343 static void vmx_restore_guest_msrs(struct vcpu *v)
344 {
345 if ( (v->arch.hvm_vmx.efer ^ read_efer()) & EFER_NX )
346 {
347 HVM_DBG_LOG(DBG_LEVEL_2,
348 "restore guest's EFER with value %lx",
349 v->arch.hvm_vmx.efer);
350 write_efer((read_efer() & ~EFER_NX) |
351 (v->arch.hvm_vmx.efer & EFER_NX));
352 }
353 }
355 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
356 {
357 u64 msr_content = 0;
358 struct vcpu *v = current;
360 switch ( regs->ecx ) {
361 case MSR_EFER:
362 msr_content = v->arch.hvm_vmx.efer;
363 break;
365 default:
366 return 0;
367 }
369 regs->eax = msr_content >> 0;
370 regs->edx = msr_content >> 32;
372 return 1;
373 }
375 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
376 {
377 u64 msr_content = regs->eax | ((u64)regs->edx << 32);
378 struct vcpu *v = current;
380 switch ( regs->ecx )
381 {
382 case MSR_EFER:
383 /* offending reserved bit will cause #GP */
384 if ( (msr_content & ~EFER_NX) ||
385 (!cpu_has_nx && (msr_content & EFER_NX)) )
386 {
387 gdprintk(XENLOG_WARNING, "Trying to set reserved bit in "
388 "EFER: %"PRIx64"\n", msr_content);
389 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
390 return 0;
391 }
393 if ( (msr_content ^ v->arch.hvm_vmx.efer) & EFER_NX )
394 write_efer((read_efer() & ~EFER_NX) | (msr_content & EFER_NX));
396 v->arch.hvm_vmx.efer = msr_content;
397 break;
399 default:
400 return 0;
401 }
403 return 1;
404 }
406 #endif /* __i386__ */
408 #define loaddebug(_v,_reg) \
409 __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
410 #define savedebug(_v,_reg) \
411 __asm__ __volatile__ ("mov %%db" #_reg ",%0" : : "r" ((_v)->debugreg[_reg]))
413 static inline void vmx_save_dr(struct vcpu *v)
414 {
415 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
416 return;
418 /* Clear the DR dirty flag and re-enable intercepts for DR accesses. */
419 v->arch.hvm_vcpu.flag_dr_dirty = 0;
420 v->arch.hvm_vcpu.u.vmx.exec_control |= CPU_BASED_MOV_DR_EXITING;
421 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vcpu.u.vmx.exec_control);
423 savedebug(&v->arch.guest_context, 0);
424 savedebug(&v->arch.guest_context, 1);
425 savedebug(&v->arch.guest_context, 2);
426 savedebug(&v->arch.guest_context, 3);
427 savedebug(&v->arch.guest_context, 6);
428 v->arch.guest_context.debugreg[7] = __vmread(GUEST_DR7);
429 }
431 static inline void __restore_debug_registers(struct vcpu *v)
432 {
433 loaddebug(&v->arch.guest_context, 0);
434 loaddebug(&v->arch.guest_context, 1);
435 loaddebug(&v->arch.guest_context, 2);
436 loaddebug(&v->arch.guest_context, 3);
437 /* No 4 and 5 */
438 loaddebug(&v->arch.guest_context, 6);
439 /* DR7 is loaded from the VMCS. */
440 }
442 int vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
443 {
444 uint32_t ev;
446 c->rip = __vmread(GUEST_RIP);
447 c->rsp = __vmread(GUEST_RSP);
448 c->rflags = __vmread(GUEST_RFLAGS);
450 c->cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;
451 c->cr2 = v->arch.hvm_vmx.cpu_cr2;
452 c->cr3 = v->arch.hvm_vmx.cpu_cr3;
453 c->cr4 = v->arch.hvm_vmx.cpu_shadow_cr4;
455 #ifdef HVM_DEBUG_SUSPEND
456 printk("vmx_vmcs_save: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
457 c->cr3,
458 c->cr0,
459 c->cr4);
460 #endif
462 c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
463 c->idtr_base = __vmread(GUEST_IDTR_BASE);
465 c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
466 c->gdtr_base = __vmread(GUEST_GDTR_BASE);
468 c->cs_sel = __vmread(GUEST_CS_SELECTOR);
469 c->cs_limit = __vmread(GUEST_CS_LIMIT);
470 c->cs_base = __vmread(GUEST_CS_BASE);
471 c->cs_arbytes = __vmread(GUEST_CS_AR_BYTES);
473 c->ds_sel = __vmread(GUEST_DS_SELECTOR);
474 c->ds_limit = __vmread(GUEST_DS_LIMIT);
475 c->ds_base = __vmread(GUEST_DS_BASE);
476 c->ds_arbytes = __vmread(GUEST_DS_AR_BYTES);
478 c->es_sel = __vmread(GUEST_ES_SELECTOR);
479 c->es_limit = __vmread(GUEST_ES_LIMIT);
480 c->es_base = __vmread(GUEST_ES_BASE);
481 c->es_arbytes = __vmread(GUEST_ES_AR_BYTES);
483 c->ss_sel = __vmread(GUEST_SS_SELECTOR);
484 c->ss_limit = __vmread(GUEST_SS_LIMIT);
485 c->ss_base = __vmread(GUEST_SS_BASE);
486 c->ss_arbytes = __vmread(GUEST_SS_AR_BYTES);
488 c->fs_sel = __vmread(GUEST_FS_SELECTOR);
489 c->fs_limit = __vmread(GUEST_FS_LIMIT);
490 c->fs_base = __vmread(GUEST_FS_BASE);
491 c->fs_arbytes = __vmread(GUEST_FS_AR_BYTES);
493 c->gs_sel = __vmread(GUEST_GS_SELECTOR);
494 c->gs_limit = __vmread(GUEST_GS_LIMIT);
495 c->gs_base = __vmread(GUEST_GS_BASE);
496 c->gs_arbytes = __vmread(GUEST_GS_AR_BYTES);
498 c->tr_sel = __vmread(GUEST_TR_SELECTOR);
499 c->tr_limit = __vmread(GUEST_TR_LIMIT);
500 c->tr_base = __vmread(GUEST_TR_BASE);
501 c->tr_arbytes = __vmread(GUEST_TR_AR_BYTES);
503 c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR);
504 c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT);
505 c->ldtr_base = __vmread(GUEST_LDTR_BASE);
506 c->ldtr_arbytes = __vmread(GUEST_LDTR_AR_BYTES);
508 c->sysenter_cs = __vmread(GUEST_SYSENTER_CS);
509 c->sysenter_esp = __vmread(GUEST_SYSENTER_ESP);
510 c->sysenter_eip = __vmread(GUEST_SYSENTER_EIP);
512 /* Save any event/interrupt that was being injected when we last
513 * exited. IDT_VECTORING_INFO_FIELD has priority, as anything in
514 * VM_ENTRY_INTR_INFO_FIELD is either a fault caused by the first
515 * event, which will happen the next time, or an interrupt, which we
516 * never inject when IDT_VECTORING_INFO_FIELD is valid.*/
517 if ( (ev = __vmread(IDT_VECTORING_INFO_FIELD)) & INTR_INFO_VALID_MASK )
518 {
519 c->pending_event = ev;
520 c->error_code = __vmread(IDT_VECTORING_ERROR_CODE);
521 }
522 else if ( (ev = __vmread(VM_ENTRY_INTR_INFO_FIELD))
523 & INTR_INFO_VALID_MASK )
524 {
525 c->pending_event = ev;
526 c->error_code = __vmread(VM_ENTRY_EXCEPTION_ERROR_CODE);
527 }
528 else
529 {
530 c->pending_event = 0;
531 c->error_code = 0;
532 }
534 return 1;
535 }
537 int vmx_vmcs_restore(struct vcpu *v, struct hvm_hw_cpu *c)
538 {
539 unsigned long mfn, old_base_mfn;
541 vmx_vmcs_enter(v);
543 __vmwrite(GUEST_RIP, c->rip);
544 __vmwrite(GUEST_RSP, c->rsp);
545 __vmwrite(GUEST_RFLAGS, c->rflags);
547 v->arch.hvm_vmx.cpu_shadow_cr0 = c->cr0;
548 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
550 v->arch.hvm_vmx.cpu_cr2 = c->cr2;
552 #ifdef HVM_DEBUG_SUSPEND
553 printk("vmx_vmcs_restore: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
554 c->cr3,
555 c->cr0,
556 c->cr4);
557 #endif
559 if (!vmx_paging_enabled(v)) {
560 printk("vmx_vmcs_restore: paging not enabled.");
561 goto skip_cr3;
562 }
564 if (c->cr3 == v->arch.hvm_vmx.cpu_cr3) {
565 /*
566 * This is simple TLB flush, implying the guest has
567 * removed some translation or changed page attributes.
568 * We simply invalidate the shadow.
569 */
570 mfn = gmfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT);
571 if (mfn != pagetable_get_pfn(v->arch.guest_table)) {
572 goto bad_cr3;
573 }
574 } else {
575 /*
576 * If different, make a shadow. Check if the PDBR is valid
577 * first.
578 */
579 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %"PRIx64, c->cr3);
580 /* current!=vcpu as not called by arch_vmx_do_launch */
581 mfn = gmfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT);
582 if( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain)) {
583 goto bad_cr3;
584 }
585 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
586 v->arch.guest_table = pagetable_from_pfn(mfn);
587 if (old_base_mfn)
588 put_page(mfn_to_page(old_base_mfn));
589 v->arch.hvm_vmx.cpu_cr3 = c->cr3;
590 }
592 skip_cr3:
593 #if defined(__x86_64__)
594 if (vmx_long_mode_enabled(v)) {
595 unsigned long vm_entry_value;
596 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
597 vm_entry_value |= VM_ENTRY_IA32E_MODE;
598 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
599 }
600 #endif
602 __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
603 v->arch.hvm_vmx.cpu_shadow_cr4 = c->cr4;
604 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);
606 __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
607 __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
609 __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
610 __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
612 __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
613 __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
614 __vmwrite(GUEST_CS_BASE, c->cs_base);
615 __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes);
617 __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
618 __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
619 __vmwrite(GUEST_DS_BASE, c->ds_base);
620 __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes);
622 __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
623 __vmwrite(GUEST_ES_LIMIT, c->es_limit);
624 __vmwrite(GUEST_ES_BASE, c->es_base);
625 __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes);
627 __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
628 __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
629 __vmwrite(GUEST_SS_BASE, c->ss_base);
630 __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes);
632 __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
633 __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
634 __vmwrite(GUEST_FS_BASE, c->fs_base);
635 __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes);
637 __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
638 __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
639 __vmwrite(GUEST_GS_BASE, c->gs_base);
640 __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes);
642 __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
643 __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
644 __vmwrite(GUEST_TR_BASE, c->tr_base);
645 __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes);
647 __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
648 __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
649 __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
650 __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes);
652 __vmwrite(GUEST_SYSENTER_CS, c->sysenter_cs);
653 __vmwrite(GUEST_SYSENTER_ESP, c->sysenter_esp);
654 __vmwrite(GUEST_SYSENTER_EIP, c->sysenter_eip);
656 __vmwrite(GUEST_DR7, c->dr7);
658 vmx_vmcs_exit(v);
660 paging_update_paging_modes(v);
662 if ( c->pending_valid )
663 {
664 vmx_vmcs_enter(v);
665 gdprintk(XENLOG_INFO, "Re-injecting 0x%"PRIx32", 0x%"PRIx32"\n",
666 c->pending_event, c->error_code);
668 /* SVM uses type 3 ("Exception") for #OF and #BP; VMX uses type 6 */
669 if ( c->pending_type == 3
670 && (c->pending_vector == 3 || c->pending_vector == 4) )
671 c->pending_type = 6;
673 /* For software exceptions, we need to tell the hardware the
674 * instruction length as well (hmmm). */
675 if ( c->pending_type > 4 )
676 {
677 int addrbytes, ilen;
678 if ( (c->cs_arbytes & (1u<<13)) && (c->msr_efer & EFER_LMA) )
679 addrbytes = 8;
680 else if ( (c->cs_arbytes & (1u<<14)) )
681 addrbytes = 4;
682 else
683 addrbytes = 2;
684 ilen = hvm_instruction_length(c->rip, addrbytes);
685 __vmwrite(VM_ENTRY_INSTRUCTION_LEN, ilen);
686 }
688 /* Sanity check */
689 if ( c->pending_type == 1 || c->pending_type > 6
690 || c->pending_reserved != 0 )
691 {
692 gdprintk(XENLOG_ERR, "Invalid pending event 0x%"PRIx32"\n",
693 c->pending_event);
694 return -EINVAL;
695 }
696 /* Re-inject the exception */
697 __vmwrite(VM_ENTRY_INTR_INFO_FIELD, c->pending_event);
698 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, c->error_code);
699 v->arch.hvm_vmx.vector_injected = 1;
700 vmx_vmcs_exit(v);
701 }
703 return 0;
705 bad_cr3:
706 gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%"PRIx64"", c->cr3);
707 vmx_vmcs_exit(v);
708 return -EINVAL;
709 }
711 #if defined(__x86_64__) && defined(HVM_DEBUG_SUSPEND)
712 static void dump_msr_state(struct vmx_msr_state *m)
713 {
714 int i = 0;
715 printk("**** msr state ****\n");
716 printk("shadow_gs=0x%lx, flags=0x%lx, msr_items:", m->shadow_gs, m->flags);
717 for (i = 0; i < VMX_MSR_COUNT; i++)
718 printk("0x%lx,", m->msrs[i]);
719 printk("\n");
720 }
721 #else
722 #define dump_msr_state(m) ((void)0)
723 #endif
725 static void vmx_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
726 {
727 #ifdef __x86_64__
728 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
729 unsigned long guest_flags = guest_state->flags;
731 data->shadow_gs = v->arch.hvm_vmx.shadow_gs;
732 data->msr_cstar = v->arch.hvm_vmx.cstar;
734 /* save msrs */
735 data->msr_flags = guest_flags;
736 data->msr_lstar = guest_state->msrs[VMX_INDEX_MSR_LSTAR];
737 data->msr_star = guest_state->msrs[VMX_INDEX_MSR_STAR];
738 data->msr_syscall_mask = guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
739 #endif
741 data->msr_efer = v->arch.hvm_vmx.efer;
743 data->tsc = hvm_get_guest_time(v);
745 dump_msr_state(guest_state);
746 }
748 static void vmx_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
749 {
750 #ifdef __x86_64__
751 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
753 /* restore msrs */
754 guest_state->flags = data->msr_flags;
755 guest_state->msrs[VMX_INDEX_MSR_LSTAR] = data->msr_lstar;
756 guest_state->msrs[VMX_INDEX_MSR_STAR] = data->msr_star;
757 guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK] = data->msr_syscall_mask;
759 v->arch.hvm_vmx.cstar = data->msr_cstar;
760 v->arch.hvm_vmx.shadow_gs = data->shadow_gs;
761 #endif
763 v->arch.hvm_vmx.efer = data->msr_efer;
765 v->arch.hvm_vmx.vmxassist_enabled = !(data->cr0 & X86_CR0_PE);
767 hvm_set_guest_time(v, data->tsc);
769 dump_msr_state(guest_state);
770 }
773 static void vmx_save_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
774 {
775 vmx_save_cpu_state(v, ctxt);
776 vmx_vmcs_enter(v);
777 vmx_vmcs_save(v, ctxt);
778 vmx_vmcs_exit(v);
779 }
781 static int vmx_load_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
782 {
783 vmx_load_cpu_state(v, ctxt);
784 if (vmx_vmcs_restore(v, ctxt)) {
785 printk("vmx_vmcs restore failed!\n");
786 domain_crash(v->domain);
787 return -EINVAL;
788 }
790 return 0;
791 }
793 /*
794 * DR7 is saved and restored on every vmexit. Other debug registers only
795 * need to be restored if their value is going to affect execution -- i.e.,
796 * if one of the breakpoints is enabled. So mask out all bits that don't
797 * enable some breakpoint functionality.
798 */
799 #define DR7_ACTIVE_MASK 0xff
801 static inline void vmx_restore_dr(struct vcpu *v)
802 {
803 /* NB. __vmread() is not usable here, so we cannot read from the VMCS. */
804 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
805 __restore_debug_registers(v);
806 }
808 static void vmx_ctxt_switch_from(struct vcpu *v)
809 {
810 vmx_save_guest_msrs(v);
811 vmx_restore_host_msrs();
812 vmx_save_dr(v);
813 }
815 static void vmx_ctxt_switch_to(struct vcpu *v)
816 {
817 vmx_restore_guest_msrs(v);
818 vmx_restore_dr(v);
819 }
821 static void stop_vmx(void)
822 {
823 if ( !(read_cr4() & X86_CR4_VMXE) )
824 return;
826 __vmxoff();
827 clear_in_cr4(X86_CR4_VMXE);
828 }
830 static void vmx_store_cpu_guest_regs(
831 struct vcpu *v, struct cpu_user_regs *regs, unsigned long *crs)
832 {
833 vmx_vmcs_enter(v);
835 if ( regs != NULL )
836 {
837 regs->eflags = __vmread(GUEST_RFLAGS);
838 regs->ss = __vmread(GUEST_SS_SELECTOR);
839 regs->cs = __vmread(GUEST_CS_SELECTOR);
840 regs->eip = __vmread(GUEST_RIP);
841 regs->esp = __vmread(GUEST_RSP);
842 }
844 if ( crs != NULL )
845 {
846 crs[0] = v->arch.hvm_vmx.cpu_shadow_cr0;
847 crs[2] = v->arch.hvm_vmx.cpu_cr2;
848 crs[3] = v->arch.hvm_vmx.cpu_cr3;
849 crs[4] = v->arch.hvm_vmx.cpu_shadow_cr4;
850 }
852 vmx_vmcs_exit(v);
853 }
855 static void vmx_load_cpu_guest_regs(struct vcpu *v, struct cpu_user_regs *regs)
856 {
857 unsigned long base;
859 vmx_vmcs_enter(v);
861 __vmwrite(GUEST_SS_SELECTOR, regs->ss);
862 __vmwrite(GUEST_RSP, regs->esp);
864 /* NB. Bit 1 of RFLAGS must be set for VMENTRY to succeed. */
865 __vmwrite(GUEST_RFLAGS, regs->eflags | 2UL);
867 if ( regs->eflags & EF_VM )
868 {
869 /*
870 * The VMX spec (section 4.3.1.2, Checks on Guest Segment
871 * Registers) says that virtual-8086 mode guests' segment
872 * base-address fields in the VMCS must be equal to their
873 * corresponding segment selector field shifted right by
874 * four bits upon vmentry.
875 */
876 base = __vmread(GUEST_CS_BASE);
877 if ( (regs->cs << 4) != base )
878 __vmwrite(GUEST_CS_BASE, regs->cs << 4);
879 base = __vmread(GUEST_SS_BASE);
880 if ( (regs->ss << 4) != base )
881 __vmwrite(GUEST_SS_BASE, regs->ss << 4);
882 }
884 __vmwrite(GUEST_CS_SELECTOR, regs->cs);
885 __vmwrite(GUEST_RIP, regs->eip);
887 vmx_vmcs_exit(v);
888 }
890 static unsigned long vmx_get_ctrl_reg(struct vcpu *v, unsigned int num)
891 {
892 switch ( num )
893 {
894 case 0:
895 return v->arch.hvm_vmx.cpu_cr0;
896 case 2:
897 return v->arch.hvm_vmx.cpu_cr2;
898 case 3:
899 return v->arch.hvm_vmx.cpu_cr3;
900 case 4:
901 return v->arch.hvm_vmx.cpu_shadow_cr4;
902 default:
903 BUG();
904 }
905 return 0; /* dummy */
906 }
908 static unsigned long vmx_get_segment_base(struct vcpu *v, enum x86_segment seg)
909 {
910 unsigned long base = 0;
911 int long_mode = 0;
913 ASSERT(v == current);
915 #ifdef __x86_64__
916 if ( vmx_long_mode_enabled(v) && (__vmread(GUEST_CS_AR_BYTES) & (1u<<13)) )
917 long_mode = 1;
918 #endif
920 switch ( seg )
921 {
922 case x86_seg_cs: if ( !long_mode ) base = __vmread(GUEST_CS_BASE); break;
923 case x86_seg_ds: if ( !long_mode ) base = __vmread(GUEST_DS_BASE); break;
924 case x86_seg_es: if ( !long_mode ) base = __vmread(GUEST_ES_BASE); break;
925 case x86_seg_fs: base = __vmread(GUEST_FS_BASE); break;
926 case x86_seg_gs: base = __vmread(GUEST_GS_BASE); break;
927 case x86_seg_ss: if ( !long_mode ) base = __vmread(GUEST_SS_BASE); break;
928 case x86_seg_tr: base = __vmread(GUEST_TR_BASE); break;
929 case x86_seg_gdtr: base = __vmread(GUEST_GDTR_BASE); break;
930 case x86_seg_idtr: base = __vmread(GUEST_IDTR_BASE); break;
931 case x86_seg_ldtr: base = __vmread(GUEST_LDTR_BASE); break;
932 default: BUG(); break;
933 }
935 return base;
936 }
938 static void vmx_get_segment_register(struct vcpu *v, enum x86_segment seg,
939 struct segment_register *reg)
940 {
941 u16 attr = 0;
943 ASSERT(v == current);
945 switch ( seg )
946 {
947 case x86_seg_cs:
948 reg->sel = __vmread(GUEST_CS_SELECTOR);
949 reg->limit = __vmread(GUEST_CS_LIMIT);
950 reg->base = __vmread(GUEST_CS_BASE);
951 attr = __vmread(GUEST_CS_AR_BYTES);
952 break;
953 case x86_seg_ds:
954 reg->sel = __vmread(GUEST_DS_SELECTOR);
955 reg->limit = __vmread(GUEST_DS_LIMIT);
956 reg->base = __vmread(GUEST_DS_BASE);
957 attr = __vmread(GUEST_DS_AR_BYTES);
958 break;
959 case x86_seg_es:
960 reg->sel = __vmread(GUEST_ES_SELECTOR);
961 reg->limit = __vmread(GUEST_ES_LIMIT);
962 reg->base = __vmread(GUEST_ES_BASE);
963 attr = __vmread(GUEST_ES_AR_BYTES);
964 break;
965 case x86_seg_fs:
966 reg->sel = __vmread(GUEST_FS_SELECTOR);
967 reg->limit = __vmread(GUEST_FS_LIMIT);
968 reg->base = __vmread(GUEST_FS_BASE);
969 attr = __vmread(GUEST_FS_AR_BYTES);
970 break;
971 case x86_seg_gs:
972 reg->sel = __vmread(GUEST_GS_SELECTOR);
973 reg->limit = __vmread(GUEST_GS_LIMIT);
974 reg->base = __vmread(GUEST_GS_BASE);
975 attr = __vmread(GUEST_GS_AR_BYTES);
976 break;
977 case x86_seg_ss:
978 reg->sel = __vmread(GUEST_SS_SELECTOR);
979 reg->limit = __vmread(GUEST_SS_LIMIT);
980 reg->base = __vmread(GUEST_SS_BASE);
981 attr = __vmread(GUEST_SS_AR_BYTES);
982 break;
983 case x86_seg_tr:
984 reg->sel = __vmread(GUEST_TR_SELECTOR);
985 reg->limit = __vmread(GUEST_TR_LIMIT);
986 reg->base = __vmread(GUEST_TR_BASE);
987 attr = __vmread(GUEST_TR_AR_BYTES);
988 break;
989 case x86_seg_gdtr:
990 reg->limit = __vmread(GUEST_GDTR_LIMIT);
991 reg->base = __vmread(GUEST_GDTR_BASE);
992 break;
993 case x86_seg_idtr:
994 reg->limit = __vmread(GUEST_IDTR_LIMIT);
995 reg->base = __vmread(GUEST_IDTR_BASE);
996 break;
997 case x86_seg_ldtr:
998 reg->sel = __vmread(GUEST_LDTR_SELECTOR);
999 reg->limit = __vmread(GUEST_LDTR_LIMIT);
1000 reg->base = __vmread(GUEST_LDTR_BASE);
1001 attr = __vmread(GUEST_LDTR_AR_BYTES);
1002 break;
1003 default:
1004 BUG();
1005 }
1007 reg->attr.bytes = (attr & 0xff) | ((attr >> 4) & 0xf00);
1008 }
1010 /* Make sure that xen intercepts any FP accesses from current */
1011 static void vmx_stts(struct vcpu *v)
1012 {
1013 /* VMX depends on operating on the current vcpu */
1014 ASSERT(v == current);
1016 /*
1017 * If the guest does not have TS enabled then we must cause and handle an
1018 * exception on first use of the FPU. If the guest *does* have TS enabled
1019 * then this is not necessary: no FPU activity can occur until the guest
1020 * clears CR0.TS, and we will initialise the FPU when that happens.
1021 */
1022 if ( !(v->arch.hvm_vmx.cpu_shadow_cr0 & X86_CR0_TS) )
1023 {
1024 v->arch.hvm_vmx.cpu_cr0 |= X86_CR0_TS;
1025 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
1026 __vm_set_bit(EXCEPTION_BITMAP, TRAP_no_device);
1027 }
1028 }
1030 static void vmx_set_tsc_offset(struct vcpu *v, u64 offset)
1031 {
1032 vmx_vmcs_enter(v);
1033 __vmwrite(TSC_OFFSET, offset);
1034 #if defined (__i386__)
1035 __vmwrite(TSC_OFFSET_HIGH, offset >> 32);
1036 #endif
1037 vmx_vmcs_exit(v);
1038 }
1040 static void vmx_init_ap_context(
1041 struct vcpu_guest_context *ctxt, int vcpuid, int trampoline_vector)
1042 {
1043 memset(ctxt, 0, sizeof(*ctxt));
1044 ctxt->user_regs.eip = VMXASSIST_BASE;
1045 ctxt->user_regs.edx = vcpuid;
1046 ctxt->user_regs.ebx = trampoline_vector;
1047 }
1049 void do_nmi(struct cpu_user_regs *);
1051 static void vmx_init_hypercall_page(struct domain *d, void *hypercall_page)
1052 {
1053 char *p;
1054 int i;
1056 memset(hypercall_page, 0, PAGE_SIZE);
1058 for ( i = 0; i < (PAGE_SIZE / 32); i++ )
1059 {
1060 p = (char *)(hypercall_page + (i * 32));
1061 *(u8 *)(p + 0) = 0xb8; /* mov imm32, %eax */
1062 *(u32 *)(p + 1) = i;
1063 *(u8 *)(p + 5) = 0x0f; /* vmcall */
1064 *(u8 *)(p + 6) = 0x01;
1065 *(u8 *)(p + 7) = 0xc1;
1066 *(u8 *)(p + 8) = 0xc3; /* ret */
1067 }
1069 /* Don't support HYPERVISOR_iret at the moment */
1070 *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
1071 }
1073 static int vmx_guest_x86_mode(struct vcpu *v)
1074 {
1075 unsigned int cs_ar_bytes;
1077 ASSERT(v == current);
1079 if ( unlikely(!(v->arch.hvm_vmx.cpu_shadow_cr0 & X86_CR0_PE)) )
1080 return 0;
1081 if ( unlikely(__vmread(GUEST_RFLAGS) & X86_EFLAGS_VM) )
1082 return 1;
1083 cs_ar_bytes = __vmread(GUEST_CS_AR_BYTES);
1084 if ( vmx_long_mode_enabled(v) && likely(cs_ar_bytes & (1u<<13)) )
1085 return 8;
1086 return (likely(cs_ar_bytes & (1u<<14)) ? 4 : 2);
1087 }
1089 static int vmx_pae_enabled(struct vcpu *v)
1090 {
1091 unsigned long cr4 = v->arch.hvm_vmx.cpu_shadow_cr4;
1092 return (vmx_paging_enabled(v) && (cr4 & X86_CR4_PAE));
1093 }
1095 static int vmx_nx_enabled(struct vcpu *v)
1096 {
1097 return v->arch.hvm_vmx.efer & EFER_NX;
1098 }
1100 static int vmx_interrupts_enabled(struct vcpu *v)
1101 {
1102 unsigned long eflags = __vmread(GUEST_RFLAGS);
1103 return !irq_masked(eflags);
1104 }
1107 static void vmx_update_host_cr3(struct vcpu *v)
1108 {
1109 ASSERT( (v == current) || !vcpu_runnable(v) );
1110 vmx_vmcs_enter(v);
1111 __vmwrite(HOST_CR3, v->arch.cr3);
1112 vmx_vmcs_exit(v);
1113 }
1115 static void vmx_update_guest_cr3(struct vcpu *v)
1116 {
1117 ASSERT( (v == current) || !vcpu_runnable(v) );
1118 vmx_vmcs_enter(v);
1119 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
1120 vmx_vmcs_exit(v);
1121 }
1124 static void vmx_inject_exception(
1125 unsigned int trapnr, int errcode, unsigned long cr2)
1126 {
1127 struct vcpu *v = current;
1128 vmx_inject_hw_exception(v, trapnr, errcode);
1129 if ( trapnr == TRAP_page_fault )
1130 v->arch.hvm_vmx.cpu_cr2 = cr2;
1131 }
1133 static void vmx_update_vtpr(struct vcpu *v, unsigned long value)
1134 {
1135 /* VMX doesn't have a V_TPR field */
1136 }
1138 static int vmx_event_injection_faulted(struct vcpu *v)
1139 {
1140 unsigned int idtv_info_field;
1142 ASSERT(v == current);
1144 idtv_info_field = __vmread(IDT_VECTORING_INFO_FIELD);
1145 return (idtv_info_field & INTR_INFO_VALID_MASK);
1146 }
1148 static void disable_intercept_for_msr(u32 msr)
1149 {
1150 /*
1151 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
1152 * have the write-low and read-high bitmap offsets the wrong way round.
1153 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
1154 */
1155 if ( msr <= 0x1fff )
1156 {
1157 __clear_bit(msr, vmx_msr_bitmap + 0x000); /* read-low */
1158 __clear_bit(msr, vmx_msr_bitmap + 0x800); /* write-low */
1159 }
1160 else if ( (msr >= 0xc0000000) && (msr <= 0xc0001fff) )
1161 {
1162 msr &= 0x1fff;
1163 __clear_bit(msr, vmx_msr_bitmap + 0x400); /* read-high */
1164 __clear_bit(msr, vmx_msr_bitmap + 0xc00); /* write-high */
1165 }
1166 }
1168 static struct hvm_function_table vmx_function_table = {
1169 .name = "VMX",
1170 .disable = stop_vmx,
1171 .vcpu_initialise = vmx_vcpu_initialise,
1172 .vcpu_destroy = vmx_vcpu_destroy,
1173 .store_cpu_guest_regs = vmx_store_cpu_guest_regs,
1174 .load_cpu_guest_regs = vmx_load_cpu_guest_regs,
1175 .save_cpu_ctxt = vmx_save_vmcs_ctxt,
1176 .load_cpu_ctxt = vmx_load_vmcs_ctxt,
1177 .paging_enabled = vmx_paging_enabled,
1178 .long_mode_enabled = vmx_long_mode_enabled,
1179 .pae_enabled = vmx_pae_enabled,
1180 .nx_enabled = vmx_nx_enabled,
1181 .interrupts_enabled = vmx_interrupts_enabled,
1182 .guest_x86_mode = vmx_guest_x86_mode,
1183 .get_guest_ctrl_reg = vmx_get_ctrl_reg,
1184 .get_segment_base = vmx_get_segment_base,
1185 .get_segment_register = vmx_get_segment_register,
1186 .update_host_cr3 = vmx_update_host_cr3,
1187 .update_guest_cr3 = vmx_update_guest_cr3,
1188 .update_vtpr = vmx_update_vtpr,
1189 .stts = vmx_stts,
1190 .set_tsc_offset = vmx_set_tsc_offset,
1191 .inject_exception = vmx_inject_exception,
1192 .init_ap_context = vmx_init_ap_context,
1193 .init_hypercall_page = vmx_init_hypercall_page,
1194 .event_injection_faulted = vmx_event_injection_faulted
1195 };
1197 int start_vmx(void)
1198 {
1199 u32 eax, edx;
1200 struct vmcs_struct *vmcs;
1202 /*
1203 * Xen does not fill x86_capability words except 0.
1204 */
1205 boot_cpu_data.x86_capability[4] = cpuid_ecx(1);
1207 if ( !test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability) )
1208 return 0;
1210 rdmsr(IA32_FEATURE_CONTROL_MSR, eax, edx);
1212 if ( eax & IA32_FEATURE_CONTROL_MSR_LOCK )
1213 {
1214 if ( (eax & IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON) == 0x0 )
1215 {
1216 printk("VMX disabled by Feature Control MSR.\n");
1217 return 0;
1218 }
1219 }
1220 else
1221 {
1222 wrmsr(IA32_FEATURE_CONTROL_MSR,
1223 IA32_FEATURE_CONTROL_MSR_LOCK |
1224 IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON, 0);
1225 }
1227 set_in_cr4(X86_CR4_VMXE);
1229 vmx_init_vmcs_config();
1231 if ( smp_processor_id() == 0 )
1232 setup_vmcs_dump();
1234 if ( (vmcs = vmx_alloc_host_vmcs()) == NULL )
1235 {
1236 clear_in_cr4(X86_CR4_VMXE);
1237 printk("Failed to allocate host VMCS\n");
1238 return 0;
1239 }
1241 if ( __vmxon(virt_to_maddr(vmcs)) )
1242 {
1243 clear_in_cr4(X86_CR4_VMXE);
1244 printk("VMXON failed\n");
1245 vmx_free_host_vmcs(vmcs);
1246 return 0;
1247 }
1249 vmx_save_host_msrs();
1251 if ( smp_processor_id() != 0 )
1252 return 1;
1254 hvm_enable(&vmx_function_table);
1256 if ( cpu_has_vmx_msr_bitmap )
1257 {
1258 printk("VMX: MSR intercept bitmap enabled\n");
1259 vmx_msr_bitmap = alloc_xenheap_page();
1260 BUG_ON(vmx_msr_bitmap == NULL);
1261 memset(vmx_msr_bitmap, ~0, PAGE_SIZE);
1262 disable_intercept_for_msr(MSR_FS_BASE);
1263 disable_intercept_for_msr(MSR_GS_BASE);
1264 }
1266 return 1;
1267 }
1269 /*
1270 * Not all cases receive valid value in the VM-exit instruction length field.
1271 * Callers must know what they're doing!
1272 */
1273 static int __get_instruction_length(void)
1274 {
1275 int len;
1276 len = __vmread(VM_EXIT_INSTRUCTION_LEN); /* Safe: callers audited */
1277 BUG_ON((len < 1) || (len > 15));
1278 return len;
1279 }
1281 static void inline __update_guest_eip(unsigned long inst_len)
1282 {
1283 unsigned long current_eip;
1285 current_eip = __vmread(GUEST_RIP);
1286 __vmwrite(GUEST_RIP, current_eip + inst_len);
1287 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
1288 }
1290 static void vmx_do_no_device_fault(void)
1291 {
1292 struct vcpu *v = current;
1294 setup_fpu(current);
1295 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
1297 /* Disable TS in guest CR0 unless the guest wants the exception too. */
1298 if ( !(v->arch.hvm_vmx.cpu_shadow_cr0 & X86_CR0_TS) )
1299 {
1300 v->arch.hvm_vmx.cpu_cr0 &= ~X86_CR0_TS;
1301 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
1302 }
1303 }
1305 #define bitmaskof(idx) (1U << ((idx) & 31))
1306 static void vmx_do_cpuid(struct cpu_user_regs *regs)
1307 {
1308 unsigned int input = (unsigned int)regs->eax;
1309 unsigned int count = (unsigned int)regs->ecx;
1310 unsigned int eax, ebx, ecx, edx;
1312 if ( input == 0x00000004 )
1313 {
1314 cpuid_count(input, count, &eax, &ebx, &ecx, &edx);
1315 eax &= NUM_CORES_RESET_MASK;
1316 }
1317 else if ( input == 0x40000003 )
1318 {
1319 /*
1320 * NB. Unsupported interface for private use of VMXASSIST only.
1321 * Note that this leaf lives at <max-hypervisor-leaf> + 1.
1322 */
1323 u64 value = ((u64)regs->edx << 32) | (u32)regs->ecx;
1324 unsigned long mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
1325 struct vcpu *v = current;
1326 char *p;
1328 gdprintk(XENLOG_INFO, "Input address is 0x%"PRIx64".\n", value);
1330 /* 8-byte aligned valid pseudophys address from vmxassist, please. */
1331 if ( (value & 7) || (mfn == INVALID_MFN) ||
1332 !v->arch.hvm_vmx.vmxassist_enabled )
1333 {
1334 domain_crash(v->domain);
1335 return;
1336 }
1338 p = map_domain_page(mfn);
1339 value = *((uint64_t *)(p + (value & (PAGE_SIZE - 1))));
1340 unmap_domain_page(p);
1342 gdprintk(XENLOG_INFO, "Output value is 0x%"PRIx64".\n", value);
1343 ecx = (u32)value;
1344 edx = (u32)(value >> 32);
1345 } else {
1346 hvm_cpuid(input, &eax, &ebx, &ecx, &edx);
1348 if ( input == 0x00000001 )
1349 {
1350 /* Mask off reserved bits. */
1351 ecx &= ~VMX_VCPU_CPUID_L1_ECX_RESERVED;
1353 ebx &= NUM_THREADS_RESET_MASK;
1355 /* Unsupportable for virtualised CPUs. */
1356 ecx &= ~(bitmaskof(X86_FEATURE_VMXE) |
1357 bitmaskof(X86_FEATURE_EST) |
1358 bitmaskof(X86_FEATURE_TM2) |
1359 bitmaskof(X86_FEATURE_CID));
1361 edx &= ~(bitmaskof(X86_FEATURE_HT) |
1362 bitmaskof(X86_FEATURE_ACPI) |
1363 bitmaskof(X86_FEATURE_ACC));
1364 }
1366 if ( input == 0x00000006 || input == 0x00000009 || input == 0x0000000A )
1367 eax = ebx = ecx = edx = 0x0;
1368 }
1370 regs->eax = (unsigned long)eax;
1371 regs->ebx = (unsigned long)ebx;
1372 regs->ecx = (unsigned long)ecx;
1373 regs->edx = (unsigned long)edx;
1375 HVMTRACE_3D(CPUID, current, input,
1376 ((uint64_t)eax << 32) | ebx, ((uint64_t)ecx << 32) | edx);
1377 }
1379 #define CASE_GET_REG_P(REG, reg) \
1380 case REG_ ## REG: reg_p = (unsigned long *)&(regs->reg); break
1382 #ifdef __i386__
1383 #define CASE_EXTEND_GET_REG_P
1384 #else
1385 #define CASE_EXTEND_GET_REG_P \
1386 CASE_GET_REG_P(R8, r8); \
1387 CASE_GET_REG_P(R9, r9); \
1388 CASE_GET_REG_P(R10, r10); \
1389 CASE_GET_REG_P(R11, r11); \
1390 CASE_GET_REG_P(R12, r12); \
1391 CASE_GET_REG_P(R13, r13); \
1392 CASE_GET_REG_P(R14, r14); \
1393 CASE_GET_REG_P(R15, r15)
1394 #endif
1396 static void vmx_dr_access(unsigned long exit_qualification,
1397 struct cpu_user_regs *regs)
1398 {
1399 struct vcpu *v = current;
1401 HVMTRACE_0D(DR_WRITE, v);
1403 v->arch.hvm_vcpu.flag_dr_dirty = 1;
1405 /* We could probably be smarter about this */
1406 __restore_debug_registers(v);
1408 /* Allow guest direct access to DR registers */
1409 v->arch.hvm_vcpu.u.vmx.exec_control &= ~CPU_BASED_MOV_DR_EXITING;
1410 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
1411 v->arch.hvm_vcpu.u.vmx.exec_control);
1412 }
1414 /*
1415 * Invalidate the TLB for va. Invalidate the shadow page corresponding
1416 * the address va.
1417 */
1418 static void vmx_do_invlpg(unsigned long va)
1419 {
1420 unsigned long eip;
1421 struct vcpu *v = current;
1423 HVMTRACE_2D(INVLPG, v, /*invlpga=*/ 0, va);
1425 eip = __vmread(GUEST_RIP);
1427 HVM_DBG_LOG(DBG_LEVEL_VMMU, "eip=%lx, va=%lx",
1428 eip, va);
1430 /*
1431 * We do the safest things first, then try to update the shadow
1432 * copying from guest
1433 */
1434 paging_invlpg(v, va);
1435 }
1438 static int vmx_check_descriptor(int long_mode, unsigned long eip, int inst_len,
1439 enum x86_segment seg, unsigned long *base,
1440 u32 *limit, u32 *ar_bytes)
1441 {
1442 enum vmcs_field ar_field, base_field, limit_field;
1444 *base = 0;
1445 *limit = 0;
1446 if ( seg != x86_seg_es )
1447 {
1448 unsigned char inst[MAX_INST_LEN];
1449 int i;
1450 extern int inst_copy_from_guest(unsigned char *, unsigned long, int);
1452 if ( !long_mode )
1453 eip += __vmread(GUEST_CS_BASE);
1454 memset(inst, 0, MAX_INST_LEN);
1455 if ( inst_copy_from_guest(inst, eip, inst_len) != inst_len )
1456 {
1457 gdprintk(XENLOG_ERR, "Get guest instruction failed\n");
1458 domain_crash(current->domain);
1459 return 0;
1460 }
1462 for ( i = 0; i < inst_len; i++ )
1463 {
1464 switch ( inst[i] )
1465 {
1466 case 0xf3: /* REPZ */
1467 case 0xf2: /* REPNZ */
1468 case 0xf0: /* LOCK */
1469 case 0x66: /* data32 */
1470 case 0x67: /* addr32 */
1471 #ifdef __x86_64__
1472 case 0x40 ... 0x4f: /* REX */
1473 #endif
1474 continue;
1475 case 0x2e: /* CS */
1476 seg = x86_seg_cs;
1477 continue;
1478 case 0x36: /* SS */
1479 seg = x86_seg_ss;
1480 continue;
1481 case 0x26: /* ES */
1482 seg = x86_seg_es;
1483 continue;
1484 case 0x64: /* FS */
1485 seg = x86_seg_fs;
1486 continue;
1487 case 0x65: /* GS */
1488 seg = x86_seg_gs;
1489 continue;
1490 case 0x3e: /* DS */
1491 seg = x86_seg_ds;
1492 continue;
1493 }
1494 break;
1495 }
1496 }
1497 switch ( seg )
1498 {
1499 case x86_seg_cs:
1500 ar_field = GUEST_CS_AR_BYTES;
1501 base_field = GUEST_CS_BASE;
1502 limit_field = GUEST_CS_LIMIT;
1503 break;
1504 case x86_seg_ds:
1505 ar_field = GUEST_DS_AR_BYTES;
1506 base_field = GUEST_DS_BASE;
1507 limit_field = GUEST_DS_LIMIT;
1508 break;
1509 case x86_seg_es:
1510 ar_field = GUEST_ES_AR_BYTES;
1511 base_field = GUEST_ES_BASE;
1512 limit_field = GUEST_ES_LIMIT;
1513 break;
1514 case x86_seg_fs:
1515 ar_field = GUEST_FS_AR_BYTES;
1516 base_field = GUEST_FS_BASE;
1517 limit_field = GUEST_FS_LIMIT;
1518 break;
1519 case x86_seg_gs:
1520 ar_field = GUEST_GS_AR_BYTES;
1521 base_field = GUEST_GS_BASE;
1522 limit_field = GUEST_GS_LIMIT;
1523 break;
1524 case x86_seg_ss:
1525 ar_field = GUEST_SS_AR_BYTES;
1526 base_field = GUEST_SS_BASE;
1527 limit_field = GUEST_SS_LIMIT;
1528 break;
1529 default:
1530 BUG();
1531 return 0;
1532 }
1534 if ( !long_mode || seg == x86_seg_fs || seg == x86_seg_gs )
1535 {
1536 *base = __vmread(base_field);
1537 *limit = __vmread(limit_field);
1538 }
1539 *ar_bytes = __vmread(ar_field);
1541 return !(*ar_bytes & 0x10000);
1542 }
1544 static void vmx_io_instruction(unsigned long exit_qualification,
1545 unsigned long inst_len)
1547 struct cpu_user_regs *regs;
1548 struct hvm_io_op *pio_opp;
1549 unsigned int port, size;
1550 int dir, df, vm86;
1552 pio_opp = &current->arch.hvm_vcpu.io_op;
1553 pio_opp->instr = INSTR_PIO;
1554 pio_opp->flags = 0;
1556 regs = &pio_opp->io_context;
1558 /* Copy current guest state into io instruction state structure. */
1559 memcpy(regs, guest_cpu_user_regs(), HVM_CONTEXT_STACK_BYTES);
1560 hvm_store_cpu_guest_regs(current, regs, NULL);
1562 vm86 = regs->eflags & X86_EFLAGS_VM ? 1 : 0;
1563 df = regs->eflags & X86_EFLAGS_DF ? 1 : 0;
1565 HVM_DBG_LOG(DBG_LEVEL_IO, "vm86 %d, eip=%x:%lx, "
1566 "exit_qualification = %lx",
1567 vm86, regs->cs, (unsigned long)regs->eip, exit_qualification);
1569 if ( test_bit(6, &exit_qualification) )
1570 port = (exit_qualification >> 16) & 0xFFFF;
1571 else
1572 port = regs->edx & 0xffff;
1574 size = (exit_qualification & 7) + 1;
1575 dir = test_bit(3, &exit_qualification); /* direction */
1577 if (dir==IOREQ_READ)
1578 HVMTRACE_2D(IO_READ, current, port, size);
1579 else
1580 HVMTRACE_2D(IO_WRITE, current, port, size);
1582 if ( test_bit(4, &exit_qualification) ) { /* string instruction */
1583 unsigned long addr, count = 1, base;
1584 paddr_t paddr;
1585 unsigned long gfn;
1586 u32 ar_bytes, limit;
1587 int sign = regs->eflags & X86_EFLAGS_DF ? -1 : 1;
1588 int long_mode = 0;
1590 ar_bytes = __vmread(GUEST_CS_AR_BYTES);
1591 #ifdef __x86_64__
1592 if ( vmx_long_mode_enabled(current) && (ar_bytes & (1u<<13)) )
1593 long_mode = 1;
1594 #endif
1595 addr = __vmread(GUEST_LINEAR_ADDRESS);
1597 if ( test_bit(5, &exit_qualification) ) { /* "rep" prefix */
1598 pio_opp->flags |= REPZ;
1599 count = regs->ecx;
1600 if ( !long_mode && (vm86 || !(ar_bytes & (1u<<14))) )
1601 count &= 0xFFFF;
1604 /*
1605 * In protected mode, guest linear address is invalid if the
1606 * selector is null.
1607 */
1608 if ( !vmx_check_descriptor(long_mode, regs->eip, inst_len,
1609 dir==IOREQ_WRITE ? x86_seg_ds : x86_seg_es,
1610 &base, &limit, &ar_bytes) ) {
1611 if ( !long_mode ) {
1612 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1613 return;
1615 addr = dir == IOREQ_WRITE ? base + regs->esi : regs->edi;
1618 if ( !long_mode ) {
1619 unsigned long ea = addr - base;
1621 /* Segment must be readable for outs and writeable for ins. */
1622 if ( dir == IOREQ_WRITE ? (ar_bytes & 0xa) == 0x8
1623 : (ar_bytes & 0xa) != 0x2 ) {
1624 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1625 return;
1628 /* Offset must be within limits. */
1629 ASSERT(ea == (u32)ea);
1630 if ( (u32)(ea + size - 1) < (u32)ea ||
1631 (ar_bytes & 0xc) != 0x4 ? ea + size - 1 > limit
1632 : ea <= limit )
1634 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1635 return;
1638 /* Check the limit for repeated instructions, as above we checked
1639 only the first instance. Truncate the count if a limit violation
1640 would occur. Note that the checking is not necessary for page
1641 granular segments as transfers crossing page boundaries will be
1642 broken up anyway. */
1643 if ( !(ar_bytes & (1u<<15)) && count > 1 )
1645 if ( (ar_bytes & 0xc) != 0x4 )
1647 /* expand-up */
1648 if ( !df )
1650 if ( ea + count * size - 1 < ea ||
1651 ea + count * size - 1 > limit )
1652 count = (limit + 1UL - ea) / size;
1654 else
1656 if ( count - 1 > ea / size )
1657 count = ea / size + 1;
1660 else
1662 /* expand-down */
1663 if ( !df )
1665 if ( count - 1 > -(s32)ea / size )
1666 count = -(s32)ea / size + 1UL;
1668 else
1670 if ( ea < (count - 1) * size ||
1671 ea - (count - 1) * size <= limit )
1672 count = (ea - limit - 1) / size + 1;
1675 ASSERT(count);
1678 #ifdef __x86_64__
1679 else
1681 if ( !is_canonical_address(addr) ||
1682 !is_canonical_address(addr + size - 1) )
1684 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1685 return;
1687 if ( count > (1UL << 48) / size )
1688 count = (1UL << 48) / size;
1689 if ( !(regs->eflags & EF_DF) )
1691 if ( addr + count * size - 1 < addr ||
1692 !is_canonical_address(addr + count * size - 1) )
1693 count = (addr & ~((1UL << 48) - 1)) / size;
1695 else
1697 if ( (count - 1) * size > addr ||
1698 !is_canonical_address(addr + (count - 1) * size) )
1699 count = (addr & ~((1UL << 48) - 1)) / size + 1;
1701 ASSERT(count);
1703 #endif
1705 /* Translate the address to a physical address */
1706 gfn = paging_gva_to_gfn(current, addr);
1707 if ( gfn == INVALID_GFN )
1709 /* The guest does not have the RAM address mapped.
1710 * Need to send in a page fault */
1711 int errcode = 0;
1712 /* IO read --> memory write */
1713 if ( dir == IOREQ_READ ) errcode |= PFEC_write_access;
1714 vmx_inject_exception(TRAP_page_fault, errcode, addr);
1715 return;
1717 paddr = (paddr_t)gfn << PAGE_SHIFT | (addr & ~PAGE_MASK);
1719 /*
1720 * Handle string pio instructions that cross pages or that
1721 * are unaligned. See the comments in hvm_domain.c/handle_mmio()
1722 */
1723 if ( (addr & PAGE_MASK) != ((addr + size - 1) & PAGE_MASK) ) {
1724 unsigned long value = 0;
1726 pio_opp->flags |= OVERLAP;
1728 if ( dir == IOREQ_WRITE ) /* OUTS */
1730 if ( hvm_paging_enabled(current) )
1732 int rv = hvm_copy_from_guest_virt(&value, addr, size);
1733 if ( rv != 0 )
1735 /* Failed on the page-spanning copy. Inject PF into
1736 * the guest for the address where we failed. */
1737 addr += size - rv;
1738 gdprintk(XENLOG_DEBUG, "Pagefault reading non-io side "
1739 "of a page-spanning PIO: va=%#lx\n", addr);
1740 vmx_inject_exception(TRAP_page_fault, 0, addr);
1741 return;
1744 else
1745 (void) hvm_copy_from_guest_phys(&value, addr, size);
1746 } else /* dir != IOREQ_WRITE */
1747 /* Remember where to write the result, as a *VA*.
1748 * Must be a VA so we can handle the page overlap
1749 * correctly in hvm_pio_assist() */
1750 pio_opp->addr = addr;
1752 if ( count == 1 )
1753 regs->eip += inst_len;
1755 send_pio_req(port, 1, size, value, dir, df, 0);
1756 } else {
1757 unsigned long last_addr = sign > 0 ? addr + count * size - 1
1758 : addr - (count - 1) * size;
1760 if ( (addr & PAGE_MASK) != (last_addr & PAGE_MASK) )
1762 if ( sign > 0 )
1763 count = (PAGE_SIZE - (addr & ~PAGE_MASK)) / size;
1764 else
1765 count = (addr & ~PAGE_MASK) / size + 1;
1766 } else
1767 regs->eip += inst_len;
1769 send_pio_req(port, count, size, paddr, dir, df, 1);
1771 } else {
1772 if ( port == 0xe9 && dir == IOREQ_WRITE && size == 1 )
1773 hvm_print_line(current, regs->eax); /* guest debug output */
1775 regs->eip += inst_len;
1776 send_pio_req(port, 1, size, regs->eax, dir, df, 0);
1780 static void vmx_world_save(struct vcpu *v, struct vmx_assist_context *c)
1782 /* NB. Skip transition instruction. */
1783 c->eip = __vmread(GUEST_RIP);
1784 c->eip += __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
1786 c->esp = __vmread(GUEST_RSP);
1787 c->eflags = __vmread(GUEST_RFLAGS);
1789 c->cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;
1790 c->cr3 = v->arch.hvm_vmx.cpu_cr3;
1791 c->cr4 = v->arch.hvm_vmx.cpu_shadow_cr4;
1793 c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
1794 c->idtr_base = __vmread(GUEST_IDTR_BASE);
1796 c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
1797 c->gdtr_base = __vmread(GUEST_GDTR_BASE);
1799 c->cs_sel = __vmread(GUEST_CS_SELECTOR);
1800 c->cs_limit = __vmread(GUEST_CS_LIMIT);
1801 c->cs_base = __vmread(GUEST_CS_BASE);
1802 c->cs_arbytes.bytes = __vmread(GUEST_CS_AR_BYTES);
1804 c->ds_sel = __vmread(GUEST_DS_SELECTOR);
1805 c->ds_limit = __vmread(GUEST_DS_LIMIT);
1806 c->ds_base = __vmread(GUEST_DS_BASE);
1807 c->ds_arbytes.bytes = __vmread(GUEST_DS_AR_BYTES);
1809 c->es_sel = __vmread(GUEST_ES_SELECTOR);
1810 c->es_limit = __vmread(GUEST_ES_LIMIT);
1811 c->es_base = __vmread(GUEST_ES_BASE);
1812 c->es_arbytes.bytes = __vmread(GUEST_ES_AR_BYTES);
1814 c->ss_sel = __vmread(GUEST_SS_SELECTOR);
1815 c->ss_limit = __vmread(GUEST_SS_LIMIT);
1816 c->ss_base = __vmread(GUEST_SS_BASE);
1817 c->ss_arbytes.bytes = __vmread(GUEST_SS_AR_BYTES);
1819 c->fs_sel = __vmread(GUEST_FS_SELECTOR);
1820 c->fs_limit = __vmread(GUEST_FS_LIMIT);
1821 c->fs_base = __vmread(GUEST_FS_BASE);
1822 c->fs_arbytes.bytes = __vmread(GUEST_FS_AR_BYTES);
1824 c->gs_sel = __vmread(GUEST_GS_SELECTOR);
1825 c->gs_limit = __vmread(GUEST_GS_LIMIT);
1826 c->gs_base = __vmread(GUEST_GS_BASE);
1827 c->gs_arbytes.bytes = __vmread(GUEST_GS_AR_BYTES);
1829 c->tr_sel = __vmread(GUEST_TR_SELECTOR);
1830 c->tr_limit = __vmread(GUEST_TR_LIMIT);
1831 c->tr_base = __vmread(GUEST_TR_BASE);
1832 c->tr_arbytes.bytes = __vmread(GUEST_TR_AR_BYTES);
1834 c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR);
1835 c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT);
1836 c->ldtr_base = __vmread(GUEST_LDTR_BASE);
1837 c->ldtr_arbytes.bytes = __vmread(GUEST_LDTR_AR_BYTES);
1840 static int vmx_world_restore(struct vcpu *v, struct vmx_assist_context *c)
1842 unsigned long mfn, old_base_mfn;
1844 __vmwrite(GUEST_RIP, c->eip);
1845 __vmwrite(GUEST_RSP, c->esp);
1846 __vmwrite(GUEST_RFLAGS, c->eflags);
1848 v->arch.hvm_vmx.cpu_shadow_cr0 = c->cr0;
1849 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
1851 if ( !vmx_paging_enabled(v) )
1852 goto skip_cr3;
1854 if ( c->cr3 == v->arch.hvm_vmx.cpu_cr3 )
1856 /*
1857 * This is simple TLB flush, implying the guest has
1858 * removed some translation or changed page attributes.
1859 * We simply invalidate the shadow.
1860 */
1861 mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
1862 if ( mfn != pagetable_get_pfn(v->arch.guest_table) )
1863 goto bad_cr3;
1865 else
1867 /*
1868 * If different, make a shadow. Check if the PDBR is valid
1869 * first.
1870 */
1871 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %x", c->cr3);
1872 mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
1873 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
1874 goto bad_cr3;
1875 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1876 v->arch.guest_table = pagetable_from_pfn(mfn);
1877 if (old_base_mfn)
1878 put_page(mfn_to_page(old_base_mfn));
1879 v->arch.hvm_vmx.cpu_cr3 = c->cr3;
1882 skip_cr3:
1883 if ( !vmx_paging_enabled(v) )
1884 HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
1885 else
1886 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3);
1888 __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
1889 v->arch.hvm_vmx.cpu_shadow_cr4 = c->cr4;
1890 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);
1892 __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
1893 __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
1895 __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
1896 __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
1898 __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
1899 __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
1900 __vmwrite(GUEST_CS_BASE, c->cs_base);
1901 __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes.bytes);
1903 __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
1904 __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
1905 __vmwrite(GUEST_DS_BASE, c->ds_base);
1906 __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes.bytes);
1908 __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
1909 __vmwrite(GUEST_ES_LIMIT, c->es_limit);
1910 __vmwrite(GUEST_ES_BASE, c->es_base);
1911 __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes.bytes);
1913 __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
1914 __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
1915 __vmwrite(GUEST_SS_BASE, c->ss_base);
1916 __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes.bytes);
1918 __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
1919 __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
1920 __vmwrite(GUEST_FS_BASE, c->fs_base);
1921 __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes.bytes);
1923 __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
1924 __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
1925 __vmwrite(GUEST_GS_BASE, c->gs_base);
1926 __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes.bytes);
1928 __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
1929 __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
1930 __vmwrite(GUEST_TR_BASE, c->tr_base);
1931 __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes.bytes);
1933 __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
1934 __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
1935 __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
1936 __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes.bytes);
1938 paging_update_paging_modes(v);
1939 return 0;
1941 bad_cr3:
1942 gdprintk(XENLOG_ERR, "Invalid CR3 value=%x", c->cr3);
1943 return -EINVAL;
1946 enum { VMX_ASSIST_INVOKE = 0, VMX_ASSIST_RESTORE };
1948 static int vmx_assist(struct vcpu *v, int mode)
1950 struct vmx_assist_context c;
1951 u32 magic;
1952 u32 cp;
1954 /* make sure vmxassist exists (this is not an error) */
1955 if (hvm_copy_from_guest_phys(&magic, VMXASSIST_MAGIC_OFFSET,
1956 sizeof(magic)))
1957 return 0;
1958 if (magic != VMXASSIST_MAGIC)
1959 return 0;
1961 switch (mode) {
1962 /*
1963 * Transfer control to vmxassist.
1964 * Store the current context in VMXASSIST_OLD_CONTEXT and load
1965 * the new VMXASSIST_NEW_CONTEXT context. This context was created
1966 * by vmxassist and will transfer control to it.
1967 */
1968 case VMX_ASSIST_INVOKE:
1969 /* save the old context */
1970 if (hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)))
1971 goto error;
1972 if (cp != 0) {
1973 vmx_world_save(v, &c);
1974 if (hvm_copy_to_guest_phys(cp, &c, sizeof(c)))
1975 goto error;
1978 /* restore the new context, this should activate vmxassist */
1979 if (hvm_copy_from_guest_phys(&cp, VMXASSIST_NEW_CONTEXT, sizeof(cp)))
1980 goto error;
1981 if (cp != 0) {
1982 if (hvm_copy_from_guest_phys(&c, cp, sizeof(c)))
1983 goto error;
1984 if ( vmx_world_restore(v, &c) != 0 )
1985 goto error;
1986 v->arch.hvm_vmx.vmxassist_enabled = 1;
1987 return 1;
1989 break;
1991 /*
1992 * Restore the VMXASSIST_OLD_CONTEXT that was saved by
1993 * VMX_ASSIST_INVOKE above.
1994 */
1995 case VMX_ASSIST_RESTORE:
1996 /* save the old context */
1997 if (hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)))
1998 goto error;
1999 if (cp != 0) {
2000 if (hvm_copy_from_guest_phys(&c, cp, sizeof(c)))
2001 goto error;
2002 if ( vmx_world_restore(v, &c) != 0 )
2003 goto error;
2004 v->arch.hvm_vmx.vmxassist_enabled = 0;
2005 return 1;
2007 break;
2010 error:
2011 gdprintk(XENLOG_ERR, "Failed to transfer to vmxassist\n");
2012 domain_crash(v->domain);
2013 return 0;
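/*
 * Emulate a guest write to CR0. The real CR0 (GUEST_CR0) always keeps PE,
 * PG, NE and WP set; the guest-visible value is tracked in CR0_READ_SHADOW.
 * Enabling paging takes a reference on the new top-level page table, and
 * toggling PE switches to/from vmxassist for real-mode emulation. Returns 1
 * if the instruction should be completed by the caller, 0 if a fault was
 * injected or control was transferred to vmxassist.
 */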
2016 static int vmx_set_cr0(unsigned long value)
2018 struct vcpu *v = current;
2019 unsigned long mfn;
2020 unsigned long eip;
2021 int paging_enabled;
2022 unsigned long vm_entry_value;
2023 unsigned long old_cr0;
2024 unsigned long old_base_mfn;
2026 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx", value);
2028 /* ET is reserved and should always be 1. */
2029 value |= X86_CR0_ET;
2031 if ( (value & (X86_CR0_PE|X86_CR0_PG)) == X86_CR0_PG )
2033 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2034 return 0;
2037 /* TS cleared? Then initialise FPU now. */
2038 if ( !(value & X86_CR0_TS) )
2040 setup_fpu(v);
2041 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
2044 old_cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;
2045 paging_enabled = old_cr0 & X86_CR0_PG;
2047 v->arch.hvm_vmx.cpu_cr0 = (value | X86_CR0_PE | X86_CR0_PG
2048 | X86_CR0_NE | X86_CR0_WP);
2049 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
2051 v->arch.hvm_vmx.cpu_shadow_cr0 = value;
2052 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
2054 if ( (value & X86_CR0_PE) && (value & X86_CR0_PG) && !paging_enabled )
2056 /*
2057 * Trying to enable guest paging.
2058 * The guest CR3 must point to a valid guest physical frame.
2059 */
2060 mfn = get_mfn_from_gpfn(v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT);
2061 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
2063 gdprintk(XENLOG_ERR, "Invalid CR3 value = %lx (mfn=%lx)\n",
2064 v->arch.hvm_vmx.cpu_cr3, mfn);
2065 domain_crash(v->domain);
2066 return 0;
2069 #if defined(__x86_64__)
2070 if ( vmx_lme_is_set(v) )
2072 if ( !(v->arch.hvm_vmx.cpu_shadow_cr4 & X86_CR4_PAE) )
2074 HVM_DBG_LOG(DBG_LEVEL_1, "Guest enabled paging "
2075 "with EFER.LME set but not CR4.PAE");
2076 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2078 else
2080 HVM_DBG_LOG(DBG_LEVEL_1, "Enabling long mode");
2081 v->arch.hvm_vmx.efer |= EFER_LMA;
2082 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
2083 vm_entry_value |= VM_ENTRY_IA32E_MODE;
2084 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
2087 #endif
2089 /*
2090 * arch.guest_table now refers to the top-level page table by machine frame number.
2091 */
2092 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
2093 v->arch.guest_table = pagetable_from_pfn(mfn);
2094 if (old_base_mfn)
2095 put_page(mfn_to_page(old_base_mfn));
2096 paging_update_paging_modes(v);
2098 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
2099 (unsigned long) (mfn << PAGE_SHIFT));
2101 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
2102 v->arch.hvm_vmx.cpu_cr3, mfn);
2105 if ( !((value & X86_CR0_PE) && (value & X86_CR0_PG)) && paging_enabled )
2106 if ( v->arch.hvm_vmx.cpu_cr3 ) {
2107 put_page(mfn_to_page(get_mfn_from_gpfn(
2108 v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)));
2109 v->arch.guest_table = pagetable_null();
2112 /*
2113 * VMX does not implement real-mode virtualization. We emulate
2114 * real mode by performing a world switch to VMXAssist whenever
2115 * the guest clears the CR0.PE bit.
2116 */
2117 if ( (value & X86_CR0_PE) == 0 )
2119 if ( value & X86_CR0_PG ) {
2120 /* Setting PG while PE is clear is invalid: inject #GP. */
2121 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2122 return 0;
2123 } else {
2124 /*
2125 * Disable paging here.
2126 * Same as the PE == 1 && PG == 0 case.
2127 */
2128 if ( vmx_long_mode_enabled(v) )
2130 v->arch.hvm_vmx.efer &= ~EFER_LMA;
2131 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
2132 vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
2133 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
2137 if ( vmx_assist(v, VMX_ASSIST_INVOKE) )
2139 eip = __vmread(GUEST_RIP);
2140 HVM_DBG_LOG(DBG_LEVEL_1,
2141 "Transfering control to vmxassist %%eip 0x%lx", eip);
2142 return 0; /* do not update eip! */
2145 else if ( v->arch.hvm_vmx.vmxassist_enabled )
2147 eip = __vmread(GUEST_RIP);
2148 HVM_DBG_LOG(DBG_LEVEL_1,
2149 "Enabling CR0.PE at %%eip 0x%lx", eip);
2150 if ( vmx_assist(v, VMX_ASSIST_RESTORE) )
2152 eip = __vmread(GUEST_RIP);
2153 HVM_DBG_LOG(DBG_LEVEL_1,
2154 "Restoring to %%eip 0x%lx", eip);
2155 return 0; /* do not update eip! */
2158 else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
2160 if ( vmx_long_mode_enabled(v) )
2162 v->arch.hvm_vmx.efer &= ~EFER_LMA;
2163 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
2164 vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
2165 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
2167 paging_update_paging_modes(v);
2170 return 1;
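/*
 * The CASE_GET_REG/CASE_SET_REG macros below map the REG_* operand decoded
 * from the exit qualification onto the matching cpu_user_regs field;
 * CASE_EXTEND_REG adds R8-R15 on x86-64 and expands to nothing on i386.
 */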
2173 #define CASE_SET_REG(REG, reg) \
2174 case REG_ ## REG: regs->reg = value; break
2175 #define CASE_GET_REG(REG, reg) \
2176 case REG_ ## REG: value = regs->reg; break
2178 #define CASE_EXTEND_SET_REG \
2179 CASE_EXTEND_REG(S)
2180 #define CASE_EXTEND_GET_REG \
2181 CASE_EXTEND_REG(G)
2183 #ifdef __i386__
2184 #define CASE_EXTEND_REG(T)
2185 #else
2186 #define CASE_EXTEND_REG(T) \
2187 CASE_ ## T ## ET_REG(R8, r8); \
2188 CASE_ ## T ## ET_REG(R9, r9); \
2189 CASE_ ## T ## ET_REG(R10, r10); \
2190 CASE_ ## T ## ET_REG(R11, r11); \
2191 CASE_ ## T ## ET_REG(R12, r12); \
2192 CASE_ ## T ## ET_REG(R13, r13); \
2193 CASE_ ## T ## ET_REG(R14, r14); \
2194 CASE_ ## T ## ET_REG(R15, r15)
2195 #endif
2197 /*
2198 * Write to control registers
2199 */
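/*
 * gp is the general-purpose source register and cr the destination control
 * register, both decoded from the exit qualification by vmx_cr_access()
 * below. Returns 1 if the MOV should be completed normally by the caller.
 */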
2200 static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
2202 unsigned long value, old_cr, old_base_mfn, mfn;
2203 struct vcpu *v = current;
2204 struct vlapic *vlapic = vcpu_vlapic(v);
2206 switch ( gp )
2208 CASE_GET_REG(EAX, eax);
2209 CASE_GET_REG(ECX, ecx);
2210 CASE_GET_REG(EDX, edx);
2211 CASE_GET_REG(EBX, ebx);
2212 CASE_GET_REG(EBP, ebp);
2213 CASE_GET_REG(ESI, esi);
2214 CASE_GET_REG(EDI, edi);
2215 CASE_EXTEND_GET_REG;
2216 case REG_ESP:
2217 value = __vmread(GUEST_RSP);
2218 break;
2219 default:
2220 gdprintk(XENLOG_ERR, "invalid gp: %d\n", gp);
2221 goto exit_and_crash;
2224 HVMTRACE_2D(CR_WRITE, v, cr, value);
2226 HVM_DBG_LOG(DBG_LEVEL_1, "CR%d, value = %lx", cr, value);
2228 switch ( cr )
2230 case 0:
2231 return vmx_set_cr0(value);
2233 case 3:
2234 /*
2235 * If paging is not enabled yet, simply copy the value to CR3.
2236 */
2237 if (!vmx_paging_enabled(v)) {
2238 v->arch.hvm_vmx.cpu_cr3 = value;
2239 break;
2242 /*
2243 * We make a new shadow if one does not already exist for this CR3.
2244 */
2245 if (value == v->arch.hvm_vmx.cpu_cr3) {
2246 /*
2247 * This is a simple TLB flush, implying the guest has
2248 * removed some translation or changed page attributes.
2249 * We simply invalidate the shadow.
2250 */
2251 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
2252 if (mfn != pagetable_get_pfn(v->arch.guest_table))
2253 goto bad_cr3;
2254 paging_update_cr3(v);
2255 } else {
2256 /*
2257 * If different, make a shadow. Check if the PDBR is valid
2258 * first.
2259 */
2260 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
2261 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
2262 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
2263 goto bad_cr3;
2264 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
2265 v->arch.guest_table = pagetable_from_pfn(mfn);
2266 if (old_base_mfn)
2267 put_page(mfn_to_page(old_base_mfn));
2268 v->arch.hvm_vmx.cpu_cr3 = value;
2269 update_cr3(v);
2270 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
2272 break;
2274 case 4: /* CR4 */
2275 old_cr = v->arch.hvm_vmx.cpu_shadow_cr4;
2277 if ( (value & X86_CR4_PAE) && !(old_cr & X86_CR4_PAE) )
2279 if ( vmx_pgbit_test(v) )
2281 /* The guest is a 32-bit PAE guest. */
2282 #if CONFIG_PAGING_LEVELS >= 3
2283 unsigned long mfn, old_base_mfn;
2284 mfn = get_mfn_from_gpfn(v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT);
2285 if ( !mfn_valid(mfn) ||
2286 !get_page(mfn_to_page(mfn), v->domain) )
2287 goto bad_cr3;
2289 /*
2290 * arch.guest_table now refers to the top-level page table by machine frame number.
2291 */
2293 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
2294 v->arch.guest_table = pagetable_from_pfn(mfn);
2295 if ( old_base_mfn )
2296 put_page(mfn_to_page(old_base_mfn));
2298 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
2299 (unsigned long) (mfn << PAGE_SHIFT));
2301 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2302 "Update CR3 value = %lx, mfn = %lx",
2303 v->arch.hvm_vmx.cpu_cr3, mfn);
2304 #endif
2307 else if ( !(value & X86_CR4_PAE) )
2309 if ( unlikely(vmx_long_mode_enabled(v)) )
2311 HVM_DBG_LOG(DBG_LEVEL_1, "Guest cleared CR4.PAE while "
2312 "EFER.LMA is set");
2313 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2317 __vmwrite(GUEST_CR4, value | VMX_CR4_HOST_MASK);
2318 v->arch.hvm_vmx.cpu_shadow_cr4 = value;
2319 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);
2321 /*
2322 * Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates
2323 * all TLB entries except global entries.
2324 */
2325 if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
2326 paging_update_paging_modes(v);
2327 break;
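/* CR8 aliases the local APIC TPR: CR8[3:0] corresponds to TPR[7:4],
 * hence the shift by four on the write below and on the read path. */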
2329 case 8:
2330 vlapic_set_reg(vlapic, APIC_TASKPRI, ((value & 0x0F) << 4));
2331 break;
2333 default:
2334 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
2335 domain_crash(v->domain);
2336 return 0;
2339 return 1;
2341 bad_cr3:
2342 gdprintk(XENLOG_ERR, "Invalid CR3\n");
2343 exit_and_crash:
2344 domain_crash(v->domain);
2345 return 0;
2348 /*
2349 * Read from control registers. CR0 and CR4 are read from the shadow.
2350 */
2351 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
2353 unsigned long value = 0;
2354 struct vcpu *v = current;
2355 struct vlapic *vlapic = vcpu_vlapic(v);
2357 switch ( cr )
2359 case 3:
2360 value = (unsigned long)v->arch.hvm_vmx.cpu_cr3;
2361 break;
2362 case 8:
2363 value = (unsigned long)vlapic_get_reg(vlapic, APIC_TASKPRI);
2364 value = (value & 0xF0) >> 4;
2365 break;
2366 default:
2367 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
2368 domain_crash(v->domain);
2369 break;
2372 switch ( gp ) {
2373 CASE_SET_REG(EAX, eax);
2374 CASE_SET_REG(ECX, ecx);
2375 CASE_SET_REG(EDX, edx);
2376 CASE_SET_REG(EBX, ebx);
2377 CASE_SET_REG(EBP, ebp);
2378 CASE_SET_REG(ESI, esi);
2379 CASE_SET_REG(EDI, edi);
2380 CASE_EXTEND_SET_REG;
2381 case REG_ESP:
2382 __vmwrite(GUEST_RSP, value);
2383 regs->esp = value;
2384 break;
2385 default:
2386 gdprintk(XENLOG_ERR, "invalid gp: %d\n", gp);
2387 domain_crash(v->domain);
2388 break;
2391 HVMTRACE_2D(CR_READ, v, cr, value);
2393 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR%d, value = %lx", cr, value);
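/*
 * Decode and dispatch a CR-access exit: the exit qualification encodes the
 * access type (MOV to/from CRn, CLTS or LMSW) together with the control
 * register and general-purpose register involved. A non-zero return tells
 * the exit handler to advance the guest RIP.
 */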
2396 static int vmx_cr_access(unsigned long exit_qualification,
2397 struct cpu_user_regs *regs)
2399 unsigned int gp, cr;
2400 unsigned long value;
2401 struct vcpu *v = current;
2403 switch (exit_qualification & CONTROL_REG_ACCESS_TYPE) {
2404 case TYPE_MOV_TO_CR:
2405 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
2406 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
2407 return mov_to_cr(gp, cr, regs);
2408 case TYPE_MOV_FROM_CR:
2409 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
2410 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
2411 mov_from_cr(cr, gp, regs);
2412 break;
2413 case TYPE_CLTS:
2414 /* We initialise the FPU now, to avoid needing another vmexit. */
2415 setup_fpu(v);
2416 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
2418 v->arch.hvm_vmx.cpu_cr0 &= ~X86_CR0_TS; /* clear TS */
2419 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
2421 v->arch.hvm_vmx.cpu_shadow_cr0 &= ~X86_CR0_TS; /* clear TS */
2422 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
2423 break;
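/*
 * LMSW can change only the low four CR0 bits (PE, MP, EM, TS). The 16-bit
 * source operand sits in bits 31:16 of the exit qualification, and only its
 * low four bits are merged into the shadow CR0 value.
 */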
2424 case TYPE_LMSW:
2425 value = v->arch.hvm_vmx.cpu_shadow_cr0;
2426 value = (value & ~0xF) |
2427 (((exit_qualification & LMSW_SOURCE_DATA) >> 16) & 0xF);
2428 return vmx_set_cr0(value);
2429 default:
2430 BUG();
2433 return 1;
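/*
 * RDMSR emulation: ECX selects the MSR and the 64-bit result is returned to
 * the guest in EDX:EAX. MSRs not handled explicitly are offered to
 * long_mode_do_msr_read(), then to Xen's hypervisor MSR handler, then read
 * (safely) from the host; anything else injects #GP(0).
 */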
2436 static inline int vmx_do_msr_read(struct cpu_user_regs *regs)
2438 u64 msr_content = 0;
2439 u32 ecx = regs->ecx, eax, edx;
2440 struct vcpu *v = current;
2442 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x", ecx);
2444 switch (ecx) {
2445 case MSR_IA32_TIME_STAMP_COUNTER:
2446 msr_content = hvm_get_guest_time(v);
2447 break;
2448 case MSR_IA32_SYSENTER_CS:
2449 msr_content = (u32)__vmread(GUEST_SYSENTER_CS);
2450 break;
2451 case MSR_IA32_SYSENTER_ESP:
2452 msr_content = __vmread(GUEST_SYSENTER_ESP);
2453 break;
2454 case MSR_IA32_SYSENTER_EIP:
2455 msr_content = __vmread(GUEST_SYSENTER_EIP);
2456 break;
2457 case MSR_IA32_APICBASE:
2458 msr_content = vcpu_vlapic(v)->hw.apic_base_msr;
2459 break;
2460 default:
2461 if ( long_mode_do_msr_read(regs) )
2462 goto done;
2464 if ( rdmsr_hypervisor_regs(ecx, &eax, &edx) ||
2465 rdmsr_safe(ecx, eax, edx) == 0 )
2467 regs->eax = eax;
2468 regs->edx = edx;
2469 goto done;
2471 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2472 return 0;
2475 regs->eax = msr_content & 0xFFFFFFFF;
2476 regs->edx = msr_content >> 32;
2478 done:
2479 HVMTRACE_2D(MSR_READ, v, ecx, msr_content);
2480 HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx",
2481 ecx, (unsigned long)regs->eax,
2482 (unsigned long)regs->edx);
2483 return 1;
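/*
 * WRMSR emulation: the 64-bit value is assembled from EDX:EAX. A guest write
 * to the TSC updates the guest time and resets the virtual platform timers;
 * unrecognised MSRs fall through to long_mode_do_msr_write() and then to
 * Xen's hypervisor MSR handler.
 */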
2486 static inline int vmx_do_msr_write(struct cpu_user_regs *regs)
2488 u32 ecx = regs->ecx;
2489 u64 msr_content;
2490 struct vcpu *v = current;
2492 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x, eax=%x, edx=%x",
2493 ecx, (u32)regs->eax, (u32)regs->edx);
2495 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
2496 HVMTRACE_2D(MSR_WRITE, v, ecx, msr_content);
2498 switch (ecx) {
2499 case MSR_IA32_TIME_STAMP_COUNTER:
2500 hvm_set_guest_time(v, msr_content);
2501 pt_reset(v);
2502 break;
2503 case MSR_IA32_SYSENTER_CS:
2504 __vmwrite(GUEST_SYSENTER_CS, msr_content);
2505 break;
2506 case MSR_IA32_SYSENTER_ESP:
2507 __vmwrite(GUEST_SYSENTER_ESP, msr_content);
2508 break;
2509 case MSR_IA32_SYSENTER_EIP:
2510 __vmwrite(GUEST_SYSENTER_EIP, msr_content);
2511 break;
2512 case MSR_IA32_APICBASE:
2513 vlapic_msr_set(vcpu_vlapic(v), msr_content);
2514 break;
2515 default:
2516 if ( !long_mode_do_msr_write(regs) )
2517 wrmsr_hypervisor_regs(ecx, regs->eax, regs->edx);
2518 break;
2521 return 1;
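/*
 * HLT is handed to the common HVM layer together with the guest's RFLAGS,
 * which it uses to decide how the halt should be treated (for instance,
 * whether the VCPU can simply block until an interrupt arrives).
 */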
2524 static void vmx_do_hlt(void)
2526 unsigned long rflags;
2527 HVMTRACE_0D(HLT, current);
2528 rflags = __vmread(GUEST_RFLAGS);
2529 hvm_hlt(rflags);
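/*
 * A physical interrupt arrived while the guest was running. The vector is
 * read from VM_EXIT_INTR_INFO and the matching host handler is called
 * directly, since this exit reason is handled with host interrupts still
 * disabled (see vmx_vmexit_handler below).
 */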
2532 static inline void vmx_do_extint(struct cpu_user_regs *regs)
2534 unsigned int vector;
2536 asmlinkage void do_IRQ(struct cpu_user_regs *);
2537 fastcall void smp_apic_timer_interrupt(struct cpu_user_regs *);
2538 fastcall void smp_event_check_interrupt(void);
2539 fastcall void smp_invalidate_interrupt(void);
2540 fastcall void smp_call_function_interrupt(void);
2541 fastcall void smp_spurious_interrupt(struct cpu_user_regs *regs);
2542 fastcall void smp_error_interrupt(struct cpu_user_regs *regs);
2543 #ifdef CONFIG_X86_MCE_P4THERMAL
2544 fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs);
2545 #endif
2547 vector = __vmread(VM_EXIT_INTR_INFO);
2548 BUG_ON(!(vector & INTR_INFO_VALID_MASK));
2550 vector &= INTR_INFO_VECTOR_MASK;
2551 HVMTRACE_1D(INTR, current, vector);
2553 switch(vector) {
2554 case LOCAL_TIMER_VECTOR:
2555 smp_apic_timer_interrupt(regs);
2556 break;
2557 case EVENT_CHECK_VECTOR:
2558 smp_event_check_interrupt();
2559 break;
2560 case INVALIDATE_TLB_VECTOR:
2561 smp_invalidate_interrupt();
2562 break;
2563 case CALL_FUNCTION_VECTOR:
2564 smp_call_function_interrupt();
2565 break;
2566 case SPURIOUS_APIC_VECTOR:
2567 smp_spurious_interrupt(regs);
2568 break;
2569 case ERROR_APIC_VECTOR:
2570 smp_error_interrupt(regs);
2571 break;
2572 #ifdef CONFIG_X86_MCE_P4THERMAL
2573 case THERMAL_APIC_VECTOR:
2574 smp_thermal_interrupt(regs);
2575 break;
2576 #endif
2577 default:
2578 regs->entry_vector = vector;
2579 do_IRQ(regs);
2580 break;
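/*
 * Re-inject into the guest an exception that caused a VM exit but that Xen
 * does not handle itself, preserving the delivered error code when there
 * was one.
 */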
2584 static void vmx_reflect_exception(struct vcpu *v)
2586 int error_code, intr_info, vector;
2588 intr_info = __vmread(VM_EXIT_INTR_INFO);
2589 vector = intr_info & 0xff;
2590 if ( intr_info & INTR_INFO_DELIVER_CODE_MASK )
2591 error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
2592 else
2593 error_code = VMX_DELIVER_NO_ERROR_CODE;
2595 #ifndef NDEBUG
2597 unsigned long rip;
2599 rip = __vmread(GUEST_RIP);
2600 HVM_DBG_LOG(DBG_LEVEL_1, "rip = %lx, error_code = %x",
2601 rip, error_code);
2603 #endif /* NDEBUG */
2605 /*
2606 * According to Intel Virtualization Technology Specification for
2607 * the IA-32 Intel Architecture (C97063-002 April 2005), section
2608 * 2.8.3, SW_EXCEPTION should be used for #BP and #OF, and
2609 * HW_EXCEPTION used for everything else. The main difference
2610 * appears to be that for SW_EXCEPTION, the EIP/RIP is incremented
2611 * by VM_ENTER_INSTRUCTION_LEN bytes, whereas for HW_EXCEPTION,
2612 * it is not.
2613 */
2614 if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_SW_EXCEPTION )
2616 int ilen = __get_instruction_length(); /* Safe: software exception */
2617 vmx_inject_sw_exception(v, vector, ilen);
2619 else
2621 vmx_inject_hw_exception(v, vector, error_code);
2625 static void vmx_failed_vmentry(unsigned int exit_reason)
2627 unsigned int failed_vmentry_reason = (uint16_t)exit_reason;
2628 unsigned long exit_qualification;
2630 exit_qualification = __vmread(EXIT_QUALIFICATION);
2631 printk("Failed vm entry (exit reason 0x%x) ", exit_reason);
2632 switch ( failed_vmentry_reason )
2634 case EXIT_REASON_INVALID_GUEST_STATE:
2635 printk("caused by invalid guest state (%ld).\n", exit_qualification);
2636 break;
2637 case EXIT_REASON_MSR_LOADING:
2638 printk("caused by MSR entry %ld loading.\n", exit_qualification);
2639 break;
2640 case EXIT_REASON_MACHINE_CHECK:
2641 printk("caused by machine check.\n");
2642 break;
2643 default:
2644 printk("reason not known yet!");
2645 break;
2648 printk("************* VMCS Area **************\n");
2649 vmcs_dump_vcpu();
2650 printk("**************************************\n");
2652 domain_crash(current->domain);
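/*
 * Main VM exit dispatcher, called from the assembly exit path with the
 * guest register state in *regs. Host interrupts are re-enabled for every
 * exit reason except external interrupts, which vmx_do_extint() dispatches
 * with interrupts still disabled.
 */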
2655 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
2657 unsigned int exit_reason;
2658 unsigned long exit_qualification, inst_len = 0;
2659 struct vcpu *v = current;
2661 exit_reason = __vmread(VM_EXIT_REASON);
2663 HVMTRACE_2D(VMEXIT, v, __vmread(GUEST_RIP), exit_reason);
2665 perfc_incra(vmexits, exit_reason);
2667 if ( exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT )
2668 local_irq_enable();
2670 if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
2671 return vmx_failed_vmentry(exit_reason);
2673 switch ( exit_reason )
2675 case EXIT_REASON_EXCEPTION_NMI:
2677 /*
2678 * We do not enable software-interrupt (INT n) exiting, so this exit
2679 * was caused by either (1) an exception (e.g. #PF) raised in the
2680 * guest, or (2) an NMI.
2681 */
2682 unsigned int intr_info, vector;
2684 intr_info = __vmread(VM_EXIT_INTR_INFO);
2685 BUG_ON(!(intr_info & INTR_INFO_VALID_MASK));
2687 vector = intr_info & INTR_INFO_VECTOR_MASK;
2689 perfc_incra(cause_vector, vector);
2691 switch ( vector )
2693 case TRAP_debug:
2694 case TRAP_int3:
2695 if ( !v->domain->debugger_attached )
2696 goto exit_and_crash;
2697 domain_pause_for_debugger();
2698 break;
2699 case TRAP_no_device:
2700 vmx_do_no_device_fault();
2701 break;
2702 case TRAP_page_fault:
2703 exit_qualification = __vmread(EXIT_QUALIFICATION);
2704 regs->error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
2706 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2707 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
2708 (unsigned long)regs->eax, (unsigned long)regs->ebx,
2709 (unsigned long)regs->ecx, (unsigned long)regs->edx,
2710 (unsigned long)regs->esi, (unsigned long)regs->edi);
2712 if ( paging_fault(exit_qualification, regs) )
2714 HVMTRACE_2D(PF_XEN, v, exit_qualification, regs->error_code);
2715 break;
2718 v->arch.hvm_vmx.cpu_cr2 = exit_qualification;
2719 vmx_inject_hw_exception(v, TRAP_page_fault, regs->error_code);
2720 break;
2721 case TRAP_nmi:
2722 HVMTRACE_0D(NMI, v);
2723 if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI )
2724 do_nmi(regs); /* Real NMI, vector 2: normal processing. */
2725 else
2726 vmx_reflect_exception(v);
2727 break;
2728 default:
2729 goto exit_and_crash;
2731 break;
2733 case EXIT_REASON_EXTERNAL_INTERRUPT:
2734 vmx_do_extint(regs);
2735 break;
2736 case EXIT_REASON_TRIPLE_FAULT:
2737 hvm_triple_fault();
2738 break;
2739 case EXIT_REASON_PENDING_INTERRUPT:
2740 /* Disable the interrupt window. */
2741 v->arch.hvm_vcpu.u.vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2742 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2743 v->arch.hvm_vcpu.u.vmx.exec_control);
2744 break;
2745 case EXIT_REASON_TASK_SWITCH:
2746 goto exit_and_crash;
2747 case EXIT_REASON_CPUID:
2748 inst_len = __get_instruction_length(); /* Safe: CPUID */
2749 __update_guest_eip(inst_len);
2750 vmx_do_cpuid(regs);
2751 break;
2752 case EXIT_REASON_HLT:
2753 inst_len = __get_instruction_length(); /* Safe: HLT */
2754 __update_guest_eip(inst_len);
2755 vmx_do_hlt();
2756 break;
2757 case EXIT_REASON_INVLPG:
2759 inst_len = __get_instruction_length(); /* Safe: INVLPG */
2760 __update_guest_eip(inst_len);
2761 exit_qualification = __vmread(EXIT_QUALIFICATION);
2762 vmx_do_invlpg(exit_qualification);
2763 break;
2765 case EXIT_REASON_VMCALL:
2767 int rc;
2768 HVMTRACE_1D(VMMCALL, v, regs->eax);
2769 inst_len = __get_instruction_length(); /* Safe: VMCALL */
2770 rc = hvm_do_hypercall(regs);
2771 if ( rc != HVM_HCALL_preempted )
2773 __update_guest_eip(inst_len);
2774 if ( rc == HVM_HCALL_invalidate )
2775 send_invalidate_req();
2777 break;
2779 case EXIT_REASON_CR_ACCESS:
2781 exit_qualification = __vmread(EXIT_QUALIFICATION);
2782 inst_len = __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
2783 if ( vmx_cr_access(exit_qualification, regs) )
2784 __update_guest_eip(inst_len);
2785 break;
2787 case EXIT_REASON_DR_ACCESS:
2788 exit_qualification = __vmread(EXIT_QUALIFICATION);
2789 vmx_dr_access(exit_qualification, regs);
2790 break;
2791 case EXIT_REASON_IO_INSTRUCTION:
2792 exit_qualification = __vmread(EXIT_QUALIFICATION);
2793 inst_len = __get_instruction_length(); /* Safe: IN, INS, OUT, OUTS */
2794 vmx_io_instruction(exit_qualification, inst_len);
2795 break;
2796 case EXIT_REASON_MSR_READ:
2797 inst_len = __get_instruction_length(); /* Safe: RDMSR */
2798 if ( vmx_do_msr_read(regs) )
2799 __update_guest_eip(inst_len);
2800 break;
2801 case EXIT_REASON_MSR_WRITE:
2802 inst_len = __get_instruction_length(); /* Safe: WRMSR */
2803 if ( vmx_do_msr_write(regs) )
2804 __update_guest_eip(inst_len);
2805 break;
2806 case EXIT_REASON_MWAIT_INSTRUCTION:
2807 case EXIT_REASON_MONITOR_INSTRUCTION:
2808 case EXIT_REASON_PAUSE_INSTRUCTION:
2809 goto exit_and_crash;
2810 case EXIT_REASON_VMCLEAR:
2811 case EXIT_REASON_VMLAUNCH:
2812 case EXIT_REASON_VMPTRLD:
2813 case EXIT_REASON_VMPTRST:
2814 case EXIT_REASON_VMREAD:
2815 case EXIT_REASON_VMRESUME:
2816 case EXIT_REASON_VMWRITE:
2817 case EXIT_REASON_VMXOFF:
2818 case EXIT_REASON_VMXON:
2819 /* Report an invalid-opcode exception when a VMX guest tries to
2820 execute any of the VMX instructions. */
2821 vmx_inject_hw_exception(v, TRAP_invalid_op, VMX_DELIVER_NO_ERROR_CODE);
2822 break;
2824 case EXIT_REASON_TPR_BELOW_THRESHOLD:
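/* Nothing to do here: pending interrupts are presumably re-evaluated
 * against the updated TPR before the guest is resumed. */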
2825 break;
2827 default:
2828 exit_and_crash:
2829 gdprintk(XENLOG_ERR, "Bad vmexit (reason %x)\n", exit_reason);
2830 domain_crash(v->domain);
2831 break;
2835 asmlinkage void vmx_trace_vmentry(void)
2837 struct vcpu *v = current;
2838 HVMTRACE_0D(VMENTRY, v);
2841 /*
2842 * Local variables:
2843 * mode: C
2844 * c-set-style: "BSD"
2845 * c-basic-offset: 4
2846 * tab-width: 4
2847 * indent-tabs-mode: nil
2848 * End:
2849 */