ia64/xen-unstable

xen/arch/x86/hvm/vmx/vmx.c @ changeset 15956:a956ef58b012

Fix build after tracing changes.
Signed-off-by: Keir Fraser <keir@xensource.com>

author   Keir Fraser <keir@xensource.com>
date     Fri Sep 21 17:10:00 2007 +0100
parents  305a8dbc264c
children ff4ff3e3ebbe

line source
1 /*
2 * vmx.c: handling VMX architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 *
18 */
20 #include <xen/config.h>
21 #include <xen/init.h>
22 #include <xen/lib.h>
23 #include <xen/trace.h>
24 #include <xen/sched.h>
25 #include <xen/irq.h>
26 #include <xen/softirq.h>
27 #include <xen/domain_page.h>
28 #include <xen/hypercall.h>
29 #include <xen/perfc.h>
30 #include <asm/current.h>
31 #include <asm/io.h>
32 #include <asm/regs.h>
33 #include <asm/cpufeature.h>
34 #include <asm/processor.h>
35 #include <asm/types.h>
36 #include <asm/msr.h>
37 #include <asm/spinlock.h>
38 #include <asm/paging.h>
39 #include <asm/p2m.h>
40 #include <asm/hvm/hvm.h>
41 #include <asm/hvm/support.h>
42 #include <asm/hvm/vmx/vmx.h>
43 #include <asm/hvm/vmx/vmcs.h>
44 #include <asm/hvm/vmx/cpu.h>
45 #include <public/sched.h>
46 #include <public/hvm/ioreq.h>
47 #include <asm/hvm/vpic.h>
48 #include <asm/hvm/vlapic.h>
49 #include <asm/x86_emulate.h>
50 #include <asm/hvm/vpt.h>
51 #include <public/hvm/save.h>
52 #include <asm/hvm/trace.h>
54 enum handler_return { HNDL_done, HNDL_unhandled, HNDL_exception_raised };
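/*
 * Return values for the MSR helpers below: HNDL_done means the access was
 * fully handled here, HNDL_unhandled means the caller should fall back to
 * the common MSR handling, and HNDL_exception_raised means an exception has
 * already been injected into the guest (see the #GP paths below), so the
 * caller must not advance the guest EIP.
 */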
56 char *vmx_msr_bitmap;
58 static void vmx_ctxt_switch_from(struct vcpu *v);
59 static void vmx_ctxt_switch_to(struct vcpu *v);
61 static int vmx_alloc_vlapic_mapping(struct domain *d);
62 static void vmx_free_vlapic_mapping(struct domain *d);
63 static void vmx_install_vlapic_mapping(struct vcpu *v);
64 static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr);
65 static void vmx_update_guest_efer(struct vcpu *v);
67 static int vmx_domain_initialise(struct domain *d)
68 {
69 return vmx_alloc_vlapic_mapping(d);
70 }
72 static void vmx_domain_destroy(struct domain *d)
73 {
74 vmx_free_vlapic_mapping(d);
75 }
77 static int vmx_vcpu_initialise(struct vcpu *v)
78 {
79 int rc;
81 spin_lock_init(&v->arch.hvm_vmx.vmcs_lock);
83 v->arch.schedule_tail = vmx_do_resume;
84 v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
85 v->arch.ctxt_switch_to = vmx_ctxt_switch_to;
87 if ( (rc = vmx_create_vmcs(v)) != 0 )
88 {
89 dprintk(XENLOG_WARNING,
90 "Failed to create VMCS for vcpu %d: err=%d.\n",
91 v->vcpu_id, rc);
92 return rc;
93 }
95 vmx_install_vlapic_mapping(v);
97 return 0;
98 }
100 static void vmx_vcpu_destroy(struct vcpu *v)
101 {
102 vmx_destroy_vmcs(v);
103 }
105 #ifdef __x86_64__
107 static DEFINE_PER_CPU(struct vmx_msr_state, host_msr_state);
109 static u32 msr_index[VMX_MSR_COUNT] =
110 {
111 MSR_LSTAR, MSR_STAR, MSR_SYSCALL_MASK
112 };
114 static void vmx_save_host_msrs(void)
115 {
116 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
117 int i;
119 for ( i = 0; i < VMX_MSR_COUNT; i++ )
120 rdmsrl(msr_index[i], host_msr_state->msrs[i]);
121 }
123 #define WRITE_MSR(address) \
124 guest_msr_state->msrs[VMX_INDEX_MSR_ ## address] = msr_content; \
125 set_bit(VMX_INDEX_MSR_ ## address, &guest_msr_state->flags); \
126 wrmsrl(MSR_ ## address, msr_content); \
127 set_bit(VMX_INDEX_MSR_ ## address, &host_msr_state->flags); \
128 break
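/*
 * For example, WRITE_MSR(STAR) expands to:
 *     guest_msr_state->msrs[VMX_INDEX_MSR_STAR] = msr_content;
 *     set_bit(VMX_INDEX_MSR_STAR, &guest_msr_state->flags);
 *     wrmsrl(MSR_STAR, msr_content);
 *     set_bit(VMX_INDEX_MSR_STAR, &host_msr_state->flags);
 *     break;
 * i.e. it records the guest value, loads it into the physical MSR, and marks
 * the index dirty in both the guest and host flag words.
 */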
130 static enum handler_return long_mode_do_msr_read(struct cpu_user_regs *regs)
131 {
132 u64 msr_content = 0;
133 u32 ecx = regs->ecx;
134 struct vcpu *v = current;
135 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
137 switch ( ecx )
138 {
139 case MSR_EFER:
140 msr_content = v->arch.hvm_vcpu.guest_efer;
141 break;
143 case MSR_FS_BASE:
144 msr_content = __vmread(GUEST_FS_BASE);
145 goto check_long_mode;
147 case MSR_GS_BASE:
148 msr_content = __vmread(GUEST_GS_BASE);
149 goto check_long_mode;
151 case MSR_SHADOW_GS_BASE:
152 msr_content = v->arch.hvm_vmx.shadow_gs;
153 check_long_mode:
154 if ( !(hvm_long_mode_enabled(v)) )
155 {
156 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
157 return HNDL_exception_raised;
158 }
159 break;
161 case MSR_STAR:
162 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_STAR];
163 break;
165 case MSR_LSTAR:
166 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_LSTAR];
167 break;
169 case MSR_CSTAR:
170 msr_content = v->arch.hvm_vmx.cstar;
171 break;
173 case MSR_SYSCALL_MASK:
174 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
175 break;
177 default:
178 return HNDL_unhandled;
179 }
181 HVM_DBG_LOG(DBG_LEVEL_0, "msr 0x%x content 0x%"PRIx64, ecx, msr_content);
183 regs->eax = (u32)(msr_content >> 0);
184 regs->edx = (u32)(msr_content >> 32);
186 return HNDL_done;
187 }
189 static enum handler_return long_mode_do_msr_write(struct cpu_user_regs *regs)
190 {
191 u64 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
192 u32 ecx = regs->ecx;
193 struct vcpu *v = current;
194 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
195 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
197 HVM_DBG_LOG(DBG_LEVEL_0, "msr 0x%x content 0x%"PRIx64, ecx, msr_content);
199 switch ( ecx )
200 {
201 case MSR_EFER:
202 /* offending reserved bit will cause #GP */
203 if ( (msr_content & ~(EFER_LME | EFER_LMA | EFER_NX | EFER_SCE)) ||
204 (!cpu_has_nx && (msr_content & EFER_NX)) ||
205 (!cpu_has_syscall && (msr_content & EFER_SCE)) )
206 {
207 gdprintk(XENLOG_WARNING, "Trying to set reserved bit in "
208 "EFER: %"PRIx64"\n", msr_content);
209 goto gp_fault;
210 }
212 if ( (msr_content & EFER_LME)
213 && !(v->arch.hvm_vcpu.guest_efer & EFER_LME) )
214 {
215 if ( unlikely(hvm_paging_enabled(v)) )
216 {
217 gdprintk(XENLOG_WARNING,
218 "Trying to set EFER.LME with paging enabled\n");
219 goto gp_fault;
220 }
221 }
222 else if ( !(msr_content & EFER_LME)
223 && (v->arch.hvm_vcpu.guest_efer & EFER_LME) )
224 {
225 if ( unlikely(hvm_paging_enabled(v)) )
226 {
227 gdprintk(XENLOG_WARNING,
228 "Trying to clear EFER.LME with paging enabled\n");
229 goto gp_fault;
230 }
231 }
233 if ( (msr_content ^ v->arch.hvm_vcpu.guest_efer) & (EFER_NX|EFER_SCE) )
234 write_efer((read_efer() & ~(EFER_NX|EFER_SCE)) |
235 (msr_content & (EFER_NX|EFER_SCE)));
237 v->arch.hvm_vcpu.guest_efer = msr_content;
238 break;
240 case MSR_FS_BASE:
241 case MSR_GS_BASE:
242 case MSR_SHADOW_GS_BASE:
243 if ( !hvm_long_mode_enabled(v) )
244 goto gp_fault;
246 if ( !is_canonical_address(msr_content) )
247 goto uncanonical_address;
249 if ( ecx == MSR_FS_BASE )
250 __vmwrite(GUEST_FS_BASE, msr_content);
251 else if ( ecx == MSR_GS_BASE )
252 __vmwrite(GUEST_GS_BASE, msr_content);
253 else
254 {
255 v->arch.hvm_vmx.shadow_gs = msr_content;
256 wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
257 }
259 break;
261 case MSR_STAR:
262 WRITE_MSR(STAR);
264 case MSR_LSTAR:
265 if ( !is_canonical_address(msr_content) )
266 goto uncanonical_address;
267 WRITE_MSR(LSTAR);
269 case MSR_CSTAR:
270 if ( !is_canonical_address(msr_content) )
271 goto uncanonical_address;
272 v->arch.hvm_vmx.cstar = msr_content;
273 break;
275 case MSR_SYSCALL_MASK:
276 WRITE_MSR(SYSCALL_MASK);
278 default:
279 return HNDL_unhandled;
280 }
282 return HNDL_done;
284 uncanonical_address:
285 HVM_DBG_LOG(DBG_LEVEL_0, "Non-canonical address for MSR write 0x%x", ecx);
286 gp_fault:
287 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
288 return HNDL_exception_raised;
289 }
291 /*
292 * To avoid MSR save/restore at every VM exit/entry time, we restore
293 * the x86_64 specific MSRs at domain switch time. Since these MSRs
294 * are not modified once set for para domains, we don't save them,
295 * but simply reset them to values set in percpu_traps_init().
296 */
297 static void vmx_restore_host_msrs(void)
298 {
299 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
300 int i;
302 while ( host_msr_state->flags )
303 {
304 i = find_first_set_bit(host_msr_state->flags);
305 wrmsrl(msr_index[i], host_msr_state->msrs[i]);
306 clear_bit(i, &host_msr_state->flags);
307 }
309 if ( cpu_has_nx && !(read_efer() & EFER_NX) )
310 write_efer(read_efer() | EFER_NX);
311 }
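/*
 * Note the lazy scheme: host_msr_state->flags records exactly which host
 * MSRs were overwritten with guest values (see WRITE_MSR and
 * vmx_restore_guest_msrs), so only those MSRs are rewritten on the way back.
 */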
313 static void vmx_save_guest_msrs(struct vcpu *v)
314 {
315 /* MSR_SHADOW_GS_BASE may have been changed by swapgs instruction. */
316 rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
317 }
319 static void vmx_restore_guest_msrs(struct vcpu *v)
320 {
321 struct vmx_msr_state *guest_msr_state, *host_msr_state;
322 unsigned long guest_flags;
323 int i;
325 guest_msr_state = &v->arch.hvm_vmx.msr_state;
326 host_msr_state = &this_cpu(host_msr_state);
328 wrmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
330 guest_flags = guest_msr_state->flags;
332 while ( guest_flags )
333 {
334 i = find_first_set_bit(guest_flags);
336 HVM_DBG_LOG(DBG_LEVEL_2,
337 "restore guest's index %d msr %x with value %lx",
338 i, msr_index[i], guest_msr_state->msrs[i]);
339 set_bit(i, &host_msr_state->flags);
340 wrmsrl(msr_index[i], guest_msr_state->msrs[i]);
341 clear_bit(i, &guest_flags);
342 }
344 if ( (v->arch.hvm_vcpu.guest_efer ^ read_efer()) & (EFER_NX | EFER_SCE) )
345 {
346 HVM_DBG_LOG(DBG_LEVEL_2,
347 "restore guest's EFER with value %lx",
348 v->arch.hvm_vcpu.guest_efer);
349 write_efer((read_efer() & ~(EFER_NX | EFER_SCE)) |
350 (v->arch.hvm_vcpu.guest_efer & (EFER_NX | EFER_SCE)));
351 }
352 }
354 #else /* __i386__ */
356 #define vmx_save_host_msrs() ((void)0)
358 static void vmx_restore_host_msrs(void)
359 {
360 if ( cpu_has_nx && !(read_efer() & EFER_NX) )
361 write_efer(read_efer() | EFER_NX);
362 }
364 #define vmx_save_guest_msrs(v) ((void)0)
366 static void vmx_restore_guest_msrs(struct vcpu *v)
367 {
368 if ( (v->arch.hvm_vcpu.guest_efer ^ read_efer()) & EFER_NX )
369 {
370 HVM_DBG_LOG(DBG_LEVEL_2,
371 "restore guest's EFER with value %lx",
372 v->arch.hvm_vcpu.guest_efer);
373 write_efer((read_efer() & ~EFER_NX) |
374 (v->arch.hvm_vcpu.guest_efer & EFER_NX));
375 }
376 }
378 static enum handler_return long_mode_do_msr_read(struct cpu_user_regs *regs)
379 {
380 u64 msr_content = 0;
381 struct vcpu *v = current;
383 switch ( regs->ecx ) {
384 case MSR_EFER:
385 msr_content = v->arch.hvm_vcpu.guest_efer;
386 break;
388 default:
389 return HNDL_unhandled;
390 }
392 regs->eax = msr_content >> 0;
393 regs->edx = msr_content >> 32;
395 return HNDL_done;
396 }
398 static enum handler_return long_mode_do_msr_write(struct cpu_user_regs *regs)
399 {
400 u64 msr_content = regs->eax | ((u64)regs->edx << 32);
401 struct vcpu *v = current;
403 switch ( regs->ecx )
404 {
405 case MSR_EFER:
406 /* offending reserved bit will cause #GP */
407 if ( (msr_content & ~EFER_NX) ||
408 (!cpu_has_nx && (msr_content & EFER_NX)) )
409 {
410 gdprintk(XENLOG_WARNING, "Trying to set reserved bit in "
411 "EFER: %"PRIx64"\n", msr_content);
412 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
413 return HNDL_exception_raised;
414 }
416 if ( (msr_content ^ v->arch.hvm_vcpu.guest_efer) & EFER_NX )
417 write_efer((read_efer() & ~EFER_NX) | (msr_content & EFER_NX));
419 v->arch.hvm_vcpu.guest_efer = msr_content;
420 break;
422 default:
423 return HNDL_unhandled;
424 }
426 return HNDL_done;
427 }
429 #endif /* __i386__ */
431 #define loaddebug(_v,_reg) \
432 __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
433 #define savedebug(_v,_reg) \
434 __asm__ __volatile__ ("mov %%db" #_reg ",%0" : "=r" ((_v)->debugreg[_reg]))
436 static int vmx_guest_x86_mode(struct vcpu *v)
437 {
438 unsigned int cs_ar_bytes;
440 if ( unlikely(!(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE)) )
441 return 0;
442 if ( unlikely(guest_cpu_user_regs()->eflags & X86_EFLAGS_VM) )
443 return 1;
444 cs_ar_bytes = __vmread(GUEST_CS_AR_BYTES);
445 if ( hvm_long_mode_enabled(v) &&
446 likely(cs_ar_bytes & X86_SEG_AR_CS_LM_ACTIVE) )
447 return 8;
448 return (likely(cs_ar_bytes & X86_SEG_AR_DEF_OP_SIZE) ? 4 : 2);
449 }
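/*
 * The return value encodes the guest execution mode as its operand size:
 * 0 = real mode (CR0.PE clear), 1 = virtual-8086 mode, 2 or 4 = 16/32-bit
 * protected mode depending on CS.D, and 8 = 64-bit mode.
 */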
451 static void vmx_save_dr(struct vcpu *v)
452 {
453 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
454 return;
456 /* Clear the DR dirty flag and re-enable intercepts for DR accesses. */
457 v->arch.hvm_vcpu.flag_dr_dirty = 0;
458 v->arch.hvm_vmx.exec_control |= CPU_BASED_MOV_DR_EXITING;
459 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
461 savedebug(&v->arch.guest_context, 0);
462 savedebug(&v->arch.guest_context, 1);
463 savedebug(&v->arch.guest_context, 2);
464 savedebug(&v->arch.guest_context, 3);
465 savedebug(&v->arch.guest_context, 6);
466 v->arch.guest_context.debugreg[7] = __vmread(GUEST_DR7);
467 }
469 static void __restore_debug_registers(struct vcpu *v)
470 {
471 loaddebug(&v->arch.guest_context, 0);
472 loaddebug(&v->arch.guest_context, 1);
473 loaddebug(&v->arch.guest_context, 2);
474 loaddebug(&v->arch.guest_context, 3);
475 /* No 4 and 5 */
476 loaddebug(&v->arch.guest_context, 6);
477 /* DR7 is loaded from the VMCS. */
478 }
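/*
 * Debug registers are switched lazily: MOV-DR exits stay enabled until the
 * guest first touches a debug register (see vmx_dr_access further down,
 * which clears the intercept and calls __restore_debug_registers), and
 * vmx_save_dr above re-arms the intercept and saves DR0-DR6 when the vcpu
 * is descheduled.
 */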
480 void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
481 {
482 uint32_t ev;
484 vmx_vmcs_enter(v);
486 c->cr0 = v->arch.hvm_vcpu.guest_cr[0];
487 c->cr2 = v->arch.hvm_vcpu.guest_cr[2];
488 c->cr3 = v->arch.hvm_vcpu.guest_cr[3];
489 c->cr4 = v->arch.hvm_vcpu.guest_cr[4];
491 c->msr_efer = v->arch.hvm_vcpu.guest_efer;
493 c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
494 c->idtr_base = __vmread(GUEST_IDTR_BASE);
496 c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
497 c->gdtr_base = __vmread(GUEST_GDTR_BASE);
499 c->cs_sel = __vmread(GUEST_CS_SELECTOR);
500 c->cs_limit = __vmread(GUEST_CS_LIMIT);
501 c->cs_base = __vmread(GUEST_CS_BASE);
502 c->cs_arbytes = __vmread(GUEST_CS_AR_BYTES);
504 c->ds_sel = __vmread(GUEST_DS_SELECTOR);
505 c->ds_limit = __vmread(GUEST_DS_LIMIT);
506 c->ds_base = __vmread(GUEST_DS_BASE);
507 c->ds_arbytes = __vmread(GUEST_DS_AR_BYTES);
509 c->es_sel = __vmread(GUEST_ES_SELECTOR);
510 c->es_limit = __vmread(GUEST_ES_LIMIT);
511 c->es_base = __vmread(GUEST_ES_BASE);
512 c->es_arbytes = __vmread(GUEST_ES_AR_BYTES);
514 c->ss_sel = __vmread(GUEST_SS_SELECTOR);
515 c->ss_limit = __vmread(GUEST_SS_LIMIT);
516 c->ss_base = __vmread(GUEST_SS_BASE);
517 c->ss_arbytes = __vmread(GUEST_SS_AR_BYTES);
519 c->fs_sel = __vmread(GUEST_FS_SELECTOR);
520 c->fs_limit = __vmread(GUEST_FS_LIMIT);
521 c->fs_base = __vmread(GUEST_FS_BASE);
522 c->fs_arbytes = __vmread(GUEST_FS_AR_BYTES);
524 c->gs_sel = __vmread(GUEST_GS_SELECTOR);
525 c->gs_limit = __vmread(GUEST_GS_LIMIT);
526 c->gs_base = __vmread(GUEST_GS_BASE);
527 c->gs_arbytes = __vmread(GUEST_GS_AR_BYTES);
529 c->tr_sel = __vmread(GUEST_TR_SELECTOR);
530 c->tr_limit = __vmread(GUEST_TR_LIMIT);
531 c->tr_base = __vmread(GUEST_TR_BASE);
532 c->tr_arbytes = __vmread(GUEST_TR_AR_BYTES);
534 c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR);
535 c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT);
536 c->ldtr_base = __vmread(GUEST_LDTR_BASE);
537 c->ldtr_arbytes = __vmread(GUEST_LDTR_AR_BYTES);
539 c->sysenter_cs = __vmread(GUEST_SYSENTER_CS);
540 c->sysenter_esp = __vmread(GUEST_SYSENTER_ESP);
541 c->sysenter_eip = __vmread(GUEST_SYSENTER_EIP);
543 c->pending_event = 0;
544 c->error_code = 0;
545 if ( ((ev = __vmread(VM_ENTRY_INTR_INFO)) & INTR_INFO_VALID_MASK) &&
546 hvm_event_needs_reinjection((ev >> 8) & 7, ev & 0xff) )
547 {
548 c->pending_event = ev;
549 c->error_code = __vmread(VM_ENTRY_EXCEPTION_ERROR_CODE);
550 }
552 vmx_vmcs_exit(v);
553 }
555 int vmx_vmcs_restore(struct vcpu *v, struct hvm_hw_cpu *c)
556 {
557 unsigned long mfn = 0;
558 p2m_type_t p2mt;
560 if ( c->pending_valid &&
561 ((c->pending_type == 1) || (c->pending_type > 6) ||
562 (c->pending_reserved != 0)) )
563 {
564 gdprintk(XENLOG_ERR, "Invalid pending event 0x%"PRIx32".\n",
565 c->pending_event);
566 return -EINVAL;
567 }
569 if ( c->cr0 & X86_CR0_PG )
570 {
571 mfn = mfn_x(gfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT, &p2mt));
572 if ( !p2m_is_ram(p2mt) || !get_page(mfn_to_page(mfn), v->domain) )
573 {
574 gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%"PRIx64"\n", c->cr3);
575 return -EINVAL;
576 }
577 }
579 if ( v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PG )
580 put_page(pagetable_get_page(v->arch.guest_table));
582 v->arch.guest_table = pagetable_from_pfn(mfn);
584 vmx_vmcs_enter(v);
586 v->arch.hvm_vcpu.guest_cr[0] = c->cr0 | X86_CR0_ET;
587 v->arch.hvm_vcpu.guest_cr[2] = c->cr2;
588 v->arch.hvm_vcpu.guest_cr[3] = c->cr3;
589 v->arch.hvm_vcpu.guest_cr[4] = c->cr4;
590 vmx_update_guest_cr(v, 0);
591 vmx_update_guest_cr(v, 2);
592 vmx_update_guest_cr(v, 4);
594 #ifdef HVM_DEBUG_SUSPEND
595 printk("%s: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
596 __func__, c->cr3, c->cr0, c->cr4);
597 #endif
599 v->arch.hvm_vcpu.guest_efer = c->msr_efer;
600 vmx_update_guest_efer(v);
602 __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
603 __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
605 __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
606 __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
608 __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
609 __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
610 __vmwrite(GUEST_CS_BASE, c->cs_base);
611 __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes);
613 __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
614 __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
615 __vmwrite(GUEST_DS_BASE, c->ds_base);
616 __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes);
618 __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
619 __vmwrite(GUEST_ES_LIMIT, c->es_limit);
620 __vmwrite(GUEST_ES_BASE, c->es_base);
621 __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes);
623 __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
624 __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
625 __vmwrite(GUEST_SS_BASE, c->ss_base);
626 __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes);
628 __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
629 __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
630 __vmwrite(GUEST_FS_BASE, c->fs_base);
631 __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes);
633 __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
634 __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
635 __vmwrite(GUEST_GS_BASE, c->gs_base);
636 __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes);
638 __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
639 __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
640 __vmwrite(GUEST_TR_BASE, c->tr_base);
641 __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes);
643 __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
644 __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
645 __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
646 __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes);
648 __vmwrite(GUEST_SYSENTER_CS, c->sysenter_cs);
649 __vmwrite(GUEST_SYSENTER_ESP, c->sysenter_esp);
650 __vmwrite(GUEST_SYSENTER_EIP, c->sysenter_eip);
652 __vmwrite(GUEST_DR7, c->dr7);
654 vmx_vmcs_exit(v);
656 paging_update_paging_modes(v);
658 if ( c->pending_valid )
659 {
660 gdprintk(XENLOG_INFO, "Re-injecting 0x%"PRIx32", 0x%"PRIx32"\n",
661 c->pending_event, c->error_code);
663 if ( hvm_event_needs_reinjection(c->pending_type, c->pending_vector) )
664 {
665 vmx_vmcs_enter(v);
666 __vmwrite(VM_ENTRY_INTR_INFO, c->pending_event);
667 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, c->error_code);
668 vmx_vmcs_exit(v);
669 }
670 }
672 return 0;
673 }
675 #if defined(__x86_64__) && defined(HVM_DEBUG_SUSPEND)
676 static void dump_msr_state(struct vmx_msr_state *m)
677 {
678 int i = 0;
679 printk("**** msr state ****\n");
680 printk("shadow_gs=0x%lx, flags=0x%lx, msr_items:", m->shadow_gs, m->flags);
681 for ( i = 0; i < VMX_MSR_COUNT; i++ )
682 printk("0x%lx,", m->msrs[i]);
683 printk("\n");
684 }
685 #else
686 #define dump_msr_state(m) ((void)0)
687 #endif
689 static void vmx_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
690 {
691 #ifdef __x86_64__
692 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
693 unsigned long guest_flags = guest_state->flags;
695 data->shadow_gs = v->arch.hvm_vmx.shadow_gs;
696 data->msr_cstar = v->arch.hvm_vmx.cstar;
698 /* save msrs */
699 data->msr_flags = guest_flags;
700 data->msr_lstar = guest_state->msrs[VMX_INDEX_MSR_LSTAR];
701 data->msr_star = guest_state->msrs[VMX_INDEX_MSR_STAR];
702 data->msr_syscall_mask = guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
703 #endif
705 data->tsc = hvm_get_guest_time(v);
707 dump_msr_state(guest_state);
708 }
710 static void vmx_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
711 {
712 #ifdef __x86_64__
713 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
715 /* restore msrs */
716 guest_state->flags = data->msr_flags;
717 guest_state->msrs[VMX_INDEX_MSR_LSTAR] = data->msr_lstar;
718 guest_state->msrs[VMX_INDEX_MSR_STAR] = data->msr_star;
719 guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK] = data->msr_syscall_mask;
721 v->arch.hvm_vmx.cstar = data->msr_cstar;
722 v->arch.hvm_vmx.shadow_gs = data->shadow_gs;
723 #endif
725 v->arch.hvm_vmx.vmxassist_enabled = !(data->cr0 & X86_CR0_PE);
727 hvm_set_guest_time(v, data->tsc);
729 dump_msr_state(guest_state);
730 }
733 static void vmx_save_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
734 {
735 vmx_save_cpu_state(v, ctxt);
736 vmx_vmcs_save(v, ctxt);
737 }
739 static int vmx_load_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
740 {
741 vmx_load_cpu_state(v, ctxt);
743 if ( vmx_vmcs_restore(v, ctxt) )
744 {
745 gdprintk(XENLOG_ERR, "vmx_vmcs restore failed!\n");
746 domain_crash(v->domain);
747 return -EINVAL;
748 }
750 return 0;
751 }
753 /*
754 * DR7 is saved and restored on every vmexit. Other debug registers only
755 * need to be restored if their value is going to affect execution -- i.e.,
756 * if one of the breakpoints is enabled. So mask out all bits that don't
757 * enable some breakpoint functionality.
758 */
759 #define DR7_ACTIVE_MASK 0xff
761 static void vmx_restore_dr(struct vcpu *v)
762 {
763 /* NB. __vmread() is not usable here, so we cannot read from the VMCS. */
764 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
765 __restore_debug_registers(v);
766 }
768 static void vmx_ctxt_switch_from(struct vcpu *v)
769 {
770 vmx_save_guest_msrs(v);
771 vmx_restore_host_msrs();
772 vmx_save_dr(v);
773 }
775 static void vmx_ctxt_switch_to(struct vcpu *v)
776 {
777 vmx_restore_guest_msrs(v);
778 vmx_restore_dr(v);
779 }
781 static unsigned long vmx_get_segment_base(struct vcpu *v, enum x86_segment seg)
782 {
783 unsigned long base = 0;
784 int long_mode = 0;
786 ASSERT(v == current);
788 if ( hvm_long_mode_enabled(v) &&
789 (__vmread(GUEST_CS_AR_BYTES) & X86_SEG_AR_CS_LM_ACTIVE) )
790 long_mode = 1;
792 switch ( seg )
793 {
794 case x86_seg_cs: if ( !long_mode ) base = __vmread(GUEST_CS_BASE); break;
795 case x86_seg_ds: if ( !long_mode ) base = __vmread(GUEST_DS_BASE); break;
796 case x86_seg_es: if ( !long_mode ) base = __vmread(GUEST_ES_BASE); break;
797 case x86_seg_fs: base = __vmread(GUEST_FS_BASE); break;
798 case x86_seg_gs: base = __vmread(GUEST_GS_BASE); break;
799 case x86_seg_ss: if ( !long_mode ) base = __vmread(GUEST_SS_BASE); break;
800 case x86_seg_tr: base = __vmread(GUEST_TR_BASE); break;
801 case x86_seg_gdtr: base = __vmread(GUEST_GDTR_BASE); break;
802 case x86_seg_idtr: base = __vmread(GUEST_IDTR_BASE); break;
803 case x86_seg_ldtr: base = __vmread(GUEST_LDTR_BASE); break;
804 default: BUG(); break;
805 }
807 return base;
808 }
810 static void vmx_get_segment_register(struct vcpu *v, enum x86_segment seg,
811 struct segment_register *reg)
812 {
813 uint32_t attr = 0;
815 ASSERT(v == current);
817 switch ( seg )
818 {
819 case x86_seg_cs:
820 reg->sel = __vmread(GUEST_CS_SELECTOR);
821 reg->limit = __vmread(GUEST_CS_LIMIT);
822 reg->base = __vmread(GUEST_CS_BASE);
823 attr = __vmread(GUEST_CS_AR_BYTES);
824 break;
825 case x86_seg_ds:
826 reg->sel = __vmread(GUEST_DS_SELECTOR);
827 reg->limit = __vmread(GUEST_DS_LIMIT);
828 reg->base = __vmread(GUEST_DS_BASE);
829 attr = __vmread(GUEST_DS_AR_BYTES);
830 break;
831 case x86_seg_es:
832 reg->sel = __vmread(GUEST_ES_SELECTOR);
833 reg->limit = __vmread(GUEST_ES_LIMIT);
834 reg->base = __vmread(GUEST_ES_BASE);
835 attr = __vmread(GUEST_ES_AR_BYTES);
836 break;
837 case x86_seg_fs:
838 reg->sel = __vmread(GUEST_FS_SELECTOR);
839 reg->limit = __vmread(GUEST_FS_LIMIT);
840 reg->base = __vmread(GUEST_FS_BASE);
841 attr = __vmread(GUEST_FS_AR_BYTES);
842 break;
843 case x86_seg_gs:
844 reg->sel = __vmread(GUEST_GS_SELECTOR);
845 reg->limit = __vmread(GUEST_GS_LIMIT);
846 reg->base = __vmread(GUEST_GS_BASE);
847 attr = __vmread(GUEST_GS_AR_BYTES);
848 break;
849 case x86_seg_ss:
850 reg->sel = __vmread(GUEST_SS_SELECTOR);
851 reg->limit = __vmread(GUEST_SS_LIMIT);
852 reg->base = __vmread(GUEST_SS_BASE);
853 attr = __vmread(GUEST_SS_AR_BYTES);
854 break;
855 case x86_seg_tr:
856 reg->sel = __vmread(GUEST_TR_SELECTOR);
857 reg->limit = __vmread(GUEST_TR_LIMIT);
858 reg->base = __vmread(GUEST_TR_BASE);
859 attr = __vmread(GUEST_TR_AR_BYTES);
860 break;
861 case x86_seg_gdtr:
862 reg->limit = __vmread(GUEST_GDTR_LIMIT);
863 reg->base = __vmread(GUEST_GDTR_BASE);
864 break;
865 case x86_seg_idtr:
866 reg->limit = __vmread(GUEST_IDTR_LIMIT);
867 reg->base = __vmread(GUEST_IDTR_BASE);
868 break;
869 case x86_seg_ldtr:
870 reg->sel = __vmread(GUEST_LDTR_SELECTOR);
871 reg->limit = __vmread(GUEST_LDTR_LIMIT);
872 reg->base = __vmread(GUEST_LDTR_BASE);
873 attr = __vmread(GUEST_LDTR_AR_BYTES);
874 break;
875 default:
876 BUG();
877 }
879 reg->attr.bytes = (attr & 0xff) | ((attr >> 4) & 0xf00);
880 /* Unusable flag is folded into Present flag. */
881 if ( attr & (1u<<16) )
882 reg->attr.fields.p = 0;
883 }
885 static void vmx_set_segment_register(struct vcpu *v, enum x86_segment seg,
886 struct segment_register *reg)
887 {
888 uint32_t attr;
890 ASSERT(v == current);
892 attr = reg->attr.bytes;
893 attr = ((attr & 0xf00) << 4) | (attr & 0xff);
895 /* Not-present must mean unusable. */
896 if ( !reg->attr.fields.p )
897 attr |= (1u << 16);
899 switch ( seg )
900 {
901 case x86_seg_cs:
902 __vmwrite(GUEST_CS_SELECTOR, reg->sel);
903 __vmwrite(GUEST_CS_LIMIT, reg->limit);
904 __vmwrite(GUEST_CS_BASE, reg->base);
905 __vmwrite(GUEST_CS_AR_BYTES, attr);
906 break;
907 case x86_seg_ds:
908 __vmwrite(GUEST_DS_SELECTOR, reg->sel);
909 __vmwrite(GUEST_DS_LIMIT, reg->limit);
910 __vmwrite(GUEST_DS_BASE, reg->base);
911 __vmwrite(GUEST_DS_AR_BYTES, attr);
912 break;
913 case x86_seg_es:
914 __vmwrite(GUEST_ES_SELECTOR, reg->sel);
915 __vmwrite(GUEST_ES_LIMIT, reg->limit);
916 __vmwrite(GUEST_ES_BASE, reg->base);
917 __vmwrite(GUEST_ES_AR_BYTES, attr);
918 break;
919 case x86_seg_fs:
920 __vmwrite(GUEST_FS_SELECTOR, reg->sel);
921 __vmwrite(GUEST_FS_LIMIT, reg->limit);
922 __vmwrite(GUEST_FS_BASE, reg->base);
923 __vmwrite(GUEST_FS_AR_BYTES, attr);
924 break;
925 case x86_seg_gs:
926 __vmwrite(GUEST_GS_SELECTOR, reg->sel);
927 __vmwrite(GUEST_GS_LIMIT, reg->limit);
928 __vmwrite(GUEST_GS_BASE, reg->base);
929 __vmwrite(GUEST_GS_AR_BYTES, attr);
930 break;
931 case x86_seg_ss:
932 __vmwrite(GUEST_SS_SELECTOR, reg->sel);
933 __vmwrite(GUEST_SS_LIMIT, reg->limit);
934 __vmwrite(GUEST_SS_BASE, reg->base);
935 __vmwrite(GUEST_SS_AR_BYTES, attr);
936 break;
937 case x86_seg_tr:
938 __vmwrite(GUEST_TR_SELECTOR, reg->sel);
939 __vmwrite(GUEST_TR_LIMIT, reg->limit);
940 __vmwrite(GUEST_TR_BASE, reg->base);
941 __vmwrite(GUEST_TR_AR_BYTES, attr);
942 break;
943 case x86_seg_gdtr:
944 __vmwrite(GUEST_GDTR_LIMIT, reg->limit);
945 __vmwrite(GUEST_GDTR_BASE, reg->base);
946 break;
947 case x86_seg_idtr:
948 __vmwrite(GUEST_IDTR_LIMIT, reg->limit);
949 __vmwrite(GUEST_IDTR_BASE, reg->base);
950 break;
951 case x86_seg_ldtr:
952 __vmwrite(GUEST_LDTR_SELECTOR, reg->sel);
953 __vmwrite(GUEST_LDTR_LIMIT, reg->limit);
954 __vmwrite(GUEST_LDTR_BASE, reg->base);
955 __vmwrite(GUEST_LDTR_AR_BYTES, attr);
956 break;
957 default:
958 BUG();
959 }
960 }
962 /* Make sure that xen intercepts any FP accesses from current */
963 static void vmx_stts(struct vcpu *v)
964 {
965 /* VMX depends on operating on the current vcpu */
966 ASSERT(v == current);
968 /*
969 * If the guest does not have TS enabled then we must cause and handle an
970 * exception on first use of the FPU. If the guest *does* have TS enabled
971 * then this is not necessary: no FPU activity can occur until the guest
972 * clears CR0.TS, and we will initialise the FPU when that happens.
973 */
974 if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
975 {
976 v->arch.hvm_vcpu.hw_cr[0] |= X86_CR0_TS;
977 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
978 __vm_set_bit(EXCEPTION_BITMAP, TRAP_no_device);
979 }
980 }
982 static void vmx_set_tsc_offset(struct vcpu *v, u64 offset)
983 {
984 vmx_vmcs_enter(v);
985 __vmwrite(TSC_OFFSET, offset);
986 #if defined (__i386__)
987 __vmwrite(TSC_OFFSET_HIGH, offset >> 32);
988 #endif
989 vmx_vmcs_exit(v);
990 }
992 static void vmx_init_ap_context(
993 struct vcpu_guest_context *ctxt, int vcpuid, int trampoline_vector)
994 {
995 memset(ctxt, 0, sizeof(*ctxt));
996 ctxt->user_regs.eip = VMXASSIST_BASE;
997 ctxt->user_regs.edx = vcpuid;
998 ctxt->user_regs.ebx = trampoline_vector;
999 }
1001 void do_nmi(struct cpu_user_regs *);
1003 static void vmx_init_hypercall_page(struct domain *d, void *hypercall_page)
1005 char *p;
1006 int i;
1008 for ( i = 0; i < (PAGE_SIZE / 32); i++ )
1010 p = (char *)(hypercall_page + (i * 32));
1011 *(u8 *)(p + 0) = 0xb8; /* mov imm32, %eax */
1012 *(u32 *)(p + 1) = i;
1013 *(u8 *)(p + 5) = 0x0f; /* vmcall */
1014 *(u8 *)(p + 6) = 0x01;
1015 *(u8 *)(p + 7) = 0xc1;
1016 *(u8 *)(p + 8) = 0xc3; /* ret */
1019 /* Don't support HYPERVISOR_iret at the moment */
1020 *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
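/*
 * Each 32-byte stub generated above is, in bytes:
 *     b8 <imm32>   mov $hypercall_nr, %eax
 *     0f 01 c1     vmcall
 *     c3           ret
 * The HYPERVISOR_iret slot is overwritten with ud2 (0f 0b) since that
 * hypercall is not supported here.
 */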
1023 static int vmx_interrupts_enabled(struct vcpu *v, enum hvm_intack type)
1025 unsigned long intr_shadow;
1027 intr_shadow = __vmread(GUEST_INTERRUPTIBILITY_INFO);
1029 if ( type == hvm_intack_nmi )
1030 return !(intr_shadow & (VMX_INTR_SHADOW_STI|
1031 VMX_INTR_SHADOW_MOV_SS|
1032 VMX_INTR_SHADOW_NMI));
1034 ASSERT((type == hvm_intack_pic) || (type == hvm_intack_lapic));
1035 return (!irq_masked(guest_cpu_user_regs()->eflags) &&
1036 !(intr_shadow & (VMX_INTR_SHADOW_STI|VMX_INTR_SHADOW_MOV_SS)));
1039 static void vmx_update_host_cr3(struct vcpu *v)
1041 ASSERT((v == current) || !vcpu_runnable(v));
1042 vmx_vmcs_enter(v);
1043 __vmwrite(HOST_CR3, v->arch.cr3);
1044 vmx_vmcs_exit(v);
1047 static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr)
1049 ASSERT((v == current) || !vcpu_runnable(v));
1051 vmx_vmcs_enter(v);
1053 switch ( cr )
1055 case 0:
1056 v->arch.hvm_vcpu.hw_cr[0] =
1057 v->arch.hvm_vcpu.guest_cr[0] |
1058 X86_CR0_PE | X86_CR0_NE | X86_CR0_PG | X86_CR0_WP;
1059 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
1060 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[0]);
1061 break;
1062 case 2:
1063 /* CR2 is updated in exit stub. */
1064 break;
1065 case 3:
1066 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr[3]);
1067 break;
1068 case 4:
1069 v->arch.hvm_vcpu.hw_cr[4] =
1070 v->arch.hvm_vcpu.guest_cr[4] | HVM_CR4_HOST_MASK;
1071 __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
1072 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[4]);
1073 break;
1074 default:
1075 BUG();
1078 vmx_vmcs_exit(v);
1081 static void vmx_update_guest_efer(struct vcpu *v)
1083 #ifdef __x86_64__
1084 unsigned long vm_entry_value;
1086 ASSERT((v == current) || !vcpu_runnable(v));
1088 vmx_vmcs_enter(v);
1090 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
1091 if ( v->arch.hvm_vcpu.guest_efer & EFER_LMA )
1092 vm_entry_value |= VM_ENTRY_IA32E_MODE;
1093 else
1094 vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
1095 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1097 vmx_vmcs_exit(v);
1098 #endif
1101 static void vmx_flush_guest_tlbs(void)
1103 /* No tagged TLB support on VMX yet. The fact that we're in Xen
1104 * at all means any guest will have a clean TLB when it's next run,
1105 * because VMRESUME will flush it for us. */
1108 static void vmx_inject_exception(
1109 unsigned int trapnr, int errcode, unsigned long cr2)
1111 struct vcpu *v = current;
1112 vmx_inject_hw_exception(v, trapnr, errcode);
1113 if ( trapnr == TRAP_page_fault )
1114 v->arch.hvm_vcpu.guest_cr[2] = cr2;
1117 static void vmx_update_vtpr(struct vcpu *v, unsigned long value)
1119 /* VMX doesn't have a V_TPR field */
1122 static int vmx_event_pending(struct vcpu *v)
1124 ASSERT(v == current);
1125 return (__vmread(VM_ENTRY_INTR_INFO) & INTR_INFO_VALID_MASK);
1128 static void disable_intercept_for_msr(u32 msr)
1130 /*
1131 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
1132 * have the write-low and read-high bitmap offsets the wrong way round.
1133 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
1134 */
1135 if ( msr <= 0x1fff )
1137 __clear_bit(msr, vmx_msr_bitmap + 0x000); /* read-low */
1138 __clear_bit(msr, vmx_msr_bitmap + 0x800); /* write-low */
1140 else if ( (msr >= 0xc0000000) && (msr <= 0xc0001fff) )
1142 msr &= 0x1fff;
1143 __clear_bit(msr, vmx_msr_bitmap + 0x400); /* read-high */
1144 __clear_bit(msr, vmx_msr_bitmap + 0xc00); /* write-high */
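/*
 * The MSR bitmap is a single 4K page split into four 1K blocks: read-low at
 * offset 0x000, read-high at 0x400, write-low at 0x800 and write-high at
 * 0xc00. Clearing a bit disables the VM exit for that MSR access.
 */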
1148 static struct hvm_function_table vmx_function_table = {
1149 .name = "VMX",
1150 .domain_initialise = vmx_domain_initialise,
1151 .domain_destroy = vmx_domain_destroy,
1152 .vcpu_initialise = vmx_vcpu_initialise,
1153 .vcpu_destroy = vmx_vcpu_destroy,
1154 .save_cpu_ctxt = vmx_save_vmcs_ctxt,
1155 .load_cpu_ctxt = vmx_load_vmcs_ctxt,
1156 .interrupts_enabled = vmx_interrupts_enabled,
1157 .guest_x86_mode = vmx_guest_x86_mode,
1158 .get_segment_base = vmx_get_segment_base,
1159 .get_segment_register = vmx_get_segment_register,
1160 .set_segment_register = vmx_set_segment_register,
1161 .update_host_cr3 = vmx_update_host_cr3,
1162 .update_guest_cr = vmx_update_guest_cr,
1163 .update_guest_efer = vmx_update_guest_efer,
1164 .flush_guest_tlbs = vmx_flush_guest_tlbs,
1165 .update_vtpr = vmx_update_vtpr,
1166 .stts = vmx_stts,
1167 .set_tsc_offset = vmx_set_tsc_offset,
1168 .inject_exception = vmx_inject_exception,
1169 .init_ap_context = vmx_init_ap_context,
1170 .init_hypercall_page = vmx_init_hypercall_page,
1171 .event_pending = vmx_event_pending,
1172 .cpu_up = vmx_cpu_up,
1173 .cpu_down = vmx_cpu_down,
1174 };
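/* This table is handed to the generic HVM layer via hvm_enable() below. */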
1176 void start_vmx(void)
1178 static int bootstrapped;
1180 vmx_save_host_msrs();
1182 if ( bootstrapped )
1184 if ( hvm_enabled && !vmx_cpu_up() )
1186 printk("VMX: FATAL: failed to initialise CPU%d!\n",
1187 smp_processor_id());
1188 BUG();
1190 return;
1193 bootstrapped = 1;
1195 /* Xen does not fill x86_capability words except 0. */
1196 boot_cpu_data.x86_capability[4] = cpuid_ecx(1);
1198 if ( !test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability) )
1199 return;
1201 set_in_cr4(X86_CR4_VMXE);
1203 if ( !vmx_cpu_up() )
1205 printk("VMX: failed to initialise.\n");
1206 return;
1209 setup_vmcs_dump();
1211 hvm_enable(&vmx_function_table);
1213 if ( cpu_has_vmx_msr_bitmap )
1215 printk("VMX: MSR intercept bitmap enabled\n");
1216 vmx_msr_bitmap = alloc_xenheap_page();
1217 BUG_ON(vmx_msr_bitmap == NULL);
1218 memset(vmx_msr_bitmap, ~0, PAGE_SIZE);
1220 disable_intercept_for_msr(MSR_FS_BASE);
1221 disable_intercept_for_msr(MSR_GS_BASE);
1223 disable_intercept_for_msr(MSR_IA32_SYSENTER_CS);
1224 disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP);
1225 disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP);
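/*
 * The bitmap starts out all-ones (every MSR access intercepted); only the
 * MSRs listed above are passed straight through to the guest.
 */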
1229 /*
1230 * Not all exit cases receive a valid value in the VM-exit instruction length field.
1231 * Callers must know what they're doing!
1232 */
1233 static int __get_instruction_length(void)
1235 int len;
1236 len = __vmread(VM_EXIT_INSTRUCTION_LEN); /* Safe: callers audited */
1237 BUG_ON((len < 1) || (len > 15));
1238 return len;
1241 static void __update_guest_eip(unsigned long inst_len)
1243 struct cpu_user_regs *regs = guest_cpu_user_regs();
1244 unsigned long x;
1246 regs->eip += inst_len;
1247 regs->eflags &= ~X86_EFLAGS_RF;
1249 x = __vmread(GUEST_INTERRUPTIBILITY_INFO);
1250 if ( x & (VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS) )
1252 x &= ~(VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS);
1253 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, x);
1257 static void vmx_do_no_device_fault(void)
1259 struct vcpu *v = current;
1261 setup_fpu(current);
1262 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
1264 /* Disable TS in guest CR0 unless the guest wants the exception too. */
1265 if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
1267 v->arch.hvm_vcpu.hw_cr[0] &= ~X86_CR0_TS;
1268 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
1272 #define bitmaskof(idx) (1U << ((idx) & 31))
1273 static void vmx_do_cpuid(struct cpu_user_regs *regs)
1275 unsigned int input = (unsigned int)regs->eax;
1276 unsigned int count = (unsigned int)regs->ecx;
1277 unsigned int eax, ebx, ecx, edx;
1279 if ( input == 0x00000004 )
1281 cpuid_count(input, count, &eax, &ebx, &ecx, &edx);
1282 eax &= NUM_CORES_RESET_MASK;
1284 else if ( input == 0x40000003 )
1286 /*
1287 * NB. Unsupported interface for private use of VMXASSIST only.
1288 * Note that this leaf lives at <max-hypervisor-leaf> + 1.
1289 */
1290 u64 value = ((u64)regs->edx << 32) | (u32)regs->ecx;
1291 p2m_type_t p2mt;
1292 unsigned long mfn;
1293 struct vcpu *v = current;
1294 char *p;
1296 mfn = mfn_x(gfn_to_mfn_current(value >> PAGE_SHIFT, &p2mt));
1298 gdprintk(XENLOG_INFO, "Input address is 0x%"PRIx64".\n", value);
1300 /* 8-byte aligned valid pseudophys address from vmxassist, please. */
1301 if ( (value & 7) || !p2m_is_ram(p2mt) ||
1302 !v->arch.hvm_vmx.vmxassist_enabled )
1304 domain_crash(v->domain);
1305 return;
1307 ASSERT(mfn_valid(mfn));
1309 p = map_domain_page(mfn);
1310 value = *((uint64_t *)(p + (value & (PAGE_SIZE - 1))));
1311 unmap_domain_page(p);
1313 gdprintk(XENLOG_INFO, "Output value is 0x%"PRIx64".\n", value);
1314 ecx = (u32)value;
1315 edx = (u32)(value >> 32);
1316 } else {
1317 hvm_cpuid(input, &eax, &ebx, &ecx, &edx);
1319 if ( input == 0x00000001 )
1321 /* Mask off reserved bits. */
1322 ecx &= ~VMX_VCPU_CPUID_L1_ECX_RESERVED;
1324 ebx &= NUM_THREADS_RESET_MASK;
1326 /* Unsupportable for virtualised CPUs. */
1327 ecx &= ~(bitmaskof(X86_FEATURE_VMXE) |
1328 bitmaskof(X86_FEATURE_EST) |
1329 bitmaskof(X86_FEATURE_TM2) |
1330 bitmaskof(X86_FEATURE_CID));
1332 edx &= ~(bitmaskof(X86_FEATURE_HT) |
1333 bitmaskof(X86_FEATURE_ACPI) |
1334 bitmaskof(X86_FEATURE_ACC));
1337 if ( input == 0x00000006 || input == 0x00000009 || input == 0x0000000A )
1338 eax = ebx = ecx = edx = 0x0;
1341 regs->eax = (unsigned long)eax;
1342 regs->ebx = (unsigned long)ebx;
1343 regs->ecx = (unsigned long)ecx;
1344 regs->edx = (unsigned long)edx;
1346 HVMTRACE_3D(CPUID, current, input,
1347 ((uint64_t)eax << 32) | ebx, ((uint64_t)ecx << 32) | edx);
1350 #define CASE_GET_REG_P(REG, reg) \
1351 case REG_ ## REG: reg_p = (unsigned long *)&(regs->reg); break
1353 #ifdef __i386__
1354 #define CASE_EXTEND_GET_REG_P
1355 #else
1356 #define CASE_EXTEND_GET_REG_P \
1357 CASE_GET_REG_P(R8, r8); \
1358 CASE_GET_REG_P(R9, r9); \
1359 CASE_GET_REG_P(R10, r10); \
1360 CASE_GET_REG_P(R11, r11); \
1361 CASE_GET_REG_P(R12, r12); \
1362 CASE_GET_REG_P(R13, r13); \
1363 CASE_GET_REG_P(R14, r14); \
1364 CASE_GET_REG_P(R15, r15)
1365 #endif
1367 static void vmx_dr_access(unsigned long exit_qualification,
1368 struct cpu_user_regs *regs)
1370 struct vcpu *v = current;
1372 HVMTRACE_0D(DR_WRITE, v);
1374 v->arch.hvm_vcpu.flag_dr_dirty = 1;
1376 /* We could probably be smarter about this */
1377 __restore_debug_registers(v);
1379 /* Allow guest direct access to DR registers */
1380 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MOV_DR_EXITING;
1381 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
1382 v->arch.hvm_vmx.exec_control);
1385 /*
1386 * Invalidate the TLB for va. Invalidate the shadow page corresponding
1387 * to the address va.
1388 */
1389 static void vmx_do_invlpg(unsigned long va)
1391 struct vcpu *v = current;
1393 HVMTRACE_2D(INVLPG, v, /*invlpga=*/ 0, va);
1395 /*
1396 * We do the safest thing first, then try to update the shadow
1397 * copy from the guest.
1398 */
1399 paging_invlpg(v, va);
1402 /* Get segment for OUTS according to guest instruction. */
1403 static enum x86_segment vmx_outs_get_segment(
1404 int long_mode, unsigned long eip, int inst_len)
1406 unsigned char inst[MAX_INST_LEN];
1407 enum x86_segment seg = x86_seg_ds;
1408 int i;
1409 extern int inst_copy_from_guest(unsigned char *, unsigned long, int);
1411 if ( likely(cpu_has_vmx_ins_outs_instr_info) )
1413 unsigned int instr_info = __vmread(VMX_INSTRUCTION_INFO);
1415 /* Get segment register according to bits 17:15. */
1416 switch ( (instr_info >> 15) & 7 )
1418 case 0: seg = x86_seg_es; break;
1419 case 1: seg = x86_seg_cs; break;
1420 case 2: seg = x86_seg_ss; break;
1421 case 3: seg = x86_seg_ds; break;
1422 case 4: seg = x86_seg_fs; break;
1423 case 5: seg = x86_seg_gs; break;
1424 default: BUG();
1427 goto out;
1430 if ( !long_mode )
1431 eip += __vmread(GUEST_CS_BASE);
1433 memset(inst, 0, MAX_INST_LEN);
1434 if ( inst_copy_from_guest(inst, eip, inst_len) != inst_len )
1436 gdprintk(XENLOG_ERR, "Failed to fetch guest instruction\n");
1437 domain_crash(current->domain);
1438 goto out;
1441 for ( i = 0; i < inst_len; i++ )
1443 switch ( inst[i] )
1445 case 0xf3: /* REPZ */
1446 case 0xf2: /* REPNZ */
1447 case 0xf0: /* LOCK */
1448 case 0x66: /* data32 */
1449 case 0x67: /* addr32 */
1450 #ifdef __x86_64__
1451 case 0x40 ... 0x4f: /* REX */
1452 #endif
1453 continue;
1454 case 0x2e: /* CS */
1455 seg = x86_seg_cs;
1456 continue;
1457 case 0x36: /* SS */
1458 seg = x86_seg_ss;
1459 continue;
1460 case 0x26: /* ES */
1461 seg = x86_seg_es;
1462 continue;
1463 case 0x64: /* FS */
1464 seg = x86_seg_fs;
1465 continue;
1466 case 0x65: /* GS */
1467 seg = x86_seg_gs;
1468 continue;
1469 case 0x3e: /* DS */
1470 seg = x86_seg_ds;
1471 continue;
1475 out:
1476 return seg;
1479 static int vmx_str_pio_check_descriptor(int long_mode, unsigned long eip,
1480 int inst_len, enum x86_segment seg,
1481 unsigned long *base, u32 *limit,
1482 u32 *ar_bytes)
1484 enum vmcs_field ar_field, base_field, limit_field;
1486 *base = 0;
1487 *limit = 0;
1488 if ( seg != x86_seg_es )
1489 seg = vmx_outs_get_segment(long_mode, eip, inst_len);
1491 switch ( seg )
1493 case x86_seg_cs:
1494 ar_field = GUEST_CS_AR_BYTES;
1495 base_field = GUEST_CS_BASE;
1496 limit_field = GUEST_CS_LIMIT;
1497 break;
1498 case x86_seg_ds:
1499 ar_field = GUEST_DS_AR_BYTES;
1500 base_field = GUEST_DS_BASE;
1501 limit_field = GUEST_DS_LIMIT;
1502 break;
1503 case x86_seg_es:
1504 ar_field = GUEST_ES_AR_BYTES;
1505 base_field = GUEST_ES_BASE;
1506 limit_field = GUEST_ES_LIMIT;
1507 break;
1508 case x86_seg_fs:
1509 ar_field = GUEST_FS_AR_BYTES;
1510 base_field = GUEST_FS_BASE;
1511 limit_field = GUEST_FS_LIMIT;
1512 break;
1513 case x86_seg_gs:
1514 ar_field = GUEST_GS_AR_BYTES;
1515 base_field = GUEST_GS_BASE;
1516 limit_field = GUEST_GS_LIMIT;
1517 break;
1518 case x86_seg_ss:
1519 ar_field = GUEST_SS_AR_BYTES;
1520 base_field = GUEST_SS_BASE;
1521 limit_field = GUEST_SS_LIMIT;
1522 break;
1523 default:
1524 BUG();
1525 return 0;
1528 if ( !long_mode || seg == x86_seg_fs || seg == x86_seg_gs )
1530 *base = __vmread(base_field);
1531 *limit = __vmread(limit_field);
1533 *ar_bytes = __vmread(ar_field);
1535 return !(*ar_bytes & X86_SEG_AR_SEG_UNUSABLE);
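/*
 * Returns 0 if the selected segment is unusable (outside long mode the
 * caller then raises #GP). Base and limit are only fetched for FS/GS or
 * when not in long mode, since the other segment bases are treated as zero
 * in 64-bit mode.
 */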
1539 static int vmx_str_pio_check_limit(u32 limit, unsigned int size,
1540 u32 ar_bytes, unsigned long addr,
1541 unsigned long base, int df,
1542 unsigned long *count)
1544 unsigned long ea = addr - base;
1546 /* Offset must be within limits. */
1547 ASSERT(ea == (u32)ea);
1548 if ( (u32)(ea + size - 1) < (u32)ea ||
1549 (ar_bytes & 0xc) != 0x4 ? ea + size - 1 > limit
1550 : ea <= limit )
1551 return 0;
1553 /* Check the limit for repeated instructions, as above we checked
1554 only the first instance. Truncate the count if a limit violation
1555 would occur. Note that the checking is not necessary for page
1556 granular segments as transfers crossing page boundaries will be
1557 broken up anyway. */
1558 if ( !(ar_bytes & X86_SEG_AR_GRANULARITY) && *count > 1 )
1560 if ( (ar_bytes & 0xc) != 0x4 )
1562 /* expand-up */
1563 if ( !df )
1565 if ( ea + *count * size - 1 < ea ||
1566 ea + *count * size - 1 > limit )
1567 *count = (limit + 1UL - ea) / size;
1569 else
1571 if ( *count - 1 > ea / size )
1572 *count = ea / size + 1;
1575 else
1577 /* expand-down */
1578 if ( !df )
1580 if ( *count - 1 > -(s32)ea / size )
1581 *count = -(s32)ea / size + 1UL;
1583 else
1585 if ( ea < (*count - 1) * size ||
1586 ea - (*count - 1) * size <= limit )
1587 *count = (ea - limit - 1) / size + 1;
1590 ASSERT(*count);
1593 return 1;
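/*
 * Note the two cases above: for ordinary (expand-up) segments the offset
 * plus access size must not exceed the limit, while for expand-down data
 * segments ((ar_bytes & 0xc) == 0x4) valid offsets lie strictly above the
 * limit, hence the inverted comparisons.
 */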
1596 #ifdef __x86_64__
1597 static int vmx_str_pio_lm_check_limit(struct cpu_user_regs *regs,
1598 unsigned int size,
1599 unsigned long addr,
1600 unsigned long *count)
1602 if ( !is_canonical_address(addr) ||
1603 !is_canonical_address(addr + size - 1) )
1604 return 0;
1606 if ( *count > (1UL << 48) / size )
1607 *count = (1UL << 48) / size;
1609 if ( !(regs->eflags & EF_DF) )
1611 if ( addr + *count * size - 1 < addr ||
1612 !is_canonical_address(addr + *count * size - 1) )
1613 *count = (addr & ~((1UL << 48) - 1)) / size;
1615 else
1617 if ( (*count - 1) * size > addr ||
1618 !is_canonical_address(addr + (*count - 1) * size) )
1619 *count = (addr & ~((1UL << 48) - 1)) / size + 1;
1622 ASSERT(*count);
1624 return 1;
1626 #endif
1628 static void vmx_send_str_pio(struct cpu_user_regs *regs,
1629 struct hvm_io_op *pio_opp,
1630 unsigned long inst_len, unsigned int port,
1631 int sign, unsigned int size, int dir,
1632 int df, unsigned long addr,
1633 unsigned long paddr, unsigned long count)
1635 /*
1636 * Handle string pio instructions that cross pages or that
1637 * are unaligned. See the comments in hvm_domain.c/handle_mmio()
1638 */
1639 if ( (addr & PAGE_MASK) != ((addr + size - 1) & PAGE_MASK) ) {
1640 unsigned long value = 0;
1642 pio_opp->flags |= OVERLAP;
1644 if ( dir == IOREQ_WRITE ) /* OUTS */
1646 if ( hvm_paging_enabled(current) )
1648 int rv = hvm_copy_from_guest_virt(&value, addr, size);
1649 if ( rv != 0 )
1651 /* Failed on the page-spanning copy. Inject PF into
1652 * the guest for the address where we failed. */
1653 addr += size - rv;
1654 gdprintk(XENLOG_DEBUG, "Pagefault reading non-io side "
1655 "of a page-spanning PIO: va=%#lx\n", addr);
1656 vmx_inject_exception(TRAP_page_fault, 0, addr);
1657 return;
1660 else
1661 (void) hvm_copy_from_guest_phys(&value, addr, size);
1662 } else /* dir != IOREQ_WRITE */
1663 /* Remember where to write the result, as a *VA*.
1664 * Must be a VA so we can handle the page overlap
1665 * correctly in hvm_pio_assist() */
1666 pio_opp->addr = addr;
1668 if ( count == 1 )
1669 regs->eip += inst_len;
1671 send_pio_req(port, 1, size, value, dir, df, 0);
1672 } else {
1673 unsigned long last_addr = sign > 0 ? addr + count * size - 1
1674 : addr - (count - 1) * size;
1676 if ( (addr & PAGE_MASK) != (last_addr & PAGE_MASK) )
1678 if ( sign > 0 )
1679 count = (PAGE_SIZE - (addr & ~PAGE_MASK)) / size;
1680 else
1681 count = (addr & ~PAGE_MASK) / size + 1;
1682 } else
1683 regs->eip += inst_len;
1685 send_pio_req(port, count, size, paddr, dir, df, 1);
1689 static void vmx_do_str_pio(unsigned long exit_qualification,
1690 unsigned long inst_len,
1691 struct cpu_user_regs *regs,
1692 struct hvm_io_op *pio_opp)
1694 unsigned int port, size;
1695 int dir, df, vm86;
1696 unsigned long addr, count = 1, base;
1697 paddr_t paddr;
1698 unsigned long gfn;
1699 u32 ar_bytes, limit;
1700 int sign;
1701 int long_mode = 0;
1703 vm86 = regs->eflags & X86_EFLAGS_VM ? 1 : 0;
1704 df = regs->eflags & X86_EFLAGS_DF ? 1 : 0;
1706 if ( test_bit(6, &exit_qualification) )
1707 port = (exit_qualification >> 16) & 0xFFFF;
1708 else
1709 port = regs->edx & 0xffff;
1711 size = (exit_qualification & 7) + 1;
1712 dir = test_bit(3, &exit_qualification); /* direction */
1714 if ( dir == IOREQ_READ )
1715 HVMTRACE_2D(IO_READ, current, port, size);
1716 else
1717 HVMTRACE_2D(IO_WRITE, current, port, size);
1719 sign = regs->eflags & X86_EFLAGS_DF ? -1 : 1;
1720 ar_bytes = __vmread(GUEST_CS_AR_BYTES);
1721 if ( hvm_long_mode_enabled(current) &&
1722 (ar_bytes & X86_SEG_AR_CS_LM_ACTIVE) )
1723 long_mode = 1;
1724 addr = __vmread(GUEST_LINEAR_ADDRESS);
1726 if ( test_bit(5, &exit_qualification) ) { /* "rep" prefix */
1727 pio_opp->flags |= REPZ;
1728 count = regs->ecx;
1729 if ( !long_mode &&
1730 (vm86 || !(ar_bytes & X86_SEG_AR_DEF_OP_SIZE)) )
1731 count &= 0xFFFF;
1734 /*
1735 * In protected mode, guest linear address is invalid if the
1736 * selector is null.
1737 */
1738 if ( !vmx_str_pio_check_descriptor(long_mode, regs->eip, inst_len,
1739 dir==IOREQ_WRITE ? x86_seg_ds :
1740 x86_seg_es, &base, &limit,
1741 &ar_bytes) ) {
1742 if ( !long_mode ) {
1743 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1744 return;
1746 addr = dir == IOREQ_WRITE ? base + regs->esi : regs->edi;
1749 if ( !long_mode )
1751 /* Segment must be readable for outs and writeable for ins. */
1752 if ( ((dir == IOREQ_WRITE)
1753 ? ((ar_bytes & 0xa) == 0x8)
1754 : ((ar_bytes & 0xa) != 0x2)) ||
1755 !vmx_str_pio_check_limit(limit, size, ar_bytes,
1756 addr, base, df, &count) )
1758 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1759 return;
1762 #ifdef __x86_64__
1763 else if ( !vmx_str_pio_lm_check_limit(regs, size, addr, &count) )
1765 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1766 return;
1768 #endif
1770 /* Translate the address to a physical address */
1771 gfn = paging_gva_to_gfn(current, addr);
1772 if ( gfn == INVALID_GFN )
1774 /* The guest does not have the RAM address mapped.
1775 * Need to send in a page fault */
1776 int errcode = 0;
1777 /* IO read --> memory write */
1778 if ( dir == IOREQ_READ ) errcode |= PFEC_write_access;
1779 vmx_inject_exception(TRAP_page_fault, errcode, addr);
1780 return;
1782 paddr = (paddr_t)gfn << PAGE_SHIFT | (addr & ~PAGE_MASK);
1784 vmx_send_str_pio(regs, pio_opp, inst_len, port, sign,
1785 size, dir, df, addr, paddr, count);
1788 static void vmx_io_instruction(unsigned long exit_qualification,
1789 unsigned long inst_len)
1791 struct cpu_user_regs *regs;
1792 struct hvm_io_op *pio_opp;
1794 pio_opp = &current->arch.hvm_vcpu.io_op;
1795 pio_opp->instr = INSTR_PIO;
1796 pio_opp->flags = 0;
1798 regs = &pio_opp->io_context;
1800 /* Copy current guest state into io instruction state structure. */
1801 memcpy(regs, guest_cpu_user_regs(), HVM_CONTEXT_STACK_BYTES);
1803 HVM_DBG_LOG(DBG_LEVEL_IO, "vm86 %d, eip=%x:%lx, "
1804 "exit_qualification = %lx",
1805 regs->eflags & X86_EFLAGS_VM ? 1 : 0,
1806 regs->cs, (unsigned long)regs->eip, exit_qualification);
1808 if ( test_bit(4, &exit_qualification) ) /* string instruction */
1809 vmx_do_str_pio(exit_qualification, inst_len, regs, pio_opp);
1810 else
1812 unsigned int port, size;
1813 int dir, df;
1815 df = regs->eflags & X86_EFLAGS_DF ? 1 : 0;
1817 if ( test_bit(6, &exit_qualification) )
1818 port = (exit_qualification >> 16) & 0xFFFF;
1819 else
1820 port = regs->edx & 0xffff;
1822 size = (exit_qualification & 7) + 1;
1823 dir = test_bit(3, &exit_qualification); /* direction */
1825 if ( dir == IOREQ_READ )
1826 HVMTRACE_2D(IO_READ, current, port, size);
1827 else
1828 HVMTRACE_3D(IO_WRITE, current, port, size, regs->eax);
1830 if ( port == 0xe9 && dir == IOREQ_WRITE && size == 1 )
1831 hvm_print_line(current, regs->eax); /* guest debug output */
1833 regs->eip += inst_len;
1834 send_pio_req(port, 1, size, regs->eax, dir, df, 0);
1838 static void vmx_world_save(struct vcpu *v, struct vmx_assist_context *c)
1840 struct cpu_user_regs *regs = guest_cpu_user_regs();
1842 c->eip = regs->eip;
1843 c->eip += __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
1844 c->esp = regs->esp;
1845 c->eflags = regs->eflags & ~X86_EFLAGS_RF;
1847 c->cr0 = v->arch.hvm_vcpu.guest_cr[0];
1848 c->cr3 = v->arch.hvm_vcpu.guest_cr[3];
1849 c->cr4 = v->arch.hvm_vcpu.guest_cr[4];
1851 c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
1852 c->idtr_base = __vmread(GUEST_IDTR_BASE);
1854 c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
1855 c->gdtr_base = __vmread(GUEST_GDTR_BASE);
1857 c->cs_sel = __vmread(GUEST_CS_SELECTOR);
1858 c->cs_limit = __vmread(GUEST_CS_LIMIT);
1859 c->cs_base = __vmread(GUEST_CS_BASE);
1860 c->cs_arbytes.bytes = __vmread(GUEST_CS_AR_BYTES);
1862 c->ds_sel = __vmread(GUEST_DS_SELECTOR);
1863 c->ds_limit = __vmread(GUEST_DS_LIMIT);
1864 c->ds_base = __vmread(GUEST_DS_BASE);
1865 c->ds_arbytes.bytes = __vmread(GUEST_DS_AR_BYTES);
1867 c->es_sel = __vmread(GUEST_ES_SELECTOR);
1868 c->es_limit = __vmread(GUEST_ES_LIMIT);
1869 c->es_base = __vmread(GUEST_ES_BASE);
1870 c->es_arbytes.bytes = __vmread(GUEST_ES_AR_BYTES);
1872 c->ss_sel = __vmread(GUEST_SS_SELECTOR);
1873 c->ss_limit = __vmread(GUEST_SS_LIMIT);
1874 c->ss_base = __vmread(GUEST_SS_BASE);
1875 c->ss_arbytes.bytes = __vmread(GUEST_SS_AR_BYTES);
1877 c->fs_sel = __vmread(GUEST_FS_SELECTOR);
1878 c->fs_limit = __vmread(GUEST_FS_LIMIT);
1879 c->fs_base = __vmread(GUEST_FS_BASE);
1880 c->fs_arbytes.bytes = __vmread(GUEST_FS_AR_BYTES);
1882 c->gs_sel = __vmread(GUEST_GS_SELECTOR);
1883 c->gs_limit = __vmread(GUEST_GS_LIMIT);
1884 c->gs_base = __vmread(GUEST_GS_BASE);
1885 c->gs_arbytes.bytes = __vmread(GUEST_GS_AR_BYTES);
1887 c->tr_sel = __vmread(GUEST_TR_SELECTOR);
1888 c->tr_limit = __vmread(GUEST_TR_LIMIT);
1889 c->tr_base = __vmread(GUEST_TR_BASE);
1890 c->tr_arbytes.bytes = __vmread(GUEST_TR_AR_BYTES);
1892 c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR);
1893 c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT);
1894 c->ldtr_base = __vmread(GUEST_LDTR_BASE);
1895 c->ldtr_arbytes.bytes = __vmread(GUEST_LDTR_AR_BYTES);
1898 static int vmx_world_restore(struct vcpu *v, struct vmx_assist_context *c)
1900 struct cpu_user_regs *regs = guest_cpu_user_regs();
1901 unsigned long mfn = 0;
1902 p2m_type_t p2mt;
1904 if ( c->cr0 & X86_CR0_PG )
1906 mfn = mfn_x(gfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT, &p2mt));
1907 if ( !p2m_is_ram(p2mt) || !get_page(mfn_to_page(mfn), v->domain) )
1909 gdprintk(XENLOG_ERR, "Invalid CR3 value=%x", c->cr3);
1910 return -EINVAL;
1914 if ( v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PG )
1915 put_page(pagetable_get_page(v->arch.guest_table));
1917 v->arch.guest_table = pagetable_from_pfn(mfn);
1919 regs->eip = c->eip;
1920 regs->esp = c->esp;
1921 regs->eflags = c->eflags | 2;
1923 v->arch.hvm_vcpu.guest_cr[0] = c->cr0;
1924 v->arch.hvm_vcpu.guest_cr[3] = c->cr3;
1925 v->arch.hvm_vcpu.guest_cr[4] = c->cr4;
1926 vmx_update_guest_cr(v, 0);
1927 vmx_update_guest_cr(v, 4);
1929 __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
1930 __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
1932 __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
1933 __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
1935 __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
1936 __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
1937 __vmwrite(GUEST_CS_BASE, c->cs_base);
1938 __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes.bytes);
1940 __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
1941 __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
1942 __vmwrite(GUEST_DS_BASE, c->ds_base);
1943 __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes.bytes);
1945 __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
1946 __vmwrite(GUEST_ES_LIMIT, c->es_limit);
1947 __vmwrite(GUEST_ES_BASE, c->es_base);
1948 __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes.bytes);
1950 __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
1951 __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
1952 __vmwrite(GUEST_SS_BASE, c->ss_base);
1953 __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes.bytes);
1955 __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
1956 __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
1957 __vmwrite(GUEST_FS_BASE, c->fs_base);
1958 __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes.bytes);
1960 __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
1961 __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
1962 __vmwrite(GUEST_GS_BASE, c->gs_base);
1963 __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes.bytes);
1965 __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
1966 __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
1967 __vmwrite(GUEST_TR_BASE, c->tr_base);
1968 __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes.bytes);
1970 __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
1971 __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
1972 __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
1973 __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes.bytes);
1975 paging_update_paging_modes(v);
1976 return 0;
1979 enum { VMX_ASSIST_INVOKE = 0, VMX_ASSIST_RESTORE };
1981 static int vmx_assist(struct vcpu *v, int mode)
1983 struct vmx_assist_context c;
1984 struct hvm_hw_vpic *vpic = v->domain->arch.hvm_domain.vpic;
1985 u32 magic, cp;
1987 if ( hvm_copy_from_guest_phys(&magic, VMXASSIST_MAGIC_OFFSET,
1988 sizeof(magic)) )
1990 gdprintk(XENLOG_ERR, "No vmxassist: can't execute real mode code\n");
1991 domain_crash(v->domain);
1992 return 0;
1995 if ( magic != VMXASSIST_MAGIC )
1997 gdprintk(XENLOG_ERR, "vmxassist magic number does not match\n");
1998 domain_crash(v->domain);
1999 return 0;
2002 switch ( mode ) {
2003 /*
2004 * Transfer control to vmxassist.
2005 * Store the current context in VMXASSIST_OLD_CONTEXT and load
2006 * the new VMXASSIST_NEW_CONTEXT context. This context was created
2007 * by vmxassist and will transfer control to it.
2008 */
2009 case VMX_ASSIST_INVOKE:
2010 /* save the old context */
2011 if ( hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)) )
2012 goto error;
2013 if ( cp != 0 ) {
2014 vmx_world_save(v, &c);
2015 if ( hvm_copy_to_guest_phys(cp, &c, sizeof(c)) )
2016 goto error;
2019 /* restore the new context, this should activate vmxassist */
2020 if ( hvm_copy_from_guest_phys(&cp, VMXASSIST_NEW_CONTEXT, sizeof(cp)) )
2021 goto error;
2022 if ( cp != 0 ) {
2023 if ( hvm_copy_from_guest_phys(&c, cp, sizeof(c)) )
2024 goto error;
2025 if ( vmx_world_restore(v, &c) != 0 )
2026 goto error;
2027 v->arch.hvm_vmx.pm_irqbase[0] = vpic[0].irq_base;
2028 v->arch.hvm_vmx.pm_irqbase[1] = vpic[1].irq_base;
2029 vpic[0].irq_base = NR_EXCEPTION_HANDLER;
2030 vpic[1].irq_base = NR_EXCEPTION_HANDLER + 8;
2031 v->arch.hvm_vmx.vmxassist_enabled = 1;
2032 return 1;
2034 break;
2036 /*
2037 * Restore the VMXASSIST_OLD_CONTEXT that was saved by
2038 * VMX_ASSIST_INVOKE above.
2039 */
2040 case VMX_ASSIST_RESTORE:
2042 /* fetch the previously saved context and restore it */
2042 if ( hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)) )
2043 goto error;
2044 if ( cp != 0 ) {
2045 if ( hvm_copy_from_guest_phys(&c, cp, sizeof(c)) )
2046 goto error;
2047 if ( vmx_world_restore(v, &c) != 0 )
2048 goto error;
2049 if ( v->arch.hvm_vmx.irqbase_mode ) {
2050 vpic[0].irq_base = c.rm_irqbase[0] & 0xf8;
2051 vpic[1].irq_base = c.rm_irqbase[1] & 0xf8;
2052 } else {
2053 vpic[0].irq_base = v->arch.hvm_vmx.pm_irqbase[0];
2054 vpic[1].irq_base = v->arch.hvm_vmx.pm_irqbase[1];
2056 v->arch.hvm_vmx.vmxassist_enabled = 0;
2057 return 1;
2059 break;
2062 error:
2063 gdprintk(XENLOG_ERR, "Failed to transfer to vmxassist\n");
2064 domain_crash(v->domain);
2065 return 0;
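/*
 * Return convention for the CR handlers below: 1 means the access was
 * handled and the guest EIP should be advanced past the instruction;
 * 0 means EIP must not be advanced, either because an exception was
 * injected or because a world switch to/from vmxassist has just replaced
 * the guest context ("do not update eip!" below).  vmx_cr_access() and
 * the VM-exit handler rely on this convention.
 */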
2068 static int vmx_set_cr0(unsigned long value)
2070 struct vcpu *v = current;
2071 int rc = hvm_set_cr0(value);
2073 if ( rc == 0 )
2074 return 0;
2076 /* TS cleared? Then initialise FPU now. */
2077 if ( !(value & X86_CR0_TS) )
2079 setup_fpu(v);
2080 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
2083 /*
2084 * VMX does not implement real-mode virtualization. We emulate
2085 * real-mode by performing a world switch to VMXAssist whenever
2086 * a partition disables the CR0.PE bit.
2087 */
2088 if ( !(value & X86_CR0_PE) )
2090 if ( vmx_assist(v, VMX_ASSIST_INVOKE) )
2091 return 0; /* do not update eip! */
2093 else if ( v->arch.hvm_vmx.vmxassist_enabled )
2095 if ( vmx_assist(v, VMX_ASSIST_RESTORE) )
2096 return 0; /* do not update eip! */
2099 return 1;
2102 #define CASE_SET_REG(REG, reg) \
2103 case REG_ ## REG: regs->reg = value; break
2104 #define CASE_GET_REG(REG, reg) \
2105 case REG_ ## REG: value = regs->reg; break
2107 #define CASE_EXTEND_SET_REG \
2108 CASE_EXTEND_REG(S)
2109 #define CASE_EXTEND_GET_REG \
2110 CASE_EXTEND_REG(G)
2112 #ifdef __i386__
2113 #define CASE_EXTEND_REG(T)
2114 #else
2115 #define CASE_EXTEND_REG(T) \
2116 CASE_ ## T ## ET_REG(R8, r8); \
2117 CASE_ ## T ## ET_REG(R9, r9); \
2118 CASE_ ## T ## ET_REG(R10, r10); \
2119 CASE_ ## T ## ET_REG(R11, r11); \
2120 CASE_ ## T ## ET_REG(R12, r12); \
2121 CASE_ ## T ## ET_REG(R13, r13); \
2122 CASE_ ## T ## ET_REG(R14, r14); \
2123 CASE_ ## T ## ET_REG(R15, r15)
2124 #endif
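/*
 * The CASE_GET_REG/CASE_SET_REG macros above generate the switch cases
 * that copy between the general-purpose register decoded from the exit
 * qualification and the matching field of struct cpu_user_regs.  For
 * example, CASE_GET_REG(EAX, eax) expands to
 *     case REG_EAX: value = regs->eax; break;
 * and CASE_EXTEND_GET_REG adds R8-R15 on x86-64 while expanding to
 * nothing on i386.
 */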
2126 static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
2128 unsigned long value;
2129 struct vcpu *v = current;
2130 struct vlapic *vlapic = vcpu_vlapic(v);
2132 switch ( gp )
2134 CASE_GET_REG(EAX, eax);
2135 CASE_GET_REG(ECX, ecx);
2136 CASE_GET_REG(EDX, edx);
2137 CASE_GET_REG(EBX, ebx);
2138 CASE_GET_REG(EBP, ebp);
2139 CASE_GET_REG(ESI, esi);
2140 CASE_GET_REG(EDI, edi);
2141 CASE_GET_REG(ESP, esp);
2142 CASE_EXTEND_GET_REG;
2143 default:
2144 gdprintk(XENLOG_ERR, "invalid gp: %d\n", gp);
2145 goto exit_and_crash;
2148 HVMTRACE_2D(CR_WRITE, v, cr, value);
2150 HVM_DBG_LOG(DBG_LEVEL_1, "CR%d, value = %lx", cr, value);
2152 switch ( cr )
2154 case 0:
2155 return vmx_set_cr0(value);
2157 case 3:
2158 return hvm_set_cr3(value);
2160 case 4:
2161 return hvm_set_cr4(value);
2163 case 8:
2164 vlapic_set_reg(vlapic, APIC_TASKPRI, ((value & 0x0F) << 4));
2165 break;
2167 default:
2168 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
2169 goto exit_and_crash;
2172 return 1;
2174 exit_and_crash:
2175 domain_crash(v->domain);
2176 return 0;
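/*
 * CR8 aliases the local APIC task-priority register: CR8 bits 3:0
 * correspond to TPR bits 7:4, which is why the CR8 cases above and below
 * shift by four when converting ((value & 0x0F) << 4 on writes,
 * (value & 0xF0) >> 4 on reads).
 */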
2179 /*
2180 * Read from control registers. Only CR3 and CR8 reach this handler; CR0 and CR4 are read by the guest from the shadow.
2181 */
2182 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
2184 unsigned long value = 0;
2185 struct vcpu *v = current;
2186 struct vlapic *vlapic = vcpu_vlapic(v);
2188 switch ( cr )
2190 case 3:
2191 value = (unsigned long)v->arch.hvm_vcpu.guest_cr[3];
2192 break;
2193 case 8:
2194 value = (unsigned long)vlapic_get_reg(vlapic, APIC_TASKPRI);
2195 value = (value & 0xF0) >> 4;
2196 break;
2197 default:
2198 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
2199 domain_crash(v->domain);
2200 break;
2203 switch ( gp ) {
2204 CASE_SET_REG(EAX, eax);
2205 CASE_SET_REG(ECX, ecx);
2206 CASE_SET_REG(EDX, edx);
2207 CASE_SET_REG(EBX, ebx);
2208 CASE_SET_REG(EBP, ebp);
2209 CASE_SET_REG(ESI, esi);
2210 CASE_SET_REG(EDI, edi);
2211 CASE_SET_REG(ESP, esp);
2212 CASE_EXTEND_SET_REG;
2213 default:
2214 gdprintk(XENLOG_ERR, "invalid gp: %d\n", gp);
2215 domain_crash(v->domain);
2216 break;
2219 HVMTRACE_2D(CR_READ, v, cr, value);
2221 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR%d, value = %lx", cr, value);
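/*
 * vmx_cr_access() decodes the CR-access exit qualification.  Going by the
 * field masks used below, CONTROL_REG_ACCESS_TYPE selects the access type
 * (MOV to CR, MOV from CR, CLTS or LMSW), CONTROL_REG_ACCESS_NUM gives the
 * control register and CONTROL_REG_ACCESS_REG the general-purpose register
 * involved; for LMSW the 16-bit source operand is held in the upper half
 * of the qualification, hence the ">> 16" in that case.
 */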
2224 static int vmx_cr_access(unsigned long exit_qualification,
2225 struct cpu_user_regs *regs)
2227 unsigned int gp, cr;
2228 unsigned long value;
2229 struct vcpu *v = current;
2231 switch ( exit_qualification & CONTROL_REG_ACCESS_TYPE )
2233 case TYPE_MOV_TO_CR:
2234 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
2235 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
2236 return mov_to_cr(gp, cr, regs);
2237 case TYPE_MOV_FROM_CR:
2238 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
2239 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
2240 mov_from_cr(cr, gp, regs);
2241 break;
2242 case TYPE_CLTS:
2243 /* We initialise the FPU now, to avoid needing another vmexit. */
2244 setup_fpu(v);
2245 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
2247 v->arch.hvm_vcpu.hw_cr[0] &= ~X86_CR0_TS; /* clear TS */
2248 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
2250 v->arch.hvm_vcpu.guest_cr[0] &= ~X86_CR0_TS; /* clear TS */
2251 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[0]);
2252 HVMTRACE_0D(CLTS, current);
2253 break;
2254 case TYPE_LMSW:
2255 value = v->arch.hvm_vcpu.guest_cr[0];
2256 value = (value & ~0xF) |
2257 (((exit_qualification & LMSW_SOURCE_DATA) >> 16) & 0xF);
2258 HVMTRACE_1D(LMSW, current, value);
2259 return vmx_set_cr0(value);
2260 default:
2261 BUG();
2264 return 1;
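/*
 * RDMSR handling: the result is returned to the guest split across
 * EDX:EAX, so msr_content is unpacked into regs->eax/regs->edx.  Reads in
 * the MSR_IA32_VMX_* range get #GP because nested VMX is not exposed to
 * HVM guests; anything unrecognised falls through to
 * long_mode_do_msr_read(), then to Xen's synthetic MSRs
 * (rdmsr_hypervisor_regs()) and finally to a host rdmsr_safe() before
 * giving up with #GP.
 */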
2267 static int vmx_do_msr_read(struct cpu_user_regs *regs)
2269 u64 msr_content = 0;
2270 u32 ecx = regs->ecx, eax, edx;
2271 struct vcpu *v = current;
2273 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x", ecx);
2275 switch ( ecx )
2277 case MSR_IA32_TIME_STAMP_COUNTER:
2278 msr_content = hvm_get_guest_time(v);
2279 break;
2280 case MSR_IA32_SYSENTER_CS:
2281 msr_content = (u32)__vmread(GUEST_SYSENTER_CS);
2282 break;
2283 case MSR_IA32_SYSENTER_ESP:
2284 msr_content = __vmread(GUEST_SYSENTER_ESP);
2285 break;
2286 case MSR_IA32_SYSENTER_EIP:
2287 msr_content = __vmread(GUEST_SYSENTER_EIP);
2288 break;
2289 case MSR_IA32_APICBASE:
2290 msr_content = vcpu_vlapic(v)->hw.apic_base_msr;
2291 break;
2292 case MSR_IA32_VMX_BASIC...MSR_IA32_VMX_PROCBASED_CTLS2:
2293 goto gp_fault;
2294 case MSR_IA32_MCG_CAP:
2295 case MSR_IA32_MCG_STATUS:
2296 case MSR_IA32_MC0_STATUS:
2297 case MSR_K8_MC1_STATUS:
2298 case MSR_K8_MC2_STATUS:
2299 case MSR_K8_MC3_STATUS:
2300 case MSR_K8_MC4_STATUS:
2301 case MSR_K8_MC5_STATUS:
2302 /* No point in letting the guest see real MCEs */
2303 msr_content = 0;
2304 break;
2305 default:
2306 switch ( long_mode_do_msr_read(regs) )
2308 case HNDL_unhandled:
2309 break;
2310 case HNDL_exception_raised:
2311 return 0;
2312 case HNDL_done:
2313 goto done;
2316 if ( rdmsr_hypervisor_regs(ecx, &eax, &edx) ||
2317 rdmsr_safe(ecx, eax, edx) == 0 )
2319 regs->eax = eax;
2320 regs->edx = edx;
2321 goto done;
2324 goto gp_fault;
2327 regs->eax = msr_content & 0xFFFFFFFF;
2328 regs->edx = msr_content >> 32;
2330 done:
2331 hvmtrace_msr_read(v, ecx, msr_content);
2332 HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx",
2333 ecx, (unsigned long)regs->eax,
2334 (unsigned long)regs->edx);
2335 return 1;
2337 gp_fault:
2338 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2339 return 0;
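/*
 * APIC access page: if the CPU supports "virtualize APIC accesses", a
 * single Xen-owned page is shared with the guest and mapped at
 * APIC_DEFAULT_PHYS_BASE.  Guest accesses to that page then surface as
 * EXIT_REASON_APIC_ACCESS vmexits (handled near the end of
 * vmx_vmexit_handler) and are emulated via handle_mmio().
 */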
2342 static int vmx_alloc_vlapic_mapping(struct domain *d)
2344 void *apic_va;
2346 if ( !cpu_has_vmx_virtualize_apic_accesses )
2347 return 0;
2349 apic_va = alloc_xenheap_page();
2350 if ( apic_va == NULL )
2351 return -ENOMEM;
2352 share_xen_page_with_guest(virt_to_page(apic_va), d, XENSHARE_writable);
2353 guest_physmap_add_page(
2354 d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), virt_to_mfn(apic_va));
2355 d->arch.hvm_domain.vmx_apic_access_mfn = virt_to_mfn(apic_va);
2357 return 0;
2360 static void vmx_free_vlapic_mapping(struct domain *d)
2362 unsigned long mfn = d->arch.hvm_domain.vmx_apic_access_mfn;
2363 if ( mfn != 0 )
2364 free_xenheap_page(mfn_to_virt(mfn));
2367 static void vmx_install_vlapic_mapping(struct vcpu *v)
2369 unsigned long virt_page_ma, apic_page_ma;
2371 if ( !cpu_has_vmx_virtualize_apic_accesses )
2372 return;
2374 virt_page_ma = page_to_maddr(vcpu_vlapic(v)->regs_page);
2375 apic_page_ma = v->domain->arch.hvm_domain.vmx_apic_access_mfn;
2376 apic_page_ma <<= PAGE_SHIFT;
2378 vmx_vmcs_enter(v);
2379 __vmwrite(VIRTUAL_APIC_PAGE_ADDR, virt_page_ma);
2380 __vmwrite(APIC_ACCESS_ADDR, apic_page_ma);
2381 vmx_vmcs_exit(v);
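/*
 * Keep the SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES control in sync with
 * the guest's APIC-base MSR: the optimisation is only left enabled while
 * the vLAPIC is hardware-enabled and still mapped at the default physical
 * base; otherwise accesses are expected to take the normal MMIO path.
 */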
2384 void vmx_vlapic_msr_changed(struct vcpu *v)
2386 struct vlapic *vlapic = vcpu_vlapic(v);
2387 uint32_t ctl;
2389 if ( !cpu_has_vmx_virtualize_apic_accesses )
2390 return;
2392 vmx_vmcs_enter(v);
2393 ctl = __vmread(SECONDARY_VM_EXEC_CONTROL);
2394 ctl &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2395 if ( !vlapic_hw_disabled(vlapic) &&
2396 (vlapic_base_address(vlapic) == APIC_DEFAULT_PHYS_BASE) )
2397 ctl |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2398 __vmwrite(SECONDARY_VM_EXEC_CONTROL, ctl);
2399 vmx_vmcs_exit(v);
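/*
 * WRMSR handling mirrors vmx_do_msr_read(): the 64-bit value arrives in
 * EDX:EAX (msr_content = eax | edx << 32).  Writes to the read-only
 * MSR_IA32_VMX_* range get #GP, and anything not handled here or by
 * long_mode_do_msr_write() is offered to wrmsr_hypervisor_regs() and
 * otherwise silently ignored.
 */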
2402 static int vmx_do_msr_write(struct cpu_user_regs *regs)
2404 u32 ecx = regs->ecx;
2405 u64 msr_content;
2406 struct vcpu *v = current;
2408 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x, eax=%x, edx=%x",
2409 ecx, (u32)regs->eax, (u32)regs->edx);
2411 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
2413 hvmtrace_msr_write(v, ecx, msr_content);
2415 switch ( ecx )
2417 case MSR_IA32_TIME_STAMP_COUNTER:
2418 hvm_set_guest_time(v, msr_content);
2419 pt_reset(v);
2420 break;
2421 case MSR_IA32_SYSENTER_CS:
2422 __vmwrite(GUEST_SYSENTER_CS, msr_content);
2423 break;
2424 case MSR_IA32_SYSENTER_ESP:
2425 __vmwrite(GUEST_SYSENTER_ESP, msr_content);
2426 break;
2427 case MSR_IA32_SYSENTER_EIP:
2428 __vmwrite(GUEST_SYSENTER_EIP, msr_content);
2429 break;
2430 case MSR_IA32_APICBASE:
2431 vlapic_msr_set(vcpu_vlapic(v), msr_content);
2432 break;
2433 case MSR_IA32_VMX_BASIC...MSR_IA32_VMX_PROCBASED_CTLS2:
2434 goto gp_fault;
2435 default:
2436 switch ( long_mode_do_msr_write(regs) )
2438 case HNDL_unhandled:
2439 wrmsr_hypervisor_regs(ecx, regs->eax, regs->edx);
2440 break;
2441 case HNDL_exception_raised:
2442 return 0;
2443 case HNDL_done:
2444 break;
2446 break;
2449 return 1;
2451 gp_fault:
2452 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2453 return 0;
2456 static void vmx_do_hlt(struct cpu_user_regs *regs)
2458 HVMTRACE_0D(HLT, current);
2459 hvm_hlt(regs->eflags);
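/*
 * External (host) interrupts cause vmexits with interrupts still
 * disabled.  vmx_do_extint() reads the vector from VM_EXIT_INTR_INFO and
 * dispatches it by hand to the handlers the host IDT would otherwise have
 * invoked (APIC timer, IPIs, spurious/error vectors, ...), with anything
 * else routed through do_IRQ().
 */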
2462 static void vmx_do_extint(struct cpu_user_regs *regs)
2464 unsigned int vector;
2466 asmlinkage void do_IRQ(struct cpu_user_regs *);
2467 fastcall void smp_apic_timer_interrupt(struct cpu_user_regs *);
2468 fastcall void smp_event_check_interrupt(void);
2469 fastcall void smp_invalidate_interrupt(void);
2470 fastcall void smp_call_function_interrupt(void);
2471 fastcall void smp_spurious_interrupt(struct cpu_user_regs *regs);
2472 fastcall void smp_error_interrupt(struct cpu_user_regs *regs);
2473 #ifdef CONFIG_X86_MCE_P4THERMAL
2474 fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs);
2475 #endif
2477 vector = __vmread(VM_EXIT_INTR_INFO);
2478 BUG_ON(!(vector & INTR_INFO_VALID_MASK));
2480 vector &= INTR_INFO_VECTOR_MASK;
2481 HVMTRACE_1D(INTR, current, vector);
2483 switch ( vector )
2485 case LOCAL_TIMER_VECTOR:
2486 smp_apic_timer_interrupt(regs);
2487 break;
2488 case EVENT_CHECK_VECTOR:
2489 smp_event_check_interrupt();
2490 break;
2491 case INVALIDATE_TLB_VECTOR:
2492 smp_invalidate_interrupt();
2493 break;
2494 case CALL_FUNCTION_VECTOR:
2495 smp_call_function_interrupt();
2496 break;
2497 case SPURIOUS_APIC_VECTOR:
2498 smp_spurious_interrupt(regs);
2499 break;
2500 case ERROR_APIC_VECTOR:
2501 smp_error_interrupt(regs);
2502 break;
2503 #ifdef CONFIG_X86_MCE_P4THERMAL
2504 case THERMAL_APIC_VECTOR:
2505 smp_thermal_interrupt(regs);
2506 break;
2507 #endif
2508 default:
2509 regs->entry_vector = vector;
2510 do_IRQ(regs);
2511 break;
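/*
 * Failed VM entries: when VM entry itself fails, the exit reason carries
 * the VMX_EXIT_REASONS_FAILED_VMENTRY flag and its low 16 bits give the
 * basic reason (invalid guest state, MSR loading failure or machine
 * check).  There is no guest state worth resuming, so the VMCS is dumped
 * and the domain crashed.
 */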
2515 static void vmx_failed_vmentry(unsigned int exit_reason,
2516 struct cpu_user_regs *regs)
2518 unsigned int failed_vmentry_reason = (uint16_t)exit_reason;
2519 unsigned long exit_qualification;
2521 exit_qualification = __vmread(EXIT_QUALIFICATION);
2522 printk("Failed vm entry (exit reason 0x%x) ", exit_reason);
2523 switch ( failed_vmentry_reason )
2525 case EXIT_REASON_INVALID_GUEST_STATE:
2526 printk("caused by invalid guest state (%ld).\n", exit_qualification);
2527 break;
2528 case EXIT_REASON_MSR_LOADING:
2529 printk("caused by MSR entry %ld loading.\n", exit_qualification);
2530 break;
2531 case EXIT_REASON_MACHINE_CHECK:
2532 printk("caused by machine check.\n");
2533 HVMTRACE_0D(MCE, current);
2534 do_machine_check(regs);
2535 break;
2536 default:
2537 printk("unrecognised failure reason.\n");
2538 break;
2541 printk("************* VMCS Area **************\n");
2542 vmcs_dump_vcpu();
2543 printk("**************************************\n");
2545 domain_crash(current->domain);
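/*
 * Main VM-exit dispatcher.  The flow is: read the exit reason, re-enable
 * interrupts (unless the exit was itself caused by a host interrupt,
 * which is dispatched with interrupts off), re-queue any event whose
 * delivery was interrupted by the exit (IDT_VECTORING_INFO), then switch
 * on the exit reason.  Handlers that emulate an instruction advance the
 * guest EIP by the hardware-reported length (__get_instruction_length()).
 */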
2548 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
2550 unsigned int exit_reason, idtv_info;
2551 unsigned long exit_qualification, inst_len = 0;
2552 struct vcpu *v = current;
2554 exit_reason = __vmread(VM_EXIT_REASON);
2556 hvmtrace_vmexit(v, regs->eip, exit_reason);
2558 perfc_incra(vmexits, exit_reason);
2560 if ( exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT )
2561 local_irq_enable();
2563 if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
2564 return vmx_failed_vmentry(exit_reason, regs);
2566 /* Event delivery caused this intercept? Queue for redelivery. */
2567 idtv_info = __vmread(IDT_VECTORING_INFO);
2568 if ( unlikely(idtv_info & INTR_INFO_VALID_MASK) &&
2569 (exit_reason != EXIT_REASON_TASK_SWITCH) )
2571 if ( hvm_event_needs_reinjection((idtv_info>>8)&7, idtv_info&0xff) )
2573 /* See SDM 3B 25.7.1.1 and .2 for info about masking resvd bits. */
2574 __vmwrite(VM_ENTRY_INTR_INFO,
2575 idtv_info & ~INTR_INFO_RESVD_BITS_MASK);
2576 if ( idtv_info & INTR_INFO_DELIVER_CODE_MASK )
2577 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE,
2578 __vmread(IDT_VECTORING_ERROR_CODE));
2581 /*
2582 * Clear NMI-blocking interruptibility info if an NMI delivery faulted.
2583 * Re-delivery will re-set it (see SDM 3B 25.7.1.2).
2584 */
2585 if ( (idtv_info & INTR_INFO_INTR_TYPE_MASK) == (X86_EVENTTYPE_NMI<<8) )
2586 __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
2587 __vmread(GUEST_INTERRUPTIBILITY_INFO) &
2588 ~VMX_INTR_SHADOW_NMI);
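    /*
     * Task-switch exits are deliberately excluded from the re-injection
     * above: the EXIT_REASON_TASK_SWITCH case below examines the
     * vectoring info itself and hands any pending error code straight to
     * hvm_task_switch().
     */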
2591 switch ( exit_reason )
2593 case EXIT_REASON_EXCEPTION_NMI:
2595 /*
2596 * We do not enable software-interrupt (INT n) exiting, so this exit was
2597 * caused either by (1) a hardware exception raised in the guest
2598 * (e.g. #PF), or (2) an NMI.
2599 */
2600 unsigned int intr_info, vector;
2602 intr_info = __vmread(VM_EXIT_INTR_INFO);
2603 BUG_ON(!(intr_info & INTR_INFO_VALID_MASK));
2605 vector = intr_info & INTR_INFO_VECTOR_MASK;
2607 /*
2608 * Re-set the NMI shadow if vmexit caused by a guest IRET fault (see 3B
2609 * 25.7.1.2, "Resuming Guest Software after Handling an Exception").
2610 * (NB. If we emulate this IRET for any reason, we should re-clear!)
2611 */
2612 if ( unlikely(intr_info & INTR_INFO_NMI_UNBLOCKED_BY_IRET) &&
2613 !(__vmread(IDT_VECTORING_INFO) & INTR_INFO_VALID_MASK) &&
2614 (vector != TRAP_double_fault) )
2615 __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
2616 __vmread(GUEST_INTERRUPTIBILITY_INFO)|VMX_INTR_SHADOW_NMI);
2618 perfc_incra(cause_vector, vector);
2620 switch ( vector )
2622 case TRAP_debug:
2623 case TRAP_int3:
2624 if ( !v->domain->debugger_attached )
2625 goto exit_and_crash;
2626 domain_pause_for_debugger();
2627 break;
2628 case TRAP_no_device:
2629 vmx_do_no_device_fault();
2630 break;
2631 case TRAP_page_fault:
2632 exit_qualification = __vmread(EXIT_QUALIFICATION);
2633 regs->error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
2635 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2636 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
2637 (unsigned long)regs->eax, (unsigned long)regs->ebx,
2638 (unsigned long)regs->ecx, (unsigned long)regs->edx,
2639 (unsigned long)regs->esi, (unsigned long)regs->edi);
2641 if ( paging_fault(exit_qualification, regs) )
2643 hvmtrace_pf_xen(v, exit_qualification, regs->error_code);
2644 break;
2647 v->arch.hvm_vcpu.guest_cr[2] = exit_qualification;
2648 vmx_inject_hw_exception(v, TRAP_page_fault, regs->error_code);
2649 break;
2650 case TRAP_nmi:
2651 if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) !=
2652 (X86_EVENTTYPE_NMI << 8) )
2653 goto exit_and_crash;
2654 HVMTRACE_0D(NMI, v);
2655 do_nmi(regs); /* Real NMI, vector 2: normal processing. */
2656 break;
2657 case TRAP_machine_check:
2658 HVMTRACE_0D(MCE, v);
2659 do_machine_check(regs);
2660 break;
2661 default:
2662 goto exit_and_crash;
2664 break;
2666 case EXIT_REASON_EXTERNAL_INTERRUPT:
2667 vmx_do_extint(regs);
2668 break;
2669 case EXIT_REASON_TRIPLE_FAULT:
2670 hvm_triple_fault();
2671 break;
2672 case EXIT_REASON_PENDING_VIRT_INTR:
2673 /* Disable the interrupt window. */
2674 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2675 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2676 v->arch.hvm_vmx.exec_control);
2677 break;
2678 case EXIT_REASON_PENDING_VIRT_NMI:
2679 /* Disable the NMI window. */
2680 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
2681 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2682 v->arch.hvm_vmx.exec_control);
2683 break;
2684 case EXIT_REASON_TASK_SWITCH: {
2685 const enum hvm_task_switch_reason reasons[] = {
2686 TSW_call_or_int, TSW_iret, TSW_jmp, TSW_call_or_int };
2687 int32_t errcode = -1;
2688 exit_qualification = __vmread(EXIT_QUALIFICATION);
2689 if ( (idtv_info & INTR_INFO_VALID_MASK) &&
2690 (idtv_info & INTR_INFO_DELIVER_CODE_MASK) )
2691 errcode = __vmread(IDT_VECTORING_ERROR_CODE);
2692 hvm_task_switch((uint16_t)exit_qualification,
2693 reasons[(exit_qualification >> 30) & 3],
2694 errcode);
2695 break;
2697 case EXIT_REASON_CPUID:
2698 inst_len = __get_instruction_length(); /* Safe: CPUID */
2699 __update_guest_eip(inst_len);
2700 vmx_do_cpuid(regs);
2701 break;
2702 case EXIT_REASON_HLT:
2703 inst_len = __get_instruction_length(); /* Safe: HLT */
2704 __update_guest_eip(inst_len);
2705 vmx_do_hlt(regs);
2706 break;
2707 case EXIT_REASON_INVLPG:
2709 inst_len = __get_instruction_length(); /* Safe: INVLPG */
2710 __update_guest_eip(inst_len);
2711 exit_qualification = __vmread(EXIT_QUALIFICATION);
2712 vmx_do_invlpg(exit_qualification);
2713 break;
2715 case EXIT_REASON_VMCALL:
2717 int rc;
2718 HVMTRACE_1D(VMMCALL, v, regs->eax);
2719 inst_len = __get_instruction_length(); /* Safe: VMCALL */
2720 rc = hvm_do_hypercall(regs);
2721 if ( rc != HVM_HCALL_preempted )
2723 __update_guest_eip(inst_len);
2724 if ( rc == HVM_HCALL_invalidate )
2725 send_invalidate_req();
2727 break;
2729 case EXIT_REASON_CR_ACCESS:
2731 exit_qualification = __vmread(EXIT_QUALIFICATION);
2732 inst_len = __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
2733 if ( vmx_cr_access(exit_qualification, regs) )
2734 __update_guest_eip(inst_len);
2735 break;
2737 case EXIT_REASON_DR_ACCESS:
2738 exit_qualification = __vmread(EXIT_QUALIFICATION);
2739 vmx_dr_access(exit_qualification, regs);
2740 break;
2741 case EXIT_REASON_IO_INSTRUCTION:
2742 exit_qualification = __vmread(EXIT_QUALIFICATION);
2743 inst_len = __get_instruction_length(); /* Safe: IN, INS, OUT, OUTS */
2744 vmx_io_instruction(exit_qualification, inst_len);
2745 break;
2746 case EXIT_REASON_MSR_READ:
2747 inst_len = __get_instruction_length(); /* Safe: RDMSR */
2748 if ( vmx_do_msr_read(regs) )
2749 __update_guest_eip(inst_len);
2750 break;
2751 case EXIT_REASON_MSR_WRITE:
2752 inst_len = __get_instruction_length(); /* Safe: WRMSR */
2753 if ( vmx_do_msr_write(regs) )
2754 __update_guest_eip(inst_len);
2755 break;
2756 case EXIT_REASON_MWAIT_INSTRUCTION:
2757 case EXIT_REASON_MONITOR_INSTRUCTION:
2758 case EXIT_REASON_PAUSE_INSTRUCTION:
2759 goto exit_and_crash;
2760 case EXIT_REASON_VMCLEAR:
2761 case EXIT_REASON_VMLAUNCH:
2762 case EXIT_REASON_VMPTRLD:
2763 case EXIT_REASON_VMPTRST:
2764 case EXIT_REASON_VMREAD:
2765 case EXIT_REASON_VMRESUME:
2766 case EXIT_REASON_VMWRITE:
2767 case EXIT_REASON_VMXOFF:
2768 case EXIT_REASON_VMXON:
2769 /* Report an invalid-opcode exception when a VMX guest tries to execute
2770 * any of the VMX instructions. */
2771 vmx_inject_hw_exception(v, TRAP_invalid_op, VMX_DELIVER_NO_ERROR_CODE);
2772 break;
2774 case EXIT_REASON_TPR_BELOW_THRESHOLD:
2775 break;
2777 case EXIT_REASON_APIC_ACCESS:
2779 unsigned long offset;
2780 exit_qualification = __vmread(EXIT_QUALIFICATION);
2781 offset = exit_qualification & 0x0fffUL;
2782 handle_mmio(APIC_DEFAULT_PHYS_BASE | offset);
2783 break;
2786 default:
2787 exit_and_crash:
2788 gdprintk(XENLOG_ERR, "Bad vmexit (reason %x)\n", exit_reason);
2789 domain_crash(v->domain);
2790 break;
2794 asmlinkage void vmx_trace_vmentry(void)
2796 struct vcpu *v = current;
2798 hvmtrace_vmentry(v);
2801 /*
2802 * Local variables:
2803 * mode: C
2804 * c-set-style: "BSD"
2805 * c-basic-offset: 4
2806 * tab-width: 4
2807 * indent-tabs-mode: nil
2808 * End:
2809 */