ia64/xen-unstable: xen/arch/x86/hvm/vmx/vmx.c @ 16383:ef4b60c99735

x86, hvm: Small code cleanups.
Based on patch from Xin Li.
Signed-off-by: Keir Fraser <keir.fraser@citrix.com>

author   Keir Fraser <keir.fraser@citrix.com>
date     Fri Nov 16 16:22:00 2007 +0000 (2007-11-16)
parents  8d8d179b9b05
children e82fb0729b51

line source
1 /*
2 * vmx.c: handling VMX architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 *
18 */
20 #include <xen/config.h>
21 #include <xen/init.h>
22 #include <xen/lib.h>
23 #include <xen/trace.h>
24 #include <xen/sched.h>
25 #include <xen/irq.h>
26 #include <xen/softirq.h>
27 #include <xen/domain_page.h>
28 #include <xen/hypercall.h>
29 #include <xen/perfc.h>
30 #include <asm/current.h>
31 #include <asm/io.h>
32 #include <asm/regs.h>
33 #include <asm/cpufeature.h>
34 #include <asm/processor.h>
35 #include <asm/types.h>
36 #include <asm/msr.h>
37 #include <asm/spinlock.h>
38 #include <asm/paging.h>
39 #include <asm/p2m.h>
40 #include <asm/hvm/hvm.h>
41 #include <asm/hvm/support.h>
42 #include <asm/hvm/vmx/vmx.h>
43 #include <asm/hvm/vmx/vmcs.h>
44 #include <asm/hvm/vmx/cpu.h>
45 #include <public/sched.h>
46 #include <public/hvm/ioreq.h>
47 #include <asm/hvm/vpic.h>
48 #include <asm/hvm/vlapic.h>
49 #include <asm/x86_emulate.h>
50 #include <asm/hvm/vpt.h>
51 #include <public/hvm/save.h>
52 #include <asm/hvm/trace.h>
54 enum handler_return { HNDL_done, HNDL_unhandled, HNDL_exception_raised };
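/*
 * Return values of the MSR access helpers below: HNDL_done means the access
 * was emulated here and execution can continue past the instruction;
 * HNDL_unhandled tells the caller that this helper did not handle the MSR;
 * HNDL_exception_raised means a fault has already been injected, so the
 * guest RIP must not be advanced.
 */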
56 static void vmx_ctxt_switch_from(struct vcpu *v);
57 static void vmx_ctxt_switch_to(struct vcpu *v);
59 static int vmx_alloc_vlapic_mapping(struct domain *d);
60 static void vmx_free_vlapic_mapping(struct domain *d);
61 static void vmx_install_vlapic_mapping(struct vcpu *v);
62 static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr);
63 static void vmx_update_guest_efer(struct vcpu *v);
65 static int vmx_domain_initialise(struct domain *d)
66 {
67 return vmx_alloc_vlapic_mapping(d);
68 }
70 static void vmx_domain_destroy(struct domain *d)
71 {
72 vmx_free_vlapic_mapping(d);
73 }
75 static int vmx_vcpu_initialise(struct vcpu *v)
76 {
77 int rc;
79 spin_lock_init(&v->arch.hvm_vmx.vmcs_lock);
81 v->arch.schedule_tail = vmx_do_resume;
82 v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
83 v->arch.ctxt_switch_to = vmx_ctxt_switch_to;
85 if ( (rc = vmx_create_vmcs(v)) != 0 )
86 {
87 dprintk(XENLOG_WARNING,
88 "Failed to create VMCS for vcpu %d: err=%d.\n",
89 v->vcpu_id, rc);
90 return rc;
91 }
93 vmx_install_vlapic_mapping(v);
95 return 0;
96 }
98 static void vmx_vcpu_destroy(struct vcpu *v)
99 {
100 vmx_destroy_vmcs(v);
101 }
103 #ifdef __x86_64__
105 static DEFINE_PER_CPU(struct vmx_msr_state, host_msr_state);
107 static u32 msr_index[VMX_MSR_COUNT] =
108 {
109 MSR_LSTAR, MSR_STAR, MSR_SYSCALL_MASK
110 };
112 static void vmx_save_host_msrs(void)
113 {
114 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
115 int i;
117 for ( i = 0; i < VMX_MSR_COUNT; i++ )
118 rdmsrl(msr_index[i], host_msr_state->msrs[i]);
119 }
121 #define WRITE_MSR(address) \
122 guest_msr_state->msrs[VMX_INDEX_MSR_ ## address] = msr_content; \
123 set_bit(VMX_INDEX_MSR_ ## address, &guest_msr_state->flags); \
124 wrmsrl(MSR_ ## address, msr_content); \
125 set_bit(VMX_INDEX_MSR_ ## address, &host_msr_state->flags); \
126 break
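/*
 * WRITE_MSR(X) records the guest's value for MSR_X in the per-vcpu
 * msr_state, marks the MSR dirty in both the guest and host flag bitmaps
 * (so context switches know to reload and restore it), writes the hardware
 * MSR, and ends with "break" so it can stand as the tail of a switch case.
 * For example, WRITE_MSR(LSTAR) expands to:
 *
 *     guest_msr_state->msrs[VMX_INDEX_MSR_LSTAR] = msr_content;
 *     set_bit(VMX_INDEX_MSR_LSTAR, &guest_msr_state->flags);
 *     wrmsrl(MSR_LSTAR, msr_content);
 *     set_bit(VMX_INDEX_MSR_LSTAR, &host_msr_state->flags);
 *     break;
 */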
128 static enum handler_return long_mode_do_msr_read(struct cpu_user_regs *regs)
129 {
130 u64 msr_content = 0;
131 u32 ecx = regs->ecx;
132 struct vcpu *v = current;
133 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
135 switch ( ecx )
136 {
137 case MSR_EFER:
138 msr_content = v->arch.hvm_vcpu.guest_efer;
139 break;
141 case MSR_FS_BASE:
142 msr_content = __vmread(GUEST_FS_BASE);
143 goto check_long_mode;
145 case MSR_GS_BASE:
146 msr_content = __vmread(GUEST_GS_BASE);
147 goto check_long_mode;
149 case MSR_SHADOW_GS_BASE:
150 msr_content = v->arch.hvm_vmx.shadow_gs;
151 check_long_mode:
152 if ( !(hvm_long_mode_enabled(v)) )
153 {
154 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
155 return HNDL_exception_raised;
156 }
157 break;
159 case MSR_STAR:
160 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_STAR];
161 break;
163 case MSR_LSTAR:
164 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_LSTAR];
165 break;
167 case MSR_CSTAR:
168 msr_content = v->arch.hvm_vmx.cstar;
169 break;
171 case MSR_SYSCALL_MASK:
172 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
173 break;
175 default:
176 return HNDL_unhandled;
177 }
179 HVM_DBG_LOG(DBG_LEVEL_0, "msr 0x%x content 0x%"PRIx64, ecx, msr_content);
181 regs->eax = (u32)(msr_content >> 0);
182 regs->edx = (u32)(msr_content >> 32);
184 return HNDL_done;
185 }
187 static enum handler_return long_mode_do_msr_write(struct cpu_user_regs *regs)
188 {
189 u64 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
190 u32 ecx = regs->ecx;
191 struct vcpu *v = current;
192 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
193 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
195 HVM_DBG_LOG(DBG_LEVEL_0, "msr 0x%x content 0x%"PRIx64, ecx, msr_content);
197 switch ( ecx )
198 {
199 case MSR_EFER:
200 if ( !hvm_set_efer(msr_content) )
201 goto exception_raised;
202 break;
204 case MSR_FS_BASE:
205 case MSR_GS_BASE:
206 case MSR_SHADOW_GS_BASE:
207 if ( !hvm_long_mode_enabled(v) )
208 goto gp_fault;
210 if ( !is_canonical_address(msr_content) )
211 goto uncanonical_address;
213 if ( ecx == MSR_FS_BASE )
214 __vmwrite(GUEST_FS_BASE, msr_content);
215 else if ( ecx == MSR_GS_BASE )
216 __vmwrite(GUEST_GS_BASE, msr_content);
217 else
218 {
219 v->arch.hvm_vmx.shadow_gs = msr_content;
220 wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
221 }
223 break;
225 case MSR_STAR:
226 WRITE_MSR(STAR);
228 case MSR_LSTAR:
229 if ( !is_canonical_address(msr_content) )
230 goto uncanonical_address;
231 WRITE_MSR(LSTAR);
233 case MSR_CSTAR:
234 if ( !is_canonical_address(msr_content) )
235 goto uncanonical_address;
236 v->arch.hvm_vmx.cstar = msr_content;
237 break;
239 case MSR_SYSCALL_MASK:
240 WRITE_MSR(SYSCALL_MASK);
242 default:
243 return HNDL_unhandled;
244 }
246 return HNDL_done;
248 uncanonical_address:
249 HVM_DBG_LOG(DBG_LEVEL_0, "Non-canonical address for msr write %x", ecx);
250 gp_fault:
251 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
252 exception_raised:
253 return HNDL_exception_raised;
254 }
256 /*
257 * To avoid MSR save/restore at every VM exit/entry time, we restore
258 * the x86_64 specific MSRs at domain switch time. Since these MSRs
259 * are not modified once set for para domains, we don't save them,
260 * but simply reset them to values set in percpu_traps_init().
261 */
262 static void vmx_restore_host_msrs(void)
263 {
264 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
265 int i;
267 while ( host_msr_state->flags )
268 {
269 i = find_first_set_bit(host_msr_state->flags);
270 wrmsrl(msr_index[i], host_msr_state->msrs[i]);
271 clear_bit(i, &host_msr_state->flags);
272 }
274 if ( cpu_has_nx && !(read_efer() & EFER_NX) )
275 write_efer(read_efer() | EFER_NX);
276 }
278 static void vmx_save_guest_msrs(struct vcpu *v)
279 {
280 /* MSR_SHADOW_GS_BASE may have been changed by swapgs instruction. */
281 rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
282 }
284 static void vmx_restore_guest_msrs(struct vcpu *v)
285 {
286 struct vmx_msr_state *guest_msr_state, *host_msr_state;
287 unsigned long guest_flags;
288 int i;
290 guest_msr_state = &v->arch.hvm_vmx.msr_state;
291 host_msr_state = &this_cpu(host_msr_state);
293 wrmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
295 guest_flags = guest_msr_state->flags;
297 while ( guest_flags )
298 {
299 i = find_first_set_bit(guest_flags);
301 HVM_DBG_LOG(DBG_LEVEL_2,
302 "restore guest's index %d msr %x with value %lx",
303 i, msr_index[i], guest_msr_state->msrs[i]);
304 set_bit(i, &host_msr_state->flags);
305 wrmsrl(msr_index[i], guest_msr_state->msrs[i]);
306 clear_bit(i, &guest_flags);
307 }
309 if ( (v->arch.hvm_vcpu.guest_efer ^ read_efer()) & (EFER_NX | EFER_SCE) )
310 {
311 HVM_DBG_LOG(DBG_LEVEL_2,
312 "restore guest's EFER with value %lx",
313 v->arch.hvm_vcpu.guest_efer);
314 write_efer((read_efer() & ~(EFER_NX | EFER_SCE)) |
315 (v->arch.hvm_vcpu.guest_efer & (EFER_NX | EFER_SCE)));
316 }
317 }
319 #else /* __i386__ */
321 #define vmx_save_host_msrs() ((void)0)
323 static void vmx_restore_host_msrs(void)
324 {
325 if ( cpu_has_nx && !(read_efer() & EFER_NX) )
326 write_efer(read_efer() | EFER_NX);
327 }
329 #define vmx_save_guest_msrs(v) ((void)0)
331 static void vmx_restore_guest_msrs(struct vcpu *v)
332 {
333 if ( (v->arch.hvm_vcpu.guest_efer ^ read_efer()) & EFER_NX )
334 {
335 HVM_DBG_LOG(DBG_LEVEL_2,
336 "restore guest's EFER with value %lx",
337 v->arch.hvm_vcpu.guest_efer);
338 write_efer((read_efer() & ~EFER_NX) |
339 (v->arch.hvm_vcpu.guest_efer & EFER_NX));
340 }
341 }
343 static enum handler_return long_mode_do_msr_read(struct cpu_user_regs *regs)
344 {
345 u64 msr_content = 0;
346 struct vcpu *v = current;
348 switch ( regs->ecx )
349 {
350 case MSR_EFER:
351 msr_content = v->arch.hvm_vcpu.guest_efer;
352 break;
354 default:
355 return HNDL_unhandled;
356 }
358 regs->eax = msr_content >> 0;
359 regs->edx = msr_content >> 32;
361 return HNDL_done;
362 }
364 static enum handler_return long_mode_do_msr_write(struct cpu_user_regs *regs)
365 {
366 u64 msr_content = regs->eax | ((u64)regs->edx << 32);
368 switch ( regs->ecx )
369 {
370 case MSR_EFER:
371 if ( !hvm_set_efer(msr_content) )
372 return HNDL_exception_raised;
373 break;
375 default:
376 return HNDL_unhandled;
377 }
379 return HNDL_done;
380 }
382 #endif /* __i386__ */
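/*
 * Report the guest's current execution mode as an operand/address width:
 * 0 for real mode (CR0.PE clear), 1 for virtual-8086 mode, 8 for 64-bit
 * mode (long mode with CS.L set), otherwise 4 or 2 according to the CS
 * default operation size.
 */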
384 static int vmx_guest_x86_mode(struct vcpu *v)
385 {
386 unsigned int cs_ar_bytes;
388 if ( unlikely(!(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE)) )
389 return 0;
390 if ( unlikely(guest_cpu_user_regs()->eflags & X86_EFLAGS_VM) )
391 return 1;
392 cs_ar_bytes = __vmread(GUEST_CS_AR_BYTES);
393 if ( hvm_long_mode_enabled(v) &&
394 likely(cs_ar_bytes & X86_SEG_AR_CS_LM_ACTIVE) )
395 return 8;
396 return (likely(cs_ar_bytes & X86_SEG_AR_DEF_OP_SIZE) ? 4 : 2);
397 }
399 static void vmx_save_dr(struct vcpu *v)
400 {
401 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
402 return;
404 /* Clear the DR dirty flag and re-enable intercepts for DR accesses. */
405 v->arch.hvm_vcpu.flag_dr_dirty = 0;
406 v->arch.hvm_vmx.exec_control |= CPU_BASED_MOV_DR_EXITING;
407 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
409 v->arch.guest_context.debugreg[0] = read_debugreg(0);
410 v->arch.guest_context.debugreg[1] = read_debugreg(1);
411 v->arch.guest_context.debugreg[2] = read_debugreg(2);
412 v->arch.guest_context.debugreg[3] = read_debugreg(3);
413 v->arch.guest_context.debugreg[6] = read_debugreg(6);
414 /* DR7 must be saved as it is used by vmx_restore_dr(). */
415 v->arch.guest_context.debugreg[7] = __vmread(GUEST_DR7);
416 }
418 static void __restore_debug_registers(struct vcpu *v)
419 {
420 ASSERT(!v->arch.hvm_vcpu.flag_dr_dirty);
421 v->arch.hvm_vcpu.flag_dr_dirty = 1;
423 write_debugreg(0, v->arch.guest_context.debugreg[0]);
424 write_debugreg(1, v->arch.guest_context.debugreg[1]);
425 write_debugreg(2, v->arch.guest_context.debugreg[2]);
426 write_debugreg(3, v->arch.guest_context.debugreg[3]);
427 write_debugreg(6, v->arch.guest_context.debugreg[6]);
428 /* DR7 is loaded from the VMCS. */
429 }
431 /*
432 * DR7 is saved and restored on every vmexit. Other debug registers only
433 * need to be restored if their value is going to affect execution -- i.e.,
434 * if one of the breakpoints is enabled. So mask out all bits that don't
435 * enable some breakpoint functionality.
436 */
437 #define DR7_ACTIVE_MASK 0xff
439 static void vmx_restore_dr(struct vcpu *v)
440 {
441 /* NB. __vmread() is not usable here, so we cannot read from the VMCS. */
442 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
443 __restore_debug_registers(v);
444 }
446 void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
447 {
448 uint32_t ev;
450 vmx_vmcs_enter(v);
452 c->cr0 = v->arch.hvm_vcpu.guest_cr[0];
453 c->cr2 = v->arch.hvm_vcpu.guest_cr[2];
454 c->cr3 = v->arch.hvm_vcpu.guest_cr[3];
455 c->cr4 = v->arch.hvm_vcpu.guest_cr[4];
457 c->msr_efer = v->arch.hvm_vcpu.guest_efer;
459 c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
460 c->idtr_base = __vmread(GUEST_IDTR_BASE);
462 c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
463 c->gdtr_base = __vmread(GUEST_GDTR_BASE);
465 c->cs_sel = __vmread(GUEST_CS_SELECTOR);
466 c->cs_limit = __vmread(GUEST_CS_LIMIT);
467 c->cs_base = __vmread(GUEST_CS_BASE);
468 c->cs_arbytes = __vmread(GUEST_CS_AR_BYTES);
470 c->ds_sel = __vmread(GUEST_DS_SELECTOR);
471 c->ds_limit = __vmread(GUEST_DS_LIMIT);
472 c->ds_base = __vmread(GUEST_DS_BASE);
473 c->ds_arbytes = __vmread(GUEST_DS_AR_BYTES);
475 c->es_sel = __vmread(GUEST_ES_SELECTOR);
476 c->es_limit = __vmread(GUEST_ES_LIMIT);
477 c->es_base = __vmread(GUEST_ES_BASE);
478 c->es_arbytes = __vmread(GUEST_ES_AR_BYTES);
480 c->ss_sel = __vmread(GUEST_SS_SELECTOR);
481 c->ss_limit = __vmread(GUEST_SS_LIMIT);
482 c->ss_base = __vmread(GUEST_SS_BASE);
483 c->ss_arbytes = __vmread(GUEST_SS_AR_BYTES);
485 c->fs_sel = __vmread(GUEST_FS_SELECTOR);
486 c->fs_limit = __vmread(GUEST_FS_LIMIT);
487 c->fs_base = __vmread(GUEST_FS_BASE);
488 c->fs_arbytes = __vmread(GUEST_FS_AR_BYTES);
490 c->gs_sel = __vmread(GUEST_GS_SELECTOR);
491 c->gs_limit = __vmread(GUEST_GS_LIMIT);
492 c->gs_base = __vmread(GUEST_GS_BASE);
493 c->gs_arbytes = __vmread(GUEST_GS_AR_BYTES);
495 c->tr_sel = __vmread(GUEST_TR_SELECTOR);
496 c->tr_limit = __vmread(GUEST_TR_LIMIT);
497 c->tr_base = __vmread(GUEST_TR_BASE);
498 c->tr_arbytes = __vmread(GUEST_TR_AR_BYTES);
500 c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR);
501 c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT);
502 c->ldtr_base = __vmread(GUEST_LDTR_BASE);
503 c->ldtr_arbytes = __vmread(GUEST_LDTR_AR_BYTES);
505 c->sysenter_cs = __vmread(GUEST_SYSENTER_CS);
506 c->sysenter_esp = __vmread(GUEST_SYSENTER_ESP);
507 c->sysenter_eip = __vmread(GUEST_SYSENTER_EIP);
509 c->pending_event = 0;
510 c->error_code = 0;
511 if ( ((ev = __vmread(VM_ENTRY_INTR_INFO)) & INTR_INFO_VALID_MASK) &&
512 hvm_event_needs_reinjection((ev >> 8) & 7, ev & 0xff) )
513 {
514 c->pending_event = ev;
515 c->error_code = __vmread(VM_ENTRY_EXCEPTION_ERROR_CODE);
516 }
518 vmx_vmcs_exit(v);
519 }
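/*
 * Validate the (cr0, cr3) pair being loaded: if paging is enabled, CR3 must
 * point at RAM in the p2m and a reference is taken on the new top-level
 * pagetable page; the reference on the previous top-level page is dropped.
 */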
521 static int vmx_restore_cr0_cr3(
522 struct vcpu *v, unsigned long cr0, unsigned long cr3)
523 {
524 unsigned long mfn = 0;
525 p2m_type_t p2mt;
527 if ( cr0 & X86_CR0_PG )
528 {
529 mfn = mfn_x(gfn_to_mfn(v->domain, cr3 >> PAGE_SHIFT, &p2mt));
530 if ( !p2m_is_ram(p2mt) || !get_page(mfn_to_page(mfn), v->domain) )
531 {
532 gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%lx\n", cr3);
533 return -EINVAL;
534 }
535 }
537 if ( v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PG )
538 put_page(pagetable_get_page(v->arch.guest_table));
540 v->arch.guest_table = pagetable_from_pfn(mfn);
542 v->arch.hvm_vcpu.guest_cr[0] = cr0 | X86_CR0_ET;
543 v->arch.hvm_vcpu.guest_cr[3] = cr3;
545 return 0;
546 }
548 int vmx_vmcs_restore(struct vcpu *v, struct hvm_hw_cpu *c)
549 {
550 int rc;
552 if ( c->pending_valid &&
553 ((c->pending_type == 1) || (c->pending_type > 6) ||
554 (c->pending_reserved != 0)) )
555 {
556 gdprintk(XENLOG_ERR, "Invalid pending event 0x%"PRIx32".\n",
557 c->pending_event);
558 return -EINVAL;
559 }
561 rc = vmx_restore_cr0_cr3(v, c->cr0, c->cr3);
562 if ( rc )
563 return rc;
565 vmx_vmcs_enter(v);
567 v->arch.hvm_vcpu.guest_cr[2] = c->cr2;
568 v->arch.hvm_vcpu.guest_cr[4] = c->cr4;
569 vmx_update_guest_cr(v, 0);
570 vmx_update_guest_cr(v, 2);
571 vmx_update_guest_cr(v, 4);
573 #ifdef HVM_DEBUG_SUSPEND
574 printk("%s: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
575 __func__, c->cr3, c->cr0, c->cr4);
576 #endif
578 v->arch.hvm_vcpu.guest_efer = c->msr_efer;
579 vmx_update_guest_efer(v);
581 __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
582 __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
584 __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
585 __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
587 __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
588 __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
589 __vmwrite(GUEST_CS_BASE, c->cs_base);
590 __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes);
592 __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
593 __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
594 __vmwrite(GUEST_DS_BASE, c->ds_base);
595 __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes);
597 __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
598 __vmwrite(GUEST_ES_LIMIT, c->es_limit);
599 __vmwrite(GUEST_ES_BASE, c->es_base);
600 __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes);
602 __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
603 __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
604 __vmwrite(GUEST_SS_BASE, c->ss_base);
605 __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes);
607 __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
608 __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
609 __vmwrite(GUEST_FS_BASE, c->fs_base);
610 __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes);
612 __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
613 __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
614 __vmwrite(GUEST_GS_BASE, c->gs_base);
615 __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes);
617 __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
618 __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
619 __vmwrite(GUEST_TR_BASE, c->tr_base);
620 __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes);
622 __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
623 __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
624 __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
625 __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes);
627 __vmwrite(GUEST_SYSENTER_CS, c->sysenter_cs);
628 __vmwrite(GUEST_SYSENTER_ESP, c->sysenter_esp);
629 __vmwrite(GUEST_SYSENTER_EIP, c->sysenter_eip);
631 __vmwrite(GUEST_DR7, c->dr7);
633 vmx_vmcs_exit(v);
635 paging_update_paging_modes(v);
637 if ( c->pending_valid )
638 {
639 gdprintk(XENLOG_INFO, "Re-injecting 0x%"PRIx32", 0x%"PRIx32"\n",
640 c->pending_event, c->error_code);
642 if ( hvm_event_needs_reinjection(c->pending_type, c->pending_vector) )
643 {
644 vmx_vmcs_enter(v);
645 __vmwrite(VM_ENTRY_INTR_INFO, c->pending_event);
646 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, c->error_code);
647 vmx_vmcs_exit(v);
648 }
649 }
651 return 0;
652 }
654 #if defined(__x86_64__) && defined(HVM_DEBUG_SUSPEND)
655 static void dump_msr_state(struct vmx_msr_state *m)
656 {
657 int i = 0;
658 printk("**** msr state ****\n");
659 printk("shadow_gs=0x%lx, flags=0x%lx, msr_items:", m->shadow_gs, m->flags);
660 for ( i = 0; i < VMX_MSR_COUNT; i++ )
661 printk("0x%lx,", m->msrs[i]);
662 printk("\n");
663 }
664 #else
665 #define dump_msr_state(m) ((void)0)
666 #endif
668 static void vmx_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
669 {
670 #ifdef __x86_64__
671 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
672 unsigned long guest_flags = guest_state->flags;
674 data->shadow_gs = v->arch.hvm_vmx.shadow_gs;
675 data->msr_cstar = v->arch.hvm_vmx.cstar;
677 /* save msrs */
678 data->msr_flags = guest_flags;
679 data->msr_lstar = guest_state->msrs[VMX_INDEX_MSR_LSTAR];
680 data->msr_star = guest_state->msrs[VMX_INDEX_MSR_STAR];
681 data->msr_syscall_mask = guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
682 #endif
684 data->tsc = hvm_get_guest_time(v);
686 dump_msr_state(guest_state);
687 }
689 static void vmx_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
690 {
691 #ifdef __x86_64__
692 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
694 /* restore msrs */
695 guest_state->flags = data->msr_flags;
696 guest_state->msrs[VMX_INDEX_MSR_LSTAR] = data->msr_lstar;
697 guest_state->msrs[VMX_INDEX_MSR_STAR] = data->msr_star;
698 guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK] = data->msr_syscall_mask;
700 v->arch.hvm_vmx.cstar = data->msr_cstar;
701 v->arch.hvm_vmx.shadow_gs = data->shadow_gs;
702 #endif
704 v->arch.hvm_vmx.vmxassist_enabled = !(data->cr0 & X86_CR0_PE);
706 hvm_set_guest_time(v, data->tsc);
708 dump_msr_state(guest_state);
709 }
712 static void vmx_save_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
713 {
714 vmx_save_cpu_state(v, ctxt);
715 vmx_vmcs_save(v, ctxt);
716 }
718 static int vmx_load_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
719 {
720 vmx_load_cpu_state(v, ctxt);
722 if ( vmx_vmcs_restore(v, ctxt) )
723 {
724 gdprintk(XENLOG_ERR, "vmx_vmcs restore failed!\n");
725 domain_crash(v->domain);
726 return -EINVAL;
727 }
729 return 0;
730 }
732 static void vmx_ctxt_switch_from(struct vcpu *v)
733 {
734 vmx_save_guest_msrs(v);
735 vmx_restore_host_msrs();
736 vmx_save_dr(v);
737 }
739 static void vmx_ctxt_switch_to(struct vcpu *v)
740 {
741 /* HOST_CR4 in VMCS is always mmu_cr4_features. Sync CR4 now. */
742 if ( unlikely(read_cr4() != mmu_cr4_features) )
743 write_cr4(mmu_cr4_features);
745 vmx_restore_guest_msrs(v);
746 vmx_restore_dr(v);
747 }
749 static unsigned long vmx_get_segment_base(struct vcpu *v, enum x86_segment seg)
750 {
751 unsigned long base = 0;
752 int long_mode = 0;
754 ASSERT(v == current);
756 if ( hvm_long_mode_enabled(v) &&
757 (__vmread(GUEST_CS_AR_BYTES) & X86_SEG_AR_CS_LM_ACTIVE) )
758 long_mode = 1;
760 switch ( seg )
761 {
762 case x86_seg_cs: if ( !long_mode ) base = __vmread(GUEST_CS_BASE); break;
763 case x86_seg_ds: if ( !long_mode ) base = __vmread(GUEST_DS_BASE); break;
764 case x86_seg_es: if ( !long_mode ) base = __vmread(GUEST_ES_BASE); break;
765 case x86_seg_fs: base = __vmread(GUEST_FS_BASE); break;
766 case x86_seg_gs: base = __vmread(GUEST_GS_BASE); break;
767 case x86_seg_ss: if ( !long_mode ) base = __vmread(GUEST_SS_BASE); break;
768 case x86_seg_tr: base = __vmread(GUEST_TR_BASE); break;
769 case x86_seg_gdtr: base = __vmread(GUEST_GDTR_BASE); break;
770 case x86_seg_idtr: base = __vmread(GUEST_IDTR_BASE); break;
771 case x86_seg_ldtr: base = __vmread(GUEST_LDTR_BASE); break;
772 default: BUG(); break;
773 }
775 return base;
776 }
778 static void vmx_get_segment_register(struct vcpu *v, enum x86_segment seg,
779 struct segment_register *reg)
780 {
781 uint32_t attr = 0;
783 ASSERT(v == current);
785 switch ( seg )
786 {
787 case x86_seg_cs:
788 reg->sel = __vmread(GUEST_CS_SELECTOR);
789 reg->limit = __vmread(GUEST_CS_LIMIT);
790 reg->base = __vmread(GUEST_CS_BASE);
791 attr = __vmread(GUEST_CS_AR_BYTES);
792 break;
793 case x86_seg_ds:
794 reg->sel = __vmread(GUEST_DS_SELECTOR);
795 reg->limit = __vmread(GUEST_DS_LIMIT);
796 reg->base = __vmread(GUEST_DS_BASE);
797 attr = __vmread(GUEST_DS_AR_BYTES);
798 break;
799 case x86_seg_es:
800 reg->sel = __vmread(GUEST_ES_SELECTOR);
801 reg->limit = __vmread(GUEST_ES_LIMIT);
802 reg->base = __vmread(GUEST_ES_BASE);
803 attr = __vmread(GUEST_ES_AR_BYTES);
804 break;
805 case x86_seg_fs:
806 reg->sel = __vmread(GUEST_FS_SELECTOR);
807 reg->limit = __vmread(GUEST_FS_LIMIT);
808 reg->base = __vmread(GUEST_FS_BASE);
809 attr = __vmread(GUEST_FS_AR_BYTES);
810 break;
811 case x86_seg_gs:
812 reg->sel = __vmread(GUEST_GS_SELECTOR);
813 reg->limit = __vmread(GUEST_GS_LIMIT);
814 reg->base = __vmread(GUEST_GS_BASE);
815 attr = __vmread(GUEST_GS_AR_BYTES);
816 break;
817 case x86_seg_ss:
818 reg->sel = __vmread(GUEST_SS_SELECTOR);
819 reg->limit = __vmread(GUEST_SS_LIMIT);
820 reg->base = __vmread(GUEST_SS_BASE);
821 attr = __vmread(GUEST_SS_AR_BYTES);
822 break;
823 case x86_seg_tr:
824 reg->sel = __vmread(GUEST_TR_SELECTOR);
825 reg->limit = __vmread(GUEST_TR_LIMIT);
826 reg->base = __vmread(GUEST_TR_BASE);
827 attr = __vmread(GUEST_TR_AR_BYTES);
828 break;
829 case x86_seg_gdtr:
830 reg->limit = __vmread(GUEST_GDTR_LIMIT);
831 reg->base = __vmread(GUEST_GDTR_BASE);
832 break;
833 case x86_seg_idtr:
834 reg->limit = __vmread(GUEST_IDTR_LIMIT);
835 reg->base = __vmread(GUEST_IDTR_BASE);
836 break;
837 case x86_seg_ldtr:
838 reg->sel = __vmread(GUEST_LDTR_SELECTOR);
839 reg->limit = __vmread(GUEST_LDTR_LIMIT);
840 reg->base = __vmread(GUEST_LDTR_BASE);
841 attr = __vmread(GUEST_LDTR_AR_BYTES);
842 break;
843 default:
844 BUG();
845 }
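/*
 * The VMCS access-rights word keeps the descriptor attribute bits split
 * across bits 0-7 (type/S/DPL/P) and 12-15 (AVL/L/D-B/G); the line below
 * repacks them into the contiguous 12-bit attr format used by struct
 * segment_register (vmx_set_segment_register performs the inverse).
 * Worked example: a flat 32-bit code segment with AR bytes 0xc09b becomes
 * attr.bytes 0xc9b.
 */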
847 reg->attr.bytes = (attr & 0xff) | ((attr >> 4) & 0xf00);
848 /* Unusable flag is folded into Present flag. */
849 if ( attr & (1u<<16) )
850 reg->attr.fields.p = 0;
851 }
853 static void vmx_set_segment_register(struct vcpu *v, enum x86_segment seg,
854 struct segment_register *reg)
855 {
856 uint32_t attr;
858 ASSERT(v == current);
860 attr = reg->attr.bytes;
861 attr = ((attr & 0xf00) << 4) | (attr & 0xff);
863 /* Not-present must mean unusable. */
864 if ( !reg->attr.fields.p )
865 attr |= (1u << 16);
867 switch ( seg )
868 {
869 case x86_seg_cs:
870 __vmwrite(GUEST_CS_SELECTOR, reg->sel);
871 __vmwrite(GUEST_CS_LIMIT, reg->limit);
872 __vmwrite(GUEST_CS_BASE, reg->base);
873 __vmwrite(GUEST_CS_AR_BYTES, attr);
874 break;
875 case x86_seg_ds:
876 __vmwrite(GUEST_DS_SELECTOR, reg->sel);
877 __vmwrite(GUEST_DS_LIMIT, reg->limit);
878 __vmwrite(GUEST_DS_BASE, reg->base);
879 __vmwrite(GUEST_DS_AR_BYTES, attr);
880 break;
881 case x86_seg_es:
882 __vmwrite(GUEST_ES_SELECTOR, reg->sel);
883 __vmwrite(GUEST_ES_LIMIT, reg->limit);
884 __vmwrite(GUEST_ES_BASE, reg->base);
885 __vmwrite(GUEST_ES_AR_BYTES, attr);
886 break;
887 case x86_seg_fs:
888 __vmwrite(GUEST_FS_SELECTOR, reg->sel);
889 __vmwrite(GUEST_FS_LIMIT, reg->limit);
890 __vmwrite(GUEST_FS_BASE, reg->base);
891 __vmwrite(GUEST_FS_AR_BYTES, attr);
892 break;
893 case x86_seg_gs:
894 __vmwrite(GUEST_GS_SELECTOR, reg->sel);
895 __vmwrite(GUEST_GS_LIMIT, reg->limit);
896 __vmwrite(GUEST_GS_BASE, reg->base);
897 __vmwrite(GUEST_GS_AR_BYTES, attr);
898 break;
899 case x86_seg_ss:
900 __vmwrite(GUEST_SS_SELECTOR, reg->sel);
901 __vmwrite(GUEST_SS_LIMIT, reg->limit);
902 __vmwrite(GUEST_SS_BASE, reg->base);
903 __vmwrite(GUEST_SS_AR_BYTES, attr);
904 break;
905 case x86_seg_tr:
906 __vmwrite(GUEST_TR_SELECTOR, reg->sel);
907 __vmwrite(GUEST_TR_LIMIT, reg->limit);
908 __vmwrite(GUEST_TR_BASE, reg->base);
909 __vmwrite(GUEST_TR_AR_BYTES, attr);
910 break;
911 case x86_seg_gdtr:
912 __vmwrite(GUEST_GDTR_LIMIT, reg->limit);
913 __vmwrite(GUEST_GDTR_BASE, reg->base);
914 break;
915 case x86_seg_idtr:
916 __vmwrite(GUEST_IDTR_LIMIT, reg->limit);
917 __vmwrite(GUEST_IDTR_BASE, reg->base);
918 break;
919 case x86_seg_ldtr:
920 __vmwrite(GUEST_LDTR_SELECTOR, reg->sel);
921 __vmwrite(GUEST_LDTR_LIMIT, reg->limit);
922 __vmwrite(GUEST_LDTR_BASE, reg->base);
923 __vmwrite(GUEST_LDTR_AR_BYTES, attr);
924 break;
925 default:
926 BUG();
927 }
928 }
930 /* Make sure that Xen intercepts any FP accesses from the current vcpu. */
931 static void vmx_stts(struct vcpu *v)
932 {
933 /* VMX depends on operating on the current vcpu */
934 ASSERT(v == current);
936 /*
937 * If the guest does not have TS enabled then we must cause and handle an
938 * exception on first use of the FPU. If the guest *does* have TS enabled
939 * then this is not necessary: no FPU activity can occur until the guest
940 * clears CR0.TS, and we will initialise the FPU when that happens.
941 */
942 if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
943 {
944 v->arch.hvm_vcpu.hw_cr[0] |= X86_CR0_TS;
945 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
946 __vm_set_bit(EXCEPTION_BITMAP, TRAP_no_device);
947 }
948 }
950 static void vmx_set_tsc_offset(struct vcpu *v, u64 offset)
951 {
952 vmx_vmcs_enter(v);
953 __vmwrite(TSC_OFFSET, offset);
954 #if defined (__i386__)
955 __vmwrite(TSC_OFFSET_HIGH, offset >> 32);
956 #endif
957 vmx_vmcs_exit(v);
958 }
960 static void vmx_init_ap_context(
961 struct vcpu_guest_context *ctxt, int vcpuid, int trampoline_vector)
962 {
963 memset(ctxt, 0, sizeof(*ctxt));
964 ctxt->user_regs.eip = VMXASSIST_BASE;
965 ctxt->user_regs.edx = vcpuid;
966 ctxt->user_regs.ebx = trampoline_vector;
967 }
969 void do_nmi(struct cpu_user_regs *);
971 static void vmx_init_hypercall_page(struct domain *d, void *hypercall_page)
972 {
973 char *p;
974 int i;
976 for ( i = 0; i < (PAGE_SIZE / 32); i++ )
977 {
978 p = (char *)(hypercall_page + (i * 32));
979 *(u8 *)(p + 0) = 0xb8; /* mov imm32, %eax */
980 *(u32 *)(p + 1) = i;
981 *(u8 *)(p + 5) = 0x0f; /* vmcall */
982 *(u8 *)(p + 6) = 0x01;
983 *(u8 *)(p + 7) = 0xc1;
984 *(u8 *)(p + 8) = 0xc3; /* ret */
985 }
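/*
 * Each 32-byte slot written above assembles to (illustrative disassembly):
 *     mov  $i, %eax
 *     vmcall
 *     ret
 * so a guest invokes hypercall i by calling into the page at offset i*32.
 */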
987 /* Don't support HYPERVISOR_iret at the moment */
988 *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
989 }
991 static enum hvm_intblk vmx_interrupt_blocked(
992 struct vcpu *v, struct hvm_intack intack)
993 {
994 unsigned long intr_shadow;
996 intr_shadow = __vmread(GUEST_INTERRUPTIBILITY_INFO);
998 if ( intr_shadow & (VMX_INTR_SHADOW_STI|VMX_INTR_SHADOW_MOV_SS) )
999 return hvm_intblk_shadow;
1001 if ( intack.source == hvm_intsrc_nmi )
1002 return ((intr_shadow & VMX_INTR_SHADOW_NMI) ?
1003 hvm_intblk_nmi_iret : hvm_intblk_none);
1005 ASSERT((intack.source == hvm_intsrc_pic) ||
1006 (intack.source == hvm_intsrc_lapic));
1008 if ( !(guest_cpu_user_regs()->eflags & X86_EFLAGS_IF) )
1009 return hvm_intblk_rflags_ie;
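/*
 * A fixed LAPIC interrupt is blocked while its priority class (vector[7:4])
 * does not exceed the current task-priority class: e.g. with TASKPRI 0x58
 * every vector up to 0x5f is held back.
 */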
1011 if ( intack.source == hvm_intsrc_lapic )
1012 {
1013 uint32_t tpr = vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI) & 0xF0;
1014 if ( (tpr >> 4) >= (intack.vector >> 4) )
1015 return hvm_intblk_tpr;
1016 }
1018 return hvm_intblk_none;
1019 }
1021 static void vmx_update_host_cr3(struct vcpu *v)
1022 {
1023 ASSERT((v == current) || !vcpu_runnable(v));
1024 vmx_vmcs_enter(v);
1025 __vmwrite(HOST_CR3, v->arch.cr3);
1026 vmx_vmcs_exit(v);
1027 }
1029 static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr)
1031 ASSERT((v == current) || !vcpu_runnable(v));
1033 vmx_vmcs_enter(v);
1035 switch ( cr )
1037 case 0:
1038 /* TS cleared? Then initialise FPU now. */
1039 if ( (v == current) && !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) &&
1040 (v->arch.hvm_vcpu.hw_cr[0] & X86_CR0_TS) )
1042 setup_fpu(v);
1043 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
1046 v->arch.hvm_vcpu.hw_cr[0] =
1047 v->arch.hvm_vcpu.guest_cr[0] |
1048 X86_CR0_PE | X86_CR0_NE | X86_CR0_PG | X86_CR0_WP;
1049 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
1050 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[0]);
1051 break;
1052 case 2:
1053 /* CR2 is updated in exit stub. */
1054 break;
1055 case 3:
1056 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr[3]);
1057 break;
1058 case 4:
1059 v->arch.hvm_vcpu.hw_cr[4] =
1060 v->arch.hvm_vcpu.guest_cr[4] | HVM_CR4_HOST_MASK;
1061 __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
1062 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[4]);
1063 break;
1064 default:
1065 BUG();
1068 vmx_vmcs_exit(v);
1071 static void vmx_update_guest_efer(struct vcpu *v)
1073 #ifdef __x86_64__
1074 unsigned long vm_entry_value;
1076 ASSERT((v == current) || !vcpu_runnable(v));
1078 vmx_vmcs_enter(v);
1080 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
1081 if ( v->arch.hvm_vcpu.guest_efer & EFER_LMA )
1082 vm_entry_value |= VM_ENTRY_IA32E_MODE;
1083 else
1084 vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
1085 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1087 vmx_vmcs_exit(v);
1088 #endif
1090 if ( v == current )
1091 write_efer((read_efer() & ~(EFER_NX|EFER_SCE)) |
1092 (v->arch.hvm_vcpu.guest_efer & (EFER_NX|EFER_SCE)));
1095 static void vmx_flush_guest_tlbs(void)
1097 /* No tagged TLB support on VMX yet. The fact that we're in Xen
1098 * at all means any guest will have a clean TLB when it's next run,
1099 * because VMRESUME will flush it for us. */
1102 static void vmx_inject_exception(
1103 unsigned int trapnr, int errcode, unsigned long cr2)
1105 struct vcpu *v = current;
1106 vmx_inject_hw_exception(v, trapnr, errcode);
1107 if ( trapnr == TRAP_page_fault )
1108 v->arch.hvm_vcpu.guest_cr[2] = cr2;
1111 static void vmx_update_vtpr(struct vcpu *v, unsigned long value)
1113 /* VMX doesn't have a V_TPR field */
1116 static int vmx_event_pending(struct vcpu *v)
1118 ASSERT(v == current);
1119 return (__vmread(VM_ENTRY_INTR_INFO) & INTR_INFO_VALID_MASK);
1122 static struct hvm_function_table vmx_function_table = {
1123 .name = "VMX",
1124 .domain_initialise = vmx_domain_initialise,
1125 .domain_destroy = vmx_domain_destroy,
1126 .vcpu_initialise = vmx_vcpu_initialise,
1127 .vcpu_destroy = vmx_vcpu_destroy,
1128 .save_cpu_ctxt = vmx_save_vmcs_ctxt,
1129 .load_cpu_ctxt = vmx_load_vmcs_ctxt,
1130 .interrupt_blocked = vmx_interrupt_blocked,
1131 .guest_x86_mode = vmx_guest_x86_mode,
1132 .get_segment_base = vmx_get_segment_base,
1133 .get_segment_register = vmx_get_segment_register,
1134 .set_segment_register = vmx_set_segment_register,
1135 .update_host_cr3 = vmx_update_host_cr3,
1136 .update_guest_cr = vmx_update_guest_cr,
1137 .update_guest_efer = vmx_update_guest_efer,
1138 .flush_guest_tlbs = vmx_flush_guest_tlbs,
1139 .update_vtpr = vmx_update_vtpr,
1140 .stts = vmx_stts,
1141 .set_tsc_offset = vmx_set_tsc_offset,
1142 .inject_exception = vmx_inject_exception,
1143 .init_ap_context = vmx_init_ap_context,
1144 .init_hypercall_page = vmx_init_hypercall_page,
1145 .event_pending = vmx_event_pending,
1146 .cpu_up = vmx_cpu_up,
1147 .cpu_down = vmx_cpu_down,
1148 };
1150 void start_vmx(void)
1152 static int bootstrapped;
1154 vmx_save_host_msrs();
1156 if ( bootstrapped )
1158 if ( hvm_enabled && !vmx_cpu_up() )
1160 printk("VMX: FATAL: failed to initialise CPU%d!\n",
1161 smp_processor_id());
1162 BUG();
1164 return;
1167 bootstrapped = 1;
1169 /* Xen does not fill x86_capability words except 0. */
1170 boot_cpu_data.x86_capability[4] = cpuid_ecx(1);
1172 if ( !test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability) )
1173 return;
1175 set_in_cr4(X86_CR4_VMXE);
1177 if ( !vmx_cpu_up() )
1179 printk("VMX: failed to initialise.\n");
1180 return;
1183 setup_vmcs_dump();
1185 hvm_enable(&vmx_function_table);
1188 /*
1189 * Not all cases receive valid value in the VM-exit instruction length field.
1190 * Callers must know what they're doing!
1191 */
1192 static int __get_instruction_length(void)
1194 int len;
1195 len = __vmread(VM_EXIT_INSTRUCTION_LEN); /* Safe: callers audited */
1196 BUG_ON((len < 1) || (len > 15));
1197 return len;
1200 static void __update_guest_eip(unsigned long inst_len)
1202 struct cpu_user_regs *regs = guest_cpu_user_regs();
1203 unsigned long x;
1205 regs->eip += inst_len;
1206 regs->eflags &= ~X86_EFLAGS_RF;
1208 x = __vmread(GUEST_INTERRUPTIBILITY_INFO);
1209 if ( x & (VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS) )
1211 x &= ~(VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS);
1212 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, x);
1216 static void vmx_do_no_device_fault(void)
1218 struct vcpu *v = current;
1220 setup_fpu(current);
1221 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
1223 /* Disable TS in guest CR0 unless the guest wants the exception too. */
1224 if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
1226 v->arch.hvm_vcpu.hw_cr[0] &= ~X86_CR0_TS;
1227 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
1231 #define bitmaskof(idx) (1U << ((idx) & 31))
1232 static void vmx_do_cpuid(struct cpu_user_regs *regs)
1234 unsigned int input = regs->eax;
1235 unsigned int eax, ebx, ecx, edx;
1237 if ( input == 0x40000003 )
1239 /*
1240 * NB. Unsupported interface for private use of VMXASSIST only.
1241 * Note that this leaf lives at <max-hypervisor-leaf> + 1.
1242 */
1243 u64 value = ((u64)regs->edx << 32) | (u32)regs->ecx;
1244 p2m_type_t p2mt;
1245 unsigned long mfn;
1246 struct vcpu *v = current;
1247 char *p;
1249 mfn = mfn_x(gfn_to_mfn_current(value >> PAGE_SHIFT, &p2mt));
1251 gdprintk(XENLOG_INFO, "Input address is 0x%"PRIx64".\n", value);
1253 /* 8-byte aligned valid pseudophys address from vmxassist, please. */
1254 if ( (value & 7) || !p2m_is_ram(p2mt) ||
1255 !v->arch.hvm_vmx.vmxassist_enabled )
1257 domain_crash(v->domain);
1258 return;
1260 ASSERT(mfn_valid(mfn));
1262 p = map_domain_page(mfn);
1263 value = *((uint64_t *)(p + (value & (PAGE_SIZE - 1))));
1264 unmap_domain_page(p);
1266 gdprintk(XENLOG_INFO, "Output value is 0x%"PRIx64".\n", value);
1267 regs->ecx = (u32)value;
1268 regs->edx = (u32)(value >> 32);
1269 return;
1272 hvm_cpuid(input, &eax, &ebx, &ecx, &edx);
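/* Post-process selected leaves: hide features that are not virtualised for
 * HVM guests (e.g. VMX itself and various power-management, debug-store and
 * multi-threading related bits). */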
1274 switch ( input )
1276 case 0x00000001:
1277 ecx &= ~VMX_VCPU_CPUID_L1_ECX_RESERVED;
1278 ebx &= NUM_THREADS_RESET_MASK;
1279 ecx &= ~(bitmaskof(X86_FEATURE_VMXE) |
1280 bitmaskof(X86_FEATURE_EST) |
1281 bitmaskof(X86_FEATURE_TM2) |
1282 bitmaskof(X86_FEATURE_CID) |
1283 bitmaskof(X86_FEATURE_PDCM) |
1284 bitmaskof(X86_FEATURE_DSCPL));
1285 edx &= ~(bitmaskof(X86_FEATURE_HT) |
1286 bitmaskof(X86_FEATURE_ACPI) |
1287 bitmaskof(X86_FEATURE_ACC) |
1288 bitmaskof(X86_FEATURE_DS));
1289 break;
1291 case 0x00000004:
1292 cpuid_count(input, regs->ecx, &eax, &ebx, &ecx, &edx);
1293 eax &= NUM_CORES_RESET_MASK;
1294 break;
1296 case 0x00000006:
1297 case 0x00000009:
1298 case 0x0000000A:
1299 eax = ebx = ecx = edx = 0;
1300 break;
1303 regs->eax = eax;
1304 regs->ebx = ebx;
1305 regs->ecx = ecx;
1306 regs->edx = edx;
1308 HVMTRACE_3D(CPUID, current, input,
1309 ((uint64_t)eax << 32) | ebx, ((uint64_t)ecx << 32) | edx);
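/*
 * The CASE_*_REG* helpers below map the general-purpose register number
 * encoded in a VM-exit qualification onto the matching cpu_user_regs field,
 * as used by the CR/DR access handlers; the 64-bit-only registers are
 * compiled in via CASE_EXTEND_GET_REG_P.
 */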
1312 #define CASE_GET_REG_P(REG, reg) \
1313 case REG_ ## REG: reg_p = (unsigned long *)&(regs->reg); break
1315 #ifdef __i386__
1316 #define CASE_EXTEND_GET_REG_P
1317 #else
1318 #define CASE_EXTEND_GET_REG_P \
1319 CASE_GET_REG_P(R8, r8); \
1320 CASE_GET_REG_P(R9, r9); \
1321 CASE_GET_REG_P(R10, r10); \
1322 CASE_GET_REG_P(R11, r11); \
1323 CASE_GET_REG_P(R12, r12); \
1324 CASE_GET_REG_P(R13, r13); \
1325 CASE_GET_REG_P(R14, r14); \
1326 CASE_GET_REG_P(R15, r15)
1327 #endif
1329 static void vmx_dr_access(unsigned long exit_qualification,
1330 struct cpu_user_regs *regs)
1332 struct vcpu *v = current;
1334 HVMTRACE_0D(DR_WRITE, v);
1336 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
1337 __restore_debug_registers(v);
1339 /* Allow guest direct access to DR registers */
1340 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MOV_DR_EXITING;
1341 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
1344 /*
1345 * Invalidate the TLB for va. Invalidate the shadow page corresponding
1346 * to the address va.
1347 */
1348 static void vmx_do_invlpg(unsigned long va)
1350 struct vcpu *v = current;
1352 HVMTRACE_2D(INVLPG, v, /*invlpga=*/ 0, va);
1354 /*
1355 * We do the safest thing first, then try to update the shadow,
1356 * copying from the guest.
1357 */
1358 paging_invlpg(v, va);
1361 /* Get segment for OUTS according to guest instruction. */
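/* If the CPU reports INS/OUTS instruction information on VM exit, the
 * segment comes straight from the VMCS; otherwise the instruction bytes are
 * fetched and scanned for a segment-override prefix (default DS). */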
1362 static enum x86_segment vmx_outs_get_segment(
1363 int long_mode, unsigned long eip, int inst_len)
1365 unsigned char inst[MAX_INST_LEN];
1366 enum x86_segment seg = x86_seg_ds;
1367 int i;
1368 extern int inst_copy_from_guest(unsigned char *, unsigned long, int);
1370 if ( likely(cpu_has_vmx_ins_outs_instr_info) )
1372 unsigned int instr_info = __vmread(VMX_INSTRUCTION_INFO);
1374 /* Get segment register according to bits 17:15. */
1375 switch ( (instr_info >> 15) & 7 )
1377 case 0: seg = x86_seg_es; break;
1378 case 1: seg = x86_seg_cs; break;
1379 case 2: seg = x86_seg_ss; break;
1380 case 3: seg = x86_seg_ds; break;
1381 case 4: seg = x86_seg_fs; break;
1382 case 5: seg = x86_seg_gs; break;
1383 default: BUG();
1386 goto out;
1389 if ( !long_mode )
1390 eip += __vmread(GUEST_CS_BASE);
1392 memset(inst, 0, MAX_INST_LEN);
1393 if ( inst_copy_from_guest(inst, eip, inst_len) != inst_len )
1395 gdprintk(XENLOG_ERR, "Failed to fetch guest instruction\n");
1396 domain_crash(current->domain);
1397 goto out;
1400 for ( i = 0; i < inst_len; i++ )
1402 switch ( inst[i] )
1404 case 0xf3: /* REPZ */
1405 case 0xf2: /* REPNZ */
1406 case 0xf0: /* LOCK */
1407 case 0x66: /* data32 */
1408 case 0x67: /* addr32 */
1409 #ifdef __x86_64__
1410 case 0x40 ... 0x4f: /* REX */
1411 #endif
1412 continue;
1413 case 0x2e: /* CS */
1414 seg = x86_seg_cs;
1415 continue;
1416 case 0x36: /* SS */
1417 seg = x86_seg_ss;
1418 continue;
1419 case 0x26: /* ES */
1420 seg = x86_seg_es;
1421 continue;
1422 case 0x64: /* FS */
1423 seg = x86_seg_fs;
1424 continue;
1425 case 0x65: /* GS */
1426 seg = x86_seg_gs;
1427 continue;
1428 case 0x3e: /* DS */
1429 seg = x86_seg_ds;
1430 continue;
1434 out:
1435 return seg;
1438 static int vmx_str_pio_check_descriptor(int long_mode, unsigned long eip,
1439 int inst_len, enum x86_segment seg,
1440 unsigned long *base, u32 *limit,
1441 u32 *ar_bytes)
1443 enum vmcs_field ar_field, base_field, limit_field;
1445 *base = 0;
1446 *limit = 0;
1447 if ( seg != x86_seg_es )
1448 seg = vmx_outs_get_segment(long_mode, eip, inst_len);
1450 switch ( seg )
1452 case x86_seg_cs:
1453 ar_field = GUEST_CS_AR_BYTES;
1454 base_field = GUEST_CS_BASE;
1455 limit_field = GUEST_CS_LIMIT;
1456 break;
1457 case x86_seg_ds:
1458 ar_field = GUEST_DS_AR_BYTES;
1459 base_field = GUEST_DS_BASE;
1460 limit_field = GUEST_DS_LIMIT;
1461 break;
1462 case x86_seg_es:
1463 ar_field = GUEST_ES_AR_BYTES;
1464 base_field = GUEST_ES_BASE;
1465 limit_field = GUEST_ES_LIMIT;
1466 break;
1467 case x86_seg_fs:
1468 ar_field = GUEST_FS_AR_BYTES;
1469 base_field = GUEST_FS_BASE;
1470 limit_field = GUEST_FS_LIMIT;
1471 break;
1472 case x86_seg_gs:
1473 ar_field = GUEST_GS_AR_BYTES;
1474 base_field = GUEST_GS_BASE;
1475 limit_field = GUEST_GS_LIMIT;
1476 break;
1477 case x86_seg_ss:
1478 ar_field = GUEST_SS_AR_BYTES;
1479 base_field = GUEST_SS_BASE;
1480 limit_field = GUEST_SS_LIMIT;
1481 break;
1482 default:
1483 BUG();
1484 return 0;
1487 if ( !long_mode || seg == x86_seg_fs || seg == x86_seg_gs )
1489 *base = __vmread(base_field);
1490 *limit = __vmread(limit_field);
1492 *ar_bytes = __vmread(ar_field);
1494 return !(*ar_bytes & X86_SEG_AR_SEG_UNUSABLE);
1498 static int vmx_str_pio_check_limit(u32 limit, unsigned int size,
1499 u32 ar_bytes, unsigned long addr,
1500 unsigned long base, int df,
1501 unsigned long *count)
1503 unsigned long ea = addr - base;
1505 /* Offset must be within limits. */
1506 ASSERT(ea == (u32)ea);
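/* Data-segment type bits 3:2 == 01b, i.e. (ar_bytes & 0xc) == 0x4, denotes
 * an expand-down segment, whose valid offsets lie strictly above the limit;
 * for all other (expand-up) segments the offset must not exceed the limit. */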
1507 if ( (u32)(ea + size - 1) < (u32)ea ||
1508 (ar_bytes & 0xc) != 0x4 ? ea + size - 1 > limit
1509 : ea <= limit )
1510 return 0;
1512 /* Check the limit for repeated instructions, since above we checked
1513 only the first iteration. Truncate the count if a limit violation
1514 would occur. Note that this check is not necessary for page-granular
1515 segments, as transfers crossing page boundaries will be broken up
1516 anyway. */
1517 if ( !(ar_bytes & X86_SEG_AR_GRANULARITY) && *count > 1 )
1519 if ( (ar_bytes & 0xc) != 0x4 )
1521 /* expand-up */
1522 if ( !df )
1524 if ( ea + *count * size - 1 < ea ||
1525 ea + *count * size - 1 > limit )
1526 *count = (limit + 1UL - ea) / size;
1528 else
1530 if ( *count - 1 > ea / size )
1531 *count = ea / size + 1;
1534 else
1536 /* expand-down */
1537 if ( !df )
1539 if ( *count - 1 > -(s32)ea / size )
1540 *count = -(s32)ea / size + 1UL;
1542 else
1544 if ( ea < (*count - 1) * size ||
1545 ea - (*count - 1) * size <= limit )
1546 *count = (ea - limit - 1) / size + 1;
1549 ASSERT(*count);
1552 return 1;
1555 #ifdef __x86_64__
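/* In 64-bit mode there are no segment limits; the string-I/O range is only
 * required to stay within the canonical (48-bit) address space, and the
 * repeat count is clipped so the transfer cannot cross out of it. */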
1556 static int vmx_str_pio_lm_check_limit(struct cpu_user_regs *regs,
1557 unsigned int size,
1558 unsigned long addr,
1559 unsigned long *count)
1561 if ( !is_canonical_address(addr) ||
1562 !is_canonical_address(addr + size - 1) )
1563 return 0;
1565 if ( *count > (1UL << 48) / size )
1566 *count = (1UL << 48) / size;
1568 if ( !(regs->eflags & EF_DF) )
1570 if ( addr + *count * size - 1 < addr ||
1571 !is_canonical_address(addr + *count * size - 1) )
1572 *count = (addr & ~((1UL << 48) - 1)) / size;
1574 else
1576 if ( (*count - 1) * size > addr ||
1577 !is_canonical_address(addr + (*count - 1) * size) )
1578 *count = (addr & ~((1UL << 48) - 1)) / size + 1;
1581 ASSERT(*count);
1583 return 1;
1585 #endif
1587 static void vmx_send_str_pio(struct cpu_user_regs *regs,
1588 struct hvm_io_op *pio_opp,
1589 unsigned long inst_len, unsigned int port,
1590 int sign, unsigned int size, int dir,
1591 int df, unsigned long addr,
1592 unsigned long paddr, unsigned long count)
1594 /*
1595 * Handle string pio instructions that cross pages or that
1596 * are unaligned. See the comments in hvm_domain.c/handle_mmio()
1597 */
1598 if ( (addr & PAGE_MASK) != ((addr + size - 1) & PAGE_MASK) ) {
1599 unsigned long value = 0;
1601 pio_opp->flags |= OVERLAP;
1603 if ( dir == IOREQ_WRITE ) /* OUTS */
1605 if ( hvm_paging_enabled(current) )
1607 int rv = hvm_copy_from_guest_virt(&value, addr, size);
1608 if ( rv != 0 )
1610 /* Failed on the page-spanning copy. Inject PF into
1611 * the guest for the address where we failed. */
1612 addr += size - rv;
1613 gdprintk(XENLOG_DEBUG, "Pagefault reading non-io side "
1614 "of a page-spanning PIO: va=%#lx\n", addr);
1615 vmx_inject_exception(TRAP_page_fault, 0, addr);
1616 return;
1619 else
1620 (void) hvm_copy_from_guest_phys(&value, addr, size);
1621 } else /* dir != IOREQ_WRITE */
1622 /* Remember where to write the result, as a *VA*.
1623 * Must be a VA so we can handle the page overlap
1624 * correctly in hvm_pio_assist() */
1625 pio_opp->addr = addr;
1627 if ( count == 1 )
1628 regs->eip += inst_len;
1630 send_pio_req(port, 1, size, value, dir, df, 0);
1631 } else {
1632 unsigned long last_addr = sign > 0 ? addr + count * size - 1
1633 : addr - (count - 1) * size;
1635 if ( (addr & PAGE_MASK) != (last_addr & PAGE_MASK) )
1637 if ( sign > 0 )
1638 count = (PAGE_SIZE - (addr & ~PAGE_MASK)) / size;
1639 else
1640 count = (addr & ~PAGE_MASK) / size + 1;
1641 } else
1642 regs->eip += inst_len;
1644 send_pio_req(port, count, size, paddr, dir, df, 1);
1648 static void vmx_do_str_pio(unsigned long exit_qualification,
1649 unsigned long inst_len,
1650 struct cpu_user_regs *regs,
1651 struct hvm_io_op *pio_opp)
1653 unsigned int port, size;
1654 int dir, df, vm86;
1655 unsigned long addr, count = 1, base;
1656 paddr_t paddr;
1657 unsigned long gfn;
1658 u32 ar_bytes, limit, pfec;
1659 int sign;
1660 int long_mode = 0;
1662 vm86 = regs->eflags & X86_EFLAGS_VM ? 1 : 0;
1663 df = regs->eflags & X86_EFLAGS_DF ? 1 : 0;
1665 if ( test_bit(6, &exit_qualification) )
1666 port = (exit_qualification >> 16) & 0xFFFF;
1667 else
1668 port = regs->edx & 0xffff;
1670 size = (exit_qualification & 7) + 1;
1671 dir = test_bit(3, &exit_qualification); /* direction */
1673 if ( dir == IOREQ_READ )
1674 HVMTRACE_2D(IO_READ, current, port, size);
1675 else
1676 HVMTRACE_2D(IO_WRITE, current, port, size);
1678 sign = regs->eflags & X86_EFLAGS_DF ? -1 : 1;
1679 ar_bytes = __vmread(GUEST_CS_AR_BYTES);
1680 if ( hvm_long_mode_enabled(current) &&
1681 (ar_bytes & X86_SEG_AR_CS_LM_ACTIVE) )
1682 long_mode = 1;
1683 addr = __vmread(GUEST_LINEAR_ADDRESS);
1685 if ( test_bit(5, &exit_qualification) ) { /* "rep" prefix */
1686 pio_opp->flags |= REPZ;
1687 count = regs->ecx;
1688 if ( !long_mode &&
1689 (vm86 || !(ar_bytes & X86_SEG_AR_DEF_OP_SIZE)) )
1690 count &= 0xFFFF;
1693 /*
1694 * In protected mode, guest linear address is invalid if the
1695 * selector is null.
1696 */
1697 if ( !vmx_str_pio_check_descriptor(long_mode, regs->eip, inst_len,
1698 dir==IOREQ_WRITE ? x86_seg_ds :
1699 x86_seg_es, &base, &limit,
1700 &ar_bytes) ) {
1701 if ( !long_mode ) {
1702 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1703 return;
1705 addr = dir == IOREQ_WRITE ? base + regs->esi : regs->edi;
1708 if ( !long_mode )
1710 /* Segment must be readable for outs and writeable for ins. */
1711 if ( ((dir == IOREQ_WRITE)
1712 ? ((ar_bytes & 0xa) == 0x8)
1713 : ((ar_bytes & 0xa) != 0x2)) ||
1714 !vmx_str_pio_check_limit(limit, size, ar_bytes,
1715 addr, base, df, &count) )
1717 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1718 return;
1721 #ifdef __x86_64__
1722 else if ( !vmx_str_pio_lm_check_limit(regs, size, addr, &count) )
1724 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1725 return;
1727 #endif
1729 /* Translate the address to a physical address */
1730 pfec = PFEC_page_present;
1731 if ( dir == IOREQ_READ ) /* Read from PIO --> write to RAM */
1732 pfec |= PFEC_write_access;
1733 if ( ring_3(regs) )
1734 pfec |= PFEC_user_mode;
1735 gfn = paging_gva_to_gfn(current, addr, &pfec);
1736 if ( gfn == INVALID_GFN )
1738 /* The guest does not have the RAM address mapped.
1739 * Need to send in a page fault */
1740 vmx_inject_exception(TRAP_page_fault, pfec, addr);
1741 return;
1743 paddr = (paddr_t)gfn << PAGE_SHIFT | (addr & ~PAGE_MASK);
1745 vmx_send_str_pio(regs, pio_opp, inst_len, port, sign,
1746 size, dir, df, addr, paddr, count);
1749 static void vmx_io_instruction(unsigned long exit_qualification,
1750 unsigned long inst_len)
1752 struct cpu_user_regs *regs;
1753 struct hvm_io_op *pio_opp;
1755 pio_opp = &current->arch.hvm_vcpu.io_op;
1756 pio_opp->instr = INSTR_PIO;
1757 pio_opp->flags = 0;
1759 regs = &pio_opp->io_context;
1761 /* Copy current guest state into io instruction state structure. */
1762 memcpy(regs, guest_cpu_user_regs(), HVM_CONTEXT_STACK_BYTES);
1764 HVM_DBG_LOG(DBG_LEVEL_IO, "vm86 %d, eip=%x:%lx, "
1765 "exit_qualification = %lx",
1766 regs->eflags & X86_EFLAGS_VM ? 1 : 0,
1767 regs->cs, (unsigned long)regs->eip, exit_qualification);
1769 if ( test_bit(4, &exit_qualification) ) /* string instruction */
1770 vmx_do_str_pio(exit_qualification, inst_len, regs, pio_opp);
1771 else
1773 unsigned int port, size;
1774 int dir, df;
1776 df = regs->eflags & X86_EFLAGS_DF ? 1 : 0;
1778 if ( test_bit(6, &exit_qualification) )
1779 port = (exit_qualification >> 16) & 0xFFFF;
1780 else
1781 port = regs->edx & 0xffff;
1783 size = (exit_qualification & 7) + 1;
1784 dir = test_bit(3, &exit_qualification); /* direction */
1786 if ( dir == IOREQ_READ )
1787 HVMTRACE_2D(IO_READ, current, port, size);
1788 else
1789 HVMTRACE_3D(IO_WRITE, current, port, size, regs->eax);
1791 if ( port == 0xe9 && dir == IOREQ_WRITE && size == 1 )
1792 hvm_print_line(current, regs->eax); /* guest debug output */
1794 regs->eip += inst_len;
1795 send_pio_req(port, 1, size, regs->eax, dir, df, 0);
1799 static void vmx_world_save(struct vcpu *v, struct vmx_assist_context *c)
1801 struct cpu_user_regs *regs = guest_cpu_user_regs();
1803 c->eip = regs->eip;
1804 c->eip += __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
1805 c->esp = regs->esp;
1806 c->eflags = regs->eflags & ~X86_EFLAGS_RF;
1808 c->cr0 = v->arch.hvm_vcpu.guest_cr[0];
1809 c->cr3 = v->arch.hvm_vcpu.guest_cr[3];
1810 c->cr4 = v->arch.hvm_vcpu.guest_cr[4];
1812 c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
1813 c->idtr_base = __vmread(GUEST_IDTR_BASE);
1815 c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
1816 c->gdtr_base = __vmread(GUEST_GDTR_BASE);
1818 c->cs_sel = __vmread(GUEST_CS_SELECTOR);
1819 c->cs_limit = __vmread(GUEST_CS_LIMIT);
1820 c->cs_base = __vmread(GUEST_CS_BASE);
1821 c->cs_arbytes.bytes = __vmread(GUEST_CS_AR_BYTES);
1823 c->ds_sel = __vmread(GUEST_DS_SELECTOR);
1824 c->ds_limit = __vmread(GUEST_DS_LIMIT);
1825 c->ds_base = __vmread(GUEST_DS_BASE);
1826 c->ds_arbytes.bytes = __vmread(GUEST_DS_AR_BYTES);
1828 c->es_sel = __vmread(GUEST_ES_SELECTOR);
1829 c->es_limit = __vmread(GUEST_ES_LIMIT);
1830 c->es_base = __vmread(GUEST_ES_BASE);
1831 c->es_arbytes.bytes = __vmread(GUEST_ES_AR_BYTES);
1833 c->ss_sel = __vmread(GUEST_SS_SELECTOR);
1834 c->ss_limit = __vmread(GUEST_SS_LIMIT);
1835 c->ss_base = __vmread(GUEST_SS_BASE);
1836 c->ss_arbytes.bytes = __vmread(GUEST_SS_AR_BYTES);
1838 c->fs_sel = __vmread(GUEST_FS_SELECTOR);
1839 c->fs_limit = __vmread(GUEST_FS_LIMIT);
1840 c->fs_base = __vmread(GUEST_FS_BASE);
1841 c->fs_arbytes.bytes = __vmread(GUEST_FS_AR_BYTES);
1843 c->gs_sel = __vmread(GUEST_GS_SELECTOR);
1844 c->gs_limit = __vmread(GUEST_GS_LIMIT);
1845 c->gs_base = __vmread(GUEST_GS_BASE);
1846 c->gs_arbytes.bytes = __vmread(GUEST_GS_AR_BYTES);
1848 c->tr_sel = __vmread(GUEST_TR_SELECTOR);
1849 c->tr_limit = __vmread(GUEST_TR_LIMIT);
1850 c->tr_base = __vmread(GUEST_TR_BASE);
1851 c->tr_arbytes.bytes = __vmread(GUEST_TR_AR_BYTES);
1853 c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR);
1854 c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT);
1855 c->ldtr_base = __vmread(GUEST_LDTR_BASE);
1856 c->ldtr_arbytes.bytes = __vmread(GUEST_LDTR_AR_BYTES);
1859 static int vmx_world_restore(struct vcpu *v, struct vmx_assist_context *c)
1861 struct cpu_user_regs *regs = guest_cpu_user_regs();
1862 int rc;
1864 rc = vmx_restore_cr0_cr3(v, c->cr0, c->cr3);
1865 if ( rc )
1866 return rc;
1868 regs->eip = c->eip;
1869 regs->esp = c->esp;
1870 regs->eflags = c->eflags | 2;
1872 v->arch.hvm_vcpu.guest_cr[4] = c->cr4;
1873 vmx_update_guest_cr(v, 0);
1874 vmx_update_guest_cr(v, 4);
1876 __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
1877 __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
1879 __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
1880 __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
1882 __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
1883 __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
1884 __vmwrite(GUEST_CS_BASE, c->cs_base);
1885 __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes.bytes);
1887 __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
1888 __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
1889 __vmwrite(GUEST_DS_BASE, c->ds_base);
1890 __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes.bytes);
1892 __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
1893 __vmwrite(GUEST_ES_LIMIT, c->es_limit);
1894 __vmwrite(GUEST_ES_BASE, c->es_base);
1895 __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes.bytes);
1897 __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
1898 __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
1899 __vmwrite(GUEST_SS_BASE, c->ss_base);
1900 __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes.bytes);
1902 __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
1903 __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
1904 __vmwrite(GUEST_FS_BASE, c->fs_base);
1905 __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes.bytes);
1907 __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
1908 __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
1909 __vmwrite(GUEST_GS_BASE, c->gs_base);
1910 __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes.bytes);
1912 __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
1913 __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
1914 __vmwrite(GUEST_TR_BASE, c->tr_base);
1915 __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes.bytes);
1917 __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
1918 __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
1919 __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
1920 __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes.bytes);
1922 paging_update_paging_modes(v);
1923 return 0;
1926 enum { VMX_ASSIST_INVOKE = 0, VMX_ASSIST_RESTORE };
1928 static int vmx_assist(struct vcpu *v, int mode)
1930 struct vmx_assist_context c;
1931 struct hvm_hw_vpic *vpic = v->domain->arch.hvm_domain.vpic;
1932 u32 magic, cp;
1934 if ( hvm_copy_from_guest_phys(&magic, VMXASSIST_MAGIC_OFFSET,
1935 sizeof(magic)) )
1937 gdprintk(XENLOG_ERR, "No vmxassist: can't execute real mode code\n");
1938 domain_crash(v->domain);
1939 return 0;
1942 if ( magic != VMXASSIST_MAGIC )
1944 gdprintk(XENLOG_ERR, "vmxassist magic number does not match\n");
1945 domain_crash(v->domain);
1946 return 0;
1949 switch ( mode ) {
1950 /*
1951 * Transfer control to vmxassist.
1952 * Store the current context in VMXASSIST_OLD_CONTEXT and load
1953 * the new VMXASSIST_NEW_CONTEXT context. This context was created
1954 * by vmxassist and will transfer control to it.
1955 */
1956 case VMX_ASSIST_INVOKE:
1957 /* save the old context */
1958 if ( hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)) )
1959 goto error;
1960 if ( cp != 0 ) {
1961 vmx_world_save(v, &c);
1962 if ( hvm_copy_to_guest_phys(cp, &c, sizeof(c)) )
1963 goto error;
1966 /* restore the new context, this should activate vmxassist */
1967 if ( hvm_copy_from_guest_phys(&cp, VMXASSIST_NEW_CONTEXT, sizeof(cp)) )
1968 goto error;
1969 if ( cp != 0 ) {
1970 if ( hvm_copy_from_guest_phys(&c, cp, sizeof(c)) )
1971 goto error;
1972 if ( vmx_world_restore(v, &c) != 0 )
1973 goto error;
1974 v->arch.hvm_vmx.pm_irqbase[0] = vpic[0].irq_base;
1975 v->arch.hvm_vmx.pm_irqbase[1] = vpic[1].irq_base;
1976 vpic[0].irq_base = NR_EXCEPTION_HANDLER;
1977 vpic[1].irq_base = NR_EXCEPTION_HANDLER + 8;
1978 v->arch.hvm_vmx.vmxassist_enabled = 1;
1979 return 1;
1981 break;
1983 /*
1984 * Restore the VMXASSIST_OLD_CONTEXT that was saved by
1985 * VMX_ASSIST_INVOKE above.
1986 */
1987 case VMX_ASSIST_RESTORE:
1988 /* retrieve the old context saved by VMX_ASSIST_INVOKE */
1989 if ( hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)) )
1990 goto error;
1991 if ( cp != 0 ) {
1992 if ( hvm_copy_from_guest_phys(&c, cp, sizeof(c)) )
1993 goto error;
1994 if ( vmx_world_restore(v, &c) != 0 )
1995 goto error;
1996 if ( v->arch.hvm_vmx.irqbase_mode ) {
1997 vpic[0].irq_base = c.rm_irqbase[0] & 0xf8;
1998 vpic[1].irq_base = c.rm_irqbase[1] & 0xf8;
1999 } else {
2000 vpic[0].irq_base = v->arch.hvm_vmx.pm_irqbase[0];
2001 vpic[1].irq_base = v->arch.hvm_vmx.pm_irqbase[1];
2002 }
2003 v->arch.hvm_vmx.vmxassist_enabled = 0;
2004 return 1;
2005 }
2006 break;
2007 }
2009 error:
2010 gdprintk(XENLOG_ERR, "Failed to transfer to vmxassist\n");
2011 domain_crash(v->domain);
2012 return 0;
2013 }
2015 static int vmx_set_cr0(unsigned long value)
2016 {
2017 struct vcpu *v = current;
2019 if ( hvm_set_cr0(value) == 0 )
2020 return 0;
2022 /*
2023 * VMX does not implement real-mode virtualization. We emulate
2024 * real-mode by performing a world switch to VMXAssist whenever
2025 * a partition disables the CR0.PE bit.
2026 */
2027 if ( !(value & X86_CR0_PE) )
2028 {
2029 if ( vmx_assist(v, VMX_ASSIST_INVOKE) )
2030 return 0; /* do not update eip! */
2031 }
2032 else if ( v->arch.hvm_vmx.vmxassist_enabled )
2033 {
2034 if ( vmx_assist(v, VMX_ASSIST_RESTORE) )
2035 return 0; /* do not update eip! */
2036 }
2038 return 1;
2039 }
2041 #define CASE_SET_REG(REG, reg) \
2042 case REG_ ## REG: regs->reg = value; break
2043 #define CASE_GET_REG(REG, reg) \
2044 case REG_ ## REG: value = regs->reg; break
2046 #define CASE_EXTEND_SET_REG \
2047 CASE_EXTEND_REG(S)
2048 #define CASE_EXTEND_GET_REG \
2049 CASE_EXTEND_REG(G)
2051 #ifdef __i386__
2052 #define CASE_EXTEND_REG(T)
2053 #else
2054 #define CASE_EXTEND_REG(T) \
2055 CASE_ ## T ## ET_REG(R8, r8); \
2056 CASE_ ## T ## ET_REG(R9, r9); \
2057 CASE_ ## T ## ET_REG(R10, r10); \
2058 CASE_ ## T ## ET_REG(R11, r11); \
2059 CASE_ ## T ## ET_REG(R12, r12); \
2060 CASE_ ## T ## ET_REG(R13, r13); \
2061 CASE_ ## T ## ET_REG(R14, r14); \
2062 CASE_ ## T ## ET_REG(R15, r15)
2063 #endif
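/*
 * Worked example of the expansion: inside mov_to_cr()'s "switch ( gp )",
 * CASE_GET_REG(EAX, eax); becomes "case REG_EAX: value = regs->eax; break;",
 * and CASE_EXTEND_GET_REG adds the same pattern for r8..r15 on x86_64
 * (it expands to nothing on i386).
 */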
2065 static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
2066 {
2067 unsigned long value;
2068 struct vcpu *v = current;
2069 struct vlapic *vlapic = vcpu_vlapic(v);
2071 switch ( gp )
2072 {
2073 CASE_GET_REG(EAX, eax);
2074 CASE_GET_REG(ECX, ecx);
2075 CASE_GET_REG(EDX, edx);
2076 CASE_GET_REG(EBX, ebx);
2077 CASE_GET_REG(EBP, ebp);
2078 CASE_GET_REG(ESI, esi);
2079 CASE_GET_REG(EDI, edi);
2080 CASE_GET_REG(ESP, esp);
2081 CASE_EXTEND_GET_REG;
2082 default:
2083 gdprintk(XENLOG_ERR, "invalid gp: %d\n", gp);
2084 goto exit_and_crash;
2085 }
2087 HVMTRACE_2D(CR_WRITE, v, cr, value);
2089 HVM_DBG_LOG(DBG_LEVEL_1, "CR%d, value = %lx", cr, value);
2091 switch ( cr )
2092 {
2093 case 0:
2094 return vmx_set_cr0(value);
2096 case 3:
2097 return hvm_set_cr3(value);
2099 case 4:
2100 return hvm_set_cr4(value);
2102 case 8:
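/* CR8 is the task-priority register: its low four bits map to bits 7:4 of APIC_TASKPRI. */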
2103 vlapic_set_reg(vlapic, APIC_TASKPRI, ((value & 0x0F) << 4));
2104 break;
2106 default:
2107 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
2108 goto exit_and_crash;
2109 }
2111 return 1;
2113 exit_and_crash:
2114 domain_crash(v->domain);
2115 return 0;
2116 }
2118 /*
2119 * Read from control registers. CR0 and CR4 are read from the shadow.
2120 */
2121 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
2122 {
2123 unsigned long value = 0;
2124 struct vcpu *v = current;
2125 struct vlapic *vlapic = vcpu_vlapic(v);
2127 switch ( cr )
2128 {
2129 case 3:
2130 value = (unsigned long)v->arch.hvm_vcpu.guest_cr[3];
2131 break;
2132 case 8:
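/* Reverse of the CR8 write path: expose APIC_TASKPRI bits 7:4 as the 4-bit CR8 value. */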
2133 value = (unsigned long)vlapic_get_reg(vlapic, APIC_TASKPRI);
2134 value = (value & 0xF0) >> 4;
2135 break;
2136 default:
2137 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
2138 domain_crash(v->domain);
2139 break;
2140 }
2142 switch ( gp ) {
2143 CASE_SET_REG(EAX, eax);
2144 CASE_SET_REG(ECX, ecx);
2145 CASE_SET_REG(EDX, edx);
2146 CASE_SET_REG(EBX, ebx);
2147 CASE_SET_REG(EBP, ebp);
2148 CASE_SET_REG(ESI, esi);
2149 CASE_SET_REG(EDI, edi);
2150 CASE_SET_REG(ESP, esp);
2151 CASE_EXTEND_SET_REG;
2152 default:
2153 printk("invalid gp: %d\n", gp);
2154 domain_crash(v->domain);
2155 break;
2156 }
2158 HVMTRACE_2D(CR_READ, v, cr, value);
2160 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR%d, value = %lx", cr, value);
2161 }
2163 static int vmx_cr_access(unsigned long exit_qualification,
2164 struct cpu_user_regs *regs)
2165 {
2166 unsigned int gp, cr;
2167 unsigned long value;
2168 struct vcpu *v = current;
2170 switch ( exit_qualification & CONTROL_REG_ACCESS_TYPE )
2171 {
2172 case TYPE_MOV_TO_CR:
2173 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
2174 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
2175 return mov_to_cr(gp, cr, regs);
2176 case TYPE_MOV_FROM_CR:
2177 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
2178 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
2179 mov_from_cr(cr, gp, regs);
2180 break;
2181 case TYPE_CLTS:
2182 /* We initialise the FPU now, to avoid needing another vmexit. */
2183 setup_fpu(v);
2184 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
2186 v->arch.hvm_vcpu.hw_cr[0] &= ~X86_CR0_TS; /* clear TS */
2187 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
2189 v->arch.hvm_vcpu.guest_cr[0] &= ~X86_CR0_TS; /* clear TS */
2190 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[0]);
2191 HVMTRACE_0D(CLTS, current);
2192 break;
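/*
 * LMSW only loads the low 16 bits of CR0, and this handler merges just
 * PE/MP/EM/TS (bits 3:0); the source operand sits in bits 31:16 of the
 * exit qualification.
 */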
2193 case TYPE_LMSW:
2194 value = v->arch.hvm_vcpu.guest_cr[0];
2195 value = (value & ~0xF) |
2196 (((exit_qualification & LMSW_SOURCE_DATA) >> 16) & 0xF);
2197 HVMTRACE_1D(LMSW, current, value);
2198 return vmx_set_cr0(value);
2199 default:
2200 BUG();
2201 }
2203 return 1;
2204 }
2206 static const struct lbr_info {
2207 u32 base, count;
2208 } p4_lbr[] = {
2209 { MSR_P4_LER_FROM_LIP, 1 },
2210 { MSR_P4_LER_TO_LIP, 1 },
2211 { MSR_P4_LASTBRANCH_TOS, 1 },
2212 { MSR_P4_LASTBRANCH_0_FROM_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO },
2213 { MSR_P4_LASTBRANCH_0_TO_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO },
2214 { 0, 0 }
2215 }, c2_lbr[] = {
2216 { MSR_IA32_LASTINTFROMIP, 1 },
2217 { MSR_IA32_LASTINTTOIP, 1 },
2218 { MSR_C2_LASTBRANCH_TOS, 1 },
2219 { MSR_C2_LASTBRANCH_0_FROM_IP, NUM_MSR_C2_LASTBRANCH_FROM_TO },
2220 { MSR_C2_LASTBRANCH_0_TO_IP, NUM_MSR_C2_LASTBRANCH_FROM_TO },
2221 { 0, 0 }
2222 #ifdef __i386__
2223 }, pm_lbr[] = {
2224 { MSR_IA32_LASTINTFROMIP, 1 },
2225 { MSR_IA32_LASTINTTOIP, 1 },
2226 { MSR_PM_LASTBRANCH_TOS, 1 },
2227 { MSR_PM_LASTBRANCH_0, NUM_MSR_PM_LASTBRANCH },
2228 { 0, 0 }
2229 #endif
2230 };
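/*
 * Each table above lists { first MSR, number of consecutive MSRs } and is
 * terminated by { 0, 0 }; last_branch_msr_get() picks the table matching
 * the host CPU family/model.
 */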
2232 static const struct lbr_info *last_branch_msr_get(void)
2233 {
2234 switch ( boot_cpu_data.x86 )
2235 {
2236 case 6:
2237 switch ( boot_cpu_data.x86_model )
2238 {
2239 #ifdef __i386__
2240 /* PentiumM */
2241 case 9: case 13:
2242 /* Core Solo/Duo */
2243 case 14:
2244 return pm_lbr;
2245 break;
2246 #endif
2247 /* Core2 Duo */
2248 case 15:
2249 return c2_lbr;
2250 break;
2251 }
2252 break;
2254 case 15:
2255 switch ( boot_cpu_data.x86_model )
2256 {
2257 /* Pentium4/Xeon with em64t */
2258 case 3: case 4: case 6:
2259 return p4_lbr;
2260 break;
2261 }
2262 break;
2263 }
2265 return NULL;
2266 }
2268 static int is_last_branch_msr(u32 ecx)
2269 {
2270 const struct lbr_info *lbr = last_branch_msr_get();
2272 if ( lbr == NULL )
2273 return 0;
2275 for ( ; lbr->count; lbr++ )
2276 if ( (ecx >= lbr->base) && (ecx < (lbr->base + lbr->count)) )
2277 return 1;
2279 return 0;
2280 }
2282 static int vmx_do_msr_read(struct cpu_user_regs *regs)
2283 {
2284 u64 msr_content = 0;
2285 u32 ecx = regs->ecx, eax, edx;
2286 struct vcpu *v = current;
2287 int index;
2288 u64 *var_range_base = (u64*)v->arch.hvm_vcpu.mtrr.var_ranges;
2289 u64 *fixed_range_base = (u64*)v->arch.hvm_vcpu.mtrr.fixed_ranges;
2291 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x", ecx);
2293 switch ( ecx )
2294 {
2295 case MSR_IA32_TSC:
2296 msr_content = hvm_get_guest_time(v);
2297 break;
2298 case MSR_IA32_SYSENTER_CS:
2299 msr_content = (u32)__vmread(GUEST_SYSENTER_CS);
2300 break;
2301 case MSR_IA32_SYSENTER_ESP:
2302 msr_content = __vmread(GUEST_SYSENTER_ESP);
2303 break;
2304 case MSR_IA32_SYSENTER_EIP:
2305 msr_content = __vmread(GUEST_SYSENTER_EIP);
2306 break;
2307 case MSR_IA32_APICBASE:
2308 msr_content = vcpu_vlapic(v)->hw.apic_base_msr;
2309 break;
2310 case MSR_IA32_CR_PAT:
2311 msr_content = v->arch.hvm_vcpu.pat_cr;
2312 break;
2313 case MSR_MTRRcap:
2314 msr_content = v->arch.hvm_vcpu.mtrr.mtrr_cap;
2315 break;
2316 case MSR_MTRRdefType:
2317 msr_content = v->arch.hvm_vcpu.mtrr.def_type
2318 | (v->arch.hvm_vcpu.mtrr.enabled << 10);
2319 break;
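/*
 * fixed_range_base[] holds the eleven fixed-range MTRR MSRs in order:
 * [0] MTRRfix64K_00000, [1..2] the two 16K ranges, [3..10] the eight 4K
 * ranges -- hence the +1 and +3 index offsets below.
 */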
2320 case MSR_MTRRfix64K_00000:
2321 msr_content = fixed_range_base[0];
2322 break;
2323 case MSR_MTRRfix16K_80000:
2324 case MSR_MTRRfix16K_A0000:
2325 index = regs->ecx - MSR_MTRRfix16K_80000;
2326 msr_content = fixed_range_base[index + 1];
2327 break;
2328 case MSR_MTRRfix4K_C0000...MSR_MTRRfix4K_F8000:
2329 index = regs->ecx - MSR_MTRRfix4K_C0000;
2330 msr_content = fixed_range_base[index + 3];
2331 break;
2332 case MSR_IA32_MTRR_PHYSBASE0...MSR_IA32_MTRR_PHYSMASK7:
2333 index = regs->ecx - MSR_IA32_MTRR_PHYSBASE0;
2334 msr_content = var_range_base[index];
2335 break;
2336 case MSR_IA32_DEBUGCTLMSR:
2337 if ( vmx_read_guest_msr(v, ecx, &msr_content) != 0 )
2338 msr_content = 0;
2339 break;
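/* VMX capability MSRs are not exposed to the guest (no nested VMX): fault. */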
2340 case MSR_IA32_VMX_BASIC...MSR_IA32_VMX_PROCBASED_CTLS2:
2341 goto gp_fault;
2342 case MSR_IA32_MCG_CAP:
2343 case MSR_IA32_MCG_STATUS:
2344 case MSR_IA32_MC0_STATUS:
2345 case MSR_IA32_MC1_STATUS:
2346 case MSR_IA32_MC2_STATUS:
2347 case MSR_IA32_MC3_STATUS:
2348 case MSR_IA32_MC4_STATUS:
2349 case MSR_IA32_MC5_STATUS:
2350 /* No point in letting the guest see real MCEs */
2351 msr_content = 0;
2352 break;
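/*
 * Fallback order for unrecognised MSRs: the long-mode handler, then the
 * per-vCPU guest MSR area, then LBR MSRs (read as zero), then Xen's
 * synthetic hypervisor MSRs or a safe host read; anything else gets #GP.
 */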
2353 default:
2354 switch ( long_mode_do_msr_read(regs) )
2355 {
2356 case HNDL_unhandled:
2357 break;
2358 case HNDL_exception_raised:
2359 return 0;
2360 case HNDL_done:
2361 goto done;
2362 }
2364 if ( vmx_read_guest_msr(v, ecx, &msr_content) == 0 )
2365 break;
2367 if ( is_last_branch_msr(ecx) )
2368 {
2369 msr_content = 0;
2370 break;
2371 }
2373 if ( rdmsr_hypervisor_regs(ecx, &eax, &edx) ||
2374 rdmsr_safe(ecx, eax, edx) == 0 )
2375 {
2376 regs->eax = eax;
2377 regs->edx = edx;
2378 goto done;
2379 }
2381 goto gp_fault;
2382 }
2384 regs->eax = msr_content & 0xFFFFFFFF;
2385 regs->edx = msr_content >> 32;
2387 done:
2388 hvmtrace_msr_read(v, ecx, msr_content);
2389 HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx",
2390 ecx, (unsigned long)regs->eax,
2391 (unsigned long)regs->edx);
2392 return 1;
2394 gp_fault:
2395 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2396 return 0;
2397 }
2399 static int vmx_alloc_vlapic_mapping(struct domain *d)
2400 {
2401 void *apic_va;
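/*
 * The APIC access page is a dummy xenheap page mapped at the guest's
 * APIC_DEFAULT_PHYS_BASE so that accesses to it are reported as
 * APIC_ACCESS vmexits rather than ordinary MMIO faults.
 */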
2403 if ( !cpu_has_vmx_virtualize_apic_accesses )
2404 return 0;
2406 apic_va = alloc_xenheap_page();
2407 if ( apic_va == NULL )
2408 return -ENOMEM;
2409 share_xen_page_with_guest(virt_to_page(apic_va), d, XENSHARE_writable);
2410 guest_physmap_add_page(
2411 d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), virt_to_mfn(apic_va));
2412 d->arch.hvm_domain.vmx_apic_access_mfn = virt_to_mfn(apic_va);
2414 return 0;
2415 }
2417 static void vmx_free_vlapic_mapping(struct domain *d)
2418 {
2419 unsigned long mfn = d->arch.hvm_domain.vmx_apic_access_mfn;
2420 if ( mfn != 0 )
2421 free_xenheap_page(mfn_to_virt(mfn));
2422 }
2424 static void vmx_install_vlapic_mapping(struct vcpu *v)
2425 {
2426 unsigned long virt_page_ma, apic_page_ma;
2428 if ( !cpu_has_vmx_virtualize_apic_accesses )
2429 return;
2431 virt_page_ma = page_to_maddr(vcpu_vlapic(v)->regs_page);
2432 apic_page_ma = v->domain->arch.hvm_domain.vmx_apic_access_mfn;
2433 apic_page_ma <<= PAGE_SHIFT;
2435 vmx_vmcs_enter(v);
2436 __vmwrite(VIRTUAL_APIC_PAGE_ADDR, virt_page_ma);
2437 __vmwrite(APIC_ACCESS_ADDR, apic_page_ma);
2438 vmx_vmcs_exit(v);
2439 }
2441 void vmx_vlapic_msr_changed(struct vcpu *v)
2442 {
2443 struct vlapic *vlapic = vcpu_vlapic(v);
2444 uint32_t ctl;
2446 if ( !cpu_has_vmx_virtualize_apic_accesses )
2447 return;
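/*
 * Virtualised APIC accesses only make sense while the local APIC is
 * hardware-enabled and still at the default physical base; toggle the
 * secondary execution control to match.
 */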
2449 vmx_vmcs_enter(v);
2450 ctl = __vmread(SECONDARY_VM_EXEC_CONTROL);
2451 ctl &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2452 if ( !vlapic_hw_disabled(vlapic) &&
2453 (vlapic_base_address(vlapic) == APIC_DEFAULT_PHYS_BASE) )
2454 ctl |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2455 __vmwrite(SECONDARY_VM_EXEC_CONTROL, ctl);
2456 vmx_vmcs_exit(v);
2457 }
2459 extern bool_t mtrr_var_range_msr_set(struct mtrr_state *v,
2460 u32 msr, u64 msr_content);
2461 extern bool_t mtrr_fix_range_msr_set(struct mtrr_state *v,
2462 int row, u64 msr_content);
2463 extern bool_t mtrr_def_type_msr_set(struct mtrr_state *v, u64 msr_content);
2464 extern bool_t pat_msr_set(u64 *pat, u64 msr);
2466 static int vmx_do_msr_write(struct cpu_user_regs *regs)
2467 {
2468 u32 ecx = regs->ecx;
2469 u64 msr_content;
2470 struct vcpu *v = current;
2471 int index;
2473 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x, eax=%x, edx=%x",
2474 ecx, (u32)regs->eax, (u32)regs->edx);
2476 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
2478 hvmtrace_msr_write(v, ecx, msr_content);
2480 switch ( ecx )
2481 {
2482 case MSR_IA32_TSC:
2483 hvm_set_guest_time(v, msr_content);
2484 pt_reset(v);
2485 break;
2486 case MSR_IA32_SYSENTER_CS:
2487 __vmwrite(GUEST_SYSENTER_CS, msr_content);
2488 break;
2489 case MSR_IA32_SYSENTER_ESP:
2490 __vmwrite(GUEST_SYSENTER_ESP, msr_content);
2491 break;
2492 case MSR_IA32_SYSENTER_EIP:
2493 __vmwrite(GUEST_SYSENTER_EIP, msr_content);
2494 break;
2495 case MSR_IA32_APICBASE:
2496 vlapic_msr_set(vcpu_vlapic(v), msr_content);
2497 break;
2498 case MSR_IA32_CR_PAT:
2499 if ( !pat_msr_set(&v->arch.hvm_vcpu.pat_cr, msr_content) )
2500 goto gp_fault;
2501 break;
2502 case MSR_MTRRdefType:
2503 if ( !mtrr_def_type_msr_set(&v->arch.hvm_vcpu.mtrr, msr_content) )
2504 goto gp_fault;
2505 break;
2506 case MSR_MTRRfix64K_00000:
2507 if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr, 0, msr_content) )
2508 goto gp_fault;
2509 break;
2510 case MSR_MTRRfix16K_80000:
2511 case MSR_MTRRfix16K_A0000:
2512 index = regs->ecx - MSR_MTRRfix16K_80000 + 1;
2513 if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr,
2514 index, msr_content) )
2515 goto gp_fault;
2516 break;
2517 case MSR_MTRRfix4K_C0000...MSR_MTRRfix4K_F8000:
2518 index = regs->ecx - MSR_MTRRfix4K_C0000 + 3;
2519 if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr,
2520 index, msr_content) )
2521 goto gp_fault;
2522 break;
2523 case MSR_IA32_MTRR_PHYSBASE0...MSR_IA32_MTRR_PHYSMASK7:
2524 if ( !mtrr_var_range_msr_set(&v->arch.hvm_vcpu.mtrr,
2525 regs->ecx, msr_content) )
2526 goto gp_fault;
2527 break;
2528 case MSR_MTRRcap:
2529 goto gp_fault;
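/*
 * DEBUGCTL: bit 0 enables last-branch recording (LBR), bit 1 is BTF.
 * Enabling LBR loads the model-specific LBR MSR list into the guest MSR
 * area and stops intercepting those MSRs.
 */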
2530 case MSR_IA32_DEBUGCTLMSR: {
2531 int i, rc = 0;
2533 if ( !msr_content || (msr_content & ~3) )
2534 break;
2536 if ( msr_content & 1 )
2537 {
2538 const struct lbr_info *lbr = last_branch_msr_get();
2539 if ( lbr == NULL )
2540 break;
2542 for ( ; (rc == 0) && lbr->count; lbr++ )
2543 for ( i = 0; (rc == 0) && (i < lbr->count); i++ )
2544 if ( (rc = vmx_add_guest_msr(v, lbr->base + i)) == 0 )
2545 vmx_disable_intercept_for_msr(v, lbr->base + i);
2546 }
2548 if ( (rc < 0) ||
2549 (vmx_add_guest_msr(v, ecx) < 0) ||
2550 (vmx_add_host_load_msr(v, ecx) < 0) )
2551 vmx_inject_hw_exception(v, TRAP_machine_check, 0);
2552 else
2553 vmx_write_guest_msr(v, ecx, msr_content);
2555 break;
2556 }
2557 case MSR_IA32_VMX_BASIC...MSR_IA32_VMX_PROCBASED_CTLS2:
2558 goto gp_fault;
2559 default:
2560 switch ( long_mode_do_msr_write(regs) )
2561 {
2562 case HNDL_unhandled:
2563 if ( (vmx_write_guest_msr(v, ecx, msr_content) != 0) &&
2564 !is_last_branch_msr(ecx) )
2565 wrmsr_hypervisor_regs(ecx, regs->eax, regs->edx);
2566 break;
2567 case HNDL_exception_raised:
2568 return 0;
2569 case HNDL_done:
2570 break;
2571 }
2572 break;
2573 }
2575 return 1;
2577 gp_fault:
2578 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2579 return 0;
2580 }
2582 static void vmx_do_hlt(struct cpu_user_regs *regs)
2583 {
2584 HVMTRACE_0D(HLT, current);
2585 hvm_hlt(regs->eflags);
2586 }
2588 static void vmx_do_extint(struct cpu_user_regs *regs)
2589 {
2590 unsigned int vector;
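/*
 * The interrupt belongs to the host: decode the vector from the exit
 * information and dispatch directly to Xen's handler; it is never
 * reflected into the guest from here.
 */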
2592 asmlinkage void do_IRQ(struct cpu_user_regs *);
2593 fastcall void smp_apic_timer_interrupt(struct cpu_user_regs *);
2594 fastcall void smp_event_check_interrupt(void);
2595 fastcall void smp_invalidate_interrupt(void);
2596 fastcall void smp_call_function_interrupt(void);
2597 fastcall void smp_spurious_interrupt(struct cpu_user_regs *regs);
2598 fastcall void smp_error_interrupt(struct cpu_user_regs *regs);
2599 #ifdef CONFIG_X86_MCE_P4THERMAL
2600 fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs);
2601 #endif
2603 vector = __vmread(VM_EXIT_INTR_INFO);
2604 BUG_ON(!(vector & INTR_INFO_VALID_MASK));
2606 vector &= INTR_INFO_VECTOR_MASK;
2607 HVMTRACE_1D(INTR, current, vector);
2609 switch ( vector )
2610 {
2611 case LOCAL_TIMER_VECTOR:
2612 smp_apic_timer_interrupt(regs);
2613 break;
2614 case EVENT_CHECK_VECTOR:
2615 smp_event_check_interrupt();
2616 break;
2617 case INVALIDATE_TLB_VECTOR:
2618 smp_invalidate_interrupt();
2619 break;
2620 case CALL_FUNCTION_VECTOR:
2621 smp_call_function_interrupt();
2622 break;
2623 case SPURIOUS_APIC_VECTOR:
2624 smp_spurious_interrupt(regs);
2625 break;
2626 case ERROR_APIC_VECTOR:
2627 smp_error_interrupt(regs);
2628 break;
2629 #ifdef CONFIG_X86_MCE_P4THERMAL
2630 case THERMAL_APIC_VECTOR:
2631 smp_thermal_interrupt(regs);
2632 break;
2633 #endif
2634 default:
2635 regs->entry_vector = vector;
2636 do_IRQ(regs);
2637 break;
2638 }
2639 }
2641 static void vmx_failed_vmentry(unsigned int exit_reason,
2642 struct cpu_user_regs *regs)
2643 {
2644 unsigned int failed_vmentry_reason = (uint16_t)exit_reason;
2645 unsigned long exit_qualification;
2647 exit_qualification = __vmread(EXIT_QUALIFICATION);
2648 printk("Failed vm entry (exit reason 0x%x) ", exit_reason);
2649 switch ( failed_vmentry_reason )
2650 {
2651 case EXIT_REASON_INVALID_GUEST_STATE:
2652 printk("caused by invalid guest state (%ld).\n", exit_qualification);
2653 break;
2654 case EXIT_REASON_MSR_LOADING:
2655 printk("caused by MSR entry %ld loading.\n", exit_qualification);
2656 break;
2657 case EXIT_REASON_MACHINE_CHECK:
2658 printk("caused by machine check.\n");
2659 HVMTRACE_0D(MCE, current);
2660 do_machine_check(regs);
2661 break;
2662 default:
2663 printk("unknown reason.\n");
2664 break;
2665 }
2667 printk("************* VMCS Area **************\n");
2668 vmcs_dump_vcpu();
2669 printk("**************************************\n");
2671 domain_crash(current->domain);
2672 }
2674 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
2675 {
2676 unsigned int exit_reason, idtv_info;
2677 unsigned long exit_qualification, inst_len = 0;
2678 struct vcpu *v = current;
2680 exit_reason = __vmread(VM_EXIT_REASON);
2682 hvmtrace_vmexit(v, regs->eip, exit_reason);
2684 perfc_incra(vmexits, exit_reason);
2686 if ( exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT )
2687 local_irq_enable();
2689 if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
2690 return vmx_failed_vmentry(exit_reason, regs);
2692 /* Event delivery caused this intercept? Queue for redelivery. */
2693 idtv_info = __vmread(IDT_VECTORING_INFO);
2694 if ( unlikely(idtv_info & INTR_INFO_VALID_MASK) &&
2695 (exit_reason != EXIT_REASON_TASK_SWITCH) )
2696 {
2697 if ( hvm_event_needs_reinjection((idtv_info>>8)&7, idtv_info&0xff) )
2698 {
2699 /* See SDM 3B 25.7.1.1 and .2 for info about masking resvd bits. */
2700 __vmwrite(VM_ENTRY_INTR_INFO,
2701 idtv_info & ~INTR_INFO_RESVD_BITS_MASK);
2702 if ( idtv_info & INTR_INFO_DELIVER_CODE_MASK )
2703 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE,
2704 __vmread(IDT_VECTORING_ERROR_CODE));
2705 }
2707 /*
2708 * Clear NMI-blocking interruptibility info if an NMI delivery faulted.
2709 * Re-delivery will re-set it (see SDM 3B 25.7.1.2).
2710 */
2711 if ( (idtv_info & INTR_INFO_INTR_TYPE_MASK) == (X86_EVENTTYPE_NMI<<8) )
2712 __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
2713 __vmread(GUEST_INTERRUPTIBILITY_INFO) &
2714 ~VMX_INTR_SHADOW_NMI);
2715 }
2717 switch ( exit_reason )
2718 {
2719 case EXIT_REASON_EXCEPTION_NMI:
2720 {
2721 /*
2722 * We don't set the software-interrupt exiting (INT n).
2723 * (1) We can get an exception (e.g. #PG) in the guest, or
2724 * (2) NMI
2725 */
2726 unsigned int intr_info, vector;
2728 intr_info = __vmread(VM_EXIT_INTR_INFO);
2729 BUG_ON(!(intr_info & INTR_INFO_VALID_MASK));
2731 vector = intr_info & INTR_INFO_VECTOR_MASK;
2733 /*
2734 * Re-set the NMI shadow if vmexit caused by a guest IRET fault (see 3B
2735 * 25.7.1.2, "Resuming Guest Software after Handling an Exception").
2736 * (NB. If we emulate this IRET for any reason, we should re-clear!)
2737 */
2738 if ( unlikely(intr_info & INTR_INFO_NMI_UNBLOCKED_BY_IRET) &&
2739 !(__vmread(IDT_VECTORING_INFO) & INTR_INFO_VALID_MASK) &&
2740 (vector != TRAP_double_fault) )
2741 __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
2742 __vmread(GUEST_INTERRUPTIBILITY_INFO)|VMX_INTR_SHADOW_NMI);
2744 perfc_incra(cause_vector, vector);
2746 switch ( vector )
2747 {
2748 case TRAP_debug:
2749 case TRAP_int3:
2750 if ( !v->domain->debugger_attached )
2751 goto exit_and_crash;
2752 domain_pause_for_debugger();
2753 break;
2754 case TRAP_no_device:
2755 vmx_do_no_device_fault();
2756 break;
2757 case TRAP_page_fault:
2758 exit_qualification = __vmread(EXIT_QUALIFICATION);
2759 regs->error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
2761 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2762 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
2763 (unsigned long)regs->eax, (unsigned long)regs->ebx,
2764 (unsigned long)regs->ecx, (unsigned long)regs->edx,
2765 (unsigned long)regs->esi, (unsigned long)regs->edi);
2767 if ( paging_fault(exit_qualification, regs) )
2768 {
2769 hvmtrace_pf_xen(v, exit_qualification, regs->error_code);
2770 break;
2771 }
2773 v->arch.hvm_vcpu.guest_cr[2] = exit_qualification;
2774 vmx_inject_hw_exception(v, TRAP_page_fault, regs->error_code);
2775 break;
2776 case TRAP_nmi:
2777 if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) !=
2778 (X86_EVENTTYPE_NMI << 8) )
2779 goto exit_and_crash;
2780 HVMTRACE_0D(NMI, v);
2781 do_nmi(regs); /* Real NMI, vector 2: normal processing. */
2782 break;
2783 case TRAP_machine_check:
2784 HVMTRACE_0D(MCE, v);
2785 do_machine_check(regs);
2786 break;
2787 default:
2788 goto exit_and_crash;
2789 }
2790 break;
2791 }
2792 case EXIT_REASON_EXTERNAL_INTERRUPT:
2793 vmx_do_extint(regs);
2794 break;
2795 case EXIT_REASON_TRIPLE_FAULT:
2796 hvm_triple_fault();
2797 break;
2798 case EXIT_REASON_PENDING_VIRT_INTR:
2799 /* Disable the interrupt window. */
2800 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2801 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2802 v->arch.hvm_vmx.exec_control);
2803 break;
2804 case EXIT_REASON_PENDING_VIRT_NMI:
2805 /* Disable the NMI window. */
2806 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
2807 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2808 v->arch.hvm_vmx.exec_control);
2809 break;
2810 case EXIT_REASON_TASK_SWITCH: {
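/*
 * Exit qualification: bits 15:0 hold the target TSS selector and bits
 * 31:30 encode how the switch was initiated (CALL/INT, IRET, JMP, or
 * IDT task gate), which indexes reasons[] below.
 */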
2811 const enum hvm_task_switch_reason reasons[] = {
2812 TSW_call_or_int, TSW_iret, TSW_jmp, TSW_call_or_int };
2813 int32_t errcode = -1;
2814 exit_qualification = __vmread(EXIT_QUALIFICATION);
2815 if ( (idtv_info & INTR_INFO_VALID_MASK) &&
2816 (idtv_info & INTR_INFO_DELIVER_CODE_MASK) )
2817 errcode = __vmread(IDT_VECTORING_ERROR_CODE);
2818 hvm_task_switch((uint16_t)exit_qualification,
2819 reasons[(exit_qualification >> 30) & 3],
2820 errcode);
2821 break;
2822 }
2823 case EXIT_REASON_CPUID:
2824 inst_len = __get_instruction_length(); /* Safe: CPUID */
2825 __update_guest_eip(inst_len);
2826 vmx_do_cpuid(regs);
2827 break;
2828 case EXIT_REASON_HLT:
2829 inst_len = __get_instruction_length(); /* Safe: HLT */
2830 __update_guest_eip(inst_len);
2831 vmx_do_hlt(regs);
2832 break;
2833 case EXIT_REASON_INVLPG:
2834 {
2835 inst_len = __get_instruction_length(); /* Safe: INVLPG */
2836 __update_guest_eip(inst_len);
2837 exit_qualification = __vmread(EXIT_QUALIFICATION);
2838 vmx_do_invlpg(exit_qualification);
2839 break;
2840 }
2841 case EXIT_REASON_VMCALL:
2842 {
2843 int rc;
2844 HVMTRACE_1D(VMMCALL, v, regs->eax);
2845 inst_len = __get_instruction_length(); /* Safe: VMCALL */
2846 rc = hvm_do_hypercall(regs);
2847 if ( rc != HVM_HCALL_preempted )
2848 {
2849 __update_guest_eip(inst_len);
2850 if ( rc == HVM_HCALL_invalidate )
2851 send_invalidate_req();
2852 }
2853 break;
2854 }
2855 case EXIT_REASON_CR_ACCESS:
2856 {
2857 exit_qualification = __vmread(EXIT_QUALIFICATION);
2858 inst_len = __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
2859 if ( vmx_cr_access(exit_qualification, regs) )
2860 __update_guest_eip(inst_len);
2861 break;
2862 }
2863 case EXIT_REASON_DR_ACCESS:
2864 exit_qualification = __vmread(EXIT_QUALIFICATION);
2865 vmx_dr_access(exit_qualification, regs);
2866 break;
2867 case EXIT_REASON_IO_INSTRUCTION:
2868 exit_qualification = __vmread(EXIT_QUALIFICATION);
2869 inst_len = __get_instruction_length(); /* Safe: IN, INS, OUT, OUTS */
2870 vmx_io_instruction(exit_qualification, inst_len);
2871 break;
2872 case EXIT_REASON_MSR_READ:
2873 inst_len = __get_instruction_length(); /* Safe: RDMSR */
2874 if ( vmx_do_msr_read(regs) )
2875 __update_guest_eip(inst_len);
2876 break;
2877 case EXIT_REASON_MSR_WRITE:
2878 inst_len = __get_instruction_length(); /* Safe: WRMSR */
2879 if ( vmx_do_msr_write(regs) )
2880 __update_guest_eip(inst_len);
2881 break;
2883 case EXIT_REASON_MWAIT_INSTRUCTION:
2884 case EXIT_REASON_MONITOR_INSTRUCTION:
2885 case EXIT_REASON_VMCLEAR:
2886 case EXIT_REASON_VMLAUNCH:
2887 case EXIT_REASON_VMPTRLD:
2888 case EXIT_REASON_VMPTRST:
2889 case EXIT_REASON_VMREAD:
2890 case EXIT_REASON_VMRESUME:
2891 case EXIT_REASON_VMWRITE:
2892 case EXIT_REASON_VMXOFF:
2893 case EXIT_REASON_VMXON:
2894 vmx_inject_hw_exception(v, TRAP_invalid_op, VMX_DELIVER_NO_ERROR_CODE);
2895 break;
2897 case EXIT_REASON_TPR_BELOW_THRESHOLD:
2898 break;
2900 case EXIT_REASON_APIC_ACCESS:
2901 {
2902 unsigned long offset;
2903 exit_qualification = __vmread(EXIT_QUALIFICATION);
2904 offset = exit_qualification & 0x0fffUL;
2905 handle_mmio(APIC_DEFAULT_PHYS_BASE | offset);
2906 break;
2907 }
2909 case EXIT_REASON_INVD:
2910 case EXIT_REASON_WBINVD:
2911 {
2912 inst_len = __get_instruction_length(); /* Safe: INVD, WBINVD */
2913 __update_guest_eip(inst_len);
2914 if ( !list_empty(&(domain_hvm_iommu(v->domain)->pdev_list)) )
2915 {
2916 wbinvd();
2917 /* Disable further WBINVD intercepts. */
2918 if ( (exit_reason == EXIT_REASON_WBINVD) &&
2919 (vmx_cpu_based_exec_control &
2920 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) )
2921 __vmwrite(SECONDARY_VM_EXEC_CONTROL,
2922 vmx_secondary_exec_control &
2923 ~SECONDARY_EXEC_WBINVD_EXITING);
2924 }
2925 break;
2926 }
2928 default:
2929 exit_and_crash:
2930 gdprintk(XENLOG_ERR, "Bad vmexit (reason %x)\n", exit_reason);
2931 domain_crash(v->domain);
2932 break;
2933 }
2934 }
2936 asmlinkage void vmx_trace_vmentry(void)
2937 {
2938 struct vcpu *v = current;
2940 hvmtrace_vmentry(v);
2941 }
2943 /*
2944 * Local variables:
2945 * mode: C
2946 * c-set-style: "BSD"
2947 * c-basic-offset: 4
2948 * tab-width: 4
2949 * indent-tabs-mode: nil
2950 * End:
2951 */