ia64/xen-unstable

view xen/arch/x86/hvm/vmx/vmx.c @ 16989:92734271810a

vmx realmode: Emulate the protected-mode transition while CS and SS still
hold bad selector values (bottom two bits, i.e. the RPL, non-zero).

Allows the openSUSE 10.3 install CD to boot. Unfortunately the SUSE Linux
10.1 install CD still fails to work...

Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Tue Feb 05 15:45:10 2008 +0000 (2008-02-05)
parents e4edc310e949
children 9d0e86d8c1d1
line source
1 /*
2 * vmx.c: handling VMX architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 */
19 #include <xen/config.h>
20 #include <xen/init.h>
21 #include <xen/lib.h>
22 #include <xen/trace.h>
23 #include <xen/sched.h>
24 #include <xen/irq.h>
25 #include <xen/softirq.h>
26 #include <xen/domain_page.h>
27 #include <xen/hypercall.h>
28 #include <xen/perfc.h>
29 #include <asm/current.h>
30 #include <asm/io.h>
31 #include <asm/regs.h>
32 #include <asm/cpufeature.h>
33 #include <asm/processor.h>
34 #include <asm/types.h>
35 #include <asm/debugreg.h>
36 #include <asm/msr.h>
37 #include <asm/spinlock.h>
38 #include <asm/paging.h>
39 #include <asm/p2m.h>
40 #include <asm/hvm/hvm.h>
41 #include <asm/hvm/support.h>
42 #include <asm/hvm/vmx/vmx.h>
43 #include <asm/hvm/vmx/vmcs.h>
44 #include <asm/hvm/vmx/cpu.h>
45 #include <public/sched.h>
46 #include <public/hvm/ioreq.h>
47 #include <asm/hvm/vpic.h>
48 #include <asm/hvm/vlapic.h>
49 #include <asm/x86_emulate.h>
50 #include <asm/hvm/vpt.h>
51 #include <public/hvm/save.h>
52 #include <asm/hvm/trace.h>
54 enum handler_return { HNDL_done, HNDL_unhandled, HNDL_exception_raised };
56 static void vmx_ctxt_switch_from(struct vcpu *v);
57 static void vmx_ctxt_switch_to(struct vcpu *v);
59 static int vmx_alloc_vlapic_mapping(struct domain *d);
60 static void vmx_free_vlapic_mapping(struct domain *d);
61 static void vmx_install_vlapic_mapping(struct vcpu *v);
62 static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr);
63 static void vmx_update_guest_efer(struct vcpu *v);
65 static int vmx_domain_initialise(struct domain *d)
66 {
67 return vmx_alloc_vlapic_mapping(d);
68 }
70 static void vmx_domain_destroy(struct domain *d)
71 {
72 vmx_free_vlapic_mapping(d);
73 }
75 static int vmx_vcpu_initialise(struct vcpu *v)
76 {
77 int rc;
79 spin_lock_init(&v->arch.hvm_vmx.vmcs_lock);
81 v->arch.schedule_tail = vmx_do_resume;
82 v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
83 v->arch.ctxt_switch_to = vmx_ctxt_switch_to;
85 if ( (rc = vmx_create_vmcs(v)) != 0 )
86 {
87 dprintk(XENLOG_WARNING,
88 "Failed to create VMCS for vcpu %d: err=%d.\n",
89 v->vcpu_id, rc);
90 return rc;
91 }
93 vpmu_initialise(v);
95 vmx_install_vlapic_mapping(v);
97 #ifndef VMXASSIST
98 if ( v->vcpu_id == 0 )
99 v->arch.guest_context.user_regs.eax = 1;
100 v->arch.hvm_vcpu.io_complete = vmx_realmode_io_complete;
101 #endif
103 return 0;
104 }
106 static void vmx_vcpu_destroy(struct vcpu *v)
107 {
108 vmx_destroy_vmcs(v);
109 vpmu_destroy(v);
110 }
112 #ifdef __x86_64__
114 static DEFINE_PER_CPU(struct vmx_msr_state, host_msr_state);
116 static u32 msr_index[VMX_MSR_COUNT] =
117 {
118 MSR_LSTAR, MSR_STAR, MSR_SYSCALL_MASK
119 };
121 static void vmx_save_host_msrs(void)
122 {
123 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
124 int i;
126 for ( i = 0; i < VMX_MSR_COUNT; i++ )
127 rdmsrl(msr_index[i], host_msr_state->msrs[i]);
128 }
130 #define WRITE_MSR(address) \
131 guest_msr_state->msrs[VMX_INDEX_MSR_ ## address] = msr_content; \
132 set_bit(VMX_INDEX_MSR_ ## address, &guest_msr_state->flags); \
133 wrmsrl(MSR_ ## address, msr_content); \
134 set_bit(VMX_INDEX_MSR_ ## address, &host_msr_state->flags); \
135 break
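/*
 * Illustrative expansion (not part of the original file): for example,
 * WRITE_MSR(STAR) in the MSR-write handler below expands to the following,
 * i.e. the value is recorded in the guest's software MSR state, both the
 * guest and host copies are marked dirty in their flags bitmaps, and the
 * hardware MSR is updated immediately:
 *
 *     guest_msr_state->msrs[VMX_INDEX_MSR_STAR] = msr_content;
 *     set_bit(VMX_INDEX_MSR_STAR, &guest_msr_state->flags);
 *     wrmsrl(MSR_STAR, msr_content);
 *     set_bit(VMX_INDEX_MSR_STAR, &host_msr_state->flags);
 *     break;
 */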
137 static enum handler_return long_mode_do_msr_read(struct cpu_user_regs *regs)
138 {
139 u64 msr_content = 0;
140 u32 ecx = regs->ecx;
141 struct vcpu *v = current;
142 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
144 switch ( ecx )
145 {
146 case MSR_EFER:
147 msr_content = v->arch.hvm_vcpu.guest_efer;
148 break;
150 case MSR_FS_BASE:
151 msr_content = __vmread(GUEST_FS_BASE);
152 goto check_long_mode;
154 case MSR_GS_BASE:
155 msr_content = __vmread(GUEST_GS_BASE);
156 goto check_long_mode;
158 case MSR_SHADOW_GS_BASE:
159 msr_content = v->arch.hvm_vmx.shadow_gs;
160 check_long_mode:
161 if ( !(hvm_long_mode_enabled(v)) )
162 {
163 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
164 return HNDL_exception_raised;
165 }
166 break;
168 case MSR_STAR:
169 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_STAR];
170 break;
172 case MSR_LSTAR:
173 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_LSTAR];
174 break;
176 case MSR_CSTAR:
177 msr_content = v->arch.hvm_vmx.cstar;
178 break;
180 case MSR_SYSCALL_MASK:
181 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
182 break;
184 default:
185 return HNDL_unhandled;
186 }
188 HVM_DBG_LOG(DBG_LEVEL_0, "msr 0x%x content 0x%"PRIx64, ecx, msr_content);
190 regs->eax = (u32)(msr_content >> 0);
191 regs->edx = (u32)(msr_content >> 32);
193 return HNDL_done;
194 }
196 static enum handler_return long_mode_do_msr_write(struct cpu_user_regs *regs)
197 {
198 u64 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
199 u32 ecx = regs->ecx;
200 struct vcpu *v = current;
201 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
202 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
204 HVM_DBG_LOG(DBG_LEVEL_0, "msr 0x%x content 0x%"PRIx64, ecx, msr_content);
206 switch ( ecx )
207 {
208 case MSR_EFER:
209 if ( !hvm_set_efer(msr_content) )
210 goto exception_raised;
211 break;
213 case MSR_FS_BASE:
214 case MSR_GS_BASE:
215 case MSR_SHADOW_GS_BASE:
216 if ( !hvm_long_mode_enabled(v) )
217 goto gp_fault;
219 if ( !is_canonical_address(msr_content) )
220 goto uncanonical_address;
222 if ( ecx == MSR_FS_BASE )
223 __vmwrite(GUEST_FS_BASE, msr_content);
224 else if ( ecx == MSR_GS_BASE )
225 __vmwrite(GUEST_GS_BASE, msr_content);
226 else
227 {
228 v->arch.hvm_vmx.shadow_gs = msr_content;
229 wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
230 }
232 break;
234 case MSR_STAR:
235 WRITE_MSR(STAR);
237 case MSR_LSTAR:
238 if ( !is_canonical_address(msr_content) )
239 goto uncanonical_address;
240 WRITE_MSR(LSTAR);
242 case MSR_CSTAR:
243 if ( !is_canonical_address(msr_content) )
244 goto uncanonical_address;
245 v->arch.hvm_vmx.cstar = msr_content;
246 break;
248 case MSR_SYSCALL_MASK:
249 WRITE_MSR(SYSCALL_MASK);
251 default:
252 return HNDL_unhandled;
253 }
255 return HNDL_done;
257 uncanonical_address:
258 HVM_DBG_LOG(DBG_LEVEL_0, "Not cano address of msr write %x", ecx);
259 gp_fault:
260 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
261 exception_raised:
262 return HNDL_exception_raised;
263 }
265 /*
266 * To avoid MSR save/restore at every VM exit/entry time, we restore
267 * the x86_64 specific MSRs at domain switch time. Since these MSRs
268 * are not modified once set for para domains, we don't save them,
269 * but simply reset them to values set in percpu_traps_init().
270 */
271 static void vmx_restore_host_msrs(void)
272 {
273 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
274 int i;
276 while ( host_msr_state->flags )
277 {
278 i = find_first_set_bit(host_msr_state->flags);
279 wrmsrl(msr_index[i], host_msr_state->msrs[i]);
280 clear_bit(i, &host_msr_state->flags);
281 }
283 if ( cpu_has_nx && !(read_efer() & EFER_NX) )
284 write_efer(read_efer() | EFER_NX);
285 }
287 static void vmx_save_guest_msrs(struct vcpu *v)
288 {
289 /* MSR_SHADOW_GS_BASE may have been changed by swapgs instruction. */
290 rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
291 }
293 static void vmx_restore_guest_msrs(struct vcpu *v)
294 {
295 struct vmx_msr_state *guest_msr_state, *host_msr_state;
296 unsigned long guest_flags;
297 int i;
299 guest_msr_state = &v->arch.hvm_vmx.msr_state;
300 host_msr_state = &this_cpu(host_msr_state);
302 wrmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
304 guest_flags = guest_msr_state->flags;
306 while ( guest_flags )
307 {
308 i = find_first_set_bit(guest_flags);
310 HVM_DBG_LOG(DBG_LEVEL_2,
311 "restore guest's index %d msr %x with value %lx",
312 i, msr_index[i], guest_msr_state->msrs[i]);
313 set_bit(i, &host_msr_state->flags);
314 wrmsrl(msr_index[i], guest_msr_state->msrs[i]);
315 clear_bit(i, &guest_flags);
316 }
318 if ( (v->arch.hvm_vcpu.guest_efer ^ read_efer()) & (EFER_NX | EFER_SCE) )
319 {
320 HVM_DBG_LOG(DBG_LEVEL_2,
321 "restore guest's EFER with value %lx",
322 v->arch.hvm_vcpu.guest_efer);
323 write_efer((read_efer() & ~(EFER_NX | EFER_SCE)) |
324 (v->arch.hvm_vcpu.guest_efer & (EFER_NX | EFER_SCE)));
325 }
326 }
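/*
 * Worked example for the EFER merge in vmx_restore_guest_msrs() above
 * (illustrative only): with host EFER = LME|LMA|NX|SCE and guest EFER =
 * LME|LMA|NX, the two differ in SCE, so write_efer() is called with
 * (host & ~(NX|SCE)) | (guest & (NX|SCE)) = LME|LMA|NX, disabling
 * SYSCALL for the guest while leaving the host's long-mode bits alone.
 */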
328 #else /* __i386__ */
330 #define vmx_save_host_msrs() ((void)0)
332 static void vmx_restore_host_msrs(void)
333 {
334 if ( cpu_has_nx && !(read_efer() & EFER_NX) )
335 write_efer(read_efer() | EFER_NX);
336 }
338 #define vmx_save_guest_msrs(v) ((void)0)
340 static void vmx_restore_guest_msrs(struct vcpu *v)
341 {
342 if ( (v->arch.hvm_vcpu.guest_efer ^ read_efer()) & EFER_NX )
343 {
344 HVM_DBG_LOG(DBG_LEVEL_2,
345 "restore guest's EFER with value %lx",
346 v->arch.hvm_vcpu.guest_efer);
347 write_efer((read_efer() & ~EFER_NX) |
348 (v->arch.hvm_vcpu.guest_efer & EFER_NX));
349 }
350 }
352 static enum handler_return long_mode_do_msr_read(struct cpu_user_regs *regs)
353 {
354 u64 msr_content = 0;
355 struct vcpu *v = current;
357 switch ( regs->ecx )
358 {
359 case MSR_EFER:
360 msr_content = v->arch.hvm_vcpu.guest_efer;
361 break;
363 default:
364 return HNDL_unhandled;
365 }
367 regs->eax = msr_content >> 0;
368 regs->edx = msr_content >> 32;
370 return HNDL_done;
371 }
373 static enum handler_return long_mode_do_msr_write(struct cpu_user_regs *regs)
374 {
375 u64 msr_content = regs->eax | ((u64)regs->edx << 32);
377 switch ( regs->ecx )
378 {
379 case MSR_EFER:
380 if ( !hvm_set_efer(msr_content) )
381 return HNDL_exception_raised;
382 break;
384 default:
385 return HNDL_unhandled;
386 }
388 return HNDL_done;
389 }
391 #endif /* __i386__ */
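/*
 * Return values of vmx_guest_x86_mode() below (derived from the code):
 *   0 - guest is not in protected mode (CR0.PE clear),
 *   1 - virtual-8086 mode (EFLAGS.VM set),
 *   8 - 64-bit mode (long mode active with CS.L set),
 *   4 - 32-bit protected mode (CS.D set),
 *   2 - 16-bit protected mode otherwise.
 */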
393 static int vmx_guest_x86_mode(struct vcpu *v)
394 {
395 unsigned int cs_ar_bytes;
397 if ( unlikely(!(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE)) )
398 return 0;
399 if ( unlikely(guest_cpu_user_regs()->eflags & X86_EFLAGS_VM) )
400 return 1;
401 cs_ar_bytes = __vmread(GUEST_CS_AR_BYTES);
402 if ( hvm_long_mode_enabled(v) &&
403 likely(cs_ar_bytes & X86_SEG_AR_CS_LM_ACTIVE) )
404 return 8;
405 return (likely(cs_ar_bytes & X86_SEG_AR_DEF_OP_SIZE) ? 4 : 2);
406 }
408 static void vmx_save_dr(struct vcpu *v)
409 {
410 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
411 return;
413 /* Clear the DR dirty flag and re-enable intercepts for DR accesses. */
414 v->arch.hvm_vcpu.flag_dr_dirty = 0;
415 v->arch.hvm_vmx.exec_control |= CPU_BASED_MOV_DR_EXITING;
416 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
418 v->arch.guest_context.debugreg[0] = read_debugreg(0);
419 v->arch.guest_context.debugreg[1] = read_debugreg(1);
420 v->arch.guest_context.debugreg[2] = read_debugreg(2);
421 v->arch.guest_context.debugreg[3] = read_debugreg(3);
422 v->arch.guest_context.debugreg[6] = read_debugreg(6);
423 /* DR7 must be saved as it is used by vmx_restore_dr(). */
424 v->arch.guest_context.debugreg[7] = __vmread(GUEST_DR7);
425 }
427 static void __restore_debug_registers(struct vcpu *v)
428 {
429 if ( v->arch.hvm_vcpu.flag_dr_dirty )
430 return;
432 v->arch.hvm_vcpu.flag_dr_dirty = 1;
434 write_debugreg(0, v->arch.guest_context.debugreg[0]);
435 write_debugreg(1, v->arch.guest_context.debugreg[1]);
436 write_debugreg(2, v->arch.guest_context.debugreg[2]);
437 write_debugreg(3, v->arch.guest_context.debugreg[3]);
438 write_debugreg(6, v->arch.guest_context.debugreg[6]);
439 /* DR7 is loaded from the VMCS. */
440 }
442 /*
443 * DR7 is saved and restored on every vmexit. Other debug registers only
444 * need to be restored if their value is going to affect execution -- i.e.,
445 * if one of the breakpoints is enabled. So mask out all bits that don't
446 * enable some breakpoint functionality.
447 */
448 static void vmx_restore_dr(struct vcpu *v)
449 {
450 /* NB. __vmread() is not usable here, so we cannot read from the VMCS. */
451 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
452 __restore_debug_registers(v);
453 }
455 void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
456 {
457 uint32_t ev;
459 vmx_vmcs_enter(v);
461 c->cr0 = v->arch.hvm_vcpu.guest_cr[0];
462 c->cr2 = v->arch.hvm_vcpu.guest_cr[2];
463 c->cr3 = v->arch.hvm_vcpu.guest_cr[3];
464 c->cr4 = v->arch.hvm_vcpu.guest_cr[4];
466 c->msr_efer = v->arch.hvm_vcpu.guest_efer;
468 c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
469 c->idtr_base = __vmread(GUEST_IDTR_BASE);
471 c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
472 c->gdtr_base = __vmread(GUEST_GDTR_BASE);
474 c->cs_sel = __vmread(GUEST_CS_SELECTOR);
475 c->cs_limit = __vmread(GUEST_CS_LIMIT);
476 c->cs_base = __vmread(GUEST_CS_BASE);
477 c->cs_arbytes = __vmread(GUEST_CS_AR_BYTES);
479 c->ds_sel = __vmread(GUEST_DS_SELECTOR);
480 c->ds_limit = __vmread(GUEST_DS_LIMIT);
481 c->ds_base = __vmread(GUEST_DS_BASE);
482 c->ds_arbytes = __vmread(GUEST_DS_AR_BYTES);
484 c->es_sel = __vmread(GUEST_ES_SELECTOR);
485 c->es_limit = __vmread(GUEST_ES_LIMIT);
486 c->es_base = __vmread(GUEST_ES_BASE);
487 c->es_arbytes = __vmread(GUEST_ES_AR_BYTES);
489 c->ss_sel = __vmread(GUEST_SS_SELECTOR);
490 c->ss_limit = __vmread(GUEST_SS_LIMIT);
491 c->ss_base = __vmread(GUEST_SS_BASE);
492 c->ss_arbytes = __vmread(GUEST_SS_AR_BYTES);
494 c->fs_sel = __vmread(GUEST_FS_SELECTOR);
495 c->fs_limit = __vmread(GUEST_FS_LIMIT);
496 c->fs_base = __vmread(GUEST_FS_BASE);
497 c->fs_arbytes = __vmread(GUEST_FS_AR_BYTES);
499 c->gs_sel = __vmread(GUEST_GS_SELECTOR);
500 c->gs_limit = __vmread(GUEST_GS_LIMIT);
501 c->gs_base = __vmread(GUEST_GS_BASE);
502 c->gs_arbytes = __vmread(GUEST_GS_AR_BYTES);
504 c->tr_sel = __vmread(GUEST_TR_SELECTOR);
505 c->tr_limit = __vmread(GUEST_TR_LIMIT);
506 c->tr_base = __vmread(GUEST_TR_BASE);
507 c->tr_arbytes = __vmread(GUEST_TR_AR_BYTES);
509 c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR);
510 c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT);
511 c->ldtr_base = __vmread(GUEST_LDTR_BASE);
512 c->ldtr_arbytes = __vmread(GUEST_LDTR_AR_BYTES);
514 c->sysenter_cs = __vmread(GUEST_SYSENTER_CS);
515 c->sysenter_esp = __vmread(GUEST_SYSENTER_ESP);
516 c->sysenter_eip = __vmread(GUEST_SYSENTER_EIP);
518 c->pending_event = 0;
519 c->error_code = 0;
520 if ( ((ev = __vmread(VM_ENTRY_INTR_INFO)) & INTR_INFO_VALID_MASK) &&
521 hvm_event_needs_reinjection((ev >> 8) & 7, ev & 0xff) )
522 {
523 c->pending_event = ev;
524 c->error_code = __vmread(VM_ENTRY_EXCEPTION_ERROR_CODE);
525 }
527 vmx_vmcs_exit(v);
528 }
530 static int vmx_restore_cr0_cr3(
531 struct vcpu *v, unsigned long cr0, unsigned long cr3)
532 {
533 unsigned long mfn = 0;
534 p2m_type_t p2mt;
536 if ( cr0 & X86_CR0_PG )
537 {
538 mfn = mfn_x(gfn_to_mfn(v->domain, cr3 >> PAGE_SHIFT, &p2mt));
539 if ( !p2m_is_ram(p2mt) || !get_page(mfn_to_page(mfn), v->domain) )
540 {
541 gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%lx\n", cr3);
542 return -EINVAL;
543 }
544 }
546 if ( v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PG )
547 put_page(pagetable_get_page(v->arch.guest_table));
549 v->arch.guest_table = pagetable_from_pfn(mfn);
551 v->arch.hvm_vcpu.guest_cr[0] = cr0 | X86_CR0_ET;
552 v->arch.hvm_vcpu.guest_cr[3] = cr3;
554 return 0;
555 }
557 int vmx_vmcs_restore(struct vcpu *v, struct hvm_hw_cpu *c)
558 {
559 int rc;
561 if ( c->pending_valid &&
562 ((c->pending_type == 1) || (c->pending_type > 6) ||
563 (c->pending_reserved != 0)) )
564 {
565 gdprintk(XENLOG_ERR, "Invalid pending event 0x%"PRIx32".\n",
566 c->pending_event);
567 return -EINVAL;
568 }
570 rc = vmx_restore_cr0_cr3(v, c->cr0, c->cr3);
571 if ( rc )
572 return rc;
574 vmx_vmcs_enter(v);
576 v->arch.hvm_vcpu.guest_cr[2] = c->cr2;
577 v->arch.hvm_vcpu.guest_cr[4] = c->cr4;
578 vmx_update_guest_cr(v, 0);
579 vmx_update_guest_cr(v, 2);
580 vmx_update_guest_cr(v, 4);
582 #ifdef HVM_DEBUG_SUSPEND
583 printk("%s: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
584 __func__, c->cr3, c->cr0, c->cr4);
585 #endif
587 v->arch.hvm_vcpu.guest_efer = c->msr_efer;
588 vmx_update_guest_efer(v);
590 __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
591 __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
593 __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
594 __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
596 __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
597 __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
598 __vmwrite(GUEST_CS_BASE, c->cs_base);
599 __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes);
601 __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
602 __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
603 __vmwrite(GUEST_DS_BASE, c->ds_base);
604 __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes);
606 __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
607 __vmwrite(GUEST_ES_LIMIT, c->es_limit);
608 __vmwrite(GUEST_ES_BASE, c->es_base);
609 __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes);
611 __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
612 __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
613 __vmwrite(GUEST_SS_BASE, c->ss_base);
614 __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes);
616 __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
617 __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
618 __vmwrite(GUEST_FS_BASE, c->fs_base);
619 __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes);
621 __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
622 __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
623 __vmwrite(GUEST_GS_BASE, c->gs_base);
624 __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes);
626 __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
627 __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
628 __vmwrite(GUEST_TR_BASE, c->tr_base);
629 __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes);
631 __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
632 __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
633 __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
634 __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes);
636 __vmwrite(GUEST_SYSENTER_CS, c->sysenter_cs);
637 __vmwrite(GUEST_SYSENTER_ESP, c->sysenter_esp);
638 __vmwrite(GUEST_SYSENTER_EIP, c->sysenter_eip);
640 __vmwrite(GUEST_DR7, c->dr7);
642 vmx_vmcs_exit(v);
644 paging_update_paging_modes(v);
646 if ( c->pending_valid )
647 {
648 gdprintk(XENLOG_INFO, "Re-injecting 0x%"PRIx32", 0x%"PRIx32"\n",
649 c->pending_event, c->error_code);
651 if ( hvm_event_needs_reinjection(c->pending_type, c->pending_vector) )
652 {
653 vmx_vmcs_enter(v);
654 __vmwrite(VM_ENTRY_INTR_INFO, c->pending_event);
655 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, c->error_code);
656 vmx_vmcs_exit(v);
657 }
658 }
660 return 0;
661 }
663 #if defined(__x86_64__) && defined(HVM_DEBUG_SUSPEND)
664 static void dump_msr_state(struct vmx_msr_state *m)
665 {
666 int i = 0;
667 printk("**** msr state ****\n");
668 printk("shadow_gs=0x%lx, flags=0x%lx, msr_items:", m->shadow_gs, m->flags);
669 for ( i = 0; i < VMX_MSR_COUNT; i++ )
670 printk("0x%lx,", m->msrs[i]);
671 printk("\n");
672 }
673 #else
674 #define dump_msr_state(m) ((void)0)
675 #endif
677 static void vmx_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
678 {
679 #ifdef __x86_64__
680 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
681 unsigned long guest_flags = guest_state->flags;
683 data->shadow_gs = v->arch.hvm_vmx.shadow_gs;
684 data->msr_cstar = v->arch.hvm_vmx.cstar;
686 /* save msrs */
687 data->msr_flags = guest_flags;
688 data->msr_lstar = guest_state->msrs[VMX_INDEX_MSR_LSTAR];
689 data->msr_star = guest_state->msrs[VMX_INDEX_MSR_STAR];
690 data->msr_syscall_mask = guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
691 #endif
693 data->tsc = hvm_get_guest_time(v);
695 dump_msr_state(guest_state);
696 }
698 static void vmx_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
699 {
700 #ifdef __x86_64__
701 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
703 /* restore msrs */
704 guest_state->flags = data->msr_flags;
705 guest_state->msrs[VMX_INDEX_MSR_LSTAR] = data->msr_lstar;
706 guest_state->msrs[VMX_INDEX_MSR_STAR] = data->msr_star;
707 guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK] = data->msr_syscall_mask;
709 v->arch.hvm_vmx.cstar = data->msr_cstar;
710 v->arch.hvm_vmx.shadow_gs = data->shadow_gs;
711 #endif
713 #ifdef VMXASSIST
714 v->arch.hvm_vmx.vmxassist_enabled = !(data->cr0 & X86_CR0_PE);
715 #endif
717 hvm_set_guest_time(v, data->tsc);
719 dump_msr_state(guest_state);
720 }
723 static void vmx_save_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
724 {
725 vmx_save_cpu_state(v, ctxt);
726 vmx_vmcs_save(v, ctxt);
727 }
729 static int vmx_load_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
730 {
731 vmx_load_cpu_state(v, ctxt);
733 if ( vmx_vmcs_restore(v, ctxt) )
734 {
735 gdprintk(XENLOG_ERR, "vmx_vmcs restore failed!\n");
736 domain_crash(v->domain);
737 return -EINVAL;
738 }
740 return 0;
741 }
743 static void vmx_fpu_enter(struct vcpu *v)
744 {
745 setup_fpu(v);
746 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
747 v->arch.hvm_vmx.host_cr0 &= ~X86_CR0_TS;
748 __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
749 }
751 static void vmx_fpu_leave(struct vcpu *v)
752 {
753 ASSERT(!v->fpu_dirtied);
754 ASSERT(read_cr0() & X86_CR0_TS);
756 if ( !(v->arch.hvm_vmx.host_cr0 & X86_CR0_TS) )
757 {
758 v->arch.hvm_vmx.host_cr0 |= X86_CR0_TS;
759 __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
760 }
762 /*
763 * If the guest does not have TS enabled then we must cause and handle an
764 * exception on first use of the FPU. If the guest *does* have TS enabled
765 * then this is not necessary: no FPU activity can occur until the guest
766 * clears CR0.TS, and we will initialise the FPU when that happens.
767 */
768 if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
769 {
770 v->arch.hvm_vcpu.hw_cr[0] |= X86_CR0_TS;
771 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
772 __vm_set_bit(EXCEPTION_BITMAP, TRAP_no_device);
773 }
774 }
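/*
 * Sketch of the lazy-FPU flow implied by vmx_fpu_enter()/vmx_fpu_leave()
 * above (descriptive note, not part of the original file):
 *   1. On context-switch-out, vmx_fpu_leave() sets TS in the host CR0 and,
 *      if the guest itself runs with CR0.TS clear, also sets TS in the
 *      hardware guest CR0 and intercepts #NM (TRAP_no_device).
 *   2. The guest's first FPU use then traps to Xen, which calls
 *      vmx_do_no_device_fault() (further below) -> vmx_fpu_enter() to set
 *      up the FPU, drop the #NM intercept and clear CR0.TS again.
 */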
776 static void vmx_ctxt_switch_from(struct vcpu *v)
777 {
778 vmx_fpu_leave(v);
779 vmx_save_guest_msrs(v);
780 vmx_restore_host_msrs();
781 vmx_save_dr(v);
782 vpmu_save(v);
783 }
785 static void vmx_ctxt_switch_to(struct vcpu *v)
786 {
787 /* HOST_CR4 in VMCS is always mmu_cr4_features. Sync CR4 now. */
788 if ( unlikely(read_cr4() != mmu_cr4_features) )
789 write_cr4(mmu_cr4_features);
791 vmx_restore_guest_msrs(v);
792 vmx_restore_dr(v);
793 vpmu_load(v);
794 }
796 static unsigned long vmx_get_segment_base(struct vcpu *v, enum x86_segment seg)
797 {
798 unsigned long base = 0;
799 int long_mode = 0;
801 ASSERT(v == current);
803 if ( hvm_long_mode_enabled(v) &&
804 (__vmread(GUEST_CS_AR_BYTES) & X86_SEG_AR_CS_LM_ACTIVE) )
805 long_mode = 1;
807 switch ( seg )
808 {
809 case x86_seg_cs: if ( !long_mode ) base = __vmread(GUEST_CS_BASE); break;
810 case x86_seg_ds: if ( !long_mode ) base = __vmread(GUEST_DS_BASE); break;
811 case x86_seg_es: if ( !long_mode ) base = __vmread(GUEST_ES_BASE); break;
812 case x86_seg_fs: base = __vmread(GUEST_FS_BASE); break;
813 case x86_seg_gs: base = __vmread(GUEST_GS_BASE); break;
814 case x86_seg_ss: if ( !long_mode ) base = __vmread(GUEST_SS_BASE); break;
815 case x86_seg_tr: base = __vmread(GUEST_TR_BASE); break;
816 case x86_seg_gdtr: base = __vmread(GUEST_GDTR_BASE); break;
817 case x86_seg_idtr: base = __vmread(GUEST_IDTR_BASE); break;
818 case x86_seg_ldtr: base = __vmread(GUEST_LDTR_BASE); break;
819 default: BUG(); break;
820 }
822 return base;
823 }
825 static void vmx_get_segment_register(struct vcpu *v, enum x86_segment seg,
826 struct segment_register *reg)
827 {
828 uint32_t attr = 0;
830 ASSERT(v == current);
832 switch ( seg )
833 {
834 case x86_seg_cs:
835 reg->sel = __vmread(GUEST_CS_SELECTOR);
836 reg->limit = __vmread(GUEST_CS_LIMIT);
837 reg->base = __vmread(GUEST_CS_BASE);
838 attr = __vmread(GUEST_CS_AR_BYTES);
839 break;
840 case x86_seg_ds:
841 reg->sel = __vmread(GUEST_DS_SELECTOR);
842 reg->limit = __vmread(GUEST_DS_LIMIT);
843 reg->base = __vmread(GUEST_DS_BASE);
844 attr = __vmread(GUEST_DS_AR_BYTES);
845 break;
846 case x86_seg_es:
847 reg->sel = __vmread(GUEST_ES_SELECTOR);
848 reg->limit = __vmread(GUEST_ES_LIMIT);
849 reg->base = __vmread(GUEST_ES_BASE);
850 attr = __vmread(GUEST_ES_AR_BYTES);
851 break;
852 case x86_seg_fs:
853 reg->sel = __vmread(GUEST_FS_SELECTOR);
854 reg->limit = __vmread(GUEST_FS_LIMIT);
855 reg->base = __vmread(GUEST_FS_BASE);
856 attr = __vmread(GUEST_FS_AR_BYTES);
857 break;
858 case x86_seg_gs:
859 reg->sel = __vmread(GUEST_GS_SELECTOR);
860 reg->limit = __vmread(GUEST_GS_LIMIT);
861 reg->base = __vmread(GUEST_GS_BASE);
862 attr = __vmread(GUEST_GS_AR_BYTES);
863 break;
864 case x86_seg_ss:
865 reg->sel = __vmread(GUEST_SS_SELECTOR);
866 reg->limit = __vmread(GUEST_SS_LIMIT);
867 reg->base = __vmread(GUEST_SS_BASE);
868 attr = __vmread(GUEST_SS_AR_BYTES);
869 break;
870 case x86_seg_tr:
871 reg->sel = __vmread(GUEST_TR_SELECTOR);
872 reg->limit = __vmread(GUEST_TR_LIMIT);
873 reg->base = __vmread(GUEST_TR_BASE);
874 attr = __vmread(GUEST_TR_AR_BYTES);
875 break;
876 case x86_seg_gdtr:
877 reg->limit = __vmread(GUEST_GDTR_LIMIT);
878 reg->base = __vmread(GUEST_GDTR_BASE);
879 break;
880 case x86_seg_idtr:
881 reg->limit = __vmread(GUEST_IDTR_LIMIT);
882 reg->base = __vmread(GUEST_IDTR_BASE);
883 break;
884 case x86_seg_ldtr:
885 reg->sel = __vmread(GUEST_LDTR_SELECTOR);
886 reg->limit = __vmread(GUEST_LDTR_LIMIT);
887 reg->base = __vmread(GUEST_LDTR_BASE);
888 attr = __vmread(GUEST_LDTR_AR_BYTES);
889 break;
890 default:
891 BUG();
892 }
894 reg->attr.bytes = (attr & 0xff) | ((attr >> 4) & 0xf00);
895 /* Unusable flag is folded into Present flag. */
896 if ( attr & (1u<<16) )
897 reg->attr.fields.p = 0;
898 }
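/*
 * Worked example for the attribute repacking above (illustrative only):
 * a flat 32-bit code segment has VMX access-rights 0xc09b.  The line
 *     reg->attr.bytes = (attr & 0xff) | ((attr >> 4) & 0xf00);
 * yields 0x09b | 0xc00 = 0xc9b, i.e. the AVL/L/D/G bits move from bits
 * 12-15 of the VMX encoding down to bits 8-11 of the packed descriptor
 * format.  vmx_set_segment_register() below performs the inverse:
 *     ((0xc9b & 0xf00) << 4) | (0xc9b & 0xff) = 0xc09b.
 */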
900 static void vmx_set_segment_register(struct vcpu *v, enum x86_segment seg,
901 struct segment_register *reg)
902 {
903 uint32_t attr;
905 ASSERT((v == current) || !vcpu_runnable(v));
907 attr = reg->attr.bytes;
908 attr = ((attr & 0xf00) << 4) | (attr & 0xff);
910 /* Not-present must mean unusable. */
911 if ( !reg->attr.fields.p )
912 attr |= (1u << 16);
914 vmx_vmcs_enter(v);
916 switch ( seg )
917 {
918 case x86_seg_cs:
919 __vmwrite(GUEST_CS_SELECTOR, reg->sel);
920 __vmwrite(GUEST_CS_LIMIT, reg->limit);
921 __vmwrite(GUEST_CS_BASE, reg->base);
922 __vmwrite(GUEST_CS_AR_BYTES, attr);
923 break;
924 case x86_seg_ds:
925 __vmwrite(GUEST_DS_SELECTOR, reg->sel);
926 __vmwrite(GUEST_DS_LIMIT, reg->limit);
927 __vmwrite(GUEST_DS_BASE, reg->base);
928 __vmwrite(GUEST_DS_AR_BYTES, attr);
929 break;
930 case x86_seg_es:
931 __vmwrite(GUEST_ES_SELECTOR, reg->sel);
932 __vmwrite(GUEST_ES_LIMIT, reg->limit);
933 __vmwrite(GUEST_ES_BASE, reg->base);
934 __vmwrite(GUEST_ES_AR_BYTES, attr);
935 break;
936 case x86_seg_fs:
937 __vmwrite(GUEST_FS_SELECTOR, reg->sel);
938 __vmwrite(GUEST_FS_LIMIT, reg->limit);
939 __vmwrite(GUEST_FS_BASE, reg->base);
940 __vmwrite(GUEST_FS_AR_BYTES, attr);
941 break;
942 case x86_seg_gs:
943 __vmwrite(GUEST_GS_SELECTOR, reg->sel);
944 __vmwrite(GUEST_GS_LIMIT, reg->limit);
945 __vmwrite(GUEST_GS_BASE, reg->base);
946 __vmwrite(GUEST_GS_AR_BYTES, attr);
947 break;
948 case x86_seg_ss:
949 __vmwrite(GUEST_SS_SELECTOR, reg->sel);
950 __vmwrite(GUEST_SS_LIMIT, reg->limit);
951 __vmwrite(GUEST_SS_BASE, reg->base);
952 __vmwrite(GUEST_SS_AR_BYTES, attr);
953 break;
954 case x86_seg_tr:
955 __vmwrite(GUEST_TR_SELECTOR, reg->sel);
956 __vmwrite(GUEST_TR_LIMIT, reg->limit);
957 __vmwrite(GUEST_TR_BASE, reg->base);
958 __vmwrite(GUEST_TR_AR_BYTES, attr);
959 break;
960 case x86_seg_gdtr:
961 __vmwrite(GUEST_GDTR_LIMIT, reg->limit);
962 __vmwrite(GUEST_GDTR_BASE, reg->base);
963 break;
964 case x86_seg_idtr:
965 __vmwrite(GUEST_IDTR_LIMIT, reg->limit);
966 __vmwrite(GUEST_IDTR_BASE, reg->base);
967 break;
968 case x86_seg_ldtr:
969 __vmwrite(GUEST_LDTR_SELECTOR, reg->sel);
970 __vmwrite(GUEST_LDTR_LIMIT, reg->limit);
971 __vmwrite(GUEST_LDTR_BASE, reg->base);
972 __vmwrite(GUEST_LDTR_AR_BYTES, attr);
973 break;
974 default:
975 BUG();
976 }
978 vmx_vmcs_exit(v);
979 }
981 static void vmx_set_tsc_offset(struct vcpu *v, u64 offset)
982 {
983 vmx_vmcs_enter(v);
984 __vmwrite(TSC_OFFSET, offset);
985 #if defined (__i386__)
986 __vmwrite(TSC_OFFSET_HIGH, offset >> 32);
987 #endif
988 vmx_vmcs_exit(v);
989 }
991 void do_nmi(struct cpu_user_regs *);
993 static void vmx_init_hypercall_page(struct domain *d, void *hypercall_page)
994 {
995 char *p;
996 int i;
998 for ( i = 0; i < (PAGE_SIZE / 32); i++ )
999 {
1000 p = (char *)(hypercall_page + (i * 32));
1001 *(u8 *)(p + 0) = 0xb8; /* mov imm32, %eax */
1002 *(u32 *)(p + 1) = i;
1003 *(u8 *)(p + 5) = 0x0f; /* vmcall */
1004 *(u8 *)(p + 6) = 0x01;
1005 *(u8 *)(p + 7) = 0xc1;
1006 *(u8 *)(p + 8) = 0xc3; /* ret */
1007 }
1009 /* Don't support HYPERVISOR_iret at the moment */
1010 *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
1011 }
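/*
 * Resulting layout of each 32-byte hypercall stub written above (shown for
 * illustration; the byte values follow directly from the code):
 *
 *     b8 NN 00 00 00    mov  $NN, %eax     ; NN = hypercall number i
 *     0f 01 c1          vmcall
 *     c3                ret
 *
 * except for the __HYPERVISOR_iret slot, whose first two bytes are
 * overwritten with 0f 0b (ud2) because HYPERVISOR_iret is not supported.
 */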
1013 static enum hvm_intblk vmx_interrupt_blocked(
1014 struct vcpu *v, struct hvm_intack intack)
1016 unsigned long intr_shadow;
1018 intr_shadow = __vmread(GUEST_INTERRUPTIBILITY_INFO);
1020 if ( intr_shadow & (VMX_INTR_SHADOW_STI|VMX_INTR_SHADOW_MOV_SS) )
1021 return hvm_intblk_shadow;
1023 if ( intack.source == hvm_intsrc_nmi )
1024 return ((intr_shadow & VMX_INTR_SHADOW_NMI) ?
1025 hvm_intblk_nmi_iret : hvm_intblk_none);
1027 ASSERT((intack.source == hvm_intsrc_pic) ||
1028 (intack.source == hvm_intsrc_lapic));
1030 if ( !(guest_cpu_user_regs()->eflags & X86_EFLAGS_IF) )
1031 return hvm_intblk_rflags_ie;
1033 return hvm_intblk_none;
1036 static void vmx_update_host_cr3(struct vcpu *v)
1038 ASSERT((v == current) || !vcpu_runnable(v));
1039 vmx_vmcs_enter(v);
1040 __vmwrite(HOST_CR3, v->arch.cr3);
1041 vmx_vmcs_exit(v);
1044 static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr)
1046 ASSERT((v == current) || !vcpu_runnable(v));
1048 vmx_vmcs_enter(v);
1050 switch ( cr )
1052 case 0: {
1053 unsigned long hw_cr0_mask =
1054 X86_CR0_NE | X86_CR0_PG | X86_CR0_WP | X86_CR0_PE;
1056 if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
1058 if ( v != current )
1059 hw_cr0_mask |= X86_CR0_TS;
1060 else if ( v->arch.hvm_vcpu.hw_cr[0] & X86_CR0_TS )
1061 vmx_fpu_enter(v);
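/*
 * Descriptive note (not in the original source): hardware VMX cannot run a
 * guest that is outside protected mode, so clearing CR0.PE flags the vcpu
 * for the real-mode emulator via VMXEMUL_REALMODE below.  Per this
 * changeset's description, emulation also continues across the
 * protected-mode transition while the CS or SS selector still has its
 * bottom two (RPL) bits set; that handling lives in the real-mode
 * emulation path rather than in this function.
 */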
1064 v->arch.hvm_vmx.vmxemul &= ~VMXEMUL_REALMODE;
1065 if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) )
1066 v->arch.hvm_vmx.vmxemul |= VMXEMUL_REALMODE;
1068 v->arch.hvm_vcpu.hw_cr[0] =
1069 v->arch.hvm_vcpu.guest_cr[0] | hw_cr0_mask;
1070 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
1071 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[0]);
1072 break;
1074 case 2:
1075 /* CR2 is updated in exit stub. */
1076 break;
1077 case 3:
1078 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr[3]);
1079 break;
1080 case 4:
1081 v->arch.hvm_vcpu.hw_cr[4] =
1082 v->arch.hvm_vcpu.guest_cr[4] | HVM_CR4_HOST_MASK;
1083 __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
1084 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[4]);
1085 break;
1086 default:
1087 BUG();
1090 vmx_vmcs_exit(v);
1093 static void vmx_update_guest_efer(struct vcpu *v)
1095 #ifdef __x86_64__
1096 unsigned long vm_entry_value;
1098 ASSERT((v == current) || !vcpu_runnable(v));
1100 vmx_vmcs_enter(v);
1102 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
1103 if ( v->arch.hvm_vcpu.guest_efer & EFER_LMA )
1104 vm_entry_value |= VM_ENTRY_IA32E_MODE;
1105 else
1106 vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
1107 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1109 vmx_vmcs_exit(v);
1110 #endif
1112 if ( v == current )
1113 write_efer((read_efer() & ~(EFER_NX|EFER_SCE)) |
1114 (v->arch.hvm_vcpu.guest_efer & (EFER_NX|EFER_SCE)));
1117 static void vmx_flush_guest_tlbs(void)
1119 /* No tagged TLB support on VMX yet. The fact that we're in Xen
1120 * at all means any guest will have a clean TLB when it's next run,
1121 * because VMRESUME will flush it for us. */
1124 static void vmx_inject_exception(
1125 unsigned int trapnr, int errcode, unsigned long cr2)
1127 struct vcpu *curr = current;
1129 vmx_inject_hw_exception(curr, trapnr, errcode);
1131 if ( trapnr == TRAP_page_fault )
1132 curr->arch.hvm_vcpu.guest_cr[2] = cr2;
1134 if ( (trapnr == TRAP_debug) &&
1135 (guest_cpu_user_regs()->eflags & X86_EFLAGS_TF) )
1137 __restore_debug_registers(curr);
1138 write_debugreg(6, read_debugreg(6) | 0x4000);
1142 static int vmx_event_pending(struct vcpu *v)
1144 ASSERT(v == current);
1145 return (__vmread(VM_ENTRY_INTR_INFO) & INTR_INFO_VALID_MASK);
1148 static int vmx_do_pmu_interrupt(struct cpu_user_regs *regs)
1150 return vpmu_do_interrupt(regs);
1153 static struct hvm_function_table vmx_function_table = {
1154 .name = "VMX",
1155 .domain_initialise = vmx_domain_initialise,
1156 .domain_destroy = vmx_domain_destroy,
1157 .vcpu_initialise = vmx_vcpu_initialise,
1158 .vcpu_destroy = vmx_vcpu_destroy,
1159 .save_cpu_ctxt = vmx_save_vmcs_ctxt,
1160 .load_cpu_ctxt = vmx_load_vmcs_ctxt,
1161 .interrupt_blocked = vmx_interrupt_blocked,
1162 .guest_x86_mode = vmx_guest_x86_mode,
1163 .get_segment_base = vmx_get_segment_base,
1164 .get_segment_register = vmx_get_segment_register,
1165 .set_segment_register = vmx_set_segment_register,
1166 .update_host_cr3 = vmx_update_host_cr3,
1167 .update_guest_cr = vmx_update_guest_cr,
1168 .update_guest_efer = vmx_update_guest_efer,
1169 .flush_guest_tlbs = vmx_flush_guest_tlbs,
1170 .set_tsc_offset = vmx_set_tsc_offset,
1171 .inject_exception = vmx_inject_exception,
1172 .init_hypercall_page = vmx_init_hypercall_page,
1173 .event_pending = vmx_event_pending,
1174 .do_pmu_interrupt = vmx_do_pmu_interrupt,
1175 .cpu_up = vmx_cpu_up,
1176 .cpu_down = vmx_cpu_down,
1177 };
1179 void start_vmx(void)
1181 static int bootstrapped;
1183 vmx_save_host_msrs();
1185 if ( bootstrapped )
1187 if ( hvm_enabled && !vmx_cpu_up() )
1189 printk("VMX: FATAL: failed to initialise CPU%d!\n",
1190 smp_processor_id());
1191 BUG();
1193 return;
1196 bootstrapped = 1;
1198 /* Xen does not fill x86_capability words except 0. */
1199 boot_cpu_data.x86_capability[4] = cpuid_ecx(1);
1201 if ( !test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability) )
1202 return;
1204 set_in_cr4(X86_CR4_VMXE);
1206 if ( !vmx_cpu_up() )
1208 printk("VMX: failed to initialise.\n");
1209 return;
1212 setup_vmcs_dump();
1214 hvm_enable(&vmx_function_table);
1217 /*
1218 * Not all exit reasons provide a valid value in the VM-exit instruction length field.
1219 * Callers must know what they're doing!
1220 */
1221 static int __get_instruction_length(void)
1222 {
1223 int len;
1224 len = __vmread(VM_EXIT_INSTRUCTION_LEN); /* Safe: callers audited */
1225 BUG_ON((len < 1) || (len > 15));
1226 return len;
1227 }
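/*
 * Descriptive note (not in the original source): __update_guest_eip()
 * below does more than advance EIP past an emulated instruction -- it also
 * clears any STI/MOV-SS interrupt shadow recorded in the VMCS (the shadow
 * only covers the instruction just completed) and, if EFLAGS.TF is set,
 * injects #DB so guest single-stepping continues to work.
 */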
1229 static void __update_guest_eip(unsigned long inst_len)
1231 struct cpu_user_regs *regs = guest_cpu_user_regs();
1232 unsigned long x;
1234 regs->eip += inst_len;
1235 regs->eflags &= ~X86_EFLAGS_RF;
1237 x = __vmread(GUEST_INTERRUPTIBILITY_INFO);
1238 if ( x & (VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS) )
1240 x &= ~(VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS);
1241 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, x);
1244 if ( regs->eflags & X86_EFLAGS_TF )
1245 vmx_inject_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE, 0);
1248 void vmx_do_no_device_fault(void)
1250 struct vcpu *curr = current;
1252 vmx_fpu_enter(curr);
1254 /* Disable TS in guest CR0 unless the guest wants the exception too. */
1255 if ( !(curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
1257 curr->arch.hvm_vcpu.hw_cr[0] &= ~X86_CR0_TS;
1258 __vmwrite(GUEST_CR0, curr->arch.hvm_vcpu.hw_cr[0]);
1262 #define bitmaskof(idx) (1U << ((idx) & 31))
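/*
 * Illustrative example (assuming the usual cpufeature word/bit encoding,
 * e.g. X86_FEATURE_VMXE == 4*32 + 5): bitmaskof(X86_FEATURE_VMXE)
 * evaluates to 1U << (133 & 31) == 1U << 5, i.e. the VMX bit of
 * CPUID.1:ECX, which vmx_cpuid_intercept() below masks out so the guest
 * does not see nested-VMX capability.
 */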
1263 void vmx_cpuid_intercept(
1264 unsigned int *eax, unsigned int *ebx,
1265 unsigned int *ecx, unsigned int *edx)
1267 unsigned int input = *eax;
1268 unsigned int count = *ecx;
1270 #ifdef VMXASSIST
1271 if ( input == 0x40000003 )
1273 /*
1274 * NB. Unsupported interface for private use of VMXASSIST only.
1275 * Note that this leaf lives at <max-hypervisor-leaf> + 1.
1276 */
1277 u64 value = ((u64)*edx << 32) | (u32)*ecx;
1278 p2m_type_t p2mt;
1279 unsigned long mfn;
1280 struct vcpu *v = current;
1281 char *p;
1283 mfn = mfn_x(gfn_to_mfn_current(value >> PAGE_SHIFT, &p2mt));
1285 gdprintk(XENLOG_INFO, "Input address is 0x%"PRIx64".\n", value);
1287 /* 8-byte aligned valid pseudophys address from vmxassist, please. */
1288 if ( (value & 7) || !p2m_is_ram(p2mt) ||
1289 !v->arch.hvm_vmx.vmxassist_enabled )
1291 domain_crash(v->domain);
1292 return;
1294 ASSERT(mfn_valid(mfn));
1296 p = map_domain_page(mfn);
1297 value = *((uint64_t *)(p + (value & (PAGE_SIZE - 1))));
1298 unmap_domain_page(p);
1300 gdprintk(XENLOG_INFO, "Output value is 0x%"PRIx64".\n", value);
1301 *ecx = (u32)value;
1302 *edx = (u32)(value >> 32);
1303 return;
1305 #endif
1307 hvm_cpuid(input, eax, ebx, ecx, edx);
1309 switch ( input )
1311 case 0x00000001:
1312 *ecx &= ~VMX_VCPU_CPUID_L1_ECX_RESERVED;
1313 *ebx &= NUM_THREADS_RESET_MASK;
1314 *ecx &= ~(bitmaskof(X86_FEATURE_VMXE) |
1315 bitmaskof(X86_FEATURE_EST) |
1316 bitmaskof(X86_FEATURE_TM2) |
1317 bitmaskof(X86_FEATURE_CID) |
1318 bitmaskof(X86_FEATURE_PDCM) |
1319 bitmaskof(X86_FEATURE_DSCPL));
1320 *edx &= ~(bitmaskof(X86_FEATURE_HT) |
1321 bitmaskof(X86_FEATURE_ACPI) |
1322 bitmaskof(X86_FEATURE_ACC) |
1323 bitmaskof(X86_FEATURE_DS));
1324 break;
1326 case 0x00000004:
1327 cpuid_count(input, count, eax, ebx, ecx, edx);
1328 *eax &= NUM_CORES_RESET_MASK;
1329 break;
1331 case 0x00000006:
1332 case 0x00000009:
1333 *eax = *ebx = *ecx = *edx = 0;
1334 break;
1336 case 0x80000001:
1337 /* Only a few features are advertised in Intel's 0x80000001. */
1338 *ecx &= (bitmaskof(X86_FEATURE_LAHF_LM));
1339 *edx &= (bitmaskof(X86_FEATURE_NX) |
1340 bitmaskof(X86_FEATURE_LM) |
1341 bitmaskof(X86_FEATURE_SYSCALL));
1342 break;
1345 HVMTRACE_3D(CPUID, current, input,
1346 ((uint64_t)*eax << 32) | *ebx, ((uint64_t)*ecx << 32) | *edx);
1349 static void vmx_do_cpuid(struct cpu_user_regs *regs)
1351 unsigned int eax, ebx, ecx, edx;
1353 eax = regs->eax;
1354 ebx = regs->ebx;
1355 ecx = regs->ecx;
1356 edx = regs->edx;
1358 vmx_cpuid_intercept(&eax, &ebx, &ecx, &edx);
1360 regs->eax = eax;
1361 regs->ebx = ebx;
1362 regs->ecx = ecx;
1363 regs->edx = edx;
1366 #define CASE_GET_REG_P(REG, reg) \
1367 case REG_ ## REG: reg_p = (unsigned long *)&(regs->reg); break
1369 #ifdef __i386__
1370 #define CASE_EXTEND_GET_REG_P
1371 #else
1372 #define CASE_EXTEND_GET_REG_P \
1373 CASE_GET_REG_P(R8, r8); \
1374 CASE_GET_REG_P(R9, r9); \
1375 CASE_GET_REG_P(R10, r10); \
1376 CASE_GET_REG_P(R11, r11); \
1377 CASE_GET_REG_P(R12, r12); \
1378 CASE_GET_REG_P(R13, r13); \
1379 CASE_GET_REG_P(R14, r14); \
1380 CASE_GET_REG_P(R15, r15)
1381 #endif
1383 static void vmx_dr_access(unsigned long exit_qualification,
1384 struct cpu_user_regs *regs)
1386 struct vcpu *v = current;
1388 HVMTRACE_0D(DR_WRITE, v);
1390 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
1391 __restore_debug_registers(v);
1393 /* Allow guest direct access to DR registers */
1394 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MOV_DR_EXITING;
1395 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
1398 /*
1399 * Invalidate the TLB entry for va, and invalidate the shadow page
1400 * corresponding to the address va.
1401 */
1402 static void vmx_do_invlpg(unsigned long va)
1404 struct vcpu *v = current;
1406 HVMTRACE_2D(INVLPG, v, /*invlpga=*/ 0, va);
1408 /*
1409 * Do the safest thing first, then try to update the shadow by
1410 * copying from the guest.
1411 */
1412 paging_invlpg(v, va);
1415 /* Get segment for OUTS according to guest instruction. */
1416 static enum x86_segment vmx_outs_get_segment(
1417 int long_mode, unsigned long eip, int inst_len)
1419 unsigned char inst[MAX_INST_LEN];
1420 enum x86_segment seg = x86_seg_ds;
1421 int i;
1422 extern int inst_copy_from_guest(unsigned char *, unsigned long, int);
1424 if ( likely(cpu_has_vmx_ins_outs_instr_info) )
1426 unsigned int instr_info = __vmread(VMX_INSTRUCTION_INFO);
1428 /* Get segment register according to bits 17:15. */
1429 switch ( (instr_info >> 15) & 7 )
1431 case 0: seg = x86_seg_es; break;
1432 case 1: seg = x86_seg_cs; break;
1433 case 2: seg = x86_seg_ss; break;
1434 case 3: seg = x86_seg_ds; break;
1435 case 4: seg = x86_seg_fs; break;
1436 case 5: seg = x86_seg_gs; break;
1437 default: BUG();
1440 goto out;
1443 if ( !long_mode )
1444 eip += __vmread(GUEST_CS_BASE);
1446 memset(inst, 0, MAX_INST_LEN);
1447 if ( inst_copy_from_guest(inst, eip, inst_len) != inst_len )
1449 gdprintk(XENLOG_ERR, "Get guest instruction failed\n");
1450 domain_crash(current->domain);
1451 goto out;
1454 for ( i = 0; i < inst_len; i++ )
1456 switch ( inst[i] )
1458 case 0xf3: /* REPZ */
1459 case 0xf2: /* REPNZ */
1460 case 0xf0: /* LOCK */
1461 case 0x66: /* data32 */
1462 case 0x67: /* addr32 */
1463 #ifdef __x86_64__
1464 case 0x40 ... 0x4f: /* REX */
1465 #endif
1466 continue;
1467 case 0x2e: /* CS */
1468 seg = x86_seg_cs;
1469 continue;
1470 case 0x36: /* SS */
1471 seg = x86_seg_ss;
1472 continue;
1473 case 0x26: /* ES */
1474 seg = x86_seg_es;
1475 continue;
1476 case 0x64: /* FS */
1477 seg = x86_seg_fs;
1478 continue;
1479 case 0x65: /* GS */
1480 seg = x86_seg_gs;
1481 continue;
1482 case 0x3e: /* DS */
1483 seg = x86_seg_ds;
1484 continue;
1488 out:
1489 return seg;
1492 static int vmx_str_pio_check_descriptor(int long_mode, unsigned long eip,
1493 int inst_len, enum x86_segment seg,
1494 unsigned long *base, u32 *limit,
1495 u32 *ar_bytes)
1497 enum vmcs_field ar_field, base_field, limit_field;
1499 *base = 0;
1500 *limit = 0;
1501 if ( seg != x86_seg_es )
1502 seg = vmx_outs_get_segment(long_mode, eip, inst_len);
1504 switch ( seg )
1506 case x86_seg_cs:
1507 ar_field = GUEST_CS_AR_BYTES;
1508 base_field = GUEST_CS_BASE;
1509 limit_field = GUEST_CS_LIMIT;
1510 break;
1511 case x86_seg_ds:
1512 ar_field = GUEST_DS_AR_BYTES;
1513 base_field = GUEST_DS_BASE;
1514 limit_field = GUEST_DS_LIMIT;
1515 break;
1516 case x86_seg_es:
1517 ar_field = GUEST_ES_AR_BYTES;
1518 base_field = GUEST_ES_BASE;
1519 limit_field = GUEST_ES_LIMIT;
1520 break;
1521 case x86_seg_fs:
1522 ar_field = GUEST_FS_AR_BYTES;
1523 base_field = GUEST_FS_BASE;
1524 limit_field = GUEST_FS_LIMIT;
1525 break;
1526 case x86_seg_gs:
1527 ar_field = GUEST_GS_AR_BYTES;
1528 base_field = GUEST_GS_BASE;
1529 limit_field = GUEST_GS_LIMIT;
1530 break;
1531 case x86_seg_ss:
1532 ar_field = GUEST_SS_AR_BYTES;
1533 base_field = GUEST_SS_BASE;
1534 limit_field = GUEST_SS_LIMIT;
1535 break;
1536 default:
1537 BUG();
1538 return 0;
1541 if ( !long_mode || seg == x86_seg_fs || seg == x86_seg_gs )
1543 *base = __vmread(base_field);
1544 *limit = __vmread(limit_field);
1546 *ar_bytes = __vmread(ar_field);
1548 return !(*ar_bytes & X86_SEG_AR_SEG_UNUSABLE);
1552 static int vmx_str_pio_check_limit(u32 limit, unsigned int size,
1553 u32 ar_bytes, unsigned long addr,
1554 unsigned long base, int df,
1555 unsigned long *count)
1557 unsigned long ea = addr - base;
1559 /* Offset must be within limits. */
1560 ASSERT(ea == (u32)ea);
1561 if ( (u32)(ea + size - 1) < (u32)ea ||
1562 (ar_bytes & 0xc) != 0x4 ? ea + size - 1 > limit
1563 : ea <= limit )
1564 return 0;
1566 /* Check the limit for repeated instructions, as above we checked
1567 only the first instance. Truncate the count if a limit violation
1568 would occur. Note that the checking is not necessary for page
1569 granular segments as transfers crossing page boundaries will be
1570 broken up anyway. */
1571 if ( !(ar_bytes & X86_SEG_AR_GRANULARITY) && *count > 1 )
1573 if ( (ar_bytes & 0xc) != 0x4 )
1575 /* expand-up */
1576 if ( !df )
1578 if ( ea + *count * size - 1 < ea ||
1579 ea + *count * size - 1 > limit )
1580 *count = (limit + 1UL - ea) / size;
1582 else
1584 if ( *count - 1 > ea / size )
1585 *count = ea / size + 1;
1588 else
1590 /* expand-down */
1591 if ( !df )
1593 if ( *count - 1 > -(s32)ea / size )
1594 *count = -(s32)ea / size + 1UL;
1596 else
1598 if ( ea < (*count - 1) * size ||
1599 ea - (*count - 1) * size <= limit )
1600 *count = (ea - limit - 1) / size + 1;
1603 ASSERT(*count);
1606 return 1;
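/*
 * Worked example for the count truncation above (illustrative only):
 * expand-up, byte-granular segment with limit = 0xffff, ea = 0xfff0,
 * size = 2, df = 0, count = 100.  ea + count*size - 1 = 0x100b7 exceeds
 * the limit, so count is truncated to (limit + 1 - ea) / size =
 * 0x10 / 2 = 8 transfers before the limit would be violated.
 */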
1609 #ifdef __x86_64__
1610 static int vmx_str_pio_lm_check_limit(struct cpu_user_regs *regs,
1611 unsigned int size,
1612 unsigned long addr,
1613 unsigned long *count)
1615 if ( !is_canonical_address(addr) ||
1616 !is_canonical_address(addr + size - 1) )
1617 return 0;
1619 if ( *count > (1UL << 48) / size )
1620 *count = (1UL << 48) / size;
1622 if ( !(regs->eflags & EF_DF) )
1624 if ( addr + *count * size - 1 < addr ||
1625 !is_canonical_address(addr + *count * size - 1) )
1626 *count = (addr & ~((1UL << 48) - 1)) / size;
1628 else
1630 if ( (*count - 1) * size > addr ||
1631 !is_canonical_address(addr + (*count - 1) * size) )
1632 *count = (addr & ~((1UL << 48) - 1)) / size + 1;
1635 ASSERT(*count);
1637 return 1;
1639 #endif
1641 static void vmx_send_str_pio(struct cpu_user_regs *regs,
1642 struct hvm_io_op *pio_opp,
1643 unsigned long inst_len, unsigned int port,
1644 int sign, unsigned int size, int dir,
1645 int df, unsigned long addr,
1646 paddr_t paddr, unsigned long count)
1648 /*
1649 * Handle string pio instructions that cross pages or that
1650 * are unaligned. See the comments in hvm_domain.c/handle_mmio()
1651 */
1652 if ( (addr & PAGE_MASK) != ((addr + size - 1) & PAGE_MASK) ) {
1653 unsigned long value = 0;
1655 pio_opp->flags |= OVERLAP;
1657 if ( dir == IOREQ_WRITE ) /* OUTS */
1659 if ( hvm_paging_enabled(current) )
1661 int rv = hvm_copy_from_guest_virt(&value, addr, size);
1662 if ( rv == HVMCOPY_bad_gva_to_gfn )
1663 return; /* exception already injected */
1665 else
1666 (void)hvm_copy_from_guest_phys(&value, addr, size);
1668 else /* dir != IOREQ_WRITE */
1669 /* Remember where to write the result, as a *VA*.
1670 * Must be a VA so we can handle the page overlap
1671 * correctly in hvm_pio_assist() */
1672 pio_opp->addr = addr;
1674 if ( count == 1 )
1675 regs->eip += inst_len;
1677 send_pio_req(port, 1, size, value, dir, df, 0);
1678 } else {
1679 unsigned long last_addr = sign > 0 ? addr + count * size - 1
1680 : addr - (count - 1) * size;
1682 if ( (addr & PAGE_MASK) != (last_addr & PAGE_MASK) )
1684 if ( sign > 0 )
1685 count = (PAGE_SIZE - (addr & ~PAGE_MASK)) / size;
1686 else
1687 count = (addr & ~PAGE_MASK) / size + 1;
1688 } else
1689 regs->eip += inst_len;
1691 send_pio_req(port, count, size, paddr, dir, df, 1);
1695 static void vmx_do_str_pio(unsigned long exit_qualification,
1696 unsigned long inst_len,
1697 struct cpu_user_regs *regs,
1698 struct hvm_io_op *pio_opp)
1700 unsigned int port, size;
1701 int dir, df, vm86;
1702 unsigned long addr, count = 1, base;
1703 paddr_t paddr;
1704 unsigned long gfn;
1705 u32 ar_bytes, limit, pfec;
1706 int sign;
1707 int long_mode = 0;
1709 vm86 = regs->eflags & X86_EFLAGS_VM ? 1 : 0;
1710 df = regs->eflags & X86_EFLAGS_DF ? 1 : 0;
1712 if ( test_bit(6, &exit_qualification) )
1713 port = (exit_qualification >> 16) & 0xFFFF;
1714 else
1715 port = regs->edx & 0xffff;
1717 size = (exit_qualification & 7) + 1;
1718 dir = test_bit(3, &exit_qualification); /* direction */
1720 if ( dir == IOREQ_READ )
1721 HVMTRACE_2D(IO_READ, current, port, size);
1722 else
1723 HVMTRACE_2D(IO_WRITE, current, port, size);
1725 sign = regs->eflags & X86_EFLAGS_DF ? -1 : 1;
1726 ar_bytes = __vmread(GUEST_CS_AR_BYTES);
1727 if ( hvm_long_mode_enabled(current) &&
1728 (ar_bytes & X86_SEG_AR_CS_LM_ACTIVE) )
1729 long_mode = 1;
1730 addr = __vmread(GUEST_LINEAR_ADDRESS);
1732 if ( test_bit(5, &exit_qualification) ) { /* "rep" prefix */
1733 pio_opp->flags |= REPZ;
1734 count = regs->ecx;
1735 if ( !long_mode &&
1736 (vm86 || !(ar_bytes & X86_SEG_AR_DEF_OP_SIZE)) )
1737 count &= 0xFFFF;
1740 /*
1741 * In protected mode, guest linear address is invalid if the
1742 * selector is null.
1743 */
1744 if ( !vmx_str_pio_check_descriptor(long_mode, regs->eip, inst_len,
1745 dir==IOREQ_WRITE ? x86_seg_ds :
1746 x86_seg_es, &base, &limit,
1747 &ar_bytes) ) {
1748 if ( !long_mode ) {
1749 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1750 return;
1752 addr = dir == IOREQ_WRITE ? base + regs->esi : regs->edi;
1755 if ( !long_mode )
1757 /* Segment must be readable for outs and writeable for ins. */
1758 if ( ((dir == IOREQ_WRITE)
1759 ? ((ar_bytes & 0xa) == 0x8)
1760 : ((ar_bytes & 0xa) != 0x2)) ||
1761 !vmx_str_pio_check_limit(limit, size, ar_bytes,
1762 addr, base, df, &count) )
1764 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1765 return;
1768 #ifdef __x86_64__
1769 else if ( !vmx_str_pio_lm_check_limit(regs, size, addr, &count) )
1771 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1772 return;
1774 #endif
1776 /* Translate the address to a physical address */
1777 pfec = PFEC_page_present;
1778 if ( dir == IOREQ_READ ) /* Read from PIO --> write to RAM */
1779 pfec |= PFEC_write_access;
1780 if ( ((__vmread(GUEST_SS_AR_BYTES) >> 5) & 3) == 3 )
1781 pfec |= PFEC_user_mode;
1782 gfn = paging_gva_to_gfn(current, addr, &pfec);
1783 if ( gfn == INVALID_GFN )
1785 /* The guest does not have the RAM address mapped;
1786 * inject a page fault. */
1787 vmx_inject_exception(TRAP_page_fault, pfec, addr);
1788 return;
1790 paddr = (paddr_t)gfn << PAGE_SHIFT | (addr & ~PAGE_MASK);
1792 vmx_send_str_pio(regs, pio_opp, inst_len, port, sign,
1793 size, dir, df, addr, paddr, count);
1796 static void vmx_io_instruction(unsigned long exit_qualification,
1797 unsigned long inst_len)
1799 struct cpu_user_regs *regs;
1800 struct hvm_io_op *pio_opp;
1802 pio_opp = &current->arch.hvm_vcpu.io_op;
1803 pio_opp->instr = INSTR_PIO;
1804 pio_opp->flags = 0;
1806 regs = &pio_opp->io_context;
1808 /* Copy current guest state into io instruction state structure. */
1809 memcpy(regs, guest_cpu_user_regs(), HVM_CONTEXT_STACK_BYTES);
1811 HVM_DBG_LOG(DBG_LEVEL_IO, "vm86 %d, eip=%x:%lx, "
1812 "exit_qualification = %lx",
1813 regs->eflags & X86_EFLAGS_VM ? 1 : 0,
1814 regs->cs, (unsigned long)regs->eip, exit_qualification);
1816 if ( test_bit(4, &exit_qualification) ) /* string instruction */
1817 vmx_do_str_pio(exit_qualification, inst_len, regs, pio_opp);
1818 else
1820 unsigned int port, size;
1821 int dir, df;
1823 df = regs->eflags & X86_EFLAGS_DF ? 1 : 0;
1825 if ( test_bit(6, &exit_qualification) )
1826 port = (exit_qualification >> 16) & 0xFFFF;
1827 else
1828 port = regs->edx & 0xffff;
1830 size = (exit_qualification & 7) + 1;
1831 dir = test_bit(3, &exit_qualification); /* direction */
1833 if ( dir == IOREQ_READ )
1834 HVMTRACE_2D(IO_READ, current, port, size);
1835 else
1836 HVMTRACE_3D(IO_WRITE, current, port, size, regs->eax);
1838 if ( port == 0xe9 && dir == IOREQ_WRITE && size == 1 )
1839 hvm_print_line(current, regs->eax); /* guest debug output */
1841 regs->eip += inst_len;
1842 send_pio_req(port, 1, size, regs->eax, dir, df, 0);
1846 #ifdef VMXASSIST
1848 static void vmx_world_save(struct vcpu *v, struct vmx_assist_context *c)
1850 struct cpu_user_regs *regs = guest_cpu_user_regs();
1852 c->eip = regs->eip;
1853 c->eip += __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
1854 c->esp = regs->esp;
1855 c->eflags = regs->eflags & ~X86_EFLAGS_RF;
1857 c->cr0 = v->arch.hvm_vcpu.guest_cr[0];
1858 c->cr3 = v->arch.hvm_vcpu.guest_cr[3];
1859 c->cr4 = v->arch.hvm_vcpu.guest_cr[4];
1861 c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
1862 c->idtr_base = __vmread(GUEST_IDTR_BASE);
1864 c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
1865 c->gdtr_base = __vmread(GUEST_GDTR_BASE);
1867 c->cs_sel = __vmread(GUEST_CS_SELECTOR);
1868 c->cs_limit = __vmread(GUEST_CS_LIMIT);
1869 c->cs_base = __vmread(GUEST_CS_BASE);
1870 c->cs_arbytes.bytes = __vmread(GUEST_CS_AR_BYTES);
1872 c->ds_sel = __vmread(GUEST_DS_SELECTOR);
1873 c->ds_limit = __vmread(GUEST_DS_LIMIT);
1874 c->ds_base = __vmread(GUEST_DS_BASE);
1875 c->ds_arbytes.bytes = __vmread(GUEST_DS_AR_BYTES);
1877 c->es_sel = __vmread(GUEST_ES_SELECTOR);
1878 c->es_limit = __vmread(GUEST_ES_LIMIT);
1879 c->es_base = __vmread(GUEST_ES_BASE);
1880 c->es_arbytes.bytes = __vmread(GUEST_ES_AR_BYTES);
1882 c->ss_sel = __vmread(GUEST_SS_SELECTOR);
1883 c->ss_limit = __vmread(GUEST_SS_LIMIT);
1884 c->ss_base = __vmread(GUEST_SS_BASE);
1885 c->ss_arbytes.bytes = __vmread(GUEST_SS_AR_BYTES);
1887 c->fs_sel = __vmread(GUEST_FS_SELECTOR);
1888 c->fs_limit = __vmread(GUEST_FS_LIMIT);
1889 c->fs_base = __vmread(GUEST_FS_BASE);
1890 c->fs_arbytes.bytes = __vmread(GUEST_FS_AR_BYTES);
1892 c->gs_sel = __vmread(GUEST_GS_SELECTOR);
1893 c->gs_limit = __vmread(GUEST_GS_LIMIT);
1894 c->gs_base = __vmread(GUEST_GS_BASE);
1895 c->gs_arbytes.bytes = __vmread(GUEST_GS_AR_BYTES);
1897 c->tr_sel = __vmread(GUEST_TR_SELECTOR);
1898 c->tr_limit = __vmread(GUEST_TR_LIMIT);
1899 c->tr_base = __vmread(GUEST_TR_BASE);
1900 c->tr_arbytes.bytes = __vmread(GUEST_TR_AR_BYTES);
1902 c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR);
1903 c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT);
1904 c->ldtr_base = __vmread(GUEST_LDTR_BASE);
1905 c->ldtr_arbytes.bytes = __vmread(GUEST_LDTR_AR_BYTES);
1908 static int vmx_world_restore(struct vcpu *v, struct vmx_assist_context *c)
1910 struct cpu_user_regs *regs = guest_cpu_user_regs();
1911 int rc;
1913 rc = vmx_restore_cr0_cr3(v, c->cr0, c->cr3);
1914 if ( rc )
1915 return rc;
1917 regs->eip = c->eip;
1918 regs->esp = c->esp;
1919 regs->eflags = c->eflags | 2;
1921 v->arch.hvm_vcpu.guest_cr[4] = c->cr4;
1922 vmx_update_guest_cr(v, 0);
1923 vmx_update_guest_cr(v, 4);
1925 __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
1926 __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
1928 __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
1929 __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
1931 __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
1932 __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
1933 __vmwrite(GUEST_CS_BASE, c->cs_base);
1934 __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes.bytes);
1936 __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
1937 __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
1938 __vmwrite(GUEST_DS_BASE, c->ds_base);
1939 __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes.bytes);
1941 __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
1942 __vmwrite(GUEST_ES_LIMIT, c->es_limit);
1943 __vmwrite(GUEST_ES_BASE, c->es_base);
1944 __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes.bytes);
1946 __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
1947 __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
1948 __vmwrite(GUEST_SS_BASE, c->ss_base);
1949 __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes.bytes);
1951 __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
1952 __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
1953 __vmwrite(GUEST_FS_BASE, c->fs_base);
1954 __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes.bytes);
1956 __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
1957 __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
1958 __vmwrite(GUEST_GS_BASE, c->gs_base);
1959 __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes.bytes);
1961 __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
1962 __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
1963 __vmwrite(GUEST_TR_BASE, c->tr_base);
1964 __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes.bytes);
1966 __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
1967 __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
1968 __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
1969 __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes.bytes);
1971 paging_update_paging_modes(v);
1972 return 0;
1975 enum { VMX_ASSIST_INVOKE = 0, VMX_ASSIST_RESTORE };
1977 static int vmx_assist(struct vcpu *v, int mode)
1979 struct vmx_assist_context c;
1980 struct hvm_hw_vpic *vpic = v->domain->arch.hvm_domain.vpic;
1981 u32 magic, cp;
1983 if ( hvm_copy_from_guest_phys(&magic, VMXASSIST_MAGIC_OFFSET,
1984 sizeof(magic)) )
1986 gdprintk(XENLOG_ERR, "No vmxassist: can't execute real mode code\n");
1987 domain_crash(v->domain);
1988 return 0;
1991 if ( magic != VMXASSIST_MAGIC )
1993 gdprintk(XENLOG_ERR, "vmxassist magic number not match\n");
1994 domain_crash(v->domain);
1995 return 0;
1998 switch ( mode ) {
1999 /*
2000 * Transfer control to vmxassist.
2001 * Store the current context in VMXASSIST_OLD_CONTEXT and load
2002 * the new VMXASSIST_NEW_CONTEXT context. This context was created
2003 * by vmxassist and will transfer control to it.
2004 */
2005 case VMX_ASSIST_INVOKE:
2006 /* save the old context */
2007 if ( hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)) )
2008 goto error;
2009 if ( cp != 0 ) {
2010 vmx_world_save(v, &c);
2011 if ( hvm_copy_to_guest_phys(cp, &c, sizeof(c)) )
2012 goto error;
2015 /* restore the new context, this should activate vmxassist */
2016 if ( hvm_copy_from_guest_phys(&cp, VMXASSIST_NEW_CONTEXT, sizeof(cp)) )
2017 goto error;
2018 if ( cp != 0 ) {
2019 if ( hvm_copy_from_guest_phys(&c, cp, sizeof(c)) )
2020 goto error;
2021 if ( vmx_world_restore(v, &c) != 0 )
2022 goto error;
2023 v->arch.hvm_vmx.pm_irqbase[0] = vpic[0].irq_base;
2024 v->arch.hvm_vmx.pm_irqbase[1] = vpic[1].irq_base;
2025 vpic[0].irq_base = NR_EXCEPTION_HANDLER;
2026 vpic[1].irq_base = NR_EXCEPTION_HANDLER + 8;
2027 v->arch.hvm_vmx.vmxassist_enabled = 1;
2028 return 1;
2030 break;
2032 /*
2033 * Restore the VMXASSIST_OLD_CONTEXT that was saved by
2034 * VMX_ASSIST_INVOKE above.
2035 */
2036 case VMX_ASSIST_RESTORE:
2037 /* Restore the previously saved (old) context. */
2038 if ( hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)) )
2039 goto error;
2040 if ( cp != 0 ) {
2041 if ( hvm_copy_from_guest_phys(&c, cp, sizeof(c)) )
2042 goto error;
2043 if ( vmx_world_restore(v, &c) != 0 )
2044 goto error;
2045 if ( v->arch.hvm_vmx.irqbase_mode ) {
2046 vpic[0].irq_base = c.rm_irqbase[0] & 0xf8;
2047 vpic[1].irq_base = c.rm_irqbase[1] & 0xf8;
2048 } else {
2049 vpic[0].irq_base = v->arch.hvm_vmx.pm_irqbase[0];
2050 vpic[1].irq_base = v->arch.hvm_vmx.pm_irqbase[1];
2052 v->arch.hvm_vmx.vmxassist_enabled = 0;
2053 return 1;
2055 break;
2058 error:
2059 gdprintk(XENLOG_ERR, "Failed to transfer to vmxassist\n");
2060 domain_crash(v->domain);
2061 return 0;
2064 static int vmx_set_cr0(unsigned long value)
2066 struct vcpu *v = current;
2068 if ( hvm_set_cr0(value) == 0 )
2069 return 0;
2071 /*
2072 * VMX does not implement real-mode virtualization. We emulate
2073 * real-mode by performing a world switch to VMXAssist whenever
2074 * a partition disables the CR0.PE bit.
2075 */
2076 if ( !(value & X86_CR0_PE) )
2078 if ( vmx_assist(v, VMX_ASSIST_INVOKE) )
2079 return 0; /* do not update eip! */
2081 else if ( v->arch.hvm_vmx.vmxassist_enabled )
2083 if ( vmx_assist(v, VMX_ASSIST_RESTORE) )
2084 return 0; /* do not update eip! */
2087 return 1;
2090 #else /* !defined(VMXASSIST) */
2092 #define vmx_set_cr0(v) hvm_set_cr0(v)
2094 #endif
2096 #define CASE_SET_REG(REG, reg) \
2097 case REG_ ## REG: regs->reg = value; break
2098 #define CASE_GET_REG(REG, reg) \
2099 case REG_ ## REG: value = regs->reg; break
2101 #define CASE_EXTEND_SET_REG \
2102 CASE_EXTEND_REG(S)
2103 #define CASE_EXTEND_GET_REG \
2104 CASE_EXTEND_REG(G)
2106 #ifdef __i386__
2107 #define CASE_EXTEND_REG(T)
2108 #else
2109 #define CASE_EXTEND_REG(T) \
2110 CASE_ ## T ## ET_REG(R8, r8); \
2111 CASE_ ## T ## ET_REG(R9, r9); \
2112 CASE_ ## T ## ET_REG(R10, r10); \
2113 CASE_ ## T ## ET_REG(R11, r11); \
2114 CASE_ ## T ## ET_REG(R12, r12); \
2115 CASE_ ## T ## ET_REG(R13, r13); \
2116 CASE_ ## T ## ET_REG(R14, r14); \
2117 CASE_ ## T ## ET_REG(R15, r15)
2118 #endif
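/*
 * For illustration, CASE_GET_REG(EAX, eax) expands to
 *     case REG_EAX: value = regs->eax; break;
 * and CASE_SET_REG(EAX, eax) to
 *     case REG_EAX: regs->eax = value; break;
 * so mov_to_cr() and mov_from_cr() below simply switch on the decoded
 * general-purpose register operand.  CASE_EXTEND_*_REG adds the 64-bit
 * registers r8-r15 and expands to nothing on i386.  mov_to_cr() returns
 * nonzero when EIP should be advanced past the instruction.
 */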
2120 static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
2122 unsigned long value;
2123 struct vcpu *v = current;
2124 struct vlapic *vlapic = vcpu_vlapic(v);
2126 switch ( gp )
2128 CASE_GET_REG(EAX, eax);
2129 CASE_GET_REG(ECX, ecx);
2130 CASE_GET_REG(EDX, edx);
2131 CASE_GET_REG(EBX, ebx);
2132 CASE_GET_REG(EBP, ebp);
2133 CASE_GET_REG(ESI, esi);
2134 CASE_GET_REG(EDI, edi);
2135 CASE_GET_REG(ESP, esp);
2136 CASE_EXTEND_GET_REG;
2137 default:
2138 gdprintk(XENLOG_ERR, "invalid gp: %d\n", gp);
2139 goto exit_and_crash;
2142 HVMTRACE_2D(CR_WRITE, v, cr, value);
2144 HVM_DBG_LOG(DBG_LEVEL_1, "CR%d, value = %lx", cr, value);
2146 switch ( cr )
2148 case 0:
2149 return vmx_set_cr0(value);
2151 case 3:
2152 return hvm_set_cr3(value);
2154 case 4:
2155 return hvm_set_cr4(value);
2157 case 8:
2158 vlapic_set_reg(vlapic, APIC_TASKPRI, ((value & 0x0F) << 4));
2159 break;
2161 default:
2162 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
2163 goto exit_and_crash;
2166 return 1;
2168 exit_and_crash:
2169 domain_crash(v->domain);
2170 return 0;
2173 /*
2174 * Read from control registers. Only CR3 and CR8 reads vmexit; the guest reads CR0 and CR4 from the VMCS read shadows without exiting.
2175 */
2176 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
2178 unsigned long value = 0;
2179 struct vcpu *v = current;
2180 struct vlapic *vlapic = vcpu_vlapic(v);
2182 switch ( cr )
2184 case 3:
2185 value = (unsigned long)v->arch.hvm_vcpu.guest_cr[3];
2186 break;
2187 case 8:
2188 value = (unsigned long)vlapic_get_reg(vlapic, APIC_TASKPRI);
2189 value = (value & 0xF0) >> 4;
2190 break;
2191 default:
2192 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
2193 domain_crash(v->domain);
2194 break;
2197 switch ( gp ) {
2198 CASE_SET_REG(EAX, eax);
2199 CASE_SET_REG(ECX, ecx);
2200 CASE_SET_REG(EDX, edx);
2201 CASE_SET_REG(EBX, ebx);
2202 CASE_SET_REG(EBP, ebp);
2203 CASE_SET_REG(ESI, esi);
2204 CASE_SET_REG(EDI, edi);
2205 CASE_SET_REG(ESP, esp);
2206 CASE_EXTEND_SET_REG;
2207 default:
2208 printk("invalid gp: %d\n", gp);
2209 domain_crash(v->domain);
2210 break;
2213 HVMTRACE_2D(CR_READ, v, cr, value);
2215 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR%d, value = %lx", cr, value);
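/*
 * vmx_cr_access() decodes the exit qualification of a CR-access vmexit:
 * CONTROL_REG_ACCESS_TYPE selects MOV-to-CR, MOV-from-CR, CLTS or LMSW,
 * while CONTROL_REG_ACCESS_NUM and CONTROL_REG_ACCESS_REG identify the
 * control register and the general-purpose register operand.  For LMSW
 * the source operand sits in the upper half of the qualification and
 * only its low four bits (PE/MP/EM/TS) are merged into CR0.  A nonzero
 * return tells the caller to advance EIP past the instruction.
 */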
2218 static int vmx_cr_access(unsigned long exit_qualification,
2219 struct cpu_user_regs *regs)
2221 unsigned int gp, cr;
2222 unsigned long value;
2223 struct vcpu *v = current;
2225 switch ( exit_qualification & CONTROL_REG_ACCESS_TYPE )
2227 case TYPE_MOV_TO_CR:
2228 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
2229 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
2230 return mov_to_cr(gp, cr, regs);
2231 case TYPE_MOV_FROM_CR:
2232 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
2233 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
2234 mov_from_cr(cr, gp, regs);
2235 break;
2236 case TYPE_CLTS:
2237 v->arch.hvm_vcpu.guest_cr[0] &= ~X86_CR0_TS;
2238 vmx_update_guest_cr(v, 0);
2239 HVMTRACE_0D(CLTS, current);
2240 break;
2241 case TYPE_LMSW:
2242 value = v->arch.hvm_vcpu.guest_cr[0];
2243 value = (value & ~0xF) |
2244 (((exit_qualification & LMSW_SOURCE_DATA) >> 16) & 0xF);
2245 HVMTRACE_1D(LMSW, current, value);
2246 return vmx_set_cr0(value);
2247 default:
2248 BUG();
2251 return 1;
2254 static const struct lbr_info {
2255 u32 base, count;
2256 } p4_lbr[] = {
2257 { MSR_P4_LER_FROM_LIP, 1 },
2258 { MSR_P4_LER_TO_LIP, 1 },
2259 { MSR_P4_LASTBRANCH_TOS, 1 },
2260 { MSR_P4_LASTBRANCH_0_FROM_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO },
2261 { MSR_P4_LASTBRANCH_0_TO_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO },
2262 { 0, 0 }
2263 }, c2_lbr[] = {
2264 { MSR_IA32_LASTINTFROMIP, 1 },
2265 { MSR_IA32_LASTINTTOIP, 1 },
2266 { MSR_C2_LASTBRANCH_TOS, 1 },
2267 { MSR_C2_LASTBRANCH_0_FROM_IP, NUM_MSR_C2_LASTBRANCH_FROM_TO },
2268 { MSR_C2_LASTBRANCH_0_TO_IP, NUM_MSR_C2_LASTBRANCH_FROM_TO },
2269 { 0, 0 }
2270 #ifdef __i386__
2271 }, pm_lbr[] = {
2272 { MSR_IA32_LASTINTFROMIP, 1 },
2273 { MSR_IA32_LASTINTTOIP, 1 },
2274 { MSR_PM_LASTBRANCH_TOS, 1 },
2275 { MSR_PM_LASTBRANCH_0, NUM_MSR_PM_LASTBRANCH },
2276 { 0, 0 }
2277 #endif
2278 };
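/*
 * The tables above describe the last-branch-record MSR blocks as
 * (base, count) pairs terminated by {0, 0}.  last_branch_msr_get()
 * selects the table matching the host CPU family/model (Pentium M and
 * Core on i386, Core 2, Pentium 4/Xeon), or returns NULL when the
 * CPU's LBR layout is unknown, in which case guest DEBUGCTL writes
 * enabling LBR are silently ignored.
 */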
2280 static const struct lbr_info *last_branch_msr_get(void)
2282 switch ( boot_cpu_data.x86 )
2284 case 6:
2285 switch ( boot_cpu_data.x86_model )
2287 #ifdef __i386__
2288 /* PentiumM */
2289 case 9: case 13:
2290 /* Core Solo/Duo */
2291 case 14:
2292 return pm_lbr;
2293 break;
2294 #endif
2295 /* Core2 Duo */
2296 case 15:
2297 return c2_lbr;
2298 break;
2300 break;
2302 case 15:
2303 switch ( boot_cpu_data.x86_model )
2305 /* Pentium4/Xeon with em64t */
2306 case 3: case 4: case 6:
2307 return p4_lbr;
2308 break;
2310 break;
2313 return NULL;
2316 static int is_last_branch_msr(u32 ecx)
2318 const struct lbr_info *lbr = last_branch_msr_get();
2320 if ( lbr == NULL )
2321 return 0;
2323 for ( ; lbr->count; lbr++ )
2324 if ( (ecx >= lbr->base) && (ecx < (lbr->base + lbr->count)) )
2325 return 1;
2327 return 0;
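/*
 * RDMSR intercept.  Returns 1 when the read has been handled and the
 * result placed in EAX:EDX (the caller then advances EIP), or 0 after
 * injecting #GP.  MSRs without an explicit case are tried, in order,
 * against: the vPMU, the long-mode MSR code, the per-vcpu guest MSR
 * area, the last-branch-record MSRs (which read as zero when not
 * loaded), Xen's hypervisor MSRs, and finally rdmsr_safe() on the
 * host; if all of these fail, #GP is raised.
 */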
2330 int vmx_msr_read_intercept(struct cpu_user_regs *regs)
2332 u64 msr_content = 0;
2333 u32 ecx = regs->ecx, eax, edx;
2334 struct vcpu *v = current;
2335 int index;
2336 u64 *var_range_base = (u64*)v->arch.hvm_vcpu.mtrr.var_ranges;
2337 u64 *fixed_range_base = (u64*)v->arch.hvm_vcpu.mtrr.fixed_ranges;
2339 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x", ecx);
2341 switch ( ecx )
2343 case MSR_IA32_TSC:
2344 msr_content = hvm_get_guest_time(v);
2345 break;
2346 case MSR_IA32_SYSENTER_CS:
2347 msr_content = (u32)__vmread(GUEST_SYSENTER_CS);
2348 break;
2349 case MSR_IA32_SYSENTER_ESP:
2350 msr_content = __vmread(GUEST_SYSENTER_ESP);
2351 break;
2352 case MSR_IA32_SYSENTER_EIP:
2353 msr_content = __vmread(GUEST_SYSENTER_EIP);
2354 break;
2355 case MSR_IA32_APICBASE:
2356 msr_content = vcpu_vlapic(v)->hw.apic_base_msr;
2357 break;
2358 case MSR_IA32_CR_PAT:
2359 msr_content = v->arch.hvm_vcpu.pat_cr;
2360 break;
2361 case MSR_MTRRcap:
2362 msr_content = v->arch.hvm_vcpu.mtrr.mtrr_cap;
2363 break;
2364 case MSR_MTRRdefType:
2365 msr_content = v->arch.hvm_vcpu.mtrr.def_type
2366 | (v->arch.hvm_vcpu.mtrr.enabled << 10);
2367 break;
2368 case MSR_MTRRfix64K_00000:
2369 msr_content = fixed_range_base[0];
2370 break;
2371 case MSR_MTRRfix16K_80000:
2372 case MSR_MTRRfix16K_A0000:
2373 index = regs->ecx - MSR_MTRRfix16K_80000;
2374 msr_content = fixed_range_base[index + 1];
2375 break;
2376 case MSR_MTRRfix4K_C0000...MSR_MTRRfix4K_F8000:
2377 index = regs->ecx - MSR_MTRRfix4K_C0000;
2378 msr_content = fixed_range_base[index + 3];
2379 break;
2380 case MSR_IA32_MTRR_PHYSBASE0...MSR_IA32_MTRR_PHYSMASK7:
2381 index = regs->ecx - MSR_IA32_MTRR_PHYSBASE0;
2382 msr_content = var_range_base[index];
2383 break;
2384 case MSR_IA32_DEBUGCTLMSR:
2385 if ( vmx_read_guest_msr(v, ecx, &msr_content) != 0 )
2386 msr_content = 0;
2387 break;
2388 case MSR_IA32_VMX_BASIC...MSR_IA32_VMX_PROCBASED_CTLS2:
2389 goto gp_fault;
2390 case MSR_IA32_MCG_CAP:
2391 case MSR_IA32_MCG_STATUS:
2392 case MSR_IA32_MC0_STATUS:
2393 case MSR_IA32_MC1_STATUS:
2394 case MSR_IA32_MC2_STATUS:
2395 case MSR_IA32_MC3_STATUS:
2396 case MSR_IA32_MC4_STATUS:
2397 case MSR_IA32_MC5_STATUS:
2398 /* No point in letting the guest see real MCEs */
2399 msr_content = 0;
2400 break;
2401 case MSR_IA32_MISC_ENABLE:
2402 rdmsrl(MSR_IA32_MISC_ENABLE, msr_content);
2403 /* Debug Store (BTS and PEBS) is not supported. */
2404 msr_content |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
2405 MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL;
2406 break;
2407 default:
2408 if ( vpmu_do_rdmsr(regs) )
2409 goto done;
2410 switch ( long_mode_do_msr_read(regs) )
2412 case HNDL_unhandled:
2413 break;
2414 case HNDL_exception_raised:
2415 return 0;
2416 case HNDL_done:
2417 goto done;
2420 if ( vmx_read_guest_msr(v, ecx, &msr_content) == 0 )
2421 break;
2423 if ( is_last_branch_msr(ecx) )
2425 msr_content = 0;
2426 break;
2429 if ( rdmsr_hypervisor_regs(ecx, &eax, &edx) ||
2430 rdmsr_safe(ecx, eax, edx) == 0 )
2432 regs->eax = eax;
2433 regs->edx = edx;
2434 goto done;
2437 goto gp_fault;
2440 regs->eax = msr_content & 0xFFFFFFFF;
2441 regs->edx = msr_content >> 32;
2443 done:
2444 hvmtrace_msr_read(v, ecx, msr_content);
2445 HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx",
2446 ecx, (unsigned long)regs->eax,
2447 (unsigned long)regs->edx);
2448 return 1;
2450 gp_fault:
2451 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2452 return 0;
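/*
 * APIC access page setup.  When the CPU supports "virtualize APIC
 * accesses", a Xen-owned page is shared with the guest and mapped at
 * APIC_DEFAULT_PHYS_BASE in the p2m; guest accesses to that page then
 * cause APIC_ACCESS vmexits, which the exit handler forwards to
 * handle_mmio() with the faulting offset.
 */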
2455 static int vmx_alloc_vlapic_mapping(struct domain *d)
2457 void *apic_va;
2459 if ( !cpu_has_vmx_virtualize_apic_accesses )
2460 return 0;
2462 apic_va = alloc_xenheap_page();
2463 if ( apic_va == NULL )
2464 return -ENOMEM;
2465 share_xen_page_with_guest(virt_to_page(apic_va), d, XENSHARE_writable);
2466 set_mmio_p2m_entry(
2467 d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), _mfn(virt_to_mfn(apic_va)));
2468 d->arch.hvm_domain.vmx_apic_access_mfn = virt_to_mfn(apic_va);
2470 return 0;
2473 static void vmx_free_vlapic_mapping(struct domain *d)
2475 unsigned long mfn = d->arch.hvm_domain.vmx_apic_access_mfn;
2476 if ( mfn != 0 )
2477 free_xenheap_page(mfn_to_virt(mfn));
2480 static void vmx_install_vlapic_mapping(struct vcpu *v)
2482 paddr_t virt_page_ma, apic_page_ma;
2484 if ( !cpu_has_vmx_virtualize_apic_accesses )
2485 return;
2487 virt_page_ma = page_to_maddr(vcpu_vlapic(v)->regs_page);
2488 apic_page_ma = v->domain->arch.hvm_domain.vmx_apic_access_mfn;
2489 apic_page_ma <<= PAGE_SHIFT;
2491 vmx_vmcs_enter(v);
2492 __vmwrite(VIRTUAL_APIC_PAGE_ADDR, virt_page_ma);
2493 __vmwrite(APIC_ACCESS_ADDR, apic_page_ma);
2494 vmx_vmcs_exit(v);
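/*
 * vmx_vlapic_msr_changed() re-evaluates the "virtualize APIC accesses"
 * execution control after the guest's APICBASE MSR changes: the control
 * is kept enabled only while the local APIC is hardware-enabled and
 * still located at APIC_DEFAULT_PHYS_BASE.
 */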
2497 void vmx_vlapic_msr_changed(struct vcpu *v)
2499 struct vlapic *vlapic = vcpu_vlapic(v);
2500 uint32_t ctl;
2502 if ( !cpu_has_vmx_virtualize_apic_accesses )
2503 return;
2505 vmx_vmcs_enter(v);
2506 ctl = __vmread(SECONDARY_VM_EXEC_CONTROL);
2507 ctl &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2508 if ( !vlapic_hw_disabled(vlapic) &&
2509 (vlapic_base_address(vlapic) == APIC_DEFAULT_PHYS_BASE) )
2510 ctl |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2511 __vmwrite(SECONDARY_VM_EXEC_CONTROL, ctl);
2512 vmx_vmcs_exit(v);
2515 extern bool_t mtrr_var_range_msr_set(struct mtrr_state *v,
2516 u32 msr, u64 msr_content);
2517 extern bool_t mtrr_fix_range_msr_set(struct mtrr_state *v,
2518 int row, u64 msr_content);
2519 extern bool_t mtrr_def_type_msr_set(struct mtrr_state *v, u64 msr_content);
2520 extern bool_t pat_msr_set(u64 *pat, u64 msr);
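/*
 * WRMSR intercept, the counterpart of vmx_msr_read_intercept() above:
 * returns 1 when the write has been handled (the caller advances EIP)
 * and 0 after injecting #GP.  A write to MSR_IA32_DEBUGCTLMSR with the
 * LBR bit set adds the relevant last-branch MSRs to the guest MSR
 * save/restore area and disables their intercepts, so subsequent guest
 * accesses to them go straight to hardware.
 */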
2522 int vmx_msr_write_intercept(struct cpu_user_regs *regs)
2524 u32 ecx = regs->ecx;
2525 u64 msr_content;
2526 struct vcpu *v = current;
2527 int index;
2529 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x, eax=%x, edx=%x",
2530 ecx, (u32)regs->eax, (u32)regs->edx);
2532 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
2534 hvmtrace_msr_write(v, ecx, msr_content);
2536 switch ( ecx )
2538 case MSR_IA32_TSC:
2539 hvm_set_guest_time(v, msr_content);
2540 pt_reset(v);
2541 break;
2542 case MSR_IA32_SYSENTER_CS:
2543 __vmwrite(GUEST_SYSENTER_CS, msr_content);
2544 break;
2545 case MSR_IA32_SYSENTER_ESP:
2546 __vmwrite(GUEST_SYSENTER_ESP, msr_content);
2547 break;
2548 case MSR_IA32_SYSENTER_EIP:
2549 __vmwrite(GUEST_SYSENTER_EIP, msr_content);
2550 break;
2551 case MSR_IA32_APICBASE:
2552 vlapic_msr_set(vcpu_vlapic(v), msr_content);
2553 break;
2554 case MSR_IA32_CR_PAT:
2555 if ( !pat_msr_set(&v->arch.hvm_vcpu.pat_cr, msr_content) )
2556 goto gp_fault;
2557 break;
2558 case MSR_MTRRdefType:
2559 if ( !mtrr_def_type_msr_set(&v->arch.hvm_vcpu.mtrr, msr_content) )
2560 goto gp_fault;
2561 break;
2562 case MSR_MTRRfix64K_00000:
2563 if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr, 0, msr_content) )
2564 goto gp_fault;
2565 break;
2566 case MSR_MTRRfix16K_80000:
2567 case MSR_MTRRfix16K_A0000:
2568 index = regs->ecx - MSR_MTRRfix16K_80000 + 1;
2569 if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr,
2570 index, msr_content) )
2571 goto gp_fault;
2572 break;
2573 case MSR_MTRRfix4K_C0000...MSR_MTRRfix4K_F8000:
2574 index = regs->ecx - MSR_MTRRfix4K_C0000 + 3;
2575 if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr,
2576 index, msr_content) )
2577 goto gp_fault;
2578 break;
2579 case MSR_IA32_MTRR_PHYSBASE0...MSR_IA32_MTRR_PHYSMASK7:
2580 if ( !mtrr_var_range_msr_set(&v->arch.hvm_vcpu.mtrr,
2581 regs->ecx, msr_content) )
2582 goto gp_fault;
2583 break;
2584 case MSR_MTRRcap:
2585 goto gp_fault;
2586 case MSR_IA32_DEBUGCTLMSR: {
2587 int i, rc = 0;
2589 if ( !msr_content || (msr_content & ~3) )
2590 break;
2592 if ( msr_content & 1 )
2594 const struct lbr_info *lbr = last_branch_msr_get();
2595 if ( lbr == NULL )
2596 break;
2598 for ( ; (rc == 0) && lbr->count; lbr++ )
2599 for ( i = 0; (rc == 0) && (i < lbr->count); i++ )
2600 if ( (rc = vmx_add_guest_msr(v, lbr->base + i)) == 0 )
2601 vmx_disable_intercept_for_msr(v, lbr->base + i);
2604 if ( (rc < 0) ||
2605 (vmx_add_guest_msr(v, ecx) < 0) ||
2606 (vmx_add_host_load_msr(v, ecx) < 0) )
2607 vmx_inject_hw_exception(v, TRAP_machine_check, 0);
2608 else
2609 vmx_write_guest_msr(v, ecx, msr_content);
2611 break;
2613 case MSR_IA32_VMX_BASIC...MSR_IA32_VMX_PROCBASED_CTLS2:
2614 goto gp_fault;
2615 default:
2616 if ( vpmu_do_wrmsr(regs) )
2617 return 1;
2618 switch ( long_mode_do_msr_write(regs) )
2620 case HNDL_unhandled:
2621 if ( (vmx_write_guest_msr(v, ecx, msr_content) != 0) &&
2622 !is_last_branch_msr(ecx) )
2623 wrmsr_hypervisor_regs(ecx, regs->eax, regs->edx);
2624 break;
2625 case HNDL_exception_raised:
2626 return 0;
2627 case HNDL_done:
2628 break;
2630 break;
2633 return 1;
2635 gp_fault:
2636 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2637 return 0;
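/*
 * HLT intercept.  If an event is already queued for injection on the
 * next VM entry, the vcpu would presumably be woken immediately, so the
 * HLT is treated as a no-op rather than blocking; otherwise the common
 * hvm_hlt() path handles the halt based on the guest's EFLAGS.
 */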
2640 static void vmx_do_hlt(struct cpu_user_regs *regs)
2642 unsigned long intr_info = __vmread(VM_ENTRY_INTR_INFO);
2643 struct vcpu *curr = current;
2645 /* Check for an event pending injection on the next VM entry. */
2646 if ( intr_info & INTR_INFO_VALID_MASK )
2648 HVMTRACE_1D(HLT, curr, /*int pending=*/ 1);
2649 return;
2652 HVMTRACE_1D(HLT, curr, /*int pending=*/ 0);
2653 hvm_hlt(regs->eflags);
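/*
 * External-interrupt exits are dispatched to Xen's own interrupt
 * handlers according to the vector read from VM_EXIT_INTR_INFO;
 * vectors without a dedicated handler go through do_IRQ().
 */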
2656 static void vmx_do_extint(struct cpu_user_regs *regs)
2658 unsigned int vector;
2660 asmlinkage void do_IRQ(struct cpu_user_regs *);
2661 fastcall void smp_apic_timer_interrupt(struct cpu_user_regs *);
2662 fastcall void smp_event_check_interrupt(void);
2663 fastcall void smp_invalidate_interrupt(void);
2664 fastcall void smp_call_function_interrupt(void);
2665 fastcall void smp_spurious_interrupt(struct cpu_user_regs *regs);
2666 fastcall void smp_error_interrupt(struct cpu_user_regs *regs);
2667 fastcall void smp_pmu_apic_interrupt(struct cpu_user_regs *regs);
2668 #ifdef CONFIG_X86_MCE_P4THERMAL
2669 fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs);
2670 #endif
2672 vector = __vmread(VM_EXIT_INTR_INFO);
2673 BUG_ON(!(vector & INTR_INFO_VALID_MASK));
2675 vector &= INTR_INFO_VECTOR_MASK;
2676 HVMTRACE_1D(INTR, current, vector);
2678 switch ( vector )
2680 case LOCAL_TIMER_VECTOR:
2681 smp_apic_timer_interrupt(regs);
2682 break;
2683 case EVENT_CHECK_VECTOR:
2684 smp_event_check_interrupt();
2685 break;
2686 case INVALIDATE_TLB_VECTOR:
2687 smp_invalidate_interrupt();
2688 break;
2689 case CALL_FUNCTION_VECTOR:
2690 smp_call_function_interrupt();
2691 break;
2692 case SPURIOUS_APIC_VECTOR:
2693 smp_spurious_interrupt(regs);
2694 break;
2695 case ERROR_APIC_VECTOR:
2696 smp_error_interrupt(regs);
2697 break;
2698 case PMU_APIC_VECTOR:
2699 smp_pmu_apic_interrupt(regs);
2700 break;
2701 #ifdef CONFIG_X86_MCE_P4THERMAL
2702 case THERMAL_APIC_VECTOR:
2703 smp_thermal_interrupt(regs);
2704 break;
2705 #endif
2706 default:
2707 regs->entry_vector = vector;
2708 do_IRQ(regs);
2709 break;
2713 static void wbinvd_ipi(void *info)
2715 wbinvd();
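/*
 * WBINVD/INVD intercept.  Cache flushes only matter to a guest with a
 * passthrough device assigned (DMA may bypass the CPU caches), so the
 * intercept is a no-op when the domain's passthrough device list is
 * empty.  With WBINVD exiting in use the flush is broadcast to every
 * CPU by IPI, since the guest's vcpus may have dirtied cache lines on
 * any of them; otherwise a local wbinvd() is performed.
 */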
2718 void vmx_wbinvd_intercept(void)
2720 if ( list_empty(&(domain_hvm_iommu(current->domain)->pdev_list)) )
2721 return;
2723 if ( cpu_has_wbinvd_exiting )
2724 on_each_cpu(wbinvd_ipi, NULL, 1, 1);
2725 else
2726 wbinvd();
2729 static void vmx_failed_vmentry(unsigned int exit_reason,
2730 struct cpu_user_regs *regs)
2732 unsigned int failed_vmentry_reason = (uint16_t)exit_reason;
2733 unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
2734 struct vcpu *curr = current;
2736 printk("Failed vm entry (exit reason 0x%x) ", exit_reason);
2737 switch ( failed_vmentry_reason )
2739 case EXIT_REASON_INVALID_GUEST_STATE:
2740 printk("caused by invalid guest state (%ld).\n", exit_qualification);
2741 break;
2742 case EXIT_REASON_MSR_LOADING:
2743 printk("caused by MSR entry %ld loading.\n", exit_qualification);
2744 break;
2745 case EXIT_REASON_MACHINE_CHECK:
2746 printk("caused by machine check.\n");
2747 HVMTRACE_0D(MCE, curr);
2748 do_machine_check(regs);
2749 break;
2750 default:
2751 printk("reason not known yet!");
2752 break;
2755 printk("************* VMCS Area **************\n");
2756 vmcs_dump_vcpu(curr);
2757 printk("**************************************\n");
2759 domain_crash(curr->domain);
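/*
 * Main vmexit dispatcher.  After tracing the exit, interrupts are
 * re-enabled unless the exit was caused by an external interrupt (which
 * is serviced via vmx_do_extint()), failed VM entries are reported and
 * the domain crashed, and any event whose delivery caused the exit is
 * re-queued from the IDT-vectoring information.  The switch below then
 * handles the individual exit reasons; an unrecognised reason crashes
 * the domain.
 */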
2762 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
2764 unsigned int exit_reason, idtv_info;
2765 unsigned long exit_qualification, inst_len = 0;
2766 struct vcpu *v = current;
2768 exit_reason = __vmread(VM_EXIT_REASON);
2770 hvmtrace_vmexit(v, regs->eip, exit_reason);
2772 perfc_incra(vmexits, exit_reason);
2774 if ( exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT )
2775 local_irq_enable();
2777 if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
2778 return vmx_failed_vmentry(exit_reason, regs);
2780 hvm_maybe_deassert_evtchn_irq();
2782 /* Event delivery caused this intercept? Queue for redelivery. */
2783 idtv_info = __vmread(IDT_VECTORING_INFO);
2784 if ( unlikely(idtv_info & INTR_INFO_VALID_MASK) &&
2785 (exit_reason != EXIT_REASON_TASK_SWITCH) )
2787 if ( hvm_event_needs_reinjection((idtv_info>>8)&7, idtv_info&0xff) )
2789 /* See SDM 3B 25.7.1.1 and .2 for info about masking resvd bits. */
2790 __vmwrite(VM_ENTRY_INTR_INFO,
2791 idtv_info & ~INTR_INFO_RESVD_BITS_MASK);
2792 if ( idtv_info & INTR_INFO_DELIVER_CODE_MASK )
2793 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE,
2794 __vmread(IDT_VECTORING_ERROR_CODE));
2797 /*
2798 * Clear NMI-blocking interruptibility info if an NMI delivery faulted.
2799 * Re-delivery will re-set it (see SDM 3B 25.7.1.2).
2800 */
2801 if ( (idtv_info & INTR_INFO_INTR_TYPE_MASK) == (X86_EVENTTYPE_NMI<<8) )
2802 __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
2803 __vmread(GUEST_INTERRUPTIBILITY_INFO) &
2804 ~VMX_INTR_SHADOW_NMI);
2807 switch ( exit_reason )
2809 case EXIT_REASON_EXCEPTION_NMI:
2811 /*
2812 * We do not enable software-interrupt (INT n) exiting, so this
2813 * exit is caused either by (1) an exception (e.g. #PF) raised in
2814 * the guest, or (2) an NMI.
2815 */
2816 unsigned int intr_info, vector;
2818 intr_info = __vmread(VM_EXIT_INTR_INFO);
2819 BUG_ON(!(intr_info & INTR_INFO_VALID_MASK));
2821 vector = intr_info & INTR_INFO_VECTOR_MASK;
2823 /*
2824 * Re-set the NMI shadow if vmexit caused by a guest IRET fault (see 3B
2825 * 25.7.1.2, "Resuming Guest Software after Handling an Exception").
2826 * (NB. If we emulate this IRET for any reason, we should re-clear!)
2827 */
2828 if ( unlikely(intr_info & INTR_INFO_NMI_UNBLOCKED_BY_IRET) &&
2829 !(__vmread(IDT_VECTORING_INFO) & INTR_INFO_VALID_MASK) &&
2830 (vector != TRAP_double_fault) )
2831 __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
2832 __vmread(GUEST_INTERRUPTIBILITY_INFO)|VMX_INTR_SHADOW_NMI);
2834 perfc_incra(cause_vector, vector);
2836 switch ( vector )
2838 case TRAP_debug:
2839 case TRAP_int3:
2840 if ( !v->domain->debugger_attached )
2841 goto exit_and_crash;
2842 domain_pause_for_debugger();
2843 break;
2844 case TRAP_no_device:
2845 vmx_do_no_device_fault();
2846 break;
2847 case TRAP_page_fault:
2848 exit_qualification = __vmread(EXIT_QUALIFICATION);
2849 regs->error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
2851 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2852 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
2853 (unsigned long)regs->eax, (unsigned long)regs->ebx,
2854 (unsigned long)regs->ecx, (unsigned long)regs->edx,
2855 (unsigned long)regs->esi, (unsigned long)regs->edi);
2857 if ( paging_fault(exit_qualification, regs) )
2859 hvmtrace_pf_xen(v, exit_qualification, regs->error_code);
2860 break;
2863 v->arch.hvm_vcpu.guest_cr[2] = exit_qualification;
2864 vmx_inject_hw_exception(v, TRAP_page_fault, regs->error_code);
2865 break;
2866 case TRAP_nmi:
2867 if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) !=
2868 (X86_EVENTTYPE_NMI << 8) )
2869 goto exit_and_crash;
2870 HVMTRACE_0D(NMI, v);
2871 do_nmi(regs); /* Real NMI, vector 2: normal processing. */
2872 break;
2873 case TRAP_machine_check:
2874 HVMTRACE_0D(MCE, v);
2875 do_machine_check(regs);
2876 break;
2877 default:
2878 goto exit_and_crash;
2880 break;
2882 case EXIT_REASON_EXTERNAL_INTERRUPT:
2883 vmx_do_extint(regs);
2884 break;
2885 case EXIT_REASON_TRIPLE_FAULT:
2886 hvm_triple_fault();
2887 break;
2888 case EXIT_REASON_PENDING_VIRT_INTR:
2889 /* Disable the interrupt window. */
2890 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2891 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2892 v->arch.hvm_vmx.exec_control);
2893 break;
2894 case EXIT_REASON_PENDING_VIRT_NMI:
2895 /* Disable the NMI window. */
2896 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
2897 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2898 v->arch.hvm_vmx.exec_control);
2899 break;
2900 case EXIT_REASON_TASK_SWITCH: {
2901 const enum hvm_task_switch_reason reasons[] = {
2902 TSW_call_or_int, TSW_iret, TSW_jmp, TSW_call_or_int };
2903 int32_t errcode = -1;
2904 exit_qualification = __vmread(EXIT_QUALIFICATION);
2905 if ( (idtv_info & INTR_INFO_VALID_MASK) &&
2906 (idtv_info & INTR_INFO_DELIVER_CODE_MASK) )
2907 errcode = __vmread(IDT_VECTORING_ERROR_CODE);
2908 hvm_task_switch((uint16_t)exit_qualification,
2909 reasons[(exit_qualification >> 30) & 3],
2910 errcode);
2911 break;
2913 case EXIT_REASON_CPUID:
2914 inst_len = __get_instruction_length(); /* Safe: CPUID */
2915 __update_guest_eip(inst_len);
2916 vmx_do_cpuid(regs);
2917 break;
2918 case EXIT_REASON_HLT:
2919 inst_len = __get_instruction_length(); /* Safe: HLT */
2920 __update_guest_eip(inst_len);
2921 vmx_do_hlt(regs);
2922 break;
2923 case EXIT_REASON_INVLPG:
2925 inst_len = __get_instruction_length(); /* Safe: INVLPG */
2926 __update_guest_eip(inst_len);
2927 exit_qualification = __vmread(EXIT_QUALIFICATION);
2928 vmx_do_invlpg(exit_qualification);
2929 break;
2931 case EXIT_REASON_VMCALL:
2933 int rc;
2934 HVMTRACE_1D(VMMCALL, v, regs->eax);
2935 inst_len = __get_instruction_length(); /* Safe: VMCALL */
2936 rc = hvm_do_hypercall(regs);
2937 if ( rc != HVM_HCALL_preempted )
2939 __update_guest_eip(inst_len);
2940 if ( rc == HVM_HCALL_invalidate )
2941 send_invalidate_req();
2943 break;
2945 case EXIT_REASON_CR_ACCESS:
2947 exit_qualification = __vmread(EXIT_QUALIFICATION);
2948 inst_len = __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
2949 if ( vmx_cr_access(exit_qualification, regs) )
2950 __update_guest_eip(inst_len);
2951 break;
2953 case EXIT_REASON_DR_ACCESS:
2954 exit_qualification = __vmread(EXIT_QUALIFICATION);
2955 vmx_dr_access(exit_qualification, regs);
2956 break;
2957 case EXIT_REASON_IO_INSTRUCTION:
2958 exit_qualification = __vmread(EXIT_QUALIFICATION);
2959 inst_len = __get_instruction_length(); /* Safe: IN, INS, OUT, OUTS */
2960 vmx_io_instruction(exit_qualification, inst_len);
2961 break;
2962 case EXIT_REASON_MSR_READ:
2963 inst_len = __get_instruction_length(); /* Safe: RDMSR */
2964 if ( vmx_msr_read_intercept(regs) )
2965 __update_guest_eip(inst_len);
2966 break;
2967 case EXIT_REASON_MSR_WRITE:
2968 inst_len = __get_instruction_length(); /* Safe: WRMSR */
2969 if ( vmx_msr_write_intercept(regs) )
2970 __update_guest_eip(inst_len);
2971 break;
2973 case EXIT_REASON_MWAIT_INSTRUCTION:
2974 case EXIT_REASON_MONITOR_INSTRUCTION:
2975 case EXIT_REASON_VMCLEAR:
2976 case EXIT_REASON_VMLAUNCH:
2977 case EXIT_REASON_VMPTRLD:
2978 case EXIT_REASON_VMPTRST:
2979 case EXIT_REASON_VMREAD:
2980 case EXIT_REASON_VMRESUME:
2981 case EXIT_REASON_VMWRITE:
2982 case EXIT_REASON_VMXOFF:
2983 case EXIT_REASON_VMXON:
2984 vmx_inject_hw_exception(v, TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
2985 break;
2987 case EXIT_REASON_TPR_BELOW_THRESHOLD:
2988 break;
2990 case EXIT_REASON_APIC_ACCESS:
2992 unsigned long offset;
2993 exit_qualification = __vmread(EXIT_QUALIFICATION);
2994 offset = exit_qualification & 0x0fffUL;
2995 handle_mmio(APIC_DEFAULT_PHYS_BASE | offset);
2996 break;
2999 case EXIT_REASON_INVD:
3000 case EXIT_REASON_WBINVD:
3002 inst_len = __get_instruction_length(); /* Safe: INVD, WBINVD */
3003 __update_guest_eip(inst_len);
3004 vmx_wbinvd_intercept();
3005 break;
3008 default:
3009 exit_and_crash:
3010 gdprintk(XENLOG_ERR, "Bad vmexit (reason %x)\n", exit_reason);
3011 domain_crash(v->domain);
3012 break;
3016 asmlinkage void vmx_trace_vmentry(void)
3018 struct vcpu *v = current;
3020 hvmtrace_vmentry(v);
3023 /*
3024 * Local variables:
3025 * mode: C
3026 * c-set-style: "BSD"
3027 * c-basic-offset: 4
3028 * tab-width: 4
3029 * indent-tabs-mode: nil
3030 * End:
3031 */