ia64/xen-unstable: xen/arch/x86/hvm/vmx/vmx.c @ 16427:fd3f6d814f6d

x86: single step after instruction emulation

Inject a single-step trap after emulating an instruction if the guest's
EFLAGS.TF is set.
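The check added here lives in __update_guest_eip() (file lines 1225-1226
below), so any exit handler that advances the guest EIP through that helper
re-injects the pending debug trap; a minimal sketch of the pattern:

    if ( regs->eflags & X86_EFLAGS_TF )
        vmx_inject_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE, 0);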

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Keir Fraser <keir.fraser@eu.citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Thu Nov 22 18:28:47 2007 +0000 (2007-11-22)
parents 93d129d27f69
children 69b56d3289f5
line source
1 /*
2 * vmx.c: handling VMX architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 */
19 #include <xen/config.h>
20 #include <xen/init.h>
21 #include <xen/lib.h>
22 #include <xen/trace.h>
23 #include <xen/sched.h>
24 #include <xen/irq.h>
25 #include <xen/softirq.h>
26 #include <xen/domain_page.h>
27 #include <xen/hypercall.h>
28 #include <xen/perfc.h>
29 #include <asm/current.h>
30 #include <asm/io.h>
31 #include <asm/regs.h>
32 #include <asm/cpufeature.h>
33 #include <asm/processor.h>
34 #include <asm/types.h>
35 #include <asm/msr.h>
36 #include <asm/spinlock.h>
37 #include <asm/paging.h>
38 #include <asm/p2m.h>
39 #include <asm/hvm/hvm.h>
40 #include <asm/hvm/support.h>
41 #include <asm/hvm/vmx/vmx.h>
42 #include <asm/hvm/vmx/vmcs.h>
43 #include <asm/hvm/vmx/cpu.h>
44 #include <public/sched.h>
45 #include <public/hvm/ioreq.h>
46 #include <asm/hvm/vpic.h>
47 #include <asm/hvm/vlapic.h>
48 #include <asm/x86_emulate.h>
49 #include <asm/hvm/vpt.h>
50 #include <public/hvm/save.h>
51 #include <asm/hvm/trace.h>
53 enum handler_return { HNDL_done, HNDL_unhandled, HNDL_exception_raised };
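/* Result of the long-mode MSR access helpers below: the access was handled,
 * was not handled (fall back to the common MSR code), or a fault has been
 * injected into the guest. */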
55 static void vmx_ctxt_switch_from(struct vcpu *v);
56 static void vmx_ctxt_switch_to(struct vcpu *v);
58 static int vmx_alloc_vlapic_mapping(struct domain *d);
59 static void vmx_free_vlapic_mapping(struct domain *d);
60 static void vmx_install_vlapic_mapping(struct vcpu *v);
61 static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr);
62 static void vmx_update_guest_efer(struct vcpu *v);
64 static int vmx_domain_initialise(struct domain *d)
65 {
66 return vmx_alloc_vlapic_mapping(d);
67 }
69 static void vmx_domain_destroy(struct domain *d)
70 {
71 vmx_free_vlapic_mapping(d);
72 }
74 static int vmx_vcpu_initialise(struct vcpu *v)
75 {
76 int rc;
78 spin_lock_init(&v->arch.hvm_vmx.vmcs_lock);
80 v->arch.schedule_tail = vmx_do_resume;
81 v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
82 v->arch.ctxt_switch_to = vmx_ctxt_switch_to;
84 if ( (rc = vmx_create_vmcs(v)) != 0 )
85 {
86 dprintk(XENLOG_WARNING,
87 "Failed to create VMCS for vcpu %d: err=%d.\n",
88 v->vcpu_id, rc);
89 return rc;
90 }
92 vmx_install_vlapic_mapping(v);
94 return 0;
95 }
97 static void vmx_vcpu_destroy(struct vcpu *v)
98 {
99 vmx_destroy_vmcs(v);
100 }
102 #ifdef __x86_64__
104 static DEFINE_PER_CPU(struct vmx_msr_state, host_msr_state);
106 static u32 msr_index[VMX_MSR_COUNT] =
107 {
108 MSR_LSTAR, MSR_STAR, MSR_SYSCALL_MASK
109 };
111 static void vmx_save_host_msrs(void)
112 {
113 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
114 int i;
116 for ( i = 0; i < VMX_MSR_COUNT; i++ )
117 rdmsrl(msr_index[i], host_msr_state->msrs[i]);
118 }
120 #define WRITE_MSR(address) \
121 guest_msr_state->msrs[VMX_INDEX_MSR_ ## address] = msr_content; \
122 set_bit(VMX_INDEX_MSR_ ## address, &guest_msr_state->flags); \
123 wrmsrl(MSR_ ## address, msr_content); \
124 set_bit(VMX_INDEX_MSR_ ## address, &host_msr_state->flags); \
125 break
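/* WRITE_MSR() caches the value in the guest's vmx_msr_state, marks the MSR
 * dirty in both the guest and host flag masks (so vmx_restore_guest_msrs()
 * and vmx_restore_host_msrs() know to reload it), and writes it to
 * hardware. */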
127 static enum handler_return long_mode_do_msr_read(struct cpu_user_regs *regs)
128 {
129 u64 msr_content = 0;
130 u32 ecx = regs->ecx;
131 struct vcpu *v = current;
132 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
134 switch ( ecx )
135 {
136 case MSR_EFER:
137 msr_content = v->arch.hvm_vcpu.guest_efer;
138 break;
140 case MSR_FS_BASE:
141 msr_content = __vmread(GUEST_FS_BASE);
142 goto check_long_mode;
144 case MSR_GS_BASE:
145 msr_content = __vmread(GUEST_GS_BASE);
146 goto check_long_mode;
148 case MSR_SHADOW_GS_BASE:
149 msr_content = v->arch.hvm_vmx.shadow_gs;
150 check_long_mode:
151 if ( !(hvm_long_mode_enabled(v)) )
152 {
153 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
154 return HNDL_exception_raised;
155 }
156 break;
158 case MSR_STAR:
159 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_STAR];
160 break;
162 case MSR_LSTAR:
163 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_LSTAR];
164 break;
166 case MSR_CSTAR:
167 msr_content = v->arch.hvm_vmx.cstar;
168 break;
170 case MSR_SYSCALL_MASK:
171 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
172 break;
174 default:
175 return HNDL_unhandled;
176 }
178 HVM_DBG_LOG(DBG_LEVEL_0, "msr 0x%x content 0x%"PRIx64, ecx, msr_content);
180 regs->eax = (u32)(msr_content >> 0);
181 regs->edx = (u32)(msr_content >> 32);
183 return HNDL_done;
184 }
186 static enum handler_return long_mode_do_msr_write(struct cpu_user_regs *regs)
187 {
188 u64 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
189 u32 ecx = regs->ecx;
190 struct vcpu *v = current;
191 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
192 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
194 HVM_DBG_LOG(DBG_LEVEL_0, "msr 0x%x content 0x%"PRIx64, ecx, msr_content);
196 switch ( ecx )
197 {
198 case MSR_EFER:
199 if ( !hvm_set_efer(msr_content) )
200 goto exception_raised;
201 break;
203 case MSR_FS_BASE:
204 case MSR_GS_BASE:
205 case MSR_SHADOW_GS_BASE:
206 if ( !hvm_long_mode_enabled(v) )
207 goto gp_fault;
209 if ( !is_canonical_address(msr_content) )
210 goto uncanonical_address;
212 if ( ecx == MSR_FS_BASE )
213 __vmwrite(GUEST_FS_BASE, msr_content);
214 else if ( ecx == MSR_GS_BASE )
215 __vmwrite(GUEST_GS_BASE, msr_content);
216 else
217 {
218 v->arch.hvm_vmx.shadow_gs = msr_content;
219 wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
220 }
222 break;
224 case MSR_STAR:
225 WRITE_MSR(STAR);
227 case MSR_LSTAR:
228 if ( !is_canonical_address(msr_content) )
229 goto uncanonical_address;
230 WRITE_MSR(LSTAR);
232 case MSR_CSTAR:
233 if ( !is_canonical_address(msr_content) )
234 goto uncanonical_address;
235 v->arch.hvm_vmx.cstar = msr_content;
236 break;
238 case MSR_SYSCALL_MASK:
239 WRITE_MSR(SYSCALL_MASK);
241 default:
242 return HNDL_unhandled;
243 }
245 return HNDL_done;
247 uncanonical_address:
248 HVM_DBG_LOG(DBG_LEVEL_0, "Non-canonical address of msr write %x", ecx);
249 gp_fault:
250 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
251 exception_raised:
252 return HNDL_exception_raised;
253 }
255 /*
256 * To avoid MSR save/restore at every VM exit/entry time, we restore
257 * the x86_64 specific MSRs at domain switch time. Since these MSRs
258 * are not modified once set for para domains, we don't save them,
259 * but simply reset them to values set in percpu_traps_init().
260 */
261 static void vmx_restore_host_msrs(void)
262 {
263 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
264 int i;
266 while ( host_msr_state->flags )
267 {
268 i = find_first_set_bit(host_msr_state->flags);
269 wrmsrl(msr_index[i], host_msr_state->msrs[i]);
270 clear_bit(i, &host_msr_state->flags);
271 }
273 if ( cpu_has_nx && !(read_efer() & EFER_NX) )
274 write_efer(read_efer() | EFER_NX);
275 }
277 static void vmx_save_guest_msrs(struct vcpu *v)
278 {
279 /* MSR_SHADOW_GS_BASE may have been changed by swapgs instruction. */
280 rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
281 }
283 static void vmx_restore_guest_msrs(struct vcpu *v)
284 {
285 struct vmx_msr_state *guest_msr_state, *host_msr_state;
286 unsigned long guest_flags;
287 int i;
289 guest_msr_state = &v->arch.hvm_vmx.msr_state;
290 host_msr_state = &this_cpu(host_msr_state);
292 wrmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
294 guest_flags = guest_msr_state->flags;
296 while ( guest_flags )
297 {
298 i = find_first_set_bit(guest_flags);
300 HVM_DBG_LOG(DBG_LEVEL_2,
301 "restore guest's index %d msr %x with value %lx",
302 i, msr_index[i], guest_msr_state->msrs[i]);
303 set_bit(i, &host_msr_state->flags);
304 wrmsrl(msr_index[i], guest_msr_state->msrs[i]);
305 clear_bit(i, &guest_flags);
306 }
308 if ( (v->arch.hvm_vcpu.guest_efer ^ read_efer()) & (EFER_NX | EFER_SCE) )
309 {
310 HVM_DBG_LOG(DBG_LEVEL_2,
311 "restore guest's EFER with value %lx",
312 v->arch.hvm_vcpu.guest_efer);
313 write_efer((read_efer() & ~(EFER_NX | EFER_SCE)) |
314 (v->arch.hvm_vcpu.guest_efer & (EFER_NX | EFER_SCE)));
315 }
316 }
318 #else /* __i386__ */
320 #define vmx_save_host_msrs() ((void)0)
322 static void vmx_restore_host_msrs(void)
323 {
324 if ( cpu_has_nx && !(read_efer() & EFER_NX) )
325 write_efer(read_efer() | EFER_NX);
326 }
328 #define vmx_save_guest_msrs(v) ((void)0)
330 static void vmx_restore_guest_msrs(struct vcpu *v)
331 {
332 if ( (v->arch.hvm_vcpu.guest_efer ^ read_efer()) & EFER_NX )
333 {
334 HVM_DBG_LOG(DBG_LEVEL_2,
335 "restore guest's EFER with value %lx",
336 v->arch.hvm_vcpu.guest_efer);
337 write_efer((read_efer() & ~EFER_NX) |
338 (v->arch.hvm_vcpu.guest_efer & EFER_NX));
339 }
340 }
342 static enum handler_return long_mode_do_msr_read(struct cpu_user_regs *regs)
343 {
344 u64 msr_content = 0;
345 struct vcpu *v = current;
347 switch ( regs->ecx )
348 {
349 case MSR_EFER:
350 msr_content = v->arch.hvm_vcpu.guest_efer;
351 break;
353 default:
354 return HNDL_unhandled;
355 }
357 regs->eax = msr_content >> 0;
358 regs->edx = msr_content >> 32;
360 return HNDL_done;
361 }
363 static enum handler_return long_mode_do_msr_write(struct cpu_user_regs *regs)
364 {
365 u64 msr_content = regs->eax | ((u64)regs->edx << 32);
367 switch ( regs->ecx )
368 {
369 case MSR_EFER:
370 if ( !hvm_set_efer(msr_content) )
371 return HNDL_exception_raised;
372 break;
374 default:
375 return HNDL_unhandled;
376 }
378 return HNDL_done;
379 }
381 #endif /* __i386__ */
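/* Returns the guest's current execution mode: 0 for real mode, 1 for
 * virtual-8086 mode, 8 for 64-bit mode, and 4 or 2 for protected mode with a
 * 32-bit or 16-bit CS default operation size respectively. */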
383 static int vmx_guest_x86_mode(struct vcpu *v)
384 {
385 unsigned int cs_ar_bytes;
387 if ( unlikely(!(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE)) )
388 return 0;
389 if ( unlikely(guest_cpu_user_regs()->eflags & X86_EFLAGS_VM) )
390 return 1;
391 cs_ar_bytes = __vmread(GUEST_CS_AR_BYTES);
392 if ( hvm_long_mode_enabled(v) &&
393 likely(cs_ar_bytes & X86_SEG_AR_CS_LM_ACTIVE) )
394 return 8;
395 return (likely(cs_ar_bytes & X86_SEG_AR_DEF_OP_SIZE) ? 4 : 2);
396 }
398 static void vmx_save_dr(struct vcpu *v)
399 {
400 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
401 return;
403 /* Clear the DR dirty flag and re-enable intercepts for DR accesses. */
404 v->arch.hvm_vcpu.flag_dr_dirty = 0;
405 v->arch.hvm_vmx.exec_control |= CPU_BASED_MOV_DR_EXITING;
406 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
408 v->arch.guest_context.debugreg[0] = read_debugreg(0);
409 v->arch.guest_context.debugreg[1] = read_debugreg(1);
410 v->arch.guest_context.debugreg[2] = read_debugreg(2);
411 v->arch.guest_context.debugreg[3] = read_debugreg(3);
412 v->arch.guest_context.debugreg[6] = read_debugreg(6);
413 /* DR7 must be saved as it is used by vmx_restore_dr(). */
414 v->arch.guest_context.debugreg[7] = __vmread(GUEST_DR7);
415 }
417 static void __restore_debug_registers(struct vcpu *v)
418 {
419 if ( v->arch.hvm_vcpu.flag_dr_dirty )
420 return;
422 v->arch.hvm_vcpu.flag_dr_dirty = 1;
424 write_debugreg(0, v->arch.guest_context.debugreg[0]);
425 write_debugreg(1, v->arch.guest_context.debugreg[1]);
426 write_debugreg(2, v->arch.guest_context.debugreg[2]);
427 write_debugreg(3, v->arch.guest_context.debugreg[3]);
428 write_debugreg(6, v->arch.guest_context.debugreg[6]);
429 /* DR7 is loaded from the VMCS. */
430 }
432 /*
433 * DR7 is saved and restored on every vmexit. Other debug registers only
434 * need to be restored if their value is going to affect execution -- i.e.,
435 * if one of the breakpoints is enabled. So mask out all bits that don't
436 * enable some breakpoint functionality.
437 */
438 #define DR7_ACTIVE_MASK 0xff
440 static void vmx_restore_dr(struct vcpu *v)
441 {
442 /* NB. __vmread() is not usable here, so we cannot read from the VMCS. */
443 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
444 __restore_debug_registers(v);
445 }
447 void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
448 {
449 uint32_t ev;
451 vmx_vmcs_enter(v);
453 c->cr0 = v->arch.hvm_vcpu.guest_cr[0];
454 c->cr2 = v->arch.hvm_vcpu.guest_cr[2];
455 c->cr3 = v->arch.hvm_vcpu.guest_cr[3];
456 c->cr4 = v->arch.hvm_vcpu.guest_cr[4];
458 c->msr_efer = v->arch.hvm_vcpu.guest_efer;
460 c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
461 c->idtr_base = __vmread(GUEST_IDTR_BASE);
463 c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
464 c->gdtr_base = __vmread(GUEST_GDTR_BASE);
466 c->cs_sel = __vmread(GUEST_CS_SELECTOR);
467 c->cs_limit = __vmread(GUEST_CS_LIMIT);
468 c->cs_base = __vmread(GUEST_CS_BASE);
469 c->cs_arbytes = __vmread(GUEST_CS_AR_BYTES);
471 c->ds_sel = __vmread(GUEST_DS_SELECTOR);
472 c->ds_limit = __vmread(GUEST_DS_LIMIT);
473 c->ds_base = __vmread(GUEST_DS_BASE);
474 c->ds_arbytes = __vmread(GUEST_DS_AR_BYTES);
476 c->es_sel = __vmread(GUEST_ES_SELECTOR);
477 c->es_limit = __vmread(GUEST_ES_LIMIT);
478 c->es_base = __vmread(GUEST_ES_BASE);
479 c->es_arbytes = __vmread(GUEST_ES_AR_BYTES);
481 c->ss_sel = __vmread(GUEST_SS_SELECTOR);
482 c->ss_limit = __vmread(GUEST_SS_LIMIT);
483 c->ss_base = __vmread(GUEST_SS_BASE);
484 c->ss_arbytes = __vmread(GUEST_SS_AR_BYTES);
486 c->fs_sel = __vmread(GUEST_FS_SELECTOR);
487 c->fs_limit = __vmread(GUEST_FS_LIMIT);
488 c->fs_base = __vmread(GUEST_FS_BASE);
489 c->fs_arbytes = __vmread(GUEST_FS_AR_BYTES);
491 c->gs_sel = __vmread(GUEST_GS_SELECTOR);
492 c->gs_limit = __vmread(GUEST_GS_LIMIT);
493 c->gs_base = __vmread(GUEST_GS_BASE);
494 c->gs_arbytes = __vmread(GUEST_GS_AR_BYTES);
496 c->tr_sel = __vmread(GUEST_TR_SELECTOR);
497 c->tr_limit = __vmread(GUEST_TR_LIMIT);
498 c->tr_base = __vmread(GUEST_TR_BASE);
499 c->tr_arbytes = __vmread(GUEST_TR_AR_BYTES);
501 c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR);
502 c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT);
503 c->ldtr_base = __vmread(GUEST_LDTR_BASE);
504 c->ldtr_arbytes = __vmread(GUEST_LDTR_AR_BYTES);
506 c->sysenter_cs = __vmread(GUEST_SYSENTER_CS);
507 c->sysenter_esp = __vmread(GUEST_SYSENTER_ESP);
508 c->sysenter_eip = __vmread(GUEST_SYSENTER_EIP);
510 c->pending_event = 0;
511 c->error_code = 0;
512 if ( ((ev = __vmread(VM_ENTRY_INTR_INFO)) & INTR_INFO_VALID_MASK) &&
513 hvm_event_needs_reinjection((ev >> 8) & 7, ev & 0xff) )
514 {
515 c->pending_event = ev;
516 c->error_code = __vmread(VM_ENTRY_EXCEPTION_ERROR_CODE);
517 }
519 vmx_vmcs_exit(v);
520 }
522 static int vmx_restore_cr0_cr3(
523 struct vcpu *v, unsigned long cr0, unsigned long cr3)
524 {
525 unsigned long mfn = 0;
526 p2m_type_t p2mt;
528 if ( cr0 & X86_CR0_PG )
529 {
530 mfn = mfn_x(gfn_to_mfn(v->domain, cr3 >> PAGE_SHIFT, &p2mt));
531 if ( !p2m_is_ram(p2mt) || !get_page(mfn_to_page(mfn), v->domain) )
532 {
533 gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%lx\n", cr3);
534 return -EINVAL;
535 }
536 }
538 if ( v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PG )
539 put_page(pagetable_get_page(v->arch.guest_table));
541 v->arch.guest_table = pagetable_from_pfn(mfn);
543 v->arch.hvm_vcpu.guest_cr[0] = cr0 | X86_CR0_ET;
544 v->arch.hvm_vcpu.guest_cr[3] = cr3;
546 return 0;
547 }
549 int vmx_vmcs_restore(struct vcpu *v, struct hvm_hw_cpu *c)
550 {
551 int rc;
553 if ( c->pending_valid &&
554 ((c->pending_type == 1) || (c->pending_type > 6) ||
555 (c->pending_reserved != 0)) )
556 {
557 gdprintk(XENLOG_ERR, "Invalid pending event 0x%"PRIx32".\n",
558 c->pending_event);
559 return -EINVAL;
560 }
562 rc = vmx_restore_cr0_cr3(v, c->cr0, c->cr3);
563 if ( rc )
564 return rc;
566 vmx_vmcs_enter(v);
568 v->arch.hvm_vcpu.guest_cr[2] = c->cr2;
569 v->arch.hvm_vcpu.guest_cr[4] = c->cr4;
570 vmx_update_guest_cr(v, 0);
571 vmx_update_guest_cr(v, 2);
572 vmx_update_guest_cr(v, 4);
574 #ifdef HVM_DEBUG_SUSPEND
575 printk("%s: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
576 __func__, c->cr3, c->cr0, c->cr4);
577 #endif
579 v->arch.hvm_vcpu.guest_efer = c->msr_efer;
580 vmx_update_guest_efer(v);
582 __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
583 __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
585 __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
586 __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
588 __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
589 __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
590 __vmwrite(GUEST_CS_BASE, c->cs_base);
591 __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes);
593 __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
594 __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
595 __vmwrite(GUEST_DS_BASE, c->ds_base);
596 __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes);
598 __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
599 __vmwrite(GUEST_ES_LIMIT, c->es_limit);
600 __vmwrite(GUEST_ES_BASE, c->es_base);
601 __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes);
603 __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
604 __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
605 __vmwrite(GUEST_SS_BASE, c->ss_base);
606 __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes);
608 __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
609 __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
610 __vmwrite(GUEST_FS_BASE, c->fs_base);
611 __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes);
613 __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
614 __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
615 __vmwrite(GUEST_GS_BASE, c->gs_base);
616 __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes);
618 __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
619 __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
620 __vmwrite(GUEST_TR_BASE, c->tr_base);
621 __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes);
623 __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
624 __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
625 __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
626 __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes);
628 __vmwrite(GUEST_SYSENTER_CS, c->sysenter_cs);
629 __vmwrite(GUEST_SYSENTER_ESP, c->sysenter_esp);
630 __vmwrite(GUEST_SYSENTER_EIP, c->sysenter_eip);
632 __vmwrite(GUEST_DR7, c->dr7);
634 vmx_vmcs_exit(v);
636 paging_update_paging_modes(v);
638 if ( c->pending_valid )
639 {
640 gdprintk(XENLOG_INFO, "Re-injecting 0x%"PRIx32", 0x%"PRIx32"\n",
641 c->pending_event, c->error_code);
643 if ( hvm_event_needs_reinjection(c->pending_type, c->pending_vector) )
644 {
645 vmx_vmcs_enter(v);
646 __vmwrite(VM_ENTRY_INTR_INFO, c->pending_event);
647 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, c->error_code);
648 vmx_vmcs_exit(v);
649 }
650 }
652 return 0;
653 }
655 #if defined(__x86_64__) && defined(HVM_DEBUG_SUSPEND)
656 static void dump_msr_state(struct vmx_msr_state *m)
657 {
658 int i = 0;
659 printk("**** msr state ****\n");
660 printk("shadow_gs=0x%lx, flags=0x%lx, msr_items:", m->shadow_gs, m->flags);
661 for ( i = 0; i < VMX_MSR_COUNT; i++ )
662 printk("0x%lx,", m->msrs[i]);
663 printk("\n");
664 }
665 #else
666 #define dump_msr_state(m) ((void)0)
667 #endif
669 static void vmx_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
670 {
671 #ifdef __x86_64__
672 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
673 unsigned long guest_flags = guest_state->flags;
675 data->shadow_gs = v->arch.hvm_vmx.shadow_gs;
676 data->msr_cstar = v->arch.hvm_vmx.cstar;
678 /* save msrs */
679 data->msr_flags = guest_flags;
680 data->msr_lstar = guest_state->msrs[VMX_INDEX_MSR_LSTAR];
681 data->msr_star = guest_state->msrs[VMX_INDEX_MSR_STAR];
682 data->msr_syscall_mask = guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
683 #endif
685 data->tsc = hvm_get_guest_time(v);
687 dump_msr_state(guest_state);
688 }
690 static void vmx_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
691 {
692 #ifdef __x86_64__
693 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
695 /* restore msrs */
696 guest_state->flags = data->msr_flags;
697 guest_state->msrs[VMX_INDEX_MSR_LSTAR] = data->msr_lstar;
698 guest_state->msrs[VMX_INDEX_MSR_STAR] = data->msr_star;
699 guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK] = data->msr_syscall_mask;
701 v->arch.hvm_vmx.cstar = data->msr_cstar;
702 v->arch.hvm_vmx.shadow_gs = data->shadow_gs;
703 #endif
705 v->arch.hvm_vmx.vmxassist_enabled = !(data->cr0 & X86_CR0_PE);
707 hvm_set_guest_time(v, data->tsc);
709 dump_msr_state(guest_state);
710 }
713 static void vmx_save_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
714 {
715 vmx_save_cpu_state(v, ctxt);
716 vmx_vmcs_save(v, ctxt);
717 }
719 static int vmx_load_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
720 {
721 vmx_load_cpu_state(v, ctxt);
723 if ( vmx_vmcs_restore(v, ctxt) )
724 {
725 gdprintk(XENLOG_ERR, "vmx_vmcs restore failed!\n");
726 domain_crash(v->domain);
727 return -EINVAL;
728 }
730 return 0;
731 }
733 static void vmx_ctxt_switch_from(struct vcpu *v)
734 {
735 vmx_save_guest_msrs(v);
736 vmx_restore_host_msrs();
737 vmx_save_dr(v);
738 }
740 static void vmx_ctxt_switch_to(struct vcpu *v)
741 {
742 /* HOST_CR4 in VMCS is always mmu_cr4_features. Sync CR4 now. */
743 if ( unlikely(read_cr4() != mmu_cr4_features) )
744 write_cr4(mmu_cr4_features);
746 vmx_restore_guest_msrs(v);
747 vmx_restore_dr(v);
748 }
750 static unsigned long vmx_get_segment_base(struct vcpu *v, enum x86_segment seg)
751 {
752 unsigned long base = 0;
753 int long_mode = 0;
755 ASSERT(v == current);
757 if ( hvm_long_mode_enabled(v) &&
758 (__vmread(GUEST_CS_AR_BYTES) & X86_SEG_AR_CS_LM_ACTIVE) )
759 long_mode = 1;
761 switch ( seg )
762 {
763 case x86_seg_cs: if ( !long_mode ) base = __vmread(GUEST_CS_BASE); break;
764 case x86_seg_ds: if ( !long_mode ) base = __vmread(GUEST_DS_BASE); break;
765 case x86_seg_es: if ( !long_mode ) base = __vmread(GUEST_ES_BASE); break;
766 case x86_seg_fs: base = __vmread(GUEST_FS_BASE); break;
767 case x86_seg_gs: base = __vmread(GUEST_GS_BASE); break;
768 case x86_seg_ss: if ( !long_mode ) base = __vmread(GUEST_SS_BASE); break;
769 case x86_seg_tr: base = __vmread(GUEST_TR_BASE); break;
770 case x86_seg_gdtr: base = __vmread(GUEST_GDTR_BASE); break;
771 case x86_seg_idtr: base = __vmread(GUEST_IDTR_BASE); break;
772 case x86_seg_ldtr: base = __vmread(GUEST_LDTR_BASE); break;
773 default: BUG(); break;
774 }
776 return base;
777 }
779 static void vmx_get_segment_register(struct vcpu *v, enum x86_segment seg,
780 struct segment_register *reg)
781 {
782 uint32_t attr = 0;
784 ASSERT(v == current);
786 switch ( seg )
787 {
788 case x86_seg_cs:
789 reg->sel = __vmread(GUEST_CS_SELECTOR);
790 reg->limit = __vmread(GUEST_CS_LIMIT);
791 reg->base = __vmread(GUEST_CS_BASE);
792 attr = __vmread(GUEST_CS_AR_BYTES);
793 break;
794 case x86_seg_ds:
795 reg->sel = __vmread(GUEST_DS_SELECTOR);
796 reg->limit = __vmread(GUEST_DS_LIMIT);
797 reg->base = __vmread(GUEST_DS_BASE);
798 attr = __vmread(GUEST_DS_AR_BYTES);
799 break;
800 case x86_seg_es:
801 reg->sel = __vmread(GUEST_ES_SELECTOR);
802 reg->limit = __vmread(GUEST_ES_LIMIT);
803 reg->base = __vmread(GUEST_ES_BASE);
804 attr = __vmread(GUEST_ES_AR_BYTES);
805 break;
806 case x86_seg_fs:
807 reg->sel = __vmread(GUEST_FS_SELECTOR);
808 reg->limit = __vmread(GUEST_FS_LIMIT);
809 reg->base = __vmread(GUEST_FS_BASE);
810 attr = __vmread(GUEST_FS_AR_BYTES);
811 break;
812 case x86_seg_gs:
813 reg->sel = __vmread(GUEST_GS_SELECTOR);
814 reg->limit = __vmread(GUEST_GS_LIMIT);
815 reg->base = __vmread(GUEST_GS_BASE);
816 attr = __vmread(GUEST_GS_AR_BYTES);
817 break;
818 case x86_seg_ss:
819 reg->sel = __vmread(GUEST_SS_SELECTOR);
820 reg->limit = __vmread(GUEST_SS_LIMIT);
821 reg->base = __vmread(GUEST_SS_BASE);
822 attr = __vmread(GUEST_SS_AR_BYTES);
823 break;
824 case x86_seg_tr:
825 reg->sel = __vmread(GUEST_TR_SELECTOR);
826 reg->limit = __vmread(GUEST_TR_LIMIT);
827 reg->base = __vmread(GUEST_TR_BASE);
828 attr = __vmread(GUEST_TR_AR_BYTES);
829 break;
830 case x86_seg_gdtr:
831 reg->limit = __vmread(GUEST_GDTR_LIMIT);
832 reg->base = __vmread(GUEST_GDTR_BASE);
833 break;
834 case x86_seg_idtr:
835 reg->limit = __vmread(GUEST_IDTR_LIMIT);
836 reg->base = __vmread(GUEST_IDTR_BASE);
837 break;
838 case x86_seg_ldtr:
839 reg->sel = __vmread(GUEST_LDTR_SELECTOR);
840 reg->limit = __vmread(GUEST_LDTR_LIMIT);
841 reg->base = __vmread(GUEST_LDTR_BASE);
842 attr = __vmread(GUEST_LDTR_AR_BYTES);
843 break;
844 default:
845 BUG();
846 }
848 reg->attr.bytes = (attr & 0xff) | ((attr >> 4) & 0xf00);
849 /* Unusable flag is folded into Present flag. */
850 if ( attr & (1u<<16) )
851 reg->attr.fields.p = 0;
852 }
854 static void vmx_set_segment_register(struct vcpu *v, enum x86_segment seg,
855 struct segment_register *reg)
856 {
857 uint32_t attr;
859 ASSERT(v == current);
861 attr = reg->attr.bytes;
862 attr = ((attr & 0xf00) << 4) | (attr & 0xff);
864 /* Not-present must mean unusable. */
865 if ( !reg->attr.fields.p )
866 attr |= (1u << 16);
868 switch ( seg )
869 {
870 case x86_seg_cs:
871 __vmwrite(GUEST_CS_SELECTOR, reg->sel);
872 __vmwrite(GUEST_CS_LIMIT, reg->limit);
873 __vmwrite(GUEST_CS_BASE, reg->base);
874 __vmwrite(GUEST_CS_AR_BYTES, attr);
875 break;
876 case x86_seg_ds:
877 __vmwrite(GUEST_DS_SELECTOR, reg->sel);
878 __vmwrite(GUEST_DS_LIMIT, reg->limit);
879 __vmwrite(GUEST_DS_BASE, reg->base);
880 __vmwrite(GUEST_DS_AR_BYTES, attr);
881 break;
882 case x86_seg_es:
883 __vmwrite(GUEST_ES_SELECTOR, reg->sel);
884 __vmwrite(GUEST_ES_LIMIT, reg->limit);
885 __vmwrite(GUEST_ES_BASE, reg->base);
886 __vmwrite(GUEST_ES_AR_BYTES, attr);
887 break;
888 case x86_seg_fs:
889 __vmwrite(GUEST_FS_SELECTOR, reg->sel);
890 __vmwrite(GUEST_FS_LIMIT, reg->limit);
891 __vmwrite(GUEST_FS_BASE, reg->base);
892 __vmwrite(GUEST_FS_AR_BYTES, attr);
893 break;
894 case x86_seg_gs:
895 __vmwrite(GUEST_GS_SELECTOR, reg->sel);
896 __vmwrite(GUEST_GS_LIMIT, reg->limit);
897 __vmwrite(GUEST_GS_BASE, reg->base);
898 __vmwrite(GUEST_GS_AR_BYTES, attr);
899 break;
900 case x86_seg_ss:
901 __vmwrite(GUEST_SS_SELECTOR, reg->sel);
902 __vmwrite(GUEST_SS_LIMIT, reg->limit);
903 __vmwrite(GUEST_SS_BASE, reg->base);
904 __vmwrite(GUEST_SS_AR_BYTES, attr);
905 break;
906 case x86_seg_tr:
907 __vmwrite(GUEST_TR_SELECTOR, reg->sel);
908 __vmwrite(GUEST_TR_LIMIT, reg->limit);
909 __vmwrite(GUEST_TR_BASE, reg->base);
910 __vmwrite(GUEST_TR_AR_BYTES, attr);
911 break;
912 case x86_seg_gdtr:
913 __vmwrite(GUEST_GDTR_LIMIT, reg->limit);
914 __vmwrite(GUEST_GDTR_BASE, reg->base);
915 break;
916 case x86_seg_idtr:
917 __vmwrite(GUEST_IDTR_LIMIT, reg->limit);
918 __vmwrite(GUEST_IDTR_BASE, reg->base);
919 break;
920 case x86_seg_ldtr:
921 __vmwrite(GUEST_LDTR_SELECTOR, reg->sel);
922 __vmwrite(GUEST_LDTR_LIMIT, reg->limit);
923 __vmwrite(GUEST_LDTR_BASE, reg->base);
924 __vmwrite(GUEST_LDTR_AR_BYTES, attr);
925 break;
926 default:
927 BUG();
928 }
929 }
931 /* Make sure that Xen intercepts any FP accesses from current. */
932 static void vmx_stts(struct vcpu *v)
933 {
934 /* VMX depends on operating on the current vcpu */
935 ASSERT(v == current);
937 /*
938 * If the guest does not have TS enabled then we must cause and handle an
939 * exception on first use of the FPU. If the guest *does* have TS enabled
940 * then this is not necessary: no FPU activity can occur until the guest
941 * clears CR0.TS, and we will initialise the FPU when that happens.
942 */
943 if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
944 {
945 v->arch.hvm_vcpu.hw_cr[0] |= X86_CR0_TS;
946 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
947 __vm_set_bit(EXCEPTION_BITMAP, TRAP_no_device);
948 }
949 }
951 static void vmx_set_tsc_offset(struct vcpu *v, u64 offset)
952 {
953 vmx_vmcs_enter(v);
954 __vmwrite(TSC_OFFSET, offset);
955 #if defined (__i386__)
956 __vmwrite(TSC_OFFSET_HIGH, offset >> 32);
957 #endif
958 vmx_vmcs_exit(v);
959 }
961 static void vmx_init_ap_context(
962 struct vcpu_guest_context *ctxt, int vcpuid, int trampoline_vector)
963 {
964 memset(ctxt, 0, sizeof(*ctxt));
965 ctxt->user_regs.eip = VMXASSIST_BASE;
966 ctxt->user_regs.edx = vcpuid;
967 ctxt->user_regs.ebx = trampoline_vector;
968 }
970 void do_nmi(struct cpu_user_regs *);
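/* Each 32-byte slot in the hypercall page is filled with the stub
 * "mov $nr, %eax; vmcall; ret", so a guest invokes hypercall nr by calling
 * into hypercall_page + nr * 32. */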
972 static void vmx_init_hypercall_page(struct domain *d, void *hypercall_page)
973 {
974 char *p;
975 int i;
977 for ( i = 0; i < (PAGE_SIZE / 32); i++ )
978 {
979 p = (char *)(hypercall_page + (i * 32));
980 *(u8 *)(p + 0) = 0xb8; /* mov imm32, %eax */
981 *(u32 *)(p + 1) = i;
982 *(u8 *)(p + 5) = 0x0f; /* vmcall */
983 *(u8 *)(p + 6) = 0x01;
984 *(u8 *)(p + 7) = 0xc1;
985 *(u8 *)(p + 8) = 0xc3; /* ret */
986 }
988 /* Don't support HYPERVISOR_iret at the moment */
989 *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
990 }
992 static enum hvm_intblk vmx_interrupt_blocked(
993 struct vcpu *v, struct hvm_intack intack)
994 {
995 unsigned long intr_shadow;
997 intr_shadow = __vmread(GUEST_INTERRUPTIBILITY_INFO);
999 if ( intr_shadow & (VMX_INTR_SHADOW_STI|VMX_INTR_SHADOW_MOV_SS) )
1000 return hvm_intblk_shadow;
1002 if ( intack.source == hvm_intsrc_nmi )
1003 return ((intr_shadow & VMX_INTR_SHADOW_NMI) ?
1004 hvm_intblk_nmi_iret : hvm_intblk_none);
1006 ASSERT((intack.source == hvm_intsrc_pic) ||
1007 (intack.source == hvm_intsrc_lapic));
1009 if ( !(guest_cpu_user_regs()->eflags & X86_EFLAGS_IF) )
1010 return hvm_intblk_rflags_ie;
1012 if ( intack.source == hvm_intsrc_lapic )
1013 {
1014 uint32_t tpr = vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI) & 0xF0;
1015 if ( (tpr >> 4) >= (intack.vector >> 4) )
1016 return hvm_intblk_tpr;
1017 }
1019 return hvm_intblk_none;
1020 }
1022 static void vmx_update_host_cr3(struct vcpu *v)
1023 {
1024 ASSERT((v == current) || !vcpu_runnable(v));
1025 vmx_vmcs_enter(v);
1026 __vmwrite(HOST_CR3, v->arch.cr3);
1027 vmx_vmcs_exit(v);
1028 }
1030 static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr)
1031 {
1032 ASSERT((v == current) || !vcpu_runnable(v));
1034 vmx_vmcs_enter(v);
1036 switch ( cr )
1037 {
1038 case 0:
1039 /* TS cleared? Then initialise FPU now. */
1040 if ( (v == current) && !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) &&
1041 (v->arch.hvm_vcpu.hw_cr[0] & X86_CR0_TS) )
1042 {
1043 setup_fpu(v);
1044 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
1045 }
1047 v->arch.hvm_vcpu.hw_cr[0] =
1048 v->arch.hvm_vcpu.guest_cr[0] |
1049 X86_CR0_PE | X86_CR0_NE | X86_CR0_PG | X86_CR0_WP;
1050 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
1051 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[0]);
1052 break;
1053 case 2:
1054 /* CR2 is updated in exit stub. */
1055 break;
1056 case 3:
1057 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr[3]);
1058 break;
1059 case 4:
1060 v->arch.hvm_vcpu.hw_cr[4] =
1061 v->arch.hvm_vcpu.guest_cr[4] | HVM_CR4_HOST_MASK;
1062 __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
1063 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[4]);
1064 break;
1065 default:
1066 BUG();
1067 }
1069 vmx_vmcs_exit(v);
1070 }
1072 static void vmx_update_guest_efer(struct vcpu *v)
1073 {
1074 #ifdef __x86_64__
1075 unsigned long vm_entry_value;
1077 ASSERT((v == current) || !vcpu_runnable(v));
1079 vmx_vmcs_enter(v);
1081 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
1082 if ( v->arch.hvm_vcpu.guest_efer & EFER_LMA )
1083 vm_entry_value |= VM_ENTRY_IA32E_MODE;
1084 else
1085 vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
1086 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1088 vmx_vmcs_exit(v);
1089 #endif
1091 if ( v == current )
1092 write_efer((read_efer() & ~(EFER_NX|EFER_SCE)) |
1093 (v->arch.hvm_vcpu.guest_efer & (EFER_NX | EFER_SCE)));
1094 }
1096 static void vmx_flush_guest_tlbs(void)
1097 {
1098 /* No tagged TLB support on VMX yet. The fact that we're in Xen
1099 * at all means any guest will have a clean TLB when it's next run,
1100 * because VMRESUME will flush it for us. */
1101 }
1103 static void vmx_inject_exception(
1104 unsigned int trapnr, int errcode, unsigned long cr2)
1105 {
1106 struct vcpu *curr = current;
1108 vmx_inject_hw_exception(curr, trapnr, errcode);
1110 if ( trapnr == TRAP_page_fault )
1111 curr->arch.hvm_vcpu.guest_cr[2] = cr2;
1113 if ( (trapnr == TRAP_debug) &&
1114 (guest_cpu_user_regs()->eflags & X86_EFLAGS_TF) )
1115 {
1116 __restore_debug_registers(curr);
1117 write_debugreg(6, read_debugreg(6) | 0x4000);
1118 }
1119 }
1121 static void vmx_update_vtpr(struct vcpu *v, unsigned long value)
1122 {
1123 /* VMX doesn't have a V_TPR field */
1124 }
1126 static int vmx_event_pending(struct vcpu *v)
1127 {
1128 ASSERT(v == current);
1129 return (__vmread(VM_ENTRY_INTR_INFO) & INTR_INFO_VALID_MASK);
1130 }
1132 static struct hvm_function_table vmx_function_table = {
1133 .name = "VMX",
1134 .domain_initialise = vmx_domain_initialise,
1135 .domain_destroy = vmx_domain_destroy,
1136 .vcpu_initialise = vmx_vcpu_initialise,
1137 .vcpu_destroy = vmx_vcpu_destroy,
1138 .save_cpu_ctxt = vmx_save_vmcs_ctxt,
1139 .load_cpu_ctxt = vmx_load_vmcs_ctxt,
1140 .interrupt_blocked = vmx_interrupt_blocked,
1141 .guest_x86_mode = vmx_guest_x86_mode,
1142 .get_segment_base = vmx_get_segment_base,
1143 .get_segment_register = vmx_get_segment_register,
1144 .set_segment_register = vmx_set_segment_register,
1145 .update_host_cr3 = vmx_update_host_cr3,
1146 .update_guest_cr = vmx_update_guest_cr,
1147 .update_guest_efer = vmx_update_guest_efer,
1148 .flush_guest_tlbs = vmx_flush_guest_tlbs,
1149 .update_vtpr = vmx_update_vtpr,
1150 .stts = vmx_stts,
1151 .set_tsc_offset = vmx_set_tsc_offset,
1152 .inject_exception = vmx_inject_exception,
1153 .init_ap_context = vmx_init_ap_context,
1154 .init_hypercall_page = vmx_init_hypercall_page,
1155 .event_pending = vmx_event_pending,
1156 .cpu_up = vmx_cpu_up,
1157 .cpu_down = vmx_cpu_down,
1158 };
1160 void start_vmx(void)
1161 {
1162 static int bootstrapped;
1164 vmx_save_host_msrs();
1166 if ( bootstrapped )
1167 {
1168 if ( hvm_enabled && !vmx_cpu_up() )
1169 {
1170 printk("VMX: FATAL: failed to initialise CPU%d!\n",
1171 smp_processor_id());
1172 BUG();
1173 }
1174 return;
1175 }
1177 bootstrapped = 1;
1179 /* Xen does not fill x86_capability words except 0. */
1180 boot_cpu_data.x86_capability[4] = cpuid_ecx(1);
1182 if ( !test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability) )
1183 return;
1185 set_in_cr4(X86_CR4_VMXE);
1187 if ( !vmx_cpu_up() )
1188 {
1189 printk("VMX: failed to initialise.\n");
1190 return;
1191 }
1193 setup_vmcs_dump();
1195 hvm_enable(&vmx_function_table);
1196 }
1198 /*
1199 * Not all cases receive valid value in the VM-exit instruction length field.
1200 * Callers must know what they're doing!
1201 */
1202 static int __get_instruction_length(void)
1203 {
1204 int len;
1205 len = __vmread(VM_EXIT_INSTRUCTION_LEN); /* Safe: callers audited */
1206 BUG_ON((len < 1) || (len > 15));
1207 return len;
1208 }
1210 static void __update_guest_eip(unsigned long inst_len)
1211 {
1212 struct cpu_user_regs *regs = guest_cpu_user_regs();
1213 unsigned long x;
1215 regs->eip += inst_len;
1216 regs->eflags &= ~X86_EFLAGS_RF;
1218 x = __vmread(GUEST_INTERRUPTIBILITY_INFO);
1219 if ( x & (VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS) )
1220 {
1221 x &= ~(VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS);
1222 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, x);
1223 }
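/* If the guest was single-stepping (EFLAGS.TF) across the emulated
 * instruction, deliver the debug trap it would have received on real
 * hardware. */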
1225 if ( regs->eflags & X86_EFLAGS_TF )
1226 vmx_inject_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE, 0);
1227 }
1229 static void vmx_do_no_device_fault(void)
1230 {
1231 struct vcpu *v = current;
1233 setup_fpu(current);
1234 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
1236 /* Disable TS in guest CR0 unless the guest wants the exception too. */
1237 if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
1238 {
1239 v->arch.hvm_vcpu.hw_cr[0] &= ~X86_CR0_TS;
1240 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
1241 }
1242 }
1244 #define bitmaskof(idx) (1U << ((idx) & 31))
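/* bitmaskof() turns an X86_FEATURE_* bit number into a mask within its
 * 32-bit capability word. */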
1245 static void vmx_do_cpuid(struct cpu_user_regs *regs)
1246 {
1247 unsigned int input = regs->eax;
1248 unsigned int eax, ebx, ecx, edx;
1250 if ( input == 0x40000003 )
1251 {
1252 /*
1253 * NB. Unsupported interface for private use of VMXASSIST only.
1254 * Note that this leaf lives at <max-hypervisor-leaf> + 1.
1255 */
1256 u64 value = ((u64)regs->edx << 32) | (u32)regs->ecx;
1257 p2m_type_t p2mt;
1258 unsigned long mfn;
1259 struct vcpu *v = current;
1260 char *p;
1262 mfn = mfn_x(gfn_to_mfn_current(value >> PAGE_SHIFT, &p2mt));
1264 gdprintk(XENLOG_INFO, "Input address is 0x%"PRIx64".\n", value);
1266 /* 8-byte aligned valid pseudophys address from vmxassist, please. */
1267 if ( (value & 7) || !p2m_is_ram(p2mt) ||
1268 !v->arch.hvm_vmx.vmxassist_enabled )
1269 {
1270 domain_crash(v->domain);
1271 return;
1272 }
1273 ASSERT(mfn_valid(mfn));
1275 p = map_domain_page(mfn);
1276 value = *((uint64_t *)(p + (value & (PAGE_SIZE - 1))));
1277 unmap_domain_page(p);
1279 gdprintk(XENLOG_INFO, "Output value is 0x%"PRIx64".\n", value);
1280 regs->ecx = (u32)value;
1281 regs->edx = (u32)(value >> 32);
1282 return;
1283 }
1285 hvm_cpuid(input, &eax, &ebx, &ecx, &edx);
1287 switch ( input )
1288 {
1289 case 0x00000001:
1290 ecx &= ~VMX_VCPU_CPUID_L1_ECX_RESERVED;
1291 ebx &= NUM_THREADS_RESET_MASK;
1292 ecx &= ~(bitmaskof(X86_FEATURE_VMXE) |
1293 bitmaskof(X86_FEATURE_EST) |
1294 bitmaskof(X86_FEATURE_TM2) |
1295 bitmaskof(X86_FEATURE_CID) |
1296 bitmaskof(X86_FEATURE_PDCM) |
1297 bitmaskof(X86_FEATURE_DSCPL));
1298 edx &= ~(bitmaskof(X86_FEATURE_HT) |
1299 bitmaskof(X86_FEATURE_ACPI) |
1300 bitmaskof(X86_FEATURE_ACC) |
1301 bitmaskof(X86_FEATURE_DS));
1302 break;
1304 case 0x00000004:
1305 cpuid_count(input, regs->ecx, &eax, &ebx, &ecx, &edx);
1306 eax &= NUM_CORES_RESET_MASK;
1307 break;
1309 case 0x00000006:
1310 case 0x00000009:
1311 case 0x0000000A:
1312 eax = ebx = ecx = edx = 0;
1313 break;
1315 case 0x80000001:
1316 /* Only a few features are advertised in Intel's 0x80000001. */
1317 ecx &= (bitmaskof(X86_FEATURE_LAHF_LM));
1318 edx &= (bitmaskof(X86_FEATURE_NX) |
1319 bitmaskof(X86_FEATURE_LM) |
1320 bitmaskof(X86_FEATURE_SYSCALL));
1321 break;
1322 }
1324 regs->eax = eax;
1325 regs->ebx = ebx;
1326 regs->ecx = ecx;
1327 regs->edx = edx;
1329 HVMTRACE_3D(CPUID, current, input,
1330 ((uint64_t)eax << 32) | ebx, ((uint64_t)ecx << 32) | edx);
1331 }
1333 #define CASE_GET_REG_P(REG, reg) \
1334 case REG_ ## REG: reg_p = (unsigned long *)&(regs->reg); break
1336 #ifdef __i386__
1337 #define CASE_EXTEND_GET_REG_P
1338 #else
1339 #define CASE_EXTEND_GET_REG_P \
1340 CASE_GET_REG_P(R8, r8); \
1341 CASE_GET_REG_P(R9, r9); \
1342 CASE_GET_REG_P(R10, r10); \
1343 CASE_GET_REG_P(R11, r11); \
1344 CASE_GET_REG_P(R12, r12); \
1345 CASE_GET_REG_P(R13, r13); \
1346 CASE_GET_REG_P(R14, r14); \
1347 CASE_GET_REG_P(R15, r15)
1348 #endif
1350 static void vmx_dr_access(unsigned long exit_qualification,
1351 struct cpu_user_regs *regs)
1352 {
1353 struct vcpu *v = current;
1355 HVMTRACE_0D(DR_WRITE, v);
1357 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
1358 __restore_debug_registers(v);
1360 /* Allow guest direct access to DR registers */
1361 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MOV_DR_EXITING;
1362 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
1363 }
1365 /*
1366 * Invalidate the TLB for va. Invalidate the shadow page corresponding
1367 * to the address va.
1368 */
1369 static void vmx_do_invlpg(unsigned long va)
1370 {
1371 struct vcpu *v = current;
1373 HVMTRACE_2D(INVLPG, v, /*invlpga=*/ 0, va);
1375 /*
1376 * We do the safest things first, then try to update the shadow
1377 * by copying from the guest.
1378 */
1379 paging_invlpg(v, va);
1380 }
1382 /* Get segment for OUTS according to guest instruction. */
1383 static enum x86_segment vmx_outs_get_segment(
1384 int long_mode, unsigned long eip, int inst_len)
1385 {
1386 unsigned char inst[MAX_INST_LEN];
1387 enum x86_segment seg = x86_seg_ds;
1388 int i;
1389 extern int inst_copy_from_guest(unsigned char *, unsigned long, int);
1391 if ( likely(cpu_has_vmx_ins_outs_instr_info) )
1392 {
1393 unsigned int instr_info = __vmread(VMX_INSTRUCTION_INFO);
1395 /* Get segment register according to bits 17:15. */
1396 switch ( (instr_info >> 15) & 7 )
1397 {
1398 case 0: seg = x86_seg_es; break;
1399 case 1: seg = x86_seg_cs; break;
1400 case 2: seg = x86_seg_ss; break;
1401 case 3: seg = x86_seg_ds; break;
1402 case 4: seg = x86_seg_fs; break;
1403 case 5: seg = x86_seg_gs; break;
1404 default: BUG();
1405 }
1407 goto out;
1408 }
1410 if ( !long_mode )
1411 eip += __vmread(GUEST_CS_BASE);
1413 memset(inst, 0, MAX_INST_LEN);
1414 if ( inst_copy_from_guest(inst, eip, inst_len) != inst_len )
1415 {
1416 gdprintk(XENLOG_ERR, "Get guest instruction failed\n");
1417 domain_crash(current->domain);
1418 goto out;
1419 }
1421 for ( i = 0; i < inst_len; i++ )
1422 {
1423 switch ( inst[i] )
1424 {
1425 case 0xf3: /* REPZ */
1426 case 0xf2: /* REPNZ */
1427 case 0xf0: /* LOCK */
1428 case 0x66: /* data32 */
1429 case 0x67: /* addr32 */
1430 #ifdef __x86_64__
1431 case 0x40 ... 0x4f: /* REX */
1432 #endif
1433 continue;
1434 case 0x2e: /* CS */
1435 seg = x86_seg_cs;
1436 continue;
1437 case 0x36: /* SS */
1438 seg = x86_seg_ss;
1439 continue;
1440 case 0x26: /* ES */
1441 seg = x86_seg_es;
1442 continue;
1443 case 0x64: /* FS */
1444 seg = x86_seg_fs;
1445 continue;
1446 case 0x65: /* GS */
1447 seg = x86_seg_gs;
1448 continue;
1449 case 0x3e: /* DS */
1450 seg = x86_seg_ds;
1451 continue;
1452 }
1453 }
1455 out:
1456 return seg;
1457 }
1459 static int vmx_str_pio_check_descriptor(int long_mode, unsigned long eip,
1460 int inst_len, enum x86_segment seg,
1461 unsigned long *base, u32 *limit,
1462 u32 *ar_bytes)
1463 {
1464 enum vmcs_field ar_field, base_field, limit_field;
1466 *base = 0;
1467 *limit = 0;
1468 if ( seg != x86_seg_es )
1469 seg = vmx_outs_get_segment(long_mode, eip, inst_len);
1471 switch ( seg )
1472 {
1473 case x86_seg_cs:
1474 ar_field = GUEST_CS_AR_BYTES;
1475 base_field = GUEST_CS_BASE;
1476 limit_field = GUEST_CS_LIMIT;
1477 break;
1478 case x86_seg_ds:
1479 ar_field = GUEST_DS_AR_BYTES;
1480 base_field = GUEST_DS_BASE;
1481 limit_field = GUEST_DS_LIMIT;
1482 break;
1483 case x86_seg_es:
1484 ar_field = GUEST_ES_AR_BYTES;
1485 base_field = GUEST_ES_BASE;
1486 limit_field = GUEST_ES_LIMIT;
1487 break;
1488 case x86_seg_fs:
1489 ar_field = GUEST_FS_AR_BYTES;
1490 base_field = GUEST_FS_BASE;
1491 limit_field = GUEST_FS_LIMIT;
1492 break;
1493 case x86_seg_gs:
1494 ar_field = GUEST_GS_AR_BYTES;
1495 base_field = GUEST_GS_BASE;
1496 limit_field = GUEST_GS_LIMIT;
1497 break;
1498 case x86_seg_ss:
1499 ar_field = GUEST_SS_AR_BYTES;
1500 base_field = GUEST_SS_BASE;
1501 limit_field = GUEST_SS_LIMIT;
1502 break;
1503 default:
1504 BUG();
1505 return 0;
1506 }
1508 if ( !long_mode || seg == x86_seg_fs || seg == x86_seg_gs )
1509 {
1510 *base = __vmread(base_field);
1511 *limit = __vmread(limit_field);
1512 }
1513 *ar_bytes = __vmread(ar_field);
1515 return !(*ar_bytes & X86_SEG_AR_SEG_UNUSABLE);
1516 }
1519 static int vmx_str_pio_check_limit(u32 limit, unsigned int size,
1520 u32 ar_bytes, unsigned long addr,
1521 unsigned long base, int df,
1522 unsigned long *count)
1523 {
1524 unsigned long ea = addr - base;
1526 /* Offset must be within limits. */
1527 ASSERT(ea == (u32)ea);
1528 if ( (u32)(ea + size - 1) < (u32)ea ||
1529 (ar_bytes & 0xc) != 0x4 ? ea + size - 1 > limit
1530 : ea <= limit )
1531 return 0;
1533 /* Check the limit for repeated instructions, as above we checked
1534 only the first instance. Truncate the count if a limit violation
1535 would occur. Note that the checking is not necessary for page
1536 granular segments as transfers crossing page boundaries will be
1537 broken up anyway. */
1538 if ( !(ar_bytes & X86_SEG_AR_GRANULARITY) && *count > 1 )
1539 {
1540 if ( (ar_bytes & 0xc) != 0x4 )
1541 {
1542 /* expand-up */
1543 if ( !df )
1544 {
1545 if ( ea + *count * size - 1 < ea ||
1546 ea + *count * size - 1 > limit )
1547 *count = (limit + 1UL - ea) / size;
1548 }
1549 else
1550 {
1551 if ( *count - 1 > ea / size )
1552 *count = ea / size + 1;
1553 }
1554 }
1555 else
1556 {
1557 /* expand-down */
1558 if ( !df )
1559 {
1560 if ( *count - 1 > -(s32)ea / size )
1561 *count = -(s32)ea / size + 1UL;
1562 }
1563 else
1564 {
1565 if ( ea < (*count - 1) * size ||
1566 ea - (*count - 1) * size <= limit )
1567 *count = (ea - limit - 1) / size + 1;
1568 }
1569 }
1570 ASSERT(*count);
1571 }
1573 return 1;
1574 }
1576 #ifdef __x86_64__
1577 static int vmx_str_pio_lm_check_limit(struct cpu_user_regs *regs,
1578 unsigned int size,
1579 unsigned long addr,
1580 unsigned long *count)
1581 {
1582 if ( !is_canonical_address(addr) ||
1583 !is_canonical_address(addr + size - 1) )
1584 return 0;
1586 if ( *count > (1UL << 48) / size )
1587 *count = (1UL << 48) / size;
1589 if ( !(regs->eflags & EF_DF) )
1590 {
1591 if ( addr + *count * size - 1 < addr ||
1592 !is_canonical_address(addr + *count * size - 1) )
1593 *count = (addr & ~((1UL << 48) - 1)) / size;
1594 }
1595 else
1596 {
1597 if ( (*count - 1) * size > addr ||
1598 !is_canonical_address(addr + (*count - 1) * size) )
1599 *count = (addr & ~((1UL << 48) - 1)) / size + 1;
1600 }
1602 ASSERT(*count);
1604 return 1;
1605 }
1606 #endif
1608 static void vmx_send_str_pio(struct cpu_user_regs *regs,
1609 struct hvm_io_op *pio_opp,
1610 unsigned long inst_len, unsigned int port,
1611 int sign, unsigned int size, int dir,
1612 int df, unsigned long addr,
1613 unsigned long paddr, unsigned long count)
1614 {
1615 /*
1616 * Handle string pio instructions that cross pages or that
1617 * are unaligned. See the comments in hvm_domain.c/handle_mmio()
1618 */
1619 if ( (addr & PAGE_MASK) != ((addr + size - 1) & PAGE_MASK) ) {
1620 unsigned long value = 0;
1622 pio_opp->flags |= OVERLAP;
1624 if ( dir == IOREQ_WRITE ) /* OUTS */
1625 {
1626 if ( hvm_paging_enabled(current) )
1627 {
1628 int rv = hvm_copy_from_guest_virt(&value, addr, size);
1629 if ( rv != 0 )
1630 {
1631 /* Failed on the page-spanning copy. Inject PF into
1632 * the guest for the address where we failed. */
1633 addr += size - rv;
1634 gdprintk(XENLOG_DEBUG, "Pagefault reading non-io side "
1635 "of a page-spanning PIO: va=%#lx\n", addr);
1636 vmx_inject_exception(TRAP_page_fault, 0, addr);
1637 return;
1638 }
1639 }
1640 else
1641 (void) hvm_copy_from_guest_phys(&value, addr, size);
1642 } else /* dir != IOREQ_WRITE */
1643 /* Remember where to write the result, as a *VA*.
1644 * Must be a VA so we can handle the page overlap
1645 * correctly in hvm_pio_assist() */
1646 pio_opp->addr = addr;
1648 if ( count == 1 )
1649 regs->eip += inst_len;
1651 send_pio_req(port, 1, size, value, dir, df, 0);
1652 } else {
1653 unsigned long last_addr = sign > 0 ? addr + count * size - 1
1654 : addr - (count - 1) * size;
1656 if ( (addr & PAGE_MASK) != (last_addr & PAGE_MASK) )
1657 {
1658 if ( sign > 0 )
1659 count = (PAGE_SIZE - (addr & ~PAGE_MASK)) / size;
1660 else
1661 count = (addr & ~PAGE_MASK) / size + 1;
1662 } else
1663 regs->eip += inst_len;
1665 send_pio_req(port, count, size, paddr, dir, df, 1);
1666 }
1667 }
1669 static void vmx_do_str_pio(unsigned long exit_qualification,
1670 unsigned long inst_len,
1671 struct cpu_user_regs *regs,
1672 struct hvm_io_op *pio_opp)
1673 {
1674 unsigned int port, size;
1675 int dir, df, vm86;
1676 unsigned long addr, count = 1, base;
1677 paddr_t paddr;
1678 unsigned long gfn;
1679 u32 ar_bytes, limit, pfec;
1680 int sign;
1681 int long_mode = 0;
1683 vm86 = regs->eflags & X86_EFLAGS_VM ? 1 : 0;
1684 df = regs->eflags & X86_EFLAGS_DF ? 1 : 0;
1686 if ( test_bit(6, &exit_qualification) )
1687 port = (exit_qualification >> 16) & 0xFFFF;
1688 else
1689 port = regs->edx & 0xffff;
1691 size = (exit_qualification & 7) + 1;
1692 dir = test_bit(3, &exit_qualification); /* direction */
1694 if ( dir == IOREQ_READ )
1695 HVMTRACE_2D(IO_READ, current, port, size);
1696 else
1697 HVMTRACE_2D(IO_WRITE, current, port, size);
1699 sign = regs->eflags & X86_EFLAGS_DF ? -1 : 1;
1700 ar_bytes = __vmread(GUEST_CS_AR_BYTES);
1701 if ( hvm_long_mode_enabled(current) &&
1702 (ar_bytes & X86_SEG_AR_CS_LM_ACTIVE) )
1703 long_mode = 1;
1704 addr = __vmread(GUEST_LINEAR_ADDRESS);
1706 if ( test_bit(5, &exit_qualification) ) { /* "rep" prefix */
1707 pio_opp->flags |= REPZ;
1708 count = regs->ecx;
1709 if ( !long_mode &&
1710 (vm86 || !(ar_bytes & X86_SEG_AR_DEF_OP_SIZE)) )
1711 count &= 0xFFFF;
1712 }
1714 /*
1715 * In protected mode, guest linear address is invalid if the
1716 * selector is null.
1717 */
1718 if ( !vmx_str_pio_check_descriptor(long_mode, regs->eip, inst_len,
1719 dir==IOREQ_WRITE ? x86_seg_ds :
1720 x86_seg_es, &base, &limit,
1721 &ar_bytes) ) {
1722 if ( !long_mode ) {
1723 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1724 return;
1725 }
1726 addr = dir == IOREQ_WRITE ? base + regs->esi : regs->edi;
1727 }
1729 if ( !long_mode )
1730 {
1731 /* Segment must be readable for outs and writeable for ins. */
1732 if ( ((dir == IOREQ_WRITE)
1733 ? ((ar_bytes & 0xa) == 0x8)
1734 : ((ar_bytes & 0xa) != 0x2)) ||
1735 !vmx_str_pio_check_limit(limit, size, ar_bytes,
1736 addr, base, df, &count) )
1737 {
1738 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1739 return;
1740 }
1741 }
1742 #ifdef __x86_64__
1743 else if ( !vmx_str_pio_lm_check_limit(regs, size, addr, &count) )
1744 {
1745 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1746 return;
1747 }
1748 #endif
1750 /* Translate the address to a physical address */
1751 pfec = PFEC_page_present;
1752 if ( dir == IOREQ_READ ) /* Read from PIO --> write to RAM */
1753 pfec |= PFEC_write_access;
1754 if ( ring_3(regs) )
1755 pfec |= PFEC_user_mode;
1756 gfn = paging_gva_to_gfn(current, addr, &pfec);
1757 if ( gfn == INVALID_GFN )
1758 {
1759 /* The guest does not have the RAM address mapped.
1760 * Need to send in a page fault */
1761 vmx_inject_exception(TRAP_page_fault, pfec, addr);
1762 return;
1763 }
1764 paddr = (paddr_t)gfn << PAGE_SHIFT | (addr & ~PAGE_MASK);
1766 vmx_send_str_pio(regs, pio_opp, inst_len, port, sign,
1767 size, dir, df, addr, paddr, count);
1768 }
1770 static void vmx_io_instruction(unsigned long exit_qualification,
1771 unsigned long inst_len)
1772 {
1773 struct cpu_user_regs *regs;
1774 struct hvm_io_op *pio_opp;
1776 pio_opp = &current->arch.hvm_vcpu.io_op;
1777 pio_opp->instr = INSTR_PIO;
1778 pio_opp->flags = 0;
1780 regs = &pio_opp->io_context;
1782 /* Copy current guest state into io instruction state structure. */
1783 memcpy(regs, guest_cpu_user_regs(), HVM_CONTEXT_STACK_BYTES);
1785 HVM_DBG_LOG(DBG_LEVEL_IO, "vm86 %d, eip=%x:%lx, "
1786 "exit_qualification = %lx",
1787 regs->eflags & X86_EFLAGS_VM ? 1 : 0,
1788 regs->cs, (unsigned long)regs->eip, exit_qualification);
1790 if ( test_bit(4, &exit_qualification) ) /* string instruction */
1791 vmx_do_str_pio(exit_qualification, inst_len, regs, pio_opp);
1792 else
1793 {
1794 unsigned int port, size;
1795 int dir, df;
1797 df = regs->eflags & X86_EFLAGS_DF ? 1 : 0;
1799 if ( test_bit(6, &exit_qualification) )
1800 port = (exit_qualification >> 16) & 0xFFFF;
1801 else
1802 port = regs->edx & 0xffff;
1804 size = (exit_qualification & 7) + 1;
1805 dir = test_bit(3, &exit_qualification); /* direction */
1807 if ( dir == IOREQ_READ )
1808 HVMTRACE_2D(IO_READ, current, port, size);
1809 else
1810 HVMTRACE_3D(IO_WRITE, current, port, size, regs->eax);
1812 if ( port == 0xe9 && dir == IOREQ_WRITE && size == 1 )
1813 hvm_print_line(current, regs->eax); /* guest debug output */
1815 regs->eip += inst_len;
1816 send_pio_req(port, 1, size, regs->eax, dir, df, 0);
1817 }
1818 }
1820 static void vmx_world_save(struct vcpu *v, struct vmx_assist_context *c)
1821 {
1822 struct cpu_user_regs *regs = guest_cpu_user_regs();
1824 c->eip = regs->eip;
1825 c->eip += __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
1826 c->esp = regs->esp;
1827 c->eflags = regs->eflags & ~X86_EFLAGS_RF;
1829 c->cr0 = v->arch.hvm_vcpu.guest_cr[0];
1830 c->cr3 = v->arch.hvm_vcpu.guest_cr[3];
1831 c->cr4 = v->arch.hvm_vcpu.guest_cr[4];
1833 c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
1834 c->idtr_base = __vmread(GUEST_IDTR_BASE);
1836 c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
1837 c->gdtr_base = __vmread(GUEST_GDTR_BASE);
1839 c->cs_sel = __vmread(GUEST_CS_SELECTOR);
1840 c->cs_limit = __vmread(GUEST_CS_LIMIT);
1841 c->cs_base = __vmread(GUEST_CS_BASE);
1842 c->cs_arbytes.bytes = __vmread(GUEST_CS_AR_BYTES);
1844 c->ds_sel = __vmread(GUEST_DS_SELECTOR);
1845 c->ds_limit = __vmread(GUEST_DS_LIMIT);
1846 c->ds_base = __vmread(GUEST_DS_BASE);
1847 c->ds_arbytes.bytes = __vmread(GUEST_DS_AR_BYTES);
1849 c->es_sel = __vmread(GUEST_ES_SELECTOR);
1850 c->es_limit = __vmread(GUEST_ES_LIMIT);
1851 c->es_base = __vmread(GUEST_ES_BASE);
1852 c->es_arbytes.bytes = __vmread(GUEST_ES_AR_BYTES);
1854 c->ss_sel = __vmread(GUEST_SS_SELECTOR);
1855 c->ss_limit = __vmread(GUEST_SS_LIMIT);
1856 c->ss_base = __vmread(GUEST_SS_BASE);
1857 c->ss_arbytes.bytes = __vmread(GUEST_SS_AR_BYTES);
1859 c->fs_sel = __vmread(GUEST_FS_SELECTOR);
1860 c->fs_limit = __vmread(GUEST_FS_LIMIT);
1861 c->fs_base = __vmread(GUEST_FS_BASE);
1862 c->fs_arbytes.bytes = __vmread(GUEST_FS_AR_BYTES);
1864 c->gs_sel = __vmread(GUEST_GS_SELECTOR);
1865 c->gs_limit = __vmread(GUEST_GS_LIMIT);
1866 c->gs_base = __vmread(GUEST_GS_BASE);
1867 c->gs_arbytes.bytes = __vmread(GUEST_GS_AR_BYTES);
1869 c->tr_sel = __vmread(GUEST_TR_SELECTOR);
1870 c->tr_limit = __vmread(GUEST_TR_LIMIT);
1871 c->tr_base = __vmread(GUEST_TR_BASE);
1872 c->tr_arbytes.bytes = __vmread(GUEST_TR_AR_BYTES);
1874 c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR);
1875 c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT);
1876 c->ldtr_base = __vmread(GUEST_LDTR_BASE);
1877 c->ldtr_arbytes.bytes = __vmread(GUEST_LDTR_AR_BYTES);
1878 }
1880 static int vmx_world_restore(struct vcpu *v, struct vmx_assist_context *c)
1881 {
1882 struct cpu_user_regs *regs = guest_cpu_user_regs();
1883 int rc;
1885 rc = vmx_restore_cr0_cr3(v, c->cr0, c->cr3);
1886 if ( rc )
1887 return rc;
1889 regs->eip = c->eip;
1890 regs->esp = c->esp;
1891 regs->eflags = c->eflags | 2;
1893 v->arch.hvm_vcpu.guest_cr[4] = c->cr4;
1894 vmx_update_guest_cr(v, 0);
1895 vmx_update_guest_cr(v, 4);
1897 __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
1898 __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
1900 __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
1901 __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
1903 __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
1904 __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
1905 __vmwrite(GUEST_CS_BASE, c->cs_base);
1906 __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes.bytes);
1908 __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
1909 __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
1910 __vmwrite(GUEST_DS_BASE, c->ds_base);
1911 __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes.bytes);
1913 __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
1914 __vmwrite(GUEST_ES_LIMIT, c->es_limit);
1915 __vmwrite(GUEST_ES_BASE, c->es_base);
1916 __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes.bytes);
1918 __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
1919 __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
1920 __vmwrite(GUEST_SS_BASE, c->ss_base);
1921 __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes.bytes);
1923 __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
1924 __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
1925 __vmwrite(GUEST_FS_BASE, c->fs_base);
1926 __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes.bytes);
1928 __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
1929 __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
1930 __vmwrite(GUEST_GS_BASE, c->gs_base);
1931 __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes.bytes);
1933 __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
1934 __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
1935 __vmwrite(GUEST_TR_BASE, c->tr_base);
1936 __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes.bytes);
1938 __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
1939 __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
1940 __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
1941 __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes.bytes);
1943 paging_update_paging_modes(v);
1944 return 0;
1945 }
1947 enum { VMX_ASSIST_INVOKE = 0, VMX_ASSIST_RESTORE };
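/* VMX_ASSIST_INVOKE switches the vcpu into the vmxassist real-mode emulator
 * (saving the current context); VMX_ASSIST_RESTORE returns to the context
 * that was saved at invocation time. */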
1949 static int vmx_assist(struct vcpu *v, int mode)
1950 {
1951 struct vmx_assist_context c;
1952 struct hvm_hw_vpic *vpic = v->domain->arch.hvm_domain.vpic;
1953 u32 magic, cp;
1955 if ( hvm_copy_from_guest_phys(&magic, VMXASSIST_MAGIC_OFFSET,
1956 sizeof(magic)) )
1957 {
1958 gdprintk(XENLOG_ERR, "No vmxassist: can't execute real mode code\n");
1959 domain_crash(v->domain);
1960 return 0;
1961 }
1963 if ( magic != VMXASSIST_MAGIC )
1964 {
1965 gdprintk(XENLOG_ERR, "vmxassist magic number does not match\n");
1966 domain_crash(v->domain);
1967 return 0;
1968 }
1970 switch ( mode ) {
1971 /*
1972 * Transfer control to vmxassist.
1973 * Store the current context in VMXASSIST_OLD_CONTEXT and load
1974 * the new VMXASSIST_NEW_CONTEXT context. This context was created
1975 * by vmxassist and will transfer control to it.
1976 */
1977 case VMX_ASSIST_INVOKE:
1978 /* save the old context */
1979 if ( hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)) )
1980 goto error;
1981 if ( cp != 0 ) {
1982 vmx_world_save(v, &c);
1983 if ( hvm_copy_to_guest_phys(cp, &c, sizeof(c)) )
1984 goto error;
1985 }
1987 /* restore the new context, this should activate vmxassist */
1988 if ( hvm_copy_from_guest_phys(&cp, VMXASSIST_NEW_CONTEXT, sizeof(cp)) )
1989 goto error;
1990 if ( cp != 0 ) {
1991 if ( hvm_copy_from_guest_phys(&c, cp, sizeof(c)) )
1992 goto error;
1993 if ( vmx_world_restore(v, &c) != 0 )
1994 goto error;
1995 v->arch.hvm_vmx.pm_irqbase[0] = vpic[0].irq_base;
1996 v->arch.hvm_vmx.pm_irqbase[1] = vpic[1].irq_base;
1997 vpic[0].irq_base = NR_EXCEPTION_HANDLER;
1998 vpic[1].irq_base = NR_EXCEPTION_HANDLER + 8;
1999 v->arch.hvm_vmx.vmxassist_enabled = 1;
2000 return 1;
2002 break;
2004 /*
2005 * Restore the VMXASSIST_OLD_CONTEXT that was saved by
2006 * VMX_ASSIST_INVOKE above.
2007 */
2008 case VMX_ASSIST_RESTORE:
2009 /* restore the old context */
2010 if ( hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)) )
2011 goto error;
2012 if ( cp != 0 ) {
2013 if ( hvm_copy_from_guest_phys(&c, cp, sizeof(c)) )
2014 goto error;
2015 if ( vmx_world_restore(v, &c) != 0 )
2016 goto error;
2017 if ( v->arch.hvm_vmx.irqbase_mode ) {
2018 vpic[0].irq_base = c.rm_irqbase[0] & 0xf8;
2019 vpic[1].irq_base = c.rm_irqbase[1] & 0xf8;
2020 } else {
2021 vpic[0].irq_base = v->arch.hvm_vmx.pm_irqbase[0];
2022 vpic[1].irq_base = v->arch.hvm_vmx.pm_irqbase[1];
2024 v->arch.hvm_vmx.vmxassist_enabled = 0;
2025 return 1;
2027 break;
2030 error:
2031 gdprintk(XENLOG_ERR, "Failed to transfer to vmxassist\n");
2032 domain_crash(v->domain);
2033 return 0;
2036 static int vmx_set_cr0(unsigned long value)
2038 struct vcpu *v = current;
2040 if ( hvm_set_cr0(value) == 0 )
2041 return 0;
2043 /*
2044 * VMX does not implement real-mode virtualization. We emulate
2045 * real-mode by performing a world switch to VMXAssist whenever
2046 * a partition disables the CR0.PE bit.
2047 */
2048 if ( !(value & X86_CR0_PE) )
2050 if ( vmx_assist(v, VMX_ASSIST_INVOKE) )
2051 return 0; /* do not update eip! */
2053 else if ( v->arch.hvm_vmx.vmxassist_enabled )
2055 if ( vmx_assist(v, VMX_ASSIST_RESTORE) )
2056 return 0; /* do not update eip! */
2059 return 1;
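/*
 * In outline (as implemented above): clearing CR0.PE cannot be expressed
 * to VMX directly, so the write is turned into a world switch into
 * vmxassist (VMX_ASSIST_INVOKE); when the guest sets PE again while
 * vmxassist_enabled is set, the saved protected-mode world is switched
 * back in (VMX_ASSIST_RESTORE). In both cases the caller must not advance
 * EIP, because a completely new register context has just been loaded.
 */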
2062 #define CASE_SET_REG(REG, reg) \
2063 case REG_ ## REG: regs->reg = value; break
2064 #define CASE_GET_REG(REG, reg) \
2065 case REG_ ## REG: value = regs->reg; break
2067 #define CASE_EXTEND_SET_REG \
2068 CASE_EXTEND_REG(S)
2069 #define CASE_EXTEND_GET_REG \
2070 CASE_EXTEND_REG(G)
2072 #ifdef __i386__
2073 #define CASE_EXTEND_REG(T)
2074 #else
2075 #define CASE_EXTEND_REG(T) \
2076 CASE_ ## T ## ET_REG(R8, r8); \
2077 CASE_ ## T ## ET_REG(R9, r9); \
2078 CASE_ ## T ## ET_REG(R10, r10); \
2079 CASE_ ## T ## ET_REG(R11, r11); \
2080 CASE_ ## T ## ET_REG(R12, r12); \
2081 CASE_ ## T ## ET_REG(R13, r13); \
2082 CASE_ ## T ## ET_REG(R14, r14); \
2083 CASE_ ## T ## ET_REG(R15, r15)
2084 #endif
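/*
 * Example expansion, for reference: on x86_64,
 *     CASE_GET_REG(EAX, eax);   =>  case REG_EAX: value = regs->eax; break;
 *     CASE_EXTEND_GET_REG;      =>  the same pattern for r8..r15.
 * The __i386__ variant of CASE_EXTEND_REG() expands to nothing, so the
 * 64-bit-only registers simply fall through to the default case there.
 */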
2086 static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
2088 unsigned long value;
2089 struct vcpu *v = current;
2090 struct vlapic *vlapic = vcpu_vlapic(v);
2092 switch ( gp )
2094 CASE_GET_REG(EAX, eax);
2095 CASE_GET_REG(ECX, ecx);
2096 CASE_GET_REG(EDX, edx);
2097 CASE_GET_REG(EBX, ebx);
2098 CASE_GET_REG(EBP, ebp);
2099 CASE_GET_REG(ESI, esi);
2100 CASE_GET_REG(EDI, edi);
2101 CASE_GET_REG(ESP, esp);
2102 CASE_EXTEND_GET_REG;
2103 default:
2104 gdprintk(XENLOG_ERR, "invalid gp: %d\n", gp);
2105 goto exit_and_crash;
2108 HVMTRACE_2D(CR_WRITE, v, cr, value);
2110 HVM_DBG_LOG(DBG_LEVEL_1, "CR%d, value = %lx", cr, value);
2112 switch ( cr )
2114 case 0:
2115 return vmx_set_cr0(value);
2117 case 3:
2118 return hvm_set_cr3(value);
2120 case 4:
2121 return hvm_set_cr4(value);
2123 case 8:
2124 vlapic_set_reg(vlapic, APIC_TASKPRI, ((value & 0x0F) << 4));
2125 break;
2127 default:
2128 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
2129 goto exit_and_crash;
2132 return 1;
2134 exit_and_crash:
2135 domain_crash(v->domain);
2136 return 0;
2139 /*
2140 * Read from control registers. CR0/CR4 reads are satisfied from the read shadows without a vmexit, so only CR3 and CR8 are handled here.
2141 */
2142 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
2144 unsigned long value = 0;
2145 struct vcpu *v = current;
2146 struct vlapic *vlapic = vcpu_vlapic(v);
2148 switch ( cr )
2150 case 3:
2151 value = (unsigned long)v->arch.hvm_vcpu.guest_cr[3];
2152 break;
2153 case 8:
2154 value = (unsigned long)vlapic_get_reg(vlapic, APIC_TASKPRI);
2155 value = (value & 0xF0) >> 4;
2156 break;
2157 default:
2158 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
2159 domain_crash(v->domain);
2160 break;
2163 switch ( gp ) {
2164 CASE_SET_REG(EAX, eax);
2165 CASE_SET_REG(ECX, ecx);
2166 CASE_SET_REG(EDX, edx);
2167 CASE_SET_REG(EBX, ebx);
2168 CASE_SET_REG(EBP, ebp);
2169 CASE_SET_REG(ESI, esi);
2170 CASE_SET_REG(EDI, edi);
2171 CASE_SET_REG(ESP, esp);
2172 CASE_EXTEND_SET_REG;
2173 default:
2174 printk("invalid gp: %d\n", gp);
2175 domain_crash(v->domain);
2176 break;
2179 HVMTRACE_2D(CR_READ, v, cr, value);
2181 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR%d, value = %lx", cr, value);
2184 static int vmx_cr_access(unsigned long exit_qualification,
2185 struct cpu_user_regs *regs)
2187 unsigned int gp, cr;
2188 unsigned long value;
2189 struct vcpu *v = current;
2191 switch ( exit_qualification & CONTROL_REG_ACCESS_TYPE )
2193 case TYPE_MOV_TO_CR:
2194 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
2195 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
2196 return mov_to_cr(gp, cr, regs);
2197 case TYPE_MOV_FROM_CR:
2198 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
2199 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
2200 mov_from_cr(cr, gp, regs);
2201 break;
2202 case TYPE_CLTS:
2203 /* We initialise the FPU now, to avoid needing another vmexit. */
2204 setup_fpu(v);
2205 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
2207 v->arch.hvm_vcpu.hw_cr[0] &= ~X86_CR0_TS; /* clear TS */
2208 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
2210 v->arch.hvm_vcpu.guest_cr[0] &= ~X86_CR0_TS; /* clear TS */
2211 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[0]);
2212 HVMTRACE_0D(CLTS, current);
2213 break;
2214 case TYPE_LMSW:
2215 value = v->arch.hvm_vcpu.guest_cr[0];
2216 value = (value & ~0xF) |
2217 (((exit_qualification & LMSW_SOURCE_DATA) >> 16) & 0xF);
2218 HVMTRACE_1D(LMSW, current, value);
2219 return vmx_set_cr0(value);
2220 default:
2221 BUG();
2224 return 1;
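/*
 * Rough shape of the CR-access exit qualification as consumed above: the
 * CONTROL_REG_ACCESS_NUM bits give the CR number, the access type selects
 * MOV-to-CR / MOV-from-CR / CLTS / LMSW, and for MOV accesses
 * CONTROL_REG_ACCESS_REG identifies the general-purpose register. For
 * LMSW the source operand is recovered from bits 16 and up, of which only
 * the low four CR0 bits (PE/MP/EM/TS) are merged in.
 */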
2227 static const struct lbr_info {
2228 u32 base, count;
2229 } p4_lbr[] = {
2230 { MSR_P4_LER_FROM_LIP, 1 },
2231 { MSR_P4_LER_TO_LIP, 1 },
2232 { MSR_P4_LASTBRANCH_TOS, 1 },
2233 { MSR_P4_LASTBRANCH_0_FROM_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO },
2234 { MSR_P4_LASTBRANCH_0_TO_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO },
2235 { 0, 0 }
2236 }, c2_lbr[] = {
2237 { MSR_IA32_LASTINTFROMIP, 1 },
2238 { MSR_IA32_LASTINTTOIP, 1 },
2239 { MSR_C2_LASTBRANCH_TOS, 1 },
2240 { MSR_C2_LASTBRANCH_0_FROM_IP, NUM_MSR_C2_LASTBRANCH_FROM_TO },
2241 { MSR_C2_LASTBRANCH_0_TO_IP, NUM_MSR_C2_LASTBRANCH_FROM_TO },
2242 { 0, 0 }
2243 #ifdef __i386__
2244 }, pm_lbr[] = {
2245 { MSR_IA32_LASTINTFROMIP, 1 },
2246 { MSR_IA32_LASTINTTOIP, 1 },
2247 { MSR_PM_LASTBRANCH_TOS, 1 },
2248 { MSR_PM_LASTBRANCH_0, NUM_MSR_PM_LASTBRANCH },
2249 { 0, 0 }
2250 #endif
2251 };
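/*
 * Each table above is a list of (base MSR, count) runs terminated by a
 * {0, 0} sentinel, describing the last-branch-record MSRs of one CPU
 * flavour. last_branch_msr_get() picks the table matching the boot CPU's
 * family/model, and callers walk it with "for ( ; lbr->count; lbr++ )".
 */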
2253 static const struct lbr_info *last_branch_msr_get(void)
2255 switch ( boot_cpu_data.x86 )
2257 case 6:
2258 switch ( boot_cpu_data.x86_model )
2260 #ifdef __i386__
2261 /* PentiumM */
2262 case 9: case 13:
2263 /* Core Solo/Duo */
2264 case 14:
2265 return pm_lbr;
2266 break;
2267 #endif
2268 /* Core2 Duo */
2269 case 15:
2270 return c2_lbr;
2271 break;
2273 break;
2275 case 15:
2276 switch ( boot_cpu_data.x86_model )
2278 /* Pentium4/Xeon with em64t */
2279 case 3: case 4: case 6:
2280 return p4_lbr;
2281 break;
2283 break;
2286 return NULL;
2289 static int is_last_branch_msr(u32 ecx)
2291 const struct lbr_info *lbr = last_branch_msr_get();
2293 if ( lbr == NULL )
2294 return 0;
2296 for ( ; lbr->count; lbr++ )
2297 if ( (ecx >= lbr->base) && (ecx < (lbr->base + lbr->count)) )
2298 return 1;
2300 return 0;
2303 static int vmx_do_msr_read(struct cpu_user_regs *regs)
2305 u64 msr_content = 0;
2306 u32 ecx = regs->ecx, eax, edx;
2307 struct vcpu *v = current;
2308 int index;
2309 u64 *var_range_base = (u64*)v->arch.hvm_vcpu.mtrr.var_ranges;
2310 u64 *fixed_range_base = (u64*)v->arch.hvm_vcpu.mtrr.fixed_ranges;
2312 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x", ecx);
2314 switch ( ecx )
2316 case MSR_IA32_TSC:
2317 msr_content = hvm_get_guest_time(v);
2318 break;
2319 case MSR_IA32_SYSENTER_CS:
2320 msr_content = (u32)__vmread(GUEST_SYSENTER_CS);
2321 break;
2322 case MSR_IA32_SYSENTER_ESP:
2323 msr_content = __vmread(GUEST_SYSENTER_ESP);
2324 break;
2325 case MSR_IA32_SYSENTER_EIP:
2326 msr_content = __vmread(GUEST_SYSENTER_EIP);
2327 break;
2328 case MSR_IA32_APICBASE:
2329 msr_content = vcpu_vlapic(v)->hw.apic_base_msr;
2330 break;
2331 case MSR_IA32_CR_PAT:
2332 msr_content = v->arch.hvm_vcpu.pat_cr;
2333 break;
2334 case MSR_MTRRcap:
2335 msr_content = v->arch.hvm_vcpu.mtrr.mtrr_cap;
2336 break;
2337 case MSR_MTRRdefType:
2338 msr_content = v->arch.hvm_vcpu.mtrr.def_type
2339 | (v->arch.hvm_vcpu.mtrr.enabled << 10);
2340 break;
2341 case MSR_MTRRfix64K_00000:
2342 msr_content = fixed_range_base[0];
2343 break;
2344 case MSR_MTRRfix16K_80000:
2345 case MSR_MTRRfix16K_A0000:
2346 index = regs->ecx - MSR_MTRRfix16K_80000;
2347 msr_content = fixed_range_base[index + 1];
2348 break;
2349 case MSR_MTRRfix4K_C0000...MSR_MTRRfix4K_F8000:
2350 index = regs->ecx - MSR_MTRRfix4K_C0000;
2351 msr_content = fixed_range_base[index + 3];
2352 break;
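/*
 * Note on the fixed-range indexing above: fixed_range_base[] follows the
 * architectural layout of the eleven fixed-range MTRRs -- entry 0 is
 * MTRRfix64K_00000, entries 1-2 are the two 16K registers, and entries
 * 3-10 are the eight 4K registers -- hence the +1 and +3 offsets.
 */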
2353 case MSR_IA32_MTRR_PHYSBASE0...MSR_IA32_MTRR_PHYSMASK7:
2354 index = regs->ecx - MSR_IA32_MTRR_PHYSBASE0;
2355 msr_content = var_range_base[index];
2356 break;
2357 case MSR_IA32_DEBUGCTLMSR:
2358 if ( vmx_read_guest_msr(v, ecx, &msr_content) != 0 )
2359 msr_content = 0;
2360 break;
2361 case MSR_IA32_VMX_BASIC...MSR_IA32_VMX_PROCBASED_CTLS2:
2362 goto gp_fault;
2363 case MSR_IA32_MCG_CAP:
2364 case MSR_IA32_MCG_STATUS:
2365 case MSR_IA32_MC0_STATUS:
2366 case MSR_IA32_MC1_STATUS:
2367 case MSR_IA32_MC2_STATUS:
2368 case MSR_IA32_MC3_STATUS:
2369 case MSR_IA32_MC4_STATUS:
2370 case MSR_IA32_MC5_STATUS:
2371 /* No point in letting the guest see real MCEs */
2372 msr_content = 0;
2373 break;
2374 default:
2375 switch ( long_mode_do_msr_read(regs) )
2377 case HNDL_unhandled:
2378 break;
2379 case HNDL_exception_raised:
2380 return 0;
2381 case HNDL_done:
2382 goto done;
2385 if ( vmx_read_guest_msr(v, ecx, &msr_content) == 0 )
2386 break;
2388 if ( is_last_branch_msr(ecx) )
2390 msr_content = 0;
2391 break;
2394 if ( rdmsr_hypervisor_regs(ecx, &eax, &edx) ||
2395 rdmsr_safe(ecx, eax, edx) == 0 )
2397 regs->eax = eax;
2398 regs->edx = edx;
2399 goto done;
2402 goto gp_fault;
2405 regs->eax = msr_content & 0xFFFFFFFF;
2406 regs->edx = msr_content >> 32;
2408 done:
2409 hvmtrace_msr_read(v, ecx, msr_content);
2410 HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx",
2411 ecx, (unsigned long)regs->eax,
2412 (unsigned long)regs->edx);
2413 return 1;
2415 gp_fault:
2416 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2417 return 0;
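/*
 * Fall-through order for the default case above: undecoded MSRs go first
 * to the long-mode handler, then to the per-vCPU guest MSR save area,
 * then read as zero if they are last-branch MSRs not yet loaded there,
 * then to Xen's own MSR range, and finally to the host via rdmsr_safe();
 * anything still unhandled is reflected back to the guest as #GP(0).
 */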
2420 static int vmx_alloc_vlapic_mapping(struct domain *d)
2422 void *apic_va;
2424 if ( !cpu_has_vmx_virtualize_apic_accesses )
2425 return 0;
2427 apic_va = alloc_xenheap_page();
2428 if ( apic_va == NULL )
2429 return -ENOMEM;
2430 share_xen_page_with_guest(virt_to_page(apic_va), d, XENSHARE_writable);
2431 guest_physmap_add_page(
2432 d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), virt_to_mfn(apic_va));
2433 d->arch.hvm_domain.vmx_apic_access_mfn = virt_to_mfn(apic_va);
2435 return 0;
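/*
 * The page allocated above is shared with the guest and inserted into its
 * physmap at the APIC's default physical base; its MFN is remembered in
 * vmx_apic_access_mfn so that vmx_install_vlapic_mapping() can point the
 * VMCS APIC_ACCESS_ADDR at it for every vCPU, alongside the per-vCPU
 * virtual-APIC register page (VIRTUAL_APIC_PAGE_ADDR).
 */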
2438 static void vmx_free_vlapic_mapping(struct domain *d)
2440 unsigned long mfn = d->arch.hvm_domain.vmx_apic_access_mfn;
2441 if ( mfn != 0 )
2442 free_xenheap_page(mfn_to_virt(mfn));
2445 static void vmx_install_vlapic_mapping(struct vcpu *v)
2447 unsigned long virt_page_ma, apic_page_ma;
2449 if ( !cpu_has_vmx_virtualize_apic_accesses )
2450 return;
2452 virt_page_ma = page_to_maddr(vcpu_vlapic(v)->regs_page);
2453 apic_page_ma = v->domain->arch.hvm_domain.vmx_apic_access_mfn;
2454 apic_page_ma <<= PAGE_SHIFT;
2456 vmx_vmcs_enter(v);
2457 __vmwrite(VIRTUAL_APIC_PAGE_ADDR, virt_page_ma);
2458 __vmwrite(APIC_ACCESS_ADDR, apic_page_ma);
2459 vmx_vmcs_exit(v);
2462 void vmx_vlapic_msr_changed(struct vcpu *v)
2464 struct vlapic *vlapic = vcpu_vlapic(v);
2465 uint32_t ctl;
2467 if ( !cpu_has_vmx_virtualize_apic_accesses )
2468 return;
2470 vmx_vmcs_enter(v);
2471 ctl = __vmread(SECONDARY_VM_EXEC_CONTROL);
2472 ctl &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2473 if ( !vlapic_hw_disabled(vlapic) &&
2474 (vlapic_base_address(vlapic) == APIC_DEFAULT_PHYS_BASE) )
2475 ctl |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2476 __vmwrite(SECONDARY_VM_EXEC_CONTROL, ctl);
2477 vmx_vmcs_exit(v);
2480 extern bool_t mtrr_var_range_msr_set(struct mtrr_state *v,
2481 u32 msr, u64 msr_content);
2482 extern bool_t mtrr_fix_range_msr_set(struct mtrr_state *v,
2483 int row, u64 msr_content);
2484 extern bool_t mtrr_def_type_msr_set(struct mtrr_state *v, u64 msr_content);
2485 extern bool_t pat_msr_set(u64 *pat, u64 msr);
2487 static int vmx_do_msr_write(struct cpu_user_regs *regs)
2489 u32 ecx = regs->ecx;
2490 u64 msr_content;
2491 struct vcpu *v = current;
2492 int index;
2494 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x, eax=%x, edx=%x",
2495 ecx, (u32)regs->eax, (u32)regs->edx);
2497 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
2499 hvmtrace_msr_write(v, ecx, msr_content);
2501 switch ( ecx )
2503 case MSR_IA32_TSC:
2504 hvm_set_guest_time(v, msr_content);
2505 pt_reset(v);
2506 break;
2507 case MSR_IA32_SYSENTER_CS:
2508 __vmwrite(GUEST_SYSENTER_CS, msr_content);
2509 break;
2510 case MSR_IA32_SYSENTER_ESP:
2511 __vmwrite(GUEST_SYSENTER_ESP, msr_content);
2512 break;
2513 case MSR_IA32_SYSENTER_EIP:
2514 __vmwrite(GUEST_SYSENTER_EIP, msr_content);
2515 break;
2516 case MSR_IA32_APICBASE:
2517 vlapic_msr_set(vcpu_vlapic(v), msr_content);
2518 break;
2519 case MSR_IA32_CR_PAT:
2520 if ( !pat_msr_set(&v->arch.hvm_vcpu.pat_cr, msr_content) )
2521 goto gp_fault;
2522 break;
2523 case MSR_MTRRdefType:
2524 if ( !mtrr_def_type_msr_set(&v->arch.hvm_vcpu.mtrr, msr_content) )
2525 goto gp_fault;
2526 break;
2527 case MSR_MTRRfix64K_00000:
2528 if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr, 0, msr_content) )
2529 goto gp_fault;
2530 break;
2531 case MSR_MTRRfix16K_80000:
2532 case MSR_MTRRfix16K_A0000:
2533 index = regs->ecx - MSR_MTRRfix16K_80000 + 1;
2534 if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr,
2535 index, msr_content) )
2536 goto gp_fault;
2537 break;
2538 case MSR_MTRRfix4K_C0000...MSR_MTRRfix4K_F8000:
2539 index = regs->ecx - MSR_MTRRfix4K_C0000 + 3;
2540 if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr,
2541 index, msr_content) )
2542 goto gp_fault;
2543 break;
2544 case MSR_IA32_MTRR_PHYSBASE0...MSR_IA32_MTRR_PHYSMASK7:
2545 if ( !mtrr_var_range_msr_set(&v->arch.hvm_vcpu.mtrr,
2546 regs->ecx, msr_content) )
2547 goto gp_fault;
2548 break;
2549 case MSR_MTRRcap:
2550 goto gp_fault;
2551 case MSR_IA32_DEBUGCTLMSR: {
2552 int i, rc = 0;
2554 if ( !msr_content || (msr_content & ~3) )
2555 break;
2557 if ( msr_content & 1 )
2559 const struct lbr_info *lbr = last_branch_msr_get();
2560 if ( lbr == NULL )
2561 break;
2563 for ( ; (rc == 0) && lbr->count; lbr++ )
2564 for ( i = 0; (rc == 0) && (i < lbr->count); i++ )
2565 if ( (rc = vmx_add_guest_msr(v, lbr->base + i)) == 0 )
2566 vmx_disable_intercept_for_msr(v, lbr->base + i);
2569 if ( (rc < 0) ||
2570 (vmx_add_guest_msr(v, ecx) < 0) ||
2571 (vmx_add_host_load_msr(v, ecx) < 0) )
2572 vmx_inject_hw_exception(v, TRAP_machine_check, 0);
2573 else
2574 vmx_write_guest_msr(v, ecx, msr_content);
2576 break;
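/*
 * DEBUGCTL writes, in outline: only the LBR and BTF bits (bits 0 and 1)
 * are accepted. Enabling LBR pulls the model-specific last-branch MSR
 * list into the guest MSR save area and stops intercepting those MSRs,
 * so later guest accesses to them are handled by hardware; if the MSR
 * areas cannot be grown, a machine check is injected instead.
 */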
2578 case MSR_IA32_VMX_BASIC...MSR_IA32_VMX_PROCBASED_CTLS2:
2579 goto gp_fault;
2580 default:
2581 switch ( long_mode_do_msr_write(regs) )
2583 case HNDL_unhandled:
2584 if ( (vmx_write_guest_msr(v, ecx, msr_content) != 0) &&
2585 !is_last_branch_msr(ecx) )
2586 wrmsr_hypervisor_regs(ecx, regs->eax, regs->edx);
2587 break;
2588 case HNDL_exception_raised:
2589 return 0;
2590 case HNDL_done:
2591 break;
2593 break;
2596 return 1;
2598 gp_fault:
2599 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2600 return 0;
2603 static void vmx_do_hlt(struct cpu_user_regs *regs)
2605 unsigned long intr_info = __vmread(VM_ENTRY_INTR_INFO);
2606 struct vcpu *curr = current;
2608 /* Check for pending exception. */
2609 if ( intr_info & INTR_INFO_VALID_MASK )
2611 HVMTRACE_1D(HLT, curr, /*int pending=*/ 1);
2612 return;
2615 HVMTRACE_1D(HLT, curr, /*int pending=*/ 0);
2616 hvm_hlt(regs->eflags);
2619 static void vmx_do_extint(struct cpu_user_regs *regs)
2621 unsigned int vector;
2623 asmlinkage void do_IRQ(struct cpu_user_regs *);
2624 fastcall void smp_apic_timer_interrupt(struct cpu_user_regs *);
2625 fastcall void smp_event_check_interrupt(void);
2626 fastcall void smp_invalidate_interrupt(void);
2627 fastcall void smp_call_function_interrupt(void);
2628 fastcall void smp_spurious_interrupt(struct cpu_user_regs *regs);
2629 fastcall void smp_error_interrupt(struct cpu_user_regs *regs);
2630 #ifdef CONFIG_X86_MCE_P4THERMAL
2631 fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs);
2632 #endif
2634 vector = __vmread(VM_EXIT_INTR_INFO);
2635 BUG_ON(!(vector & INTR_INFO_VALID_MASK));
2637 vector &= INTR_INFO_VECTOR_MASK;
2638 HVMTRACE_1D(INTR, current, vector);
2640 switch ( vector )
2642 case LOCAL_TIMER_VECTOR:
2643 smp_apic_timer_interrupt(regs);
2644 break;
2645 case EVENT_CHECK_VECTOR:
2646 smp_event_check_interrupt();
2647 break;
2648 case INVALIDATE_TLB_VECTOR:
2649 smp_invalidate_interrupt();
2650 break;
2651 case CALL_FUNCTION_VECTOR:
2652 smp_call_function_interrupt();
2653 break;
2654 case SPURIOUS_APIC_VECTOR:
2655 smp_spurious_interrupt(regs);
2656 break;
2657 case ERROR_APIC_VECTOR:
2658 smp_error_interrupt(regs);
2659 break;
2660 #ifdef CONFIG_X86_MCE_P4THERMAL
2661 case THERMAL_APIC_VECTOR:
2662 smp_thermal_interrupt(regs);
2663 break;
2664 #endif
2665 default:
2666 regs->entry_vector = vector;
2667 do_IRQ(regs);
2668 break;
2672 static void wbinvd_ipi(void *info)
2674 wbinvd();
2677 static void vmx_failed_vmentry(unsigned int exit_reason,
2678 struct cpu_user_regs *regs)
2680 unsigned int failed_vmentry_reason = (uint16_t)exit_reason;
2681 unsigned long exit_qualification;
2683 exit_qualification = __vmread(EXIT_QUALIFICATION);
2684 printk("Failed vm entry (exit reason 0x%x) ", exit_reason);
2685 switch ( failed_vmentry_reason )
2687 case EXIT_REASON_INVALID_GUEST_STATE:
2688 printk("caused by invalid guest state (%ld).\n", exit_qualification);
2689 break;
2690 case EXIT_REASON_MSR_LOADING:
2691 printk("caused by MSR entry %ld loading.\n", exit_qualification);
2692 break;
2693 case EXIT_REASON_MACHINE_CHECK:
2694 printk("caused by machine check.\n");
2695 HVMTRACE_0D(MCE, current);
2696 do_machine_check(regs);
2697 break;
2698 default:
2699 printk("reason not known yet!");
2700 break;
2703 printk("************* VMCS Area **************\n");
2704 vmcs_dump_vcpu();
2705 printk("**************************************\n");
2707 domain_crash(current->domain);
2710 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
2712 unsigned int exit_reason, idtv_info;
2713 unsigned long exit_qualification, inst_len = 0;
2714 struct vcpu *v = current;
2716 exit_reason = __vmread(VM_EXIT_REASON);
2718 hvmtrace_vmexit(v, regs->eip, exit_reason);
2720 perfc_incra(vmexits, exit_reason);
2722 if ( exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT )
2723 local_irq_enable();
2725 if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
2726 return vmx_failed_vmentry(exit_reason, regs);
2728 /* Event delivery caused this intercept? Queue for redelivery. */
2729 idtv_info = __vmread(IDT_VECTORING_INFO);
2730 if ( unlikely(idtv_info & INTR_INFO_VALID_MASK) &&
2731 (exit_reason != EXIT_REASON_TASK_SWITCH) )
2733 if ( hvm_event_needs_reinjection((idtv_info>>8)&7, idtv_info&0xff) )
2735 /* See SDM 3B 25.7.1.1 and 25.7.1.2 for details on masking reserved bits. */
2736 __vmwrite(VM_ENTRY_INTR_INFO,
2737 idtv_info & ~INTR_INFO_RESVD_BITS_MASK);
2738 if ( idtv_info & INTR_INFO_DELIVER_CODE_MASK )
2739 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE,
2740 __vmread(IDT_VECTORING_ERROR_CODE));
2743 /*
2744 * Clear NMI-blocking interruptibility info if an NMI delivery faulted.
2745 * Re-delivery will re-set it (see SDM 3B 25.7.1.2).
2746 */
2747 if ( (idtv_info & INTR_INFO_INTR_TYPE_MASK) == (X86_EVENTTYPE_NMI<<8) )
2748 __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
2749 __vmread(GUEST_INTERRUPTIBILITY_INFO) &
2750 ~VMX_INTR_SHADOW_NMI);
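/*
 * Layout of idtv_info as used above (VM-exit IDT-vectoring information,
 * SDM 3B): bits 7:0 hold the vector, bits 10:8 the event type, and the
 * valid/deliver-error-code flags are tested via INTR_INFO_VALID_MASK and
 * INTR_INFO_DELIVER_CODE_MASK. Writing it back into VM_ENTRY_INTR_INFO,
 * minus the reserved bits, is what re-queues the interrupted delivery.
 */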
2753 switch ( exit_reason )
2755 case EXIT_REASON_EXCEPTION_NMI:
2757 /*
2758 * We do not enable software-interrupt (INT n) exiting, so this vmexit
2759 * is caused either by (1) an exception (e.g. #PF) raised in the guest,
2760 * or (2) an NMI.
2761 */
2762 unsigned int intr_info, vector;
2764 intr_info = __vmread(VM_EXIT_INTR_INFO);
2765 BUG_ON(!(intr_info & INTR_INFO_VALID_MASK));
2767 vector = intr_info & INTR_INFO_VECTOR_MASK;
2769 /*
2770 * Re-set the NMI shadow if vmexit caused by a guest IRET fault (see 3B
2771 * 25.7.1.2, "Resuming Guest Software after Handling an Exception").
2772 * (NB. If we emulate this IRET for any reason, we should re-clear!)
2773 */
2774 if ( unlikely(intr_info & INTR_INFO_NMI_UNBLOCKED_BY_IRET) &&
2775 !(__vmread(IDT_VECTORING_INFO) & INTR_INFO_VALID_MASK) &&
2776 (vector != TRAP_double_fault) )
2777 __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
2778 __vmread(GUEST_INTERRUPTIBILITY_INFO)|VMX_INTR_SHADOW_NMI);
2780 perfc_incra(cause_vector, vector);
2782 switch ( vector )
2784 case TRAP_debug:
2785 case TRAP_int3:
2786 if ( !v->domain->debugger_attached )
2787 goto exit_and_crash;
2788 domain_pause_for_debugger();
2789 break;
2790 case TRAP_no_device:
2791 vmx_do_no_device_fault();
2792 break;
2793 case TRAP_page_fault:
2794 exit_qualification = __vmread(EXIT_QUALIFICATION);
2795 regs->error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
2797 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2798 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
2799 (unsigned long)regs->eax, (unsigned long)regs->ebx,
2800 (unsigned long)regs->ecx, (unsigned long)regs->edx,
2801 (unsigned long)regs->esi, (unsigned long)regs->edi);
2803 if ( paging_fault(exit_qualification, regs) )
2805 hvmtrace_pf_xen(v, exit_qualification, regs->error_code);
2806 break;
2809 v->arch.hvm_vcpu.guest_cr[2] = exit_qualification;
2810 vmx_inject_hw_exception(v, TRAP_page_fault, regs->error_code);
2811 break;
2812 case TRAP_nmi:
2813 if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) !=
2814 (X86_EVENTTYPE_NMI << 8) )
2815 goto exit_and_crash;
2816 HVMTRACE_0D(NMI, v);
2817 do_nmi(regs); /* Real NMI, vector 2: normal processing. */
2818 break;
2819 case TRAP_machine_check:
2820 HVMTRACE_0D(MCE, v);
2821 do_machine_check(regs);
2822 break;
2823 default:
2824 goto exit_and_crash;
2826 break;
2828 case EXIT_REASON_EXTERNAL_INTERRUPT:
2829 vmx_do_extint(regs);
2830 break;
2831 case EXIT_REASON_TRIPLE_FAULT:
2832 hvm_triple_fault();
2833 break;
2834 case EXIT_REASON_PENDING_VIRT_INTR:
2835 /* Disable the interrupt window. */
2836 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2837 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2838 v->arch.hvm_vmx.exec_control);
2839 break;
2840 case EXIT_REASON_PENDING_VIRT_NMI:
2841 /* Disable the NMI window. */
2842 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
2843 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2844 v->arch.hvm_vmx.exec_control);
2845 break;
2846 case EXIT_REASON_TASK_SWITCH: {
2847 const enum hvm_task_switch_reason reasons[] = {
2848 TSW_call_or_int, TSW_iret, TSW_jmp, TSW_call_or_int };
2849 int32_t errcode = -1;
2850 exit_qualification = __vmread(EXIT_QUALIFICATION);
2851 if ( (idtv_info & INTR_INFO_VALID_MASK) &&
2852 (idtv_info & INTR_INFO_DELIVER_CODE_MASK) )
2853 errcode = __vmread(IDT_VECTORING_ERROR_CODE);
2854 hvm_task_switch((uint16_t)exit_qualification,
2855 reasons[(exit_qualification >> 30) & 3],
2856 errcode);
2857 break;
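/*
 * For the task-switch exit above, the low 16 bits of the exit
 * qualification carry the selector of the target TSS and bits 31:30
 * encode how the switch was initiated (CALL/INT, IRET, or JMP), which is
 * why they index the small reasons[] table; a pending error code from an
 * interrupted event delivery is forwarded so hvm_task_switch() can push it.
 */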
2859 case EXIT_REASON_CPUID:
2860 inst_len = __get_instruction_length(); /* Safe: CPUID */
2861 __update_guest_eip(inst_len);
2862 vmx_do_cpuid(regs);
2863 break;
2864 case EXIT_REASON_HLT:
2865 inst_len = __get_instruction_length(); /* Safe: HLT */
2866 __update_guest_eip(inst_len);
2867 vmx_do_hlt(regs);
2868 break;
2869 case EXIT_REASON_INVLPG:
2871 inst_len = __get_instruction_length(); /* Safe: INVLPG */
2872 __update_guest_eip(inst_len);
2873 exit_qualification = __vmread(EXIT_QUALIFICATION);
2874 vmx_do_invlpg(exit_qualification);
2875 break;
2877 case EXIT_REASON_VMCALL:
2879 int rc;
2880 HVMTRACE_1D(VMMCALL, v, regs->eax);
2881 inst_len = __get_instruction_length(); /* Safe: VMCALL */
2882 rc = hvm_do_hypercall(regs);
2883 if ( rc != HVM_HCALL_preempted )
2885 __update_guest_eip(inst_len);
2886 if ( rc == HVM_HCALL_invalidate )
2887 send_invalidate_req();
2889 break;
2891 case EXIT_REASON_CR_ACCESS:
2893 exit_qualification = __vmread(EXIT_QUALIFICATION);
2894 inst_len = __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
2895 if ( vmx_cr_access(exit_qualification, regs) )
2896 __update_guest_eip(inst_len);
2897 break;
2899 case EXIT_REASON_DR_ACCESS:
2900 exit_qualification = __vmread(EXIT_QUALIFICATION);
2901 vmx_dr_access(exit_qualification, regs);
2902 break;
2903 case EXIT_REASON_IO_INSTRUCTION:
2904 exit_qualification = __vmread(EXIT_QUALIFICATION);
2905 inst_len = __get_instruction_length(); /* Safe: IN, INS, OUT, OUTS */
2906 vmx_io_instruction(exit_qualification, inst_len);
2907 break;
2908 case EXIT_REASON_MSR_READ:
2909 inst_len = __get_instruction_length(); /* Safe: RDMSR */
2910 if ( vmx_do_msr_read(regs) )
2911 __update_guest_eip(inst_len);
2912 break;
2913 case EXIT_REASON_MSR_WRITE:
2914 inst_len = __get_instruction_length(); /* Safe: WRMSR */
2915 if ( vmx_do_msr_write(regs) )
2916 __update_guest_eip(inst_len);
2917 break;
2919 case EXIT_REASON_MWAIT_INSTRUCTION:
2920 case EXIT_REASON_MONITOR_INSTRUCTION:
2921 case EXIT_REASON_VMCLEAR:
2922 case EXIT_REASON_VMLAUNCH:
2923 case EXIT_REASON_VMPTRLD:
2924 case EXIT_REASON_VMPTRST:
2925 case EXIT_REASON_VMREAD:
2926 case EXIT_REASON_VMRESUME:
2927 case EXIT_REASON_VMWRITE:
2928 case EXIT_REASON_VMXOFF:
2929 case EXIT_REASON_VMXON:
2930 vmx_inject_hw_exception(v, TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
2931 break;
2933 case EXIT_REASON_TPR_BELOW_THRESHOLD:
2934 break;
2936 case EXIT_REASON_APIC_ACCESS:
2938 unsigned long offset;
2939 exit_qualification = __vmread(EXIT_QUALIFICATION);
2940 offset = exit_qualification & 0x0fffUL;
2941 handle_mmio(APIC_DEFAULT_PHYS_BASE | offset);
2942 break;
2945 case EXIT_REASON_INVD:
2946 case EXIT_REASON_WBINVD:
2948 inst_len = __get_instruction_length(); /* Safe: INVD, WBINVD */
2949 __update_guest_eip(inst_len);
2950 if ( !list_empty(&(domain_hvm_iommu(v->domain)->pdev_list)) )
2952 if ( cpu_has_wbinvd_exiting )
2954 on_each_cpu(wbinvd_ipi, NULL, 1, 1);
2956 else
2958 wbinvd();
2959 /* Disable further WBINVD intercepts. */
2960 if ( (exit_reason == EXIT_REASON_WBINVD) &&
2961 (vmx_cpu_based_exec_control &
2962 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) )
2963 __vmwrite(SECONDARY_VM_EXEC_CONTROL,
2964 vmx_secondary_exec_control &
2965 ~SECONDARY_EXEC_WBINVD_EXITING);
2968 break;
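/*
 * Rationale for the INVD/WBINVD handling above: cache flushes only need
 * to reach real hardware when a physical device is assigned to the domain
 * (non-empty pdev_list). With WBINVD exiting available the flush is
 * broadcast to every CPU on each exit; otherwise it is performed once
 * locally and the WBINVD intercept is dropped so later flushes run
 * natively in the guest.
 */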
2971 default:
2972 exit_and_crash:
2973 gdprintk(XENLOG_ERR, "Bad vmexit (reason %x)\n", exit_reason);
2974 domain_crash(v->domain);
2975 break;
2979 asmlinkage void vmx_trace_vmentry(void)
2981 struct vcpu *v = current;
2983 hvmtrace_vmentry(v);
2986 /*
2987 * Local variables:
2988 * mode: C
2989 * c-set-style: "BSD"
2990 * c-basic-offset: 4
2991 * tab-width: 4
2992 * indent-tabs-mode: nil
2993 * End:
2994 */