ia64/xen-unstable

view xen/arch/x86/hvm/vmx/vmx.c @ 16620:966a6d3b7408

SVM: Treat the vlapic's tpr as the master copy and sync the vtpr to it
before every vm entry. This fixes HVM save/restore/migrate, as the
vtpr value was only being synced on guest TPR writes before.

Signed-off-by: Tim Deegan <Tim.Deegan@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Dec 14 11:50:24 2007 +0000 (2007-12-14)
parents 98e9485d8fcf
children 3f0f0bd3f1c1
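
The fix described above lives in the SVM entry path rather than in this VMX source file. As a rough illustration only -- the vmcb field and helper names below are assumed, not taken from the actual patch -- the per-entry sync amounts to something like:

/* Hypothetical sketch: sync the virtual TPR from the vlapic before VM entry. */
static void svm_sync_vtpr_from_vlapic(struct vcpu *v)
{
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;   /* assumed field name */
    struct vlapic *vlapic = vcpu_vlapic(v);

    /* The vlapic's TASKPRI register is the master copy; mirror its top
     * nibble (TPR[7:4]) into the VMCB's virtual TPR field. */
    vmcb->vintr.fields.tpr = vlapic_get_reg(vlapic, APIC_TASKPRI) >> 4;
}

The full vmx.c source at this changeset follows.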
1 /*
2 * vmx.c: handling VMX architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 */
19 #include <xen/config.h>
20 #include <xen/init.h>
21 #include <xen/lib.h>
22 #include <xen/trace.h>
23 #include <xen/sched.h>
24 #include <xen/irq.h>
25 #include <xen/softirq.h>
26 #include <xen/domain_page.h>
27 #include <xen/hypercall.h>
28 #include <xen/perfc.h>
29 #include <asm/current.h>
30 #include <asm/io.h>
31 #include <asm/regs.h>
32 #include <asm/cpufeature.h>
33 #include <asm/processor.h>
34 #include <asm/types.h>
35 #include <asm/debugreg.h>
36 #include <asm/msr.h>
37 #include <asm/spinlock.h>
38 #include <asm/paging.h>
39 #include <asm/p2m.h>
40 #include <asm/hvm/hvm.h>
41 #include <asm/hvm/support.h>
42 #include <asm/hvm/vmx/vmx.h>
43 #include <asm/hvm/vmx/vmcs.h>
44 #include <asm/hvm/vmx/cpu.h>
45 #include <public/sched.h>
46 #include <public/hvm/ioreq.h>
47 #include <asm/hvm/vpic.h>
48 #include <asm/hvm/vlapic.h>
49 #include <asm/x86_emulate.h>
50 #include <asm/hvm/vpt.h>
51 #include <public/hvm/save.h>
52 #include <asm/hvm/trace.h>
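/*
 * Return status for the MSR access handlers below: HNDL_done means the access
 * was handled, HNDL_unhandled means the caller should fall back to the common
 * handler, and HNDL_exception_raised means a fault has already been injected
 * into the guest.
 */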
54 enum handler_return { HNDL_done, HNDL_unhandled, HNDL_exception_raised };
56 static void vmx_ctxt_switch_from(struct vcpu *v);
57 static void vmx_ctxt_switch_to(struct vcpu *v);
59 static int vmx_alloc_vlapic_mapping(struct domain *d);
60 static void vmx_free_vlapic_mapping(struct domain *d);
61 static void vmx_install_vlapic_mapping(struct vcpu *v);
62 static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr);
63 static void vmx_update_guest_efer(struct vcpu *v);
65 static int vmx_domain_initialise(struct domain *d)
66 {
67 return vmx_alloc_vlapic_mapping(d);
68 }
70 static void vmx_domain_destroy(struct domain *d)
71 {
72 vmx_free_vlapic_mapping(d);
73 }
75 static int vmx_vcpu_initialise(struct vcpu *v)
76 {
77 int rc;
79 spin_lock_init(&v->arch.hvm_vmx.vmcs_lock);
81 v->arch.schedule_tail = vmx_do_resume;
82 v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
83 v->arch.ctxt_switch_to = vmx_ctxt_switch_to;
85 if ( (rc = vmx_create_vmcs(v)) != 0 )
86 {
87 dprintk(XENLOG_WARNING,
88 "Failed to create VMCS for vcpu %d: err=%d.\n",
89 v->vcpu_id, rc);
90 return rc;
91 }
93 vmx_install_vlapic_mapping(v);
95 #ifndef VMXASSIST
96 if ( v->vcpu_id == 0 )
97 v->arch.guest_context.user_regs.eax = 1;
98 v->arch.hvm_vcpu.io_complete = vmx_realmode_io_complete;
99 #endif
101 return 0;
102 }
104 static void vmx_vcpu_destroy(struct vcpu *v)
105 {
106 vmx_destroy_vmcs(v);
107 }
109 #ifdef __x86_64__
111 static DEFINE_PER_CPU(struct vmx_msr_state, host_msr_state);
113 static u32 msr_index[VMX_MSR_COUNT] =
114 {
115 MSR_LSTAR, MSR_STAR, MSR_SYSCALL_MASK
116 };
118 static void vmx_save_host_msrs(void)
119 {
120 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
121 int i;
123 for ( i = 0; i < VMX_MSR_COUNT; i++ )
124 rdmsrl(msr_index[i], host_msr_state->msrs[i]);
125 }
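/*
 * WRITE_MSR(X): record msr_content as the guest's value of MSR_X, mark that
 * MSR dirty in both the guest state (so it is reloaded when the vcpu is
 * switched back in) and the per-CPU host state (so the host value is restored
 * at context-switch-out), then write the value to the hardware MSR.
 */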
127 #define WRITE_MSR(address) \
128 guest_msr_state->msrs[VMX_INDEX_MSR_ ## address] = msr_content; \
129 set_bit(VMX_INDEX_MSR_ ## address, &guest_msr_state->flags); \
130 wrmsrl(MSR_ ## address, msr_content); \
131 set_bit(VMX_INDEX_MSR_ ## address, &host_msr_state->flags); \
132 break
134 static enum handler_return long_mode_do_msr_read(struct cpu_user_regs *regs)
135 {
136 u64 msr_content = 0;
137 u32 ecx = regs->ecx;
138 struct vcpu *v = current;
139 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
141 switch ( ecx )
142 {
143 case MSR_EFER:
144 msr_content = v->arch.hvm_vcpu.guest_efer;
145 break;
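/* The FS/GS/shadow-GS base MSRs are only accessible in long mode; reads
 * outside long mode raise #GP, as checked at the shared label below. */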
147 case MSR_FS_BASE:
148 msr_content = __vmread(GUEST_FS_BASE);
149 goto check_long_mode;
151 case MSR_GS_BASE:
152 msr_content = __vmread(GUEST_GS_BASE);
153 goto check_long_mode;
155 case MSR_SHADOW_GS_BASE:
156 msr_content = v->arch.hvm_vmx.shadow_gs;
157 check_long_mode:
158 if ( !(hvm_long_mode_enabled(v)) )
159 {
160 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
161 return HNDL_exception_raised;
162 }
163 break;
165 case MSR_STAR:
166 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_STAR];
167 break;
169 case MSR_LSTAR:
170 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_LSTAR];
171 break;
173 case MSR_CSTAR:
174 msr_content = v->arch.hvm_vmx.cstar;
175 break;
177 case MSR_SYSCALL_MASK:
178 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
179 break;
181 default:
182 return HNDL_unhandled;
183 }
185 HVM_DBG_LOG(DBG_LEVEL_0, "msr 0x%x content 0x%"PRIx64, ecx, msr_content);
187 regs->eax = (u32)(msr_content >> 0);
188 regs->edx = (u32)(msr_content >> 32);
190 return HNDL_done;
191 }
193 static enum handler_return long_mode_do_msr_write(struct cpu_user_regs *regs)
194 {
195 u64 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
196 u32 ecx = regs->ecx;
197 struct vcpu *v = current;
198 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
199 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
201 HVM_DBG_LOG(DBG_LEVEL_0, "msr 0x%x content 0x%"PRIx64, ecx, msr_content);
203 switch ( ecx )
204 {
205 case MSR_EFER:
206 if ( !hvm_set_efer(msr_content) )
207 goto exception_raised;
208 break;
210 case MSR_FS_BASE:
211 case MSR_GS_BASE:
212 case MSR_SHADOW_GS_BASE:
213 if ( !hvm_long_mode_enabled(v) )
214 goto gp_fault;
216 if ( !is_canonical_address(msr_content) )
217 goto uncanonical_address;
219 if ( ecx == MSR_FS_BASE )
220 __vmwrite(GUEST_FS_BASE, msr_content);
221 else if ( ecx == MSR_GS_BASE )
222 __vmwrite(GUEST_GS_BASE, msr_content);
223 else
224 {
225 v->arch.hvm_vmx.shadow_gs = msr_content;
226 wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
227 }
229 break;
231 case MSR_STAR:
232 WRITE_MSR(STAR);
234 case MSR_LSTAR:
235 if ( !is_canonical_address(msr_content) )
236 goto uncanonical_address;
237 WRITE_MSR(LSTAR);
239 case MSR_CSTAR:
240 if ( !is_canonical_address(msr_content) )
241 goto uncanonical_address;
242 v->arch.hvm_vmx.cstar = msr_content;
243 break;
245 case MSR_SYSCALL_MASK:
246 WRITE_MSR(SYSCALL_MASK);
248 default:
249 return HNDL_unhandled;
250 }
252 return HNDL_done;
254 uncanonical_address:
255 HVM_DBG_LOG(DBG_LEVEL_0, "Non-canonical address of MSR write %x", ecx);
256 gp_fault:
257 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
258 exception_raised:
259 return HNDL_exception_raised;
260 }
262 /*
263 * To avoid MSR save/restore at every VM exit/entry time, we restore
264 * the x86_64 specific MSRs at domain switch time. Since these MSRs
265 * are not modified once set for para domains, we don't save them,
266 * but simply reset them to values set in percpu_traps_init().
267 */
268 static void vmx_restore_host_msrs(void)
269 {
270 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
271 int i;
273 while ( host_msr_state->flags )
274 {
275 i = find_first_set_bit(host_msr_state->flags);
276 wrmsrl(msr_index[i], host_msr_state->msrs[i]);
277 clear_bit(i, &host_msr_state->flags);
278 }
280 if ( cpu_has_nx && !(read_efer() & EFER_NX) )
281 write_efer(read_efer() | EFER_NX);
282 }
284 static void vmx_save_guest_msrs(struct vcpu *v)
285 {
286 /* MSR_SHADOW_GS_BASE may have been changed by swapgs instruction. */
287 rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
288 }
290 static void vmx_restore_guest_msrs(struct vcpu *v)
291 {
292 struct vmx_msr_state *guest_msr_state, *host_msr_state;
293 unsigned long guest_flags;
294 int i;
296 guest_msr_state = &v->arch.hvm_vmx.msr_state;
297 host_msr_state = &this_cpu(host_msr_state);
299 wrmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
301 guest_flags = guest_msr_state->flags;
303 while ( guest_flags )
304 {
305 i = find_first_set_bit(guest_flags);
307 HVM_DBG_LOG(DBG_LEVEL_2,
308 "restore guest's index %d msr %x with value %lx",
309 i, msr_index[i], guest_msr_state->msrs[i]);
310 set_bit(i, &host_msr_state->flags);
311 wrmsrl(msr_index[i], guest_msr_state->msrs[i]);
312 clear_bit(i, &guest_flags);
313 }
315 if ( (v->arch.hvm_vcpu.guest_efer ^ read_efer()) & (EFER_NX | EFER_SCE) )
316 {
317 HVM_DBG_LOG(DBG_LEVEL_2,
318 "restore guest's EFER with value %lx",
319 v->arch.hvm_vcpu.guest_efer);
320 write_efer((read_efer() & ~(EFER_NX | EFER_SCE)) |
321 (v->arch.hvm_vcpu.guest_efer & (EFER_NX | EFER_SCE)));
322 }
323 }
325 #else /* __i386__ */
327 #define vmx_save_host_msrs() ((void)0)
329 static void vmx_restore_host_msrs(void)
330 {
331 if ( cpu_has_nx && !(read_efer() & EFER_NX) )
332 write_efer(read_efer() | EFER_NX);
333 }
335 #define vmx_save_guest_msrs(v) ((void)0)
337 static void vmx_restore_guest_msrs(struct vcpu *v)
338 {
339 if ( (v->arch.hvm_vcpu.guest_efer ^ read_efer()) & EFER_NX )
340 {
341 HVM_DBG_LOG(DBG_LEVEL_2,
342 "restore guest's EFER with value %lx",
343 v->arch.hvm_vcpu.guest_efer);
344 write_efer((read_efer() & ~EFER_NX) |
345 (v->arch.hvm_vcpu.guest_efer & EFER_NX));
346 }
347 }
349 static enum handler_return long_mode_do_msr_read(struct cpu_user_regs *regs)
350 {
351 u64 msr_content = 0;
352 struct vcpu *v = current;
354 switch ( regs->ecx )
355 {
356 case MSR_EFER:
357 msr_content = v->arch.hvm_vcpu.guest_efer;
358 break;
360 default:
361 return HNDL_unhandled;
362 }
364 regs->eax = msr_content >> 0;
365 regs->edx = msr_content >> 32;
367 return HNDL_done;
368 }
370 static enum handler_return long_mode_do_msr_write(struct cpu_user_regs *regs)
371 {
372 u64 msr_content = regs->eax | ((u64)regs->edx << 32);
374 switch ( regs->ecx )
375 {
376 case MSR_EFER:
377 if ( !hvm_set_efer(msr_content) )
378 return HNDL_exception_raised;
379 break;
381 default:
382 return HNDL_unhandled;
383 }
385 return HNDL_done;
386 }
388 #endif /* __i386__ */
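/*
 * Report the guest's execution mode as an operand size: 0 if the guest is not
 * in protected mode, 1 for virtual-8086 mode, 8 for 64-bit mode, otherwise
 * 4 or 2 according to the CS descriptor's default-operand-size bit.
 */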
390 static int vmx_guest_x86_mode(struct vcpu *v)
391 {
392 unsigned int cs_ar_bytes;
394 if ( unlikely(!(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE)) )
395 return 0;
396 if ( unlikely(guest_cpu_user_regs()->eflags & X86_EFLAGS_VM) )
397 return 1;
398 cs_ar_bytes = __vmread(GUEST_CS_AR_BYTES);
399 if ( hvm_long_mode_enabled(v) &&
400 likely(cs_ar_bytes & X86_SEG_AR_CS_LM_ACTIVE) )
401 return 8;
402 return (likely(cs_ar_bytes & X86_SEG_AR_DEF_OP_SIZE) ? 4 : 2);
403 }
405 static void vmx_save_dr(struct vcpu *v)
406 {
407 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
408 return;
410 /* Clear the DR dirty flag and re-enable intercepts for DR accesses. */
411 v->arch.hvm_vcpu.flag_dr_dirty = 0;
412 v->arch.hvm_vmx.exec_control |= CPU_BASED_MOV_DR_EXITING;
413 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
415 v->arch.guest_context.debugreg[0] = read_debugreg(0);
416 v->arch.guest_context.debugreg[1] = read_debugreg(1);
417 v->arch.guest_context.debugreg[2] = read_debugreg(2);
418 v->arch.guest_context.debugreg[3] = read_debugreg(3);
419 v->arch.guest_context.debugreg[6] = read_debugreg(6);
420 /* DR7 must be saved as it is used by vmx_restore_dr(). */
421 v->arch.guest_context.debugreg[7] = __vmread(GUEST_DR7);
422 }
424 static void __restore_debug_registers(struct vcpu *v)
425 {
426 if ( v->arch.hvm_vcpu.flag_dr_dirty )
427 return;
429 v->arch.hvm_vcpu.flag_dr_dirty = 1;
431 write_debugreg(0, v->arch.guest_context.debugreg[0]);
432 write_debugreg(1, v->arch.guest_context.debugreg[1]);
433 write_debugreg(2, v->arch.guest_context.debugreg[2]);
434 write_debugreg(3, v->arch.guest_context.debugreg[3]);
435 write_debugreg(6, v->arch.guest_context.debugreg[6]);
436 /* DR7 is loaded from the VMCS. */
437 }
439 /*
440 * DR7 is saved and restored on every vmexit. Other debug registers only
441 * need to be restored if their value is going to affect execution -- i.e.,
442 * if one of the breakpoints is enabled. So mask out all bits that don't
443 * enable some breakpoint functionality.
444 */
445 static void vmx_restore_dr(struct vcpu *v)
446 {
447 /* NB. __vmread() is not usable here, so we cannot read from the VMCS. */
448 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
449 __restore_debug_registers(v);
450 }
452 void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
453 {
454 uint32_t ev;
456 vmx_vmcs_enter(v);
458 c->cr0 = v->arch.hvm_vcpu.guest_cr[0];
459 c->cr2 = v->arch.hvm_vcpu.guest_cr[2];
460 c->cr3 = v->arch.hvm_vcpu.guest_cr[3];
461 c->cr4 = v->arch.hvm_vcpu.guest_cr[4];
463 c->msr_efer = v->arch.hvm_vcpu.guest_efer;
465 c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
466 c->idtr_base = __vmread(GUEST_IDTR_BASE);
468 c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
469 c->gdtr_base = __vmread(GUEST_GDTR_BASE);
471 c->cs_sel = __vmread(GUEST_CS_SELECTOR);
472 c->cs_limit = __vmread(GUEST_CS_LIMIT);
473 c->cs_base = __vmread(GUEST_CS_BASE);
474 c->cs_arbytes = __vmread(GUEST_CS_AR_BYTES);
476 c->ds_sel = __vmread(GUEST_DS_SELECTOR);
477 c->ds_limit = __vmread(GUEST_DS_LIMIT);
478 c->ds_base = __vmread(GUEST_DS_BASE);
479 c->ds_arbytes = __vmread(GUEST_DS_AR_BYTES);
481 c->es_sel = __vmread(GUEST_ES_SELECTOR);
482 c->es_limit = __vmread(GUEST_ES_LIMIT);
483 c->es_base = __vmread(GUEST_ES_BASE);
484 c->es_arbytes = __vmread(GUEST_ES_AR_BYTES);
486 c->ss_sel = __vmread(GUEST_SS_SELECTOR);
487 c->ss_limit = __vmread(GUEST_SS_LIMIT);
488 c->ss_base = __vmread(GUEST_SS_BASE);
489 c->ss_arbytes = __vmread(GUEST_SS_AR_BYTES);
491 c->fs_sel = __vmread(GUEST_FS_SELECTOR);
492 c->fs_limit = __vmread(GUEST_FS_LIMIT);
493 c->fs_base = __vmread(GUEST_FS_BASE);
494 c->fs_arbytes = __vmread(GUEST_FS_AR_BYTES);
496 c->gs_sel = __vmread(GUEST_GS_SELECTOR);
497 c->gs_limit = __vmread(GUEST_GS_LIMIT);
498 c->gs_base = __vmread(GUEST_GS_BASE);
499 c->gs_arbytes = __vmread(GUEST_GS_AR_BYTES);
501 c->tr_sel = __vmread(GUEST_TR_SELECTOR);
502 c->tr_limit = __vmread(GUEST_TR_LIMIT);
503 c->tr_base = __vmread(GUEST_TR_BASE);
504 c->tr_arbytes = __vmread(GUEST_TR_AR_BYTES);
506 c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR);
507 c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT);
508 c->ldtr_base = __vmread(GUEST_LDTR_BASE);
509 c->ldtr_arbytes = __vmread(GUEST_LDTR_AR_BYTES);
511 c->sysenter_cs = __vmread(GUEST_SYSENTER_CS);
512 c->sysenter_esp = __vmread(GUEST_SYSENTER_ESP);
513 c->sysenter_eip = __vmread(GUEST_SYSENTER_EIP);
515 c->pending_event = 0;
516 c->error_code = 0;
517 if ( ((ev = __vmread(VM_ENTRY_INTR_INFO)) & INTR_INFO_VALID_MASK) &&
518 hvm_event_needs_reinjection((ev >> 8) & 7, ev & 0xff) )
519 {
520 c->pending_event = ev;
521 c->error_code = __vmread(VM_ENTRY_EXCEPTION_ERROR_CODE);
522 }
524 vmx_vmcs_exit(v);
525 }
527 static int vmx_restore_cr0_cr3(
528 struct vcpu *v, unsigned long cr0, unsigned long cr3)
529 {
530 unsigned long mfn = 0;
531 p2m_type_t p2mt;
533 if ( cr0 & X86_CR0_PG )
534 {
535 mfn = mfn_x(gfn_to_mfn(v->domain, cr3 >> PAGE_SHIFT, &p2mt));
536 if ( !p2m_is_ram(p2mt) || !get_page(mfn_to_page(mfn), v->domain) )
537 {
538 gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%lx\n", cr3);
539 return -EINVAL;
540 }
541 }
543 if ( v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PG )
544 put_page(pagetable_get_page(v->arch.guest_table));
546 v->arch.guest_table = pagetable_from_pfn(mfn);
548 v->arch.hvm_vcpu.guest_cr[0] = cr0 | X86_CR0_ET;
549 v->arch.hvm_vcpu.guest_cr[3] = cr3;
551 return 0;
552 }
554 int vmx_vmcs_restore(struct vcpu *v, struct hvm_hw_cpu *c)
555 {
556 int rc;
558 if ( c->pending_valid &&
559 ((c->pending_type == 1) || (c->pending_type > 6) ||
560 (c->pending_reserved != 0)) )
561 {
562 gdprintk(XENLOG_ERR, "Invalid pending event 0x%"PRIx32".\n",
563 c->pending_event);
564 return -EINVAL;
565 }
567 rc = vmx_restore_cr0_cr3(v, c->cr0, c->cr3);
568 if ( rc )
569 return rc;
571 vmx_vmcs_enter(v);
573 v->arch.hvm_vcpu.guest_cr[2] = c->cr2;
574 v->arch.hvm_vcpu.guest_cr[4] = c->cr4;
575 vmx_update_guest_cr(v, 0);
576 vmx_update_guest_cr(v, 2);
577 vmx_update_guest_cr(v, 4);
579 #ifdef HVM_DEBUG_SUSPEND
580 printk("%s: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
581 __func__, c->cr3, c->cr0, c->cr4);
582 #endif
584 v->arch.hvm_vcpu.guest_efer = c->msr_efer;
585 vmx_update_guest_efer(v);
587 __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
588 __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
590 __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
591 __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
593 __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
594 __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
595 __vmwrite(GUEST_CS_BASE, c->cs_base);
596 __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes);
598 __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
599 __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
600 __vmwrite(GUEST_DS_BASE, c->ds_base);
601 __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes);
603 __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
604 __vmwrite(GUEST_ES_LIMIT, c->es_limit);
605 __vmwrite(GUEST_ES_BASE, c->es_base);
606 __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes);
608 __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
609 __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
610 __vmwrite(GUEST_SS_BASE, c->ss_base);
611 __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes);
613 __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
614 __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
615 __vmwrite(GUEST_FS_BASE, c->fs_base);
616 __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes);
618 __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
619 __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
620 __vmwrite(GUEST_GS_BASE, c->gs_base);
621 __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes);
623 __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
624 __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
625 __vmwrite(GUEST_TR_BASE, c->tr_base);
626 __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes);
628 __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
629 __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
630 __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
631 __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes);
633 __vmwrite(GUEST_SYSENTER_CS, c->sysenter_cs);
634 __vmwrite(GUEST_SYSENTER_ESP, c->sysenter_esp);
635 __vmwrite(GUEST_SYSENTER_EIP, c->sysenter_eip);
637 __vmwrite(GUEST_DR7, c->dr7);
639 vmx_vmcs_exit(v);
641 paging_update_paging_modes(v);
643 if ( c->pending_valid )
644 {
645 gdprintk(XENLOG_INFO, "Re-injecting 0x%"PRIx32", 0x%"PRIx32"\n",
646 c->pending_event, c->error_code);
648 if ( hvm_event_needs_reinjection(c->pending_type, c->pending_vector) )
649 {
650 vmx_vmcs_enter(v);
651 __vmwrite(VM_ENTRY_INTR_INFO, c->pending_event);
652 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, c->error_code);
653 vmx_vmcs_exit(v);
654 }
655 }
657 return 0;
658 }
660 #if defined(__x86_64__) && defined(HVM_DEBUG_SUSPEND)
661 static void dump_msr_state(struct vmx_msr_state *m)
662 {
663 int i = 0;
664 printk("**** msr state ****\n");
665 printk("shadow_gs=0x%lx, flags=0x%lx, msr_items:", m->shadow_gs, m->flags);
666 for ( i = 0; i < VMX_MSR_COUNT; i++ )
667 printk("0x%lx,", m->msrs[i]);
668 printk("\n");
669 }
670 #else
671 #define dump_msr_state(m) ((void)0)
672 #endif
674 static void vmx_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
675 {
676 #ifdef __x86_64__
677 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
678 unsigned long guest_flags = guest_state->flags;
680 data->shadow_gs = v->arch.hvm_vmx.shadow_gs;
681 data->msr_cstar = v->arch.hvm_vmx.cstar;
683 /* save msrs */
684 data->msr_flags = guest_flags;
685 data->msr_lstar = guest_state->msrs[VMX_INDEX_MSR_LSTAR];
686 data->msr_star = guest_state->msrs[VMX_INDEX_MSR_STAR];
687 data->msr_syscall_mask = guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
688 #endif
690 data->tsc = hvm_get_guest_time(v);
692 dump_msr_state(guest_state);
693 }
695 static void vmx_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
696 {
697 #ifdef __x86_64__
698 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
700 /* restore msrs */
701 guest_state->flags = data->msr_flags;
702 guest_state->msrs[VMX_INDEX_MSR_LSTAR] = data->msr_lstar;
703 guest_state->msrs[VMX_INDEX_MSR_STAR] = data->msr_star;
704 guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK] = data->msr_syscall_mask;
706 v->arch.hvm_vmx.cstar = data->msr_cstar;
707 v->arch.hvm_vmx.shadow_gs = data->shadow_gs;
708 #endif
710 #ifdef VMXASSIST
711 v->arch.hvm_vmx.vmxassist_enabled = !(data->cr0 & X86_CR0_PE);
712 #endif
714 hvm_set_guest_time(v, data->tsc);
716 dump_msr_state(guest_state);
717 }
720 static void vmx_save_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
721 {
722 vmx_save_cpu_state(v, ctxt);
723 vmx_vmcs_save(v, ctxt);
724 }
726 static int vmx_load_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
727 {
728 vmx_load_cpu_state(v, ctxt);
730 if ( vmx_vmcs_restore(v, ctxt) )
731 {
732 gdprintk(XENLOG_ERR, "vmx_vmcs restore failed!\n");
733 domain_crash(v->domain);
734 return -EINVAL;
735 }
737 return 0;
738 }
740 static void vmx_ctxt_switch_from(struct vcpu *v)
741 {
742 vmx_save_guest_msrs(v);
743 vmx_restore_host_msrs();
744 vmx_save_dr(v);
745 }
747 static void vmx_ctxt_switch_to(struct vcpu *v)
748 {
749 /* HOST_CR4 in VMCS is always mmu_cr4_features. Sync CR4 now. */
750 if ( unlikely(read_cr4() != mmu_cr4_features) )
751 write_cr4(mmu_cr4_features);
753 vmx_restore_guest_msrs(v);
754 vmx_restore_dr(v);
755 }
757 static unsigned long vmx_get_segment_base(struct vcpu *v, enum x86_segment seg)
758 {
759 unsigned long base = 0;
760 int long_mode = 0;
762 ASSERT(v == current);
764 if ( hvm_long_mode_enabled(v) &&
765 (__vmread(GUEST_CS_AR_BYTES) & X86_SEG_AR_CS_LM_ACTIVE) )
766 long_mode = 1;
768 switch ( seg )
769 {
770 case x86_seg_cs: if ( !long_mode ) base = __vmread(GUEST_CS_BASE); break;
771 case x86_seg_ds: if ( !long_mode ) base = __vmread(GUEST_DS_BASE); break;
772 case x86_seg_es: if ( !long_mode ) base = __vmread(GUEST_ES_BASE); break;
773 case x86_seg_fs: base = __vmread(GUEST_FS_BASE); break;
774 case x86_seg_gs: base = __vmread(GUEST_GS_BASE); break;
775 case x86_seg_ss: if ( !long_mode ) base = __vmread(GUEST_SS_BASE); break;
776 case x86_seg_tr: base = __vmread(GUEST_TR_BASE); break;
777 case x86_seg_gdtr: base = __vmread(GUEST_GDTR_BASE); break;
778 case x86_seg_idtr: base = __vmread(GUEST_IDTR_BASE); break;
779 case x86_seg_ldtr: base = __vmread(GUEST_LDTR_BASE); break;
780 default: BUG(); break;
781 }
783 return base;
784 }
786 static void vmx_get_segment_register(struct vcpu *v, enum x86_segment seg,
787 struct segment_register *reg)
788 {
789 uint32_t attr = 0;
791 ASSERT(v == current);
793 switch ( seg )
794 {
795 case x86_seg_cs:
796 reg->sel = __vmread(GUEST_CS_SELECTOR);
797 reg->limit = __vmread(GUEST_CS_LIMIT);
798 reg->base = __vmread(GUEST_CS_BASE);
799 attr = __vmread(GUEST_CS_AR_BYTES);
800 break;
801 case x86_seg_ds:
802 reg->sel = __vmread(GUEST_DS_SELECTOR);
803 reg->limit = __vmread(GUEST_DS_LIMIT);
804 reg->base = __vmread(GUEST_DS_BASE);
805 attr = __vmread(GUEST_DS_AR_BYTES);
806 break;
807 case x86_seg_es:
808 reg->sel = __vmread(GUEST_ES_SELECTOR);
809 reg->limit = __vmread(GUEST_ES_LIMIT);
810 reg->base = __vmread(GUEST_ES_BASE);
811 attr = __vmread(GUEST_ES_AR_BYTES);
812 break;
813 case x86_seg_fs:
814 reg->sel = __vmread(GUEST_FS_SELECTOR);
815 reg->limit = __vmread(GUEST_FS_LIMIT);
816 reg->base = __vmread(GUEST_FS_BASE);
817 attr = __vmread(GUEST_FS_AR_BYTES);
818 break;
819 case x86_seg_gs:
820 reg->sel = __vmread(GUEST_GS_SELECTOR);
821 reg->limit = __vmread(GUEST_GS_LIMIT);
822 reg->base = __vmread(GUEST_GS_BASE);
823 attr = __vmread(GUEST_GS_AR_BYTES);
824 break;
825 case x86_seg_ss:
826 reg->sel = __vmread(GUEST_SS_SELECTOR);
827 reg->limit = __vmread(GUEST_SS_LIMIT);
828 reg->base = __vmread(GUEST_SS_BASE);
829 attr = __vmread(GUEST_SS_AR_BYTES);
830 break;
831 case x86_seg_tr:
832 reg->sel = __vmread(GUEST_TR_SELECTOR);
833 reg->limit = __vmread(GUEST_TR_LIMIT);
834 reg->base = __vmread(GUEST_TR_BASE);
835 attr = __vmread(GUEST_TR_AR_BYTES);
836 break;
837 case x86_seg_gdtr:
838 reg->limit = __vmread(GUEST_GDTR_LIMIT);
839 reg->base = __vmread(GUEST_GDTR_BASE);
840 break;
841 case x86_seg_idtr:
842 reg->limit = __vmread(GUEST_IDTR_LIMIT);
843 reg->base = __vmread(GUEST_IDTR_BASE);
844 break;
845 case x86_seg_ldtr:
846 reg->sel = __vmread(GUEST_LDTR_SELECTOR);
847 reg->limit = __vmread(GUEST_LDTR_LIMIT);
848 reg->base = __vmread(GUEST_LDTR_BASE);
849 attr = __vmread(GUEST_LDTR_AR_BYTES);
850 break;
851 default:
852 BUG();
853 }
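/* The VMCS access-rights field holds type/S/DPL/P in bits 0-7 and
 * AVL/L/D/G in bits 12-15; fold these into the packed 12-bit attr format. */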
855 reg->attr.bytes = (attr & 0xff) | ((attr >> 4) & 0xf00);
856 /* Unusable flag is folded into Present flag. */
857 if ( attr & (1u<<16) )
858 reg->attr.fields.p = 0;
859 }
861 static void vmx_set_segment_register(struct vcpu *v, enum x86_segment seg,
862 struct segment_register *reg)
863 {
864 uint32_t attr;
866 ASSERT((v == current) || !vcpu_runnable(v));
868 attr = reg->attr.bytes;
869 attr = ((attr & 0xf00) << 4) | (attr & 0xff);
871 /* Not-present must mean unusable. */
872 if ( !reg->attr.fields.p )
873 attr |= (1u << 16);
875 vmx_vmcs_enter(v);
877 switch ( seg )
878 {
879 case x86_seg_cs:
880 __vmwrite(GUEST_CS_SELECTOR, reg->sel);
881 __vmwrite(GUEST_CS_LIMIT, reg->limit);
882 __vmwrite(GUEST_CS_BASE, reg->base);
883 __vmwrite(GUEST_CS_AR_BYTES, attr);
884 break;
885 case x86_seg_ds:
886 __vmwrite(GUEST_DS_SELECTOR, reg->sel);
887 __vmwrite(GUEST_DS_LIMIT, reg->limit);
888 __vmwrite(GUEST_DS_BASE, reg->base);
889 __vmwrite(GUEST_DS_AR_BYTES, attr);
890 break;
891 case x86_seg_es:
892 __vmwrite(GUEST_ES_SELECTOR, reg->sel);
893 __vmwrite(GUEST_ES_LIMIT, reg->limit);
894 __vmwrite(GUEST_ES_BASE, reg->base);
895 __vmwrite(GUEST_ES_AR_BYTES, attr);
896 break;
897 case x86_seg_fs:
898 __vmwrite(GUEST_FS_SELECTOR, reg->sel);
899 __vmwrite(GUEST_FS_LIMIT, reg->limit);
900 __vmwrite(GUEST_FS_BASE, reg->base);
901 __vmwrite(GUEST_FS_AR_BYTES, attr);
902 break;
903 case x86_seg_gs:
904 __vmwrite(GUEST_GS_SELECTOR, reg->sel);
905 __vmwrite(GUEST_GS_LIMIT, reg->limit);
906 __vmwrite(GUEST_GS_BASE, reg->base);
907 __vmwrite(GUEST_GS_AR_BYTES, attr);
908 break;
909 case x86_seg_ss:
910 __vmwrite(GUEST_SS_SELECTOR, reg->sel);
911 __vmwrite(GUEST_SS_LIMIT, reg->limit);
912 __vmwrite(GUEST_SS_BASE, reg->base);
913 __vmwrite(GUEST_SS_AR_BYTES, attr);
914 break;
915 case x86_seg_tr:
916 __vmwrite(GUEST_TR_SELECTOR, reg->sel);
917 __vmwrite(GUEST_TR_LIMIT, reg->limit);
918 __vmwrite(GUEST_TR_BASE, reg->base);
919 __vmwrite(GUEST_TR_AR_BYTES, attr);
920 break;
921 case x86_seg_gdtr:
922 __vmwrite(GUEST_GDTR_LIMIT, reg->limit);
923 __vmwrite(GUEST_GDTR_BASE, reg->base);
924 break;
925 case x86_seg_idtr:
926 __vmwrite(GUEST_IDTR_LIMIT, reg->limit);
927 __vmwrite(GUEST_IDTR_BASE, reg->base);
928 break;
929 case x86_seg_ldtr:
930 __vmwrite(GUEST_LDTR_SELECTOR, reg->sel);
931 __vmwrite(GUEST_LDTR_LIMIT, reg->limit);
932 __vmwrite(GUEST_LDTR_BASE, reg->base);
933 __vmwrite(GUEST_LDTR_AR_BYTES, attr);
934 break;
935 default:
936 BUG();
937 }
939 vmx_vmcs_exit(v);
940 }
942 /* Make sure that Xen intercepts any FP accesses from the current vcpu. */
943 static void vmx_stts(struct vcpu *v)
944 {
945 /* VMX depends on operating on the current vcpu */
946 ASSERT(v == current);
948 /*
949 * If the guest does not have TS enabled then we must cause and handle an
950 * exception on first use of the FPU. If the guest *does* have TS enabled
951 * then this is not necessary: no FPU activity can occur until the guest
952 * clears CR0.TS, and we will initialise the FPU when that happens.
953 */
954 if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
955 {
956 v->arch.hvm_vcpu.hw_cr[0] |= X86_CR0_TS;
957 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
958 __vm_set_bit(EXCEPTION_BITMAP, TRAP_no_device);
959 }
960 }
962 static void vmx_set_tsc_offset(struct vcpu *v, u64 offset)
963 {
964 vmx_vmcs_enter(v);
965 __vmwrite(TSC_OFFSET, offset);
966 #if defined (__i386__)
967 __vmwrite(TSC_OFFSET_HIGH, offset >> 32);
968 #endif
969 vmx_vmcs_exit(v);
970 }
972 void do_nmi(struct cpu_user_regs *);
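/* Each 32-byte stub in the hypercall page loads the hypercall number into
 * %eax, executes VMCALL, and returns. */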
974 static void vmx_init_hypercall_page(struct domain *d, void *hypercall_page)
975 {
976 char *p;
977 int i;
979 for ( i = 0; i < (PAGE_SIZE / 32); i++ )
980 {
981 p = (char *)(hypercall_page + (i * 32));
982 *(u8 *)(p + 0) = 0xb8; /* mov imm32, %eax */
983 *(u32 *)(p + 1) = i;
984 *(u8 *)(p + 5) = 0x0f; /* vmcall */
985 *(u8 *)(p + 6) = 0x01;
986 *(u8 *)(p + 7) = 0xc1;
987 *(u8 *)(p + 8) = 0xc3; /* ret */
988 }
990 /* Don't support HYPERVISOR_iret at the moment */
991 *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
992 }
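/*
 * Work out what is blocking delivery of the pending interrupt: an STI or
 * MOV-SS interruptibility shadow, NMI blocking until the next IRET, or a
 * clear EFLAGS.IF for PIC/LAPIC interrupts.
 */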
994 static enum hvm_intblk vmx_interrupt_blocked(
995 struct vcpu *v, struct hvm_intack intack)
996 {
997 unsigned long intr_shadow;
999 intr_shadow = __vmread(GUEST_INTERRUPTIBILITY_INFO);
1001 if ( intr_shadow & (VMX_INTR_SHADOW_STI|VMX_INTR_SHADOW_MOV_SS) )
1002 return hvm_intblk_shadow;
1004 if ( intack.source == hvm_intsrc_nmi )
1005 return ((intr_shadow & VMX_INTR_SHADOW_NMI) ?
1006 hvm_intblk_nmi_iret : hvm_intblk_none);
1008 ASSERT((intack.source == hvm_intsrc_pic) ||
1009 (intack.source == hvm_intsrc_lapic));
1011 if ( !(guest_cpu_user_regs()->eflags & X86_EFLAGS_IF) )
1012 return hvm_intblk_rflags_ie;
1014 return hvm_intblk_none;
1017 static void vmx_update_host_cr3(struct vcpu *v)
1019 ASSERT((v == current) || !vcpu_runnable(v));
1020 vmx_vmcs_enter(v);
1021 __vmwrite(HOST_CR3, v->arch.cr3);
1022 vmx_vmcs_exit(v);
1025 static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr)
1027 ASSERT((v == current) || !vcpu_runnable(v));
1029 vmx_vmcs_enter(v);
1031 switch ( cr )
1033 case 0:
1034 /* TS cleared? Then initialise FPU now. */
1035 if ( (v == current) && !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) &&
1036 (v->arch.hvm_vcpu.hw_cr[0] & X86_CR0_TS) )
1038 setup_fpu(v);
1039 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
1042 v->arch.hvm_vcpu.hw_cr[0] =
1043 v->arch.hvm_vcpu.guest_cr[0] |
1044 X86_CR0_NE | X86_CR0_PG | X86_CR0_WP | X86_CR0_PE;
1045 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
1046 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[0]);
1047 break;
1048 case 2:
1049 /* CR2 is updated in exit stub. */
1050 break;
1051 case 3:
1052 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr[3]);
1053 break;
1054 case 4:
1055 v->arch.hvm_vcpu.hw_cr[4] =
1056 v->arch.hvm_vcpu.guest_cr[4] | HVM_CR4_HOST_MASK;
1057 __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
1058 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[4]);
1059 break;
1060 default:
1061 BUG();
1064 vmx_vmcs_exit(v);
1067 static void vmx_update_guest_efer(struct vcpu *v)
1069 #ifdef __x86_64__
1070 unsigned long vm_entry_value;
1072 ASSERT((v == current) || !vcpu_runnable(v));
1074 vmx_vmcs_enter(v);
1076 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
1077 if ( v->arch.hvm_vcpu.guest_efer & EFER_LMA )
1078 vm_entry_value |= VM_ENTRY_IA32E_MODE;
1079 else
1080 vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
1081 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1083 vmx_vmcs_exit(v);
1084 #endif
1086 if ( v == current )
1087 write_efer((read_efer() & ~(EFER_NX|EFER_SCE)) |
1088 (v->arch.hvm_vcpu.guest_efer & (EFER_NX|EFER_SCE)));
1091 static void vmx_flush_guest_tlbs(void)
1093 /* No tagged TLB support on VMX yet. The fact that we're in Xen
1094 * at all means any guest will have a clean TLB when it's next run,
1095 * because VMRESUME will flush it for us. */
1098 static void vmx_inject_exception(
1099 unsigned int trapnr, int errcode, unsigned long cr2)
1101 struct vcpu *curr = current;
1103 vmx_inject_hw_exception(curr, trapnr, errcode);
1105 if ( trapnr == TRAP_page_fault )
1106 curr->arch.hvm_vcpu.guest_cr[2] = cr2;
1108 if ( (trapnr == TRAP_debug) &&
1109 (guest_cpu_user_regs()->eflags & X86_EFLAGS_TF) )
1111 __restore_debug_registers(curr);
1112 write_debugreg(6, read_debugreg(6) | 0x4000);
1116 static int vmx_event_pending(struct vcpu *v)
1118 ASSERT(v == current);
1119 return (__vmread(VM_ENTRY_INTR_INFO) & INTR_INFO_VALID_MASK);
1122 static struct hvm_function_table vmx_function_table = {
1123 .name = "VMX",
1124 .domain_initialise = vmx_domain_initialise,
1125 .domain_destroy = vmx_domain_destroy,
1126 .vcpu_initialise = vmx_vcpu_initialise,
1127 .vcpu_destroy = vmx_vcpu_destroy,
1128 .save_cpu_ctxt = vmx_save_vmcs_ctxt,
1129 .load_cpu_ctxt = vmx_load_vmcs_ctxt,
1130 .interrupt_blocked = vmx_interrupt_blocked,
1131 .guest_x86_mode = vmx_guest_x86_mode,
1132 .get_segment_base = vmx_get_segment_base,
1133 .get_segment_register = vmx_get_segment_register,
1134 .set_segment_register = vmx_set_segment_register,
1135 .update_host_cr3 = vmx_update_host_cr3,
1136 .update_guest_cr = vmx_update_guest_cr,
1137 .update_guest_efer = vmx_update_guest_efer,
1138 .flush_guest_tlbs = vmx_flush_guest_tlbs,
1139 .stts = vmx_stts,
1140 .set_tsc_offset = vmx_set_tsc_offset,
1141 .inject_exception = vmx_inject_exception,
1142 .init_hypercall_page = vmx_init_hypercall_page,
1143 .event_pending = vmx_event_pending,
1144 .cpu_up = vmx_cpu_up,
1145 .cpu_down = vmx_cpu_down,
1146 };
1148 void start_vmx(void)
1150 static int bootstrapped;
1152 vmx_save_host_msrs();
1154 if ( bootstrapped )
1156 if ( hvm_enabled && !vmx_cpu_up() )
1158 printk("VMX: FATAL: failed to initialise CPU%d!\n",
1159 smp_processor_id());
1160 BUG();
1162 return;
1165 bootstrapped = 1;
1167 /* Xen does not fill x86_capability words except 0. */
1168 boot_cpu_data.x86_capability[4] = cpuid_ecx(1);
1170 if ( !test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability) )
1171 return;
1173 set_in_cr4(X86_CR4_VMXE);
1175 if ( !vmx_cpu_up() )
1177 printk("VMX: failed to initialise.\n");
1178 return;
1181 setup_vmcs_dump();
1183 hvm_enable(&vmx_function_table);
1186 /*
1187 * Not all cases receive a valid value in the VM-exit instruction length field.
1188 * Callers must know what they're doing!
1189 */
1190 static int __get_instruction_length(void)
1192 int len;
1193 len = __vmread(VM_EXIT_INSTRUCTION_LEN); /* Safe: callers audited */
1194 BUG_ON((len < 1) || (len > 15));
1195 return len;
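/* Advance EIP past the current instruction, clear RF and any STI/MOV-SS
 * interrupt shadow, and inject #DB if the guest is single-stepping (TF). */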
1198 static void __update_guest_eip(unsigned long inst_len)
1200 struct cpu_user_regs *regs = guest_cpu_user_regs();
1201 unsigned long x;
1203 regs->eip += inst_len;
1204 regs->eflags &= ~X86_EFLAGS_RF;
1206 x = __vmread(GUEST_INTERRUPTIBILITY_INFO);
1207 if ( x & (VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS) )
1209 x &= ~(VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS);
1210 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, x);
1213 if ( regs->eflags & X86_EFLAGS_TF )
1214 vmx_inject_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE, 0);
1217 static void vmx_do_no_device_fault(void)
1219 struct vcpu *v = current;
1221 setup_fpu(current);
1222 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
1224 /* Disable TS in guest CR0 unless the guest wants the exception too. */
1225 if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
1227 v->arch.hvm_vcpu.hw_cr[0] &= ~X86_CR0_TS;
1228 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
1232 #define bitmaskof(idx) (1U << ((idx) & 31))
1233 void vmx_cpuid_intercept(
1234 unsigned int *eax, unsigned int *ebx,
1235 unsigned int *ecx, unsigned int *edx)
1237 unsigned int input = *eax;
1239 #ifdef VMXASSIST
1240 if ( input == 0x40000003 )
1242 /*
1243 * NB. Unsupported interface for private use of VMXASSIST only.
1244 * Note that this leaf lives at <max-hypervisor-leaf> + 1.
1245 */
1246 u64 value = ((u64)*edx << 32) | (u32)*ecx;
1247 p2m_type_t p2mt;
1248 unsigned long mfn;
1249 struct vcpu *v = current;
1250 char *p;
1252 mfn = mfn_x(gfn_to_mfn_current(value >> PAGE_SHIFT, &p2mt));
1254 gdprintk(XENLOG_INFO, "Input address is 0x%"PRIx64".\n", value);
1256 /* 8-byte aligned valid pseudophys address from vmxassist, please. */
1257 if ( (value & 7) || !p2m_is_ram(p2mt) ||
1258 !v->arch.hvm_vmx.vmxassist_enabled )
1260 domain_crash(v->domain);
1261 return;
1263 ASSERT(mfn_valid(mfn));
1265 p = map_domain_page(mfn);
1266 value = *((uint64_t *)(p + (value & (PAGE_SIZE - 1))));
1267 unmap_domain_page(p);
1269 gdprintk(XENLOG_INFO, "Output value is 0x%"PRIx64".\n", value);
1270 *ecx = (u32)value;
1271 *edx = (u32)(value >> 32);
1272 return;
1274 #endif
1276 hvm_cpuid(input, eax, ebx, ecx, edx);
1278 switch ( input )
1280 case 0x00000001:
1281 *ecx &= ~VMX_VCPU_CPUID_L1_ECX_RESERVED;
1282 *ebx &= NUM_THREADS_RESET_MASK;
1283 *ecx &= ~(bitmaskof(X86_FEATURE_VMXE) |
1284 bitmaskof(X86_FEATURE_EST) |
1285 bitmaskof(X86_FEATURE_TM2) |
1286 bitmaskof(X86_FEATURE_CID) |
1287 bitmaskof(X86_FEATURE_PDCM) |
1288 bitmaskof(X86_FEATURE_DSCPL));
1289 *edx &= ~(bitmaskof(X86_FEATURE_HT) |
1290 bitmaskof(X86_FEATURE_ACPI) |
1291 bitmaskof(X86_FEATURE_ACC) |
1292 bitmaskof(X86_FEATURE_DS));
1293 break;
1295 case 0x00000004:
1296 cpuid_count(input, *ecx, eax, ebx, ecx, edx);
1297 *eax &= NUM_CORES_RESET_MASK;
1298 break;
1300 case 0x00000006:
1301 case 0x00000009:
1302 case 0x0000000A:
1303 *eax = *ebx = *ecx = *edx = 0;
1304 break;
1306 case 0x80000001:
1307 /* Only a few features are advertised in Intel's 0x80000001. */
1308 *ecx &= (bitmaskof(X86_FEATURE_LAHF_LM));
1309 *edx &= (bitmaskof(X86_FEATURE_NX) |
1310 bitmaskof(X86_FEATURE_LM) |
1311 bitmaskof(X86_FEATURE_SYSCALL));
1312 break;
1315 HVMTRACE_3D(CPUID, current, input,
1316 ((uint64_t)*eax << 32) | *ebx, ((uint64_t)*ecx << 32) | *edx);
1319 static void vmx_do_cpuid(struct cpu_user_regs *regs)
1321 unsigned int eax, ebx, ecx, edx;
1323 eax = regs->eax;
1324 ebx = regs->ebx;
1325 ecx = regs->ecx;
1326 edx = regs->edx;
1328 vmx_cpuid_intercept(&eax, &ebx, &ecx, &edx);
1330 regs->eax = eax;
1331 regs->ebx = ebx;
1332 regs->ecx = ecx;
1333 regs->edx = edx;
1336 #define CASE_GET_REG_P(REG, reg) \
1337 case REG_ ## REG: reg_p = (unsigned long *)&(regs->reg); break
1339 #ifdef __i386__
1340 #define CASE_EXTEND_GET_REG_P
1341 #else
1342 #define CASE_EXTEND_GET_REG_P \
1343 CASE_GET_REG_P(R8, r8); \
1344 CASE_GET_REG_P(R9, r9); \
1345 CASE_GET_REG_P(R10, r10); \
1346 CASE_GET_REG_P(R11, r11); \
1347 CASE_GET_REG_P(R12, r12); \
1348 CASE_GET_REG_P(R13, r13); \
1349 CASE_GET_REG_P(R14, r14); \
1350 CASE_GET_REG_P(R15, r15)
1351 #endif
1353 static void vmx_dr_access(unsigned long exit_qualification,
1354 struct cpu_user_regs *regs)
1356 struct vcpu *v = current;
1358 HVMTRACE_0D(DR_WRITE, v);
1360 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
1361 __restore_debug_registers(v);
1363 /* Allow guest direct access to DR registers */
1364 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MOV_DR_EXITING;
1365 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
1368 /*
1369 * Invalidate the TLB entry for va, and invalidate the shadow page table
1370 * entry corresponding to the address va.
1371 */
1372 static void vmx_do_invlpg(unsigned long va)
1374 struct vcpu *v = current;
1376 HVMTRACE_2D(INVLPG, v, /*invlpga=*/ 0, va);
1378 /*
1379 * We do the safest thing first, then try to update the shadow by
1380 * copying from the guest.
1381 */
1382 paging_invlpg(v, va);
1385 /* Get segment for OUTS according to guest instruction. */
1386 static enum x86_segment vmx_outs_get_segment(
1387 int long_mode, unsigned long eip, int inst_len)
1389 unsigned char inst[MAX_INST_LEN];
1390 enum x86_segment seg = x86_seg_ds;
1391 int i;
1392 extern int inst_copy_from_guest(unsigned char *, unsigned long, int);
1394 if ( likely(cpu_has_vmx_ins_outs_instr_info) )
1396 unsigned int instr_info = __vmread(VMX_INSTRUCTION_INFO);
1398 /* Get segment register according to bits 17:15. */
1399 switch ( (instr_info >> 15) & 7 )
1401 case 0: seg = x86_seg_es; break;
1402 case 1: seg = x86_seg_cs; break;
1403 case 2: seg = x86_seg_ss; break;
1404 case 3: seg = x86_seg_ds; break;
1405 case 4: seg = x86_seg_fs; break;
1406 case 5: seg = x86_seg_gs; break;
1407 default: BUG();
1410 goto out;
1413 if ( !long_mode )
1414 eip += __vmread(GUEST_CS_BASE);
1416 memset(inst, 0, MAX_INST_LEN);
1417 if ( inst_copy_from_guest(inst, eip, inst_len) != inst_len )
1419 gdprintk(XENLOG_ERR, "Failed to fetch guest instruction\n");
1420 domain_crash(current->domain);
1421 goto out;
1424 for ( i = 0; i < inst_len; i++ )
1426 switch ( inst[i] )
1428 case 0xf3: /* REPZ */
1429 case 0xf2: /* REPNZ */
1430 case 0xf0: /* LOCK */
1431 case 0x66: /* data32 */
1432 case 0x67: /* addr32 */
1433 #ifdef __x86_64__
1434 case 0x40 ... 0x4f: /* REX */
1435 #endif
1436 continue;
1437 case 0x2e: /* CS */
1438 seg = x86_seg_cs;
1439 continue;
1440 case 0x36: /* SS */
1441 seg = x86_seg_ss;
1442 continue;
1443 case 0x26: /* ES */
1444 seg = x86_seg_es;
1445 continue;
1446 case 0x64: /* FS */
1447 seg = x86_seg_fs;
1448 continue;
1449 case 0x65: /* GS */
1450 seg = x86_seg_gs;
1451 continue;
1452 case 0x3e: /* DS */
1453 seg = x86_seg_ds;
1454 continue;
1458 out:
1459 return seg;
1462 static int vmx_str_pio_check_descriptor(int long_mode, unsigned long eip,
1463 int inst_len, enum x86_segment seg,
1464 unsigned long *base, u32 *limit,
1465 u32 *ar_bytes)
1467 enum vmcs_field ar_field, base_field, limit_field;
1469 *base = 0;
1470 *limit = 0;
1471 if ( seg != x86_seg_es )
1472 seg = vmx_outs_get_segment(long_mode, eip, inst_len);
1474 switch ( seg )
1476 case x86_seg_cs:
1477 ar_field = GUEST_CS_AR_BYTES;
1478 base_field = GUEST_CS_BASE;
1479 limit_field = GUEST_CS_LIMIT;
1480 break;
1481 case x86_seg_ds:
1482 ar_field = GUEST_DS_AR_BYTES;
1483 base_field = GUEST_DS_BASE;
1484 limit_field = GUEST_DS_LIMIT;
1485 break;
1486 case x86_seg_es:
1487 ar_field = GUEST_ES_AR_BYTES;
1488 base_field = GUEST_ES_BASE;
1489 limit_field = GUEST_ES_LIMIT;
1490 break;
1491 case x86_seg_fs:
1492 ar_field = GUEST_FS_AR_BYTES;
1493 base_field = GUEST_FS_BASE;
1494 limit_field = GUEST_FS_LIMIT;
1495 break;
1496 case x86_seg_gs:
1497 ar_field = GUEST_GS_AR_BYTES;
1498 base_field = GUEST_GS_BASE;
1499 limit_field = GUEST_GS_LIMIT;
1500 break;
1501 case x86_seg_ss:
1502 ar_field = GUEST_SS_AR_BYTES;
1503 base_field = GUEST_SS_BASE;
1504 limit_field = GUEST_SS_LIMIT;
1505 break;
1506 default:
1507 BUG();
1508 return 0;
1511 if ( !long_mode || seg == x86_seg_fs || seg == x86_seg_gs )
1513 *base = __vmread(base_field);
1514 *limit = __vmread(limit_field);
1516 *ar_bytes = __vmread(ar_field);
1518 return !(*ar_bytes & X86_SEG_AR_SEG_UNUSABLE);
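/*
 * Check that a string PIO access fits within the segment limit, honouring
 * expand-up versus expand-down segments, and truncate the repeat count so
 * later iterations cannot fault; returns 0 if the first access already
 * violates the limit.
 */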
1522 static int vmx_str_pio_check_limit(u32 limit, unsigned int size,
1523 u32 ar_bytes, unsigned long addr,
1524 unsigned long base, int df,
1525 unsigned long *count)
1527 unsigned long ea = addr - base;
1529 /* Offset must be within limits. */
1530 ASSERT(ea == (u32)ea);
1531 if ( (u32)(ea + size - 1) < (u32)ea ||
1532 (ar_bytes & 0xc) != 0x4 ? ea + size - 1 > limit
1533 : ea <= limit )
1534 return 0;
1536 /* Check the limit for repeated instructions, as above we checked
1537 only the first instance. Truncate the count if a limit violation
1538 would occur. Note that the checking is not necessary for page
1539 granular segments as transfers crossing page boundaries will be
1540 broken up anyway. */
1541 if ( !(ar_bytes & X86_SEG_AR_GRANULARITY) && *count > 1 )
1543 if ( (ar_bytes & 0xc) != 0x4 )
1545 /* expand-up */
1546 if ( !df )
1548 if ( ea + *count * size - 1 < ea ||
1549 ea + *count * size - 1 > limit )
1550 *count = (limit + 1UL - ea) / size;
1552 else
1554 if ( *count - 1 > ea / size )
1555 *count = ea / size + 1;
1558 else
1560 /* expand-down */
1561 if ( !df )
1563 if ( *count - 1 > -(s32)ea / size )
1564 *count = -(s32)ea / size + 1UL;
1566 else
1568 if ( ea < (*count - 1) * size ||
1569 ea - (*count - 1) * size <= limit )
1570 *count = (ea - limit - 1) / size + 1;
1573 ASSERT(*count);
1576 return 1;
1579 #ifdef __x86_64__
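/* In long mode there are no segment limits: instead check that the address
 * is canonical and clamp the repeat count so the transfer stays within the
 * canonical address range. */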
1580 static int vmx_str_pio_lm_check_limit(struct cpu_user_regs *regs,
1581 unsigned int size,
1582 unsigned long addr,
1583 unsigned long *count)
1585 if ( !is_canonical_address(addr) ||
1586 !is_canonical_address(addr + size - 1) )
1587 return 0;
1589 if ( *count > (1UL << 48) / size )
1590 *count = (1UL << 48) / size;
1592 if ( !(regs->eflags & EF_DF) )
1594 if ( addr + *count * size - 1 < addr ||
1595 !is_canonical_address(addr + *count * size - 1) )
1596 *count = (addr & ~((1UL << 48) - 1)) / size;
1598 else
1600 if ( (*count - 1) * size > addr ||
1601 !is_canonical_address(addr + (*count - 1) * size) )
1602 *count = (addr & ~((1UL << 48) - 1)) / size + 1;
1605 ASSERT(*count);
1607 return 1;
1609 #endif
1611 static void vmx_send_str_pio(struct cpu_user_regs *regs,
1612 struct hvm_io_op *pio_opp,
1613 unsigned long inst_len, unsigned int port,
1614 int sign, unsigned int size, int dir,
1615 int df, unsigned long addr,
1616 unsigned long paddr, unsigned long count)
1618 /*
1619 * Handle string pio instructions that cross pages or that
1620 * are unaligned. See the comments in hvm_domain.c/handle_mmio()
1621 */
1622 if ( (addr & PAGE_MASK) != ((addr + size - 1) & PAGE_MASK) ) {
1623 unsigned long value = 0;
1625 pio_opp->flags |= OVERLAP;
1627 if ( dir == IOREQ_WRITE ) /* OUTS */
1629 if ( hvm_paging_enabled(current) )
1631 int rv = hvm_copy_from_guest_virt(&value, addr, size);
1632 if ( rv != 0 )
1634 /* Failed on the page-spanning copy. Inject PF into
1635 * the guest for the address where we failed. */
1636 addr += size - rv;
1637 gdprintk(XENLOG_DEBUG, "Pagefault reading non-io side "
1638 "of a page-spanning PIO: va=%#lx\n", addr);
1639 vmx_inject_exception(TRAP_page_fault, 0, addr);
1640 return;
1643 else
1644 (void) hvm_copy_from_guest_phys(&value, addr, size);
1645 } else /* dir != IOREQ_WRITE */
1646 /* Remember where to write the result, as a *VA*.
1647 * Must be a VA so we can handle the page overlap
1648 * correctly in hvm_pio_assist() */
1649 pio_opp->addr = addr;
1651 if ( count == 1 )
1652 regs->eip += inst_len;
1654 send_pio_req(port, 1, size, value, dir, df, 0);
1655 } else {
1656 unsigned long last_addr = sign > 0 ? addr + count * size - 1
1657 : addr - (count - 1) * size;
1659 if ( (addr & PAGE_MASK) != (last_addr & PAGE_MASK) )
1661 if ( sign > 0 )
1662 count = (PAGE_SIZE - (addr & ~PAGE_MASK)) / size;
1663 else
1664 count = (addr & ~PAGE_MASK) / size + 1;
1665 } else
1666 regs->eip += inst_len;
1668 send_pio_req(port, count, size, paddr, dir, df, 1);
1672 static void vmx_do_str_pio(unsigned long exit_qualification,
1673 unsigned long inst_len,
1674 struct cpu_user_regs *regs,
1675 struct hvm_io_op *pio_opp)
1677 unsigned int port, size;
1678 int dir, df, vm86;
1679 unsigned long addr, count = 1, base;
1680 paddr_t paddr;
1681 unsigned long gfn;
1682 u32 ar_bytes, limit, pfec;
1683 int sign;
1684 int long_mode = 0;
1686 vm86 = regs->eflags & X86_EFLAGS_VM ? 1 : 0;
1687 df = regs->eflags & X86_EFLAGS_DF ? 1 : 0;
1689 if ( test_bit(6, &exit_qualification) )
1690 port = (exit_qualification >> 16) & 0xFFFF;
1691 else
1692 port = regs->edx & 0xffff;
1694 size = (exit_qualification & 7) + 1;
1695 dir = test_bit(3, &exit_qualification); /* direction */
1697 if ( dir == IOREQ_READ )
1698 HVMTRACE_2D(IO_READ, current, port, size);
1699 else
1700 HVMTRACE_2D(IO_WRITE, current, port, size);
1702 sign = regs->eflags & X86_EFLAGS_DF ? -1 : 1;
1703 ar_bytes = __vmread(GUEST_CS_AR_BYTES);
1704 if ( hvm_long_mode_enabled(current) &&
1705 (ar_bytes & X86_SEG_AR_CS_LM_ACTIVE) )
1706 long_mode = 1;
1707 addr = __vmread(GUEST_LINEAR_ADDRESS);
1709 if ( test_bit(5, &exit_qualification) ) { /* "rep" prefix */
1710 pio_opp->flags |= REPZ;
1711 count = regs->ecx;
1712 if ( !long_mode &&
1713 (vm86 || !(ar_bytes & X86_SEG_AR_DEF_OP_SIZE)) )
1714 count &= 0xFFFF;
1717 /*
1718 * In protected mode, guest linear address is invalid if the
1719 * selector is null.
1720 */
1721 if ( !vmx_str_pio_check_descriptor(long_mode, regs->eip, inst_len,
1722 dir==IOREQ_WRITE ? x86_seg_ds :
1723 x86_seg_es, &base, &limit,
1724 &ar_bytes) ) {
1725 if ( !long_mode ) {
1726 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1727 return;
1729 addr = dir == IOREQ_WRITE ? base + regs->esi : regs->edi;
1732 if ( !long_mode )
1734 /* Segment must be readable for outs and writeable for ins. */
1735 if ( ((dir == IOREQ_WRITE)
1736 ? ((ar_bytes & 0xa) == 0x8)
1737 : ((ar_bytes & 0xa) != 0x2)) ||
1738 !vmx_str_pio_check_limit(limit, size, ar_bytes,
1739 addr, base, df, &count) )
1741 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1742 return;
1745 #ifdef __x86_64__
1746 else if ( !vmx_str_pio_lm_check_limit(regs, size, addr, &count) )
1748 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1749 return;
1751 #endif
1753 /* Translate the address to a physical address */
1754 pfec = PFEC_page_present;
1755 if ( dir == IOREQ_READ ) /* Read from PIO --> write to RAM */
1756 pfec |= PFEC_write_access;
1757 if ( ring_3(regs) )
1758 pfec |= PFEC_user_mode;
1759 gfn = paging_gva_to_gfn(current, addr, &pfec);
1760 if ( gfn == INVALID_GFN )
1762 /* The guest does not have the RAM address mapped.
1763 * Need to send in a page fault */
1764 vmx_inject_exception(TRAP_page_fault, pfec, addr);
1765 return;
1767 paddr = (paddr_t)gfn << PAGE_SHIFT | (addr & ~PAGE_MASK);
1769 vmx_send_str_pio(regs, pio_opp, inst_len, port, sign,
1770 size, dir, df, addr, paddr, count);
1773 static void vmx_io_instruction(unsigned long exit_qualification,
1774 unsigned long inst_len)
1776 struct cpu_user_regs *regs;
1777 struct hvm_io_op *pio_opp;
1779 pio_opp = &current->arch.hvm_vcpu.io_op;
1780 pio_opp->instr = INSTR_PIO;
1781 pio_opp->flags = 0;
1783 regs = &pio_opp->io_context;
1785 /* Copy current guest state into io instruction state structure. */
1786 memcpy(regs, guest_cpu_user_regs(), HVM_CONTEXT_STACK_BYTES);
1788 HVM_DBG_LOG(DBG_LEVEL_IO, "vm86 %d, eip=%x:%lx, "
1789 "exit_qualification = %lx",
1790 regs->eflags & X86_EFLAGS_VM ? 1 : 0,
1791 regs->cs, (unsigned long)regs->eip, exit_qualification);
1793 if ( test_bit(4, &exit_qualification) ) /* string instruction */
1794 vmx_do_str_pio(exit_qualification, inst_len, regs, pio_opp);
1795 else
1797 unsigned int port, size;
1798 int dir, df;
1800 df = regs->eflags & X86_EFLAGS_DF ? 1 : 0;
1802 if ( test_bit(6, &exit_qualification) )
1803 port = (exit_qualification >> 16) & 0xFFFF;
1804 else
1805 port = regs->edx & 0xffff;
1807 size = (exit_qualification & 7) + 1;
1808 dir = test_bit(3, &exit_qualification); /* direction */
1810 if ( dir == IOREQ_READ )
1811 HVMTRACE_2D(IO_READ, current, port, size);
1812 else
1813 HVMTRACE_3D(IO_WRITE, current, port, size, regs->eax);
1815 if ( port == 0xe9 && dir == IOREQ_WRITE && size == 1 )
1816 hvm_print_line(current, regs->eax); /* guest debug output */
1818 regs->eip += inst_len;
1819 send_pio_req(port, 1, size, regs->eax, dir, df, 0);
1823 #ifdef VMXASSIST
1825 static void vmx_world_save(struct vcpu *v, struct vmx_assist_context *c)
1827 struct cpu_user_regs *regs = guest_cpu_user_regs();
1829 c->eip = regs->eip;
1830 c->eip += __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
1831 c->esp = regs->esp;
1832 c->eflags = regs->eflags & ~X86_EFLAGS_RF;
1834 c->cr0 = v->arch.hvm_vcpu.guest_cr[0];
1835 c->cr3 = v->arch.hvm_vcpu.guest_cr[3];
1836 c->cr4 = v->arch.hvm_vcpu.guest_cr[4];
1838 c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
1839 c->idtr_base = __vmread(GUEST_IDTR_BASE);
1841 c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
1842 c->gdtr_base = __vmread(GUEST_GDTR_BASE);
1844 c->cs_sel = __vmread(GUEST_CS_SELECTOR);
1845 c->cs_limit = __vmread(GUEST_CS_LIMIT);
1846 c->cs_base = __vmread(GUEST_CS_BASE);
1847 c->cs_arbytes.bytes = __vmread(GUEST_CS_AR_BYTES);
1849 c->ds_sel = __vmread(GUEST_DS_SELECTOR);
1850 c->ds_limit = __vmread(GUEST_DS_LIMIT);
1851 c->ds_base = __vmread(GUEST_DS_BASE);
1852 c->ds_arbytes.bytes = __vmread(GUEST_DS_AR_BYTES);
1854 c->es_sel = __vmread(GUEST_ES_SELECTOR);
1855 c->es_limit = __vmread(GUEST_ES_LIMIT);
1856 c->es_base = __vmread(GUEST_ES_BASE);
1857 c->es_arbytes.bytes = __vmread(GUEST_ES_AR_BYTES);
1859 c->ss_sel = __vmread(GUEST_SS_SELECTOR);
1860 c->ss_limit = __vmread(GUEST_SS_LIMIT);
1861 c->ss_base = __vmread(GUEST_SS_BASE);
1862 c->ss_arbytes.bytes = __vmread(GUEST_SS_AR_BYTES);
1864 c->fs_sel = __vmread(GUEST_FS_SELECTOR);
1865 c->fs_limit = __vmread(GUEST_FS_LIMIT);
1866 c->fs_base = __vmread(GUEST_FS_BASE);
1867 c->fs_arbytes.bytes = __vmread(GUEST_FS_AR_BYTES);
1869 c->gs_sel = __vmread(GUEST_GS_SELECTOR);
1870 c->gs_limit = __vmread(GUEST_GS_LIMIT);
1871 c->gs_base = __vmread(GUEST_GS_BASE);
1872 c->gs_arbytes.bytes = __vmread(GUEST_GS_AR_BYTES);
1874 c->tr_sel = __vmread(GUEST_TR_SELECTOR);
1875 c->tr_limit = __vmread(GUEST_TR_LIMIT);
1876 c->tr_base = __vmread(GUEST_TR_BASE);
1877 c->tr_arbytes.bytes = __vmread(GUEST_TR_AR_BYTES);
1879 c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR);
1880 c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT);
1881 c->ldtr_base = __vmread(GUEST_LDTR_BASE);
1882 c->ldtr_arbytes.bytes = __vmread(GUEST_LDTR_AR_BYTES);
1885 static int vmx_world_restore(struct vcpu *v, struct vmx_assist_context *c)
1887 struct cpu_user_regs *regs = guest_cpu_user_regs();
1888 int rc;
1890 rc = vmx_restore_cr0_cr3(v, c->cr0, c->cr3);
1891 if ( rc )
1892 return rc;
1894 regs->eip = c->eip;
1895 regs->esp = c->esp;
1896 regs->eflags = c->eflags | 2;
1898 v->arch.hvm_vcpu.guest_cr[4] = c->cr4;
1899 vmx_update_guest_cr(v, 0);
1900 vmx_update_guest_cr(v, 4);
1902 __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
1903 __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
1905 __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
1906 __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
1908 __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
1909 __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
1910 __vmwrite(GUEST_CS_BASE, c->cs_base);
1911 __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes.bytes);
1913 __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
1914 __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
1915 __vmwrite(GUEST_DS_BASE, c->ds_base);
1916 __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes.bytes);
1918 __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
1919 __vmwrite(GUEST_ES_LIMIT, c->es_limit);
1920 __vmwrite(GUEST_ES_BASE, c->es_base);
1921 __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes.bytes);
1923 __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
1924 __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
1925 __vmwrite(GUEST_SS_BASE, c->ss_base);
1926 __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes.bytes);
1928 __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
1929 __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
1930 __vmwrite(GUEST_FS_BASE, c->fs_base);
1931 __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes.bytes);
1933 __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
1934 __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
1935 __vmwrite(GUEST_GS_BASE, c->gs_base);
1936 __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes.bytes);
1938 __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
1939 __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
1940 __vmwrite(GUEST_TR_BASE, c->tr_base);
1941 __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes.bytes);
1943 __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
1944 __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
1945 __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
1946 __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes.bytes);
1948 paging_update_paging_modes(v);
1949 return 0;
1952 enum { VMX_ASSIST_INVOKE = 0, VMX_ASSIST_RESTORE };
1954 static int vmx_assist(struct vcpu *v, int mode)
1956 struct vmx_assist_context c;
1957 struct hvm_hw_vpic *vpic = v->domain->arch.hvm_domain.vpic;
1958 u32 magic, cp;
1960 if ( hvm_copy_from_guest_phys(&magic, VMXASSIST_MAGIC_OFFSET,
1961 sizeof(magic)) )
1963 gdprintk(XENLOG_ERR, "No vmxassist: can't execute real mode code\n");
1964 domain_crash(v->domain);
1965 return 0;
1968 if ( magic != VMXASSIST_MAGIC )
1970 gdprintk(XENLOG_ERR, "vmxassist magic number does not match\n");
1971 domain_crash(v->domain);
1972 return 0;
1975 switch ( mode ) {
1976 /*
1977 * Transfer control to vmxassist.
1978 * Store the current context in VMXASSIST_OLD_CONTEXT and load
1979 * the new VMXASSIST_NEW_CONTEXT context. This context was created
1980 * by vmxassist and will transfer control to it.
1981 */
1982 case VMX_ASSIST_INVOKE:
1983 /* save the old context */
1984 if ( hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)) )
1985 goto error;
1986 if ( cp != 0 ) {
1987 vmx_world_save(v, &c);
1988 if ( hvm_copy_to_guest_phys(cp, &c, sizeof(c)) )
1989 goto error;
1992 /* restore the new context, this should activate vmxassist */
1993 if ( hvm_copy_from_guest_phys(&cp, VMXASSIST_NEW_CONTEXT, sizeof(cp)) )
1994 goto error;
1995 if ( cp != 0 ) {
1996 if ( hvm_copy_from_guest_phys(&c, cp, sizeof(c)) )
1997 goto error;
1998 if ( vmx_world_restore(v, &c) != 0 )
1999 goto error;
2000 v->arch.hvm_vmx.pm_irqbase[0] = vpic[0].irq_base;
2001 v->arch.hvm_vmx.pm_irqbase[1] = vpic[1].irq_base;
2002 vpic[0].irq_base = NR_EXCEPTION_HANDLER;
2003 vpic[1].irq_base = NR_EXCEPTION_HANDLER + 8;
2004 v->arch.hvm_vmx.vmxassist_enabled = 1;
2005 return 1;
2007 break;
2009 /*
2010 * Restore the VMXASSIST_OLD_CONTEXT that was saved by
2011 * VMX_ASSIST_INVOKE above.
2012 */
2013 case VMX_ASSIST_RESTORE:
2015 /* fetch the context saved at VMX_ASSIST_INVOKE time and restore it */
2015 if ( hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)) )
2016 goto error;
2017 if ( cp != 0 ) {
2018 if ( hvm_copy_from_guest_phys(&c, cp, sizeof(c)) )
2019 goto error;
2020 if ( vmx_world_restore(v, &c) != 0 )
2021 goto error;
2022 if ( v->arch.hvm_vmx.irqbase_mode ) {
2023 vpic[0].irq_base = c.rm_irqbase[0] & 0xf8;
2024 vpic[1].irq_base = c.rm_irqbase[1] & 0xf8;
2025 } else {
2026 vpic[0].irq_base = v->arch.hvm_vmx.pm_irqbase[0];
2027 vpic[1].irq_base = v->arch.hvm_vmx.pm_irqbase[1];
2029 v->arch.hvm_vmx.vmxassist_enabled = 0;
2030 return 1;
2032 break;
2035 error:
2036 gdprintk(XENLOG_ERR, "Failed to transfer to vmxassist\n");
2037 domain_crash(v->domain);
2038 return 0;
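/*
 * Handle a guest CR0 write: apply it via hvm_set_cr0() and, when CR0.PE
 * is toggled, world-switch into or out of vmxassist.  Returns 0 if the
 * guest EIP must not be advanced, 1 otherwise.
 */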
2041 static int vmx_set_cr0(unsigned long value)
2043 struct vcpu *v = current;
2045 if ( hvm_set_cr0(value) == 0 )
2046 return 0;
2048 /*
2049 * VMX does not implement real-mode virtualization. We emulate
2050 * real mode by performing a world switch to VMXAssist whenever
2051 * the guest disables the CR0.PE bit.
2052 */
2053 if ( !(value & X86_CR0_PE) )
2055 if ( vmx_assist(v, VMX_ASSIST_INVOKE) )
2056 return 0; /* do not update eip! */
2058 else if ( v->arch.hvm_vmx.vmxassist_enabled )
2060 if ( vmx_assist(v, VMX_ASSIST_RESTORE) )
2061 return 0; /* do not update eip! */
2064 return 1;
2067 #else /* !defined(VMXASSIST) */
2069 #define vmx_set_cr0(v) hvm_set_cr0(v)
2071 #endif
2073 #define CASE_SET_REG(REG, reg) \
2074 case REG_ ## REG: regs->reg = value; break
2075 #define CASE_GET_REG(REG, reg) \
2076 case REG_ ## REG: value = regs->reg; break
2078 #define CASE_EXTEND_SET_REG \
2079 CASE_EXTEND_REG(S)
2080 #define CASE_EXTEND_GET_REG \
2081 CASE_EXTEND_REG(G)
2083 #ifdef __i386__
2084 #define CASE_EXTEND_REG(T)
2085 #else
2086 #define CASE_EXTEND_REG(T) \
2087 CASE_ ## T ## ET_REG(R8, r8); \
2088 CASE_ ## T ## ET_REG(R9, r9); \
2089 CASE_ ## T ## ET_REG(R10, r10); \
2090 CASE_ ## T ## ET_REG(R11, r11); \
2091 CASE_ ## T ## ET_REG(R12, r12); \
2092 CASE_ ## T ## ET_REG(R13, r13); \
2093 CASE_ ## T ## ET_REG(R14, r14); \
2094 CASE_ ## T ## ET_REG(R15, r15)
2095 #endif
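/*
 * For illustration, CASE_GET_REG(EAX, eax) expands to
 *     case REG_EAX: value = regs->eax; break;
 * and, on x86_64, CASE_EXTEND_GET_REG adds the matching cases for
 * r8..r15 via CASE_EXTEND_REG(G); on i386 it expands to nothing.
 */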
2097 static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
2099 unsigned long value;
2100 struct vcpu *v = current;
2101 struct vlapic *vlapic = vcpu_vlapic(v);
2103 switch ( gp )
2105 CASE_GET_REG(EAX, eax);
2106 CASE_GET_REG(ECX, ecx);
2107 CASE_GET_REG(EDX, edx);
2108 CASE_GET_REG(EBX, ebx);
2109 CASE_GET_REG(EBP, ebp);
2110 CASE_GET_REG(ESI, esi);
2111 CASE_GET_REG(EDI, edi);
2112 CASE_GET_REG(ESP, esp);
2113 CASE_EXTEND_GET_REG;
2114 default:
2115 gdprintk(XENLOG_ERR, "invalid gp: %d\n", gp);
2116 goto exit_and_crash;
2119 HVMTRACE_2D(CR_WRITE, v, cr, value);
2121 HVM_DBG_LOG(DBG_LEVEL_1, "CR%d, value = %lx", cr, value);
2123 switch ( cr )
2125 case 0:
2126 return vmx_set_cr0(value);
2128 case 3:
2129 return hvm_set_cr3(value);
2131 case 4:
2132 return hvm_set_cr4(value);
2134 case 8:
2135 vlapic_set_reg(vlapic, APIC_TASKPRI, ((value & 0x0F) << 4));
2136 break;
2138 default:
2139 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
2140 goto exit_and_crash;
2143 return 1;
2145 exit_and_crash:
2146 domain_crash(v->domain);
2147 return 0;
2150 /*
2151 * Read from control registers. Only CR3 and CR8 (TPR) are handled here.
2152 */
2153 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
2155 unsigned long value = 0;
2156 struct vcpu *v = current;
2157 struct vlapic *vlapic = vcpu_vlapic(v);
2159 switch ( cr )
2161 case 3:
2162 value = (unsigned long)v->arch.hvm_vcpu.guest_cr[3];
2163 break;
2164 case 8:
2165 value = (unsigned long)vlapic_get_reg(vlapic, APIC_TASKPRI);
2166 value = (value & 0xF0) >> 4;
2167 break;
2168 default:
2169 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
2170 domain_crash(v->domain);
2171 break;
2174 switch ( gp ) {
2175 CASE_SET_REG(EAX, eax);
2176 CASE_SET_REG(ECX, ecx);
2177 CASE_SET_REG(EDX, edx);
2178 CASE_SET_REG(EBX, ebx);
2179 CASE_SET_REG(EBP, ebp);
2180 CASE_SET_REG(ESI, esi);
2181 CASE_SET_REG(EDI, edi);
2182 CASE_SET_REG(ESP, esp);
2183 CASE_EXTEND_SET_REG;
2184 default:
2185 gdprintk(XENLOG_ERR, "invalid gp: %d\n", gp);
2186 domain_crash(v->domain);
2187 break;
2190 HVMTRACE_2D(CR_READ, v, cr, value);
2192 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR%d, value = %lx", cr, value);
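/*
 * Dispatch a CR-access vmexit.  The exit qualification encodes the access
 * type (MOV to/from CRn, CLTS, LMSW), the control register number and,
 * for MOV, the general-purpose register operand.
 */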
2195 static int vmx_cr_access(unsigned long exit_qualification,
2196 struct cpu_user_regs *regs)
2198 unsigned int gp, cr;
2199 unsigned long value;
2200 struct vcpu *v = current;
2202 switch ( exit_qualification & CONTROL_REG_ACCESS_TYPE )
2204 case TYPE_MOV_TO_CR:
2205 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
2206 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
2207 return mov_to_cr(gp, cr, regs);
2208 case TYPE_MOV_FROM_CR:
2209 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
2210 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
2211 mov_from_cr(cr, gp, regs);
2212 break;
2213 case TYPE_CLTS:
2214 /* We initialise the FPU now, to avoid needing another vmexit. */
2215 setup_fpu(v);
2216 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
2218 v->arch.hvm_vcpu.hw_cr[0] &= ~X86_CR0_TS; /* clear TS */
2219 __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]);
2221 v->arch.hvm_vcpu.guest_cr[0] &= ~X86_CR0_TS; /* clear TS */
2222 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[0]);
2223 HVMTRACE_0D(CLTS, current);
2224 break;
2225 case TYPE_LMSW:
2226 value = v->arch.hvm_vcpu.guest_cr[0];
2227 value = (value & ~0xF) |
2228 (((exit_qualification & LMSW_SOURCE_DATA) >> 16) & 0xF);
2229 HVMTRACE_1D(LMSW, current, value);
2230 return vmx_set_cr0(value);
2231 default:
2232 BUG();
2235 return 1;
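/*
 * Last-branch-record MSR tables, one per supported CPU family/model.
 * Each entry names a base MSR and a count of consecutive MSRs; a {0, 0}
 * entry terminates the list.
 */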
2238 static const struct lbr_info {
2239 u32 base, count;
2240 } p4_lbr[] = {
2241 { MSR_P4_LER_FROM_LIP, 1 },
2242 { MSR_P4_LER_TO_LIP, 1 },
2243 { MSR_P4_LASTBRANCH_TOS, 1 },
2244 { MSR_P4_LASTBRANCH_0_FROM_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO },
2245 { MSR_P4_LASTBRANCH_0_TO_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO },
2246 { 0, 0 }
2247 }, c2_lbr[] = {
2248 { MSR_IA32_LASTINTFROMIP, 1 },
2249 { MSR_IA32_LASTINTTOIP, 1 },
2250 { MSR_C2_LASTBRANCH_TOS, 1 },
2251 { MSR_C2_LASTBRANCH_0_FROM_IP, NUM_MSR_C2_LASTBRANCH_FROM_TO },
2252 { MSR_C2_LASTBRANCH_0_TO_IP, NUM_MSR_C2_LASTBRANCH_FROM_TO },
2253 { 0, 0 }
2254 #ifdef __i386__
2255 }, pm_lbr[] = {
2256 { MSR_IA32_LASTINTFROMIP, 1 },
2257 { MSR_IA32_LASTINTTOIP, 1 },
2258 { MSR_PM_LASTBRANCH_TOS, 1 },
2259 { MSR_PM_LASTBRANCH_0, NUM_MSR_PM_LASTBRANCH },
2260 { 0, 0 }
2261 #endif
2262 };
2264 static const struct lbr_info *last_branch_msr_get(void)
2266 switch ( boot_cpu_data.x86 )
2268 case 6:
2269 switch ( boot_cpu_data.x86_model )
2271 #ifdef __i386__
2272 /* PentiumM */
2273 case 9: case 13:
2274 /* Core Solo/Duo */
2275 case 14:
2276 return pm_lbr;
2277 break;
2278 #endif
2279 /* Core2 Duo */
2280 case 15:
2281 return c2_lbr;
2282 break;
2284 break;
2286 case 15:
2287 switch ( boot_cpu_data.x86_model )
2289 /* Pentium4/Xeon with em64t */
2290 case 3: case 4: case 6:
2291 return p4_lbr;
2292 break;
2294 break;
2297 return NULL;
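/* Returns 1 if @ecx addresses one of this CPU's LBR MSRs, 0 otherwise. */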
2300 static int is_last_branch_msr(u32 ecx)
2302 const struct lbr_info *lbr = last_branch_msr_get();
2304 if ( lbr == NULL )
2305 return 0;
2307 for ( ; lbr->count; lbr++ )
2308 if ( (ecx >= lbr->base) && (ecx < (lbr->base + lbr->count)) )
2309 return 1;
2311 return 0;
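/*
 * RDMSR intercept.  MSRs the hypervisor virtualises (TSC, SYSENTER,
 * APIC base, PAT, MTRRs, DEBUGCTL) are synthesised here; anything else
 * falls through to the long-mode handler, the per-vcpu guest MSR area,
 * Xen's hypervisor leaves and finally a real rdmsr_safe().  Unhandled
 * MSRs raise #GP in the guest.
 */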
2314 static int vmx_do_msr_read(struct cpu_user_regs *regs)
2316 u64 msr_content = 0;
2317 u32 ecx = regs->ecx, eax, edx;
2318 struct vcpu *v = current;
2319 int index;
2320 u64 *var_range_base = (u64*)v->arch.hvm_vcpu.mtrr.var_ranges;
2321 u64 *fixed_range_base = (u64*)v->arch.hvm_vcpu.mtrr.fixed_ranges;
2323 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x", ecx);
2325 switch ( ecx )
2327 case MSR_IA32_TSC:
2328 msr_content = hvm_get_guest_time(v);
2329 break;
2330 case MSR_IA32_SYSENTER_CS:
2331 msr_content = (u32)__vmread(GUEST_SYSENTER_CS);
2332 break;
2333 case MSR_IA32_SYSENTER_ESP:
2334 msr_content = __vmread(GUEST_SYSENTER_ESP);
2335 break;
2336 case MSR_IA32_SYSENTER_EIP:
2337 msr_content = __vmread(GUEST_SYSENTER_EIP);
2338 break;
2339 case MSR_IA32_APICBASE:
2340 msr_content = vcpu_vlapic(v)->hw.apic_base_msr;
2341 break;
2342 case MSR_IA32_CR_PAT:
2343 msr_content = v->arch.hvm_vcpu.pat_cr;
2344 break;
2345 case MSR_MTRRcap:
2346 msr_content = v->arch.hvm_vcpu.mtrr.mtrr_cap;
2347 break;
2348 case MSR_MTRRdefType:
2349 msr_content = v->arch.hvm_vcpu.mtrr.def_type
2350 | (v->arch.hvm_vcpu.mtrr.enabled << 10);
2351 break;
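/*
 * fixed_range_base[] holds the 11 fixed-range MTRR MSRs in architectural
 * order: one 64K register, two 16K registers and eight 4K registers --
 * hence the +1 and +3 index offsets in the cases below.
 */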
2352 case MSR_MTRRfix64K_00000:
2353 msr_content = fixed_range_base[0];
2354 break;
2355 case MSR_MTRRfix16K_80000:
2356 case MSR_MTRRfix16K_A0000:
2357 index = regs->ecx - MSR_MTRRfix16K_80000;
2358 msr_content = fixed_range_base[index + 1];
2359 break;
2360 case MSR_MTRRfix4K_C0000...MSR_MTRRfix4K_F8000:
2361 index = regs->ecx - MSR_MTRRfix4K_C0000;
2362 msr_content = fixed_range_base[index + 3];
2363 break;
2364 case MSR_IA32_MTRR_PHYSBASE0...MSR_IA32_MTRR_PHYSMASK7:
2365 index = regs->ecx - MSR_IA32_MTRR_PHYSBASE0;
2366 msr_content = var_range_base[index];
2367 break;
2368 case MSR_IA32_DEBUGCTLMSR:
2369 if ( vmx_read_guest_msr(v, ecx, &msr_content) != 0 )
2370 msr_content = 0;
2371 break;
2372 case MSR_IA32_VMX_BASIC...MSR_IA32_VMX_PROCBASED_CTLS2:
2373 goto gp_fault;
2374 case MSR_IA32_MCG_CAP:
2375 case MSR_IA32_MCG_STATUS:
2376 case MSR_IA32_MC0_STATUS:
2377 case MSR_IA32_MC1_STATUS:
2378 case MSR_IA32_MC2_STATUS:
2379 case MSR_IA32_MC3_STATUS:
2380 case MSR_IA32_MC4_STATUS:
2381 case MSR_IA32_MC5_STATUS:
2382 /* No point in letting the guest see real MCEs */
2383 msr_content = 0;
2384 break;
2385 default:
2386 switch ( long_mode_do_msr_read(regs) )
2388 case HNDL_unhandled:
2389 break;
2390 case HNDL_exception_raised:
2391 return 0;
2392 case HNDL_done:
2393 goto done;
2396 if ( vmx_read_guest_msr(v, ecx, &msr_content) == 0 )
2397 break;
2399 if ( is_last_branch_msr(ecx) )
2401 msr_content = 0;
2402 break;
2405 if ( rdmsr_hypervisor_regs(ecx, &eax, &edx) ||
2406 rdmsr_safe(ecx, eax, edx) == 0 )
2408 regs->eax = eax;
2409 regs->edx = edx;
2410 goto done;
2413 goto gp_fault;
2416 regs->eax = msr_content & 0xFFFFFFFF;
2417 regs->edx = msr_content >> 32;
2419 done:
2420 hvmtrace_msr_read(v, ecx, msr_content);
2421 HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx",
2422 ecx, (unsigned long)regs->eax,
2423 (unsigned long)regs->edx);
2424 return 1;
2426 gp_fault:
2427 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2428 return 0;
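/*
 * If the CPU supports APIC-access virtualisation, allocate a Xen heap
 * page, insert it into the guest physmap at the default APIC base and
 * record its MFN so that guest APIC accesses can be trapped and emulated.
 */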
2431 static int vmx_alloc_vlapic_mapping(struct domain *d)
2433 void *apic_va;
2435 if ( !cpu_has_vmx_virtualize_apic_accesses )
2436 return 0;
2438 apic_va = alloc_xenheap_page();
2439 if ( apic_va == NULL )
2440 return -ENOMEM;
2441 share_xen_page_with_guest(virt_to_page(apic_va), d, XENSHARE_writable);
2442 guest_physmap_add_page(
2443 d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), virt_to_mfn(apic_va));
2444 d->arch.hvm_domain.vmx_apic_access_mfn = virt_to_mfn(apic_va);
2446 return 0;
2449 static void vmx_free_vlapic_mapping(struct domain *d)
2451 unsigned long mfn = d->arch.hvm_domain.vmx_apic_access_mfn;
2452 if ( mfn != 0 )
2453 free_xenheap_page(mfn_to_virt(mfn));
2456 static void vmx_install_vlapic_mapping(struct vcpu *v)
2458 unsigned long virt_page_ma, apic_page_ma;
2460 if ( !cpu_has_vmx_virtualize_apic_accesses )
2461 return;
2463 virt_page_ma = page_to_maddr(vcpu_vlapic(v)->regs_page);
2464 apic_page_ma = v->domain->arch.hvm_domain.vmx_apic_access_mfn;
2465 apic_page_ma <<= PAGE_SHIFT;
2467 vmx_vmcs_enter(v);
2468 __vmwrite(VIRTUAL_APIC_PAGE_ADDR, virt_page_ma);
2469 __vmwrite(APIC_ACCESS_ADDR, apic_page_ma);
2470 vmx_vmcs_exit(v);
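/*
 * Re-evaluate the "virtualize APIC accesses" secondary exec control
 * whenever the guest's APIC base MSR changes: it is enabled only while
 * the vlapic is hardware-enabled and located at the default base address.
 */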
2473 void vmx_vlapic_msr_changed(struct vcpu *v)
2475 struct vlapic *vlapic = vcpu_vlapic(v);
2476 uint32_t ctl;
2478 if ( !cpu_has_vmx_virtualize_apic_accesses )
2479 return;
2481 vmx_vmcs_enter(v);
2482 ctl = __vmread(SECONDARY_VM_EXEC_CONTROL);
2483 ctl &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2484 if ( !vlapic_hw_disabled(vlapic) &&
2485 (vlapic_base_address(vlapic) == APIC_DEFAULT_PHYS_BASE) )
2486 ctl |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2487 __vmwrite(SECONDARY_VM_EXEC_CONTROL, ctl);
2488 vmx_vmcs_exit(v);
2491 extern bool_t mtrr_var_range_msr_set(struct mtrr_state *v,
2492 u32 msr, u64 msr_content);
2493 extern bool_t mtrr_fix_range_msr_set(struct mtrr_state *v,
2494 int row, u64 msr_content);
2495 extern bool_t mtrr_def_type_msr_set(struct mtrr_state *v, u64 msr_content);
2496 extern bool_t pat_msr_set(u64 *pat, u64 msr);
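/*
 * WRMSR intercept.  Writes to virtualised MSRs (TSC, SYSENTER, APIC base,
 * PAT, MTRRs, DEBUGCTL) update the per-vcpu state; a DEBUGCTL write that
 * enables LBR also loads this CPU's LBR MSR list into the guest MSR area
 * and disables interception for it.  Everything else is offered to the
 * long-mode handler, the guest MSR area or Xen's hypervisor leaves.
 */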
2498 static int vmx_do_msr_write(struct cpu_user_regs *regs)
2500 u32 ecx = regs->ecx;
2501 u64 msr_content;
2502 struct vcpu *v = current;
2503 int index;
2505 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x, eax=%x, edx=%x",
2506 ecx, (u32)regs->eax, (u32)regs->edx);
2508 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
2510 hvmtrace_msr_write(v, ecx, msr_content);
2512 switch ( ecx )
2514 case MSR_IA32_TSC:
2515 hvm_set_guest_time(v, msr_content);
2516 pt_reset(v);
2517 break;
2518 case MSR_IA32_SYSENTER_CS:
2519 __vmwrite(GUEST_SYSENTER_CS, msr_content);
2520 break;
2521 case MSR_IA32_SYSENTER_ESP:
2522 __vmwrite(GUEST_SYSENTER_ESP, msr_content);
2523 break;
2524 case MSR_IA32_SYSENTER_EIP:
2525 __vmwrite(GUEST_SYSENTER_EIP, msr_content);
2526 break;
2527 case MSR_IA32_APICBASE:
2528 vlapic_msr_set(vcpu_vlapic(v), msr_content);
2529 break;
2530 case MSR_IA32_CR_PAT:
2531 if ( !pat_msr_set(&v->arch.hvm_vcpu.pat_cr, msr_content) )
2532 goto gp_fault;
2533 break;
2534 case MSR_MTRRdefType:
2535 if ( !mtrr_def_type_msr_set(&v->arch.hvm_vcpu.mtrr, msr_content) )
2536 goto gp_fault;
2537 break;
2538 case MSR_MTRRfix64K_00000:
2539 if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr, 0, msr_content) )
2540 goto gp_fault;
2541 break;
2542 case MSR_MTRRfix16K_80000:
2543 case MSR_MTRRfix16K_A0000:
2544 index = regs->ecx - MSR_MTRRfix16K_80000 + 1;
2545 if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr,
2546 index, msr_content) )
2547 goto gp_fault;
2548 break;
2549 case MSR_MTRRfix4K_C0000...MSR_MTRRfix4K_F8000:
2550 index = regs->ecx - MSR_MTRRfix4K_C0000 + 3;
2551 if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr,
2552 index, msr_content) )
2553 goto gp_fault;
2554 break;
2555 case MSR_IA32_MTRR_PHYSBASE0...MSR_IA32_MTRR_PHYSMASK7:
2556 if ( !mtrr_var_range_msr_set(&v->arch.hvm_vcpu.mtrr,
2557 regs->ecx, msr_content) )
2558 goto gp_fault;
2559 break;
2560 case MSR_MTRRcap:
2561 goto gp_fault;
2562 case MSR_IA32_DEBUGCTLMSR: {
2563 int i, rc = 0;
2565 if ( !msr_content || (msr_content & ~3) )
2566 break;
2568 if ( msr_content & 1 )
2570 const struct lbr_info *lbr = last_branch_msr_get();
2571 if ( lbr == NULL )
2572 break;
2574 for ( ; (rc == 0) && lbr->count; lbr++ )
2575 for ( i = 0; (rc == 0) && (i < lbr->count); i++ )
2576 if ( (rc = vmx_add_guest_msr(v, lbr->base + i)) == 0 )
2577 vmx_disable_intercept_for_msr(v, lbr->base + i);
2580 if ( (rc < 0) ||
2581 (vmx_add_guest_msr(v, ecx) < 0) ||
2582 (vmx_add_host_load_msr(v, ecx) < 0) )
2583 vmx_inject_hw_exception(v, TRAP_machine_check, 0);
2584 else
2585 vmx_write_guest_msr(v, ecx, msr_content);
2587 break;
2589 case MSR_IA32_VMX_BASIC...MSR_IA32_VMX_PROCBASED_CTLS2:
2590 goto gp_fault;
2591 default:
2592 switch ( long_mode_do_msr_write(regs) )
2594 case HNDL_unhandled:
2595 if ( (vmx_write_guest_msr(v, ecx, msr_content) != 0) &&
2596 !is_last_branch_msr(ecx) )
2597 wrmsr_hypervisor_regs(ecx, regs->eax, regs->edx);
2598 break;
2599 case HNDL_exception_raised:
2600 return 0;
2601 case HNDL_done:
2602 break;
2604 break;
2607 return 1;
2609 gp_fault:
2610 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2611 return 0;
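/*
 * HLT intercept: if an event is already queued for injection, return
 * without halting so it can be delivered; otherwise pass the guest's
 * EFLAGS to hvm_hlt() to block the vcpu.
 */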
2614 static void vmx_do_hlt(struct cpu_user_regs *regs)
2616 unsigned long intr_info = __vmread(VM_ENTRY_INTR_INFO);
2617 struct vcpu *curr = current;
2619 /* Do not halt if an event is pending injection into the guest. */
2620 if ( intr_info & INTR_INFO_VALID_MASK )
2622 HVMTRACE_1D(HLT, curr, /*int pending=*/ 1);
2623 return;
2626 HVMTRACE_1D(HLT, curr, /*int pending=*/ 0);
2627 hvm_hlt(regs->eflags);
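/*
 * A host (external) interrupt arrived while the guest was running.
 * Decode the vector from the exit information and dispatch it to the
 * corresponding Xen handler, as native interrupt delivery would.
 */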
2630 static void vmx_do_extint(struct cpu_user_regs *regs)
2632 unsigned int vector;
2634 asmlinkage void do_IRQ(struct cpu_user_regs *);
2635 fastcall void smp_apic_timer_interrupt(struct cpu_user_regs *);
2636 fastcall void smp_event_check_interrupt(void);
2637 fastcall void smp_invalidate_interrupt(void);
2638 fastcall void smp_call_function_interrupt(void);
2639 fastcall void smp_spurious_interrupt(struct cpu_user_regs *regs);
2640 fastcall void smp_error_interrupt(struct cpu_user_regs *regs);
2641 #ifdef CONFIG_X86_MCE_P4THERMAL
2642 fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs);
2643 #endif
2645 vector = __vmread(VM_EXIT_INTR_INFO);
2646 BUG_ON(!(vector & INTR_INFO_VALID_MASK));
2648 vector &= INTR_INFO_VECTOR_MASK;
2649 HVMTRACE_1D(INTR, current, vector);
2651 switch ( vector )
2653 case LOCAL_TIMER_VECTOR:
2654 smp_apic_timer_interrupt(regs);
2655 break;
2656 case EVENT_CHECK_VECTOR:
2657 smp_event_check_interrupt();
2658 break;
2659 case INVALIDATE_TLB_VECTOR:
2660 smp_invalidate_interrupt();
2661 break;
2662 case CALL_FUNCTION_VECTOR:
2663 smp_call_function_interrupt();
2664 break;
2665 case SPURIOUS_APIC_VECTOR:
2666 smp_spurious_interrupt(regs);
2667 break;
2668 case ERROR_APIC_VECTOR:
2669 smp_error_interrupt(regs);
2670 break;
2671 #ifdef CONFIG_X86_MCE_P4THERMAL
2672 case THERMAL_APIC_VECTOR:
2673 smp_thermal_interrupt(regs);
2674 break;
2675 #endif
2676 default:
2677 regs->entry_vector = vector;
2678 do_IRQ(regs);
2679 break;
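/*
 * WBINVD/INVD intercept: cache flushes only matter if the domain has
 * passthrough (IOMMU) devices.  With WBINVD-exiting available the flush
 * is broadcast to all CPUs by IPI; otherwise only the local CPU's caches
 * are written back.
 */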
2683 static void wbinvd_ipi(void *info)
2685 wbinvd();
2688 void vmx_wbinvd_intercept(void)
2690 if ( list_empty(&(domain_hvm_iommu(current->domain)->pdev_list)) )
2691 return;
2693 if ( cpu_has_wbinvd_exiting )
2694 on_each_cpu(wbinvd_ipi, NULL, 1, 1);
2695 else
2696 wbinvd();
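/*
 * A VM entry failed.  Print the failure reason and exit qualification,
 * dump the VMCS for post-mortem analysis, and crash the domain.
 */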
2699 static void vmx_failed_vmentry(unsigned int exit_reason,
2700 struct cpu_user_regs *regs)
2702 unsigned int failed_vmentry_reason = (uint16_t)exit_reason;
2703 unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
2704 struct vcpu *curr = current;
2706 printk("Failed vm entry (exit reason 0x%x) ", exit_reason);
2707 switch ( failed_vmentry_reason )
2709 case EXIT_REASON_INVALID_GUEST_STATE:
2710 printk("caused by invalid guest state (%ld).\n", exit_qualification);
2711 break;
2712 case EXIT_REASON_MSR_LOADING:
2713 printk("caused by MSR entry %ld loading.\n", exit_qualification);
2714 break;
2715 case EXIT_REASON_MACHINE_CHECK:
2716 printk("caused by machine check.\n");
2717 HVMTRACE_0D(MCE, curr);
2718 do_machine_check(regs);
2719 break;
2720 default:
2721 printk("unknown failure reason.\n");
2722 break;
2725 printk("************* VMCS Area **************\n");
2726 vmcs_dump_vcpu(curr);
2727 printk("**************************************\n");
2729 domain_crash(curr->domain);
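/*
 * Main VM-exit dispatcher, entered from the assembly stubs with guest
 * state saved in @regs.  Any event whose delivery caused the exit is
 * re-queued for injection, then the exit reason is dispatched below.
 */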
2732 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
2734 unsigned int exit_reason, idtv_info;
2735 unsigned long exit_qualification, inst_len = 0;
2736 struct vcpu *v = current;
2738 exit_reason = __vmread(VM_EXIT_REASON);
2740 hvmtrace_vmexit(v, regs->eip, exit_reason);
2742 perfc_incra(vmexits, exit_reason);
2744 if ( exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT )
2745 local_irq_enable();
2747 if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
2748 return vmx_failed_vmentry(exit_reason, regs);
2750 /* Event delivery caused this intercept? Queue for redelivery. */
2751 idtv_info = __vmread(IDT_VECTORING_INFO);
2752 if ( unlikely(idtv_info & INTR_INFO_VALID_MASK) &&
2753 (exit_reason != EXIT_REASON_TASK_SWITCH) )
2755 if ( hvm_event_needs_reinjection((idtv_info>>8)&7, idtv_info&0xff) )
2757 /* See SDM 3B 25.7.1.1 and .2 for masking of reserved bits. */
2758 __vmwrite(VM_ENTRY_INTR_INFO,
2759 idtv_info & ~INTR_INFO_RESVD_BITS_MASK);
2760 if ( idtv_info & INTR_INFO_DELIVER_CODE_MASK )
2761 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE,
2762 __vmread(IDT_VECTORING_ERROR_CODE));
2765 /*
2766 * Clear NMI-blocking interruptibility info if an NMI delivery faulted.
2767 * Re-delivery will re-set it (see SDM 3B 25.7.1.2).
2768 */
2769 if ( (idtv_info & INTR_INFO_INTR_TYPE_MASK) == (X86_EVENTTYPE_NMI<<8) )
2770 __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
2771 __vmread(GUEST_INTERRUPTIBILITY_INFO) &
2772 ~VMX_INTR_SHADOW_NMI);
2775 switch ( exit_reason )
2777 case EXIT_REASON_EXCEPTION_NMI:
2779 /*
2780 * We do not enable software-interrupt (INT n) exiting, so this
2781 * exit is caused by either (1) an exception (e.g. #PF) raised in
2782 * the guest, or (2) an NMI.
2783 */
2784 unsigned int intr_info, vector;
2786 intr_info = __vmread(VM_EXIT_INTR_INFO);
2787 BUG_ON(!(intr_info & INTR_INFO_VALID_MASK));
2789 vector = intr_info & INTR_INFO_VECTOR_MASK;
2791 /*
2792 * Re-set the NMI shadow if vmexit caused by a guest IRET fault (see 3B
2793 * 25.7.1.2, "Resuming Guest Software after Handling an Exception").
2794 * (NB. If we emulate this IRET for any reason, we should re-clear!)
2795 */
2796 if ( unlikely(intr_info & INTR_INFO_NMI_UNBLOCKED_BY_IRET) &&
2797 !(__vmread(IDT_VECTORING_INFO) & INTR_INFO_VALID_MASK) &&
2798 (vector != TRAP_double_fault) )
2799 __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
2800 __vmread(GUEST_INTERRUPTIBILITY_INFO)|VMX_INTR_SHADOW_NMI);
2802 perfc_incra(cause_vector, vector);
2804 switch ( vector )
2806 case TRAP_debug:
2807 case TRAP_int3:
2808 if ( !v->domain->debugger_attached )
2809 goto exit_and_crash;
2810 domain_pause_for_debugger();
2811 break;
2812 case TRAP_no_device:
2813 vmx_do_no_device_fault();
2814 break;
2815 case TRAP_page_fault:
2816 exit_qualification = __vmread(EXIT_QUALIFICATION);
2817 regs->error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
2819 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2820 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
2821 (unsigned long)regs->eax, (unsigned long)regs->ebx,
2822 (unsigned long)regs->ecx, (unsigned long)regs->edx,
2823 (unsigned long)regs->esi, (unsigned long)regs->edi);
2825 if ( paging_fault(exit_qualification, regs) )
2827 hvmtrace_pf_xen(v, exit_qualification, regs->error_code);
2828 break;
2831 v->arch.hvm_vcpu.guest_cr[2] = exit_qualification;
2832 vmx_inject_hw_exception(v, TRAP_page_fault, regs->error_code);
2833 break;
2834 case TRAP_nmi:
2835 if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) !=
2836 (X86_EVENTTYPE_NMI << 8) )
2837 goto exit_and_crash;
2838 HVMTRACE_0D(NMI, v);
2839 do_nmi(regs); /* Real NMI, vector 2: normal processing. */
2840 break;
2841 case TRAP_machine_check:
2842 HVMTRACE_0D(MCE, v);
2843 do_machine_check(regs);
2844 break;
2845 default:
2846 goto exit_and_crash;
2848 break;
2850 case EXIT_REASON_EXTERNAL_INTERRUPT:
2851 vmx_do_extint(regs);
2852 break;
2853 case EXIT_REASON_TRIPLE_FAULT:
2854 hvm_triple_fault();
2855 break;
2856 case EXIT_REASON_PENDING_VIRT_INTR:
2857 /* Disable the interrupt window. */
2858 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2859 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2860 v->arch.hvm_vmx.exec_control);
2861 break;
2862 case EXIT_REASON_PENDING_VIRT_NMI:
2863 /* Disable the NMI window. */
2864 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
2865 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2866 v->arch.hvm_vmx.exec_control);
2867 break;
2868 case EXIT_REASON_TASK_SWITCH: {
2869 const enum hvm_task_switch_reason reasons[] = {
2870 TSW_call_or_int, TSW_iret, TSW_jmp, TSW_call_or_int };
2871 int32_t errcode = -1;
2872 exit_qualification = __vmread(EXIT_QUALIFICATION);
2873 if ( (idtv_info & INTR_INFO_VALID_MASK) &&
2874 (idtv_info & INTR_INFO_DELIVER_CODE_MASK) )
2875 errcode = __vmread(IDT_VECTORING_ERROR_CODE);
2876 hvm_task_switch((uint16_t)exit_qualification,
2877 reasons[(exit_qualification >> 30) & 3],
2878 errcode);
2879 break;
2881 case EXIT_REASON_CPUID:
2882 inst_len = __get_instruction_length(); /* Safe: CPUID */
2883 __update_guest_eip(inst_len);
2884 vmx_do_cpuid(regs);
2885 break;
2886 case EXIT_REASON_HLT:
2887 inst_len = __get_instruction_length(); /* Safe: HLT */
2888 __update_guest_eip(inst_len);
2889 vmx_do_hlt(regs);
2890 break;
2891 case EXIT_REASON_INVLPG:
2893 inst_len = __get_instruction_length(); /* Safe: INVLPG */
2894 __update_guest_eip(inst_len);
2895 exit_qualification = __vmread(EXIT_QUALIFICATION);
2896 vmx_do_invlpg(exit_qualification);
2897 break;
2899 case EXIT_REASON_VMCALL:
2901 int rc;
2902 HVMTRACE_1D(VMMCALL, v, regs->eax);
2903 inst_len = __get_instruction_length(); /* Safe: VMCALL */
2904 rc = hvm_do_hypercall(regs);
2905 if ( rc != HVM_HCALL_preempted )
2907 __update_guest_eip(inst_len);
2908 if ( rc == HVM_HCALL_invalidate )
2909 send_invalidate_req();
2911 break;
2913 case EXIT_REASON_CR_ACCESS:
2915 exit_qualification = __vmread(EXIT_QUALIFICATION);
2916 inst_len = __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
2917 if ( vmx_cr_access(exit_qualification, regs) )
2918 __update_guest_eip(inst_len);
2919 break;
2921 case EXIT_REASON_DR_ACCESS:
2922 exit_qualification = __vmread(EXIT_QUALIFICATION);
2923 vmx_dr_access(exit_qualification, regs);
2924 break;
2925 case EXIT_REASON_IO_INSTRUCTION:
2926 exit_qualification = __vmread(EXIT_QUALIFICATION);
2927 inst_len = __get_instruction_length(); /* Safe: IN, INS, OUT, OUTS */
2928 vmx_io_instruction(exit_qualification, inst_len);
2929 break;
2930 case EXIT_REASON_MSR_READ:
2931 inst_len = __get_instruction_length(); /* Safe: RDMSR */
2932 if ( vmx_do_msr_read(regs) )
2933 __update_guest_eip(inst_len);
2934 break;
2935 case EXIT_REASON_MSR_WRITE:
2936 inst_len = __get_instruction_length(); /* Safe: WRMSR */
2937 if ( vmx_do_msr_write(regs) )
2938 __update_guest_eip(inst_len);
2939 break;
2941 case EXIT_REASON_MWAIT_INSTRUCTION:
2942 case EXIT_REASON_MONITOR_INSTRUCTION:
2943 case EXIT_REASON_VMCLEAR:
2944 case EXIT_REASON_VMLAUNCH:
2945 case EXIT_REASON_VMPTRLD:
2946 case EXIT_REASON_VMPTRST:
2947 case EXIT_REASON_VMREAD:
2948 case EXIT_REASON_VMRESUME:
2949 case EXIT_REASON_VMWRITE:
2950 case EXIT_REASON_VMXOFF:
2951 case EXIT_REASON_VMXON:
2952 vmx_inject_hw_exception(v, TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
2953 break;
2955 case EXIT_REASON_TPR_BELOW_THRESHOLD:
2956 break;
2958 case EXIT_REASON_APIC_ACCESS:
2960 unsigned long offset;
2961 exit_qualification = __vmread(EXIT_QUALIFICATION);
2962 offset = exit_qualification & 0x0fffUL;
2963 handle_mmio(APIC_DEFAULT_PHYS_BASE | offset);
2964 break;
2967 case EXIT_REASON_INVD:
2968 case EXIT_REASON_WBINVD:
2970 inst_len = __get_instruction_length(); /* Safe: INVD, WBINVD */
2971 __update_guest_eip(inst_len);
2972 vmx_wbinvd_intercept();
2973 break;
2976 default:
2977 exit_and_crash:
2978 gdprintk(XENLOG_ERR, "Bad vmexit (reason %x)\n", exit_reason);
2979 domain_crash(v->domain);
2980 break;
2984 asmlinkage void vmx_trace_vmentry(void)
2986 struct vcpu *v = current;
2988 hvmtrace_vmentry(v);
2991 /*
2992 * Local variables:
2993 * mode: C
2994 * c-set-style: "BSD"
2995 * c-basic-offset: 4
2996 * tab-width: 4
2997 * indent-tabs-mode: nil
2998 * End:
2999 */