ia64/xen-unstable

view xen/arch/x86/hvm/vmx/vmx.c @ 15269:5710c94e6539

Fix boot loader hangs with syslinux's 32-bit vesamenu module.

Syslinux can load 32-bit UI code for menu handling. But the core of
syslinux is still 16-bit. When it jumps to this 32-bit code, it
installs a set of 32-bit interrupt trap handlers which just bounce the
interrupts back to 16-bit mode.

But this plays badly with vmxassist. When running 16-bit boot loader
code, vmxassist installs its own trap handlers which bounce vPIC
interrupts back down to 16-bit mode. The trap handlers live at
int 0x20 to 0x2f, so when the 16-bit code tries to reprogram the vPIC,
vm86 rewrites the outb()s on the fly to set the irq_base vectors
accordingly.

When syslinux then enters 32-bit mode, the vPIC is still programmed to
point at vmxassist's bounce traps, even though vmxassist is no longer
active once the guest is running 32-bit code, so the wrong interrupt
vectors get delivered to the guest.

Fix this by restoring the rombios vPIC irq_base vectors when we leave
vmxassist mode, and restoring the vmxassist bounce-trap vectors when we
re-enter it. These irq_base values are hard-coded in this patch, but
they are already hard-coded in vmxassist, so any boot code that relies
on changing them would already fail.

Signed-off-by: Stephen Tweedie <sct@redhat.com>
author kfraser@localhost.localdomain
date Mon Jun 04 15:32:11 2007 +0100 (2007-06-04)
parents b182bd560e47
children 4f05a587cb6b
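
For orientation before the listing: the core of this change is a pair of vPIC
irq_base updates in vmx_assist() below. A condensed sketch of the idea follows
(illustrative only, not the verbatim patch; the helper name and the
entering_vmxassist flag are invented for the example, while the vpic[].irq_base
fields and the 0x20/0x28 and 0x08/0x70 vectors are taken from the vmx_assist()
code in this file):

/* Sketch: keep the vPIC's programmed vector base consistent with whether
 * vmxassist's 16-bit bounce traps are active or the guest is running
 * native 32-bit code. */
static void vmxassist_fixup_vpic_bases(struct vcpu *v, int entering_vmxassist)
{
    if ( entering_vmxassist )
    {
        /* 16-bit code under vmxassist: deliver vPIC interrupts via the
         * 32-bit bounce traps at 0x20-0x2f, which reflect them back to
         * the real-mode handlers. */
        v->domain->arch.hvm_domain.vpic[0].irq_base = 0x20;
        v->domain->arch.hvm_domain.vpic[1].irq_base = 0x28;
    }
    else
    {
        /* Native 32-bit guest code with vmxassist out of the way:
         * deliver directly on the rombios vectors (0x08-0x0f, 0x70-0x77). */
        v->domain->arch.hvm_domain.vpic[0].irq_base = 0x08;
        v->domain->arch.hvm_domain.vpic[1].irq_base = 0x70;
    }
}

In the actual patch these assignments sit inline in the VMX_ASSIST_INVOKE and
VMX_ASSIST_RESTORE cases of vmx_assist() rather than in a separate helper.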
line source
1 /*
2 * vmx.c: handling VMX architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 *
18 */
20 #include <xen/config.h>
21 #include <xen/init.h>
22 #include <xen/lib.h>
23 #include <xen/trace.h>
24 #include <xen/sched.h>
25 #include <xen/irq.h>
26 #include <xen/softirq.h>
27 #include <xen/domain_page.h>
28 #include <xen/hypercall.h>
29 #include <xen/perfc.h>
30 #include <asm/current.h>
31 #include <asm/io.h>
32 #include <asm/regs.h>
33 #include <asm/cpufeature.h>
34 #include <asm/processor.h>
35 #include <asm/types.h>
36 #include <asm/msr.h>
37 #include <asm/spinlock.h>
38 #include <asm/paging.h>
39 #include <asm/p2m.h>
40 #include <asm/hvm/hvm.h>
41 #include <asm/hvm/support.h>
42 #include <asm/hvm/vmx/vmx.h>
43 #include <asm/hvm/vmx/vmcs.h>
44 #include <asm/hvm/vmx/cpu.h>
45 #include <public/sched.h>
46 #include <public/hvm/ioreq.h>
47 #include <asm/hvm/vpic.h>
48 #include <asm/hvm/vlapic.h>
49 #include <asm/x86_emulate.h>
50 #include <asm/hvm/vpt.h>
51 #include <public/hvm/save.h>
52 #include <asm/hvm/trace.h>
54 char *vmx_msr_bitmap;
56 static void vmx_ctxt_switch_from(struct vcpu *v);
57 static void vmx_ctxt_switch_to(struct vcpu *v);
59 static int vmx_alloc_vlapic_mapping(struct domain *d);
60 static void vmx_free_vlapic_mapping(struct domain *d);
61 static void vmx_install_vlapic_mapping(struct vcpu *v);
63 static int vmx_domain_initialise(struct domain *d)
64 {
65 return vmx_alloc_vlapic_mapping(d);
66 }
68 static void vmx_domain_destroy(struct domain *d)
69 {
70 vmx_free_vlapic_mapping(d);
71 }
73 static int vmx_vcpu_initialise(struct vcpu *v)
74 {
75 int rc;
77 spin_lock_init(&v->arch.hvm_vmx.vmcs_lock);
79 v->arch.schedule_tail = vmx_do_resume;
80 v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
81 v->arch.ctxt_switch_to = vmx_ctxt_switch_to;
83 if ( (rc = vmx_create_vmcs(v)) != 0 )
84 {
85 dprintk(XENLOG_WARNING,
86 "Failed to create VMCS for vcpu %d: err=%d.\n",
87 v->vcpu_id, rc);
88 return rc;
89 }
91 vmx_install_vlapic_mapping(v);
93 return 0;
94 }
96 static void vmx_vcpu_destroy(struct vcpu *v)
97 {
98 vmx_destroy_vmcs(v);
99 }
101 #ifdef __x86_64__
103 static DEFINE_PER_CPU(struct vmx_msr_state, host_msr_state);
105 static u32 msr_index[VMX_MSR_COUNT] =
106 {
107 MSR_LSTAR, MSR_STAR, MSR_SYSCALL_MASK
108 };
110 static void vmx_save_host_msrs(void)
111 {
112 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
113 int i;
115 for ( i = 0; i < VMX_MSR_COUNT; i++ )
116 rdmsrl(msr_index[i], host_msr_state->msrs[i]);
117 }
119 #define WRITE_MSR(address) \
120 guest_msr_state->msrs[VMX_INDEX_MSR_ ## address] = msr_content; \
121 set_bit(VMX_INDEX_MSR_ ## address, &guest_msr_state->flags); \
122 wrmsrl(MSR_ ## address, msr_content); \
123 set_bit(VMX_INDEX_MSR_ ## address, &host_msr_state->flags); \
124 break
126 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
127 {
128 u64 msr_content = 0;
129 u32 ecx = regs->ecx;
130 struct vcpu *v = current;
131 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
133 switch ( ecx ) {
134 case MSR_EFER:
135 msr_content = v->arch.hvm_vmx.efer;
136 break;
138 case MSR_FS_BASE:
139 msr_content = __vmread(GUEST_FS_BASE);
140 goto check_long_mode;
142 case MSR_GS_BASE:
143 msr_content = __vmread(GUEST_GS_BASE);
144 goto check_long_mode;
146 case MSR_SHADOW_GS_BASE:
147 msr_content = v->arch.hvm_vmx.shadow_gs;
148 check_long_mode:
149 if ( !(vmx_long_mode_enabled(v)) )
150 {
151 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
152 return 0;
153 }
154 break;
156 case MSR_STAR:
157 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_STAR];
158 break;
160 case MSR_LSTAR:
161 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_LSTAR];
162 break;
164 case MSR_CSTAR:
165 msr_content = v->arch.hvm_vmx.cstar;
166 break;
168 case MSR_SYSCALL_MASK:
169 msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
170 break;
172 default:
173 return 0;
174 }
176 HVM_DBG_LOG(DBG_LEVEL_0, "msr 0x%x content 0x%"PRIx64, ecx, msr_content);
178 regs->eax = (u32)(msr_content >> 0);
179 regs->edx = (u32)(msr_content >> 32);
181 return 1;
182 }
184 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
185 {
186 u64 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
187 u32 ecx = regs->ecx;
188 struct vcpu *v = current;
189 struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
190 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
192 HVM_DBG_LOG(DBG_LEVEL_0, "msr 0x%x content 0x%"PRIx64, ecx, msr_content);
194 switch ( ecx )
195 {
196 case MSR_EFER:
197 /* offending reserved bit will cause #GP */
198 if ( (msr_content & ~(EFER_LME | EFER_LMA | EFER_NX | EFER_SCE)) ||
199 (!cpu_has_nx && (msr_content & EFER_NX)) ||
200 (!cpu_has_syscall && (msr_content & EFER_SCE)) )
201 {
202 gdprintk(XENLOG_WARNING, "Trying to set reserved bit in "
203 "EFER: %"PRIx64"\n", msr_content);
204 goto gp_fault;
205 }
207 if ( (msr_content & EFER_LME)
208 && !(v->arch.hvm_vmx.efer & EFER_LME) )
209 {
210 if ( unlikely(vmx_paging_enabled(v)) )
211 {
212 gdprintk(XENLOG_WARNING,
213 "Trying to set EFER.LME with paging enabled\n");
214 goto gp_fault;
215 }
216 }
217 else if ( !(msr_content & EFER_LME)
218 && (v->arch.hvm_vmx.efer & EFER_LME) )
219 {
220 if ( unlikely(vmx_paging_enabled(v)) )
221 {
222 gdprintk(XENLOG_WARNING,
223 "Trying to clear EFER.LME with paging enabled\n");
224 goto gp_fault;
225 }
226 }
228 if ( (msr_content ^ v->arch.hvm_vmx.efer) & (EFER_NX|EFER_SCE) )
229 write_efer((read_efer() & ~(EFER_NX|EFER_SCE)) |
230 (msr_content & (EFER_NX|EFER_SCE)));
232 v->arch.hvm_vmx.efer = msr_content;
233 break;
235 case MSR_FS_BASE:
236 case MSR_GS_BASE:
237 case MSR_SHADOW_GS_BASE:
238 if ( !vmx_long_mode_enabled(v) )
239 goto gp_fault;
241 if ( !is_canonical_address(msr_content) )
242 goto uncanonical_address;
244 if ( ecx == MSR_FS_BASE )
245 __vmwrite(GUEST_FS_BASE, msr_content);
246 else if ( ecx == MSR_GS_BASE )
247 __vmwrite(GUEST_GS_BASE, msr_content);
248 else
249 {
250 v->arch.hvm_vmx.shadow_gs = msr_content;
251 wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
252 }
254 break;
256 case MSR_STAR:
257 WRITE_MSR(STAR);
259 case MSR_LSTAR:
260 if ( !is_canonical_address(msr_content) )
261 goto uncanonical_address;
262 WRITE_MSR(LSTAR);
264 case MSR_CSTAR:
265 if ( !is_canonical_address(msr_content) )
266 goto uncanonical_address;
267 v->arch.hvm_vmx.cstar = msr_content;
268 break;
270 case MSR_SYSCALL_MASK:
271 WRITE_MSR(SYSCALL_MASK);
273 default:
274 return 0;
275 }
277 return 1;
279 uncanonical_address:
280 HVM_DBG_LOG(DBG_LEVEL_0, "Not cano address of msr write %x", ecx);
281 gp_fault:
282 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
283 return 0;
284 }
286 /*
287 * To avoid MSR save/restore at every VM exit/entry time, we restore
288 * the x86_64 specific MSRs at domain switch time. Since these MSRs
289 * are not modified once set for para domains, we don't save them,
290 * but simply reset them to values set in percpu_traps_init().
291 */
292 static void vmx_restore_host_msrs(void)
293 {
294 struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
295 int i;
297 while ( host_msr_state->flags )
298 {
299 i = find_first_set_bit(host_msr_state->flags);
300 wrmsrl(msr_index[i], host_msr_state->msrs[i]);
301 clear_bit(i, &host_msr_state->flags);
302 }
303 if ( cpu_has_nx && !(read_efer() & EFER_NX) )
304 write_efer(read_efer() | EFER_NX);
305 }
307 static void vmx_save_guest_msrs(struct vcpu *v)
308 {
309 /* MSR_SHADOW_GS_BASE may have been changed by swapgs instruction. */
310 rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
311 }
313 static void vmx_restore_guest_msrs(struct vcpu *v)
314 {
315 struct vmx_msr_state *guest_msr_state, *host_msr_state;
316 unsigned long guest_flags;
317 int i;
319 guest_msr_state = &v->arch.hvm_vmx.msr_state;
320 host_msr_state = &this_cpu(host_msr_state);
322 wrmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
324 guest_flags = guest_msr_state->flags;
326 while ( guest_flags ) {
327 i = find_first_set_bit(guest_flags);
329 HVM_DBG_LOG(DBG_LEVEL_2,
330 "restore guest's index %d msr %x with value %lx",
331 i, msr_index[i], guest_msr_state->msrs[i]);
332 set_bit(i, &host_msr_state->flags);
333 wrmsrl(msr_index[i], guest_msr_state->msrs[i]);
334 clear_bit(i, &guest_flags);
335 }
337 if ( (v->arch.hvm_vmx.efer ^ read_efer()) & (EFER_NX|EFER_SCE) )
338 {
339 HVM_DBG_LOG(DBG_LEVEL_2,
340 "restore guest's EFER with value %lx",
341 v->arch.hvm_vmx.efer);
342 write_efer((read_efer() & ~(EFER_NX|EFER_SCE)) |
343 (v->arch.hvm_vmx.efer & (EFER_NX|EFER_SCE)));
344 }
345 }
347 #else /* __i386__ */
349 #define vmx_save_host_msrs() ((void)0)
351 static void vmx_restore_host_msrs(void)
352 {
353 if ( cpu_has_nx && !(read_efer() & EFER_NX) )
354 write_efer(read_efer() | EFER_NX);
355 }
357 #define vmx_save_guest_msrs(v) ((void)0)
359 static void vmx_restore_guest_msrs(struct vcpu *v)
360 {
361 if ( (v->arch.hvm_vmx.efer ^ read_efer()) & EFER_NX )
362 {
363 HVM_DBG_LOG(DBG_LEVEL_2,
364 "restore guest's EFER with value %lx",
365 v->arch.hvm_vmx.efer);
366 write_efer((read_efer() & ~EFER_NX) |
367 (v->arch.hvm_vmx.efer & EFER_NX));
368 }
369 }
371 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
372 {
373 u64 msr_content = 0;
374 struct vcpu *v = current;
376 switch ( regs->ecx ) {
377 case MSR_EFER:
378 msr_content = v->arch.hvm_vmx.efer;
379 break;
381 default:
382 return 0;
383 }
385 regs->eax = msr_content >> 0;
386 regs->edx = msr_content >> 32;
388 return 1;
389 }
391 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
392 {
393 u64 msr_content = regs->eax | ((u64)regs->edx << 32);
394 struct vcpu *v = current;
396 switch ( regs->ecx )
397 {
398 case MSR_EFER:
399 /* offending reserved bit will cause #GP */
400 if ( (msr_content & ~EFER_NX) ||
401 (!cpu_has_nx && (msr_content & EFER_NX)) )
402 {
403 gdprintk(XENLOG_WARNING, "Trying to set reserved bit in "
404 "EFER: %"PRIx64"\n", msr_content);
405 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
406 return 0;
407 }
409 if ( (msr_content ^ v->arch.hvm_vmx.efer) & EFER_NX )
410 write_efer((read_efer() & ~EFER_NX) | (msr_content & EFER_NX));
412 v->arch.hvm_vmx.efer = msr_content;
413 break;
415 default:
416 return 0;
417 }
419 return 1;
420 }
422 #endif /* __i386__ */
424 #define loaddebug(_v,_reg) \
425 __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
426 #define savedebug(_v,_reg) \
427 __asm__ __volatile__ ("mov %%db" #_reg ",%0" : : "r" ((_v)->debugreg[_reg]))
429 static inline void vmx_save_dr(struct vcpu *v)
430 {
431 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
432 return;
434 /* Clear the DR dirty flag and re-enable intercepts for DR accesses. */
435 v->arch.hvm_vcpu.flag_dr_dirty = 0;
436 v->arch.hvm_vcpu.u.vmx.exec_control |= CPU_BASED_MOV_DR_EXITING;
437 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vcpu.u.vmx.exec_control);
439 savedebug(&v->arch.guest_context, 0);
440 savedebug(&v->arch.guest_context, 1);
441 savedebug(&v->arch.guest_context, 2);
442 savedebug(&v->arch.guest_context, 3);
443 savedebug(&v->arch.guest_context, 6);
444 v->arch.guest_context.debugreg[7] = __vmread(GUEST_DR7);
445 }
447 static inline void __restore_debug_registers(struct vcpu *v)
448 {
449 loaddebug(&v->arch.guest_context, 0);
450 loaddebug(&v->arch.guest_context, 1);
451 loaddebug(&v->arch.guest_context, 2);
452 loaddebug(&v->arch.guest_context, 3);
453 /* No 4 and 5 */
454 loaddebug(&v->arch.guest_context, 6);
455 /* DR7 is loaded from the VMCS. */
456 }
458 int vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
459 {
460 uint32_t ev;
462 c->rip = __vmread(GUEST_RIP);
463 c->rsp = __vmread(GUEST_RSP);
464 c->rflags = __vmread(GUEST_RFLAGS);
466 c->cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;
467 c->cr2 = v->arch.hvm_vmx.cpu_cr2;
468 c->cr3 = v->arch.hvm_vmx.cpu_cr3;
469 c->cr4 = v->arch.hvm_vmx.cpu_shadow_cr4;
471 #ifdef HVM_DEBUG_SUSPEND
472 printk("vmx_vmcs_save: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
473 c->cr3,
474 c->cr0,
475 c->cr4);
476 #endif
478 c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
479 c->idtr_base = __vmread(GUEST_IDTR_BASE);
481 c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
482 c->gdtr_base = __vmread(GUEST_GDTR_BASE);
484 c->cs_sel = __vmread(GUEST_CS_SELECTOR);
485 c->cs_limit = __vmread(GUEST_CS_LIMIT);
486 c->cs_base = __vmread(GUEST_CS_BASE);
487 c->cs_arbytes = __vmread(GUEST_CS_AR_BYTES);
489 c->ds_sel = __vmread(GUEST_DS_SELECTOR);
490 c->ds_limit = __vmread(GUEST_DS_LIMIT);
491 c->ds_base = __vmread(GUEST_DS_BASE);
492 c->ds_arbytes = __vmread(GUEST_DS_AR_BYTES);
494 c->es_sel = __vmread(GUEST_ES_SELECTOR);
495 c->es_limit = __vmread(GUEST_ES_LIMIT);
496 c->es_base = __vmread(GUEST_ES_BASE);
497 c->es_arbytes = __vmread(GUEST_ES_AR_BYTES);
499 c->ss_sel = __vmread(GUEST_SS_SELECTOR);
500 c->ss_limit = __vmread(GUEST_SS_LIMIT);
501 c->ss_base = __vmread(GUEST_SS_BASE);
502 c->ss_arbytes = __vmread(GUEST_SS_AR_BYTES);
504 c->fs_sel = __vmread(GUEST_FS_SELECTOR);
505 c->fs_limit = __vmread(GUEST_FS_LIMIT);
506 c->fs_base = __vmread(GUEST_FS_BASE);
507 c->fs_arbytes = __vmread(GUEST_FS_AR_BYTES);
509 c->gs_sel = __vmread(GUEST_GS_SELECTOR);
510 c->gs_limit = __vmread(GUEST_GS_LIMIT);
511 c->gs_base = __vmread(GUEST_GS_BASE);
512 c->gs_arbytes = __vmread(GUEST_GS_AR_BYTES);
514 c->tr_sel = __vmread(GUEST_TR_SELECTOR);
515 c->tr_limit = __vmread(GUEST_TR_LIMIT);
516 c->tr_base = __vmread(GUEST_TR_BASE);
517 c->tr_arbytes = __vmread(GUEST_TR_AR_BYTES);
519 c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR);
520 c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT);
521 c->ldtr_base = __vmread(GUEST_LDTR_BASE);
522 c->ldtr_arbytes = __vmread(GUEST_LDTR_AR_BYTES);
524 c->sysenter_cs = __vmread(GUEST_SYSENTER_CS);
525 c->sysenter_esp = __vmread(GUEST_SYSENTER_ESP);
526 c->sysenter_eip = __vmread(GUEST_SYSENTER_EIP);
528 /* Save any event/interrupt that was being injected when we last
529 * exited. IDT_VECTORING_INFO_FIELD has priority, as anything in
530 * VM_ENTRY_INTR_INFO_FIELD is either a fault caused by the first
531 * event, which will happen the next time, or an interrupt, which we
532 * never inject when IDT_VECTORING_INFO_FIELD is valid.*/
533 if ( (ev = __vmread(IDT_VECTORING_INFO_FIELD)) & INTR_INFO_VALID_MASK )
534 {
535 c->pending_event = ev;
536 c->error_code = __vmread(IDT_VECTORING_ERROR_CODE);
537 }
538 else if ( (ev = __vmread(VM_ENTRY_INTR_INFO_FIELD))
539 & INTR_INFO_VALID_MASK )
540 {
541 c->pending_event = ev;
542 c->error_code = __vmread(VM_ENTRY_EXCEPTION_ERROR_CODE);
543 }
544 else
545 {
546 c->pending_event = 0;
547 c->error_code = 0;
548 }
550 return 1;
551 }
553 int vmx_vmcs_restore(struct vcpu *v, struct hvm_hw_cpu *c)
554 {
555 unsigned long mfn, old_base_mfn;
557 vmx_vmcs_enter(v);
559 __vmwrite(GUEST_RIP, c->rip);
560 __vmwrite(GUEST_RSP, c->rsp);
561 __vmwrite(GUEST_RFLAGS, c->rflags);
563 v->arch.hvm_vmx.cpu_shadow_cr0 = c->cr0;
564 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
566 v->arch.hvm_vmx.cpu_cr2 = c->cr2;
568 #ifdef HVM_DEBUG_SUSPEND
569 printk("vmx_vmcs_restore: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
570 c->cr3,
571 c->cr0,
572 c->cr4);
573 #endif
575 if (!vmx_paging_enabled(v)) {
576 printk("vmx_vmcs_restore: paging not enabled.");
577 goto skip_cr3;
578 }
580 if (c->cr3 == v->arch.hvm_vmx.cpu_cr3) {
581 /*
582 * This is simple TLB flush, implying the guest has
583 * removed some translation or changed page attributes.
584 * We simply invalidate the shadow.
585 */
586 mfn = gmfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT);
587 if (mfn != pagetable_get_pfn(v->arch.guest_table)) {
588 goto bad_cr3;
589 }
590 } else {
591 /*
592 * If different, make a shadow. Check if the PDBR is valid
593 * first.
594 */
595 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %"PRIx64, c->cr3);
596 /* current!=vcpu as not called by arch_vmx_do_launch */
597 mfn = gmfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT);
598 if( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain)) {
599 goto bad_cr3;
600 }
601 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
602 v->arch.guest_table = pagetable_from_pfn(mfn);
603 if (old_base_mfn)
604 put_page(mfn_to_page(old_base_mfn));
605 v->arch.hvm_vmx.cpu_cr3 = c->cr3;
606 }
608 skip_cr3:
609 #if defined(__x86_64__)
610 if (vmx_long_mode_enabled(v)) {
611 unsigned long vm_entry_value;
612 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
613 vm_entry_value |= VM_ENTRY_IA32E_MODE;
614 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
615 }
616 #endif
618 __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
619 v->arch.hvm_vmx.cpu_shadow_cr4 = c->cr4;
620 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);
622 __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
623 __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
625 __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
626 __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
628 __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
629 __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
630 __vmwrite(GUEST_CS_BASE, c->cs_base);
631 __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes);
633 __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
634 __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
635 __vmwrite(GUEST_DS_BASE, c->ds_base);
636 __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes);
638 __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
639 __vmwrite(GUEST_ES_LIMIT, c->es_limit);
640 __vmwrite(GUEST_ES_BASE, c->es_base);
641 __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes);
643 __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
644 __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
645 __vmwrite(GUEST_SS_BASE, c->ss_base);
646 __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes);
648 __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
649 __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
650 __vmwrite(GUEST_FS_BASE, c->fs_base);
651 __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes);
653 __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
654 __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
655 __vmwrite(GUEST_GS_BASE, c->gs_base);
656 __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes);
658 __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
659 __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
660 __vmwrite(GUEST_TR_BASE, c->tr_base);
661 __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes);
663 __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
664 __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
665 __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
666 __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes);
668 __vmwrite(GUEST_SYSENTER_CS, c->sysenter_cs);
669 __vmwrite(GUEST_SYSENTER_ESP, c->sysenter_esp);
670 __vmwrite(GUEST_SYSENTER_EIP, c->sysenter_eip);
672 __vmwrite(GUEST_DR7, c->dr7);
674 vmx_vmcs_exit(v);
676 paging_update_paging_modes(v);
678 if ( c->pending_valid )
679 {
680 vmx_vmcs_enter(v);
681 gdprintk(XENLOG_INFO, "Re-injecting 0x%"PRIx32", 0x%"PRIx32"\n",
682 c->pending_event, c->error_code);
684 /* SVM uses type 3 ("Exception") for #OF and #BP; VMX uses type 6 */
685 if ( c->pending_type == 3
686 && (c->pending_vector == 3 || c->pending_vector == 4) )
687 c->pending_type = 6;
689 /* For software exceptions, we need to tell the hardware the
690 * instruction length as well (hmmm). */
691 if ( c->pending_type > 4 )
692 {
693 int addrbytes, ilen;
694 if ( (c->cs_arbytes & (1u<<13)) && (c->msr_efer & EFER_LMA) )
695 addrbytes = 8;
696 else if ( (c->cs_arbytes & (1u<<14)) )
697 addrbytes = 4;
698 else
699 addrbytes = 2;
700 ilen = hvm_instruction_length(c->rip, addrbytes);
701 __vmwrite(VM_ENTRY_INSTRUCTION_LEN, ilen);
702 }
704 /* Sanity check */
705 if ( c->pending_type == 1 || c->pending_type > 6
706 || c->pending_reserved != 0 )
707 {
708 gdprintk(XENLOG_ERR, "Invalid pending event 0x%"PRIx32"\n",
709 c->pending_event);
710 return -EINVAL;
711 }
712 /* Re-inject the exception */
713 __vmwrite(VM_ENTRY_INTR_INFO_FIELD, c->pending_event);
714 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, c->error_code);
715 v->arch.hvm_vmx.vector_injected = 1;
716 vmx_vmcs_exit(v);
717 }
719 return 0;
721 bad_cr3:
722 gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%"PRIx64"", c->cr3);
723 vmx_vmcs_exit(v);
724 return -EINVAL;
725 }
727 #if defined(__x86_64__) && defined(HVM_DEBUG_SUSPEND)
728 static void dump_msr_state(struct vmx_msr_state *m)
729 {
730 int i = 0;
731 printk("**** msr state ****\n");
732 printk("shadow_gs=0x%lx, flags=0x%lx, msr_items:", m->shadow_gs, m->flags);
733 for (i = 0; i < VMX_MSR_COUNT; i++)
734 printk("0x%lx,", m->msrs[i]);
735 printk("\n");
736 }
737 #else
738 #define dump_msr_state(m) ((void)0)
739 #endif
741 static void vmx_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
742 {
743 #ifdef __x86_64__
744 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
745 unsigned long guest_flags = guest_state->flags;
747 data->shadow_gs = v->arch.hvm_vmx.shadow_gs;
748 data->msr_cstar = v->arch.hvm_vmx.cstar;
750 /* save msrs */
751 data->msr_flags = guest_flags;
752 data->msr_lstar = guest_state->msrs[VMX_INDEX_MSR_LSTAR];
753 data->msr_star = guest_state->msrs[VMX_INDEX_MSR_STAR];
754 data->msr_syscall_mask = guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
755 #endif
757 data->msr_efer = v->arch.hvm_vmx.efer;
759 data->tsc = hvm_get_guest_time(v);
761 dump_msr_state(guest_state);
762 }
764 static void vmx_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
765 {
766 #ifdef __x86_64__
767 struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
769 /* restore msrs */
770 guest_state->flags = data->msr_flags;
771 guest_state->msrs[VMX_INDEX_MSR_LSTAR] = data->msr_lstar;
772 guest_state->msrs[VMX_INDEX_MSR_STAR] = data->msr_star;
773 guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK] = data->msr_syscall_mask;
775 v->arch.hvm_vmx.cstar = data->msr_cstar;
776 v->arch.hvm_vmx.shadow_gs = data->shadow_gs;
777 #endif
779 v->arch.hvm_vmx.efer = data->msr_efer;
781 v->arch.hvm_vmx.vmxassist_enabled = !(data->cr0 & X86_CR0_PE);
783 hvm_set_guest_time(v, data->tsc);
785 dump_msr_state(guest_state);
786 }
789 static void vmx_save_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
790 {
791 vmx_save_cpu_state(v, ctxt);
792 vmx_vmcs_enter(v);
793 vmx_vmcs_save(v, ctxt);
794 vmx_vmcs_exit(v);
795 }
797 static int vmx_load_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
798 {
799 vmx_load_cpu_state(v, ctxt);
800 if (vmx_vmcs_restore(v, ctxt)) {
801 printk("vmx_vmcs restore failed!\n");
802 domain_crash(v->domain);
803 return -EINVAL;
804 }
806 return 0;
807 }
809 /*
810 * DR7 is saved and restored on every vmexit. Other debug registers only
811 * need to be restored if their value is going to affect execution -- i.e.,
812 * if one of the breakpoints is enabled. So mask out all bits that don't
813 * enable some breakpoint functionality.
814 */
815 #define DR7_ACTIVE_MASK 0xff
817 static inline void vmx_restore_dr(struct vcpu *v)
818 {
819 /* NB. __vmread() is not usable here, so we cannot read from the VMCS. */
820 if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
821 __restore_debug_registers(v);
822 }
824 static void vmx_ctxt_switch_from(struct vcpu *v)
825 {
826 vmx_save_guest_msrs(v);
827 vmx_restore_host_msrs();
828 vmx_save_dr(v);
829 }
831 static void vmx_ctxt_switch_to(struct vcpu *v)
832 {
833 vmx_restore_guest_msrs(v);
834 vmx_restore_dr(v);
835 }
837 static void stop_vmx(void)
838 {
839 if ( !(read_cr4() & X86_CR4_VMXE) )
840 return;
842 __vmxoff();
843 clear_in_cr4(X86_CR4_VMXE);
844 }
846 static void vmx_store_cpu_guest_regs(
847 struct vcpu *v, struct cpu_user_regs *regs, unsigned long *crs)
848 {
849 vmx_vmcs_enter(v);
851 if ( regs != NULL )
852 {
853 regs->eflags = __vmread(GUEST_RFLAGS);
854 regs->ss = __vmread(GUEST_SS_SELECTOR);
855 regs->cs = __vmread(GUEST_CS_SELECTOR);
856 regs->eip = __vmread(GUEST_RIP);
857 regs->esp = __vmread(GUEST_RSP);
858 }
860 if ( crs != NULL )
861 {
862 crs[0] = v->arch.hvm_vmx.cpu_shadow_cr0;
863 crs[2] = v->arch.hvm_vmx.cpu_cr2;
864 crs[3] = v->arch.hvm_vmx.cpu_cr3;
865 crs[4] = v->arch.hvm_vmx.cpu_shadow_cr4;
866 }
868 vmx_vmcs_exit(v);
869 }
871 static void vmx_load_cpu_guest_regs(struct vcpu *v, struct cpu_user_regs *regs)
872 {
873 unsigned long base;
875 vmx_vmcs_enter(v);
877 __vmwrite(GUEST_SS_SELECTOR, regs->ss);
878 __vmwrite(GUEST_RSP, regs->esp);
880 /* NB. Bit 1 of RFLAGS must be set for VMENTRY to succeed. */
881 __vmwrite(GUEST_RFLAGS, regs->eflags | 2UL);
883 if ( regs->eflags & EF_VM )
884 {
885 /*
886 * The VMX spec (section 4.3.1.2, Checks on Guest Segment
887 * Registers) says that virtual-8086 mode guests' segment
888 * base-address fields in the VMCS must be equal to their
889 * corresponding segment selector field shifted right by
890 * four bits upon vmentry.
891 */
892 base = __vmread(GUEST_CS_BASE);
893 if ( (regs->cs << 4) != base )
894 __vmwrite(GUEST_CS_BASE, regs->cs << 4);
895 base = __vmread(GUEST_SS_BASE);
896 if ( (regs->ss << 4) != base )
897 __vmwrite(GUEST_SS_BASE, regs->ss << 4);
898 }
900 __vmwrite(GUEST_CS_SELECTOR, regs->cs);
901 __vmwrite(GUEST_RIP, regs->eip);
903 vmx_vmcs_exit(v);
904 }
906 static unsigned long vmx_get_ctrl_reg(struct vcpu *v, unsigned int num)
907 {
908 switch ( num )
909 {
910 case 0:
911 return v->arch.hvm_vmx.cpu_cr0;
912 case 2:
913 return v->arch.hvm_vmx.cpu_cr2;
914 case 3:
915 return v->arch.hvm_vmx.cpu_cr3;
916 case 4:
917 return v->arch.hvm_vmx.cpu_shadow_cr4;
918 default:
919 BUG();
920 }
921 return 0; /* dummy */
922 }
924 static unsigned long vmx_get_segment_base(struct vcpu *v, enum x86_segment seg)
925 {
926 unsigned long base = 0;
927 int long_mode = 0;
929 ASSERT(v == current);
931 #ifdef __x86_64__
932 if ( vmx_long_mode_enabled(v) && (__vmread(GUEST_CS_AR_BYTES) & (1u<<13)) )
933 long_mode = 1;
934 #endif
936 switch ( seg )
937 {
938 case x86_seg_cs: if ( !long_mode ) base = __vmread(GUEST_CS_BASE); break;
939 case x86_seg_ds: if ( !long_mode ) base = __vmread(GUEST_DS_BASE); break;
940 case x86_seg_es: if ( !long_mode ) base = __vmread(GUEST_ES_BASE); break;
941 case x86_seg_fs: base = __vmread(GUEST_FS_BASE); break;
942 case x86_seg_gs: base = __vmread(GUEST_GS_BASE); break;
943 case x86_seg_ss: if ( !long_mode ) base = __vmread(GUEST_SS_BASE); break;
944 case x86_seg_tr: base = __vmread(GUEST_TR_BASE); break;
945 case x86_seg_gdtr: base = __vmread(GUEST_GDTR_BASE); break;
946 case x86_seg_idtr: base = __vmread(GUEST_IDTR_BASE); break;
947 case x86_seg_ldtr: base = __vmread(GUEST_LDTR_BASE); break;
948 default: BUG(); break;
949 }
951 return base;
952 }
954 static void vmx_get_segment_register(struct vcpu *v, enum x86_segment seg,
955 struct segment_register *reg)
956 {
957 u16 attr = 0;
959 ASSERT(v == current);
961 switch ( seg )
962 {
963 case x86_seg_cs:
964 reg->sel = __vmread(GUEST_CS_SELECTOR);
965 reg->limit = __vmread(GUEST_CS_LIMIT);
966 reg->base = __vmread(GUEST_CS_BASE);
967 attr = __vmread(GUEST_CS_AR_BYTES);
968 break;
969 case x86_seg_ds:
970 reg->sel = __vmread(GUEST_DS_SELECTOR);
971 reg->limit = __vmread(GUEST_DS_LIMIT);
972 reg->base = __vmread(GUEST_DS_BASE);
973 attr = __vmread(GUEST_DS_AR_BYTES);
974 break;
975 case x86_seg_es:
976 reg->sel = __vmread(GUEST_ES_SELECTOR);
977 reg->limit = __vmread(GUEST_ES_LIMIT);
978 reg->base = __vmread(GUEST_ES_BASE);
979 attr = __vmread(GUEST_ES_AR_BYTES);
980 break;
981 case x86_seg_fs:
982 reg->sel = __vmread(GUEST_FS_SELECTOR);
983 reg->limit = __vmread(GUEST_FS_LIMIT);
984 reg->base = __vmread(GUEST_FS_BASE);
985 attr = __vmread(GUEST_FS_AR_BYTES);
986 break;
987 case x86_seg_gs:
988 reg->sel = __vmread(GUEST_GS_SELECTOR);
989 reg->limit = __vmread(GUEST_GS_LIMIT);
990 reg->base = __vmread(GUEST_GS_BASE);
991 attr = __vmread(GUEST_GS_AR_BYTES);
992 break;
993 case x86_seg_ss:
994 reg->sel = __vmread(GUEST_SS_SELECTOR);
995 reg->limit = __vmread(GUEST_SS_LIMIT);
996 reg->base = __vmread(GUEST_SS_BASE);
997 attr = __vmread(GUEST_SS_AR_BYTES);
998 break;
999 case x86_seg_tr:
1000 reg->sel = __vmread(GUEST_TR_SELECTOR);
1001 reg->limit = __vmread(GUEST_TR_LIMIT);
1002 reg->base = __vmread(GUEST_TR_BASE);
1003 attr = __vmread(GUEST_TR_AR_BYTES);
1004 break;
1005 case x86_seg_gdtr:
1006 reg->limit = __vmread(GUEST_GDTR_LIMIT);
1007 reg->base = __vmread(GUEST_GDTR_BASE);
1008 break;
1009 case x86_seg_idtr:
1010 reg->limit = __vmread(GUEST_IDTR_LIMIT);
1011 reg->base = __vmread(GUEST_IDTR_BASE);
1012 break;
1013 case x86_seg_ldtr:
1014 reg->sel = __vmread(GUEST_LDTR_SELECTOR);
1015 reg->limit = __vmread(GUEST_LDTR_LIMIT);
1016 reg->base = __vmread(GUEST_LDTR_BASE);
1017 attr = __vmread(GUEST_LDTR_AR_BYTES);
1018 break;
1019 default:
1020 BUG();
1021 }
1023 reg->attr.bytes = (attr & 0xff) | ((attr >> 4) & 0xf00);
1024 }
1026 /* Make sure that xen intercepts any FP accesses from current */
1027 static void vmx_stts(struct vcpu *v)
1028 {
1029 /* VMX depends on operating on the current vcpu */
1030 ASSERT(v == current);
1032 /*
1033 * If the guest does not have TS enabled then we must cause and handle an
1034 * exception on first use of the FPU. If the guest *does* have TS enabled
1035 * then this is not necessary: no FPU activity can occur until the guest
1036 * clears CR0.TS, and we will initialise the FPU when that happens.
1037 */
1038 if ( !(v->arch.hvm_vmx.cpu_shadow_cr0 & X86_CR0_TS) )
1039 {
1040 v->arch.hvm_vmx.cpu_cr0 |= X86_CR0_TS;
1041 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
1042 __vm_set_bit(EXCEPTION_BITMAP, TRAP_no_device);
1043 }
1044 }
1046 static void vmx_set_tsc_offset(struct vcpu *v, u64 offset)
1047 {
1048 vmx_vmcs_enter(v);
1049 __vmwrite(TSC_OFFSET, offset);
1050 #if defined (__i386__)
1051 __vmwrite(TSC_OFFSET_HIGH, offset >> 32);
1052 #endif
1053 vmx_vmcs_exit(v);
1054 }
1056 static void vmx_init_ap_context(
1057 struct vcpu_guest_context *ctxt, int vcpuid, int trampoline_vector)
1058 {
1059 memset(ctxt, 0, sizeof(*ctxt));
1060 ctxt->user_regs.eip = VMXASSIST_BASE;
1061 ctxt->user_regs.edx = vcpuid;
1062 ctxt->user_regs.ebx = trampoline_vector;
1063 }
1065 void do_nmi(struct cpu_user_regs *);
1067 static void vmx_init_hypercall_page(struct domain *d, void *hypercall_page)
1068 {
1069 char *p;
1070 int i;
1072 memset(hypercall_page, 0, PAGE_SIZE);
1074 for ( i = 0; i < (PAGE_SIZE / 32); i++ )
1075 {
1076 p = (char *)(hypercall_page + (i * 32));
1077 *(u8 *)(p + 0) = 0xb8; /* mov imm32, %eax */
1078 *(u32 *)(p + 1) = i;
1079 *(u8 *)(p + 5) = 0x0f; /* vmcall */
1080 *(u8 *)(p + 6) = 0x01;
1081 *(u8 *)(p + 7) = 0xc1;
1082 *(u8 *)(p + 8) = 0xc3; /* ret */
1083 }
1085 /* Don't support HYPERVISOR_iret at the moment */
1086 *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
1087 }
1089 static int vmx_guest_x86_mode(struct vcpu *v)
1090 {
1091 unsigned int cs_ar_bytes;
1093 ASSERT(v == current);
1095 if ( unlikely(!(v->arch.hvm_vmx.cpu_shadow_cr0 & X86_CR0_PE)) )
1096 return 0;
1097 if ( unlikely(__vmread(GUEST_RFLAGS) & X86_EFLAGS_VM) )
1098 return 1;
1099 cs_ar_bytes = __vmread(GUEST_CS_AR_BYTES);
1100 if ( vmx_long_mode_enabled(v) && likely(cs_ar_bytes & (1u<<13)) )
1101 return 8;
1102 return (likely(cs_ar_bytes & (1u<<14)) ? 4 : 2);
1103 }
1105 static int vmx_pae_enabled(struct vcpu *v)
1106 {
1107 unsigned long cr4 = v->arch.hvm_vmx.cpu_shadow_cr4;
1108 return (vmx_paging_enabled(v) && (cr4 & X86_CR4_PAE));
1109 }
1111 static int vmx_nx_enabled(struct vcpu *v)
1112 {
1113 return v->arch.hvm_vmx.efer & EFER_NX;
1114 }
1116 static int vmx_interrupts_enabled(struct vcpu *v)
1117 {
1118 unsigned long eflags = __vmread(GUEST_RFLAGS);
1119 return !irq_masked(eflags);
1120 }
1123 static void vmx_update_host_cr3(struct vcpu *v)
1124 {
1125 ASSERT( (v == current) || !vcpu_runnable(v) );
1126 vmx_vmcs_enter(v);
1127 __vmwrite(HOST_CR3, v->arch.cr3);
1128 vmx_vmcs_exit(v);
1129 }
1131 static void vmx_update_guest_cr3(struct vcpu *v)
1132 {
1133 ASSERT( (v == current) || !vcpu_runnable(v) );
1134 vmx_vmcs_enter(v);
1135 __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
1136 vmx_vmcs_exit(v);
1137 }
1140 static void vmx_inject_exception(
1141 unsigned int trapnr, int errcode, unsigned long cr2)
1142 {
1143 struct vcpu *v = current;
1144 vmx_inject_hw_exception(v, trapnr, errcode);
1145 if ( trapnr == TRAP_page_fault )
1146 v->arch.hvm_vmx.cpu_cr2 = cr2;
1147 }
1149 static void vmx_update_vtpr(struct vcpu *v, unsigned long value)
1150 {
1151 /* VMX doesn't have a V_TPR field */
1152 }
1154 static int vmx_event_injection_faulted(struct vcpu *v)
1155 {
1156 unsigned int idtv_info_field;
1158 ASSERT(v == current);
1160 idtv_info_field = __vmread(IDT_VECTORING_INFO_FIELD);
1161 return (idtv_info_field & INTR_INFO_VALID_MASK);
1162 }
1164 static void disable_intercept_for_msr(u32 msr)
1165 {
1166 /*
1167 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
1168 * have the write-low and read-high bitmap offsets the wrong way round.
1169 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
1170 */
1171 if ( msr <= 0x1fff )
1172 {
1173 __clear_bit(msr, vmx_msr_bitmap + 0x000); /* read-low */
1174 __clear_bit(msr, vmx_msr_bitmap + 0x800); /* write-low */
1175 }
1176 else if ( (msr >= 0xc0000000) && (msr <= 0xc0001fff) )
1177 {
1178 msr &= 0x1fff;
1179 __clear_bit(msr, vmx_msr_bitmap + 0x400); /* read-high */
1180 __clear_bit(msr, vmx_msr_bitmap + 0xc00); /* write-high */
1181 }
1182 }
1184 static struct hvm_function_table vmx_function_table = {
1185 .name = "VMX",
1186 .disable = stop_vmx,
1187 .domain_initialise = vmx_domain_initialise,
1188 .domain_destroy = vmx_domain_destroy,
1189 .vcpu_initialise = vmx_vcpu_initialise,
1190 .vcpu_destroy = vmx_vcpu_destroy,
1191 .store_cpu_guest_regs = vmx_store_cpu_guest_regs,
1192 .load_cpu_guest_regs = vmx_load_cpu_guest_regs,
1193 .save_cpu_ctxt = vmx_save_vmcs_ctxt,
1194 .load_cpu_ctxt = vmx_load_vmcs_ctxt,
1195 .paging_enabled = vmx_paging_enabled,
1196 .long_mode_enabled = vmx_long_mode_enabled,
1197 .pae_enabled = vmx_pae_enabled,
1198 .nx_enabled = vmx_nx_enabled,
1199 .interrupts_enabled = vmx_interrupts_enabled,
1200 .guest_x86_mode = vmx_guest_x86_mode,
1201 .get_guest_ctrl_reg = vmx_get_ctrl_reg,
1202 .get_segment_base = vmx_get_segment_base,
1203 .get_segment_register = vmx_get_segment_register,
1204 .update_host_cr3 = vmx_update_host_cr3,
1205 .update_guest_cr3 = vmx_update_guest_cr3,
1206 .update_vtpr = vmx_update_vtpr,
1207 .stts = vmx_stts,
1208 .set_tsc_offset = vmx_set_tsc_offset,
1209 .inject_exception = vmx_inject_exception,
1210 .init_ap_context = vmx_init_ap_context,
1211 .init_hypercall_page = vmx_init_hypercall_page,
1212 .event_injection_faulted = vmx_event_injection_faulted
1213 };
1215 int start_vmx(void)
1216 {
1217 u32 eax, edx;
1218 struct vmcs_struct *vmcs;
1220 /*
1221 * Xen does not fill x86_capability words except 0.
1222 */
1223 boot_cpu_data.x86_capability[4] = cpuid_ecx(1);
1225 if ( !test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability) )
1226 return 0;
1228 rdmsr(IA32_FEATURE_CONTROL_MSR, eax, edx);
1230 if ( eax & IA32_FEATURE_CONTROL_MSR_LOCK )
1231 {
1232 if ( (eax & IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON) == 0x0 )
1233 {
1234 printk("VMX disabled by Feature Control MSR.\n");
1235 return 0;
1236 }
1237 }
1238 else
1239 {
1240 wrmsr(IA32_FEATURE_CONTROL_MSR,
1241 IA32_FEATURE_CONTROL_MSR_LOCK |
1242 IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON, 0);
1243 }
1245 set_in_cr4(X86_CR4_VMXE);
1247 vmx_init_vmcs_config();
1249 if ( smp_processor_id() == 0 )
1250 setup_vmcs_dump();
1252 if ( (vmcs = vmx_alloc_host_vmcs()) == NULL )
1253 {
1254 clear_in_cr4(X86_CR4_VMXE);
1255 printk("Failed to allocate host VMCS\n");
1256 return 0;
1257 }
1259 if ( __vmxon(virt_to_maddr(vmcs)) )
1260 {
1261 clear_in_cr4(X86_CR4_VMXE);
1262 printk("VMXON failed\n");
1263 vmx_free_host_vmcs(vmcs);
1264 return 0;
1265 }
1267 vmx_save_host_msrs();
1269 if ( smp_processor_id() != 0 )
1270 return 1;
1272 hvm_enable(&vmx_function_table);
1274 if ( cpu_has_vmx_msr_bitmap )
1275 {
1276 printk("VMX: MSR intercept bitmap enabled\n");
1277 vmx_msr_bitmap = alloc_xenheap_page();
1278 BUG_ON(vmx_msr_bitmap == NULL);
1279 memset(vmx_msr_bitmap, ~0, PAGE_SIZE);
1280 disable_intercept_for_msr(MSR_FS_BASE);
1281 disable_intercept_for_msr(MSR_GS_BASE);
1282 }
1284 return 1;
1285 }
1287 /*
1288 * Not all cases receive valid value in the VM-exit instruction length field.
1289 * Callers must know what they're doing!
1290 */
1291 static int __get_instruction_length(void)
1292 {
1293 int len;
1294 len = __vmread(VM_EXIT_INSTRUCTION_LEN); /* Safe: callers audited */
1295 BUG_ON((len < 1) || (len > 15));
1296 return len;
1297 }
1299 static void inline __update_guest_eip(unsigned long inst_len)
1300 {
1301 unsigned long current_eip;
1303 current_eip = __vmread(GUEST_RIP);
1304 __vmwrite(GUEST_RIP, current_eip + inst_len);
1305 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
1306 }
1308 static void vmx_do_no_device_fault(void)
1309 {
1310 struct vcpu *v = current;
1312 setup_fpu(current);
1313 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
1315 /* Disable TS in guest CR0 unless the guest wants the exception too. */
1316 if ( !(v->arch.hvm_vmx.cpu_shadow_cr0 & X86_CR0_TS) )
1317 {
1318 v->arch.hvm_vmx.cpu_cr0 &= ~X86_CR0_TS;
1319 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
1320 }
1321 }
1323 #define bitmaskof(idx) (1U << ((idx) & 31))
1324 static void vmx_do_cpuid(struct cpu_user_regs *regs)
1326 unsigned int input = (unsigned int)regs->eax;
1327 unsigned int count = (unsigned int)regs->ecx;
1328 unsigned int eax, ebx, ecx, edx;
1330 if ( input == 0x00000004 )
1332 cpuid_count(input, count, &eax, &ebx, &ecx, &edx);
1333 eax &= NUM_CORES_RESET_MASK;
1335 else if ( input == 0x40000003 )
1337 /*
1338 * NB. Unsupported interface for private use of VMXASSIST only.
1339 * Note that this leaf lives at <max-hypervisor-leaf> + 1.
1340 */
1341 u64 value = ((u64)regs->edx << 32) | (u32)regs->ecx;
1342 unsigned long mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
1343 struct vcpu *v = current;
1344 char *p;
1346 gdprintk(XENLOG_INFO, "Input address is 0x%"PRIx64".\n", value);
1348 /* 8-byte aligned valid pseudophys address from vmxassist, please. */
1349 if ( (value & 7) || (mfn == INVALID_MFN) ||
1350 !v->arch.hvm_vmx.vmxassist_enabled )
1352 domain_crash(v->domain);
1353 return;
1356 p = map_domain_page(mfn);
1357 value = *((uint64_t *)(p + (value & (PAGE_SIZE - 1))));
1358 unmap_domain_page(p);
1360 gdprintk(XENLOG_INFO, "Output value is 0x%"PRIx64".\n", value);
1361 ecx = (u32)value;
1362 edx = (u32)(value >> 32);
1363 } else {
1364 hvm_cpuid(input, &eax, &ebx, &ecx, &edx);
1366 if ( input == 0x00000001 )
1368 /* Mask off reserved bits. */
1369 ecx &= ~VMX_VCPU_CPUID_L1_ECX_RESERVED;
1371 ebx &= NUM_THREADS_RESET_MASK;
1373 /* Unsupportable for virtualised CPUs. */
1374 ecx &= ~(bitmaskof(X86_FEATURE_VMXE) |
1375 bitmaskof(X86_FEATURE_EST) |
1376 bitmaskof(X86_FEATURE_TM2) |
1377 bitmaskof(X86_FEATURE_CID));
1379 edx &= ~(bitmaskof(X86_FEATURE_HT) |
1380 bitmaskof(X86_FEATURE_ACPI) |
1381 bitmaskof(X86_FEATURE_ACC));
1384 if ( input == 0x00000006 || input == 0x00000009 || input == 0x0000000A )
1385 eax = ebx = ecx = edx = 0x0;
1388 regs->eax = (unsigned long)eax;
1389 regs->ebx = (unsigned long)ebx;
1390 regs->ecx = (unsigned long)ecx;
1391 regs->edx = (unsigned long)edx;
1393 HVMTRACE_3D(CPUID, current, input,
1394 ((uint64_t)eax << 32) | ebx, ((uint64_t)ecx << 32) | edx);
1397 #define CASE_GET_REG_P(REG, reg) \
1398 case REG_ ## REG: reg_p = (unsigned long *)&(regs->reg); break
1400 #ifdef __i386__
1401 #define CASE_EXTEND_GET_REG_P
1402 #else
1403 #define CASE_EXTEND_GET_REG_P \
1404 CASE_GET_REG_P(R8, r8); \
1405 CASE_GET_REG_P(R9, r9); \
1406 CASE_GET_REG_P(R10, r10); \
1407 CASE_GET_REG_P(R11, r11); \
1408 CASE_GET_REG_P(R12, r12); \
1409 CASE_GET_REG_P(R13, r13); \
1410 CASE_GET_REG_P(R14, r14); \
1411 CASE_GET_REG_P(R15, r15)
1412 #endif
1414 static void vmx_dr_access(unsigned long exit_qualification,
1415 struct cpu_user_regs *regs)
1417 struct vcpu *v = current;
1419 HVMTRACE_0D(DR_WRITE, v);
1421 v->arch.hvm_vcpu.flag_dr_dirty = 1;
1423 /* We could probably be smarter about this */
1424 __restore_debug_registers(v);
1426 /* Allow guest direct access to DR registers */
1427 v->arch.hvm_vcpu.u.vmx.exec_control &= ~CPU_BASED_MOV_DR_EXITING;
1428 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
1429 v->arch.hvm_vcpu.u.vmx.exec_control);
1432 /*
1433 * Invalidate the TLB for va. Invalidate the shadow page corresponding
1434 * the address va.
1435 */
1436 static void vmx_do_invlpg(unsigned long va)
1438 unsigned long eip;
1439 struct vcpu *v = current;
1441 HVMTRACE_2D(INVLPG, v, /*invlpga=*/ 0, va);
1443 eip = __vmread(GUEST_RIP);
1445 HVM_DBG_LOG(DBG_LEVEL_VMMU, "eip=%lx, va=%lx",
1446 eip, va);
1448 /*
1449 * We do the safest things first, then try to update the shadow
1450 * copying from guest
1451 */
1452 paging_invlpg(v, va);
1456 static int vmx_check_descriptor(int long_mode, unsigned long eip, int inst_len,
1457 enum x86_segment seg, unsigned long *base,
1458 u32 *limit, u32 *ar_bytes)
1460 enum vmcs_field ar_field, base_field, limit_field;
1462 *base = 0;
1463 *limit = 0;
1464 if ( seg != x86_seg_es )
1466 unsigned char inst[MAX_INST_LEN];
1467 int i;
1468 extern int inst_copy_from_guest(unsigned char *, unsigned long, int);
1470 if ( !long_mode )
1471 eip += __vmread(GUEST_CS_BASE);
1472 memset(inst, 0, MAX_INST_LEN);
1473 if ( inst_copy_from_guest(inst, eip, inst_len) != inst_len )
1475 gdprintk(XENLOG_ERR, "Get guest instruction failed\n");
1476 domain_crash(current->domain);
1477 return 0;
1480 for ( i = 0; i < inst_len; i++ )
1482 switch ( inst[i] )
1484 case 0xf3: /* REPZ */
1485 case 0xf2: /* REPNZ */
1486 case 0xf0: /* LOCK */
1487 case 0x66: /* data32 */
1488 case 0x67: /* addr32 */
1489 #ifdef __x86_64__
1490 case 0x40 ... 0x4f: /* REX */
1491 #endif
1492 continue;
1493 case 0x2e: /* CS */
1494 seg = x86_seg_cs;
1495 continue;
1496 case 0x36: /* SS */
1497 seg = x86_seg_ss;
1498 continue;
1499 case 0x26: /* ES */
1500 seg = x86_seg_es;
1501 continue;
1502 case 0x64: /* FS */
1503 seg = x86_seg_fs;
1504 continue;
1505 case 0x65: /* GS */
1506 seg = x86_seg_gs;
1507 continue;
1508 case 0x3e: /* DS */
1509 seg = x86_seg_ds;
1510 continue;
1515 switch ( seg )
1517 case x86_seg_cs:
1518 ar_field = GUEST_CS_AR_BYTES;
1519 base_field = GUEST_CS_BASE;
1520 limit_field = GUEST_CS_LIMIT;
1521 break;
1522 case x86_seg_ds:
1523 ar_field = GUEST_DS_AR_BYTES;
1524 base_field = GUEST_DS_BASE;
1525 limit_field = GUEST_DS_LIMIT;
1526 break;
1527 case x86_seg_es:
1528 ar_field = GUEST_ES_AR_BYTES;
1529 base_field = GUEST_ES_BASE;
1530 limit_field = GUEST_ES_LIMIT;
1531 break;
1532 case x86_seg_fs:
1533 ar_field = GUEST_FS_AR_BYTES;
1534 base_field = GUEST_FS_BASE;
1535 limit_field = GUEST_FS_LIMIT;
1536 break;
1537 case x86_seg_gs:
1538 ar_field = GUEST_GS_AR_BYTES;
1539 base_field = GUEST_GS_BASE;
1540 limit_field = GUEST_GS_LIMIT;
1541 break;
1542 case x86_seg_ss:
1543 ar_field = GUEST_SS_AR_BYTES;
1544 base_field = GUEST_SS_BASE;
1545 limit_field = GUEST_SS_LIMIT;
1546 break;
1547 default:
1548 BUG();
1549 return 0;
1552 if ( !long_mode || seg == x86_seg_fs || seg == x86_seg_gs )
1554 *base = __vmread(base_field);
1555 *limit = __vmread(limit_field);
1557 *ar_bytes = __vmread(ar_field);
1559 return !(*ar_bytes & 0x10000);
1562 static void vmx_io_instruction(unsigned long exit_qualification,
1563 unsigned long inst_len)
1565 struct cpu_user_regs *regs;
1566 struct hvm_io_op *pio_opp;
1567 unsigned int port, size;
1568 int dir, df, vm86;
1570 pio_opp = &current->arch.hvm_vcpu.io_op;
1571 pio_opp->instr = INSTR_PIO;
1572 pio_opp->flags = 0;
1574 regs = &pio_opp->io_context;
1576 /* Copy current guest state into io instruction state structure. */
1577 memcpy(regs, guest_cpu_user_regs(), HVM_CONTEXT_STACK_BYTES);
1578 hvm_store_cpu_guest_regs(current, regs, NULL);
1580 vm86 = regs->eflags & X86_EFLAGS_VM ? 1 : 0;
1581 df = regs->eflags & X86_EFLAGS_DF ? 1 : 0;
1583 HVM_DBG_LOG(DBG_LEVEL_IO, "vm86 %d, eip=%x:%lx, "
1584 "exit_qualification = %lx",
1585 vm86, regs->cs, (unsigned long)regs->eip, exit_qualification);
1587 if ( test_bit(6, &exit_qualification) )
1588 port = (exit_qualification >> 16) & 0xFFFF;
1589 else
1590 port = regs->edx & 0xffff;
1592 size = (exit_qualification & 7) + 1;
1593 dir = test_bit(3, &exit_qualification); /* direction */
1595 if (dir==IOREQ_READ)
1596 HVMTRACE_2D(IO_READ, current, port, size);
1597 else
1598 HVMTRACE_2D(IO_WRITE, current, port, size);
1600 if ( test_bit(4, &exit_qualification) ) { /* string instruction */
1601 unsigned long addr, count = 1, base;
1602 paddr_t paddr;
1603 unsigned long gfn;
1604 u32 ar_bytes, limit;
1605 int sign = regs->eflags & X86_EFLAGS_DF ? -1 : 1;
1606 int long_mode = 0;
1608 ar_bytes = __vmread(GUEST_CS_AR_BYTES);
1609 #ifdef __x86_64__
1610 if ( vmx_long_mode_enabled(current) && (ar_bytes & (1u<<13)) )
1611 long_mode = 1;
1612 #endif
1613 addr = __vmread(GUEST_LINEAR_ADDRESS);
1615 if ( test_bit(5, &exit_qualification) ) { /* "rep" prefix */
1616 pio_opp->flags |= REPZ;
1617 count = regs->ecx;
1618 if ( !long_mode && (vm86 || !(ar_bytes & (1u<<14))) )
1619 count &= 0xFFFF;
1622 /*
1623 * In protected mode, guest linear address is invalid if the
1624 * selector is null.
1625 */
1626 if ( !vmx_check_descriptor(long_mode, regs->eip, inst_len,
1627 dir==IOREQ_WRITE ? x86_seg_ds : x86_seg_es,
1628 &base, &limit, &ar_bytes) ) {
1629 if ( !long_mode ) {
1630 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1631 return;
1633 addr = dir == IOREQ_WRITE ? base + regs->esi : regs->edi;
1636 if ( !long_mode ) {
1637 unsigned long ea = addr - base;
1639 /* Segment must be readable for outs and writeable for ins. */
1640 if ( dir == IOREQ_WRITE ? (ar_bytes & 0xa) == 0x8
1641 : (ar_bytes & 0xa) != 0x2 ) {
1642 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1643 return;
1646 /* Offset must be within limits. */
1647 ASSERT(ea == (u32)ea);
1648 if ( (u32)(ea + size - 1) < (u32)ea ||
1649 (ar_bytes & 0xc) != 0x4 ? ea + size - 1 > limit
1650 : ea <= limit )
1652 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1653 return;
1656 /* Check the limit for repeated instructions, as above we checked
1657 only the first instance. Truncate the count if a limit violation
1658 would occur. Note that the checking is not necessary for page
1659 granular segments as transfers crossing page boundaries will be
1660 broken up anyway. */
1661 if ( !(ar_bytes & (1u<<15)) && count > 1 )
1663 if ( (ar_bytes & 0xc) != 0x4 )
1665 /* expand-up */
1666 if ( !df )
1668 if ( ea + count * size - 1 < ea ||
1669 ea + count * size - 1 > limit )
1670 count = (limit + 1UL - ea) / size;
1672 else
1674 if ( count - 1 > ea / size )
1675 count = ea / size + 1;
1678 else
1680 /* expand-down */
1681 if ( !df )
1683 if ( count - 1 > -(s32)ea / size )
1684 count = -(s32)ea / size + 1UL;
1686 else
1688 if ( ea < (count - 1) * size ||
1689 ea - (count - 1) * size <= limit )
1690 count = (ea - limit - 1) / size + 1;
1693 ASSERT(count);
1696 #ifdef __x86_64__
1697 else
1699 if ( !is_canonical_address(addr) ||
1700 !is_canonical_address(addr + size - 1) )
1702 vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
1703 return;
1705 if ( count > (1UL << 48) / size )
1706 count = (1UL << 48) / size;
1707 if ( !(regs->eflags & EF_DF) )
1709 if ( addr + count * size - 1 < addr ||
1710 !is_canonical_address(addr + count * size - 1) )
1711 count = (addr & ~((1UL << 48) - 1)) / size;
1713 else
1715 if ( (count - 1) * size > addr ||
1716 !is_canonical_address(addr + (count - 1) * size) )
1717 count = (addr & ~((1UL << 48) - 1)) / size + 1;
1719 ASSERT(count);
1721 #endif
1723 /* Translate the address to a physical address */
1724 gfn = paging_gva_to_gfn(current, addr);
1725 if ( gfn == INVALID_GFN )
1727 /* The guest does not have the RAM address mapped.
1728 * Need to send in a page fault */
1729 int errcode = 0;
1730 /* IO read --> memory write */
1731 if ( dir == IOREQ_READ ) errcode |= PFEC_write_access;
1732 vmx_inject_exception(TRAP_page_fault, errcode, addr);
1733 return;
1735 paddr = (paddr_t)gfn << PAGE_SHIFT | (addr & ~PAGE_MASK);
1737 /*
1738 * Handle string pio instructions that cross pages or that
1739 * are unaligned. See the comments in hvm_domain.c/handle_mmio()
1740 */
1741 if ( (addr & PAGE_MASK) != ((addr + size - 1) & PAGE_MASK) ) {
1742 unsigned long value = 0;
1744 pio_opp->flags |= OVERLAP;
1746 if ( dir == IOREQ_WRITE ) /* OUTS */
1748 if ( hvm_paging_enabled(current) )
1750 int rv = hvm_copy_from_guest_virt(&value, addr, size);
1751 if ( rv != 0 )
1753 /* Failed on the page-spanning copy. Inject PF into
1754 * the guest for the address where we failed. */
1755 addr += size - rv;
1756 gdprintk(XENLOG_DEBUG, "Pagefault reading non-io side "
1757 "of a page-spanning PIO: va=%#lx\n", addr);
1758 vmx_inject_exception(TRAP_page_fault, 0, addr);
1759 return;
1762 else
1763 (void) hvm_copy_from_guest_phys(&value, addr, size);
1764 } else /* dir != IOREQ_WRITE */
1765 /* Remember where to write the result, as a *VA*.
1766 * Must be a VA so we can handle the page overlap
1767 * correctly in hvm_pio_assist() */
1768 pio_opp->addr = addr;
1770 if ( count == 1 )
1771 regs->eip += inst_len;
1773 send_pio_req(port, 1, size, value, dir, df, 0);
1774 } else {
1775 unsigned long last_addr = sign > 0 ? addr + count * size - 1
1776 : addr - (count - 1) * size;
1778 if ( (addr & PAGE_MASK) != (last_addr & PAGE_MASK) )
1780 if ( sign > 0 )
1781 count = (PAGE_SIZE - (addr & ~PAGE_MASK)) / size;
1782 else
1783 count = (addr & ~PAGE_MASK) / size + 1;
1784 } else
1785 regs->eip += inst_len;
1787 send_pio_req(port, count, size, paddr, dir, df, 1);
1789 } else {
1790 if ( port == 0xe9 && dir == IOREQ_WRITE && size == 1 )
1791 hvm_print_line(current, regs->eax); /* guest debug output */
1793 regs->eip += inst_len;
1794 send_pio_req(port, 1, size, regs->eax, dir, df, 0);
1798 static void vmx_world_save(struct vcpu *v, struct vmx_assist_context *c)
1800 /* NB. Skip transition instruction. */
1801 c->eip = __vmread(GUEST_RIP);
1802 c->eip += __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
1804 c->esp = __vmread(GUEST_RSP);
1805 c->eflags = __vmread(GUEST_RFLAGS);
1807 c->cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;
1808 c->cr3 = v->arch.hvm_vmx.cpu_cr3;
1809 c->cr4 = v->arch.hvm_vmx.cpu_shadow_cr4;
1811 c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
1812 c->idtr_base = __vmread(GUEST_IDTR_BASE);
1814 c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
1815 c->gdtr_base = __vmread(GUEST_GDTR_BASE);
1817 c->cs_sel = __vmread(GUEST_CS_SELECTOR);
1818 c->cs_limit = __vmread(GUEST_CS_LIMIT);
1819 c->cs_base = __vmread(GUEST_CS_BASE);
1820 c->cs_arbytes.bytes = __vmread(GUEST_CS_AR_BYTES);
1822 c->ds_sel = __vmread(GUEST_DS_SELECTOR);
1823 c->ds_limit = __vmread(GUEST_DS_LIMIT);
1824 c->ds_base = __vmread(GUEST_DS_BASE);
1825 c->ds_arbytes.bytes = __vmread(GUEST_DS_AR_BYTES);
1827 c->es_sel = __vmread(GUEST_ES_SELECTOR);
1828 c->es_limit = __vmread(GUEST_ES_LIMIT);
1829 c->es_base = __vmread(GUEST_ES_BASE);
1830 c->es_arbytes.bytes = __vmread(GUEST_ES_AR_BYTES);
1832 c->ss_sel = __vmread(GUEST_SS_SELECTOR);
1833 c->ss_limit = __vmread(GUEST_SS_LIMIT);
1834 c->ss_base = __vmread(GUEST_SS_BASE);
1835 c->ss_arbytes.bytes = __vmread(GUEST_SS_AR_BYTES);
1837 c->fs_sel = __vmread(GUEST_FS_SELECTOR);
1838 c->fs_limit = __vmread(GUEST_FS_LIMIT);
1839 c->fs_base = __vmread(GUEST_FS_BASE);
1840 c->fs_arbytes.bytes = __vmread(GUEST_FS_AR_BYTES);
1842 c->gs_sel = __vmread(GUEST_GS_SELECTOR);
1843 c->gs_limit = __vmread(GUEST_GS_LIMIT);
1844 c->gs_base = __vmread(GUEST_GS_BASE);
1845 c->gs_arbytes.bytes = __vmread(GUEST_GS_AR_BYTES);
1847 c->tr_sel = __vmread(GUEST_TR_SELECTOR);
1848 c->tr_limit = __vmread(GUEST_TR_LIMIT);
1849 c->tr_base = __vmread(GUEST_TR_BASE);
1850 c->tr_arbytes.bytes = __vmread(GUEST_TR_AR_BYTES);
1852 c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR);
1853 c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT);
1854 c->ldtr_base = __vmread(GUEST_LDTR_BASE);
1855 c->ldtr_arbytes.bytes = __vmread(GUEST_LDTR_AR_BYTES);
1858 static int vmx_world_restore(struct vcpu *v, struct vmx_assist_context *c)
1860 unsigned long mfn, old_base_mfn;
1862 __vmwrite(GUEST_RIP, c->eip);
1863 __vmwrite(GUEST_RSP, c->esp);
1864 __vmwrite(GUEST_RFLAGS, c->eflags);
1866 v->arch.hvm_vmx.cpu_shadow_cr0 = c->cr0;
1867 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
1869 if ( !vmx_paging_enabled(v) )
1870 goto skip_cr3;
1872 if ( c->cr3 == v->arch.hvm_vmx.cpu_cr3 )
1874 /*
1875 * This is simple TLB flush, implying the guest has
1876 * removed some translation or changed page attributes.
1877 * We simply invalidate the shadow.
1878 */
1879 mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
1880 if ( mfn != pagetable_get_pfn(v->arch.guest_table) )
1881 goto bad_cr3;
1883 else
1885 /*
1886 * If different, make a shadow. Check if the PDBR is valid
1887 * first.
1888 */
1889 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %x", c->cr3);
1890 mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
1891 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
1892 goto bad_cr3;
1893 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1894 v->arch.guest_table = pagetable_from_pfn(mfn);
1895 if (old_base_mfn)
1896 put_page(mfn_to_page(old_base_mfn));
1897 v->arch.hvm_vmx.cpu_cr3 = c->cr3;
1900 skip_cr3:
1901 if ( !vmx_paging_enabled(v) )
1902 HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
1903 else
1904 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3);
1906 __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
1907 v->arch.hvm_vmx.cpu_shadow_cr4 = c->cr4;
1908 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);
1910 __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
1911 __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
1913 __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
1914 __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
1916 __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
1917 __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
1918 __vmwrite(GUEST_CS_BASE, c->cs_base);
1919 __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes.bytes);
1921 __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
1922 __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
1923 __vmwrite(GUEST_DS_BASE, c->ds_base);
1924 __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes.bytes);
1926 __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
1927 __vmwrite(GUEST_ES_LIMIT, c->es_limit);
1928 __vmwrite(GUEST_ES_BASE, c->es_base);
1929 __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes.bytes);
1931 __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
1932 __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
1933 __vmwrite(GUEST_SS_BASE, c->ss_base);
1934 __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes.bytes);
1936 __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
1937 __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
1938 __vmwrite(GUEST_FS_BASE, c->fs_base);
1939 __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes.bytes);
1941 __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
1942 __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
1943 __vmwrite(GUEST_GS_BASE, c->gs_base);
1944 __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes.bytes);
1946 __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
1947 __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
1948 __vmwrite(GUEST_TR_BASE, c->tr_base);
1949 __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes.bytes);
1951 __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
1952 __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
1953 __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
1954 __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes.bytes);
1956 paging_update_paging_modes(v);
1957 return 0;
1959 bad_cr3:
1960 gdprintk(XENLOG_ERR, "Invalid CR3 value=%x", c->cr3);
1961 return -EINVAL;
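/*
 * Illustrative sketch, not part of the original file: vmx_world_restore()
 * above treats a guest CR3 value in one of two ways.  The helper below
 * mirrors that decision in plain C; the enum and function names are
 * hypothetical.
 */
enum example_cr3_action { EXAMPLE_CR3_FLUSH_ONLY, EXAMPLE_CR3_NEW_SHADOW };

static inline enum example_cr3_action
example_cr3_reload_kind(unsigned long new_cr3, unsigned long cur_cr3)
{
    /* Reloading the same CR3 is just a request to flush the TLB, so the
     * existing shadow is kept and only invalidated. */
    if ( new_cr3 == cur_cr3 )
        return EXAMPLE_CR3_FLUSH_ONLY;
    /* A different CR3 names a new top-level page table (gfn = cr3 >> 12
     * for 4 KiB pages), which must be validated and given a new shadow. */
    return EXAMPLE_CR3_NEW_SHADOW;
}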
1964 enum { VMX_ASSIST_INVOKE = 0, VMX_ASSIST_RESTORE };
1966 static int vmx_assist(struct vcpu *v, int mode)
1968 struct vmx_assist_context c;
1969 u32 magic;
1970 u32 cp;
1972 /* Make sure vmxassist is present; its absence is not an error. */
1973 if (hvm_copy_from_guest_phys(&magic, VMXASSIST_MAGIC_OFFSET,
1974 sizeof(magic)))
1975 return 0;
1976 if (magic != VMXASSIST_MAGIC)
1977 return 0;
1979 switch (mode) {
1980 /*
1981 * Transfer control to vmxassist.
1982 * Store the current context at VMXASSIST_OLD_CONTEXT and load
1983 * the context found at VMXASSIST_NEW_CONTEXT. The latter was
1984 * created by vmxassist and transfers control to it.
1985 */
1986 case VMX_ASSIST_INVOKE:
1987 /* save the old context */
1988 if (hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)))
1989 goto error;
1990 if (cp != 0) {
1991 vmx_world_save(v, &c);
1992 if (hvm_copy_to_guest_phys(cp, &c, sizeof(c)))
1993 goto error;
1996 /* restore the new context, this should activate vmxassist */
1997 if (hvm_copy_from_guest_phys(&cp, VMXASSIST_NEW_CONTEXT, sizeof(cp)))
1998 goto error;
1999 if (cp != 0) {
2000 if (hvm_copy_from_guest_phys(&c, cp, sizeof(c)))
2001 goto error;
2002 if ( vmx_world_restore(v, &c) != 0 )
2003 goto error;
2004 v->arch.hvm_vmx.vmxassist_enabled = 1;
2005 /*
2006 * The 32-bit vmxassist vm86.c support code is hard-coded to
2007 * expect vPIC interrupts to arrive at interrupt traps 0x20-0x27
2008 * and 0x28-0x2f. It bounces these to 16-bit boot code traps
2009 * 0x08-0x0f and 0x70-0x77. But when the guest transitions
2010 * to true native 32-bit mode, vmxassist steps out of the
2011 * way and no such bouncing occurs; so we need to rewrite
2012 * the vPIC irq base to point directly to 0x08/0x70 (see
2013 * code just below). So on re-entering 16-bit mode, we need
2014 * to reset the vPICs to the 0x20/0x28 bounce traps (see the sketch below).
2015 */
2016 v->domain->arch.hvm_domain.vpic[0].irq_base = 0x20;
2017 v->domain->arch.hvm_domain.vpic[1].irq_base = 0x28;
2018 return 1;
2020 break;
2022 /*
2023 * Restore the VMXASSIST_OLD_CONTEXT that was saved by
2024 * VMX_ASSIST_INVOKE above.
2025 */
2026 case VMX_ASSIST_RESTORE:
2027 /* fetch the context previously saved at VMXASSIST_OLD_CONTEXT */
2028 if (hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)))
2029 goto error;
2030 if (cp != 0) {
2031 if (hvm_copy_from_guest_phys(&c, cp, sizeof(c)))
2032 goto error;
2033 if ( vmx_world_restore(v, &c) != 0 )
2034 goto error;
2035 v->arch.hvm_vmx.vmxassist_enabled = 0;
2036 /*
2037 * See comment above about vmxassist 16/32-bit vPIC behaviour.
2038 * The irq_base values are hard-coded into vmxassist vm86.c.
2039 */
2040 v->domain->arch.hvm_domain.vpic[0].irq_base = 0x08;
2041 v->domain->arch.hvm_domain.vpic[1].irq_base = 0x70;
2042 return 1;
2044 break;
2047 error:
2048 gdprintk(XENLOG_ERR, "Failed to transfer to vmxassist\n");
2049 domain_crash(v->domain);
2050 return 0;
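/*
 * Illustrative sketch, not part of the original file: the two vmx_assist()
 * cases above differ only in which vPIC vector bases they install.  The
 * helper below captures that mapping; the function name is hypothetical.
 */
static inline void example_vpic_irq_bases(int vmxassist_active,
                                          unsigned int *master_base,
                                          unsigned int *slave_base)
{
    if ( vmxassist_active )
    {
        /* vmxassist bounce traps: master vPIC at 0x20, slave at 0x28. */
        *master_base = 0x20;
        *slave_base  = 0x28;
    }
    else
    {
        /* Native rombios vectors: master vPIC at 0x08, slave at 0x70. */
        *master_base = 0x08;
        *slave_base  = 0x70;
    }
}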
2053 static int vmx_set_cr0(unsigned long value)
2055 struct vcpu *v = current;
2056 unsigned long mfn;
2057 unsigned long eip;
2058 int paging_enabled;
2059 unsigned long vm_entry_value;
2060 unsigned long old_cr0;
2061 unsigned long old_base_mfn;
2063 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx", value);
2065 /* ET is reserved and should always be 1. */
2066 value |= X86_CR0_ET;
2068 if ( (value & (X86_CR0_PE|X86_CR0_PG)) == X86_CR0_PG )
2070 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2071 return 0;
2074 /* TS cleared? Then initialise FPU now. */
2075 if ( !(value & X86_CR0_TS) )
2077 setup_fpu(v);
2078 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
2081 old_cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;
2082 paging_enabled = old_cr0 & X86_CR0_PG;
2084 v->arch.hvm_vmx.cpu_cr0 = (value | X86_CR0_PE | X86_CR0_PG
2085 | X86_CR0_NE | X86_CR0_WP);
2086 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
2088 v->arch.hvm_vmx.cpu_shadow_cr0 = value;
2089 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
2091 if ( (value & X86_CR0_PE) && (value & X86_CR0_PG) && !paging_enabled )
2093 /*
2094 * The guest is trying to enable paging.
2095 * Its CR3 must point to a valid guest physical frame.
2096 */
2097 mfn = get_mfn_from_gpfn(v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT);
2098 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
2100 gdprintk(XENLOG_ERR, "Invalid CR3 value = %lx (mfn=%lx)\n",
2101 v->arch.hvm_vmx.cpu_cr3, mfn);
2102 domain_crash(v->domain);
2103 return 0;
2106 #if defined(__x86_64__)
2107 if ( vmx_lme_is_set(v) )
2109 if ( !(v->arch.hvm_vmx.cpu_shadow_cr4 & X86_CR4_PAE) )
2111 HVM_DBG_LOG(DBG_LEVEL_1, "Guest enabled paging "
2112 "with EFER.LME set but not CR4.PAE");
2113 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2115 else
2117 HVM_DBG_LOG(DBG_LEVEL_1, "Enabling long mode");
2118 v->arch.hvm_vmx.efer |= EFER_LMA;
2119 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
2120 vm_entry_value |= VM_ENTRY_IA32E_MODE;
2121 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
2124 #endif
2126 /*
2127 * arch.guest_table now points to the machine physical frame.
2128 */
2129 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
2130 v->arch.guest_table = pagetable_from_pfn(mfn);
2131 if (old_base_mfn)
2132 put_page(mfn_to_page(old_base_mfn));
2133 paging_update_paging_modes(v);
2135 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
2136 (unsigned long) (mfn << PAGE_SHIFT));
2138 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
2139 v->arch.hvm_vmx.cpu_cr3, mfn);
2142 if ( !((value & X86_CR0_PE) && (value & X86_CR0_PG)) && paging_enabled )
2143 if ( v->arch.hvm_vmx.cpu_cr3 ) {
2144 put_page(mfn_to_page(get_mfn_from_gpfn(
2145 v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)));
2146 v->arch.guest_table = pagetable_null();
2149 /*
2150 * VMX does not implement real-mode virtualization. We emulate
2151 * real mode by performing a world switch to VMXAssist whenever
2152 * the guest clears the CR0.PE bit.
2153 */
2154 if ( (value & X86_CR0_PE) == 0 )
2156 if ( value & X86_CR0_PG ) {
2157 /* Clearing PE while leaving PG set is invalid: inject #GP. */
2158 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2159 return 0;
2160 } else {
2161 /*
2162 * Disable paging here: this is the same situation as
2163 * PE == 1 && PG == 0.
2164 */
2165 if ( vmx_long_mode_enabled(v) )
2167 v->arch.hvm_vmx.efer &= ~EFER_LMA;
2168 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
2169 vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
2170 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
2174 if ( vmx_assist(v, VMX_ASSIST_INVOKE) )
2176 eip = __vmread(GUEST_RIP);
2177 HVM_DBG_LOG(DBG_LEVEL_1,
2178 "Transferring control to vmxassist %%eip 0x%lx", eip);
2179 return 0; /* do not update eip! */
2182 else if ( v->arch.hvm_vmx.vmxassist_enabled )
2184 eip = __vmread(GUEST_RIP);
2185 HVM_DBG_LOG(DBG_LEVEL_1,
2186 "Enabling CR0.PE at %%eip 0x%lx", eip);
2187 if ( vmx_assist(v, VMX_ASSIST_RESTORE) )
2189 eip = __vmread(GUEST_RIP);
2190 HVM_DBG_LOG(DBG_LEVEL_1,
2191 "Restoring to %%eip 0x%lx", eip);
2192 return 0; /* do not update eip! */
2195 else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
2197 if ( vmx_long_mode_enabled(v) )
2199 v->arch.hvm_vmx.efer &= ~EFER_LMA;
2200 vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
2201 vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
2202 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
2204 paging_update_paging_modes(v);
2207 return 1;
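/*
 * Illustrative sketch, not part of the original file: the first check in
 * vmx_set_cr0() rejects CR0.PG set with CR0.PE clear.  The helper below
 * shows that test with the architectural bit positions spelled out; the
 * macro and function names are hypothetical.
 */
#define EXAMPLE_CR0_PE (1UL << 0)   /* protected mode enable */
#define EXAMPLE_CR0_PG (1UL << 31)  /* paging enable */

static inline int example_cr0_is_invalid(unsigned long cr0)
{
    /* Paging without protected mode is architecturally invalid -> #GP. */
    return (cr0 & (EXAMPLE_CR0_PE | EXAMPLE_CR0_PG)) == EXAMPLE_CR0_PG;
}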
2210 #define CASE_SET_REG(REG, reg) \
2211 case REG_ ## REG: regs->reg = value; break
2212 #define CASE_GET_REG(REG, reg) \
2213 case REG_ ## REG: value = regs->reg; break
2215 #define CASE_EXTEND_SET_REG \
2216 CASE_EXTEND_REG(S)
2217 #define CASE_EXTEND_GET_REG \
2218 CASE_EXTEND_REG(G)
2220 #ifdef __i386__
2221 #define CASE_EXTEND_REG(T)
2222 #else
2223 #define CASE_EXTEND_REG(T) \
2224 CASE_ ## T ## ET_REG(R8, r8); \
2225 CASE_ ## T ## ET_REG(R9, r9); \
2226 CASE_ ## T ## ET_REG(R10, r10); \
2227 CASE_ ## T ## ET_REG(R11, r11); \
2228 CASE_ ## T ## ET_REG(R12, r12); \
2229 CASE_ ## T ## ET_REG(R13, r13); \
2230 CASE_ ## T ## ET_REG(R14, r14); \
2231 CASE_ ## T ## ET_REG(R15, r15)
2232 #endif
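/*
 * Illustrative note, not part of the original file: with the macros above,
 * CASE_GET_REG(EAX, eax) expands to
 *
 *     case REG_EAX: value = regs->eax; break;
 *
 * and CASE_EXTEND_GET_REG adds the same pattern for r8..r15 when building
 * for x86-64 (on i386 it expands to nothing).
 */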
2234 /*
2235 * Write to control registers
2236 */
2237 static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
2239 unsigned long value, old_cr, old_base_mfn, mfn;
2240 struct vcpu *v = current;
2241 struct vlapic *vlapic = vcpu_vlapic(v);
2243 switch ( gp )
2245 CASE_GET_REG(EAX, eax);
2246 CASE_GET_REG(ECX, ecx);
2247 CASE_GET_REG(EDX, edx);
2248 CASE_GET_REG(EBX, ebx);
2249 CASE_GET_REG(EBP, ebp);
2250 CASE_GET_REG(ESI, esi);
2251 CASE_GET_REG(EDI, edi);
2252 CASE_EXTEND_GET_REG;
2253 case REG_ESP:
2254 value = __vmread(GUEST_RSP);
2255 break;
2256 default:
2257 gdprintk(XENLOG_ERR, "invalid gp: %d\n", gp);
2258 goto exit_and_crash;
2261 HVMTRACE_2D(CR_WRITE, v, cr, value);
2263 HVM_DBG_LOG(DBG_LEVEL_1, "CR%d, value = %lx", cr, value);
2265 switch ( cr )
2267 case 0:
2268 return vmx_set_cr0(value);
2270 case 3:
2271 /*
2272 * If paging is not enabled yet, simply copy the value to CR3.
2273 */
2274 if (!vmx_paging_enabled(v)) {
2275 v->arch.hvm_vmx.cpu_cr3 = value;
2276 break;
2279 /*
2280 * Make a new shadow page table if one does not already exist.
2281 */
2282 if (value == v->arch.hvm_vmx.cpu_cr3) {
2283 /*
2284 * This is a simple TLB flush, implying the guest has
2285 * removed some translation or changed page attributes.
2286 * We simply invalidate the shadow.
2287 */
2288 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
2289 if (mfn != pagetable_get_pfn(v->arch.guest_table))
2290 goto bad_cr3;
2291 paging_update_cr3(v);
2292 } else {
2293 /*
2294 * If different, make a new shadow. Check first that the
2295 * PDBR (the new CR3) is valid.
2296 */
2297 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
2298 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
2299 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
2300 goto bad_cr3;
2301 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
2302 v->arch.guest_table = pagetable_from_pfn(mfn);
2303 if (old_base_mfn)
2304 put_page(mfn_to_page(old_base_mfn));
2305 v->arch.hvm_vmx.cpu_cr3 = value;
2306 update_cr3(v);
2307 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
2309 break;
2311 case 4: /* CR4 */
2312 old_cr = v->arch.hvm_vmx.cpu_shadow_cr4;
2314 if ( (value & X86_CR4_PAE) && !(old_cr & X86_CR4_PAE) )
2316 if ( vmx_pgbit_test(v) )
2318 /* The guest is a 32-bit PAE guest. */
2319 #if CONFIG_PAGING_LEVELS >= 3
2320 unsigned long mfn, old_base_mfn;
2321 mfn = get_mfn_from_gpfn(v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT);
2322 if ( !mfn_valid(mfn) ||
2323 !get_page(mfn_to_page(mfn), v->domain) )
2324 goto bad_cr3;
2326 /*
2327 * arch.guest_table now points to the machine physical frame.
2328 */
2330 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
2331 v->arch.guest_table = pagetable_from_pfn(mfn);
2332 if ( old_base_mfn )
2333 put_page(mfn_to_page(old_base_mfn));
2335 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
2336 (unsigned long) (mfn << PAGE_SHIFT));
2338 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2339 "Update CR3 value = %lx, mfn = %lx",
2340 v->arch.hvm_vmx.cpu_cr3, mfn);
2341 #endif
2344 else if ( !(value & X86_CR4_PAE) )
2346 if ( unlikely(vmx_long_mode_enabled(v)) )
2348 HVM_DBG_LOG(DBG_LEVEL_1, "Guest cleared CR4.PAE while "
2349 "EFER.LMA is set");
2350 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2354 __vmwrite(GUEST_CR4, value | VMX_CR4_HOST_MASK);
2355 v->arch.hvm_vmx.cpu_shadow_cr4 = value;
2356 __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);
2358 /*
2359 * Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates
2360 * all TLB entries except global entries.
2361 */
2362 if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
2363 paging_update_paging_modes(v);
2364 break;
2366 case 8:
2367 vlapic_set_reg(vlapic, APIC_TASKPRI, ((value & 0x0F) << 4));
2368 break;
2370 default:
2371 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
2372 domain_crash(v->domain);
2373 return 0;
2376 return 1;
2378 bad_cr3:
2379 gdprintk(XENLOG_ERR, "Invalid CR3\n");
2380 exit_and_crash:
2381 domain_crash(v->domain);
2382 return 0;
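/*
 * Illustrative sketch, not part of the original file: the CR4 path above
 * recomputes the paging mode only when PSE, PGE or PAE actually changes.
 * The helper below shows the XOR test with the architectural bit numbers;
 * the function name is hypothetical.
 */
static inline int example_cr4_paging_bits_changed(unsigned long old_cr4,
                                                  unsigned long new_cr4)
{
    const unsigned long mask = (1UL << 4)   /* PSE */
                             | (1UL << 5)   /* PAE */
                             | (1UL << 7);  /* PGE */
    /* XOR leaves set exactly the bits that differ between old and new. */
    return ((old_cr4 ^ new_cr4) & mask) != 0;
}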
2385 /*
2386 * Read from control registers. Only CR3 and CR8 reads trap here; CR0 and CR4 reads are satisfied from the read shadows without a VM exit.
2387 */
2388 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
2390 unsigned long value = 0;
2391 struct vcpu *v = current;
2392 struct vlapic *vlapic = vcpu_vlapic(v);
2394 switch ( cr )
2396 case 3:
2397 value = (unsigned long)v->arch.hvm_vmx.cpu_cr3;
2398 break;
2399 case 8:
2400 value = (unsigned long)vlapic_get_reg(vlapic, APIC_TASKPRI);
2401 value = (value & 0xF0) >> 4;
2402 break;
2403 default:
2404 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
2405 domain_crash(v->domain);
2406 break;
2409 switch ( gp ) {
2410 CASE_SET_REG(EAX, eax);
2411 CASE_SET_REG(ECX, ecx);
2412 CASE_SET_REG(EDX, edx);
2413 CASE_SET_REG(EBX, ebx);
2414 CASE_SET_REG(EBP, ebp);
2415 CASE_SET_REG(ESI, esi);
2416 CASE_SET_REG(EDI, edi);
2417 CASE_EXTEND_SET_REG;
2418 case REG_ESP:
2419 __vmwrite(GUEST_RSP, value);
2420 regs->esp = value;
2421 break;
2422 default:
2423 printk("invalid gp: %d\n", gp);
2424 domain_crash(v->domain);
2425 break;
2428 HVMTRACE_2D(CR_READ, v, cr, value);
2430 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR%d, value = %lx", cr, value);
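/*
 * Illustrative sketch, not part of the original file: CR8 and the local
 * APIC TPR hold the same priority class, just in different bit positions,
 * which is all the CR8 cases above and in mov_to_cr() do.  The function
 * names are hypothetical.
 */
static inline unsigned long example_cr8_to_tpr(unsigned long cr8)
{
    /* CR8 keeps the class in bits 3:0; the TPR keeps it in bits 7:4. */
    return (cr8 & 0x0F) << 4;
}

static inline unsigned long example_tpr_to_cr8(unsigned long tpr)
{
    return (tpr & 0xF0) >> 4;
}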
2433 static int vmx_cr_access(unsigned long exit_qualification,
2434 struct cpu_user_regs *regs)
2436 unsigned int gp, cr;
2437 unsigned long value;
2438 struct vcpu *v = current;
2440 switch (exit_qualification & CONTROL_REG_ACCESS_TYPE) {
2441 case TYPE_MOV_TO_CR:
2442 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
2443 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
2444 return mov_to_cr(gp, cr, regs);
2445 case TYPE_MOV_FROM_CR:
2446 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
2447 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
2448 mov_from_cr(cr, gp, regs);
2449 break;
2450 case TYPE_CLTS:
2451 /* We initialise the FPU now, to avoid needing another vmexit. */
2452 setup_fpu(v);
2453 __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
2455 v->arch.hvm_vmx.cpu_cr0 &= ~X86_CR0_TS; /* clear TS */
2456 __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
2458 v->arch.hvm_vmx.cpu_shadow_cr0 &= ~X86_CR0_TS; /* clear TS */
2459 __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
2460 break;
2461 case TYPE_LMSW:
2462 value = v->arch.hvm_vmx.cpu_shadow_cr0;
2463 value = (value & ~0xF) |
2464 (((exit_qualification & LMSW_SOURCE_DATA) >> 16) & 0xF);
2465 return vmx_set_cr0(value);
2466 default:
2467 BUG();
2470 return 1;
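/*
 * Illustrative sketch, not part of the original file: a CR-access exit
 * qualification packs its fields as described in the Intel SDM.  The
 * decoder below spells out the layout that the CONTROL_REG_ACCESS_* and
 * LMSW_SOURCE_DATA masks above select symbolically; the struct and
 * function names are hypothetical.
 */
struct example_cr_qual {
    unsigned int cr;          /* bits  3:0  - control register number  */
    unsigned int access_type; /* bits  5:4  - 0 MOV-to-CR, 1 MOV-from- */
                              /*              CR, 2 CLTS, 3 LMSW       */
    unsigned int gp_reg;      /* bits 11:8  - general purpose register */
    unsigned int lmsw_data;   /* bits 31:16 - LMSW source operand      */
};

static inline struct example_cr_qual example_decode_cr_qual(unsigned long q)
{
    struct example_cr_qual d;
    d.cr          = q & 0xf;
    d.access_type = (q >> 4) & 0x3;
    d.gp_reg      = (q >> 8) & 0xf;
    d.lmsw_data   = (q >> 16) & 0xffff;
    return d;
}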
2473 static inline int vmx_do_msr_read(struct cpu_user_regs *regs)
2475 u64 msr_content = 0;
2476 u32 ecx = regs->ecx, eax, edx;
2477 struct vcpu *v = current;
2479 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x", ecx);
2481 switch (ecx) {
2482 case MSR_IA32_TIME_STAMP_COUNTER:
2483 msr_content = hvm_get_guest_time(v);
2484 break;
2485 case MSR_IA32_SYSENTER_CS:
2486 msr_content = (u32)__vmread(GUEST_SYSENTER_CS);
2487 break;
2488 case MSR_IA32_SYSENTER_ESP:
2489 msr_content = __vmread(GUEST_SYSENTER_ESP);
2490 break;
2491 case MSR_IA32_SYSENTER_EIP:
2492 msr_content = __vmread(GUEST_SYSENTER_EIP);
2493 break;
2494 case MSR_IA32_APICBASE:
2495 msr_content = vcpu_vlapic(v)->hw.apic_base_msr;
2496 break;
2497 default:
2498 if ( long_mode_do_msr_read(regs) )
2499 goto done;
2501 if ( rdmsr_hypervisor_regs(ecx, &eax, &edx) ||
2502 rdmsr_safe(ecx, eax, edx) == 0 )
2504 regs->eax = eax;
2505 regs->edx = edx;
2506 goto done;
2508 vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
2509 return 0;
2512 regs->eax = msr_content & 0xFFFFFFFF;
2513 regs->edx = msr_content >> 32;
2515 done:
2516 HVMTRACE_2D(MSR_READ, v, ecx, msr_content);
2517 HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx",
2518 ecx, (unsigned long)regs->eax,
2519 (unsigned long)regs->edx);
2520 return 1;
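/*
 * Illustrative sketch, not part of the original file: RDMSR returns a
 * 64-bit value split across EDX:EAX, and WRMSR consumes it the same way
 * (cf. vmx_do_msr_write() below).  The function names are hypothetical.
 */
static inline void example_msr_split(unsigned long long msr_content,
                                     unsigned int *eax, unsigned int *edx)
{
    *eax = (unsigned int)(msr_content & 0xFFFFFFFFu); /* low 32 bits  */
    *edx = (unsigned int)(msr_content >> 32);         /* high 32 bits */
}

static inline unsigned long long example_msr_combine(unsigned int eax,
                                                     unsigned int edx)
{
    return (unsigned long long)eax | ((unsigned long long)edx << 32);
}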
2523 static int vmx_alloc_vlapic_mapping(struct domain *d)
2525 void *apic_va;
2527 if ( !cpu_has_vmx_virtualize_apic_accesses )
2528 return 0;
2530 apic_va = alloc_xenheap_page();
2531 if ( apic_va == NULL )
2532 return -ENOMEM;
2533 share_xen_page_with_guest(virt_to_page(apic_va), d, XENSHARE_writable);
2534 guest_physmap_add_page(
2535 d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), virt_to_mfn(apic_va));
2536 d->arch.hvm_domain.vmx_apic_access_mfn = virt_to_mfn(apic_va);
2538 return 0;
2541 static void vmx_free_vlapic_mapping(struct domain *d)
2543 unsigned long mfn = d->arch.hvm_domain.vmx_apic_access_mfn;
2544 if ( mfn != 0 )
2545 free_xenheap_page(mfn_to_virt(mfn));
2548 static void vmx_install_vlapic_mapping(struct vcpu *v)
2550 paddr_t virt_page_ma, apic_page_ma;
2552 if ( !cpu_has_vmx_virtualize_apic_accesses )
2553 return;
2555 virt_page_ma = page_to_maddr(vcpu_vlapic(v)->regs_page);
2556 apic_page_ma = v->domain->arch.hvm_domain.vmx_apic_access_mfn;
2557 apic_page_ma <<= PAGE_SHIFT;
2559 vmx_vmcs_enter(v);
2560 __vmwrite(VIRTUAL_APIC_PAGE_ADDR, virt_page_ma);
2561 __vmwrite(APIC_ACCESS_ADDR, apic_page_ma);
2562 #if defined (CONFIG_X86_PAE)
2563 __vmwrite(VIRTUAL_APIC_PAGE_ADDR_HIGH, virt_page_ma >> 32);
2564 __vmwrite(APIC_ACCESS_ADDR_HIGH, apic_page_ma >> 32);
2565 #endif
2566 vmx_vmcs_exit(v);
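/*
 * Illustrative sketch, not part of the original file: the APIC access MFN
 * above is turned into a machine address by shifting by the page size, and
 * on a 32-bit PAE hypervisor the upper half goes into the *_HIGH VMCS
 * field.  The function names are hypothetical; 4 KiB pages are assumed.
 */
static inline unsigned long long example_mfn_to_maddr(unsigned long mfn)
{
    return (unsigned long long)mfn << 12;
}

static inline unsigned int example_maddr_high(unsigned long long maddr)
{
    return (unsigned int)(maddr >> 32);
}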
2569 void vmx_vlapic_msr_changed(struct vcpu *v)
2571 struct vlapic *vlapic = vcpu_vlapic(v);
2572 uint32_t ctl;
2574 if ( !cpu_has_vmx_virtualize_apic_accesses )
2575 return;
2577 vmx_vmcs_enter(v);
2578 ctl = __vmread(SECONDARY_VM_EXEC_CONTROL);
2579 ctl &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2580 if ( !vlapic_hw_disabled(vlapic) &&
2581 (vlapic_base_address(vlapic) == APIC_DEFAULT_PHYS_BASE) )
2582 ctl |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2583 __vmwrite(SECONDARY_VM_EXEC_CONTROL, ctl);
2584 vmx_vmcs_exit(v);
2587 static inline int vmx_do_msr_write(struct cpu_user_regs *regs)
2589 u32 ecx = regs->ecx;
2590 u64 msr_content;
2591 struct vcpu *v = current;
2593 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x, eax=%x, edx=%x",
2594 ecx, (u32)regs->eax, (u32)regs->edx);
2596 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
2597 HVMTRACE_2D(MSR_WRITE, v, ecx, msr_content);
2599 switch (ecx) {
2600 case MSR_IA32_TIME_STAMP_COUNTER:
2601 hvm_set_guest_time(v, msr_content);
2602 pt_reset(v);
2603 break;
2604 case MSR_IA32_SYSENTER_CS:
2605 __vmwrite(GUEST_SYSENTER_CS, msr_content);
2606 break;
2607 case MSR_IA32_SYSENTER_ESP:
2608 __vmwrite(GUEST_SYSENTER_ESP, msr_content);
2609 break;
2610 case MSR_IA32_SYSENTER_EIP:
2611 __vmwrite(GUEST_SYSENTER_EIP, msr_content);
2612 break;
2613 case MSR_IA32_APICBASE:
2614 vlapic_msr_set(vcpu_vlapic(v), msr_content);
2615 break;
2616 default:
2617 if ( !long_mode_do_msr_write(regs) )
2618 wrmsr_hypervisor_regs(ecx, regs->eax, regs->edx);
2619 break;
2622 return 1;
2625 static void vmx_do_hlt(void)
2627 unsigned long rflags;
2628 HVMTRACE_0D(HLT, current);
2629 rflags = __vmread(GUEST_RFLAGS);
2630 hvm_hlt(rflags);
2633 static inline void vmx_do_extint(struct cpu_user_regs *regs)
2635 unsigned int vector;
2637 asmlinkage void do_IRQ(struct cpu_user_regs *);
2638 fastcall void smp_apic_timer_interrupt(struct cpu_user_regs *);
2639 fastcall void smp_event_check_interrupt(void);
2640 fastcall void smp_invalidate_interrupt(void);
2641 fastcall void smp_call_function_interrupt(void);
2642 fastcall void smp_spurious_interrupt(struct cpu_user_regs *regs);
2643 fastcall void smp_error_interrupt(struct cpu_user_regs *regs);
2644 #ifdef CONFIG_X86_MCE_P4THERMAL
2645 fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs);
2646 #endif
2648 vector = __vmread(VM_EXIT_INTR_INFO);
2649 BUG_ON(!(vector & INTR_INFO_VALID_MASK));
2651 vector &= INTR_INFO_VECTOR_MASK;
2652 HVMTRACE_1D(INTR, current, vector);
2654 switch(vector) {
2655 case LOCAL_TIMER_VECTOR:
2656 smp_apic_timer_interrupt(regs);
2657 break;
2658 case EVENT_CHECK_VECTOR:
2659 smp_event_check_interrupt();
2660 break;
2661 case INVALIDATE_TLB_VECTOR:
2662 smp_invalidate_interrupt();
2663 break;
2664 case CALL_FUNCTION_VECTOR:
2665 smp_call_function_interrupt();
2666 break;
2667 case SPURIOUS_APIC_VECTOR:
2668 smp_spurious_interrupt(regs);
2669 break;
2670 case ERROR_APIC_VECTOR:
2671 smp_error_interrupt(regs);
2672 break;
2673 #ifdef CONFIG_X86_MCE_P4THERMAL
2674 case THERMAL_APIC_VECTOR:
2675 smp_thermal_interrupt(regs);
2676 break;
2677 #endif
2678 default:
2679 regs->entry_vector = vector;
2680 do_IRQ(regs);
2681 break;
2685 static void vmx_reflect_exception(struct vcpu *v)
2687 int error_code, intr_info, vector;
2689 intr_info = __vmread(VM_EXIT_INTR_INFO);
2690 vector = intr_info & 0xff;
2691 if ( intr_info & INTR_INFO_DELIVER_CODE_MASK )
2692 error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
2693 else
2694 error_code = VMX_DELIVER_NO_ERROR_CODE;
2696 #ifndef NDEBUG
2698 unsigned long rip;
2700 rip = __vmread(GUEST_RIP);
2701 HVM_DBG_LOG(DBG_LEVEL_1, "rip = %lx, error_code = %x",
2702 rip, error_code);
2704 #endif /* NDEBUG */
2706 /*
2707 * According to Intel Virtualization Technology Specification for
2708 * the IA-32 Intel Architecture (C97063-002 April 2005), section
2709 * 2.8.3, SW_EXCEPTION should be used for #BP and #OF, and
2710 * HW_EXCEPTION used for everything else. The main difference
2711 * appears to be that for SW_EXCEPTION, the EIP/RIP is incremented
2712 * by VM_ENTER_INSTRUCTION_LEN bytes, whereas for HW_EXCEPTION,
2713 * it is not.
2714 */
2715 if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_SW_EXCEPTION )
2717 int ilen = __get_instruction_length(); /* Safe: software exception */
2718 vmx_inject_sw_exception(v, vector, ilen);
2720 else
2722 vmx_inject_hw_exception(v, vector, error_code);
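/*
 * Illustrative sketch, not part of the original file: the VM-exit
 * interruption information field encodes (per the Intel SDM) the vector
 * in bits 7:0, the event type in bits 10:8, an error-code-valid flag in
 * bit 11 and a valid flag in bit 31.  The helpers below show the checks
 * that vmx_reflect_exception() relies on; the names are hypothetical.
 */
static inline unsigned int example_intr_vector(unsigned int intr_info)
{
    return intr_info & 0xff;
}

static inline int example_intr_has_error_code(unsigned int intr_info)
{
    return (intr_info >> 11) & 1;
}

static inline int example_intr_is_sw_exception(unsigned int intr_info)
{
    /* Type 6 is "software exception" (INT3/INTO), the case that must be
     * re-injected together with an instruction length. */
    return ((intr_info >> 8) & 0x7) == 6;
}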
2726 static void vmx_failed_vmentry(unsigned int exit_reason)
2728 unsigned int failed_vmentry_reason = (uint16_t)exit_reason;
2729 unsigned long exit_qualification;
2731 exit_qualification = __vmread(EXIT_QUALIFICATION);
2732 printk("Failed vm entry (exit reason 0x%x) ", exit_reason);
2733 switch ( failed_vmentry_reason )
2735 case EXIT_REASON_INVALID_GUEST_STATE:
2736 printk("caused by invalid guest state (%ld).\n", exit_qualification);
2737 break;
2738 case EXIT_REASON_MSR_LOADING:
2739 printk("caused by MSR entry %ld loading.\n", exit_qualification);
2740 break;
2741 case EXIT_REASON_MACHINE_CHECK:
2742 printk("caused by machine check.\n");
2743 break;
2744 default:
2745 printk("unknown reason (0x%x).\n", failed_vmentry_reason);
2746 break;
2749 printk("************* VMCS Area **************\n");
2750 vmcs_dump_vcpu();
2751 printk("**************************************\n");
2753 domain_crash(current->domain);
2756 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
2758 unsigned int exit_reason;
2759 unsigned long exit_qualification, inst_len = 0;
2760 struct vcpu *v = current;
2762 exit_reason = __vmread(VM_EXIT_REASON);
2764 HVMTRACE_2D(VMEXIT, v, __vmread(GUEST_RIP), exit_reason);
2766 perfc_incra(vmexits, exit_reason);
2768 if ( exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT )
2769 local_irq_enable();
2771 if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
2772 return vmx_failed_vmentry(exit_reason);
2774 switch ( exit_reason )
2776 case EXIT_REASON_EXCEPTION_NMI:
2778 /*
2779 * We do not enable software-interrupt exiting (INT n), so an
2780 * exit here means either (1) an exception (e.g. #PF) raised in
2781 * the guest, or (2) an NMI.
2782 */
2783 unsigned int intr_info, vector;
2785 intr_info = __vmread(VM_EXIT_INTR_INFO);
2786 BUG_ON(!(intr_info & INTR_INFO_VALID_MASK));
2788 vector = intr_info & INTR_INFO_VECTOR_MASK;
2790 perfc_incra(cause_vector, vector);
2792 switch ( vector )
2794 case TRAP_debug:
2795 case TRAP_int3:
2796 if ( !v->domain->debugger_attached )
2797 goto exit_and_crash;
2798 domain_pause_for_debugger();
2799 break;
2800 case TRAP_no_device:
2801 vmx_do_no_device_fault();
2802 break;
2803 case TRAP_page_fault:
2804 exit_qualification = __vmread(EXIT_QUALIFICATION);
2805 regs->error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
2807 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2808 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
2809 (unsigned long)regs->eax, (unsigned long)regs->ebx,
2810 (unsigned long)regs->ecx, (unsigned long)regs->edx,
2811 (unsigned long)regs->esi, (unsigned long)regs->edi);
2813 if ( paging_fault(exit_qualification, regs) )
2815 HVMTRACE_2D(PF_XEN, v, exit_qualification, regs->error_code);
2816 break;
2819 v->arch.hvm_vmx.cpu_cr2 = exit_qualification;
2820 vmx_inject_hw_exception(v, TRAP_page_fault, regs->error_code);
2821 break;
2822 case TRAP_nmi:
2823 HVMTRACE_0D(NMI, v);
2824 if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI )
2825 do_nmi(regs); /* Real NMI, vector 2: normal processing. */
2826 else
2827 vmx_reflect_exception(v);
2828 break;
2829 default:
2830 goto exit_and_crash;
2832 break;
2834 case EXIT_REASON_EXTERNAL_INTERRUPT:
2835 vmx_do_extint(regs);
2836 break;
2837 case EXIT_REASON_TRIPLE_FAULT:
2838 hvm_triple_fault();
2839 break;
2840 case EXIT_REASON_PENDING_INTERRUPT:
2841 /* Disable the interrupt window. */
2842 v->arch.hvm_vcpu.u.vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2843 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2844 v->arch.hvm_vcpu.u.vmx.exec_control);
2845 break;
2846 case EXIT_REASON_TASK_SWITCH:
2847 goto exit_and_crash;
2848 case EXIT_REASON_CPUID:
2849 inst_len = __get_instruction_length(); /* Safe: CPUID */
2850 __update_guest_eip(inst_len);
2851 vmx_do_cpuid(regs);
2852 break;
2853 case EXIT_REASON_HLT:
2854 inst_len = __get_instruction_length(); /* Safe: HLT */
2855 __update_guest_eip(inst_len);
2856 vmx_do_hlt();
2857 break;
2858 case EXIT_REASON_INVLPG:
2860 inst_len = __get_instruction_length(); /* Safe: INVLPG */
2861 __update_guest_eip(inst_len);
2862 exit_qualification = __vmread(EXIT_QUALIFICATION);
2863 vmx_do_invlpg(exit_qualification);
2864 break;
2866 case EXIT_REASON_VMCALL:
2868 int rc;
2869 HVMTRACE_1D(VMMCALL, v, regs->eax);
2870 inst_len = __get_instruction_length(); /* Safe: VMCALL */
2871 rc = hvm_do_hypercall(regs);
2872 if ( rc != HVM_HCALL_preempted )
2874 __update_guest_eip(inst_len);
2875 if ( rc == HVM_HCALL_invalidate )
2876 send_invalidate_req();
2878 break;
2880 case EXIT_REASON_CR_ACCESS:
2882 exit_qualification = __vmread(EXIT_QUALIFICATION);
2883 inst_len = __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
2884 if ( vmx_cr_access(exit_qualification, regs) )
2885 __update_guest_eip(inst_len);
2886 break;
2888 case EXIT_REASON_DR_ACCESS:
2889 exit_qualification = __vmread(EXIT_QUALIFICATION);
2890 vmx_dr_access(exit_qualification, regs);
2891 break;
2892 case EXIT_REASON_IO_INSTRUCTION:
2893 exit_qualification = __vmread(EXIT_QUALIFICATION);
2894 inst_len = __get_instruction_length(); /* Safe: IN, INS, OUT, OUTS */
2895 vmx_io_instruction(exit_qualification, inst_len);
2896 break;
2897 case EXIT_REASON_MSR_READ:
2898 inst_len = __get_instruction_length(); /* Safe: RDMSR */
2899 if ( vmx_do_msr_read(regs) )
2900 __update_guest_eip(inst_len);
2901 break;
2902 case EXIT_REASON_MSR_WRITE:
2903 inst_len = __get_instruction_length(); /* Safe: WRMSR */
2904 if ( vmx_do_msr_write(regs) )
2905 __update_guest_eip(inst_len);
2906 break;
2907 case EXIT_REASON_MWAIT_INSTRUCTION:
2908 case EXIT_REASON_MONITOR_INSTRUCTION:
2909 case EXIT_REASON_PAUSE_INSTRUCTION:
2910 goto exit_and_crash;
2911 case EXIT_REASON_VMCLEAR:
2912 case EXIT_REASON_VMLAUNCH:
2913 case EXIT_REASON_VMPTRLD:
2914 case EXIT_REASON_VMPTRST:
2915 case EXIT_REASON_VMREAD:
2916 case EXIT_REASON_VMRESUME:
2917 case EXIT_REASON_VMWRITE:
2918 case EXIT_REASON_VMXOFF:
2919 case EXIT_REASON_VMXON:
2920 /* Report an invalid-opcode exception when a VMX guest tries to
2921 execute any of the VMX instructions. */
2922 vmx_inject_hw_exception(v, TRAP_invalid_op, VMX_DELIVER_NO_ERROR_CODE);
2923 break;
2925 case EXIT_REASON_TPR_BELOW_THRESHOLD:
2926 break;
2928 case EXIT_REASON_APIC_ACCESS:
2930 unsigned long offset;
2931 exit_qualification = __vmread(EXIT_QUALIFICATION);
2932 offset = exit_qualification & 0x0fffUL;
2933 handle_mmio(APIC_DEFAULT_PHYS_BASE | offset);
2934 break;
2937 default:
2938 exit_and_crash:
2939 gdprintk(XENLOG_ERR, "Bad vmexit (reason %x)\n", exit_reason);
2940 domain_crash(v->domain);
2941 break;
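/*
 * Illustrative sketch, not part of the original file: the exit reason read
 * at the top of vmx_vmexit_handler() carries the basic reason in its low
 * 16 bits and flags a failed VM entry in bit 31, which is what the
 * VMX_EXIT_REASONS_FAILED_VMENTRY test and the (uint16_t) cast in
 * vmx_failed_vmentry() rely on.  The function names are hypothetical.
 */
static inline int example_is_failed_vmentry(unsigned int exit_reason)
{
    return (exit_reason & 0x80000000u) != 0;
}

static inline unsigned int example_basic_exit_reason(unsigned int exit_reason)
{
    return exit_reason & 0xffffu;
}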
2945 asmlinkage void vmx_trace_vmentry(void)
2947 struct vcpu *v = current;
2948 HVMTRACE_0D(VMENTRY, v);
2951 /*
2952 * Local variables:
2953 * mode: C
2954 * c-set-style: "BSD"
2955 * c-basic-offset: 4
2956 * tab-width: 4
2957 * indent-tabs-mode: nil
2958 * End:
2959 */