ia64/xen-unstable

view xen/arch/x86/vmx.c @ 6610:7557c46a9edf

We need to be more precise and restore the %eip.

Signed-Off-By: Leendert van Doorn <leendert@watson.ibm.com>
author kaf24@firebug.cl.cam.ac.uk
date Fri Sep 02 17:53:34 2005 +0000 (2005-09-02)
parents 151da8f5d5f2
children 20140d3fbf83
line source
1 /*
2 * vmx.c: handling VMX architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 *
18 */
20 #include <xen/config.h>
21 #include <xen/init.h>
22 #include <xen/lib.h>
23 #include <xen/trace.h>
24 #include <xen/sched.h>
25 #include <xen/irq.h>
26 #include <xen/softirq.h>
27 #include <xen/domain_page.h>
28 #include <asm/current.h>
29 #include <asm/io.h>
30 #include <asm/shadow.h>
31 #include <asm/regs.h>
32 #include <asm/cpufeature.h>
33 #include <asm/processor.h>
34 #include <asm/types.h>
35 #include <asm/msr.h>
36 #include <asm/spinlock.h>
37 #include <asm/vmx.h>
38 #include <asm/vmx_vmcs.h>
39 #include <asm/vmx_intercept.h>
40 #include <asm/shadow.h>
41 #if CONFIG_PAGING_LEVELS >= 3
42 #include <asm/shadow_64.h>
43 #endif
45 #include <public/io/ioreq.h>
47 #ifdef CONFIG_VMX
49 int vmcs_size;
50 unsigned int opt_vmx_debug_level = 0;
51 integer_param("vmx_debug", opt_vmx_debug_level);
53 #ifdef TRACE_BUFFER
54 static unsigned long trace_values[NR_CPUS][5];
55 #define TRACE_VMEXIT(index,value) trace_values[current->processor][index]=value
56 #else
57 #define TRACE_VMEXIT(index,value) ((void)0)
58 #endif
60 #ifdef __x86_64__
61 static struct msr_state percpu_msr[NR_CPUS];
63 static u32 msr_data_index[VMX_MSR_COUNT] =
64 {
65 MSR_LSTAR, MSR_STAR, MSR_CSTAR,
66 MSR_SYSCALL_MASK, MSR_EFER,
67 };
69 /*
70 * To avoid MSR save/restore at every VM exit/entry time, we restore
71 * the x86_64 specific MSRs at domain switch time. Since those MSRs
72 * are not modified once set for generic domains, we don't save them,
73 * but simply reset them to the values set at percpu_traps_init().
74 */
75 void vmx_load_msrs(struct vcpu *n)
76 {
77 struct msr_state *host_state;
78 host_state = &percpu_msr[smp_processor_id()];
80 while (host_state->flags){
81 int i;
83 i = find_first_set_bit(host_state->flags);
84 wrmsrl(msr_data_index[i], host_state->msr_items[i]);
85 clear_bit(i, &host_state->flags);
86 }
87 }
89 static void vmx_save_init_msrs(void)
90 {
91 struct msr_state *host_state;
92 host_state = &percpu_msr[smp_processor_id()];
93 int i;
95 for (i = 0; i < VMX_MSR_COUNT; i++)
96 rdmsrl(msr_data_index[i], host_state->msr_items[i]);
97 }
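/*
 * Illustrative sketch, not part of the original file: the save/restore logic
 * above keeps a per-CPU bitmap ("flags") of MSRs that differ from the host
 * values and walks only the set bits. A minimal standalone version of that
 * walk, using a hypothetical sketch_* table and GCC's __builtin_ctzl() in
 * place of find_first_set_bit():
 */
#if 0
#define SKETCH_MSR_COUNT 5

static unsigned long sketch_flags;                    /* one dirty bit per MSR */
static unsigned long sketch_values[SKETCH_MSR_COUNT]; /* values to write back  */

static void sketch_restore_dirty(void (*write_msr)(int index, unsigned long val))
{
    while (sketch_flags) {
        int i = __builtin_ctzl(sketch_flags);   /* lowest set bit */
        write_msr(i, sketch_values[i]);
        sketch_flags &= ~(1UL << i);            /* clear the bit we handled */
    }
}
#endif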
99 #define CASE_READ_MSR(address) \
100 case MSR_ ## address: \
101 msr_content = msr->msr_items[VMX_INDEX_MSR_ ## address]; \
102 break
104 #define CASE_WRITE_MSR(address) \
105 case MSR_ ## address: \
106 { \
107 msr->msr_items[VMX_INDEX_MSR_ ## address] = msr_content; \
108 if (!test_bit(VMX_INDEX_MSR_ ## address, &msr->flags)) { \
109 set_bit(VMX_INDEX_MSR_ ## address, &msr->flags); \
110 } \
111 wrmsrl(MSR_ ## address, msr_content); \
112 set_bit(VMX_INDEX_MSR_ ## address, &host_state->flags); \
113 } \
114 break
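/*
 * For reference (added note, not in the original source): the token-pasting
 * macros above turn one short name into a full case arm. For example,
 * CASE_READ_MSR(STAR); expands to
 *
 *     case MSR_STAR:
 *         msr_content = msr->msr_items[VMX_INDEX_MSR_STAR];
 *         break;
 *
 * so each MSR needs only a single line in the switch statements below.
 */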
116 #define IS_CANO_ADDRESS(add) 1
117 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
118 {
119 u64 msr_content = 0;
120 struct vcpu *vc = current;
121 struct msr_state * msr = &vc->arch.arch_vmx.msr_content;
122 switch(regs->ecx){
123 case MSR_EFER:
124 msr_content = msr->msr_items[VMX_INDEX_MSR_EFER];
125 VMX_DBG_LOG(DBG_LEVEL_2, "EFER msr_content %llx\n", (unsigned long long)msr_content);
126 if (test_bit(VMX_CPU_STATE_LME_ENABLED,
127 &vc->arch.arch_vmx.cpu_state))
128 msr_content |= 1 << _EFER_LME;
130 if (VMX_LONG_GUEST(vc))
131 msr_content |= 1 << _EFER_LMA;
132 break;
133 case MSR_FS_BASE:
134 if (!(VMX_LONG_GUEST(vc)))
135 /* XXX should it be GP fault */
136 domain_crash();
137 __vmread(GUEST_FS_BASE, &msr_content);
138 break;
139 case MSR_GS_BASE:
140 if (!(VMX_LONG_GUEST(vc)))
141 domain_crash();
142 __vmread(GUEST_GS_BASE, &msr_content);
143 break;
144 case MSR_SHADOW_GS_BASE:
145 msr_content = msr->shadow_gs;
146 break;
148 CASE_READ_MSR(STAR);
149 CASE_READ_MSR(LSTAR);
150 CASE_READ_MSR(CSTAR);
151 CASE_READ_MSR(SYSCALL_MASK);
152 default:
153 return 0;
154 }
155 VMX_DBG_LOG(DBG_LEVEL_2, "mode_do_msr_read: msr_content: %lx\n", msr_content);
156 regs->eax = msr_content & 0xffffffff;
157 regs->edx = msr_content >> 32;
158 return 1;
159 }
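/*
 * Illustrative sketch, not part of the original file: RDMSR and WRMSR move a
 * 64-bit value through the EDX:EAX pair, which is why the handlers split and
 * recombine msr_content as they do. The hypothetical sketch_* helpers show
 * the arithmetic in isolation:
 */
#if 0
static void sketch_split_u64(unsigned long long value,
                             unsigned int *lo, unsigned int *hi)
{
    *lo = (unsigned int)(value & 0xffffffffULL);   /* returned in EAX */
    *hi = (unsigned int)(value >> 32);             /* returned in EDX */
}

static unsigned long long sketch_join_u64(unsigned int lo, unsigned int hi)
{
    return (unsigned long long)lo | ((unsigned long long)hi << 32);
}
#endif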
161 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
162 {
163 u64 msr_content = regs->eax | ((u64)regs->edx << 32);
164 struct vcpu *vc = current;
165 struct msr_state * msr = &vc->arch.arch_vmx.msr_content;
166 struct msr_state * host_state =
167 &percpu_msr[smp_processor_id()];
169 VMX_DBG_LOG(DBG_LEVEL_1, " mode_do_msr_write msr %lx msr_content %lx\n",
170 regs->ecx, msr_content);
172 switch (regs->ecx){
173 case MSR_EFER:
174 if ((msr_content & EFER_LME) ^
175 test_bit(VMX_CPU_STATE_LME_ENABLED,
176 &vc->arch.arch_vmx.cpu_state)){
177 if (test_bit(VMX_CPU_STATE_PG_ENABLED,
178 &vc->arch.arch_vmx.cpu_state) ||
179 !test_bit(VMX_CPU_STATE_PAE_ENABLED,
180 &vc->arch.arch_vmx.cpu_state)){
181 vmx_inject_exception(vc, TRAP_gp_fault, 0);
182 }
183 }
184 if (msr_content & EFER_LME)
185 set_bit(VMX_CPU_STATE_LME_ENABLED,
186 &vc->arch.arch_vmx.cpu_state);
187 /* No update for LME/LMA since they have no effect */
188 msr->msr_items[VMX_INDEX_MSR_EFER] =
189 msr_content;
190 if (msr_content & ~(EFER_LME | EFER_LMA)){
191 msr->msr_items[VMX_INDEX_MSR_EFER] = msr_content;
192 if (!test_bit(VMX_INDEX_MSR_EFER, &msr->flags)){
193 rdmsrl(MSR_EFER,
194 host_state->msr_items[VMX_INDEX_MSR_EFER]);
195 set_bit(VMX_INDEX_MSR_EFER, &host_state->flags);
196 set_bit(VMX_INDEX_MSR_EFER, &msr->flags);
197 wrmsrl(MSR_EFER, msr_content);
198 }
199 }
200 break;
202 case MSR_FS_BASE:
203 case MSR_GS_BASE:
204 if (!(VMX_LONG_GUEST(vc)))
205 domain_crash();
206 if (!IS_CANO_ADDRESS(msr_content)){
207 VMX_DBG_LOG(DBG_LEVEL_1, "Not a canonical address in MSR write\n");
208 vmx_inject_exception(vc, TRAP_gp_fault, 0);
209 }
210 if (regs->ecx == MSR_FS_BASE)
211 __vmwrite(GUEST_FS_BASE, msr_content);
212 else
213 __vmwrite(GUEST_GS_BASE, msr_content);
214 break;
216 case MSR_SHADOW_GS_BASE:
217 if (!(VMX_LONG_GUEST(vc)))
218 domain_crash();
219 vc->arch.arch_vmx.msr_content.shadow_gs = msr_content;
220 wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
221 break;
223 CASE_WRITE_MSR(STAR);
224 CASE_WRITE_MSR(LSTAR);
225 CASE_WRITE_MSR(CSTAR);
226 CASE_WRITE_MSR(SYSCALL_MASK);
227 default:
228 return 0;
229 }
230 return 1;
231 }
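/*
 * Illustrative sketch, not part of the original file: IS_CANO_ADDRESS() is
 * stubbed out as 1 above. For 48-bit virtual addresses, a canonical-address
 * check requires bits 63:47 to be copies of bit 47; a hypothetical helper:
 */
#if 0
static int sketch_is_canonical(unsigned long long addr)
{
    unsigned long long top = addr >> 47;    /* bits 63:47, 17 bits */
    return top == 0 || top == 0x1ffffULL;   /* all zeros or all ones */
}
#endif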
233 void
234 vmx_restore_msrs(struct vcpu *d)
235 {
236 int i = 0;
237 struct msr_state *guest_state;
238 struct msr_state *host_state;
239 unsigned long guest_flags;
241 guest_state = &d->arch.arch_vmx.msr_content;
242 host_state = &percpu_msr[smp_processor_id()];
244 wrmsrl(MSR_SHADOW_GS_BASE, guest_state->shadow_gs);
245 guest_flags = guest_state->flags;
246 if (!guest_flags)
247 return;
249 while (guest_flags){
250 i = find_first_set_bit(guest_flags);
252 VMX_DBG_LOG(DBG_LEVEL_2,
253 "restore guest's index %d msr %lx with %lx\n",
254 i, (unsigned long) msr_data_index[i], (unsigned long) guest_state->msr_items[i]);
255 set_bit(i, &host_state->flags);
256 wrmsrl(msr_data_index[i], guest_state->msr_items[i]);
257 clear_bit(i, &guest_flags);
258 }
259 }
261 #else /* __i386__ */
262 #define vmx_save_init_msrs() ((void)0)
264 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs){
265 return 0;
266 }
267 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs){
268 return 0;
269 }
270 #endif
272 extern long evtchn_send(int lport);
273 extern long do_block(void);
274 void do_nmi(struct cpu_user_regs *, unsigned long);
276 static int check_vmx_controls(u32 ctrls, u32 msr)
277 {
278 u32 vmx_msr_low, vmx_msr_high;
280 rdmsr(msr, vmx_msr_low, vmx_msr_high);
281 if (ctrls < vmx_msr_low || ctrls > vmx_msr_high) {
282 printk("Insufficient VMX capability 0x%x, "
283 "msr=0x%x,low=0x%8x,high=0x%x\n",
284 ctrls, msr, vmx_msr_low, vmx_msr_high);
285 return 0;
286 }
287 return 1;
288 }
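/*
 * Illustrative sketch, not part of the original file: each VMX capability MSR
 * read above reports, in its low half, control bits that must be 1 and, in
 * its high half, control bits that are allowed to be 1. A bitwise form of the
 * capability test, for comparison with the range check used in
 * check_vmx_controls() (hypothetical sketch_* helper):
 */
#if 0
static int sketch_ctrls_supported(unsigned int ctrls,
                                  unsigned int must_be_one,   /* MSR low half  */
                                  unsigned int may_be_one)    /* MSR high half */
{
    return (ctrls & must_be_one) == must_be_one &&   /* all required bits set   */
           (ctrls & ~may_be_one) == 0;               /* no unsupported bits set */
}
#endif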
290 int start_vmx(void)
291 {
292 struct vmcs_struct *vmcs;
293 u32 ecx;
294 u32 eax, edx;
295 u64 phys_vmcs; /* debugging */
297 /*
298 * Xen does not fill x86_capability words except 0.
299 */
300 ecx = cpuid_ecx(1);
301 boot_cpu_data.x86_capability[4] = ecx;
303 if (!(test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability)))
304 return 0;
306 rdmsr(IA32_FEATURE_CONTROL_MSR, eax, edx);
308 if (eax & IA32_FEATURE_CONTROL_MSR_LOCK) {
309 if ((eax & IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON) == 0x0) {
310 printk("VMX disabled by Feature Control MSR.\n");
311 return 0;
312 }
313 }
314 else {
315 wrmsr(IA32_FEATURE_CONTROL_MSR,
316 IA32_FEATURE_CONTROL_MSR_LOCK |
317 IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON, 0);
318 }
320 if (!check_vmx_controls(MONITOR_PIN_BASED_EXEC_CONTROLS,
321 MSR_IA32_VMX_PINBASED_CTLS_MSR))
322 return 0;
323 if (!check_vmx_controls(MONITOR_CPU_BASED_EXEC_CONTROLS,
324 MSR_IA32_VMX_PROCBASED_CTLS_MSR))
325 return 0;
326 if (!check_vmx_controls(MONITOR_VM_EXIT_CONTROLS,
327 MSR_IA32_VMX_EXIT_CTLS_MSR))
328 return 0;
329 if (!check_vmx_controls(MONITOR_VM_ENTRY_CONTROLS,
330 MSR_IA32_VMX_ENTRY_CTLS_MSR))
331 return 0;
333 set_in_cr4(X86_CR4_VMXE); /* Enable VMXE */
335 if (!(vmcs = alloc_vmcs())) {
336 printk("Failed to allocate VMCS\n");
337 return 0;
338 }
340 phys_vmcs = (u64) virt_to_phys(vmcs);
342 if (!(__vmxon(phys_vmcs))) {
343 printk("VMXON is done\n");
344 }
346 vmx_save_init_msrs();
348 return 1;
349 }
351 void stop_vmx(void)
352 {
353 if (read_cr4() & X86_CR4_VMXE)
354 __vmxoff();
355 }
357 /*
358 * Not all cases receive a valid value in the VM-exit instruction length field.
359 */
360 #define __get_instruction_length(len) \
361 __vmread(VM_EXIT_INSTRUCTION_LEN, &(len)); \
362 if ((len) < 1 || (len) > 15) \
363 __vmx_bug(&regs);
365 static inline void __update_guest_eip(unsigned long inst_len)
366 {
367 unsigned long current_eip;
369 __vmread(GUEST_RIP, &current_eip);
370 __vmwrite(GUEST_RIP, current_eip + inst_len);
371 }
374 static int vmx_do_page_fault(unsigned long va, struct cpu_user_regs *regs)
375 {
376 unsigned long eip;
377 unsigned long gpa; /* FIXME: PAE */
378 int result;
380 #if VMX_DEBUG
381 {
382 __vmread(GUEST_RIP, &eip);
383 VMX_DBG_LOG(DBG_LEVEL_VMMU,
384 "vmx_do_page_fault = 0x%lx, eip = %lx, error_code = %lx",
385 va, eip, (unsigned long)regs->error_code);
386 }
387 #endif
389 if (!vmx_paging_enabled(current)){
390 handle_mmio(va, va);
391 TRACE_VMEXIT (2,2);
392 return 1;
393 }
394 gpa = gva_to_gpa(va);
396 /* Use 1:1 page table to identify MMIO address space */
397 if ( mmio_space(gpa) ){
398 if (gpa >= 0xFEE00000) { /* workaround for local APIC */
399 u32 inst_len;
400 __vmread(VM_EXIT_INSTRUCTION_LEN, &(inst_len));
401 __update_guest_eip(inst_len);
402 return 1;
403 }
404 TRACE_VMEXIT (2,2);
405 handle_mmio(va, gpa);
406 return 1;
407 }
409 result = shadow_fault(va, regs);
410 TRACE_VMEXIT (2,result);
411 #if 0
412 if ( !result )
413 {
414 __vmread(GUEST_RIP, &eip);
415 printk("vmx pgfault to guest va=%lx eip=%lx\n", va, eip);
416 }
417 #endif
419 return result;
420 }
422 static void vmx_do_no_device_fault(void)
423 {
424 unsigned long cr0;
426 clts();
427 setup_fpu(current);
428 __vmread(CR0_READ_SHADOW, &cr0);
429 if (!(cr0 & X86_CR0_TS)) {
430 __vmread(GUEST_CR0, &cr0);
431 cr0 &= ~X86_CR0_TS;
432 __vmwrite(GUEST_CR0, cr0);
433 }
434 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
435 }
438 static void vmx_vmexit_do_cpuid(unsigned long input, struct cpu_user_regs *regs)
439 {
440 unsigned int eax, ebx, ecx, edx;
441 unsigned long eip;
443 __vmread(GUEST_RIP, &eip);
445 VMX_DBG_LOG(DBG_LEVEL_1,
446 "do_cpuid: (eax) %lx, (ebx) %lx, (ecx) %lx, (edx) %lx,"
447 " (esi) %lx, (edi) %lx",
448 (unsigned long)regs->eax, (unsigned long)regs->ebx,
449 (unsigned long)regs->ecx, (unsigned long)regs->edx,
450 (unsigned long)regs->esi, (unsigned long)regs->edi);
452 cpuid(input, &eax, &ebx, &ecx, &edx);
454 if (input == 1) {
455 #ifdef __i386__
456 clear_bit(X86_FEATURE_PSE, &edx);
457 clear_bit(X86_FEATURE_PAE, &edx);
458 clear_bit(X86_FEATURE_PSE36, &edx);
459 #else
460 struct vcpu *d = current;
461 if (d->domain->arch.ops->guest_paging_levels == PAGING_L2)
462 {
463 clear_bit(X86_FEATURE_PSE, &edx);
464 clear_bit(X86_FEATURE_PAE, &edx);
465 clear_bit(X86_FEATURE_PSE36, &edx);
466 }
467 #endif
469 }
471 regs->eax = (unsigned long) eax;
472 regs->ebx = (unsigned long) ebx;
473 regs->ecx = (unsigned long) ecx;
474 regs->edx = (unsigned long) edx;
476 VMX_DBG_LOG(DBG_LEVEL_1,
477 "vmx_vmexit_do_cpuid: eip: %lx, input: %lx, out:eax=%x, ebx=%x, ecx=%x, edx=%x",
478 eip, input, eax, ebx, ecx, edx);
480 }
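/*
 * Illustrative sketch, not part of the original file: for CPUID leaf 1 the
 * handler above hides PSE, PAE and PSE36 from the guest by clearing the
 * corresponding EDX feature bits (bit 3, bit 6 and bit 17 respectively).
 * The same masking as one expression (hypothetical sketch_* helper):
 */
#if 0
static unsigned int sketch_mask_leaf1_edx(unsigned int edx)
{
    return edx & ~((1u << 3) | (1u << 6) | (1u << 17)); /* clear PSE, PAE, PSE36 */
}
#endif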
482 #define CASE_GET_REG_P(REG, reg) \
483 case REG_ ## REG: reg_p = (unsigned long *)&(regs->reg); break
485 static void vmx_dr_access (unsigned long exit_qualification, struct cpu_user_regs *regs)
486 {
487 unsigned int reg;
488 unsigned long *reg_p = 0;
489 struct vcpu *v = current;
490 unsigned long eip;
492 __vmread(GUEST_RIP, &eip);
494 reg = exit_qualification & DEBUG_REG_ACCESS_NUM;
496 VMX_DBG_LOG(DBG_LEVEL_1,
497 "vmx_dr_access : eip=%lx, reg=%d, exit_qualification = %lx",
498 eip, reg, exit_qualification);
500 switch(exit_qualification & DEBUG_REG_ACCESS_REG) {
501 CASE_GET_REG_P(EAX, eax);
502 CASE_GET_REG_P(ECX, ecx);
503 CASE_GET_REG_P(EDX, edx);
504 CASE_GET_REG_P(EBX, ebx);
505 CASE_GET_REG_P(EBP, ebp);
506 CASE_GET_REG_P(ESI, esi);
507 CASE_GET_REG_P(EDI, edi);
508 case REG_ESP:
509 break;
510 default:
511 __vmx_bug(regs);
512 }
514 switch (exit_qualification & DEBUG_REG_ACCESS_TYPE) {
515 case TYPE_MOV_TO_DR:
516 /* don't need to check the range */
517 if (reg != REG_ESP)
518 v->arch.guest_context.debugreg[reg] = *reg_p;
519 else {
520 unsigned long value;
521 __vmread(GUEST_RSP, &value);
522 v->arch.guest_context.debugreg[reg] = value;
523 }
524 break;
525 case TYPE_MOV_FROM_DR:
526 if (reg != REG_ESP)
527 *reg_p = v->arch.guest_context.debugreg[reg];
528 else {
529 __vmwrite(GUEST_RSP, v->arch.guest_context.debugreg[reg]);
530 }
531 break;
532 }
533 }
535 /*
536 * Invalidate the TLB for va. Invalidate the shadow page corresponding
537 * the address va.
538 */
539 static void vmx_vmexit_do_invlpg(unsigned long va)
540 {
541 unsigned long eip;
542 struct vcpu *v = current;
544 __vmread(GUEST_RIP, &eip);
546 VMX_DBG_LOG(DBG_LEVEL_VMMU, "vmx_vmexit_do_invlpg: eip=%lx, va=%lx",
547 eip, va);
549 /*
550 * We do the safest thing first, then try to update the shadow
551 * copy from the guest.
552 */
553 shadow_invlpg(v, va);
554 }
556 static int check_for_null_selector(unsigned long eip)
557 {
558 unsigned char inst[MAX_INST_LEN];
559 unsigned long sel;
560 int i, inst_len;
561 int inst_copy_from_guest(unsigned char *, unsigned long, int);
563 __vmread(VM_EXIT_INSTRUCTION_LEN, &inst_len);
564 memset(inst, 0, MAX_INST_LEN);
565 if (inst_copy_from_guest(inst, eip, inst_len) != inst_len) {
566 printf("check_for_null_selector: failed to fetch guest instruction\n");
567 domain_crash_synchronous();
568 }
570 for (i = 0; i < inst_len; i++) {
571 switch (inst[i]) {
572 case 0xf3: /* REPZ */
573 case 0xf2: /* REPNZ */
574 case 0xf0: /* LOCK */
575 case 0x66: /* data32 */
576 case 0x67: /* addr32 */
577 continue;
578 case 0x2e: /* CS */
579 __vmread(GUEST_CS_SELECTOR, &sel);
580 break;
581 case 0x36: /* SS */
582 __vmread(GUEST_SS_SELECTOR, &sel);
583 break;
584 case 0x26: /* ES */
585 __vmread(GUEST_ES_SELECTOR, &sel);
586 break;
587 case 0x64: /* FS */
588 __vmread(GUEST_FS_SELECTOR, &sel);
589 break;
590 case 0x65: /* GS */
591 __vmread(GUEST_GS_SELECTOR, &sel);
592 break;
593 case 0x3e: /* DS */
594 /* FALLTHROUGH */
595 default:
596 /* DS is the default */
597 __vmread(GUEST_DS_SELECTOR, &sel);
598 }
599 return sel == 0 ? 1 : 0;
600 }
602 return 0;
603 }
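/*
 * Illustrative sketch, not part of the original file: check_for_null_selector()
 * walks the instruction bytes, skipping REP/REPNE/LOCK and operand/address
 * size prefixes, and classifies the first remaining byte as a segment override
 * (or falls back to DS). A condensed standalone version of that
 * classification, using hypothetical sketch_* names:
 */
#if 0
enum sketch_seg { SKETCH_CS, SKETCH_SS, SKETCH_DS,
                  SKETCH_ES, SKETCH_FS, SKETCH_GS };

static enum sketch_seg sketch_io_segment(const unsigned char *inst, int len)
{
    int i;
    for (i = 0; i < len; i++) {
        switch (inst[i]) {
        case 0xf3: case 0xf2: case 0xf0: case 0x66: case 0x67:
            continue;                    /* REPZ/REPNZ/LOCK/data32/addr32 */
        case 0x2e: return SKETCH_CS;
        case 0x36: return SKETCH_SS;
        case 0x26: return SKETCH_ES;
        case 0x64: return SKETCH_FS;
        case 0x65: return SKETCH_GS;
        default:   return SKETCH_DS;     /* 0x3e override or no override */
        }
    }
    return SKETCH_DS;
}
#endif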
605 static void vmx_io_instruction(struct cpu_user_regs *regs,
606 unsigned long exit_qualification, unsigned long inst_len)
607 {
608 struct vcpu *d = current;
609 vcpu_iodata_t *vio;
610 ioreq_t *p;
611 unsigned long addr;
612 unsigned long eip, cs, eflags;
613 int vm86;
615 __vmread(GUEST_RIP, &eip);
616 __vmread(GUEST_CS_SELECTOR, &cs);
617 __vmread(GUEST_RFLAGS, &eflags);
618 vm86 = eflags & X86_EFLAGS_VM ? 1 : 0;
620 VMX_DBG_LOG(DBG_LEVEL_1,
621 "vmx_io_instruction: vm86 %d, eip=%lx:%lx, "
622 "exit_qualification = %lx",
623 vm86, cs, eip, exit_qualification);
625 if (test_bit(6, &exit_qualification))
626 addr = (exit_qualification >> 16) & (0xffff);
627 else
628 addr = regs->edx & 0xffff;
629 TRACE_VMEXIT (2,addr);
631 vio = get_vio(d->domain, d->vcpu_id);
632 if (vio == 0) {
633 printk("bad shared page: %lx", (unsigned long) vio);
634 domain_crash_synchronous();
635 }
636 p = &vio->vp_ioreq;
637 p->dir = test_bit(3, &exit_qualification); /* direction */
639 p->pdata_valid = 0;
640 p->count = 1;
641 p->size = (exit_qualification & 7) + 1;
643 if (test_bit(4, &exit_qualification)) { /* string instruction */
644 unsigned long laddr;
646 __vmread(GUEST_LINEAR_ADDRESS, &laddr);
647 /*
648 * In protected mode, guest linear address is invalid if the
649 * selector is null.
650 */
651 if (!vm86 && check_for_null_selector(eip)) {
652 laddr = (p->dir == IOREQ_WRITE) ? regs->esi : regs->edi;
653 }
654 p->pdata_valid = 1;
656 p->u.data = laddr;
657 if (vmx_paging_enabled(d))
658 p->u.pdata = (void *) gva_to_gpa(p->u.data);
659 p->df = (eflags & X86_EFLAGS_DF) ? 1 : 0;
661 if (test_bit(5, &exit_qualification)) /* "rep" prefix */
662 p->count = vm86 ? regs->ecx & 0xFFFF : regs->ecx;
664 /*
665 * Split up string I/O operations that cross page boundaries. Don't
666 * advance %eip so that "rep insb" will restart at the next page.
667 */
668 if ((p->u.data & PAGE_MASK) !=
669 ((p->u.data + p->count * p->size - 1) & PAGE_MASK)) {
670 VMX_DBG_LOG(DBG_LEVEL_2,
671 "String I/O crosses page boundary (cs:eip=0x%lx:0x%lx)\n",
672 cs, eip);
673 if (p->u.data & (p->size - 1)) {
674 printf("Unaligned string I/O operation (cs:eip=0x%lx:0x%lx)\n",
675 cs, eip);
676 domain_crash_synchronous();
677 }
678 p->count = (PAGE_SIZE - (p->u.data & ~PAGE_MASK)) / p->size;
679 } else {
680 __update_guest_eip(inst_len);
681 }
682 } else if (p->dir == IOREQ_WRITE) {
683 p->u.data = regs->eax;
684 __update_guest_eip(inst_len);
685 } else
686 __update_guest_eip(inst_len);
688 p->addr = addr;
689 p->port_mm = 0;
691 /* Check if the packet needs to be intercepted */
692 if (vmx_portio_intercept(p))
693 /* no blocking & no evtchn notification */
694 return;
696 set_bit(ARCH_VMX_IO_WAIT, &d->arch.arch_vmx.flags);
697 p->state = STATE_IOREQ_READY;
698 evtchn_send(iopacket_port(d->domain));
699 vmx_wait_io();
700 }
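/*
 * Illustrative sketch, not part of the original file: vmx_io_instruction()
 * pulls its operands out of the exit qualification with individual bit tests.
 * Gathered in one place, the field layout it relies on is: access size in
 * bits 0-2 (size - 1), direction in bit 3, string operation in bit 4, REP
 * prefix in bit 5, immediate port operand in bit 6, and the port number in
 * bits 16-31. A hypothetical decoder:
 */
#if 0
struct sketch_io_exit {
    int size;            /* access size in bytes: 1, 2 or 4 */
    int dir_in;          /* 1 = IN/INS, 0 = OUT/OUTS        */
    int string;          /* INS/OUTS                        */
    int rep;             /* REP prefix present              */
    int imm_port;        /* port taken from the instruction */
    unsigned int port;   /* valid when imm_port is set      */
};

static struct sketch_io_exit sketch_decode_io_exit(unsigned long q)
{
    struct sketch_io_exit e;
    e.size     = (int)(q & 7) + 1;
    e.dir_in   = (int)((q >> 3) & 1);
    e.string   = (int)((q >> 4) & 1);
    e.rep      = (int)((q >> 5) & 1);
    e.imm_port = (int)((q >> 6) & 1);
    e.port     = (unsigned int)((q >> 16) & 0xffff);
    return e;
}
#endif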
702 int
703 vmx_copy(void *buf, unsigned long laddr, int size, int dir)
704 {
705 unsigned long mfn;
706 char *addr;
707 int count;
709 while (size > 0) {
710 count = PAGE_SIZE - (laddr & ~PAGE_MASK);
711 if (count > size)
712 count = size;
714 mfn = get_mfn_from_pfn(laddr >> PAGE_SHIFT);
715 /* XXX check whether laddr is valid */
716 addr = (char *)map_domain_page(mfn) + (laddr & ~PAGE_MASK);
718 if (dir == VMX_COPY_IN)
719 memcpy(buf, addr, count);
720 else
721 memcpy(addr, buf, count);
723 unmap_domain_page(addr);
725 laddr += count;
726 buf += count;
727 size -= count;
728 }
730 return 1;
731 }
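/*
 * Illustrative sketch, not part of the original file: vmx_copy() clamps each
 * chunk to the end of the current page so that every iteration maps exactly
 * one guest page. The same chunking arithmetic over a flat buffer, with a
 * hypothetical 4 KiB page size:
 */
#if 0
#define SKETCH_PAGE_SIZE 4096UL

static void sketch_chunked_copy(char *dst, const char *src,
                                unsigned long laddr, unsigned long size)
{
    while (size > 0) {
        /* bytes left in the page containing laddr */
        unsigned long count = SKETCH_PAGE_SIZE - (laddr & (SKETCH_PAGE_SIZE - 1));
        unsigned long i;

        if (count > size)
            count = size;
        for (i = 0; i < count; i++)     /* a real version would map the page here */
            dst[i] = src[i];
        dst += count; src += count; laddr += count; size -= count;
    }
}
#endif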
733 int
734 vmx_world_save(struct vcpu *d, struct vmx_assist_context *c)
735 {
736 unsigned long inst_len;
737 int error = 0;
739 error |= __vmread(VM_EXIT_INSTRUCTION_LEN, &inst_len);
740 error |= __vmread(GUEST_RIP, &c->eip);
741 c->eip += inst_len; /* skip transition instruction */
742 error |= __vmread(GUEST_RSP, &c->esp);
743 error |= __vmread(GUEST_RFLAGS, &c->eflags);
745 error |= __vmread(CR0_READ_SHADOW, &c->cr0);
746 c->cr3 = d->arch.arch_vmx.cpu_cr3;
747 error |= __vmread(CR4_READ_SHADOW, &c->cr4);
749 error |= __vmread(GUEST_IDTR_LIMIT, &c->idtr_limit);
750 error |= __vmread(GUEST_IDTR_BASE, &c->idtr_base);
752 error |= __vmread(GUEST_GDTR_LIMIT, &c->gdtr_limit);
753 error |= __vmread(GUEST_GDTR_BASE, &c->gdtr_base);
755 error |= __vmread(GUEST_CS_SELECTOR, &c->cs_sel);
756 error |= __vmread(GUEST_CS_LIMIT, &c->cs_limit);
757 error |= __vmread(GUEST_CS_BASE, &c->cs_base);
758 error |= __vmread(GUEST_CS_AR_BYTES, &c->cs_arbytes.bytes);
760 error |= __vmread(GUEST_DS_SELECTOR, &c->ds_sel);
761 error |= __vmread(GUEST_DS_LIMIT, &c->ds_limit);
762 error |= __vmread(GUEST_DS_BASE, &c->ds_base);
763 error |= __vmread(GUEST_DS_AR_BYTES, &c->ds_arbytes.bytes);
765 error |= __vmread(GUEST_ES_SELECTOR, &c->es_sel);
766 error |= __vmread(GUEST_ES_LIMIT, &c->es_limit);
767 error |= __vmread(GUEST_ES_BASE, &c->es_base);
768 error |= __vmread(GUEST_ES_AR_BYTES, &c->es_arbytes.bytes);
770 error |= __vmread(GUEST_SS_SELECTOR, &c->ss_sel);
771 error |= __vmread(GUEST_SS_LIMIT, &c->ss_limit);
772 error |= __vmread(GUEST_SS_BASE, &c->ss_base);
773 error |= __vmread(GUEST_SS_AR_BYTES, &c->ss_arbytes.bytes);
775 error |= __vmread(GUEST_FS_SELECTOR, &c->fs_sel);
776 error |= __vmread(GUEST_FS_LIMIT, &c->fs_limit);
777 error |= __vmread(GUEST_FS_BASE, &c->fs_base);
778 error |= __vmread(GUEST_FS_AR_BYTES, &c->fs_arbytes.bytes);
780 error |= __vmread(GUEST_GS_SELECTOR, &c->gs_sel);
781 error |= __vmread(GUEST_GS_LIMIT, &c->gs_limit);
782 error |= __vmread(GUEST_GS_BASE, &c->gs_base);
783 error |= __vmread(GUEST_GS_AR_BYTES, &c->gs_arbytes.bytes);
785 error |= __vmread(GUEST_TR_SELECTOR, &c->tr_sel);
786 error |= __vmread(GUEST_TR_LIMIT, &c->tr_limit);
787 error |= __vmread(GUEST_TR_BASE, &c->tr_base);
788 error |= __vmread(GUEST_TR_AR_BYTES, &c->tr_arbytes.bytes);
790 error |= __vmread(GUEST_LDTR_SELECTOR, &c->ldtr_sel);
791 error |= __vmread(GUEST_LDTR_LIMIT, &c->ldtr_limit);
792 error |= __vmread(GUEST_LDTR_BASE, &c->ldtr_base);
793 error |= __vmread(GUEST_LDTR_AR_BYTES, &c->ldtr_arbytes.bytes);
795 return !error;
796 }
798 int
799 vmx_world_restore(struct vcpu *d, struct vmx_assist_context *c)
800 {
801 unsigned long mfn, old_cr4;
802 int error = 0;
804 error |= __vmwrite(GUEST_RIP, c->eip);
805 error |= __vmwrite(GUEST_RSP, c->esp);
806 error |= __vmwrite(GUEST_RFLAGS, c->eflags);
808 error |= __vmwrite(CR0_READ_SHADOW, c->cr0);
810 if (!vmx_paging_enabled(d)) {
811 VMX_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
812 __vmwrite(GUEST_CR3, pagetable_get_paddr(d->domain->arch.phys_table));
813 goto skip_cr3;
814 }
816 if (c->cr3 == d->arch.arch_vmx.cpu_cr3) {
817 /*
818 * This is a simple TLB flush, implying the guest has
819 * removed some translation or changed page attributes.
820 * We simply invalidate the shadow.
821 */
822 mfn = get_mfn_from_pfn(c->cr3 >> PAGE_SHIFT);
823 if (mfn != pagetable_get_pfn(d->arch.guest_table)) {
824 printk("Invalid CR3 value=%x", c->cr3);
825 domain_crash_synchronous();
826 return 0;
827 }
828 shadow_sync_all(d->domain);
829 } else {
830 /*
831 * If different, make a shadow. Check if the PDBR is valid
832 * first.
833 */
834 VMX_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %x", c->cr3);
835 if ((c->cr3 >> PAGE_SHIFT) > d->domain->max_pages) {
836 printk("Invalid CR3 value=%x", c->cr3);
837 domain_crash_synchronous();
838 return 0;
839 }
840 mfn = get_mfn_from_pfn(c->cr3 >> PAGE_SHIFT);
841 d->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
842 update_pagetables(d);
843 /*
844 * arch.shadow_table should now hold the next CR3 for shadow
845 */
846 d->arch.arch_vmx.cpu_cr3 = c->cr3;
847 VMX_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3);
848 __vmwrite(GUEST_CR3, pagetable_get_paddr(d->arch.shadow_table));
849 }
851 skip_cr3:
853 error |= __vmread(CR4_READ_SHADOW, &old_cr4);
854 error |= __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
855 error |= __vmwrite(CR4_READ_SHADOW, c->cr4);
857 error |= __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
858 error |= __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
860 error |= __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
861 error |= __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
863 error |= __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
864 error |= __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
865 error |= __vmwrite(GUEST_CS_BASE, c->cs_base);
866 error |= __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes.bytes);
868 error |= __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
869 error |= __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
870 error |= __vmwrite(GUEST_DS_BASE, c->ds_base);
871 error |= __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes.bytes);
873 error |= __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
874 error |= __vmwrite(GUEST_ES_LIMIT, c->es_limit);
875 error |= __vmwrite(GUEST_ES_BASE, c->es_base);
876 error |= __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes.bytes);
878 error |= __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
879 error |= __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
880 error |= __vmwrite(GUEST_SS_BASE, c->ss_base);
881 error |= __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes.bytes);
883 error |= __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
884 error |= __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
885 error |= __vmwrite(GUEST_FS_BASE, c->fs_base);
886 error |= __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes.bytes);
888 error |= __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
889 error |= __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
890 error |= __vmwrite(GUEST_GS_BASE, c->gs_base);
891 error |= __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes.bytes);
893 error |= __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
894 error |= __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
895 error |= __vmwrite(GUEST_TR_BASE, c->tr_base);
896 error |= __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes.bytes);
898 error |= __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
899 error |= __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
900 error |= __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
901 error |= __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes.bytes);
903 return !error;
904 }
906 enum { VMX_ASSIST_INVOKE = 0, VMX_ASSIST_RESTORE };
908 int
909 vmx_assist(struct vcpu *d, int mode)
910 {
911 struct vmx_assist_context c;
912 u32 magic;
913 u32 cp;
915 /* make sure vmxassist exists (this is not an error) */
916 if (!vmx_copy(&magic, VMXASSIST_MAGIC_OFFSET, sizeof(magic), VMX_COPY_IN))
917 return 0;
918 if (magic != VMXASSIST_MAGIC)
919 return 0;
921 switch (mode) {
922 /*
923 * Transfer control to vmxassist.
924 * Store the current context in VMXASSIST_OLD_CONTEXT and load
925 * the new VMXASSIST_NEW_CONTEXT context. This context was created
926 * by vmxassist and will transfer control to it.
927 */
928 case VMX_ASSIST_INVOKE:
929 /* save the old context */
930 if (!vmx_copy(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp), VMX_COPY_IN))
931 goto error;
932 if (cp != 0) {
933 if (!vmx_world_save(d, &c))
934 goto error;
935 if (!vmx_copy(&c, cp, sizeof(c), VMX_COPY_OUT))
936 goto error;
937 }
939 /* restore the new context, this should activate vmxassist */
940 if (!vmx_copy(&cp, VMXASSIST_NEW_CONTEXT, sizeof(cp), VMX_COPY_IN))
941 goto error;
942 if (cp != 0) {
943 if (!vmx_copy(&c, cp, sizeof(c), VMX_COPY_IN))
944 goto error;
945 if (!vmx_world_restore(d, &c))
946 goto error;
947 return 1;
948 }
949 break;
951 /*
952 * Restore the VMXASSIST_OLD_CONTEXT that was saved by VMX_ASSIST_INVOKE
953 * above.
954 */
955 case VMX_ASSIST_RESTORE:
956 /* save the old context */
957 if (!vmx_copy(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp), VMX_COPY_IN))
958 goto error;
959 if (cp != 0) {
960 if (!vmx_copy(&c, cp, sizeof(c), VMX_COPY_IN))
961 goto error;
962 if (!vmx_world_restore(d, &c))
963 goto error;
964 return 1;
965 }
966 break;
967 }
969 error:
970 printf("Failed to transfer to vmxassist\n");
971 domain_crash_synchronous();
972 return 0;
973 }
975 static int vmx_set_cr0(unsigned long value)
976 {
977 struct vcpu *d = current;
978 unsigned long mfn;
979 unsigned long eip;
980 int paging_enabled;
981 unsigned long vm_entry_value;
982 /*
983 * CR0: We don't want to lose PE and PG.
984 */
985 paging_enabled = vmx_paging_enabled(d);
986 __vmwrite(GUEST_CR0, (value | X86_CR0_PE | X86_CR0_PG));
987 __vmwrite(CR0_READ_SHADOW, value);
989 VMX_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx\n", value);
991 if ((value & X86_CR0_PE) && (value & X86_CR0_PG) && !paging_enabled) {
992 /*
993 * The guest CR3 must be pointing to the guest physical.
994 */
995 if ( !VALID_MFN(mfn = get_mfn_from_pfn(
996 d->arch.arch_vmx.cpu_cr3 >> PAGE_SHIFT)) ||
997 !get_page(pfn_to_page(mfn), d->domain) )
998 {
999 printk("Invalid CR3 value = %lx", d->arch.arch_vmx.cpu_cr3);
1000 domain_crash_synchronous(); /* need to take a clean path */
1003 #if defined(__x86_64__)
1004 if (test_bit(VMX_CPU_STATE_LME_ENABLED,
1005 &d->arch.arch_vmx.cpu_state) &&
1006 !test_bit(VMX_CPU_STATE_PAE_ENABLED,
1007 &d->arch.arch_vmx.cpu_state)){
1008 VMX_DBG_LOG(DBG_LEVEL_1, "Guest enabled paging before enabling PAE\n");
1009 vmx_inject_exception(d, TRAP_gp_fault, 0);
1011 if (test_bit(VMX_CPU_STATE_LME_ENABLED,
1012 &d->arch.arch_vmx.cpu_state)){
1013 /* PAE should already be enabled at this point */
1014 VMX_DBG_LOG(DBG_LEVEL_1, "Enabling long mode\n");
1015 set_bit(VMX_CPU_STATE_LMA_ENABLED,
1016 &d->arch.arch_vmx.cpu_state);
1017 __vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
1018 vm_entry_value |= VM_ENTRY_CONTROLS_IA32E_MODE;
1019 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1021 #if CONFIG_PAGING_LEVELS >= 4
1022 if(!shadow_set_guest_paging_levels(d->domain, 4)) {
1023 printk("Unsupported guest paging levels\n");
1024 domain_crash_synchronous(); /* need to take a clean path */
1026 #endif
1028 else
1030 #if CONFIG_PAGING_LEVELS >= 4
1031 if(!shadow_set_guest_paging_levels(d->domain, 2)) {
1032 printk("Unsupported guest paging levels\n");
1033 domain_crash_synchronous(); /* need to take a clean path */
1035 #endif
1038 unsigned long crn;
1039 /* update CR4's PAE if needed */
1040 __vmread(GUEST_CR4, &crn);
1041 if ( (!(crn & X86_CR4_PAE)) &&
1042 test_bit(VMX_CPU_STATE_PAE_ENABLED,
1043 &d->arch.arch_vmx.cpu_state)){
1044 VMX_DBG_LOG(DBG_LEVEL_1, "enable PAE on cr4\n");
1045 __vmwrite(GUEST_CR4, crn | X86_CR4_PAE);
1047 #elif defined( __i386__)
1048 unsigned long old_base_mfn;
1049 old_base_mfn = pagetable_get_pfn(d->arch.guest_table);
1050 if (old_base_mfn)
1051 put_page(pfn_to_page(old_base_mfn));
1052 #endif
1053 /*
1054 * Now arch.guest_table points to machine physical.
1055 */
1056 d->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
1057 update_pagetables(d);
1059 VMX_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
1060 (unsigned long) (mfn << PAGE_SHIFT));
1062 __vmwrite(GUEST_CR3, pagetable_get_paddr(d->arch.shadow_table));
1063 /*
1064 * arch->shadow_table should hold the next CR3 for shadow
1065 */
1066 VMX_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
1067 d->arch.arch_vmx.cpu_cr3, mfn);
1070 /*
1071 * VMX does not implement real-mode virtualization. We emulate
1072 * real-mode by performing a world switch to VMXAssist whenever
1073 * a partition disables the CR0.PE bit.
1074 */
1075 if ((value & X86_CR0_PE) == 0) {
1076 if ( value & X86_CR0_PG ) {
1077 /* inject GP here */
1078 vmx_inject_exception(d, TRAP_gp_fault, 0);
1079 return 0;
1080 } else {
1081 /*
1082 * Disable paging here.
1083 * Same as the PE == 1 && PG == 0 case.
1084 */
1085 if (test_bit(VMX_CPU_STATE_LMA_ENABLED,
1086 &d->arch.arch_vmx.cpu_state)){
1087 clear_bit(VMX_CPU_STATE_LMA_ENABLED,
1088 &d->arch.arch_vmx.cpu_state);
1089 __vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
1090 vm_entry_value &= ~VM_ENTRY_CONTROLS_IA32E_MODE;
1091 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1094 __vmread(GUEST_RIP, &eip);
1095 VMX_DBG_LOG(DBG_LEVEL_1,
1096 "Disabling CR0.PE at %%eip 0x%lx\n", eip);
1097 if (vmx_assist(d, VMX_ASSIST_INVOKE)) {
1098 set_bit(VMX_CPU_STATE_ASSIST_ENABLED, &d->arch.arch_vmx.cpu_state);
1099 __vmread(GUEST_RIP, &eip);
1100 VMX_DBG_LOG(DBG_LEVEL_1,
1101 "Transferring control to vmxassist %%eip 0x%lx\n", eip);
1102 return 0; /* do not update eip! */
1104 } else if (test_bit(VMX_CPU_STATE_ASSIST_ENABLED,
1105 &d->arch.arch_vmx.cpu_state)) {
1106 __vmread(GUEST_RIP, &eip);
1107 VMX_DBG_LOG(DBG_LEVEL_1,
1108 "Enabling CR0.PE at %%eip 0x%lx\n", eip);
1109 if (vmx_assist(d, VMX_ASSIST_RESTORE)) {
1110 clear_bit(VMX_CPU_STATE_ASSIST_ENABLED,
1111 &d->arch.arch_vmx.cpu_state);
1112 __vmread(GUEST_RIP, &eip);
1113 VMX_DBG_LOG(DBG_LEVEL_1,
1114 "Restoring to %%eip 0x%lx\n", eip);
1115 return 0; /* do not update eip! */
1119 return 1;
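/*
 * Illustrative sketch, not part of the original file: the long-mode handling
 * in vmx_set_cr0() follows the architectural rule that long mode becomes
 * active (LMA) only when both EFER.LME and CR0.PG are set, and that enabling
 * paging with LME set but CR4.PAE clear is a fault condition. As a
 * hypothetical predicate:
 */
#if 0
static int sketch_long_mode_active(int efer_lme, int cr0_pg, int cr4_pae)
{
    if (efer_lme && cr0_pg && !cr4_pae)
        return -1;                 /* invalid: the code above injects #GP */
    return efer_lme && cr0_pg;     /* LMA = LME && PG */
}
#endif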
1122 #define CASE_GET_REG(REG, reg) \
1123 case REG_ ## REG: value = regs->reg; break
1125 #define CASE_EXTEND_SET_REG \
1126 CASE_EXTEND_REG(S)
1127 #define CASE_EXTEND_GET_REG \
1128 CASE_EXTEND_REG(G)
1130 #ifdef __i386__
1131 #define CASE_EXTEND_REG(T)
1132 #else
1133 #define CASE_EXTEND_REG(T) \
1134 CASE_ ## T ## ET_REG(R8, r8); \
1135 CASE_ ## T ## ET_REG(R9, r9); \
1136 CASE_ ## T ## ET_REG(R10, r10); \
1137 CASE_ ## T ## ET_REG(R11, r11); \
1138 CASE_ ## T ## ET_REG(R12, r12); \
1139 CASE_ ## T ## ET_REG(R13, r13); \
1140 CASE_ ## T ## ET_REG(R14, r14); \
1141 CASE_ ## T ## ET_REG(R15, r15);
1142 #endif
1145 /*
1146 * Write to control registers
1147 */
1148 static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
1150 unsigned long value;
1151 unsigned long old_cr;
1152 struct vcpu *d = current;
1154 switch (gp) {
1155 CASE_GET_REG(EAX, eax);
1156 CASE_GET_REG(ECX, ecx);
1157 CASE_GET_REG(EDX, edx);
1158 CASE_GET_REG(EBX, ebx);
1159 CASE_GET_REG(EBP, ebp);
1160 CASE_GET_REG(ESI, esi);
1161 CASE_GET_REG(EDI, edi);
1162 CASE_EXTEND_GET_REG
1163 case REG_ESP:
1164 __vmread(GUEST_RSP, &value);
1165 break;
1166 default:
1167 printk("invalid gp: %d\n", gp);
1168 __vmx_bug(regs);
1171 VMX_DBG_LOG(DBG_LEVEL_1, "mov_to_cr: CR%d, value = %lx,", cr, value);
1172 VMX_DBG_LOG(DBG_LEVEL_1, "current = %lx,", (unsigned long) current);
1174 switch(cr) {
1175 case 0:
1177 return vmx_set_cr0(value);
1179 case 3:
1181 unsigned long old_base_mfn, mfn;
1183 /*
1184 * If paging is not enabled yet, simply copy the value to CR3.
1185 */
1186 if (!vmx_paging_enabled(d)) {
1187 d->arch.arch_vmx.cpu_cr3 = value;
1188 break;
1191 /*
1192 * We make a new one if the shadow does not exist.
1193 */
1194 if (value == d->arch.arch_vmx.cpu_cr3) {
1195 /*
1196 * This is a simple TLB flush, implying the guest has
1197 * removed some translation or changed page attributes.
1198 * We simply invalidate the shadow.
1199 */
1200 mfn = get_mfn_from_pfn(value >> PAGE_SHIFT);
1201 if (mfn != pagetable_get_pfn(d->arch.guest_table))
1202 __vmx_bug(regs);
1203 shadow_sync_all(d->domain);
1204 } else {
1205 /*
1206 * If different, make a shadow. Check if the PDBR is valid
1207 * first.
1208 */
1209 VMX_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
1210 if ( ((value >> PAGE_SHIFT) > d->domain->max_pages ) ||
1211 !VALID_MFN(mfn = get_mfn_from_pfn(value >> PAGE_SHIFT)) ||
1212 !get_page(pfn_to_page(mfn), d->domain) )
1214 printk("Invalid CR3 value=%lx", value);
1215 domain_crash_synchronous(); /* need to take a clean path */
1217 old_base_mfn = pagetable_get_pfn(d->arch.guest_table);
1218 d->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
1219 if (old_base_mfn)
1220 put_page(pfn_to_page(old_base_mfn));
1221 update_pagetables(d);
1222 /*
1223 * arch.shadow_table should now hold the next CR3 for shadow
1224 */
1225 d->arch.arch_vmx.cpu_cr3 = value;
1226 VMX_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx",
1227 value);
1228 __vmwrite(GUEST_CR3, pagetable_get_paddr(d->arch.shadow_table));
1230 break;
1232 case 4:
1234 /* CR4 */
1235 unsigned long old_guest_cr;
1237 __vmread(GUEST_CR4, &old_guest_cr);
1238 if (value & X86_CR4_PAE){
1239 set_bit(VMX_CPU_STATE_PAE_ENABLED, &d->arch.arch_vmx.cpu_state);
1240 } else {
1241 if (test_bit(VMX_CPU_STATE_LMA_ENABLED,
1242 &d->arch.arch_vmx.cpu_state)){
1243 vmx_inject_exception(d, TRAP_gp_fault, 0);
1245 clear_bit(VMX_CPU_STATE_PAE_ENABLED, &d->arch.arch_vmx.cpu_state);
1248 __vmread(CR4_READ_SHADOW, &old_cr);
1250 __vmwrite(GUEST_CR4, value| VMX_CR4_HOST_MASK);
1251 __vmwrite(CR4_READ_SHADOW, value);
1253 /*
1254 * Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates
1255 * all TLB entries except global entries.
1256 */
1257 if ((old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE)) {
1258 shadow_sync_all(d->domain);
1260 break;
1262 default:
1263 printk("invalid cr: %d\n", cr);
1264 __vmx_bug(regs);
1267 return 1;
1270 #define CASE_SET_REG(REG, reg) \
1271 case REG_ ## REG: \
1272 regs->reg = value; \
1273 break
1275 /*
1276 * Read from control registers. CR0 and CR4 are read from the shadow.
1277 */
1278 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
1280 unsigned long value;
1281 struct vcpu *d = current;
1283 if (cr != 3)
1284 __vmx_bug(regs);
1286 value = (unsigned long) d->arch.arch_vmx.cpu_cr3;
1288 switch (gp) {
1289 CASE_SET_REG(EAX, eax);
1290 CASE_SET_REG(ECX, ecx);
1291 CASE_SET_REG(EDX, edx);
1292 CASE_SET_REG(EBX, ebx);
1293 CASE_SET_REG(EBP, ebp);
1294 CASE_SET_REG(ESI, esi);
1295 CASE_SET_REG(EDI, edi);
1296 CASE_EXTEND_SET_REG
1297 case REG_ESP:
1298 __vmwrite(GUEST_RSP, value);
1299 regs->esp = value;
1300 break;
1301 default:
1302 printk("invalid gp: %d\n", gp);
1303 __vmx_bug(regs);
1306 VMX_DBG_LOG(DBG_LEVEL_VMMU, "mov_from_cr: CR%d, value = %lx,", cr, value);
1309 static int vmx_cr_access(unsigned long exit_qualification, struct cpu_user_regs *regs)
1311 unsigned int gp, cr;
1312 unsigned long value;
1314 switch (exit_qualification & CONTROL_REG_ACCESS_TYPE) {
1315 case TYPE_MOV_TO_CR:
1316 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
1317 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
1318 TRACE_VMEXIT(1,TYPE_MOV_TO_CR);
1319 TRACE_VMEXIT(2,cr);
1320 TRACE_VMEXIT(3,gp);
1321 return mov_to_cr(gp, cr, regs);
1322 case TYPE_MOV_FROM_CR:
1323 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
1324 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
1325 TRACE_VMEXIT(1,TYPE_MOV_FROM_CR);
1326 TRACE_VMEXIT(2,cr);
1327 TRACE_VMEXIT(3,gp);
1328 mov_from_cr(cr, gp, regs);
1329 break;
1330 case TYPE_CLTS:
1331 TRACE_VMEXIT(1,TYPE_CLTS);
1332 clts();
1333 setup_fpu(current);
1335 __vmread(GUEST_CR0, &value);
1336 value &= ~X86_CR0_TS; /* clear TS */
1337 __vmwrite(GUEST_CR0, value);
1339 __vmread(CR0_READ_SHADOW, &value);
1340 value &= ~X86_CR0_TS; /* clear TS */
1341 __vmwrite(CR0_READ_SHADOW, value);
1342 break;
1343 case TYPE_LMSW:
1344 TRACE_VMEXIT(1,TYPE_LMSW);
1345 __vmread(CR0_READ_SHADOW, &value);
1346 value = (value & ~0xF) |
1347 (((exit_qualification & LMSW_SOURCE_DATA) >> 16) & 0xF);
1348 return vmx_set_cr0(value);
1349 break;
1350 default:
1351 __vmx_bug(regs);
1352 break;
1354 return 1;
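/*
 * Illustrative sketch, not part of the original file: vmx_cr_access() splits
 * the CR-access exit qualification into a CR number, an access type and a
 * general-purpose register. With the architectural layout (CR number in
 * bits 3:0, access type in bits 5:4, GPR in bits 11:8, LMSW source data in
 * bits 31:16), a plain decode looks like this hypothetical helper:
 */
#if 0
static void sketch_decode_cr_exit(unsigned long q, unsigned int *cr,
                                  unsigned int *type, unsigned int *gpr,
                                  unsigned int *lmsw_data)
{
    *cr        = (unsigned int)(q & 0xf);
    *type      = (unsigned int)((q >> 4) & 3);  /* 0=mov-to, 1=mov-from, 2=CLTS, 3=LMSW */
    *gpr       = (unsigned int)((q >> 8) & 0xf);
    *lmsw_data = (unsigned int)((q >> 16) & 0xffff);
}
#endif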
1357 static inline void vmx_do_msr_read(struct cpu_user_regs *regs)
1359 VMX_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_read: ecx=%lx, eax=%lx, edx=%lx",
1360 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1361 (unsigned long)regs->edx);
1362 switch (regs->ecx) {
1363 case MSR_IA32_SYSENTER_CS:
1364 __vmread(GUEST_SYSENTER_CS, &regs->eax);
1365 regs->edx = 0;
1366 break;
1367 case MSR_IA32_SYSENTER_ESP:
1368 __vmread(GUEST_SYSENTER_ESP, &regs->eax);
1369 regs->edx = 0;
1370 break;
1371 case MSR_IA32_SYSENTER_EIP:
1372 __vmread(GUEST_SYSENTER_EIP, &regs->eax);
1373 regs->edx = 0;
1374 break;
1375 default:
1376 if(long_mode_do_msr_read(regs))
1377 return;
1378 rdmsr_user(regs->ecx, regs->eax, regs->edx);
1379 break;
1382 VMX_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_read returns: "
1383 "ecx=%lx, eax=%lx, edx=%lx",
1384 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1385 (unsigned long)regs->edx);
1388 static inline void vmx_do_msr_write(struct cpu_user_regs *regs)
1390 VMX_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_write: ecx=%lx, eax=%lx, edx=%lx",
1391 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1392 (unsigned long)regs->edx);
1393 switch (regs->ecx) {
1394 case MSR_IA32_SYSENTER_CS:
1395 __vmwrite(GUEST_SYSENTER_CS, regs->eax);
1396 break;
1397 case MSR_IA32_SYSENTER_ESP:
1398 __vmwrite(GUEST_SYSENTER_ESP, regs->eax);
1399 break;
1400 case MSR_IA32_SYSENTER_EIP:
1401 __vmwrite(GUEST_SYSENTER_EIP, regs->eax);
1402 break;
1403 default:
1404 long_mode_do_msr_write(regs);
1405 break;
1408 VMX_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_write returns: "
1409 "ecx=%lx, eax=%lx, edx=%lx",
1410 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1411 (unsigned long)regs->edx);
1414 /*
1415 * Need to use this exit to reschedule
1416 */
1417 static inline void vmx_vmexit_do_hlt(void)
1419 #if VMX_DEBUG
1420 unsigned long eip;
1421 __vmread(GUEST_RIP, &eip);
1422 #endif
1423 VMX_DBG_LOG(DBG_LEVEL_1, "vmx_vmexit_do_hlt:eip=%lx", eip);
1424 raise_softirq(SCHEDULE_SOFTIRQ);
1427 static inline void vmx_vmexit_do_extint(struct cpu_user_regs *regs)
1429 unsigned int vector;
1430 int error;
1432 asmlinkage void do_IRQ(struct cpu_user_regs *);
1433 void smp_apic_timer_interrupt(struct cpu_user_regs *);
1434 void timer_interrupt(int, void *, struct cpu_user_regs *);
1435 void smp_event_check_interrupt(void);
1436 void smp_invalidate_interrupt(void);
1437 void smp_call_function_interrupt(void);
1438 void smp_spurious_interrupt(struct cpu_user_regs *regs);
1439 void smp_error_interrupt(struct cpu_user_regs *regs);
1441 if ((error = __vmread(VM_EXIT_INTR_INFO, &vector))
1442 || !(vector & INTR_INFO_VALID_MASK))
1443 __vmx_bug(regs);
1445 vector &= 0xff;
1446 local_irq_disable();
1448 switch(vector) {
1449 case LOCAL_TIMER_VECTOR:
1450 smp_apic_timer_interrupt(regs);
1451 break;
1452 case EVENT_CHECK_VECTOR:
1453 smp_event_check_interrupt();
1454 break;
1455 case INVALIDATE_TLB_VECTOR:
1456 smp_invalidate_interrupt();
1457 break;
1458 case CALL_FUNCTION_VECTOR:
1459 smp_call_function_interrupt();
1460 break;
1461 case SPURIOUS_APIC_VECTOR:
1462 smp_spurious_interrupt(regs);
1463 break;
1464 case ERROR_APIC_VECTOR:
1465 smp_error_interrupt(regs);
1466 break;
1467 default:
1468 regs->entry_vector = vector;
1469 do_IRQ(regs);
1470 break;
1474 static inline void vmx_vmexit_do_mwait(void)
1476 #if VMX_DEBUG
1477 unsigned long eip;
1478 __vmread(GUEST_RIP, &eip);
1479 #endif
1480 VMX_DBG_LOG(DBG_LEVEL_1, "vmx_vmexit_do_mwait:eip=%lx", eip);
1481 raise_softirq(SCHEDULE_SOFTIRQ);
1484 #define BUF_SIZ 256
1485 #define MAX_LINE 80
1486 char print_buf[BUF_SIZ];
1487 static int index;
1489 static void vmx_print_line(const char c, struct vcpu *d)
1492 if (index == MAX_LINE || c == '\n') {
1493 if (index == MAX_LINE) {
1494 print_buf[index++] = c;
1496 print_buf[index] = '\0';
1497 printk("(GUEST: %u) %s\n", d->domain->domain_id, (char *) &print_buf);
1498 index = 0;
1500 else
1501 print_buf[index++] = c;
1504 void save_vmx_cpu_user_regs(struct cpu_user_regs *ctxt)
1506 __vmread(GUEST_SS_SELECTOR, &ctxt->ss);
1507 __vmread(GUEST_RSP, &ctxt->esp);
1508 __vmread(GUEST_RFLAGS, &ctxt->eflags);
1509 __vmread(GUEST_CS_SELECTOR, &ctxt->cs);
1510 __vmread(GUEST_RIP, &ctxt->eip);
1512 __vmread(GUEST_GS_SELECTOR, &ctxt->gs);
1513 __vmread(GUEST_FS_SELECTOR, &ctxt->fs);
1514 __vmread(GUEST_ES_SELECTOR, &ctxt->es);
1515 __vmread(GUEST_DS_SELECTOR, &ctxt->ds);
1518 #ifdef XEN_DEBUGGER
1519 void save_cpu_user_regs(struct cpu_user_regs *regs)
1521 __vmread(GUEST_SS_SELECTOR, &regs->xss);
1522 __vmread(GUEST_RSP, &regs->esp);
1523 __vmread(GUEST_RFLAGS, &regs->eflags);
1524 __vmread(GUEST_CS_SELECTOR, &regs->xcs);
1525 __vmread(GUEST_RIP, &regs->eip);
1527 __vmread(GUEST_GS_SELECTOR, &regs->xgs);
1528 __vmread(GUEST_FS_SELECTOR, &regs->xfs);
1529 __vmread(GUEST_ES_SELECTOR, &regs->xes);
1530 __vmread(GUEST_DS_SELECTOR, &regs->xds);
1533 void restore_cpu_user_regs(struct cpu_user_regs *regs)
1535 __vmwrite(GUEST_SS_SELECTOR, regs->xss);
1536 __vmwrite(GUEST_RSP, regs->esp);
1537 __vmwrite(GUEST_RFLAGS, regs->eflags);
1538 __vmwrite(GUEST_CS_SELECTOR, regs->xcs);
1539 __vmwrite(GUEST_RIP, regs->eip);
1541 __vmwrite(GUEST_GS_SELECTOR, regs->xgs);
1542 __vmwrite(GUEST_FS_SELECTOR, regs->xfs);
1543 __vmwrite(GUEST_ES_SELECTOR, regs->xes);
1544 __vmwrite(GUEST_DS_SELECTOR, regs->xds);
1546 #endif
1548 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs regs)
1550 unsigned int exit_reason, idtv_info_field;
1551 unsigned long exit_qualification, eip, inst_len = 0;
1552 struct vcpu *v = current;
1553 int error;
1555 if ((error = __vmread(VM_EXIT_REASON, &exit_reason)))
1556 __vmx_bug(&regs);
1558 perfc_incra(vmexits, exit_reason);
1560 __vmread(IDT_VECTORING_INFO_FIELD, &idtv_info_field);
1561 if (idtv_info_field & INTR_INFO_VALID_MASK) {
1562 __vmwrite(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field);
1564 __vmread(VM_EXIT_INSTRUCTION_LEN, &inst_len);
1565 if (inst_len >= 1 && inst_len <= 15)
1566 __vmwrite(VM_ENTRY_INSTRUCTION_LEN, inst_len);
1568 if (idtv_info_field & 0x800) { /* valid error code */
1569 unsigned long error_code;
1570 __vmread(IDT_VECTORING_ERROR_CODE, &error_code);
1571 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
1574 VMX_DBG_LOG(DBG_LEVEL_1, "idtv_info_field=%x", idtv_info_field);
1577 /* don't bother logging hardware interrupts */
1578 if (exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT &&
1579 exit_reason != EXIT_REASON_VMCALL &&
1580 exit_reason != EXIT_REASON_IO_INSTRUCTION)
1581 VMX_DBG_LOG(DBG_LEVEL_0, "exit reason = %x", exit_reason);
1583 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
1584 printk("Failed vm entry\n");
1585 domain_crash_synchronous();
1586 return;
1589 __vmread(GUEST_RIP, &eip);
1590 TRACE_3D(TRC_VMX_VMEXIT, v->domain->domain_id, eip, exit_reason);
1591 TRACE_VMEXIT(0,exit_reason);
1593 switch (exit_reason) {
1594 case EXIT_REASON_EXCEPTION_NMI:
1596 /*
1597 * We don't set the software-interrupt exiting (INT n).
1598 * (1) We can get an exception (e.g. #PF) in the guest, or
1599 * (2) NMI
1600 */
1601 int error;
1602 unsigned int vector;
1603 unsigned long va;
1605 if ((error = __vmread(VM_EXIT_INTR_INFO, &vector))
1606 || !(vector & INTR_INFO_VALID_MASK))
1607 __vmx_bug(&regs);
1608 vector &= 0xff;
1610 TRACE_VMEXIT(1,vector);
1611 perfc_incra(cause_vector, vector);
1613 TRACE_3D(TRC_VMX_VECTOR, v->domain->domain_id, eip, vector);
1614 switch (vector) {
1615 #ifdef XEN_DEBUGGER
1616 case TRAP_debug:
1618 save_cpu_user_regs(&regs);
1619 pdb_handle_exception(1, &regs, 1);
1620 restore_cpu_user_regs(&regs);
1621 break;
1623 case TRAP_int3:
1625 save_cpu_user_regs(&regs);
1626 pdb_handle_exception(3, &regs, 1);
1627 restore_cpu_user_regs(&regs);
1628 break;
1630 #else
1631 case TRAP_debug:
1633 void store_cpu_user_regs(struct cpu_user_regs *regs);
1634 long do_sched_op(unsigned long op);
1637 store_cpu_user_regs(&regs);
1638 __vm_clear_bit(GUEST_PENDING_DBG_EXCEPTIONS, PENDING_DEBUG_EXC_BS);
1640 set_bit(_VCPUF_ctrl_pause, &current->vcpu_flags);
1641 do_sched_op(SCHEDOP_yield);
1643 break;
1645 #endif
1646 case TRAP_no_device:
1648 vmx_do_no_device_fault();
1649 break;
1651 case TRAP_page_fault:
1653 __vmread(EXIT_QUALIFICATION, &va);
1654 __vmread(VM_EXIT_INTR_ERROR_CODE, &regs.error_code);
1656 TRACE_VMEXIT(3,regs.error_code);
1657 TRACE_VMEXIT(4,va);
1659 VMX_DBG_LOG(DBG_LEVEL_VMMU,
1660 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
1661 (unsigned long)regs.eax, (unsigned long)regs.ebx,
1662 (unsigned long)regs.ecx, (unsigned long)regs.edx,
1663 (unsigned long)regs.esi, (unsigned long)regs.edi);
1664 v->domain->arch.vmx_platform.mpci.inst_decoder_regs = &regs;
1666 if (!(error = vmx_do_page_fault(va, &regs))) {
1667 /*
1668 * Inject #PG using Interruption-Information Fields
1669 */
1670 vmx_inject_exception(v, TRAP_page_fault, regs.error_code);
1671 v->arch.arch_vmx.cpu_cr2 = va;
1672 TRACE_3D(TRC_VMX_INT, v->domain->domain_id, TRAP_page_fault, va);
1674 break;
1676 case TRAP_nmi:
1677 do_nmi(&regs, 0);
1678 break;
1679 default:
1680 vmx_reflect_exception(v);
1681 break;
1683 break;
1685 case EXIT_REASON_EXTERNAL_INTERRUPT:
1686 vmx_vmexit_do_extint(&regs);
1687 break;
1688 case EXIT_REASON_PENDING_INTERRUPT:
1689 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
1690 MONITOR_CPU_BASED_EXEC_CONTROLS);
1691 break;
1692 case EXIT_REASON_TASK_SWITCH:
1693 __vmx_bug(&regs);
1694 break;
1695 case EXIT_REASON_CPUID:
1696 __get_instruction_length(inst_len);
1697 vmx_vmexit_do_cpuid(regs.eax, &regs);
1698 __update_guest_eip(inst_len);
1699 break;
1700 case EXIT_REASON_HLT:
1701 __get_instruction_length(inst_len);
1702 __update_guest_eip(inst_len);
1703 vmx_vmexit_do_hlt();
1704 break;
1705 case EXIT_REASON_INVLPG:
1707 unsigned long va;
1709 __vmread(EXIT_QUALIFICATION, &va);
1710 vmx_vmexit_do_invlpg(va);
1711 __get_instruction_length(inst_len);
1712 __update_guest_eip(inst_len);
1713 break;
1715 case EXIT_REASON_VMCALL:
1716 __get_instruction_length(inst_len);
1717 __vmread(GUEST_RIP, &eip);
1718 __vmread(EXIT_QUALIFICATION, &exit_qualification);
1720 vmx_print_line(regs.eax, v); /* provides the current domain */
1721 __update_guest_eip(inst_len);
1722 break;
1723 case EXIT_REASON_CR_ACCESS:
1725 __vmread(GUEST_RIP, &eip);
1726 __get_instruction_length(inst_len);
1727 __vmread(EXIT_QUALIFICATION, &exit_qualification);
1729 VMX_DBG_LOG(DBG_LEVEL_1, "eip = %lx, inst_len =%lx, exit_qualification = %lx",
1730 eip, inst_len, exit_qualification);
1731 if (vmx_cr_access(exit_qualification, &regs))
1732 __update_guest_eip(inst_len);
1733 TRACE_VMEXIT(3,regs.error_code);
1734 TRACE_VMEXIT(4,exit_qualification);
1735 break;
1737 case EXIT_REASON_DR_ACCESS:
1738 __vmread(EXIT_QUALIFICATION, &exit_qualification);
1739 vmx_dr_access(exit_qualification, &regs);
1740 __get_instruction_length(inst_len);
1741 __update_guest_eip(inst_len);
1742 break;
1743 case EXIT_REASON_IO_INSTRUCTION:
1744 __vmread(EXIT_QUALIFICATION, &exit_qualification);
1745 __get_instruction_length(inst_len);
1746 vmx_io_instruction(&regs, exit_qualification, inst_len);
1747 TRACE_VMEXIT(4,exit_qualification);
1748 break;
1749 case EXIT_REASON_MSR_READ:
1750 __get_instruction_length(inst_len);
1751 vmx_do_msr_read(&regs);
1752 __update_guest_eip(inst_len);
1753 break;
1754 case EXIT_REASON_MSR_WRITE:
1755 __vmread(GUEST_RIP, &eip);
1756 vmx_do_msr_write(&regs);
1757 __get_instruction_length(inst_len);
1758 __update_guest_eip(inst_len);
1759 break;
1760 case EXIT_REASON_MWAIT_INSTRUCTION:
1761 __get_instruction_length(inst_len);
1762 __update_guest_eip(inst_len);
1763 vmx_vmexit_do_mwait();
1764 break;
1765 default:
1766 __vmx_bug(&regs); /* should not happen */
1770 asmlinkage void load_cr2(void)
1772 struct vcpu *d = current;
1774 local_irq_disable();
1775 #ifdef __i386__
1776 asm volatile("movl %0,%%cr2": :"r" (d->arch.arch_vmx.cpu_cr2));
1777 #else
1778 asm volatile("movq %0,%%cr2": :"r" (d->arch.arch_vmx.cpu_cr2));
1779 #endif
1782 #ifdef TRACE_BUFFER
1783 asmlinkage void trace_vmentry (void)
1785 TRACE_5D(TRC_VMENTRY,trace_values[current->processor][0],
1786 trace_values[current->processor][1],trace_values[current->processor][2],
1787 trace_values[current->processor][3],trace_values[current->processor][4]);
1788 TRACE_VMEXIT(0,9);
1789 TRACE_VMEXIT(1,9);
1790 TRACE_VMEXIT(2,9);
1791 TRACE_VMEXIT(3,9);
1792 TRACE_VMEXIT(4,9);
1793 return;
1795 asmlinkage void trace_vmexit (void)
1797 TRACE_3D(TRC_VMEXIT,0,0,0);
1798 return;
1800 #endif
1801 #endif /* CONFIG_VMX */
1803 /*
1804 * Local variables:
1805 * mode: C
1806 * c-set-style: "BSD"
1807 * c-basic-offset: 4
1808 * tab-width: 4
1809 * indent-tabs-mode: nil
1810 * End:
1811 */