xen/arch/x86/vmx.c @ 6219:3b0ce44f7b7a (ia64/xen-unstable)

description: merge?
author: cl349@firebug.cl.cam.ac.uk
date: Wed Aug 17 08:27:16 2005 +0000 (2005-08-17)
parents: f294acb25858 027812e4a63c
children: f51fe43c5d1c 5f4724c13040 23979fb12c49 84ee014ebd41 99914b54f7bf 81576d3d1ca8 3a8f27c6d56c
1 /*
2 * vmx.c: handling VMX architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 *
18 */
20 #include <xen/config.h>
21 #include <xen/init.h>
22 #include <xen/lib.h>
23 #include <xen/trace.h>
24 #include <xen/sched.h>
25 #include <xen/irq.h>
26 #include <xen/softirq.h>
27 #include <xen/domain_page.h>
28 #include <asm/current.h>
29 #include <asm/io.h>
30 #include <asm/shadow.h>
31 #include <asm/regs.h>
32 #include <asm/cpufeature.h>
33 #include <asm/processor.h>
34 #include <asm/types.h>
35 #include <asm/msr.h>
36 #include <asm/spinlock.h>
37 #include <asm/vmx.h>
38 #include <asm/vmx_vmcs.h>
39 #include <asm/vmx_intercept.h>
40 #include <asm/shadow.h>
41 #if CONFIG_PAGING_LEVELS >= 3
42 #include <asm/shadow_64.h>
43 #endif
45 #include <public/io/ioreq.h>
47 #ifdef CONFIG_VMX
49 int vmcs_size;
50 unsigned int opt_vmx_debug_level = 0;
51 integer_param("vmx_debug", opt_vmx_debug_level);
53 #ifdef __x86_64__
54 static struct msr_state percpu_msr[NR_CPUS];
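55 /* Host values of the x86_64 MSRs, kept per physical CPU; the flags field marks entries a VMX guest has modified and that must be restored. */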
56 static u32 msr_data_index[VMX_MSR_COUNT] =
57 {
58 MSR_LSTAR, MSR_STAR, MSR_CSTAR,
59 MSR_SYSCALL_MASK, MSR_EFER,
60 };
62 /*
63 * To avoid MSR save/restore at every VM exit/entry time, we restore
64 * the x86_64 specific MSRs at domain switch time. Since those MSRs
65 * are not modified once set for generic domains, we don't save them,
66 * but simply reset them to the values set at percpu_traps_init().
67 */
68 void vmx_load_msrs(struct vcpu *n)
69 {
70 struct msr_state *host_state;
71 host_state = &percpu_msr[smp_processor_id()];
73 while (host_state->flags){
74 int i;
76 i = find_first_set_bit(host_state->flags);
77 wrmsrl(msr_data_index[i], host_state->msr_items[i]);
78 clear_bit(i, &host_state->flags);
79 }
80 }
82 static void vmx_save_init_msrs(void)
83 {
84 struct msr_state *host_state;
85 host_state = &percpu_msr[smp_processor_id()];
86 int i;
88 for (i = 0; i < VMX_MSR_COUNT; i++)
89 rdmsrl(msr_data_index[i], host_state->msr_items[i]);
90 }
92 #define CASE_READ_MSR(address) \
93 case MSR_ ## address: \
94 msr_content = msr->msr_items[VMX_INDEX_MSR_ ## address]; \
95 break
97 #define CASE_WRITE_MSR(address) \
98 case MSR_ ## address: \
99 { \
100 msr->msr_items[VMX_INDEX_MSR_ ## address] = msr_content; \
101 if (!test_bit(VMX_INDEX_MSR_ ## address, &msr->flags)) { \
102 set_bit(VMX_INDEX_MSR_ ## address, &msr->flags); \
103 } \
104 wrmsrl(MSR_ ## address, msr_content); \
105 set_bit(VMX_INDEX_MSR_ ## address, &host_state->flags); \
106 } \
107 break
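108 /* Note: canonical-address checking is not implemented yet; the macro below accepts any address. */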
109 #define IS_CANO_ADDRESS(add) 1
110 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
111 {
112 u64 msr_content = 0;
113 struct vcpu *vc = current;
114 struct msr_state * msr = &vc->arch.arch_vmx.msr_content;
115 switch(regs->ecx){
116 case MSR_EFER:
117 msr_content = msr->msr_items[VMX_INDEX_MSR_EFER];
118 VMX_DBG_LOG(DBG_LEVEL_2, "EFER msr_content %llx\n", (unsigned long long)msr_content);
119 if (test_bit(VMX_CPU_STATE_LME_ENABLED,
120 &vc->arch.arch_vmx.cpu_state))
121 msr_content |= 1 << _EFER_LME;
123 if (VMX_LONG_GUEST(vc))
124 msr_content |= 1 << _EFER_LMA;
125 break;
126 case MSR_FS_BASE:
127 if (!(VMX_LONG_GUEST(vc)))
128 /* XXX should it be GP fault */
129 domain_crash();
130 __vmread(GUEST_FS_BASE, &msr_content);
131 break;
132 case MSR_GS_BASE:
133 if (!(VMX_LONG_GUEST(vc)))
134 domain_crash();
135 __vmread(GUEST_GS_BASE, &msr_content);
136 break;
137 case MSR_SHADOW_GS_BASE:
138 msr_content = msr->shadow_gs;
139 break;
141 CASE_READ_MSR(STAR);
142 CASE_READ_MSR(LSTAR);
143 CASE_READ_MSR(CSTAR);
144 CASE_READ_MSR(SYSCALL_MASK);
145 default:
146 return 0;
147 }
148 VMX_DBG_LOG(DBG_LEVEL_2, "mode_do_msr_read: msr_content: %lx\n", msr_content);
149 regs->eax = msr_content & 0xffffffff;
150 regs->edx = msr_content >> 32;
151 return 1;
152 }
154 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
155 {
156 u64 msr_content = regs->eax | ((u64)regs->edx << 32);
157 struct vcpu *vc = current;
158 struct msr_state * msr = &vc->arch.arch_vmx.msr_content;
159 struct msr_state * host_state =
160 &percpu_msr[smp_processor_id()];
162 VMX_DBG_LOG(DBG_LEVEL_1, " mode_do_msr_write msr %lx msr_content %lx\n",
163 regs->ecx, msr_content);
165 switch (regs->ecx){
166 case MSR_EFER:
167 if ((msr_content & EFER_LME) ^
168 test_bit(VMX_CPU_STATE_LME_ENABLED,
169 &vc->arch.arch_vmx.cpu_state)){
170 if (test_bit(VMX_CPU_STATE_PG_ENABLED,
171 &vc->arch.arch_vmx.cpu_state) ||
172 !test_bit(VMX_CPU_STATE_PAE_ENABLED,
173 &vc->arch.arch_vmx.cpu_state)){
174 vmx_inject_exception(vc, TRAP_gp_fault, 0);
175 }
176 }
177 if (msr_content & EFER_LME)
178 set_bit(VMX_CPU_STATE_LME_ENABLED,
179 &vc->arch.arch_vmx.cpu_state);
180 /* No update for LME/LMA since they have no effect */
181 msr->msr_items[VMX_INDEX_MSR_EFER] =
182 msr_content;
183 if (msr_content & ~(EFER_LME | EFER_LMA)){
184 msr->msr_items[VMX_INDEX_MSR_EFER] = msr_content;
185 if (!test_bit(VMX_INDEX_MSR_EFER, &msr->flags)){
186 rdmsrl(MSR_EFER,
187 host_state->msr_items[VMX_INDEX_MSR_EFER]);
188 set_bit(VMX_INDEX_MSR_EFER, &host_state->flags);
189 set_bit(VMX_INDEX_MSR_EFER, &msr->flags);
190 wrmsrl(MSR_EFER, msr_content);
191 }
192 }
193 break;
195 case MSR_FS_BASE:
196 case MSR_GS_BASE:
197 if (!(VMX_LONG_GUEST(vc)))
198 domain_crash();
199 if (!IS_CANO_ADDRESS(msr_content)){
200 VMX_DBG_LOG(DBG_LEVEL_1, "Not cano address of msr write\n");
201 vmx_inject_exception(vc, TRAP_gp_fault, 0);
202 }
203 if (regs->ecx == MSR_FS_BASE)
204 __vmwrite(GUEST_FS_BASE, msr_content);
205 else
206 __vmwrite(GUEST_GS_BASE, msr_content);
207 break;
209 case MSR_SHADOW_GS_BASE:
210 if (!(VMX_LONG_GUEST(vc)))
211 domain_crash();
212 vc->arch.arch_vmx.msr_content.shadow_gs = msr_content;
213 wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
214 break;
216 CASE_WRITE_MSR(STAR);
217 CASE_WRITE_MSR(LSTAR);
218 CASE_WRITE_MSR(CSTAR);
219 CASE_WRITE_MSR(SYSCALL_MASK);
220 default:
221 return 0;
222 }
223 return 1;
224 }
226 void
227 vmx_restore_msrs(struct vcpu *d)
228 {
229 int i = 0;
230 struct msr_state *guest_state;
231 struct msr_state *host_state;
232 unsigned long guest_flags;
234 guest_state = &d->arch.arch_vmx.msr_content;
235 host_state = &percpu_msr[smp_processor_id()];
237 wrmsrl(MSR_SHADOW_GS_BASE, guest_state->shadow_gs);
238 guest_flags = guest_state->flags;
239 if (!guest_flags)
240 return;
242 while (guest_flags){
243 i = find_first_set_bit(guest_flags);
245 VMX_DBG_LOG(DBG_LEVEL_2,
246 "restore guest's index %d msr %lx with %lx\n",
247 i, (unsigned long) msr_data_index[i], (unsigned long) guest_state->msr_items[i]);
248 set_bit(i, &host_state->flags);
249 wrmsrl(msr_data_index[i], guest_state->msr_items[i]);
250 clear_bit(i, &guest_flags);
251 }
252 }
254 #else /* __i386__ */
255 #define vmx_save_init_msrs() ((void)0)
257 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs){
258 return 0;
259 }
260 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs){
261 return 0;
262 }
263 #endif
265 extern long evtchn_send(int lport);
266 extern long do_block(void);
267 void do_nmi(struct cpu_user_regs *, unsigned long);
269 static int check_vmx_controls(u32 ctrls, u32 msr)
270 {
271 u32 vmx_msr_low, vmx_msr_high;
273 rdmsr(msr, vmx_msr_low, vmx_msr_high);
274 if (ctrls < vmx_msr_low || ctrls > vmx_msr_high) {
275 printk("Insufficient VMX capability 0x%x, "
276 "msr=0x%x,low=0x%8x,high=0x%x\n",
277 ctrls, msr, vmx_msr_low, vmx_msr_high);
278 return 0;
279 }
280 return 1;
281 }
283 int start_vmx(void)
284 {
285 struct vmcs_struct *vmcs;
286 u32 ecx;
287 u32 eax, edx;
288 u64 phys_vmcs; /* debugging */
290 /*
291 * Xen does not fill x86_capability words except 0.
292 */
293 ecx = cpuid_ecx(1);
294 boot_cpu_data.x86_capability[4] = ecx;
296 if (!(test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability)))
297 return 0;
299 rdmsr(IA32_FEATURE_CONTROL_MSR, eax, edx);
301 if (eax & IA32_FEATURE_CONTROL_MSR_LOCK) {
302 if ((eax & IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON) == 0x0) {
303 printk("VMX disabled by Feature Control MSR.\n");
304 return 0;
305 }
306 }
307 else {
308 wrmsr(IA32_FEATURE_CONTROL_MSR,
309 IA32_FEATURE_CONTROL_MSR_LOCK |
310 IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON, 0);
311 }
313 if (!check_vmx_controls(MONITOR_PIN_BASED_EXEC_CONTROLS,
314 MSR_IA32_VMX_PINBASED_CTLS_MSR))
315 return 0;
316 if (!check_vmx_controls(MONITOR_CPU_BASED_EXEC_CONTROLS,
317 MSR_IA32_VMX_PROCBASED_CTLS_MSR))
318 return 0;
319 if (!check_vmx_controls(MONITOR_VM_EXIT_CONTROLS,
320 MSR_IA32_VMX_EXIT_CTLS_MSR))
321 return 0;
322 if (!check_vmx_controls(MONITOR_VM_ENTRY_CONTROLS,
323 MSR_IA32_VMX_ENTRY_CTLS_MSR))
324 return 0;
326 set_in_cr4(X86_CR4_VMXE); /* Enable VMXE */
328 if (!(vmcs = alloc_vmcs())) {
329 printk("Failed to allocate VMCS\n");
330 return 0;
331 }
333 phys_vmcs = (u64) virt_to_phys(vmcs);
335 if (!(__vmxon(phys_vmcs))) {
336 printk("VMXON is done\n");
337 }
339 vmx_save_init_msrs();
341 return 1;
342 }
344 void stop_vmx(void)
345 {
346 if (read_cr4() & X86_CR4_VMXE)
347 __vmxoff();
348 }
350 /*
351 * Not all exit cases provide a valid value in the VM-exit instruction length field.
352 */
353 #define __get_instruction_length(len) \
354 __vmread(INSTRUCTION_LEN, &(len)); \
355 if ((len) < 1 || (len) > 15) \
356 __vmx_bug(&regs);
358 static inline void __update_guest_eip(unsigned long inst_len)
359 {
360 unsigned long current_eip;
362 __vmread(GUEST_RIP, &current_eip);
363 __vmwrite(GUEST_RIP, current_eip + inst_len);
364 }
367 static int vmx_do_page_fault(unsigned long va, struct cpu_user_regs *regs)
368 {
369 unsigned long eip;
370 unsigned long gpa; /* FIXME: PAE */
371 int result;
373 #if VMX_DEBUG
374 {
375 __vmread(GUEST_RIP, &eip);
376 VMX_DBG_LOG(DBG_LEVEL_VMMU,
377 "vmx_do_page_fault = 0x%lx, eip = %lx, error_code = %lx",
378 va, eip, (unsigned long)regs->error_code);
379 }
380 #endif
382 if (!vmx_paging_enabled(current)){
383 handle_mmio(va, va);
384 return 1;
385 }
386 gpa = gva_to_gpa(va);
388 /* Use 1:1 page table to identify MMIO address space */
389 if ( mmio_space(gpa) ){
390 if (gpa >= 0xFEE00000) { /* workaround for local APIC */
391 u32 inst_len;
392 __vmread(INSTRUCTION_LEN, &(inst_len));
393 __update_guest_eip(inst_len);
394 return 1;
395 }
396 handle_mmio(va, gpa);
397 return 1;
398 }
400 result = shadow_fault(va, regs);
402 #if 0
403 if ( !result )
404 {
405 __vmread(GUEST_RIP, &eip);
406 printk("vmx pgfault to guest va=%p eip=%p\n", va, eip);
407 }
408 #endif
410 return result;
411 }
413 static void vmx_do_no_device_fault(void)
414 {
415 unsigned long cr0;
417 clts();
418 setup_fpu(current);
419 __vmread(CR0_READ_SHADOW, &cr0);
420 if (!(cr0 & X86_CR0_TS)) {
421 __vmread(GUEST_CR0, &cr0);
422 cr0 &= ~X86_CR0_TS;
423 __vmwrite(GUEST_CR0, cr0);
424 }
425 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
426 }
429 static void vmx_vmexit_do_cpuid(unsigned long input, struct cpu_user_regs *regs)
430 {
431 unsigned int eax, ebx, ecx, edx;
432 unsigned long eip;
434 __vmread(GUEST_RIP, &eip);
436 VMX_DBG_LOG(DBG_LEVEL_1,
437 "do_cpuid: (eax) %lx, (ebx) %lx, (ecx) %lx, (edx) %lx,"
438 " (esi) %lx, (edi) %lx",
439 (unsigned long)regs->eax, (unsigned long)regs->ebx,
440 (unsigned long)regs->ecx, (unsigned long)regs->edx,
441 (unsigned long)regs->esi, (unsigned long)regs->edi);
443 cpuid(input, &eax, &ebx, &ecx, &edx);
445 if (input == 1) {
446 #ifdef __i386__
447 clear_bit(X86_FEATURE_PSE, &edx);
448 clear_bit(X86_FEATURE_PAE, &edx);
449 clear_bit(X86_FEATURE_PSE36, &edx);
450 #endif
451 }
453 regs->eax = (unsigned long) eax;
454 regs->ebx = (unsigned long) ebx;
455 regs->ecx = (unsigned long) ecx;
456 regs->edx = (unsigned long) edx;
458 VMX_DBG_LOG(DBG_LEVEL_1,
459 "vmx_vmexit_do_cpuid: eip: %lx, input: %lx, out:eax=%x, ebx=%x, ecx=%x, edx=%x",
460 eip, input, eax, ebx, ecx, edx);
462 }
464 #define CASE_GET_REG_P(REG, reg) \
465 case REG_ ## REG: reg_p = (unsigned long *)&(regs->reg); break
467 static void vmx_dr_access (unsigned long exit_qualification, struct cpu_user_regs *regs)
468 {
469 unsigned int reg;
470 unsigned long *reg_p = 0;
471 struct vcpu *v = current;
472 unsigned long eip;
474 __vmread(GUEST_RIP, &eip);
476 reg = exit_qualification & DEBUG_REG_ACCESS_NUM;
478 VMX_DBG_LOG(DBG_LEVEL_1,
479 "vmx_dr_access : eip=%lx, reg=%d, exit_qualification = %lx",
480 eip, reg, exit_qualification);
482 switch(exit_qualification & DEBUG_REG_ACCESS_REG) {
483 CASE_GET_REG_P(EAX, eax);
484 CASE_GET_REG_P(ECX, ecx);
485 CASE_GET_REG_P(EDX, edx);
486 CASE_GET_REG_P(EBX, ebx);
487 CASE_GET_REG_P(EBP, ebp);
488 CASE_GET_REG_P(ESI, esi);
489 CASE_GET_REG_P(EDI, edi);
490 case REG_ESP:
491 break;
492 default:
493 __vmx_bug(regs);
494 }
496 switch (exit_qualification & DEBUG_REG_ACCESS_TYPE) {
497 case TYPE_MOV_TO_DR:
498 /* don't need to check the range */
499 if (reg != REG_ESP)
500 v->arch.guest_context.debugreg[reg] = *reg_p;
501 else {
502 unsigned long value;
503 __vmread(GUEST_RSP, &value);
504 v->arch.guest_context.debugreg[reg] = value;
505 }
506 break;
507 case TYPE_MOV_FROM_DR:
508 if (reg != REG_ESP)
509 *reg_p = v->arch.guest_context.debugreg[reg];
510 else {
511 __vmwrite(GUEST_RSP, v->arch.guest_context.debugreg[reg]);
512 }
513 break;
514 }
515 }
517 /*
518 * Invalidate the TLB for va. Invalidate the shadow page corresponding
519 * to the address va.
520 */
521 static void vmx_vmexit_do_invlpg(unsigned long va)
522 {
523 unsigned long eip;
524 struct vcpu *v = current;
526 __vmread(GUEST_RIP, &eip);
528 VMX_DBG_LOG(DBG_LEVEL_VMMU, "vmx_vmexit_do_invlpg: eip=%lx, va=%lx",
529 eip, va);
531 /*
532 * We do the safest things first, then try to update the shadow
533 * copy from the guest.
534 */
535 shadow_invlpg(v, va);
536 }
538 static int check_for_null_selector(unsigned long eip)
539 {
540 unsigned char inst[MAX_INST_LEN];
541 unsigned long sel;
542 int i, inst_len;
543 int inst_copy_from_guest(unsigned char *, unsigned long, int);
545 __vmread(INSTRUCTION_LEN, &inst_len);
546 memset(inst, 0, MAX_INST_LEN);
547 if (inst_copy_from_guest(inst, eip, inst_len) != inst_len) {
548 printf("check_for_null_selector: get guest instruction failed\n");
549 domain_crash_synchronous();
550 }
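551 /* Scan the instruction's prefix bytes to find the segment register it actually uses. */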
552 for (i = 0; i < inst_len; i++) {
553 switch (inst[i]) {
554 case 0xf3: /* REPZ */
555 case 0xf2: /* REPNZ */
556 case 0xf0: /* LOCK */
557 case 0x66: /* data32 */
558 case 0x67: /* addr32 */
559 continue;
560 case 0x2e: /* CS */
561 __vmread(GUEST_CS_SELECTOR, &sel);
562 break;
563 case 0x36: /* SS */
564 __vmread(GUEST_SS_SELECTOR, &sel);
565 break;
566 case 0x26: /* ES */
567 __vmread(GUEST_ES_SELECTOR, &sel);
568 break;
569 case 0x64: /* FS */
570 __vmread(GUEST_FS_SELECTOR, &sel);
571 break;
572 case 0x65: /* GS */
573 __vmread(GUEST_GS_SELECTOR, &sel);
574 break;
575 case 0x3e: /* DS */
576 /* FALLTHROUGH */
577 default:
578 /* DS is the default */
579 __vmread(GUEST_DS_SELECTOR, &sel);
580 }
581 return sel == 0 ? 1 : 0;
582 }
584 return 0;
585 }
587 static void vmx_io_instruction(struct cpu_user_regs *regs,
588 unsigned long exit_qualification, unsigned long inst_len)
589 {
590 struct vcpu *d = current;
591 vcpu_iodata_t *vio;
592 ioreq_t *p;
593 unsigned long addr;
594 unsigned long eip, cs, eflags;
595 int vm86;
597 __vmread(GUEST_RIP, &eip);
598 __vmread(GUEST_CS_SELECTOR, &cs);
599 __vmread(GUEST_RFLAGS, &eflags);
600 vm86 = eflags & X86_EFLAGS_VM ? 1 : 0;
602 VMX_DBG_LOG(DBG_LEVEL_1,
603 "vmx_io_instruction: vm86 %d, eip=%lx:%lx, "
604 "exit_qualification = %lx",
605 vm86, cs, eip, exit_qualification);
607 if (test_bit(6, &exit_qualification))
608 addr = (exit_qualification >> 16) & (0xffff);
609 else
610 addr = regs->edx & 0xffff;
612 vio = get_vio(d->domain, d->vcpu_id);
613 if (vio == 0) {
614 printk("bad shared page: %lx", (unsigned long) vio);
615 domain_crash_synchronous();
616 }
617 p = &vio->vp_ioreq;
618 p->dir = test_bit(3, &exit_qualification); /* direction */
620 p->pdata_valid = 0;
621 p->count = 1;
622 p->size = (exit_qualification & 7) + 1;
624 if (test_bit(4, &exit_qualification)) { /* string instruction */
625 unsigned long laddr;
627 __vmread(GUEST_LINEAR_ADDRESS, &laddr);
628 /*
629 * In protected mode, guest linear address is invalid if the
630 * selector is null.
631 */
632 if (!vm86 && check_for_null_selector(eip)) {
633 laddr = (p->dir == IOREQ_WRITE) ? regs->esi : regs->edi;
634 }
635 p->pdata_valid = 1;
637 p->u.data = laddr;
638 if (vmx_paging_enabled(d))
639 p->u.pdata = (void *) gva_to_gpa(p->u.data);
640 p->df = (eflags & X86_EFLAGS_DF) ? 1 : 0;
642 if (test_bit(5, &exit_qualification)) /* "rep" prefix */
643 p->count = vm86 ? regs->ecx & 0xFFFF : regs->ecx;
645 /*
646 * Split up string I/O operations that cross page boundaries. Don't
647 * advance %eip so that "rep insb" will restart at the next page.
648 */
649 if ((p->u.data & PAGE_MASK) !=
650 ((p->u.data + p->count * p->size - 1) & PAGE_MASK)) {
651 VMX_DBG_LOG(DBG_LEVEL_2,
652 "String I/O crosses page boundary (cs:eip=0x%lx:0x%lx)\n",
653 cs, eip);
654 if (p->u.data & (p->size - 1)) {
655 printf("Unaligned string I/O operation (cs:eip=0x%lx:0x%lx)\n",
656 cs, eip);
657 domain_crash_synchronous();
658 }
659 p->count = (PAGE_SIZE - (p->u.data & ~PAGE_MASK)) / p->size;
660 } else {
661 __update_guest_eip(inst_len);
662 }
663 } else if (p->dir == IOREQ_WRITE) {
664 p->u.data = regs->eax;
665 __update_guest_eip(inst_len);
666 } else
667 __update_guest_eip(inst_len);
669 p->addr = addr;
670 p->port_mm = 0;
672 /* Check if the packet needs to be intercepted */
673 if (vmx_portio_intercept(p))
674 /* no blocking & no evtchn notification */
675 return;
677 set_bit(ARCH_VMX_IO_WAIT, &d->arch.arch_vmx.flags);
678 p->state = STATE_IOREQ_READY;
679 evtchn_send(iopacket_port(d->domain));
680 vmx_wait_io();
681 }
683 enum { COPY_IN = 0, COPY_OUT };
685 static inline int
686 vmx_copy(void *buf, unsigned long laddr, int size, int dir)
687 {
688 char *addr;
689 unsigned long mfn;
691 if ( (size + (laddr & (PAGE_SIZE - 1))) >= PAGE_SIZE )
692 {
693 printf("vmx_copy exceeds page boundary\n");
694 return 0;
695 }
697 mfn = phys_to_machine_mapping(laddr >> PAGE_SHIFT);
698 addr = (char *)map_domain_page(mfn) + (laddr & ~PAGE_MASK);
700 if (dir == COPY_IN)
701 memcpy(buf, addr, size);
702 else
703 memcpy(addr, buf, size);
705 unmap_domain_page(addr);
706 return 1;
707 }
709 int
710 vmx_world_save(struct vcpu *d, struct vmx_assist_context *c)
711 {
712 unsigned long inst_len;
713 int error = 0;
715 error |= __vmread(INSTRUCTION_LEN, &inst_len);
716 error |= __vmread(GUEST_RIP, &c->eip);
717 c->eip += inst_len; /* skip transition instruction */
718 error |= __vmread(GUEST_RSP, &c->esp);
719 error |= __vmread(GUEST_RFLAGS, &c->eflags);
721 error |= __vmread(CR0_READ_SHADOW, &c->cr0);
722 c->cr3 = d->arch.arch_vmx.cpu_cr3;
723 error |= __vmread(CR4_READ_SHADOW, &c->cr4);
725 error |= __vmread(GUEST_IDTR_LIMIT, &c->idtr_limit);
726 error |= __vmread(GUEST_IDTR_BASE, &c->idtr_base);
728 error |= __vmread(GUEST_GDTR_LIMIT, &c->gdtr_limit);
729 error |= __vmread(GUEST_GDTR_BASE, &c->gdtr_base);
731 error |= __vmread(GUEST_CS_SELECTOR, &c->cs_sel);
732 error |= __vmread(GUEST_CS_LIMIT, &c->cs_limit);
733 error |= __vmread(GUEST_CS_BASE, &c->cs_base);
734 error |= __vmread(GUEST_CS_AR_BYTES, &c->cs_arbytes.bytes);
736 error |= __vmread(GUEST_DS_SELECTOR, &c->ds_sel);
737 error |= __vmread(GUEST_DS_LIMIT, &c->ds_limit);
738 error |= __vmread(GUEST_DS_BASE, &c->ds_base);
739 error |= __vmread(GUEST_DS_AR_BYTES, &c->ds_arbytes.bytes);
741 error |= __vmread(GUEST_ES_SELECTOR, &c->es_sel);
742 error |= __vmread(GUEST_ES_LIMIT, &c->es_limit);
743 error |= __vmread(GUEST_ES_BASE, &c->es_base);
744 error |= __vmread(GUEST_ES_AR_BYTES, &c->es_arbytes.bytes);
746 error |= __vmread(GUEST_SS_SELECTOR, &c->ss_sel);
747 error |= __vmread(GUEST_SS_LIMIT, &c->ss_limit);
748 error |= __vmread(GUEST_SS_BASE, &c->ss_base);
749 error |= __vmread(GUEST_SS_AR_BYTES, &c->ss_arbytes.bytes);
751 error |= __vmread(GUEST_FS_SELECTOR, &c->fs_sel);
752 error |= __vmread(GUEST_FS_LIMIT, &c->fs_limit);
753 error |= __vmread(GUEST_FS_BASE, &c->fs_base);
754 error |= __vmread(GUEST_FS_AR_BYTES, &c->fs_arbytes.bytes);
756 error |= __vmread(GUEST_GS_SELECTOR, &c->gs_sel);
757 error |= __vmread(GUEST_GS_LIMIT, &c->gs_limit);
758 error |= __vmread(GUEST_GS_BASE, &c->gs_base);
759 error |= __vmread(GUEST_GS_AR_BYTES, &c->gs_arbytes.bytes);
761 error |= __vmread(GUEST_TR_SELECTOR, &c->tr_sel);
762 error |= __vmread(GUEST_TR_LIMIT, &c->tr_limit);
763 error |= __vmread(GUEST_TR_BASE, &c->tr_base);
764 error |= __vmread(GUEST_TR_AR_BYTES, &c->tr_arbytes.bytes);
766 error |= __vmread(GUEST_LDTR_SELECTOR, &c->ldtr_sel);
767 error |= __vmread(GUEST_LDTR_LIMIT, &c->ldtr_limit);
768 error |= __vmread(GUEST_LDTR_BASE, &c->ldtr_base);
769 error |= __vmread(GUEST_LDTR_AR_BYTES, &c->ldtr_arbytes.bytes);
771 return !error;
772 }
774 int
775 vmx_world_restore(struct vcpu *d, struct vmx_assist_context *c)
776 {
777 unsigned long mfn, old_cr4;
778 int error = 0;
780 error |= __vmwrite(GUEST_RIP, c->eip);
781 error |= __vmwrite(GUEST_RSP, c->esp);
782 error |= __vmwrite(GUEST_RFLAGS, c->eflags);
784 error |= __vmwrite(CR0_READ_SHADOW, c->cr0);
786 if (!vmx_paging_enabled(d)) {
787 VMX_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
788 __vmwrite(GUEST_CR3, pagetable_get_paddr(d->domain->arch.phys_table));
789 goto skip_cr3;
790 }
792 if (c->cr3 == d->arch.arch_vmx.cpu_cr3) {
793 /*
794 * This is simple TLB flush, implying the guest has
795 * removed some translation or changed page attributes.
796 * We simply invalidate the shadow.
797 */
798 mfn = phys_to_machine_mapping(c->cr3 >> PAGE_SHIFT);
799 if (mfn != pagetable_get_pfn(d->arch.guest_table)) {
800 printk("Invalid CR3 value=%x", c->cr3);
801 domain_crash_synchronous();
802 return 0;
803 }
804 shadow_sync_all(d->domain);
805 } else {
806 /*
807 * If different, make a shadow. Check if the PDBR is valid
808 * first.
809 */
810 VMX_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %x", c->cr3);
811 if ((c->cr3 >> PAGE_SHIFT) > d->domain->max_pages) {
812 printk("Invalid CR3 value=%x", c->cr3);
813 domain_crash_synchronous();
814 return 0;
815 }
816 mfn = phys_to_machine_mapping(c->cr3 >> PAGE_SHIFT);
817 d->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
818 update_pagetables(d);
819 /*
820 * arch.shadow_table should now hold the next CR3 for shadow
821 */
822 d->arch.arch_vmx.cpu_cr3 = c->cr3;
823 VMX_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3);
824 __vmwrite(GUEST_CR3, pagetable_get_paddr(d->arch.shadow_table));
825 }
827 skip_cr3:
829 error |= __vmread(CR4_READ_SHADOW, &old_cr4);
830 error |= __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
831 error |= __vmwrite(CR4_READ_SHADOW, c->cr4);
833 error |= __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
834 error |= __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
836 error |= __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
837 error |= __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
839 error |= __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
840 error |= __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
841 error |= __vmwrite(GUEST_CS_BASE, c->cs_base);
842 error |= __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes.bytes);
844 error |= __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
845 error |= __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
846 error |= __vmwrite(GUEST_DS_BASE, c->ds_base);
847 error |= __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes.bytes);
849 error |= __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
850 error |= __vmwrite(GUEST_ES_LIMIT, c->es_limit);
851 error |= __vmwrite(GUEST_ES_BASE, c->es_base);
852 error |= __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes.bytes);
854 error |= __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
855 error |= __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
856 error |= __vmwrite(GUEST_SS_BASE, c->ss_base);
857 error |= __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes.bytes);
859 error |= __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
860 error |= __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
861 error |= __vmwrite(GUEST_FS_BASE, c->fs_base);
862 error |= __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes.bytes);
864 error |= __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
865 error |= __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
866 error |= __vmwrite(GUEST_GS_BASE, c->gs_base);
867 error |= __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes.bytes);
869 error |= __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
870 error |= __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
871 error |= __vmwrite(GUEST_TR_BASE, c->tr_base);
872 error |= __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes.bytes);
874 error |= __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
875 error |= __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
876 error |= __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
877 error |= __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes.bytes);
879 return !error;
880 }
882 enum { VMX_ASSIST_INVOKE = 0, VMX_ASSIST_RESTORE };
884 int
885 vmx_assist(struct vcpu *d, int mode)
886 {
887 struct vmx_assist_context c;
888 u32 magic;
889 u32 cp;
891 /* make sure vmxassist exists (this is not an error) */
892 if (!vmx_copy(&magic, VMXASSIST_MAGIC_OFFSET, sizeof(magic), COPY_IN))
893 return 0;
894 if (magic != VMXASSIST_MAGIC)
895 return 0;
897 switch (mode) {
898 /*
899 * Transfer control to vmxassist.
900 * Store the current context in VMXASSIST_OLD_CONTEXT and load
901 * the new VMXASSIST_NEW_CONTEXT context. This context was created
902 * by vmxassist and will transfer control to it.
903 */
904 case VMX_ASSIST_INVOKE:
905 /* save the old context */
906 if (!vmx_copy(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp), COPY_IN))
907 goto error;
908 if (cp != 0) {
909 if (!vmx_world_save(d, &c))
910 goto error;
911 if (!vmx_copy(&c, cp, sizeof(c), COPY_OUT))
912 goto error;
913 }
915 /* restore the new context, this should activate vmxassist */
916 if (!vmx_copy(&cp, VMXASSIST_NEW_CONTEXT, sizeof(cp), COPY_IN))
917 goto error;
918 if (cp != 0) {
919 if (!vmx_copy(&c, cp, sizeof(c), COPY_IN))
920 goto error;
921 if (!vmx_world_restore(d, &c))
922 goto error;
923 return 1;
924 }
925 break;
927 /*
928 * Restore the VMXASSIST_OLD_CONTEXT that was saved by VMX_ASSIST_INVOKE
929 * above.
930 */
931 case VMX_ASSIST_RESTORE:
932 /* save the old context */
933 if (!vmx_copy(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp), COPY_IN))
934 goto error;
935 if (cp != 0) {
936 if (!vmx_copy(&c, cp, sizeof(c), COPY_IN))
937 goto error;
938 if (!vmx_world_restore(d, &c))
939 goto error;
940 return 1;
941 }
942 break;
943 }
945 error:
946 printf("Failed to transfer to vmxassist\n");
947 domain_crash_synchronous();
948 return 0;
949 }
951 static int vmx_set_cr0(unsigned long value)
952 {
953 struct vcpu *d = current;
954 unsigned long mfn;
955 unsigned long eip;
956 int paging_enabled;
957 unsigned long vm_entry_value;
958 /*
959 * CR0: We don't want to lose PE and PG.
960 */
961 paging_enabled = vmx_paging_enabled(d);
962 __vmwrite(GUEST_CR0, (value | X86_CR0_PE | X86_CR0_PG));
963 __vmwrite(CR0_READ_SHADOW, value);
965 VMX_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx\n", value);
967 if ((value & X86_CR0_PE) && (value & X86_CR0_PG) && !paging_enabled) {
968 /*
969 * The guest CR3 must be pointing to the guest physical.
970 */
971 if ( !VALID_MFN(mfn = phys_to_machine_mapping(
972 d->arch.arch_vmx.cpu_cr3 >> PAGE_SHIFT)) ||
973 !get_page(pfn_to_page(mfn), d->domain) )
974 {
975 printk("Invalid CR3 value = %lx", d->arch.arch_vmx.cpu_cr3);
976 domain_crash_synchronous(); /* need to take a clean path */
977 }
979 #if defined(__x86_64__)
980 if (test_bit(VMX_CPU_STATE_LME_ENABLED,
981 &d->arch.arch_vmx.cpu_state) &&
982 !test_bit(VMX_CPU_STATE_PAE_ENABLED,
983 &d->arch.arch_vmx.cpu_state)){
984 VMX_DBG_LOG(DBG_LEVEL_1, "Enable paging before PAE enable\n");
985 vmx_inject_exception(d, TRAP_gp_fault, 0);
986 }
987 if (test_bit(VMX_CPU_STATE_LME_ENABLED,
988 &d->arch.arch_vmx.cpu_state)){
989 /* PAE should already be enabled at this point */
990 VMX_DBG_LOG(DBG_LEVEL_1, "Enable the Long mode\n");
991 set_bit(VMX_CPU_STATE_LMA_ENABLED,
992 &d->arch.arch_vmx.cpu_state);
993 __vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
994 vm_entry_value |= VM_ENTRY_CONTROLS_IA32E_MODE;
995 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
997 #if CONFIG_PAGING_LEVELS >= 4
998 if(!shadow_set_guest_paging_levels(d->domain, 4)) {
999 printk("Unsupported guest paging levels\n");
1000 domain_crash_synchronous(); /* need to take a clean path */
1001 }
1002 #endif
1005 unsigned long crn;
1006 /* update CR4's PAE if needed */
1007 __vmread(GUEST_CR4, &crn);
1008 if ( (!(crn & X86_CR4_PAE)) &&
1009 test_bit(VMX_CPU_STATE_PAE_ENABLED,
1010 &d->arch.arch_vmx.cpu_state)){
1011 VMX_DBG_LOG(DBG_LEVEL_1, "enable PAE on cr4\n");
1012 __vmwrite(GUEST_CR4, crn | X86_CR4_PAE);
1013 }
1014 #elif defined( __i386__)
1015 unsigned long old_base_mfn;
1016 old_base_mfn = pagetable_get_pfn(d->arch.guest_table);
1017 if (old_base_mfn)
1018 put_page(pfn_to_page(old_base_mfn));
1019 #endif
1020 /*
1021 * Now arch.guest_table points to machine physical.
1022 */
1023 d->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
1024 update_pagetables(d);
1026 VMX_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
1027 (unsigned long) (mfn << PAGE_SHIFT));
1029 __vmwrite(GUEST_CR3, pagetable_get_paddr(d->arch.shadow_table));
1030 /*
1031 * arch->shadow_table should hold the next CR3 for shadow
1032 */
1033 VMX_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
1034 d->arch.arch_vmx.cpu_cr3, mfn);
1035 }
1037 /*
1038 * VMX does not implement real-mode virtualization. We emulate
1039 * real-mode by performing a world switch to VMXAssist whenever
1040 * a partition disables the CR0.PE bit.
1041 */
1042 if ((value & X86_CR0_PE) == 0) {
1043 if ( value & X86_CR0_PG ) {
1044 /* inject GP here */
1045 vmx_inject_exception(d, TRAP_gp_fault, 0);
1046 return 0;
1047 } else {
1048 /*
1049 * Disable paging here.
1050 * Same to PE == 1 && PG == 0
1051 */
1052 if (test_bit(VMX_CPU_STATE_LMA_ENABLED,
1053 &d->arch.arch_vmx.cpu_state)){
1054 clear_bit(VMX_CPU_STATE_LMA_ENABLED,
1055 &d->arch.arch_vmx.cpu_state);
1056 __vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
1057 vm_entry_value &= ~VM_ENTRY_CONTROLS_IA32E_MODE;
1058 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1059 }
1060 }
1061 __vmread(GUEST_RIP, &eip);
1062 VMX_DBG_LOG(DBG_LEVEL_1,
1063 "Disabling CR0.PE at %%eip 0x%lx\n", eip);
1064 if (vmx_assist(d, VMX_ASSIST_INVOKE)) {
1065 set_bit(VMX_CPU_STATE_ASSIST_ENABLED, &d->arch.arch_vmx.cpu_state);
1066 __vmread(GUEST_RIP, &eip);
1067 VMX_DBG_LOG(DBG_LEVEL_1,
1068 "Transfering control to vmxassist %%eip 0x%lx\n", eip);
1069 return 0; /* do not update eip! */
1070 }
1071 } else if (test_bit(VMX_CPU_STATE_ASSIST_ENABLED,
1072 &d->arch.arch_vmx.cpu_state)) {
1073 __vmread(GUEST_RIP, &eip);
1074 VMX_DBG_LOG(DBG_LEVEL_1,
1075 "Enabling CR0.PE at %%eip 0x%lx\n", eip);
1076 if (vmx_assist(d, VMX_ASSIST_RESTORE)) {
1077 clear_bit(VMX_CPU_STATE_ASSIST_ENABLED,
1078 &d->arch.arch_vmx.cpu_state);
1079 __vmread(GUEST_RIP, &eip);
1080 VMX_DBG_LOG(DBG_LEVEL_1,
1081 "Restoring to %%eip 0x%lx\n", eip);
1082 return 0; /* do not update eip! */
1083 }
1084 }
1086 return 1;
1087 }
1089 #define CASE_GET_REG(REG, reg) \
1090 case REG_ ## REG: value = regs->reg; break
1092 #define CASE_EXTEND_SET_REG \
1093 CASE_EXTEND_REG(S)
1094 #define CASE_EXTEND_GET_REG \
1095 CASE_EXTEND_REG(G)
1097 #ifdef __i386__
1098 #define CASE_EXTEND_REG(T)
1099 #else
1100 #define CASE_EXTEND_REG(T) \
1101 CASE_ ## T ## ET_REG(R8, r8); \
1102 CASE_ ## T ## ET_REG(R9, r9); \
1103 CASE_ ## T ## ET_REG(R10, r10); \
1104 CASE_ ## T ## ET_REG(R11, r11); \
1105 CASE_ ## T ## ET_REG(R12, r12); \
1106 CASE_ ## T ## ET_REG(R13, r13); \
1107 CASE_ ## T ## ET_REG(R14, r14); \
1108 CASE_ ## T ## ET_REG(R15, r15);
1109 #endif
1112 /*
1113 * Write to control registers
1114 */
1115 static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
1116 {
1117 unsigned long value;
1118 unsigned long old_cr;
1119 struct vcpu *d = current;
1121 switch (gp) {
1122 CASE_GET_REG(EAX, eax);
1123 CASE_GET_REG(ECX, ecx);
1124 CASE_GET_REG(EDX, edx);
1125 CASE_GET_REG(EBX, ebx);
1126 CASE_GET_REG(EBP, ebp);
1127 CASE_GET_REG(ESI, esi);
1128 CASE_GET_REG(EDI, edi);
1129 CASE_EXTEND_GET_REG
1130 case REG_ESP:
1131 __vmread(GUEST_RSP, &value);
1132 break;
1133 default:
1134 printk("invalid gp: %d\n", gp);
1135 __vmx_bug(regs);
1136 }
1138 VMX_DBG_LOG(DBG_LEVEL_1, "mov_to_cr: CR%d, value = %lx,", cr, value);
1139 VMX_DBG_LOG(DBG_LEVEL_1, "current = %lx,", (unsigned long) current);
1141 switch(cr) {
1142 case 0:
1144 return vmx_set_cr0(value);
1146 case 3:
1147 {
1148 unsigned long old_base_mfn, mfn;
1150 /*
1151 * If paging is not enabled yet, simply copy the value to CR3.
1152 */
1153 if (!vmx_paging_enabled(d)) {
1154 d->arch.arch_vmx.cpu_cr3 = value;
1155 break;
1156 }
1158 /*
1159 * We make a new one if the shadow does not exist.
1160 */
1161 if (value == d->arch.arch_vmx.cpu_cr3) {
1162 /*
1163 * This is simple TLB flush, implying the guest has
1164 * removed some translation or changed page attributes.
1165 * We simply invalidate the shadow.
1166 */
1167 mfn = phys_to_machine_mapping(value >> PAGE_SHIFT);
1168 if (mfn != pagetable_get_pfn(d->arch.guest_table))
1169 __vmx_bug(regs);
1170 shadow_sync_all(d->domain);
1171 } else {
1172 /*
1173 * If different, make a shadow. Check if the PDBR is valid
1174 * first.
1175 */
1176 VMX_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
1177 if ( ((value >> PAGE_SHIFT) > d->domain->max_pages ) ||
1178 !VALID_MFN(mfn = phys_to_machine_mapping(value >> PAGE_SHIFT)) ||
1179 !get_page(pfn_to_page(mfn), d->domain) )
1180 {
1181 printk("Invalid CR3 value=%lx", value);
1182 domain_crash_synchronous(); /* need to take a clean path */
1183 }
1184 old_base_mfn = pagetable_get_pfn(d->arch.guest_table);
1185 d->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
1186 if (old_base_mfn)
1187 put_page(pfn_to_page(old_base_mfn));
1188 update_pagetables(d);
1189 /*
1190 * arch.shadow_table should now hold the next CR3 for shadow
1191 */
1192 d->arch.arch_vmx.cpu_cr3 = value;
1193 VMX_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx",
1194 value);
1195 __vmwrite(GUEST_CR3, pagetable_get_paddr(d->arch.shadow_table));
1196 }
1197 break;
1198 }
1199 case 4:
1200 {
1201 /* CR4 */
1202 unsigned long old_guest_cr;
1204 __vmread(GUEST_CR4, &old_guest_cr);
1205 if (value & X86_CR4_PAE){
1206 set_bit(VMX_CPU_STATE_PAE_ENABLED, &d->arch.arch_vmx.cpu_state);
1207 } else {
1208 if (test_bit(VMX_CPU_STATE_LMA_ENABLED,
1209 &d->arch.arch_vmx.cpu_state)){
1210 vmx_inject_exception(d, TRAP_gp_fault, 0);
1211 }
1212 clear_bit(VMX_CPU_STATE_PAE_ENABLED, &d->arch.arch_vmx.cpu_state);
1213 }
1215 __vmread(CR4_READ_SHADOW, &old_cr);
1217 __vmwrite(GUEST_CR4, value| VMX_CR4_HOST_MASK);
1218 __vmwrite(CR4_READ_SHADOW, value);
1220 /*
1221 * Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates
1222 * all TLB entries except global entries.
1223 */
1224 if ((old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE)) {
1225 shadow_sync_all(d->domain);
1226 }
1227 break;
1228 }
1229 default:
1230 printk("invalid cr: %d\n", gp);
1231 __vmx_bug(regs);
1232 }
1234 return 1;
1235 }
1237 #define CASE_SET_REG(REG, reg) \
1238 case REG_ ## REG: \
1239 regs->reg = value; \
1240 break
1242 /*
1243 * Read from control registers. CR0 and CR4 are read from the shadow.
1244 */
1245 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
1246 {
1247 unsigned long value;
1248 struct vcpu *d = current;
1250 if (cr != 3)
1251 __vmx_bug(regs);
1253 value = (unsigned long) d->arch.arch_vmx.cpu_cr3;
1255 switch (gp) {
1256 CASE_SET_REG(EAX, eax);
1257 CASE_SET_REG(ECX, ecx);
1258 CASE_SET_REG(EDX, edx);
1259 CASE_SET_REG(EBX, ebx);
1260 CASE_SET_REG(EBP, ebp);
1261 CASE_SET_REG(ESI, esi);
1262 CASE_SET_REG(EDI, edi);
1263 CASE_EXTEND_SET_REG
1264 case REG_ESP:
1265 __vmwrite(GUEST_RSP, value);
1266 regs->esp = value;
1267 break;
1268 default:
1269 printk("invalid gp: %d\n", gp);
1270 __vmx_bug(regs);
1271 }
1273 VMX_DBG_LOG(DBG_LEVEL_VMMU, "mov_from_cr: CR%d, value = %lx,", cr, value);
1274 }
1276 static int vmx_cr_access(unsigned long exit_qualification, struct cpu_user_regs *regs)
1277 {
1278 unsigned int gp, cr;
1279 unsigned long value;
1281 switch (exit_qualification & CONTROL_REG_ACCESS_TYPE) {
1282 case TYPE_MOV_TO_CR:
1283 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
1284 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
1285 return mov_to_cr(gp, cr, regs);
1286 case TYPE_MOV_FROM_CR:
1287 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
1288 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
1289 mov_from_cr(cr, gp, regs);
1290 break;
1291 case TYPE_CLTS:
1292 clts();
1293 setup_fpu(current);
1295 __vmread(GUEST_CR0, &value);
1296 value &= ~X86_CR0_TS; /* clear TS */
1297 __vmwrite(GUEST_CR0, value);
1299 __vmread(CR0_READ_SHADOW, &value);
1300 value &= ~X86_CR0_TS; /* clear TS */
1301 __vmwrite(CR0_READ_SHADOW, value);
1302 break;
1303 case TYPE_LMSW:
1304 __vmread(CR0_READ_SHADOW, &value);
1305 value = (value & ~0xF) |
1306 (((exit_qualification & LMSW_SOURCE_DATA) >> 16) & 0xF);
1307 return vmx_set_cr0(value);
1308 break;
1309 default:
1310 __vmx_bug(regs);
1311 break;
1312 }
1313 return 1;
1314 }
1316 static inline void vmx_do_msr_read(struct cpu_user_regs *regs)
1317 {
1318 VMX_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_read: ecx=%lx, eax=%lx, edx=%lx",
1319 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1320 (unsigned long)regs->edx);
1321 switch (regs->ecx) {
1322 case MSR_IA32_SYSENTER_CS:
1323 __vmread(GUEST_SYSENTER_CS, &regs->eax);
1324 regs->edx = 0;
1325 break;
1326 case MSR_IA32_SYSENTER_ESP:
1327 __vmread(GUEST_SYSENTER_ESP, &regs->eax);
1328 regs->edx = 0;
1329 break;
1330 case MSR_IA32_SYSENTER_EIP:
1331 __vmread(GUEST_SYSENTER_EIP, &regs->eax);
1332 regs->edx = 0;
1333 break;
1334 default:
1335 if(long_mode_do_msr_read(regs))
1336 return;
1337 rdmsr_user(regs->ecx, regs->eax, regs->edx);
1338 break;
1339 }
1341 VMX_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_read returns: "
1342 "ecx=%lx, eax=%lx, edx=%lx",
1343 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1344 (unsigned long)regs->edx);
1345 }
1347 static inline void vmx_do_msr_write(struct cpu_user_regs *regs)
1348 {
1349 VMX_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_write: ecx=%lx, eax=%lx, edx=%lx",
1350 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1351 (unsigned long)regs->edx);
1352 switch (regs->ecx) {
1353 case MSR_IA32_SYSENTER_CS:
1354 __vmwrite(GUEST_SYSENTER_CS, regs->eax);
1355 break;
1356 case MSR_IA32_SYSENTER_ESP:
1357 __vmwrite(GUEST_SYSENTER_ESP, regs->eax);
1358 break;
1359 case MSR_IA32_SYSENTER_EIP:
1360 __vmwrite(GUEST_SYSENTER_EIP, regs->eax);
1361 break;
1362 default:
1363 long_mode_do_msr_write(regs);
1364 break;
1365 }
1367 VMX_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_write returns: "
1368 "ecx=%lx, eax=%lx, edx=%lx",
1369 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1370 (unsigned long)regs->edx);
1371 }
1373 /*
1374 * Need to use this exit to reschedule
1375 */
1376 static inline void vmx_vmexit_do_hlt(void)
1377 {
1378 #if VMX_DEBUG
1379 unsigned long eip;
1380 __vmread(GUEST_RIP, &eip);
1381 #endif
1382 VMX_DBG_LOG(DBG_LEVEL_1, "vmx_vmexit_do_hlt:eip=%lx", eip);
1383 raise_softirq(SCHEDULE_SOFTIRQ);
1384 }
1386 static inline void vmx_vmexit_do_extint(struct cpu_user_regs *regs)
1387 {
1388 unsigned int vector;
1389 int error;
1391 asmlinkage void do_IRQ(struct cpu_user_regs *);
1392 void smp_apic_timer_interrupt(struct cpu_user_regs *);
1393 void timer_interrupt(int, void *, struct cpu_user_regs *);
1394 void smp_event_check_interrupt(void);
1395 void smp_invalidate_interrupt(void);
1396 void smp_call_function_interrupt(void);
1397 void smp_spurious_interrupt(struct cpu_user_regs *regs);
1398 void smp_error_interrupt(struct cpu_user_regs *regs);
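1399 /* Read the exiting vector and hand the interrupt to Xen's own handler. */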
1400 if ((error = __vmread(VM_EXIT_INTR_INFO, &vector))
1401 && !(vector & INTR_INFO_VALID_MASK))
1402 __vmx_bug(regs);
1404 vector &= 0xff;
1405 local_irq_disable();
1407 switch(vector) {
1408 case LOCAL_TIMER_VECTOR:
1409 smp_apic_timer_interrupt(regs);
1410 break;
1411 case EVENT_CHECK_VECTOR:
1412 smp_event_check_interrupt();
1413 break;
1414 case INVALIDATE_TLB_VECTOR:
1415 smp_invalidate_interrupt();
1416 break;
1417 case CALL_FUNCTION_VECTOR:
1418 smp_call_function_interrupt();
1419 break;
1420 case SPURIOUS_APIC_VECTOR:
1421 smp_spurious_interrupt(regs);
1422 break;
1423 case ERROR_APIC_VECTOR:
1424 smp_error_interrupt(regs);
1425 break;
1426 default:
1427 regs->entry_vector = vector;
1428 do_IRQ(regs);
1429 break;
1430 }
1431 }
1433 static inline void vmx_vmexit_do_mwait(void)
1434 {
1435 #if VMX_DEBUG
1436 unsigned long eip;
1437 __vmread(GUEST_RIP, &eip);
1438 #endif
1439 VMX_DBG_LOG(DBG_LEVEL_1, "vmx_vmexit_do_mwait:eip=%lx", eip);
1440 raise_softirq(SCHEDULE_SOFTIRQ);
1441 }
1443 #define BUF_SIZ 256
1444 #define MAX_LINE 80
1445 char print_buf[BUF_SIZ];
1446 static int index;
1448 static void vmx_print_line(const char c, struct vcpu *d)
1449 {
1451 if (index == MAX_LINE || c == '\n') {
1452 if (index == MAX_LINE) {
1453 print_buf[index++] = c;
1454 }
1455 print_buf[index] = '\0';
1456 printk("(GUEST: %u) %s\n", d->domain->domain_id, (char *) &print_buf);
1457 index = 0;
1458 }
1459 else
1460 print_buf[index++] = c;
1461 }
1463 void save_vmx_cpu_user_regs(struct cpu_user_regs *ctxt)
1464 {
1465 __vmread(GUEST_SS_SELECTOR, &ctxt->ss);
1466 __vmread(GUEST_RSP, &ctxt->esp);
1467 __vmread(GUEST_RFLAGS, &ctxt->eflags);
1468 __vmread(GUEST_CS_SELECTOR, &ctxt->cs);
1469 __vmread(GUEST_RIP, &ctxt->eip);
1471 __vmread(GUEST_GS_SELECTOR, &ctxt->gs);
1472 __vmread(GUEST_FS_SELECTOR, &ctxt->fs);
1473 __vmread(GUEST_ES_SELECTOR, &ctxt->es);
1474 __vmread(GUEST_DS_SELECTOR, &ctxt->ds);
1477 #ifdef XEN_DEBUGGER
1478 void save_cpu_user_regs(struct cpu_user_regs *regs)
1479 {
1480 __vmread(GUEST_SS_SELECTOR, &regs->xss);
1481 __vmread(GUEST_RSP, &regs->esp);
1482 __vmread(GUEST_RFLAGS, &regs->eflags);
1483 __vmread(GUEST_CS_SELECTOR, &regs->xcs);
1484 __vmread(GUEST_RIP, &regs->eip);
1486 __vmread(GUEST_GS_SELECTOR, &regs->xgs);
1487 __vmread(GUEST_FS_SELECTOR, &regs->xfs);
1488 __vmread(GUEST_ES_SELECTOR, &regs->xes);
1489 __vmread(GUEST_DS_SELECTOR, &regs->xds);
1492 void restore_cpu_user_regs(struct cpu_user_regs *regs)
1493 {
1494 __vmwrite(GUEST_SS_SELECTOR, regs->xss);
1495 __vmwrite(GUEST_RSP, regs->esp);
1496 __vmwrite(GUEST_RFLAGS, regs->eflags);
1497 __vmwrite(GUEST_CS_SELECTOR, regs->xcs);
1498 __vmwrite(GUEST_RIP, regs->eip);
1500 __vmwrite(GUEST_GS_SELECTOR, regs->xgs);
1501 __vmwrite(GUEST_FS_SELECTOR, regs->xfs);
1502 __vmwrite(GUEST_ES_SELECTOR, regs->xes);
1503 __vmwrite(GUEST_DS_SELECTOR, regs->xds);
1504 }
1505 #endif
1507 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs regs)
1508 {
1509 unsigned int exit_reason, idtv_info_field;
1510 unsigned long exit_qualification, eip, inst_len = 0;
1511 struct vcpu *v = current;
1512 int error;
1514 if ((error = __vmread(VM_EXIT_REASON, &exit_reason)))
1515 __vmx_bug(&regs);
1517 perfc_incra(vmexits, exit_reason);
1519 __vmread(IDT_VECTORING_INFO_FIELD, &idtv_info_field);
1520 if (idtv_info_field & INTR_INFO_VALID_MASK) {
1521 if ((idtv_info_field & 0x0700) != 0x400) { /* exclude soft ints */
1522 __vmwrite(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field);
1524 if (idtv_info_field & 0x800) { /* valid error code */
1525 unsigned long error_code;
1526 __vmread(VM_EXIT_INTR_ERROR_CODE, &error_code);
1527 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
1528 }
1529 }
1530 VMX_DBG_LOG(DBG_LEVEL_1, "idtv_info_field=%x", idtv_info_field);
1531 }
1533 /* don't bother with H/W interrupts */
1534 if (exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT &&
1535 exit_reason != EXIT_REASON_VMCALL &&
1536 exit_reason != EXIT_REASON_IO_INSTRUCTION)
1537 VMX_DBG_LOG(DBG_LEVEL_0, "exit reason = %x", exit_reason);
1539 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
1540 printk("Failed vm entry\n");
1541 domain_crash_synchronous();
1542 return;
1543 }
1545 __vmread(GUEST_RIP, &eip);
1546 TRACE_3D(TRC_VMX_VMEXIT, v->domain->domain_id, eip, exit_reason);
1548 switch (exit_reason) {
1549 case EXIT_REASON_EXCEPTION_NMI:
1550 {
1551 /*
1552 * We don't set the software-interrupt exiting (INT n).
1553 * (1) We can get an exception (e.g. #PG) in the guest, or
1554 * (2) NMI
1555 */
1556 int error;
1557 unsigned int vector;
1558 unsigned long va;
1560 if ((error = __vmread(VM_EXIT_INTR_INFO, &vector))
1561 || !(vector & INTR_INFO_VALID_MASK))
1562 __vmx_bug(&regs);
1563 vector &= 0xff;
1565 perfc_incra(cause_vector, vector);
1567 TRACE_3D(TRC_VMX_VECTOR, v->domain->domain_id, eip, vector);
1568 switch (vector) {
1569 #ifdef XEN_DEBUGGER
1570 case TRAP_debug:
1572 save_cpu_user_regs(&regs);
1573 pdb_handle_exception(1, &regs, 1);
1574 restore_cpu_user_regs(&regs);
1575 break;
1577 case TRAP_int3:
1579 save_cpu_user_regs(&regs);
1580 pdb_handle_exception(3, &regs, 1);
1581 restore_cpu_user_regs(&regs);
1582 break;
1584 #else
1585 case TRAP_debug:
1587 void store_cpu_user_regs(struct cpu_user_regs *regs);
1588 long do_sched_op(unsigned long op);
1591 store_cpu_user_regs(&regs);
1592 __vm_clear_bit(GUEST_PENDING_DBG_EXCEPTIONS, PENDING_DEBUG_EXC_BS);
1594 set_bit(_VCPUF_ctrl_pause, &current->vcpu_flags);
1595 do_sched_op(SCHEDOP_yield);
1597 break;
1599 #endif
1600 case TRAP_no_device:
1602 vmx_do_no_device_fault();
1603 break;
1605 case TRAP_page_fault:
1607 __vmread(EXIT_QUALIFICATION, &va);
1608 __vmread(VM_EXIT_INTR_ERROR_CODE, &regs.error_code);
1609 VMX_DBG_LOG(DBG_LEVEL_VMMU,
1610 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
1611 (unsigned long)regs.eax, (unsigned long)regs.ebx,
1612 (unsigned long)regs.ecx, (unsigned long)regs.edx,
1613 (unsigned long)regs.esi, (unsigned long)regs.edi);
1614 v->domain->arch.vmx_platform.mpci.inst_decoder_regs = &regs;
1616 if (!(error = vmx_do_page_fault(va, &regs))) {
1617 /*
1618 * Inject #PG using Interruption-Information Fields
1619 */
1620 vmx_inject_exception(v, TRAP_page_fault, regs.error_code);
1621 v->arch.arch_vmx.cpu_cr2 = va;
1622 TRACE_3D(TRC_VMX_INT, v->domain->domain_id, TRAP_page_fault, va);
1624 break;
1626 case TRAP_nmi:
1627 do_nmi(&regs, 0);
1628 break;
1629 default:
1630 vmx_reflect_exception(v);
1631 break;
1632 }
1633 break;
1634 }
1635 case EXIT_REASON_EXTERNAL_INTERRUPT:
1636 vmx_vmexit_do_extint(&regs);
1637 break;
1638 case EXIT_REASON_PENDING_INTERRUPT:
1639 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
1640 MONITOR_CPU_BASED_EXEC_CONTROLS);
1641 break;
1642 case EXIT_REASON_TASK_SWITCH:
1643 __vmx_bug(&regs);
1644 break;
1645 case EXIT_REASON_CPUID:
1646 __get_instruction_length(inst_len);
1647 vmx_vmexit_do_cpuid(regs.eax, &regs);
1648 __update_guest_eip(inst_len);
1649 break;
1650 case EXIT_REASON_HLT:
1651 __get_instruction_length(inst_len);
1652 __update_guest_eip(inst_len);
1653 vmx_vmexit_do_hlt();
1654 break;
1655 case EXIT_REASON_INVLPG:
1657 unsigned long va;
1659 __vmread(EXIT_QUALIFICATION, &va);
1660 vmx_vmexit_do_invlpg(va);
1661 __get_instruction_length(inst_len);
1662 __update_guest_eip(inst_len);
1663 break;
1665 case EXIT_REASON_VMCALL:
1666 __get_instruction_length(inst_len);
1667 __vmread(GUEST_RIP, &eip);
1668 __vmread(EXIT_QUALIFICATION, &exit_qualification);
1670 vmx_print_line(regs.eax, v); /* provides the current domain */
1671 __update_guest_eip(inst_len);
1672 break;
1673 case EXIT_REASON_CR_ACCESS:
1675 __vmread(GUEST_RIP, &eip);
1676 __get_instruction_length(inst_len);
1677 __vmread(EXIT_QUALIFICATION, &exit_qualification);
1679 VMX_DBG_LOG(DBG_LEVEL_1, "eip = %lx, inst_len =%lx, exit_qualification = %lx",
1680 eip, inst_len, exit_qualification);
1681 if (vmx_cr_access(exit_qualification, &regs))
1682 __update_guest_eip(inst_len);
1683 break;
1685 case EXIT_REASON_DR_ACCESS:
1686 __vmread(EXIT_QUALIFICATION, &exit_qualification);
1687 vmx_dr_access(exit_qualification, &regs);
1688 __get_instruction_length(inst_len);
1689 __update_guest_eip(inst_len);
1690 break;
1691 case EXIT_REASON_IO_INSTRUCTION:
1692 __vmread(EXIT_QUALIFICATION, &exit_qualification);
1693 __get_instruction_length(inst_len);
1694 vmx_io_instruction(&regs, exit_qualification, inst_len);
1695 break;
1696 case EXIT_REASON_MSR_READ:
1697 __get_instruction_length(inst_len);
1698 vmx_do_msr_read(&regs);
1699 __update_guest_eip(inst_len);
1700 break;
1701 case EXIT_REASON_MSR_WRITE:
1702 __vmread(GUEST_RIP, &eip);
1703 vmx_do_msr_write(&regs);
1704 __get_instruction_length(inst_len);
1705 __update_guest_eip(inst_len);
1706 break;
1707 case EXIT_REASON_MWAIT_INSTRUCTION:
1708 __get_instruction_length(inst_len);
1709 __update_guest_eip(inst_len);
1710 vmx_vmexit_do_mwait();
1711 break;
1712 default:
1713 __vmx_bug(&regs); /* should not happen */
1716 vmx_intr_assist(v);
1717 return;
1718 }
1720 asmlinkage void load_cr2(void)
1721 {
1722 struct vcpu *d = current;
1724 local_irq_disable();
1725 #ifdef __i386__
1726 asm volatile("movl %0,%%cr2": :"r" (d->arch.arch_vmx.cpu_cr2));
1727 #else
1728 asm volatile("movq %0,%%cr2": :"r" (d->arch.arch_vmx.cpu_cr2));
1729 #endif
1730 }
1732 #endif /* CONFIG_VMX */
1734 /*
1735 * Local variables:
1736 * mode: C
1737 * c-set-style: "BSD"
1738 * c-basic-offset: 4
1739 * tab-width: 4
1740 * indent-tabs-mode: nil
1741 * End:
1742 */