ia64/xen-unstable

view xen/arch/x86/vmx.c @ 5718:c270d9ffdcef

Remove debug printks.
Sometimes they generate too much output on serial console.
Signed-off-by: Arun Sharma <arun.sharma@intel.com>
author kaf24@firebug.cl.cam.ac.uk
date Mon Jul 11 08:59:58 2005 +0000 (2005-07-11)
parents ff5d7ccd8d69
children dd798dd2abce
line source
1 /*
2 * vmx.c: handling VMX architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 *
18 */
20 #include <xen/config.h>
21 #include <xen/init.h>
22 #include <xen/lib.h>
23 #include <xen/trace.h>
24 #include <xen/sched.h>
25 #include <xen/irq.h>
26 #include <xen/softirq.h>
27 #include <xen/domain_page.h>
28 #include <asm/current.h>
29 #include <asm/io.h>
30 #include <asm/shadow.h>
31 #include <asm/regs.h>
32 #include <asm/cpufeature.h>
33 #include <asm/processor.h>
34 #include <asm/types.h>
35 #include <asm/msr.h>
36 #include <asm/spinlock.h>
37 #include <asm/vmx.h>
38 #include <asm/vmx_vmcs.h>
39 #include <asm/vmx_intercept.h>
40 #include <asm/shadow.h>
41 #include <public/io/ioreq.h>
43 #ifdef CONFIG_VMX
45 int vmcs_size;
46 unsigned int opt_vmx_debug_level = 0;
47 integer_param("vmx_debug", opt_vmx_debug_level);
49 #ifdef __x86_64__
50 static struct msr_state percpu_msr[NR_CPUS];
52 static u32 msr_data_index[VMX_MSR_COUNT] =
53 {
54 MSR_LSTAR, MSR_STAR, MSR_CSTAR,
55 MSR_SYSCALL_MASK, MSR_EFER,
56 };
58 /*
59 * To avoid MSR save/restore at every VM exit/entry time, we restore
60 * the x86_64 specific MSRs at domain switch time. Since those MSRs
61 * are not modified once set for generic domains, we don't save them,
62 * but simply reset them to the values set at percpu_traps_init().
63 */
64 void vmx_load_msrs(struct vcpu *p, struct vcpu *n)
65 {
66 struct msr_state *host_state;
67 host_state = &percpu_msr[smp_processor_id()];
69 while (host_state->flags){
70 int i;
72 i = find_first_set_bit(host_state->flags);
73 wrmsrl(msr_data_index[i], host_state->msr_items[i]);
74 clear_bit(i, &host_state->flags);
75 }
76 }
78 static void vmx_save_init_msrs(void)
79 {
80 struct msr_state *host_state;
81 host_state = &percpu_msr[smp_processor_id()];
82 int i;
84 for (i = 0; i < VMX_MSR_COUNT; i++)
85 rdmsrl(msr_data_index[i], host_state->msr_items[i]);
86 }
88 #define CASE_READ_MSR(address) \
89 case MSR_ ## address: \
90 msr_content = msr->msr_items[VMX_INDEX_MSR_ ## address]; \
91 break
93 #define CASE_WRITE_MSR(address) \
94 case MSR_ ## address: \
95 msr->msr_items[VMX_INDEX_MSR_ ## address] = msr_content; \
96 if (!test_bit(VMX_INDEX_MSR_ ## address, &msr->flags)){ \
97 set_bit(VMX_INDEX_MSR_ ## address, &msr->flags); \
98 }\
99 break
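For reference, a hand expansion of the two macros for the STAR MSR (this is what the preprocessor generates, shown here only for illustration):

    /* CASE_READ_MSR(STAR) */
    case MSR_STAR:
        msr_content = msr->msr_items[VMX_INDEX_MSR_STAR];
        break;

    /* CASE_WRITE_MSR(STAR) */
    case MSR_STAR:
        msr->msr_items[VMX_INDEX_MSR_STAR] = msr_content;
        if (!test_bit(VMX_INDEX_MSR_STAR, &msr->flags)) {
            set_bit(VMX_INDEX_MSR_STAR, &msr->flags);
        }
        break;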
101 #define IS_CANO_ADDRESS(add) 1
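Note that IS_CANO_ADDRESS() is a stub that accepts every value. A real canonicality test for 48-bit virtual addresses could look like the sketch below (an illustration only; is_canonical_address() is not defined in this changeset):

    static inline int is_canonical_address(u64 addr)
    {
        /* canonical iff bits 63..47 are all copies of bit 47 */
        u64 top = addr >> 47;
        return (top == 0) || (top == 0x1ffff);
    }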
102 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
103 {
104 u64 msr_content = 0;
105 struct vcpu *vc = current;
106 struct msr_state * msr = &vc->arch.arch_vmx.msr_content;
107 switch(regs->ecx){
108 case MSR_EFER:
109 msr_content = msr->msr_items[VMX_INDEX_MSR_EFER];
110 VMX_DBG_LOG(DBG_LEVEL_2, "EFER msr_content %llx\n", (unsigned long long)msr_content);
111 if (test_bit(VMX_CPU_STATE_LME_ENABLED,
112 &vc->arch.arch_vmx.cpu_state))
113 msr_content |= 1 << _EFER_LME;
115 if (VMX_LONG_GUEST(vc))
116 msr_content |= 1 << _EFER_LMA;
117 break;
118 case MSR_FS_BASE:
119 if (!(VMX_LONG_GUEST(vc)))
120 /* XXX should it be GP fault */
121 domain_crash();
122 __vmread(GUEST_FS_BASE, &msr_content);
123 break;
124 case MSR_GS_BASE:
125 if (!(VMX_LONG_GUEST(vc)))
126 domain_crash();
127 __vmread(GUEST_GS_BASE, &msr_content);
128 break;
129 case MSR_SHADOW_GS_BASE:
130 msr_content = msr->shadow_gs;
131 break;
133 CASE_READ_MSR(STAR);
134 CASE_READ_MSR(LSTAR);
135 CASE_READ_MSR(CSTAR);
136 CASE_READ_MSR(SYSCALL_MASK);
137 default:
138 return 0;
139 }
140 VMX_DBG_LOG(DBG_LEVEL_2, "mode_do_msr_read: msr_content: %lx\n", msr_content);
141 regs->eax = msr_content & 0xffffffff;
142 regs->edx = msr_content >> 32;
143 return 1;
144 }
146 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
147 {
148 u64 msr_content = regs->eax | ((u64)regs->edx << 32);
149 struct vcpu *vc = current;
150 struct msr_state * msr = &vc->arch.arch_vmx.msr_content;
151 struct msr_state * host_state =
152 &percpu_msr[smp_processor_id()];
154 VMX_DBG_LOG(DBG_LEVEL_1, " mode_do_msr_write msr %lx msr_content %lx\n",
155 regs->ecx, msr_content);
157 switch (regs->ecx){
158 case MSR_EFER:
159 if ((msr_content & EFER_LME) ^
160 test_bit(VMX_CPU_STATE_LME_ENABLED,
161 &vc->arch.arch_vmx.cpu_state)){
162 if (test_bit(VMX_CPU_STATE_PG_ENABLED,
163 &vc->arch.arch_vmx.cpu_state) ||
164 !test_bit(VMX_CPU_STATE_PAE_ENABLED,
165 &vc->arch.arch_vmx.cpu_state)){
166 vmx_inject_exception(vc, TRAP_gp_fault, 0);
167 }
168 }
169 if (msr_content & EFER_LME)
170 set_bit(VMX_CPU_STATE_LME_ENABLED,
171 &vc->arch.arch_vmx.cpu_state);
172 /* No update for LME/LMA since it has no effect */
173 msr->msr_items[VMX_INDEX_MSR_EFER] =
174 msr_content;
175 if (msr_content & ~(EFER_LME | EFER_LMA)){
176 msr->msr_items[VMX_INDEX_MSR_EFER] = msr_content;
177 if (!test_bit(VMX_INDEX_MSR_EFER, &msr->flags)){
178 rdmsrl(MSR_EFER,
179 host_state->msr_items[VMX_INDEX_MSR_EFER]);
180 set_bit(VMX_INDEX_MSR_EFER, &host_state->flags);
181 set_bit(VMX_INDEX_MSR_EFER, &msr->flags);
182 wrmsrl(MSR_EFER, msr_content);
183 }
184 }
185 break;
187 case MSR_FS_BASE:
188 case MSR_GS_BASE:
189 if (!(VMX_LONG_GUEST(vc)))
190 domain_crash();
191 if (!IS_CANO_ADDRESS(msr_content)){
192 VMX_DBG_LOG(DBG_LEVEL_1, "Not cano address of msr write\n");
193 vmx_inject_exception(vc, TRAP_gp_fault, 0);
194 }
195 if (regs->ecx == MSR_FS_BASE)
196 __vmwrite(GUEST_FS_BASE, msr_content);
197 else
198 __vmwrite(GUEST_GS_BASE, msr_content);
199 break;
201 case MSR_SHADOW_GS_BASE:
202 if (!(VMX_LONG_GUEST(vc)))
203 domain_crash();
204 vc->arch.arch_vmx.msr_content.shadow_gs = msr_content;
205 wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
206 break;
208 CASE_WRITE_MSR(STAR);
209 CASE_WRITE_MSR(LSTAR);
210 CASE_WRITE_MSR(CSTAR);
211 CASE_WRITE_MSR(SYSCALL_MASK);
212 default:
213 return 0;
214 }
215 return 1;
216 }
218 void
219 vmx_restore_msrs(struct vcpu *d)
220 {
221 int i = 0;
222 struct msr_state *guest_state;
223 struct msr_state *host_state;
224 unsigned long guest_flags;
226 guest_state = &d->arch.arch_vmx.msr_content;
227 host_state = &percpu_msr[smp_processor_id()];
229 wrmsrl(MSR_SHADOW_GS_BASE, guest_state->shadow_gs);
230 guest_flags = guest_state->flags;
231 if (!guest_flags)
232 return;
234 while (guest_flags){
235 i = find_first_set_bit(guest_flags);
237 VMX_DBG_LOG(DBG_LEVEL_2,
238 "restore guest's index %d msr %lx with %lx\n",
239 i, (unsigned long) msr_data_index[i], (unsigned long) guest_state->msr_items[i]);
240 set_bit(i, &host_state->flags);
241 wrmsrl(msr_data_index[i], guest_state->msr_items[i]);
242 clear_bit(i, &guest_flags);
243 }
244 }
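The flags word acts as a dirty bitmap over msr_data_index[]. A rough trace for a guest that has written MSR_LSTAR once (assuming VMX_INDEX_MSR_LSTAR indexes the MSR_LSTAR slot, as the table ordering above suggests):

    /* guest WRMSR hits CASE_WRITE_MSR(LSTAR): the slot is marked dirty */
    set_bit(VMX_INDEX_MSR_LSTAR, &msr->flags);

    /* switching into the guest, vmx_restore_msrs() loads the guest value
       and remembers that the host copy must be put back later */
    wrmsrl(MSR_LSTAR, guest_state->msr_items[VMX_INDEX_MSR_LSTAR]);
    set_bit(VMX_INDEX_MSR_LSTAR, &host_state->flags);

    /* switching away, vmx_load_msrs() restores the host value */
    wrmsrl(MSR_LSTAR, host_state->msr_items[VMX_INDEX_MSR_LSTAR]);
    clear_bit(VMX_INDEX_MSR_LSTAR, &host_state->flags);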
246 #else /* __i386__ */
247 #define vmx_save_init_msrs() ((void)0)
249 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs){
250 return 0;
251 }
252 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs){
253 return 0;
254 }
255 #endif
257 extern long evtchn_send(int lport);
258 extern long do_block(void);
259 void do_nmi(struct cpu_user_regs *, unsigned long);
261 int start_vmx(void)
262 {
263 struct vmcs_struct *vmcs;
264 u32 ecx;
265 u32 eax, edx;
266 u64 phys_vmcs; /* debugging */
268 /*
269 * Xen does not fill x86_capability words except 0.
270 */
271 ecx = cpuid_ecx(1);
272 boot_cpu_data.x86_capability[4] = ecx;
274 if (!(test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability)))
275 return 0;
277 rdmsr(IA32_FEATURE_CONTROL_MSR, eax, edx);
279 if (eax & IA32_FEATURE_CONTROL_MSR_LOCK) {
280 if ((eax & IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON) == 0x0) {
281 printk("VMX disabled by Feature Control MSR.\n");
282 return 0;
283 }
284 }
285 else {
286 wrmsr(IA32_FEATURE_CONTROL_MSR,
287 IA32_FEATURE_CONTROL_MSR_LOCK |
288 IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON, 0);
289 }
291 set_in_cr4(X86_CR4_VMXE); /* Enable VMXE */
293 if (!(vmcs = alloc_vmcs())) {
294 printk("Failed to allocate VMCS\n");
295 return 0;
296 }
298 phys_vmcs = (u64) virt_to_phys(vmcs);
300 if (!(__vmxon(phys_vmcs))) {
301 printk("VMXON is done\n");
302 }
304 vmx_save_init_msrs();
306 return 1;
307 }
309 void stop_vmx(void)
310 {
311 if (read_cr4() & X86_CR4_VMXE)
312 __vmxoff();
313 }
315 /*
316 * Not all cases receive a valid value in the VM-exit instruction length field.
317 */
318 #define __get_instruction_length(len) \
319 __vmread(INSTRUCTION_LEN, &(len)); \
320 if ((len) < 1 || (len) > 15) \
321 __vmx_bug(&regs);
323 static void inline __update_guest_eip(unsigned long inst_len)
324 {
325 unsigned long current_eip;
327 __vmread(GUEST_RIP, &current_eip);
328 __vmwrite(GUEST_RIP, current_eip + inst_len);
329 }
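These two helpers are used together in most of the exit handlers below, in the following pattern (mirroring, e.g., the EXIT_REASON_CPUID case in vmx_vmexit_handler(); note that __get_instruction_length() expects a cpu_user_regs variable named regs to be in scope for __vmx_bug()):

    unsigned long inst_len = 0;

    __get_instruction_length(inst_len);   /* read and sanity-check INSTRUCTION_LEN */
    /* ... handle the exit ... */
    __update_guest_eip(inst_len);         /* step %eip past the trapping instruction */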
332 static int vmx_do_page_fault(unsigned long va, struct cpu_user_regs *regs)
333 {
334 unsigned long eip;
335 unsigned long gpa; /* FIXME: PAE */
336 int result;
338 #if VMX_DEBUG
339 {
340 __vmread(GUEST_RIP, &eip);
341 VMX_DBG_LOG(DBG_LEVEL_VMMU,
342 "vmx_do_page_fault = 0x%lx, eip = %lx, error_code = %lx",
343 va, eip, (unsigned long)regs->error_code);
344 }
345 #endif
347 if (!vmx_paging_enabled(current)){
348 handle_mmio(va, va);
349 return 1;
350 }
351 gpa = gva_to_gpa(va);
353 /* Use 1:1 page table to identify MMIO address space */
354 if ( mmio_space(gpa) ){
355 if (gpa >= 0xFEE00000) { /* workaround for local APIC */
356 u32 inst_len;
357 __vmread(INSTRUCTION_LEN, &(inst_len));
358 __update_guest_eip(inst_len);
359 return 1;
360 }
361 handle_mmio(va, gpa);
362 return 1;
363 }
365 result = shadow_fault(va, regs);
367 #if 0
368 if ( !result )
369 {
370 __vmread(GUEST_RIP, &eip);
371 printk("vmx pgfault to guest va=%p eip=%p\n", va, eip);
372 }
373 #endif
375 return result;
376 }
378 static void vmx_do_no_device_fault(void)
379 {
380 unsigned long cr0;
382 clts();
383 setup_fpu(current);
384 __vmread(CR0_READ_SHADOW, &cr0);
385 if (!(cr0 & X86_CR0_TS)) {
386 __vmread(GUEST_CR0, &cr0);
387 cr0 &= ~X86_CR0_TS;
388 __vmwrite(GUEST_CR0, cr0);
389 }
390 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
391 }
394 static void vmx_vmexit_do_cpuid(unsigned long input, struct cpu_user_regs *regs)
395 {
396 unsigned int eax, ebx, ecx, edx;
397 unsigned long eip;
399 __vmread(GUEST_RIP, &eip);
401 VMX_DBG_LOG(DBG_LEVEL_1,
402 "do_cpuid: (eax) %lx, (ebx) %lx, (ecx) %lx, (edx) %lx,"
403 " (esi) %lx, (edi) %lx",
404 (unsigned long)regs->eax, (unsigned long)regs->ebx,
405 (unsigned long)regs->ecx, (unsigned long)regs->edx,
406 (unsigned long)regs->esi, (unsigned long)regs->edi);
408 cpuid(input, &eax, &ebx, &ecx, &edx);
410 if (input == 1) {
411 #ifdef __i386__
412 clear_bit(X86_FEATURE_PSE, &edx);
413 clear_bit(X86_FEATURE_PAE, &edx);
414 clear_bit(X86_FEATURE_PSE36, &edx);
415 #endif
416 }
418 regs->eax = (unsigned long) eax;
419 regs->ebx = (unsigned long) ebx;
420 regs->ecx = (unsigned long) ecx;
421 regs->edx = (unsigned long) edx;
423 VMX_DBG_LOG(DBG_LEVEL_1,
424 "vmx_vmexit_do_cpuid: eip: %lx, input: %lx, out:eax=%x, ebx=%x, ecx=%x, edx=%x",
425 eip, input, eax, ebx, ecx, edx);
427 }
429 #define CASE_GET_REG_P(REG, reg) \
430 case REG_ ## REG: reg_p = (unsigned long *)&(regs->reg); break
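Hand-expanded for reference, CASE_GET_REG_P(EAX, eax) becomes:

    case REG_EAX: reg_p = (unsigned long *)&(regs->eax); break;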
432 static void vmx_dr_access (unsigned long exit_qualification, struct cpu_user_regs *regs)
433 {
434 unsigned int reg;
435 unsigned long *reg_p = 0;
436 struct vcpu *v = current;
437 unsigned long eip;
439 __vmread(GUEST_RIP, &eip);
441 reg = exit_qualification & DEBUG_REG_ACCESS_NUM;
443 VMX_DBG_LOG(DBG_LEVEL_1,
444 "vmx_dr_access : eip=%lx, reg=%d, exit_qualification = %lx",
445 eip, reg, exit_qualification);
447 switch(exit_qualification & DEBUG_REG_ACCESS_REG) {
448 CASE_GET_REG_P(EAX, eax);
449 CASE_GET_REG_P(ECX, ecx);
450 CASE_GET_REG_P(EDX, edx);
451 CASE_GET_REG_P(EBX, ebx);
452 CASE_GET_REG_P(EBP, ebp);
453 CASE_GET_REG_P(ESI, esi);
454 CASE_GET_REG_P(EDI, edi);
455 case REG_ESP:
456 break;
457 default:
458 __vmx_bug(regs);
459 }
461 switch (exit_qualification & DEBUG_REG_ACCESS_TYPE) {
462 case TYPE_MOV_TO_DR:
463 /* don't need to check the range */
464 if (reg != REG_ESP)
465 v->arch.guest_context.debugreg[reg] = *reg_p;
466 else {
467 unsigned long value;
468 __vmread(GUEST_RSP, &value);
469 v->arch.guest_context.debugreg[reg] = value;
470 }
471 break;
472 case TYPE_MOV_FROM_DR:
473 if (reg != REG_ESP)
474 *reg_p = v->arch.guest_context.debugreg[reg];
475 else {
476 __vmwrite(GUEST_RSP, v->arch.guest_context.debugreg[reg]);
477 }
478 break;
479 }
480 }
482 /*
483 * Invalidate the TLB for va. Invalidate the shadow page corresponding
484 * to the address va.
485 */
486 static void vmx_vmexit_do_invlpg(unsigned long va)
487 {
488 unsigned long eip;
489 struct vcpu *v = current;
491 __vmread(GUEST_RIP, &eip);
493 VMX_DBG_LOG(DBG_LEVEL_VMMU, "vmx_vmexit_do_invlpg: eip=%lx, va=%lx",
494 eip, va);
496 /*
497 * We do the safest things first, then try to update the shadow
498 * by copying from the guest.
499 */
500 shadow_invlpg(v, va);
501 }
503 static int check_for_null_selector(unsigned long eip)
504 {
505 unsigned char inst[MAX_INST_LEN];
506 unsigned long sel;
507 int i, inst_len;
508 int inst_copy_from_guest(unsigned char *, unsigned long, int);
510 __vmread(INSTRUCTION_LEN, &inst_len);
511 memset(inst, 0, MAX_INST_LEN);
512 if (inst_copy_from_guest(inst, eip, inst_len) != inst_len) {
513 printf("check_for_null_selector: get guest instruction failed\n");
514 domain_crash_synchronous();
515 }
517 for (i = 0; i < inst_len; i++) {
518 switch (inst[i]) {
519 case 0xf3: /* REPZ */
520 case 0xf2: /* REPNZ */
521 case 0xf0: /* LOCK */
522 case 0x66: /* data32 */
523 case 0x67: /* addr32 */
524 continue;
525 case 0x2e: /* CS */
526 __vmread(GUEST_CS_SELECTOR, &sel);
527 break;
528 case 0x36: /* SS */
529 __vmread(GUEST_SS_SELECTOR, &sel);
530 break;
531 case 0x26: /* ES */
532 __vmread(GUEST_ES_SELECTOR, &sel);
533 break;
534 case 0x64: /* FS */
535 __vmread(GUEST_FS_SELECTOR, &sel);
536 break;
537 case 0x65: /* GS */
538 __vmread(GUEST_GS_SELECTOR, &sel);
539 break;
540 case 0x3e: /* DS */
541 /* FALLTHROUGH */
542 default:
543 /* DS is the default */
544 __vmread(GUEST_DS_SELECTOR, &sel);
545 }
546 return sel == 0 ? 1 : 0;
547 }
549 return 0;
550 }
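As a concrete trace of the loop above: for a guest instruction with bytes f3 26 6e ("rep outsb" with an ES override), the f3 prefix is skipped, the 26 byte selects GUEST_ES_SELECTOR, and the function reports whether that selector is null; plain f3 6e falls through to the default branch and checks DS instead.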
552 static void vmx_io_instruction(struct cpu_user_regs *regs,
553 unsigned long exit_qualification, unsigned long inst_len)
554 {
555 struct vcpu *d = current;
556 vcpu_iodata_t *vio;
557 ioreq_t *p;
558 unsigned long addr;
559 unsigned long eip, cs, eflags;
560 int vm86;
562 __vmread(GUEST_RIP, &eip);
563 __vmread(GUEST_CS_SELECTOR, &cs);
564 __vmread(GUEST_RFLAGS, &eflags);
565 vm86 = eflags & X86_EFLAGS_VM ? 1 : 0;
567 VMX_DBG_LOG(DBG_LEVEL_1,
568 "vmx_io_instruction: vm86 %d, eip=%lx:%lx, "
569 "exit_qualification = %lx",
570 vm86, cs, eip, exit_qualification);
572 if (test_bit(6, &exit_qualification))
573 addr = (exit_qualification >> 16) & (0xffff);
574 else
575 addr = regs->edx & 0xffff;
577 if (addr == 0x80) {
578 __update_guest_eip(inst_len);
579 return;
580 }
582 vio = get_vio(d->domain, d->vcpu_id);
583 if (vio == 0) {
584 printk("bad shared page: %lx", (unsigned long) vio);
585 domain_crash_synchronous();
586 }
587 p = &vio->vp_ioreq;
588 p->dir = test_bit(3, &exit_qualification); /* direction */
590 p->pdata_valid = 0;
591 p->count = 1;
592 p->size = (exit_qualification & 7) + 1;
594 if (test_bit(4, &exit_qualification)) { /* string instruction */
595 unsigned long laddr;
597 __vmread(GUEST_LINEAR_ADDRESS, &laddr);
598 /*
599 * In protected mode, guest linear address is invalid if the
600 * selector is null.
601 */
602 if (!vm86 && check_for_null_selector(eip)) {
603 laddr = (p->dir == IOREQ_WRITE) ? regs->esi : regs->edi;
604 }
605 p->pdata_valid = 1;
607 p->u.data = laddr;
608 if (vmx_paging_enabled(d))
609 p->u.pdata = (void *) gva_to_gpa(p->u.data);
610 p->df = (eflags & X86_EFLAGS_DF) ? 1 : 0;
612 if (test_bit(5, &exit_qualification)) /* "rep" prefix */
613 p->count = vm86 ? regs->ecx & 0xFFFF : regs->ecx;
615 /*
616 * Split up string I/O operations that cross page boundaries. Don't
617 * advance %eip so that "rep insb" will restart at the next page.
618 */
619 if ((p->u.data & PAGE_MASK) !=
620 ((p->u.data + p->count * p->size - 1) & PAGE_MASK)) {
621 VMX_DBG_LOG(DBG_LEVEL_2,
622 "String I/O crosses page boundary (cs:eip=0x%lx:0x%lx)\n",
623 cs, eip);
624 if (p->u.data & (p->size - 1)) {
625 printf("Unaligned string I/O operation (cs:eip=0x%lx:0x%lx)\n",
626 cs, eip);
627 domain_crash_synchronous();
628 }
629 p->count = (PAGE_SIZE - (p->u.data & ~PAGE_MASK)) / p->size;
630 } else {
631 __update_guest_eip(inst_len);
632 }
633 } else if (p->dir == IOREQ_WRITE) {
634 p->u.data = regs->eax;
635 __update_guest_eip(inst_len);
636 } else
637 __update_guest_eip(inst_len);
639 p->addr = addr;
640 p->port_mm = 0;
642 /* Check if the packet needs to be intercepted */
643 if (vmx_portio_intercept(p))
644 /* no blocking & no evtchn notification */
645 return;
647 set_bit(ARCH_VMX_IO_WAIT, &d->arch.arch_vmx.flags);
648 p->state = STATE_IOREQ_READY;
649 evtchn_send(iopacket_port(d->domain));
650 vmx_wait_io();
651 }
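A worked example of the page-boundary split above, assuming 4KB pages: for a "rep insw" with p->u.data = 0x1ff0, p->size = 2 and a requested count of 100, the final byte would land at 0x20b7 on the next page, so p->count is clamped to (0x1000 - 0xff0) / 2 = 8 and %eip is left unchanged; the instruction then re-executes to pick up the remaining transfers on the following page.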
653 enum { COPY_IN = 0, COPY_OUT };
655 static inline int
656 vmx_copy(void *buf, unsigned long laddr, int size, int dir)
657 {
658 char *addr;
659 unsigned long mfn;
661 if ( (size + (laddr & (PAGE_SIZE - 1))) >= PAGE_SIZE )
662 {
663 printf("vmx_copy exceeds page boundary\n");
664 return 0;
665 }
667 mfn = phys_to_machine_mapping(laddr >> PAGE_SHIFT);
668 addr = (char *)map_domain_page(mfn) + (laddr & ~PAGE_MASK);
670 if (dir == COPY_IN)
671 memcpy(buf, addr, size);
672 else
673 memcpy(addr, buf, size);
675 unmap_domain_page(addr);
676 return 1;
677 }
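Note that the bounds check above is slightly conservative: because it uses >=, a copy that ends exactly on the page boundary (e.g. an offset of 0xffc with size 4, where 0xffc + 4 == PAGE_SIZE) is rejected even though it still fits inside the page. The check guarantees that mapping the single frame returned by phys_to_machine_mapping() covers the whole copy.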
679 int
680 vmx_world_save(struct vcpu *d, struct vmx_assist_context *c)
681 {
682 unsigned long inst_len;
683 int error = 0;
685 error |= __vmread(INSTRUCTION_LEN, &inst_len);
686 error |= __vmread(GUEST_RIP, &c->eip);
687 c->eip += inst_len; /* skip transition instruction */
688 error |= __vmread(GUEST_RSP, &c->esp);
689 error |= __vmread(GUEST_RFLAGS, &c->eflags);
691 error |= __vmread(CR0_READ_SHADOW, &c->cr0);
692 c->cr3 = d->arch.arch_vmx.cpu_cr3;
693 error |= __vmread(CR4_READ_SHADOW, &c->cr4);
695 error |= __vmread(GUEST_IDTR_LIMIT, &c->idtr_limit);
696 error |= __vmread(GUEST_IDTR_BASE, &c->idtr_base);
698 error |= __vmread(GUEST_GDTR_LIMIT, &c->gdtr_limit);
699 error |= __vmread(GUEST_GDTR_BASE, &c->gdtr_base);
701 error |= __vmread(GUEST_CS_SELECTOR, &c->cs_sel);
702 error |= __vmread(GUEST_CS_LIMIT, &c->cs_limit);
703 error |= __vmread(GUEST_CS_BASE, &c->cs_base);
704 error |= __vmread(GUEST_CS_AR_BYTES, &c->cs_arbytes.bytes);
706 error |= __vmread(GUEST_DS_SELECTOR, &c->ds_sel);
707 error |= __vmread(GUEST_DS_LIMIT, &c->ds_limit);
708 error |= __vmread(GUEST_DS_BASE, &c->ds_base);
709 error |= __vmread(GUEST_DS_AR_BYTES, &c->ds_arbytes.bytes);
711 error |= __vmread(GUEST_ES_SELECTOR, &c->es_sel);
712 error |= __vmread(GUEST_ES_LIMIT, &c->es_limit);
713 error |= __vmread(GUEST_ES_BASE, &c->es_base);
714 error |= __vmread(GUEST_ES_AR_BYTES, &c->es_arbytes.bytes);
716 error |= __vmread(GUEST_SS_SELECTOR, &c->ss_sel);
717 error |= __vmread(GUEST_SS_LIMIT, &c->ss_limit);
718 error |= __vmread(GUEST_SS_BASE, &c->ss_base);
719 error |= __vmread(GUEST_SS_AR_BYTES, &c->ss_arbytes.bytes);
721 error |= __vmread(GUEST_FS_SELECTOR, &c->fs_sel);
722 error |= __vmread(GUEST_FS_LIMIT, &c->fs_limit);
723 error |= __vmread(GUEST_FS_BASE, &c->fs_base);
724 error |= __vmread(GUEST_FS_AR_BYTES, &c->fs_arbytes.bytes);
726 error |= __vmread(GUEST_GS_SELECTOR, &c->gs_sel);
727 error |= __vmread(GUEST_GS_LIMIT, &c->gs_limit);
728 error |= __vmread(GUEST_GS_BASE, &c->gs_base);
729 error |= __vmread(GUEST_GS_AR_BYTES, &c->gs_arbytes.bytes);
731 error |= __vmread(GUEST_TR_SELECTOR, &c->tr_sel);
732 error |= __vmread(GUEST_TR_LIMIT, &c->tr_limit);
733 error |= __vmread(GUEST_TR_BASE, &c->tr_base);
734 error |= __vmread(GUEST_TR_AR_BYTES, &c->tr_arbytes.bytes);
736 error |= __vmread(GUEST_LDTR_SELECTOR, &c->ldtr_sel);
737 error |= __vmread(GUEST_LDTR_LIMIT, &c->ldtr_limit);
738 error |= __vmread(GUEST_LDTR_BASE, &c->ldtr_base);
739 error |= __vmread(GUEST_LDTR_AR_BYTES, &c->ldtr_arbytes.bytes);
741 return !error;
742 }
744 int
745 vmx_world_restore(struct vcpu *d, struct vmx_assist_context *c)
746 {
747 unsigned long mfn, old_cr4;
748 int error = 0;
750 error |= __vmwrite(GUEST_RIP, c->eip);
751 error |= __vmwrite(GUEST_RSP, c->esp);
752 error |= __vmwrite(GUEST_RFLAGS, c->eflags);
754 error |= __vmwrite(CR0_READ_SHADOW, c->cr0);
756 if (!vmx_paging_enabled(d)) {
757 VMX_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
758 __vmwrite(GUEST_CR3, pagetable_get_paddr(d->domain->arch.phys_table));
759 goto skip_cr3;
760 }
762 if (c->cr3 == d->arch.arch_vmx.cpu_cr3) {
763 /*
764 * This is a simple TLB flush, implying the guest has
765 * removed some translation or changed page attributes.
766 * We simply invalidate the shadow.
767 */
768 mfn = phys_to_machine_mapping(c->cr3 >> PAGE_SHIFT);
769 if (mfn != pagetable_get_pfn(d->arch.guest_table)) {
770 printk("Invalid CR3 value=%x", c->cr3);
771 domain_crash_synchronous();
772 return 0;
773 }
774 shadow_sync_all(d->domain);
775 } else {
776 /*
777 * If different, make a shadow. Check if the PDBR is valid
778 * first.
779 */
780 VMX_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %x", c->cr3);
781 if ((c->cr3 >> PAGE_SHIFT) > d->domain->max_pages) {
782 printk("Invalid CR3 value=%x", c->cr3);
783 domain_crash_synchronous();
784 return 0;
785 }
786 mfn = phys_to_machine_mapping(c->cr3 >> PAGE_SHIFT);
787 d->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
788 update_pagetables(d);
789 /*
790 * arch.shadow_table should now hold the next CR3 for shadow
791 */
792 d->arch.arch_vmx.cpu_cr3 = c->cr3;
793 VMX_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3);
794 __vmwrite(GUEST_CR3, pagetable_get_paddr(d->arch.shadow_table));
795 }
797 skip_cr3:
799 error |= __vmread(CR4_READ_SHADOW, &old_cr4);
800 error |= __vmwrite(GUEST_CR4, (c->cr4 | X86_CR4_VMXE));
801 error |= __vmwrite(CR4_READ_SHADOW, c->cr4);
803 error |= __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
804 error |= __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
806 error |= __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
807 error |= __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
809 error |= __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
810 error |= __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
811 error |= __vmwrite(GUEST_CS_BASE, c->cs_base);
812 error |= __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes.bytes);
814 error |= __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
815 error |= __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
816 error |= __vmwrite(GUEST_DS_BASE, c->ds_base);
817 error |= __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes.bytes);
819 error |= __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
820 error |= __vmwrite(GUEST_ES_LIMIT, c->es_limit);
821 error |= __vmwrite(GUEST_ES_BASE, c->es_base);
822 error |= __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes.bytes);
824 error |= __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
825 error |= __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
826 error |= __vmwrite(GUEST_SS_BASE, c->ss_base);
827 error |= __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes.bytes);
829 error |= __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
830 error |= __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
831 error |= __vmwrite(GUEST_FS_BASE, c->fs_base);
832 error |= __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes.bytes);
834 error |= __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
835 error |= __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
836 error |= __vmwrite(GUEST_GS_BASE, c->gs_base);
837 error |= __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes.bytes);
839 error |= __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
840 error |= __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
841 error |= __vmwrite(GUEST_TR_BASE, c->tr_base);
842 error |= __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes.bytes);
844 error |= __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
845 error |= __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
846 error |= __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
847 error |= __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes.bytes);
849 return !error;
850 }
852 enum { VMX_ASSIST_INVOKE = 0, VMX_ASSIST_RESTORE };
854 int
855 vmx_assist(struct vcpu *d, int mode)
856 {
857 struct vmx_assist_context c;
858 u32 magic;
859 unsigned long cp;
861 /* make sure vmxassist exists (this is not an error) */
862 if (!vmx_copy(&magic, VMXASSIST_MAGIC_OFFSET, sizeof(magic), COPY_IN))
863 return 0;
864 if (magic != VMXASSIST_MAGIC)
865 return 0;
867 switch (mode) {
868 /*
869 * Transfer control to vmxassist.
870 * Store the current context in VMXASSIST_OLD_CONTEXT and load
871 * the new VMXASSIST_NEW_CONTEXT context. This context was created
872 * by vmxassist and will transfer control to it.
873 */
874 case VMX_ASSIST_INVOKE:
875 /* save the old context */
876 if (!vmx_copy(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp), COPY_IN))
877 goto error;
878 if (cp != 0) {
879 if (!vmx_world_save(d, &c))
880 goto error;
881 if (!vmx_copy(&c, cp, sizeof(c), COPY_OUT))
882 goto error;
883 }
885 /* restore the new context, this should activate vmxassist */
886 if (!vmx_copy(&cp, VMXASSIST_NEW_CONTEXT, sizeof(cp), COPY_IN))
887 goto error;
888 if (cp != 0) {
889 if (!vmx_copy(&c, cp, sizeof(c), COPY_IN))
890 goto error;
891 if (!vmx_world_restore(d, &c))
892 goto error;
893 return 1;
894 }
895 break;
897 /*
898 * Restore the VMXASSIST_OLD_CONTEXT that was saved by VMX_ASSIST_INVOKE
899 * above.
900 */
901 case VMX_ASSIST_RESTORE:
902 /* save the old context */
903 if (!vmx_copy(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp), COPY_IN))
904 goto error;
905 if (cp != 0) {
906 if (!vmx_copy(&c, cp, sizeof(c), COPY_IN))
907 goto error;
908 if (!vmx_world_restore(d, &c))
909 goto error;
910 return 1;
911 }
912 break;
913 }
915 error:
916 printf("Failed to transfer to vmxassist\n");
917 domain_crash_synchronous();
918 return 0;
919 }
921 static int vmx_set_cr0(unsigned long value)
922 {
923 struct vcpu *d = current;
924 unsigned long mfn;
925 unsigned long eip;
926 int paging_enabled;
927 unsigned long vm_entry_value;
928 /*
929 * CR0: We don't want to lose PE and PG.
930 */
931 paging_enabled = vmx_paging_enabled(d);
932 __vmwrite(GUEST_CR0, (value | X86_CR0_PE | X86_CR0_PG));
933 __vmwrite(CR0_READ_SHADOW, value);
935 VMX_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx\n", value);
937 if ((value & X86_CR0_PE) && (value & X86_CR0_PG) && !paging_enabled) {
938 /*
939 * The guest CR3 must be pointing to the guest physical.
940 */
941 if ( !VALID_MFN(mfn = phys_to_machine_mapping(
942 d->arch.arch_vmx.cpu_cr3 >> PAGE_SHIFT)) ||
943 !get_page(pfn_to_page(mfn), d->domain) )
944 {
945 printk("Invalid CR3 value = %lx", d->arch.arch_vmx.cpu_cr3);
946 domain_crash_synchronous(); /* need to take a clean path */
947 }
949 #if defined(__x86_64__)
950 if (test_bit(VMX_CPU_STATE_LME_ENABLED,
951 &d->arch.arch_vmx.cpu_state) &&
952 !test_bit(VMX_CPU_STATE_PAE_ENABLED,
953 &d->arch.arch_vmx.cpu_state)){
954 VMX_DBG_LOG(DBG_LEVEL_1, "Enable paging before PAE enable\n");
955 vmx_inject_exception(d, TRAP_gp_fault, 0);
956 }
957 if (test_bit(VMX_CPU_STATE_LME_ENABLED,
958 &d->arch.arch_vmx.cpu_state)){
959 /* PAE should already be enabled at this point */
960 VMX_DBG_LOG(DBG_LEVEL_1, "Enable the Long mode\n");
961 set_bit(VMX_CPU_STATE_LMA_ENABLED,
962 &d->arch.arch_vmx.cpu_state);
963 __vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
964 vm_entry_value |= VM_ENTRY_CONTROLS_IA_32E_MODE;
965 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
967 }
969 unsigned long crn;
970 /* update CR4's PAE if needed */
971 __vmread(GUEST_CR4, &crn);
972 if ( (!(crn & X86_CR4_PAE)) &&
973 test_bit(VMX_CPU_STATE_PAE_ENABLED,
974 &d->arch.arch_vmx.cpu_state)){
975 VMX_DBG_LOG(DBG_LEVEL_1, "enable PAE on cr4\n");
976 __vmwrite(GUEST_CR4, crn | X86_CR4_PAE);
977 }
978 #elif defined( __i386__)
979 unsigned long old_base_mfn;
980 old_base_mfn = pagetable_get_pfn(d->arch.guest_table);
981 if (old_base_mfn)
982 put_page(pfn_to_page(old_base_mfn));
983 #endif
984 /*
985 * Now arch.guest_table points to machine physical.
986 */
987 d->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
988 update_pagetables(d);
990 VMX_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
991 (unsigned long) (mfn << PAGE_SHIFT));
993 __vmwrite(GUEST_CR3, pagetable_get_paddr(d->arch.shadow_table));
994 /*
995 * arch->shadow_table should hold the next CR3 for shadow
996 */
997 VMX_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
998 d->arch.arch_vmx.cpu_cr3, mfn);
999 }
1001 /*
1002 * VMX does not implement real-mode virtualization. We emulate
1003 * real-mode by performing a world switch to VMXAssist whenever
1004 * a partition disables the CR0.PE bit.
1005 */
1006 if ((value & X86_CR0_PE) == 0) {
1007 if ( value & X86_CR0_PG ) {
1008 /* inject GP here */
1009 vmx_inject_exception(d, TRAP_gp_fault, 0);
1010 return 0;
1011 } else {
1012 /*
1013 * Disable paging here.
1014 * Same as the PE == 1 && PG == 0 case.
1015 */
1016 if (test_bit(VMX_CPU_STATE_LMA_ENABLED,
1017 &d->arch.arch_vmx.cpu_state)){
1018 clear_bit(VMX_CPU_STATE_LMA_ENABLED,
1019 &d->arch.arch_vmx.cpu_state);
1020 __vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
1021 vm_entry_value &= ~VM_ENTRY_CONTROLS_IA_32E_MODE;
1022 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1023 }
1024 }
1025 __vmread(GUEST_RIP, &eip);
1026 VMX_DBG_LOG(DBG_LEVEL_1,
1027 "Disabling CR0.PE at %%eip 0x%lx\n", eip);
1028 if (vmx_assist(d, VMX_ASSIST_INVOKE)) {
1029 set_bit(VMX_CPU_STATE_ASSIST_ENABLED, &d->arch.arch_vmx.cpu_state);
1030 __vmread(GUEST_RIP, &eip);
1031 VMX_DBG_LOG(DBG_LEVEL_1,
1032 "Transfering control to vmxassist %%eip 0x%lx\n", eip);
1033 return 0; /* do not update eip! */
1034 }
1035 } else if (test_bit(VMX_CPU_STATE_ASSIST_ENABLED,
1036 &d->arch.arch_vmx.cpu_state)) {
1037 __vmread(GUEST_RIP, &eip);
1038 VMX_DBG_LOG(DBG_LEVEL_1,
1039 "Enabling CR0.PE at %%eip 0x%lx\n", eip);
1040 if (vmx_assist(d, VMX_ASSIST_RESTORE)) {
1041 clear_bit(VMX_CPU_STATE_ASSIST_ENABLED,
1042 &d->arch.arch_vmx.cpu_state);
1043 __vmread(GUEST_RIP, &eip);
1044 VMX_DBG_LOG(DBG_LEVEL_1,
1045 "Restoring to %%eip 0x%lx\n", eip);
1046 return 0; /* do not update eip! */
1047 }
1048 }
1050 return 1;
1051 }
1053 #define CASE_GET_REG(REG, reg) \
1054 case REG_ ## REG: value = regs->reg; break
1056 #define CASE_EXTEND_SET_REG \
1057 CASE_EXTEND_REG(S)
1058 #define CASE_EXTEND_GET_REG \
1059 CASE_EXTEND_REG(G)
1061 #ifdef __i386__
1062 #define CASE_EXTEND_REG(T)
1063 #else
1064 #define CASE_EXTEND_REG(T) \
1065 CASE_ ## T ## ET_REG(R8, r8); \
1066 CASE_ ## T ## ET_REG(R9, r9); \
1067 CASE_ ## T ## ET_REG(R10, r10); \
1068 CASE_ ## T ## ET_REG(R11, r11); \
1069 CASE_ ## T ## ET_REG(R12, r12); \
1070 CASE_ ## T ## ET_REG(R13, r13); \
1071 CASE_ ## T ## ET_REG(R14, r14); \
1072 CASE_ ## T ## ET_REG(R15, r15);
1073 #endif
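On x86_64, CASE_EXTEND_GET_REG expands through CASE_EXTEND_REG(G) into eight ordinary CASE_GET_REG cases; for example the first two become:

    case REG_R8: value = regs->r8; break;
    case REG_R9: value = regs->r9; break;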
1076 /*
1077 * Write to control registers
1078 */
1079 static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
1080 {
1081 unsigned long value;
1082 unsigned long old_cr;
1083 struct vcpu *d = current;
1085 switch (gp) {
1086 CASE_GET_REG(EAX, eax);
1087 CASE_GET_REG(ECX, ecx);
1088 CASE_GET_REG(EDX, edx);
1089 CASE_GET_REG(EBX, ebx);
1090 CASE_GET_REG(EBP, ebp);
1091 CASE_GET_REG(ESI, esi);
1092 CASE_GET_REG(EDI, edi);
1093 CASE_EXTEND_GET_REG
1094 case REG_ESP:
1095 __vmread(GUEST_RSP, &value);
1096 break;
1097 default:
1098 printk("invalid gp: %d\n", gp);
1099 __vmx_bug(regs);
1100 }
1102 VMX_DBG_LOG(DBG_LEVEL_1, "mov_to_cr: CR%d, value = %lx,", cr, value);
1103 VMX_DBG_LOG(DBG_LEVEL_1, "current = %lx,", (unsigned long) current);
1105 switch(cr) {
1106 case 0:
1107 {
1108 return vmx_set_cr0(value);
1109 }
1110 case 3:
1111 {
1112 unsigned long old_base_mfn, mfn;
1114 /*
1115 * If paging is not enabled yet, simply copy the value to CR3.
1116 */
1117 if (!vmx_paging_enabled(d)) {
1118 d->arch.arch_vmx.cpu_cr3 = value;
1119 break;
1120 }
1122 /*
1123 * We make a new one if the shadow does not exist.
1124 */
1125 if (value == d->arch.arch_vmx.cpu_cr3) {
1126 /*
1127 * This is a simple TLB flush, implying the guest has
1128 * removed some translation or changed page attributes.
1129 * We simply invalidate the shadow.
1130 */
1131 mfn = phys_to_machine_mapping(value >> PAGE_SHIFT);
1132 if (mfn != pagetable_get_pfn(d->arch.guest_table))
1133 __vmx_bug(regs);
1134 shadow_sync_all(d->domain);
1135 } else {
1136 /*
1137 * If different, make a shadow. Check if the PDBR is valid
1138 * first.
1139 */
1140 VMX_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
1141 if ( ((value >> PAGE_SHIFT) > d->domain->max_pages ) ||
1142 !VALID_MFN(mfn = phys_to_machine_mapping(value >> PAGE_SHIFT)) ||
1143 !get_page(pfn_to_page(mfn), d->domain) )
1144 {
1145 printk("Invalid CR3 value=%lx", value);
1146 domain_crash_synchronous(); /* need to take a clean path */
1147 }
1148 old_base_mfn = pagetable_get_pfn(d->arch.guest_table);
1149 d->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
1150 if (old_base_mfn)
1151 put_page(pfn_to_page(old_base_mfn));
1152 update_pagetables(d);
1153 /*
1154 * arch.shadow_table should now hold the next CR3 for shadow
1155 */
1156 d->arch.arch_vmx.cpu_cr3 = value;
1157 VMX_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx",
1158 value);
1159 __vmwrite(GUEST_CR3, pagetable_get_paddr(d->arch.shadow_table));
1160 }
1161 break;
1162 }
1163 case 4:
1164 {
1165 /* CR4 */
1166 unsigned long old_guest_cr;
1167 unsigned long pae_disabled = 0;
1169 __vmread(GUEST_CR4, &old_guest_cr);
1170 if (value & X86_CR4_PAE){
1171 set_bit(VMX_CPU_STATE_PAE_ENABLED, &d->arch.arch_vmx.cpu_state);
1172 if(!vmx_paging_enabled(d))
1173 pae_disabled = 1;
1174 } else {
1175 if (test_bit(VMX_CPU_STATE_LMA_ENABLED,
1176 &d->arch.arch_vmx.cpu_state)){
1177 vmx_inject_exception(d, TRAP_gp_fault, 0);
1178 }
1179 clear_bit(VMX_CPU_STATE_PAE_ENABLED, &d->arch.arch_vmx.cpu_state);
1180 }
1182 __vmread(CR4_READ_SHADOW, &old_cr);
1183 if (pae_disabled)
1184 __vmwrite(GUEST_CR4, ((value & ~X86_CR4_PAE) | X86_CR4_VMXE));
1185 else
1186 __vmwrite(GUEST_CR4, value| X86_CR4_VMXE);
1188 __vmwrite(CR4_READ_SHADOW, value);
1190 /*
1191 * Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates
1192 * all TLB entries except global entries.
1193 */
1194 if ((old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE)) {
1195 shadow_sync_all(d->domain);
1196 }
1197 break;
1198 }
1199 default:
1200 printk("invalid cr: %d\n", cr);
1201 __vmx_bug(regs);
1202 }
1204 return 1;
1205 }
1207 #define CASE_SET_REG(REG, reg) \
1208 case REG_ ## REG: \
1209 regs->reg = value; \
1210 break
1212 /*
1213 * Read from control registers. CR0 and CR4 are read from the shadow.
1214 */
1215 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
1216 {
1217 unsigned long value;
1218 struct vcpu *d = current;
1220 if (cr != 3)
1221 __vmx_bug(regs);
1223 value = (unsigned long) d->arch.arch_vmx.cpu_cr3;
1225 switch (gp) {
1226 CASE_SET_REG(EAX, eax);
1227 CASE_SET_REG(ECX, ecx);
1228 CASE_SET_REG(EDX, edx);
1229 CASE_SET_REG(EBX, ebx);
1230 CASE_SET_REG(EBP, ebp);
1231 CASE_SET_REG(ESI, esi);
1232 CASE_SET_REG(EDI, edi);
1233 case REG_ESP:
1234 __vmwrite(GUEST_RSP, value);
1235 regs->esp = value;
1236 break;
1237 default:
1238 printk("invalid gp: %d\n", gp);
1239 __vmx_bug(regs);
1240 }
1242 VMX_DBG_LOG(DBG_LEVEL_VMMU, "mov_from_cr: CR%d, value = %lx,", cr, value);
1243 }
1245 static int vmx_cr_access(unsigned long exit_qualification, struct cpu_user_regs *regs)
1246 {
1247 unsigned int gp, cr;
1248 unsigned long value;
1250 switch (exit_qualification & CONTROL_REG_ACCESS_TYPE) {
1251 case TYPE_MOV_TO_CR:
1252 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
1253 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
1254 return mov_to_cr(gp, cr, regs);
1255 case TYPE_MOV_FROM_CR:
1256 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
1257 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
1258 mov_from_cr(cr, gp, regs);
1259 break;
1260 case TYPE_CLTS:
1261 clts();
1262 setup_fpu(current);
1264 __vmread(GUEST_CR0, &value);
1265 value &= ~X86_CR0_TS; /* clear TS */
1266 __vmwrite(GUEST_CR0, value);
1268 __vmread(CR0_READ_SHADOW, &value);
1269 value &= ~X86_CR0_TS; /* clear TS */
1270 __vmwrite(CR0_READ_SHADOW, value);
1271 break;
1272 case TYPE_LMSW:
1273 __vmread(CR0_READ_SHADOW, &value);
1274 value = (value & ~0xF) |
1275 (((exit_qualification & LMSW_SOURCE_DATA) >> 16) & 0xF);
1276 return vmx_set_cr0(value);
1277 break;
1278 default:
1279 __vmx_bug(regs);
1280 break;
1281 }
1282 return 1;
1283 }
1285 static inline void vmx_do_msr_read(struct cpu_user_regs *regs)
1286 {
1287 VMX_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_read: ecx=%lx, eax=%lx, edx=%lx",
1288 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1289 (unsigned long)regs->edx);
1290 switch (regs->ecx) {
1291 case MSR_IA32_SYSENTER_CS:
1292 __vmread(GUEST_SYSENTER_CS, &regs->eax);
1293 regs->edx = 0;
1294 break;
1295 case MSR_IA32_SYSENTER_ESP:
1296 __vmread(GUEST_SYSENTER_ESP, &regs->eax);
1297 regs->edx = 0;
1298 break;
1299 case MSR_IA32_SYSENTER_EIP:
1300 __vmread(GUEST_SYSENTER_EIP, &regs->eax);
1301 regs->edx = 0;
1302 break;
1303 default:
1304 if(long_mode_do_msr_read(regs))
1305 return;
1306 rdmsr_user(regs->ecx, regs->eax, regs->edx);
1307 break;
1308 }
1310 VMX_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_read returns: "
1311 "ecx=%lx, eax=%lx, edx=%lx",
1312 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1313 (unsigned long)regs->edx);
1314 }
1316 static inline void vmx_do_msr_write(struct cpu_user_regs *regs)
1317 {
1318 VMX_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_write: ecx=%lx, eax=%lx, edx=%lx",
1319 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1320 (unsigned long)regs->edx);
1321 switch (regs->ecx) {
1322 case MSR_IA32_SYSENTER_CS:
1323 __vmwrite(GUEST_SYSENTER_CS, regs->eax);
1324 break;
1325 case MSR_IA32_SYSENTER_ESP:
1326 __vmwrite(GUEST_SYSENTER_ESP, regs->eax);
1327 break;
1328 case MSR_IA32_SYSENTER_EIP:
1329 __vmwrite(GUEST_SYSENTER_EIP, regs->eax);
1330 break;
1331 default:
1332 long_mode_do_msr_write(regs);
1333 break;
1334 }
1336 VMX_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_write returns: "
1337 "ecx=%lx, eax=%lx, edx=%lx",
1338 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1339 (unsigned long)regs->edx);
1340 }
1342 /*
1343 * Need to use this exit to reschedule
1344 */
1345 static inline void vmx_vmexit_do_hlt(void)
1346 {
1347 #if VMX_DEBUG
1348 unsigned long eip;
1349 __vmread(GUEST_RIP, &eip);
1350 #endif
1351 VMX_DBG_LOG(DBG_LEVEL_1, "vmx_vmexit_do_hlt:eip=%lx", eip);
1352 raise_softirq(SCHEDULE_SOFTIRQ);
1353 }
1355 static inline void vmx_vmexit_do_mwait(void)
1356 {
1357 #if VMX_DEBUG
1358 unsigned long eip;
1359 __vmread(GUEST_RIP, &eip);
1360 #endif
1361 VMX_DBG_LOG(DBG_LEVEL_1, "vmx_vmexit_do_mwait:eip=%lx", eip);
1362 raise_softirq(SCHEDULE_SOFTIRQ);
1363 }
1365 #define BUF_SIZ 256
1366 #define MAX_LINE 80
1367 char print_buf[BUF_SIZ];
1368 static int index;
1370 static void vmx_print_line(const char c, struct vcpu *d)
1371 {
1373 if (index == MAX_LINE || c == '\n') {
1374 if (index == MAX_LINE) {
1375 print_buf[index++] = c;
1376 }
1377 print_buf[index] = '\0';
1378 printk("(GUEST: %u) %s\n", d->domain->domain_id, (char *) &print_buf);
1379 index = 0;
1380 }
1381 else
1382 print_buf[index++] = c;
1383 }
1385 void save_vmx_cpu_user_regs(struct cpu_user_regs *ctxt)
1386 {
1387 __vmread(GUEST_SS_SELECTOR, &ctxt->ss);
1388 __vmread(GUEST_RSP, &ctxt->esp);
1389 __vmread(GUEST_RFLAGS, &ctxt->eflags);
1390 __vmread(GUEST_CS_SELECTOR, &ctxt->cs);
1391 __vmread(GUEST_RIP, &ctxt->eip);
1393 __vmread(GUEST_GS_SELECTOR, &ctxt->gs);
1394 __vmread(GUEST_FS_SELECTOR, &ctxt->fs);
1395 __vmread(GUEST_ES_SELECTOR, &ctxt->es);
1396 __vmread(GUEST_DS_SELECTOR, &ctxt->ds);
1397 }
1399 #ifdef XEN_DEBUGGER
1400 void save_cpu_user_regs(struct cpu_user_regs *regs)
1401 {
1402 __vmread(GUEST_SS_SELECTOR, &regs->xss);
1403 __vmread(GUEST_RSP, &regs->esp);
1404 __vmread(GUEST_RFLAGS, &regs->eflags);
1405 __vmread(GUEST_CS_SELECTOR, &regs->xcs);
1406 __vmread(GUEST_RIP, &regs->eip);
1408 __vmread(GUEST_GS_SELECTOR, &regs->xgs);
1409 __vmread(GUEST_FS_SELECTOR, &regs->xfs);
1410 __vmread(GUEST_ES_SELECTOR, &regs->xes);
1411 __vmread(GUEST_DS_SELECTOR, &regs->xds);
1412 }
1414 void restore_cpu_user_regs(struct cpu_user_regs *regs)
1415 {
1416 __vmwrite(GUEST_SS_SELECTOR, regs->xss);
1417 __vmwrite(GUEST_RSP, regs->esp);
1418 __vmwrite(GUEST_RFLAGS, regs->eflags);
1419 __vmwrite(GUEST_CS_SELECTOR, regs->xcs);
1420 __vmwrite(GUEST_RIP, regs->eip);
1422 __vmwrite(GUEST_GS_SELECTOR, regs->xgs);
1423 __vmwrite(GUEST_FS_SELECTOR, regs->xfs);
1424 __vmwrite(GUEST_ES_SELECTOR, regs->xes);
1425 __vmwrite(GUEST_DS_SELECTOR, regs->xds);
1426 }
1427 #endif
1429 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs regs)
1430 {
1431 unsigned int exit_reason, idtv_info_field;
1432 unsigned long exit_qualification, eip, inst_len = 0;
1433 struct vcpu *v = current;
1434 int error;
1436 if ((error = __vmread(VM_EXIT_REASON, &exit_reason)))
1437 __vmx_bug(&regs);
1439 perfc_incra(vmexits, exit_reason);
1441 __vmread(IDT_VECTORING_INFO_FIELD, &idtv_info_field);
1442 if (idtv_info_field & INTR_INFO_VALID_MASK) {
1443 if ((idtv_info_field & 0x0700) != 0x400) { /* exclude soft ints */
1444 __vmwrite(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field);
1446 if (idtv_info_field & 0x800) { /* valid error code */
1447 unsigned long error_code;
1448 __vmread(VM_EXIT_INTR_ERROR_CODE, &error_code);
1449 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
1450 }
1451 }
1452 VMX_DBG_LOG(DBG_LEVEL_1, "idtv_info_field=%x", idtv_info_field);
1453 }
1455 /* don't bother with H/W interrupts */
1456 if (exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT &&
1457 exit_reason != EXIT_REASON_VMCALL &&
1458 exit_reason != EXIT_REASON_IO_INSTRUCTION)
1459 VMX_DBG_LOG(DBG_LEVEL_0, "exit reason = %x", exit_reason);
1461 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
1462 printk("Failed vm entry\n");
1463 domain_crash_synchronous();
1464 return;
1465 }
1467 __vmread(GUEST_RIP, &eip);
1468 TRACE_3D(TRC_VMX_VMEXIT, v->domain->domain_id, eip, exit_reason);
1470 switch (exit_reason) {
1471 case EXIT_REASON_EXCEPTION_NMI:
1472 {
1473 /*
1474 * We don't set the software-interrupt exiting (INT n).
1475 * (1) We can get an exception (e.g. #PG) in the guest, or
1476 * (2) NMI
1477 */
1478 int error;
1479 unsigned int vector;
1480 unsigned long va;
1482 if ((error = __vmread(VM_EXIT_INTR_INFO, &vector))
1483 || !(vector & INTR_INFO_VALID_MASK))
1484 __vmx_bug(&regs);
1485 vector &= 0xff;
1487 perfc_incra(cause_vector, vector);
1489 TRACE_3D(TRC_VMX_VECTOR, v->domain->domain_id, eip, vector);
1490 switch (vector) {
1491 #ifdef XEN_DEBUGGER
1492 case TRAP_debug:
1493 {
1494 save_cpu_user_regs(&regs);
1495 pdb_handle_exception(1, &regs, 1);
1496 restore_cpu_user_regs(&regs);
1497 break;
1498 }
1499 case TRAP_int3:
1500 {
1501 save_cpu_user_regs(&regs);
1502 pdb_handle_exception(3, &regs, 1);
1503 restore_cpu_user_regs(&regs);
1504 break;
1505 }
1506 #else
1507 case TRAP_debug:
1508 {
1509 void store_cpu_user_regs(struct cpu_user_regs *regs);
1510 long do_sched_op(unsigned long op);
1513 store_cpu_user_regs(&regs);
1514 __vm_clear_bit(GUEST_PENDING_DBG_EXCEPTIONS, PENDING_DEBUG_EXC_BS);
1516 set_bit(_VCPUF_ctrl_pause, &current->vcpu_flags);
1517 do_sched_op(SCHEDOP_yield);
1519 break;
1520 }
1521 #endif
1522 case TRAP_no_device:
1523 {
1524 vmx_do_no_device_fault();
1525 break;
1526 }
1527 case TRAP_page_fault:
1528 {
1529 __vmread(EXIT_QUALIFICATION, &va);
1530 __vmread(VM_EXIT_INTR_ERROR_CODE, &regs.error_code);
1531 VMX_DBG_LOG(DBG_LEVEL_VMMU,
1532 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
1533 (unsigned long)regs.eax, (unsigned long)regs.ebx,
1534 (unsigned long)regs.ecx, (unsigned long)regs.edx,
1535 (unsigned long)regs.esi, (unsigned long)regs.edi);
1536 v->domain->arch.vmx_platform.mpci.inst_decoder_regs = &regs;
1538 if (!(error = vmx_do_page_fault(va, &regs))) {
1539 /*
1540 * Inject #PG using Interruption-Information Fields
1541 */
1542 vmx_inject_exception(v, TRAP_page_fault, regs.error_code);
1543 v->arch.arch_vmx.cpu_cr2 = va;
1544 TRACE_3D(TRC_VMX_INT, v->domain->domain_id, TRAP_page_fault, va);
1545 }
1546 break;
1547 }
1548 case TRAP_nmi:
1549 do_nmi(&regs, 0);
1550 break;
1551 default:
1552 vmx_reflect_exception(v);
1553 break;
1554 }
1555 break;
1556 }
1557 case EXIT_REASON_EXTERNAL_INTERRUPT:
1558 {
1559 extern asmlinkage void do_IRQ(struct cpu_user_regs *);
1560 extern void smp_apic_timer_interrupt(struct cpu_user_regs *);
1561 extern void timer_interrupt(int, void *, struct cpu_user_regs *);
1562 unsigned int vector;
1564 if ((error = __vmread(VM_EXIT_INTR_INFO, &vector))
1565 && !(vector & INTR_INFO_VALID_MASK))
1566 __vmx_bug(&regs);
1568 vector &= 0xff;
1569 local_irq_disable();
1571 if (vector == LOCAL_TIMER_VECTOR) {
1572 smp_apic_timer_interrupt(&regs);
1573 } else {
1574 regs.entry_vector = vector;
1575 do_IRQ(&regs);
1576 }
1577 break;
1578 }
1579 case EXIT_REASON_PENDING_INTERRUPT:
1580 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
1581 MONITOR_CPU_BASED_EXEC_CONTROLS);
1582 break;
1583 case EXIT_REASON_TASK_SWITCH:
1584 __vmx_bug(&regs);
1585 break;
1586 case EXIT_REASON_CPUID:
1587 __get_instruction_length(inst_len);
1588 vmx_vmexit_do_cpuid(regs.eax, &regs);
1589 __update_guest_eip(inst_len);
1590 break;
1591 case EXIT_REASON_HLT:
1592 __get_instruction_length(inst_len);
1593 __update_guest_eip(inst_len);
1594 vmx_vmexit_do_hlt();
1595 break;
1596 case EXIT_REASON_INVLPG:
1597 {
1598 unsigned long va;
1600 __vmread(EXIT_QUALIFICATION, &va);
1601 vmx_vmexit_do_invlpg(va);
1602 __get_instruction_length(inst_len);
1603 __update_guest_eip(inst_len);
1604 break;
1605 }
1606 case EXIT_REASON_VMCALL:
1607 __get_instruction_length(inst_len);
1608 __vmread(GUEST_RIP, &eip);
1609 __vmread(EXIT_QUALIFICATION, &exit_qualification);
1611 vmx_print_line(regs.eax, v); /* provides the current domain */
1612 __update_guest_eip(inst_len);
1613 break;
1614 case EXIT_REASON_CR_ACCESS:
1615 {
1616 __vmread(GUEST_RIP, &eip);
1617 __get_instruction_length(inst_len);
1618 __vmread(EXIT_QUALIFICATION, &exit_qualification);
1620 VMX_DBG_LOG(DBG_LEVEL_1, "eip = %lx, inst_len =%lx, exit_qualification = %lx",
1621 eip, inst_len, exit_qualification);
1622 if (vmx_cr_access(exit_qualification, &regs))
1623 __update_guest_eip(inst_len);
1624 break;
1625 }
1626 case EXIT_REASON_DR_ACCESS:
1627 __vmread(EXIT_QUALIFICATION, &exit_qualification);
1628 vmx_dr_access(exit_qualification, &regs);
1629 __get_instruction_length(inst_len);
1630 __update_guest_eip(inst_len);
1631 break;
1632 case EXIT_REASON_IO_INSTRUCTION:
1633 __vmread(EXIT_QUALIFICATION, &exit_qualification);
1634 __get_instruction_length(inst_len);
1635 vmx_io_instruction(&regs, exit_qualification, inst_len);
1636 break;
1637 case EXIT_REASON_MSR_READ:
1638 __get_instruction_length(inst_len);
1639 vmx_do_msr_read(&regs);
1640 __update_guest_eip(inst_len);
1641 break;
1642 case EXIT_REASON_MSR_WRITE:
1643 __vmread(GUEST_RIP, &eip);
1644 vmx_do_msr_write(&regs);
1645 __get_instruction_length(inst_len);
1646 __update_guest_eip(inst_len);
1647 break;
1648 case EXIT_REASON_MWAIT_INSTRUCTION:
1649 __get_instruction_length(inst_len);
1650 __update_guest_eip(inst_len);
1651 vmx_vmexit_do_mwait();
1652 break;
1653 default:
1654 __vmx_bug(&regs); /* should not happen */
1655 }
1657 vmx_intr_assist(v);
1658 return;
1659 }
1661 asmlinkage void load_cr2(void)
1662 {
1663 struct vcpu *d = current;
1665 local_irq_disable();
1666 #ifdef __i386__
1667 asm volatile("movl %0,%%cr2": :"r" (d->arch.arch_vmx.cpu_cr2));
1668 #else
1669 asm volatile("movq %0,%%cr2": :"r" (d->arch.arch_vmx.cpu_cr2));
1670 #endif
1671 }
1673 #endif /* CONFIG_VMX */
1675 /*
1676 * Local variables:
1677 * mode: C
1678 * c-set-style: "BSD"
1679 * c-basic-offset: 4
1680 * tab-width: 4
1681 * indent-tabs-mode: nil
1682 * End:
1683 */