ia64/xen-unstable

view xen/arch/x86/hvm/vmx/vmx.c @ 9334:56a775219c88

This patch fixes an HVM/VMX time resolution issue that causes IA32E guests to
occasionally complain about "lost ticks", as well as an APIC timer calibration issue.

Signed-off-by: Xiaowei Yang <xiaowei.yang@intel.com>
Signed-off-by: Eddie Dong <eddie.dong@intel.com>
author kaf24@firebug.cl.cam.ac.uk
date Sun Mar 19 18:52:20 2006 +0100 (2006-03-19)
parents 760f9149dbaa
children 0c6534a2e396
line source
1 /*
2 * vmx.c: handling VMX architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 *
18 */
20 #include <xen/config.h>
21 #include <xen/init.h>
22 #include <xen/lib.h>
23 #include <xen/trace.h>
24 #include <xen/sched.h>
25 #include <xen/irq.h>
26 #include <xen/softirq.h>
27 #include <xen/domain_page.h>
28 #include <asm/current.h>
29 #include <asm/io.h>
30 #include <asm/shadow.h>
31 #include <asm/regs.h>
32 #include <asm/cpufeature.h>
33 #include <asm/processor.h>
34 #include <asm/types.h>
35 #include <asm/msr.h>
36 #include <asm/spinlock.h>
37 #include <asm/hvm/hvm.h>
38 #include <asm/hvm/support.h>
39 #include <asm/hvm/vmx/vmx.h>
40 #include <asm/hvm/vmx/vmcs.h>
41 #include <asm/shadow.h>
42 #if CONFIG_PAGING_LEVELS >= 3
43 #include <asm/shadow_64.h>
44 #endif
45 #include <public/sched.h>
46 #include <public/hvm/ioreq.h>
47 #include <asm/hvm/vpic.h>
48 #include <asm/hvm/vlapic.h>
50 static unsigned long trace_values[NR_CPUS][5];
51 #define TRACE_VMEXIT(index,value) trace_values[smp_processor_id()][index]=value
53 static void vmx_ctxt_switch_from(struct vcpu *v);
54 static void vmx_ctxt_switch_to(struct vcpu *v);
56 void vmx_final_setup_guest(struct vcpu *v)
57 {
58 v->arch.schedule_tail = arch_vmx_do_launch;
59 v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
60 v->arch.ctxt_switch_to = vmx_ctxt_switch_to;
62 if ( v->vcpu_id == 0 )
63 {
64 struct domain *d = v->domain;
65 struct vcpu *vc;
67 /* Initialize monitor page table */
68 for_each_vcpu(d, vc)
69 vc->arch.monitor_table = mk_pagetable(0);
71 /*
72 * Required to do this once per domain
73 * XXX todo: add a separate function to do these.
74 */
75 memset(&d->shared_info->evtchn_mask[0], 0xff,
76 sizeof(d->shared_info->evtchn_mask));
78 /* Put the domain in shadow mode even though we're going to be using
79 * the shared 1:1 page table initially. It shouldn't hurt */
80 shadow_mode_enable(d,
81 SHM_enable|SHM_refcounts|
82 SHM_translate|SHM_external|SHM_wr_pt_pte);
83 }
84 }
86 static void vmx_relinquish_guest_resources(struct domain *d)
87 {
88 struct vcpu *v;
90 for_each_vcpu ( d, v )
91 {
92 vmx_request_clear_vmcs(v);
93 destroy_vmcs(&v->arch.hvm_vmx);
94 free_monitor_pagetable(v);
95 kill_timer(&v->arch.hvm_vmx.hlt_timer);
96 if ( hvm_apic_support(v->domain) && (VLAPIC(v) != NULL) )
97 {
98 kill_timer(&VLAPIC(v)->vlapic_timer);
99 xfree(VLAPIC(v));
100 }
101 }
103 kill_timer(&d->arch.hvm_domain.vpit.pit_timer);
105 if ( d->arch.hvm_domain.shared_page_va )
106 unmap_domain_page_global(
107 (void *)d->arch.hvm_domain.shared_page_va);
109 shadow_direct_map_clean(d);
110 }
112 #ifdef __x86_64__
114 static struct vmx_msr_state percpu_msr[NR_CPUS];
116 static u32 msr_data_index[VMX_MSR_COUNT] =
117 {
118 MSR_LSTAR, MSR_STAR, MSR_CSTAR,
119 MSR_SYSCALL_MASK, MSR_EFER,
120 };
122 static void vmx_save_segments(struct vcpu *v)
123 {
124 rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.msr_content.shadow_gs);
125 }
127 /*
128 * To avoid MSR save/restore at every VM exit/entry time, we restore
129 * the x86_64 specific MSRs at domain switch time. Since those MSRs
130 * are not modified once set for generic domains, we don't save them,
131 * but simply reset them to the values set at percpu_traps_init().
132 */
133 static void vmx_load_msrs(void)
134 {
135 struct vmx_msr_state *host_state = &percpu_msr[smp_processor_id()];
136 int i;
138 while ( host_state->flags )
139 {
140 i = find_first_set_bit(host_state->flags);
141 wrmsrl(msr_data_index[i], host_state->msr_items[i]);
142 clear_bit(i, &host_state->flags);
143 }
144 }
146 static void vmx_save_init_msrs(void)
147 {
148 struct vmx_msr_state *host_state = &percpu_msr[smp_processor_id()];
149 int i;
151 for ( i = 0; i < VMX_MSR_COUNT; i++ )
152 rdmsrl(msr_data_index[i], host_state->msr_items[i]);
153 }
155 #define CASE_READ_MSR(address) \
156 case MSR_ ## address: \
157 msr_content = msr->msr_items[VMX_INDEX_MSR_ ## address]; \
158 break
160 #define CASE_WRITE_MSR(address) \
161 case MSR_ ## address: \
162 { \
163 msr->msr_items[VMX_INDEX_MSR_ ## address] = msr_content; \
164 if (!test_bit(VMX_INDEX_MSR_ ## address, &msr->flags)) { \
165 set_bit(VMX_INDEX_MSR_ ## address, &msr->flags); \
166 } \
167 wrmsrl(MSR_ ## address, msr_content); \
168 set_bit(VMX_INDEX_MSR_ ## address, &host_state->flags); \
169 } \
170 break
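/*
 * Note on the flags bitmaps above: CASE_WRITE_MSR records the guest value,
 * loads it into the hardware MSR, and sets the matching bit in both the
 * guest's and the host's flags. vmx_restore_msrs() uses the guest bits to
 * reload guest values when the vcpu is scheduled in, while vmx_load_msrs()
 * uses the host bits to put back the values captured by
 * vmx_save_init_msrs() once the vcpu is scheduled out.
 */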
172 #define IS_CANO_ADDRESS(add) 1
173 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
174 {
175 u64 msr_content = 0;
176 struct vcpu *v = current;
177 struct vmx_msr_state *msr = &v->arch.hvm_vmx.msr_content;
179 switch ( regs->ecx ) {
180 case MSR_EFER:
181 HVM_DBG_LOG(DBG_LEVEL_2, "EFER msr_content 0x%"PRIx64, msr_content);
182 msr_content = msr->msr_items[VMX_INDEX_MSR_EFER];
184 /* the following code may not be needed */
185 if ( test_bit(VMX_CPU_STATE_LME_ENABLED, &v->arch.hvm_vmx.cpu_state) )
186 msr_content |= EFER_LME;
187 else
188 msr_content &= ~EFER_LME;
190 if ( VMX_LONG_GUEST(v) )
191 msr_content |= EFER_LMA;
192 else
193 msr_content &= ~EFER_LMA;
194 break;
196 case MSR_FS_BASE:
197 if ( !(VMX_LONG_GUEST(v)) )
198 /* XXX should this be a #GP fault? */
199 domain_crash_synchronous();
201 __vmread(GUEST_FS_BASE, &msr_content);
202 break;
204 case MSR_GS_BASE:
205 if ( !(VMX_LONG_GUEST(v)) )
206 domain_crash_synchronous();
208 __vmread(GUEST_GS_BASE, &msr_content);
209 break;
211 case MSR_SHADOW_GS_BASE:
212 msr_content = msr->shadow_gs;
213 break;
215 CASE_READ_MSR(STAR);
216 CASE_READ_MSR(LSTAR);
217 CASE_READ_MSR(CSTAR);
218 CASE_READ_MSR(SYSCALL_MASK);
220 default:
221 return 0;
222 }
224 HVM_DBG_LOG(DBG_LEVEL_2, "msr_content: 0x%"PRIx64, msr_content);
226 regs->eax = msr_content & 0xffffffff;
227 regs->edx = msr_content >> 32;
229 return 1;
230 }
232 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
233 {
234 u64 msr_content = regs->eax | ((u64)regs->edx << 32);
235 struct vcpu *v = current;
236 struct vmx_msr_state *msr = &v->arch.hvm_vmx.msr_content;
237 struct vmx_msr_state *host_state = &percpu_msr[smp_processor_id()];
239 HVM_DBG_LOG(DBG_LEVEL_1, "msr 0x%lx msr_content 0x%"PRIx64"\n",
240 (unsigned long)regs->ecx, msr_content);
242 switch ( regs->ecx ) {
243 case MSR_EFER:
244 /* offending reserved bit will cause #GP */
245 if ( msr_content & ~(EFER_LME | EFER_LMA | EFER_NX | EFER_SCE) )
246 {
247 printk("trying to set reserved bit in EFER\n");
248 vmx_inject_exception(v, TRAP_gp_fault, 0);
249 return 0;
250 }
252 /* LME: 0 -> 1 */
253 if ( msr_content & EFER_LME &&
254 !test_bit(VMX_CPU_STATE_LME_ENABLED, &v->arch.hvm_vmx.cpu_state) )
255 {
256 if ( vmx_paging_enabled(v) ||
257 !test_bit(VMX_CPU_STATE_PAE_ENABLED,
258 &v->arch.hvm_vmx.cpu_state) )
259 {
260 printk("trying to set LME bit when "
261 "in paging mode or PAE bit is not set\n");
262 vmx_inject_exception(v, TRAP_gp_fault, 0);
263 return 0;
264 }
266 set_bit(VMX_CPU_STATE_LME_ENABLED, &v->arch.hvm_vmx.cpu_state);
267 }
269 msr->msr_items[VMX_INDEX_MSR_EFER] = msr_content;
270 break;
272 case MSR_FS_BASE:
273 case MSR_GS_BASE:
274 if ( !(VMX_LONG_GUEST(v)) )
275 domain_crash_synchronous();
277 if ( !IS_CANO_ADDRESS(msr_content) )
278 {
279 HVM_DBG_LOG(DBG_LEVEL_1, "Non-canonical address in MSR write\n");
280 vmx_inject_exception(v, TRAP_gp_fault, 0);
281 return 0;
282 }
284 if ( regs->ecx == MSR_FS_BASE )
285 __vmwrite(GUEST_FS_BASE, msr_content);
286 else
287 __vmwrite(GUEST_GS_BASE, msr_content);
289 break;
291 case MSR_SHADOW_GS_BASE:
292 if ( !(VMX_LONG_GUEST(v)) )
293 domain_crash_synchronous();
295 v->arch.hvm_vmx.msr_content.shadow_gs = msr_content;
296 wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
297 break;
299 CASE_WRITE_MSR(STAR);
300 CASE_WRITE_MSR(LSTAR);
301 CASE_WRITE_MSR(CSTAR);
302 CASE_WRITE_MSR(SYSCALL_MASK);
304 default:
305 return 0;
306 }
308 return 1;
309 }
311 static void vmx_restore_msrs(struct vcpu *v)
312 {
313 int i = 0;
314 struct vmx_msr_state *guest_state;
315 struct vmx_msr_state *host_state;
316 unsigned long guest_flags;
318 guest_state = &v->arch.hvm_vmx.msr_content;
319 host_state = &percpu_msr[smp_processor_id()];
321 wrmsrl(MSR_SHADOW_GS_BASE, guest_state->shadow_gs);
322 guest_flags = guest_state->flags;
323 if (!guest_flags)
324 return;
326 while (guest_flags){
327 i = find_first_set_bit(guest_flags);
329 HVM_DBG_LOG(DBG_LEVEL_2,
330 "restore guest's index %d msr %lx with %lx\n",
331 i, (unsigned long)msr_data_index[i],
332 (unsigned long)guest_state->msr_items[i]);
333 set_bit(i, &host_state->flags);
334 wrmsrl(msr_data_index[i], guest_state->msr_items[i]);
335 clear_bit(i, &guest_flags);
336 }
337 }
338 #else /* __i386__ */
340 #define vmx_save_segments(v) ((void)0)
341 #define vmx_load_msrs() ((void)0)
342 #define vmx_restore_msrs(v) ((void)0)
343 #define vmx_save_init_msrs() ((void)0)
345 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
346 {
347 return 0;
348 }
350 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
351 {
352 return 0;
353 }
355 #endif /* __i386__ */
357 static void vmx_freeze_time(struct vcpu *v)
358 {
359 struct hvm_virpit *vpit = &v->domain->arch.hvm_domain.vpit;
361 v->domain->arch.hvm_domain.guest_time = get_guest_time(v);
362 if ( vpit->first_injected )
363 stop_timer(&(vpit->pit_timer));
364 }
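/*
 * Freezing time on the switch-out path (capturing guest_time and stopping
 * the PIT timer while the vcpu is descheduled) is what keeps the guest
 * from observing a burst of "lost ticks" when it runs again; the saved
 * guest_time is presumably used on the resume path to re-sync the
 * virtual platform timers.
 */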
366 static void vmx_ctxt_switch_from(struct vcpu *v)
367 {
368 vmx_freeze_time(v);
369 vmx_save_segments(v);
370 vmx_load_msrs();
371 }
373 static void vmx_ctxt_switch_to(struct vcpu *v)
374 {
375 vmx_restore_msrs(v);
376 }
378 void stop_vmx(void)
379 {
380 if (read_cr4() & X86_CR4_VMXE)
381 __vmxoff();
382 }
384 int vmx_initialize_guest_resources(struct vcpu *v)
385 {
386 vmx_final_setup_guest(v);
387 return 1;
388 }
390 void vmx_migrate_timers(struct vcpu *v)
391 {
392 struct hvm_virpit *vpit = &(v->domain->arch.hvm_domain.vpit);
394 migrate_timer(&vpit->pit_timer, v->processor);
395 migrate_timer(&v->arch.hvm_vmx.hlt_timer, v->processor);
396 if ( hvm_apic_support(v->domain) && VLAPIC(v))
397 migrate_timer(&(VLAPIC(v)->vlapic_timer), v->processor);
398 }
400 void vmx_store_cpu_guest_regs(struct vcpu *v, struct cpu_user_regs *regs)
401 {
402 #if defined (__x86_64__)
403 __vmread(GUEST_RFLAGS, &regs->rflags);
404 __vmread(GUEST_SS_SELECTOR, &regs->ss);
405 __vmread(GUEST_CS_SELECTOR, &regs->cs);
406 __vmread(GUEST_DS_SELECTOR, &regs->ds);
407 __vmread(GUEST_ES_SELECTOR, &regs->es);
408 __vmread(GUEST_GS_SELECTOR, &regs->gs);
409 __vmread(GUEST_FS_SELECTOR, &regs->fs);
410 __vmread(GUEST_RIP, &regs->rip);
411 __vmread(GUEST_RSP, &regs->rsp);
412 #elif defined (__i386__)
413 __vmread(GUEST_RFLAGS, &regs->eflags);
414 __vmread(GUEST_SS_SELECTOR, &regs->ss);
415 __vmread(GUEST_CS_SELECTOR, &regs->cs);
416 __vmread(GUEST_DS_SELECTOR, &regs->ds);
417 __vmread(GUEST_ES_SELECTOR, &regs->es);
418 __vmread(GUEST_GS_SELECTOR, &regs->gs);
419 __vmread(GUEST_FS_SELECTOR, &regs->fs);
420 __vmread(GUEST_RIP, &regs->eip);
421 __vmread(GUEST_RSP, &regs->esp);
422 #else
423 #error Unsupported architecture
424 #endif
425 }
427 void vmx_load_cpu_guest_regs(struct vcpu *v, struct cpu_user_regs *regs)
428 {
429 #if defined (__x86_64__)
430 __vmwrite(GUEST_SS_SELECTOR, regs->ss);
431 __vmwrite(GUEST_RSP, regs->rsp);
433 __vmwrite(GUEST_RFLAGS, regs->rflags);
434 if (regs->rflags & EF_TF)
435 __vm_set_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
436 else
437 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
439 __vmwrite(GUEST_CS_SELECTOR, regs->cs);
440 __vmwrite(GUEST_RIP, regs->rip);
441 #elif defined (__i386__)
442 __vmwrite(GUEST_SS_SELECTOR, regs->ss);
443 __vmwrite(GUEST_RSP, regs->esp);
445 __vmwrite(GUEST_RFLAGS, regs->eflags);
446 if (regs->eflags & EF_TF)
447 __vm_set_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
448 else
449 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
451 __vmwrite(GUEST_CS_SELECTOR, regs->cs);
452 __vmwrite(GUEST_RIP, regs->eip);
453 #else
454 #error Unsupported architecture
455 #endif
456 }
458 void vmx_store_cpu_guest_ctrl_regs(struct vcpu *v, unsigned long crs[8])
459 {
460 __vmread(CR0_READ_SHADOW, &crs[0]);
461 __vmread(GUEST_CR3, &crs[3]);
462 __vmread(CR4_READ_SHADOW, &crs[4]);
463 }
465 void vmx_modify_guest_state(struct vcpu *v)
466 {
467 modify_vmcs(&v->arch.hvm_vmx, &v->arch.guest_context.user_regs);
468 }
470 int vmx_realmode(struct vcpu *v)
471 {
472 unsigned long rflags;
474 __vmread(GUEST_RFLAGS, &rflags);
475 return rflags & X86_EFLAGS_VM;
476 }
478 int vmx_instruction_length(struct vcpu *v)
479 {
480 unsigned long inst_len;
482 if (__vmread(VM_EXIT_INSTRUCTION_LEN, &inst_len))
483 return 0;
484 return inst_len;
485 }
487 unsigned long vmx_get_ctrl_reg(struct vcpu *v, unsigned int num)
488 {
489 switch ( num )
490 {
491 case 0:
492 return v->arch.hvm_vmx.cpu_cr0;
493 case 2:
494 return v->arch.hvm_vmx.cpu_cr2;
495 case 3:
496 return v->arch.hvm_vmx.cpu_cr3;
497 default:
498 BUG();
499 }
500 return 0; /* dummy */
501 }
503 /* SMP VMX guest support */
504 void vmx_init_ap_context(struct vcpu_guest_context *ctxt,
505 int vcpuid, int trampoline_vector)
506 {
507 int i;
509 memset(ctxt, 0, sizeof(*ctxt));
511 /*
512 * Initial register values:
513 */
514 ctxt->user_regs.eip = VMXASSIST_BASE;
515 ctxt->user_regs.edx = vcpuid;
516 ctxt->user_regs.ebx = trampoline_vector;
518 ctxt->flags = VGCF_HVM_GUEST;
520 /* Virtual IDT is empty at start-of-day. */
521 for ( i = 0; i < 256; i++ )
522 {
523 ctxt->trap_ctxt[i].vector = i;
524 ctxt->trap_ctxt[i].cs = FLAT_KERNEL_CS;
525 }
527 /* No callback handlers. */
528 #if defined(__i386__)
529 ctxt->event_callback_cs = FLAT_KERNEL_CS;
530 ctxt->failsafe_callback_cs = FLAT_KERNEL_CS;
531 #endif
532 }
534 void do_nmi(struct cpu_user_regs *);
536 static int check_vmx_controls(u32 ctrls, u32 msr)
537 {
538 u32 vmx_msr_low, vmx_msr_high;
540 rdmsr(msr, vmx_msr_low, vmx_msr_high);
541 if ( (ctrls < vmx_msr_low) || (ctrls > vmx_msr_high) )
542 {
543 printk("Insufficient VMX capability 0x%x, "
544 "msr=0x%x,low=0x%8x,high=0x%x\n",
545 ctrls, msr, vmx_msr_low, vmx_msr_high);
546 return 0;
547 }
548 return 1;
549 }
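/*
 * The capability MSR read above encodes, roughly, which control bits must
 * be set (low word) and which may be set (high word); the numeric range
 * comparison is only a coarse sanity check that the static MONITOR_*
 * control values fall within what the CPU advertises.
 */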
551 int start_vmx(void)
552 {
553 struct vmcs_struct *vmcs;
554 u32 ecx;
555 u32 eax, edx;
556 u64 phys_vmcs; /* debugging */
558 /*
559 * Xen does not fill x86_capability words except 0.
560 */
561 ecx = cpuid_ecx(1);
562 boot_cpu_data.x86_capability[4] = ecx;
564 if (!(test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability)))
565 return 0;
567 rdmsr(IA32_FEATURE_CONTROL_MSR, eax, edx);
569 if (eax & IA32_FEATURE_CONTROL_MSR_LOCK) {
570 if ((eax & IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON) == 0x0) {
571 printk("VMX disabled by Feature Control MSR.\n");
572 return 0;
573 }
574 }
575 else {
576 wrmsr(IA32_FEATURE_CONTROL_MSR,
577 IA32_FEATURE_CONTROL_MSR_LOCK |
578 IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON, 0);
579 }
581 if (!check_vmx_controls(MONITOR_PIN_BASED_EXEC_CONTROLS,
582 MSR_IA32_VMX_PINBASED_CTLS_MSR))
583 return 0;
584 if (!check_vmx_controls(MONITOR_CPU_BASED_EXEC_CONTROLS,
585 MSR_IA32_VMX_PROCBASED_CTLS_MSR))
586 return 0;
587 if (!check_vmx_controls(MONITOR_VM_EXIT_CONTROLS,
588 MSR_IA32_VMX_EXIT_CTLS_MSR))
589 return 0;
590 if (!check_vmx_controls(MONITOR_VM_ENTRY_CONTROLS,
591 MSR_IA32_VMX_ENTRY_CTLS_MSR))
592 return 0;
594 set_in_cr4(X86_CR4_VMXE); /* Enable VMXE */
596 if (!(vmcs = alloc_vmcs())) {
597 printk("Failed to allocate VMCS\n");
598 return 0;
599 }
601 phys_vmcs = (u64) virt_to_maddr(vmcs);
603 if (!(__vmxon(phys_vmcs))) {
604 printk("VMXON is done\n");
605 }
607 vmx_save_init_msrs();
609 /* Setup HVM interfaces */
610 hvm_funcs.disable = stop_vmx;
612 hvm_funcs.initialize_guest_resources = vmx_initialize_guest_resources;
613 hvm_funcs.relinquish_guest_resources = vmx_relinquish_guest_resources;
615 hvm_funcs.store_cpu_guest_regs = vmx_store_cpu_guest_regs;
616 hvm_funcs.load_cpu_guest_regs = vmx_load_cpu_guest_regs;
618 hvm_funcs.store_cpu_guest_ctrl_regs = vmx_store_cpu_guest_ctrl_regs;
619 hvm_funcs.modify_guest_state = vmx_modify_guest_state;
621 hvm_funcs.realmode = vmx_realmode;
622 hvm_funcs.paging_enabled = vmx_paging_enabled;
623 hvm_funcs.instruction_length = vmx_instruction_length;
624 hvm_funcs.get_guest_ctrl_reg = vmx_get_ctrl_reg;
626 hvm_funcs.init_ap_context = vmx_init_ap_context;
628 hvm_enabled = 1;
630 return 1;
631 }
633 /*
634 * Not all cases receive a valid value in the VM-exit instruction length field.
635 */
636 #define __get_instruction_length(len) \
637 __vmread(VM_EXIT_INSTRUCTION_LEN, &(len)); \
638 if ((len) < 1 || (len) > 15) \
639 __hvm_bug(&regs);
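/*
 * Note: the macro above expands in place and refers to the caller's local
 * `regs`; a length outside 1..15 means this exit type did not supply a
 * valid instruction length, which is treated as a bug.
 */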
641 static void inline __update_guest_eip(unsigned long inst_len)
642 {
643 unsigned long current_eip;
645 __vmread(GUEST_RIP, &current_eip);
646 __vmwrite(GUEST_RIP, current_eip + inst_len);
647 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
648 }
651 static int vmx_do_page_fault(unsigned long va, struct cpu_user_regs *regs)
652 {
653 unsigned long gpa; /* FIXME: PAE */
654 int result;
656 #if 0 /* keep for debugging */
657 {
658 unsigned long eip;
660 __vmread(GUEST_RIP, &eip);
661 HVM_DBG_LOG(DBG_LEVEL_VMMU,
662 "vmx_do_page_fault = 0x%lx, eip = %lx, error_code = %lx",
663 va, eip, (unsigned long)regs->error_code);
664 }
665 #endif
667 if ( !vmx_paging_enabled(current) )
668 {
669 /* construct 1-to-1 direct mapping */
670 if ( shadow_direct_map_fault(va, regs) )
671 return 1;
673 handle_mmio(va, va);
674 TRACE_VMEXIT (2,2);
675 return 1;
676 }
677 gpa = gva_to_gpa(va);
679 /* Use 1:1 page table to identify MMIO address space */
680 if ( mmio_space(gpa) ){
681 struct vcpu *v = current;
682 /* No support for APIC */
683 if (!hvm_apic_support(v->domain) && gpa >= 0xFEC00000) {
684 u32 inst_len;
685 __vmread(VM_EXIT_INSTRUCTION_LEN, &(inst_len));
686 __update_guest_eip(inst_len);
687 return 1;
688 }
689 TRACE_VMEXIT (2,2);
690 handle_mmio(va, gpa);
691 return 1;
692 }
694 result = shadow_fault(va, regs);
695 TRACE_VMEXIT (2,result);
696 #if 0
697 if ( !result )
698 {
699 __vmread(GUEST_RIP, &eip);
700 printk("vmx pgfault to guest va=%lx eip=%lx\n", va, eip);
701 }
702 #endif
704 return result;
705 }
707 static void vmx_do_no_device_fault(void)
708 {
709 unsigned long cr0;
710 struct vcpu *v = current;
712 setup_fpu(current);
713 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
715 /* Disable TS in guest CR0 unless the guest wants the exception too. */
716 __vmread_vcpu(v, CR0_READ_SHADOW, &cr0);
717 if ( !(cr0 & X86_CR0_TS) )
718 {
719 __vmread_vcpu(v, GUEST_CR0, &cr0);
720 cr0 &= ~X86_CR0_TS;
721 __vmwrite(GUEST_CR0, cr0);
722 }
723 }
725 /* Reserved bits: [31:15], [12:11], [9], [6], [2:1] */
726 #define VMX_VCPU_CPUID_L1_RESERVED 0xffff9a46
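/*
 * Mask versus the bit list above:
 *   [31:15] -> 0xffff8000, [12:11] -> 0x00001800, [9] -> 0x00000200,
 *   [6]     -> 0x00000040, [2:1]   -> 0x00000006
 *   total    = 0xffff9a46
 */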
728 static void vmx_vmexit_do_cpuid(struct cpu_user_regs *regs)
729 {
730 unsigned int input = (unsigned int)regs->eax;
731 unsigned int count = (unsigned int)regs->ecx;
732 unsigned int eax, ebx, ecx, edx;
733 unsigned long eip;
734 struct vcpu *v = current;
736 __vmread(GUEST_RIP, &eip);
738 HVM_DBG_LOG(DBG_LEVEL_3, "(eax) 0x%08lx, (ebx) 0x%08lx, "
739 "(ecx) 0x%08lx, (edx) 0x%08lx, (esi) 0x%08lx, (edi) 0x%08lx",
740 (unsigned long)regs->eax, (unsigned long)regs->ebx,
741 (unsigned long)regs->ecx, (unsigned long)regs->edx,
742 (unsigned long)regs->esi, (unsigned long)regs->edi);
744 if ( input == 4 )
745 cpuid_count(input, count, &eax, &ebx, &ecx, &edx);
746 else
747 cpuid(input, &eax, &ebx, &ecx, &edx);
749 if ( input == 1 )
750 {
751 if ( hvm_apic_support(v->domain) &&
752 !vlapic_global_enabled((VLAPIC(v))) )
753 clear_bit(X86_FEATURE_APIC, &edx);
755 #if CONFIG_PAGING_LEVELS < 3
756 clear_bit(X86_FEATURE_PAE, &edx);
757 clear_bit(X86_FEATURE_PSE, &edx);
758 clear_bit(X86_FEATURE_PSE36, &edx);
759 #else
760 if ( v->domain->arch.ops->guest_paging_levels == PAGING_L2 )
761 {
762 if ( !v->domain->arch.hvm_domain.pae_enabled )
763 clear_bit(X86_FEATURE_PAE, &edx);
764 clear_bit(X86_FEATURE_PSE, &edx);
765 clear_bit(X86_FEATURE_PSE36, &edx);
766 }
767 #endif
769 /* Unsupportable for virtualised CPUs. */
770 ecx &= ~VMX_VCPU_CPUID_L1_RESERVED; /* mask off reserved bits */
771 clear_bit(X86_FEATURE_VMXE & 31, &ecx);
772 clear_bit(X86_FEATURE_MWAIT & 31, &ecx);
773 }
774 #ifdef __i386__
775 else if ( input == 0x80000001 )
776 {
777 /* Mask feature for Intel ia32e or AMD long mode. */
778 clear_bit(X86_FEATURE_LM & 31, &edx);
779 }
780 #endif
782 regs->eax = (unsigned long) eax;
783 regs->ebx = (unsigned long) ebx;
784 regs->ecx = (unsigned long) ecx;
785 regs->edx = (unsigned long) edx;
787 HVM_DBG_LOG(DBG_LEVEL_3, "eip@%lx, input: 0x%lx, "
788 "output: eax = 0x%08lx, ebx = 0x%08lx, "
789 "ecx = 0x%08lx, edx = 0x%08lx",
790 (unsigned long)eip, (unsigned long)input,
791 (unsigned long)eax, (unsigned long)ebx,
792 (unsigned long)ecx, (unsigned long)edx);
793 }
795 #define CASE_GET_REG_P(REG, reg) \
796 case REG_ ## REG: reg_p = (unsigned long *)&(regs->reg); break
798 static void vmx_dr_access (unsigned long exit_qualification, struct cpu_user_regs *regs)
799 {
800 unsigned int reg;
801 unsigned long *reg_p = 0;
802 struct vcpu *v = current;
803 unsigned long eip;
805 __vmread(GUEST_RIP, &eip);
807 reg = exit_qualification & DEBUG_REG_ACCESS_NUM;
809 HVM_DBG_LOG(DBG_LEVEL_1,
810 "vmx_dr_access : eip=%lx, reg=%d, exit_qualification = %lx",
811 eip, reg, exit_qualification);
813 switch(exit_qualification & DEBUG_REG_ACCESS_REG) {
814 CASE_GET_REG_P(EAX, eax);
815 CASE_GET_REG_P(ECX, ecx);
816 CASE_GET_REG_P(EDX, edx);
817 CASE_GET_REG_P(EBX, ebx);
818 CASE_GET_REG_P(EBP, ebp);
819 CASE_GET_REG_P(ESI, esi);
820 CASE_GET_REG_P(EDI, edi);
821 case REG_ESP:
822 break;
823 default:
824 __hvm_bug(regs);
825 }
827 switch (exit_qualification & DEBUG_REG_ACCESS_TYPE) {
828 case TYPE_MOV_TO_DR:
829 /* don't need to check the range */
830 if (reg != REG_ESP)
831 v->arch.guest_context.debugreg[reg] = *reg_p;
832 else {
833 unsigned long value;
834 __vmread(GUEST_RSP, &value);
835 v->arch.guest_context.debugreg[reg] = value;
836 }
837 break;
838 case TYPE_MOV_FROM_DR:
839 if (reg != REG_ESP)
840 *reg_p = v->arch.guest_context.debugreg[reg];
841 else {
842 __vmwrite(GUEST_RSP, v->arch.guest_context.debugreg[reg]);
843 }
844 break;
845 }
846 }
848 /*
849 * Invalidate the TLB for va. Invalidate the shadow page corresponding to
850 * the address va.
851 */
852 static void vmx_vmexit_do_invlpg(unsigned long va)
853 {
854 unsigned long eip;
855 struct vcpu *v = current;
857 __vmread(GUEST_RIP, &eip);
859 HVM_DBG_LOG(DBG_LEVEL_VMMU, "vmx_vmexit_do_invlpg: eip=%lx, va=%lx",
860 eip, va);
862 /*
863 * We do the safest thing first, then try to update the shadow by
864 * copying from the guest.
865 */
866 shadow_invlpg(v, va);
867 }
869 static int check_for_null_selector(unsigned long eip)
870 {
871 unsigned char inst[MAX_INST_LEN];
872 unsigned long sel;
873 int i, inst_len;
874 int inst_copy_from_guest(unsigned char *, unsigned long, int);
876 __vmread(VM_EXIT_INSTRUCTION_LEN, &inst_len);
877 memset(inst, 0, MAX_INST_LEN);
878 if (inst_copy_from_guest(inst, eip, inst_len) != inst_len) {
879 printf("check_for_null_selector: get guest instruction failed\n");
880 domain_crash_synchronous();
881 }
883 for (i = 0; i < inst_len; i++) {
884 switch (inst[i]) {
885 case 0xf3: /* REPZ */
886 case 0xf2: /* REPNZ */
887 case 0xf0: /* LOCK */
888 case 0x66: /* data32 */
889 case 0x67: /* addr32 */
890 continue;
891 case 0x2e: /* CS */
892 __vmread(GUEST_CS_SELECTOR, &sel);
893 break;
894 case 0x36: /* SS */
895 __vmread(GUEST_SS_SELECTOR, &sel);
896 break;
897 case 0x26: /* ES */
898 __vmread(GUEST_ES_SELECTOR, &sel);
899 break;
900 case 0x64: /* FS */
901 __vmread(GUEST_FS_SELECTOR, &sel);
902 break;
903 case 0x65: /* GS */
904 __vmread(GUEST_GS_SELECTOR, &sel);
905 break;
906 case 0x3e: /* DS */
907 /* FALLTHROUGH */
908 default:
909 /* DS is the default */
910 __vmread(GUEST_DS_SELECTOR, &sel);
911 }
912 return sel == 0 ? 1 : 0;
913 }
915 return 0;
916 }
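/*
 * The scan above walks the instruction's prefix bytes, picks the segment
 * chosen by any segment-override prefix (DS by default), and reports
 * whether that selector is null; it returns as soon as it sees the first
 * non-prefix byte.
 */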
918 extern void send_pio_req(struct cpu_user_regs *regs, unsigned long port,
919 unsigned long count, int size, long value,
920 int dir, int pvalid);
922 static void vmx_io_instruction(struct cpu_user_regs *regs,
923 unsigned long exit_qualification, unsigned long inst_len)
924 {
925 struct mmio_op *mmio_opp;
926 unsigned long eip, cs, eflags;
927 unsigned long port, size, dir;
928 int vm86;
930 mmio_opp = &current->arch.hvm_vcpu.mmio_op;
931 mmio_opp->instr = INSTR_PIO;
932 mmio_opp->flags = 0;
934 __vmread(GUEST_RIP, &eip);
935 __vmread(GUEST_CS_SELECTOR, &cs);
936 __vmread(GUEST_RFLAGS, &eflags);
937 vm86 = eflags & X86_EFLAGS_VM ? 1 : 0;
939 HVM_DBG_LOG(DBG_LEVEL_IO,
940 "vmx_io_instruction: vm86 %d, eip=%lx:%lx, "
941 "exit_qualification = %lx",
942 vm86, cs, eip, exit_qualification);
944 if (test_bit(6, &exit_qualification))
945 port = (exit_qualification >> 16) & 0xFFFF;
946 else
947 port = regs->edx & 0xffff;
948 TRACE_VMEXIT(2, port);
949 size = (exit_qualification & 7) + 1;
950 dir = test_bit(3, &exit_qualification); /* direction */
952 if (test_bit(4, &exit_qualification)) { /* string instruction */
953 unsigned long addr, count = 1;
954 int sign = regs->eflags & EF_DF ? -1 : 1;
956 __vmread(GUEST_LINEAR_ADDRESS, &addr);
958 /*
959 * In protected mode, the guest linear address is invalid if the
960 * selector is null.
961 */
962 if (!vm86 && check_for_null_selector(eip))
963 addr = dir == IOREQ_WRITE ? regs->esi : regs->edi;
965 if (test_bit(5, &exit_qualification)) { /* "rep" prefix */
966 mmio_opp->flags |= REPZ;
967 count = vm86 ? regs->ecx & 0xFFFF : regs->ecx;
968 }
970 /*
971 * Handle string pio instructions that cross pages or that
972 * are unaligned. See the comments in hvm_domain.c/handle_mmio()
973 */
974 if ((addr & PAGE_MASK) != ((addr + size - 1) & PAGE_MASK)) {
975 unsigned long value = 0;
977 mmio_opp->flags |= OVERLAP;
978 if (dir == IOREQ_WRITE)
979 hvm_copy(&value, addr, size, HVM_COPY_IN);
980 send_pio_req(regs, port, 1, size, value, dir, 0);
981 } else {
982 if ((addr & PAGE_MASK) != ((addr + count * size - 1) & PAGE_MASK)) {
983 if (sign > 0)
984 count = (PAGE_SIZE - (addr & ~PAGE_MASK)) / size;
985 else
986 count = (addr & ~PAGE_MASK) / size;
987 } else
988 __update_guest_eip(inst_len);
990 send_pio_req(regs, port, count, size, addr, dir, 1);
991 }
992 } else {
993 if (port == 0xe9 && dir == IOREQ_WRITE && size == 1)
994 hvm_print_line(current, regs->eax); /* guest debug output */
996 __update_guest_eip(inst_len);
997 send_pio_req(regs, port, 1, size, regs->eax, dir, 0);
998 }
999 }
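/*
 * Summary of the string-PIO handling above: an element that itself
 * straddles a page boundary is flagged OVERLAP and emulated one element
 * at a time; otherwise the repeat count is clamped at the page boundary,
 * and the guest eip is only advanced once the remaining request fits
 * within a single page.
 */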
1001 int
1002 vmx_world_save(struct vcpu *v, struct vmx_assist_context *c)
1004 unsigned long inst_len;
1005 int error = 0;
1007 error |= __vmread(VM_EXIT_INSTRUCTION_LEN, &inst_len);
1008 error |= __vmread(GUEST_RIP, &c->eip);
1009 c->eip += inst_len; /* skip transition instruction */
1010 error |= __vmread(GUEST_RSP, &c->esp);
1011 error |= __vmread(GUEST_RFLAGS, &c->eflags);
1013 error |= __vmread(CR0_READ_SHADOW, &c->cr0);
1014 c->cr3 = v->arch.hvm_vmx.cpu_cr3;
1015 error |= __vmread(CR4_READ_SHADOW, &c->cr4);
1017 error |= __vmread(GUEST_IDTR_LIMIT, &c->idtr_limit);
1018 error |= __vmread(GUEST_IDTR_BASE, &c->idtr_base);
1020 error |= __vmread(GUEST_GDTR_LIMIT, &c->gdtr_limit);
1021 error |= __vmread(GUEST_GDTR_BASE, &c->gdtr_base);
1023 error |= __vmread(GUEST_CS_SELECTOR, &c->cs_sel);
1024 error |= __vmread(GUEST_CS_LIMIT, &c->cs_limit);
1025 error |= __vmread(GUEST_CS_BASE, &c->cs_base);
1026 error |= __vmread(GUEST_CS_AR_BYTES, &c->cs_arbytes.bytes);
1028 error |= __vmread(GUEST_DS_SELECTOR, &c->ds_sel);
1029 error |= __vmread(GUEST_DS_LIMIT, &c->ds_limit);
1030 error |= __vmread(GUEST_DS_BASE, &c->ds_base);
1031 error |= __vmread(GUEST_DS_AR_BYTES, &c->ds_arbytes.bytes);
1033 error |= __vmread(GUEST_ES_SELECTOR, &c->es_sel);
1034 error |= __vmread(GUEST_ES_LIMIT, &c->es_limit);
1035 error |= __vmread(GUEST_ES_BASE, &c->es_base);
1036 error |= __vmread(GUEST_ES_AR_BYTES, &c->es_arbytes.bytes);
1038 error |= __vmread(GUEST_SS_SELECTOR, &c->ss_sel);
1039 error |= __vmread(GUEST_SS_LIMIT, &c->ss_limit);
1040 error |= __vmread(GUEST_SS_BASE, &c->ss_base);
1041 error |= __vmread(GUEST_SS_AR_BYTES, &c->ss_arbytes.bytes);
1043 error |= __vmread(GUEST_FS_SELECTOR, &c->fs_sel);
1044 error |= __vmread(GUEST_FS_LIMIT, &c->fs_limit);
1045 error |= __vmread(GUEST_FS_BASE, &c->fs_base);
1046 error |= __vmread(GUEST_FS_AR_BYTES, &c->fs_arbytes.bytes);
1048 error |= __vmread(GUEST_GS_SELECTOR, &c->gs_sel);
1049 error |= __vmread(GUEST_GS_LIMIT, &c->gs_limit);
1050 error |= __vmread(GUEST_GS_BASE, &c->gs_base);
1051 error |= __vmread(GUEST_GS_AR_BYTES, &c->gs_arbytes.bytes);
1053 error |= __vmread(GUEST_TR_SELECTOR, &c->tr_sel);
1054 error |= __vmread(GUEST_TR_LIMIT, &c->tr_limit);
1055 error |= __vmread(GUEST_TR_BASE, &c->tr_base);
1056 error |= __vmread(GUEST_TR_AR_BYTES, &c->tr_arbytes.bytes);
1058 error |= __vmread(GUEST_LDTR_SELECTOR, &c->ldtr_sel);
1059 error |= __vmread(GUEST_LDTR_LIMIT, &c->ldtr_limit);
1060 error |= __vmread(GUEST_LDTR_BASE, &c->ldtr_base);
1061 error |= __vmread(GUEST_LDTR_AR_BYTES, &c->ldtr_arbytes.bytes);
1063 return !error;
1066 int
1067 vmx_world_restore(struct vcpu *v, struct vmx_assist_context *c)
1069 unsigned long mfn, old_cr4, old_base_mfn;
1070 int error = 0;
1072 error |= __vmwrite(GUEST_RIP, c->eip);
1073 error |= __vmwrite(GUEST_RSP, c->esp);
1074 error |= __vmwrite(GUEST_RFLAGS, c->eflags);
1076 error |= __vmwrite(CR0_READ_SHADOW, c->cr0);
1078 if (!vmx_paging_enabled(v)) {
1079 HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
1080 __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table));
1081 goto skip_cr3;
1084 if (c->cr3 == v->arch.hvm_vmx.cpu_cr3) {
1085 /*
1086 * This is a simple TLB flush, implying the guest has
1087 * removed some translation or changed page attributes.
1088 * We simply invalidate the shadow.
1089 */
1090 mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
1091 if (mfn != pagetable_get_pfn(v->arch.guest_table)) {
1092 printk("Invalid CR3 value=%x", c->cr3);
1093 domain_crash_synchronous();
1094 return 0;
1096 shadow_sync_all(v->domain);
1097 } else {
1098 /*
1099 * If different, make a shadow. Check if the PDBR is valid
1100 * first.
1101 */
1102 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %x", c->cr3);
1103 if ((c->cr3 >> PAGE_SHIFT) > v->domain->max_pages) {
1104 printk("Invalid CR3 value=%x", c->cr3);
1105 domain_crash_synchronous();
1106 return 0;
1108 mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
1109 if(!get_page(mfn_to_page(mfn), v->domain))
1110 return 0;
1111 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1112 v->arch.guest_table = mk_pagetable((u64)mfn << PAGE_SHIFT);
1113 if (old_base_mfn)
1114 put_page(mfn_to_page(old_base_mfn));
1115 /*
1116 * arch.shadow_table should now hold the next CR3 for shadow
1117 */
1118 v->arch.hvm_vmx.cpu_cr3 = c->cr3;
1119 update_pagetables(v);
1120 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3);
1121 __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
1124 skip_cr3:
1126 error |= __vmread(CR4_READ_SHADOW, &old_cr4);
1127 error |= __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
1128 error |= __vmwrite(CR4_READ_SHADOW, c->cr4);
1130 error |= __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
1131 error |= __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
1133 error |= __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
1134 error |= __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
1136 error |= __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
1137 error |= __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
1138 error |= __vmwrite(GUEST_CS_BASE, c->cs_base);
1139 error |= __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes.bytes);
1141 error |= __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
1142 error |= __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
1143 error |= __vmwrite(GUEST_DS_BASE, c->ds_base);
1144 error |= __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes.bytes);
1146 error |= __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
1147 error |= __vmwrite(GUEST_ES_LIMIT, c->es_limit);
1148 error |= __vmwrite(GUEST_ES_BASE, c->es_base);
1149 error |= __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes.bytes);
1151 error |= __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
1152 error |= __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
1153 error |= __vmwrite(GUEST_SS_BASE, c->ss_base);
1154 error |= __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes.bytes);
1156 error |= __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
1157 error |= __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
1158 error |= __vmwrite(GUEST_FS_BASE, c->fs_base);
1159 error |= __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes.bytes);
1161 error |= __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
1162 error |= __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
1163 error |= __vmwrite(GUEST_GS_BASE, c->gs_base);
1164 error |= __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes.bytes);
1166 error |= __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
1167 error |= __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
1168 error |= __vmwrite(GUEST_TR_BASE, c->tr_base);
1169 error |= __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes.bytes);
1171 error |= __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
1172 error |= __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
1173 error |= __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
1174 error |= __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes.bytes);
1176 return !error;
1179 enum { VMX_ASSIST_INVOKE = 0, VMX_ASSIST_RESTORE };
1181 int
1182 vmx_assist(struct vcpu *v, int mode)
1184 struct vmx_assist_context c;
1185 u32 magic;
1186 u32 cp;
1188 /* make sure vmxassist exists (this is not an error) */
1189 if (!hvm_copy(&magic, VMXASSIST_MAGIC_OFFSET, sizeof(magic), HVM_COPY_IN))
1190 return 0;
1191 if (magic != VMXASSIST_MAGIC)
1192 return 0;
1194 switch (mode) {
1195 /*
1196 * Transfer control to vmxassist.
1197 * Store the current context in VMXASSIST_OLD_CONTEXT and load
1198 * the new VMXASSIST_NEW_CONTEXT context. This context was created
1199 * by vmxassist and will transfer control to it.
1200 */
1201 case VMX_ASSIST_INVOKE:
1202 /* save the old context */
1203 if (!hvm_copy(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp), HVM_COPY_IN))
1204 goto error;
1205 if (cp != 0) {
1206 if (!vmx_world_save(v, &c))
1207 goto error;
1208 if (!hvm_copy(&c, cp, sizeof(c), HVM_COPY_OUT))
1209 goto error;
1212 /* restore the new context, this should activate vmxassist */
1213 if (!hvm_copy(&cp, VMXASSIST_NEW_CONTEXT, sizeof(cp), HVM_COPY_IN))
1214 goto error;
1215 if (cp != 0) {
1216 if (!hvm_copy(&c, cp, sizeof(c), HVM_COPY_IN))
1217 goto error;
1218 if (!vmx_world_restore(v, &c))
1219 goto error;
1220 return 1;
1222 break;
1224 /*
1225 * Restore the VMXASSIST_OLD_CONTEXT that was saved by VMX_ASSIST_INVOKE
1226 * above.
1227 */
1228 case VMX_ASSIST_RESTORE:
1229 /* save the old context */
1230 if (!hvm_copy(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp), HVM_COPY_IN))
1231 goto error;
1232 if (cp != 0) {
1233 if (!hvm_copy(&c, cp, sizeof(c), HVM_COPY_IN))
1234 goto error;
1235 if (!vmx_world_restore(v, &c))
1236 goto error;
1237 return 1;
1239 break;
1242 error:
1243 printf("Failed to transfer to vmxassist\n");
1244 domain_crash_synchronous();
1245 return 0;
1248 static int vmx_set_cr0(unsigned long value)
1250 struct vcpu *v = current;
1251 unsigned long mfn;
1252 unsigned long eip;
1253 int paging_enabled;
1254 unsigned long vm_entry_value;
1255 unsigned long old_cr0;
1257 /*
1258 * CR0: We don't want to lose PE and PG.
1259 */
1260 __vmread_vcpu(v, CR0_READ_SHADOW, &old_cr0);
1261 paging_enabled = (old_cr0 & X86_CR0_PE) && (old_cr0 & X86_CR0_PG);
1263 /* TS cleared? Then initialise FPU now. */
1264 if ( !(value & X86_CR0_TS) )
1266 setup_fpu(v);
1267 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
1270 __vmwrite(GUEST_CR0, value | X86_CR0_PE | X86_CR0_PG | X86_CR0_NE);
1271 __vmwrite(CR0_READ_SHADOW, value);
1273 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx\n", value);
1275 if ( (value & X86_CR0_PE) && (value & X86_CR0_PG) && !paging_enabled )
1277 /*
1278 * Trying to enable guest paging.
1279 * The guest CR3 must point to a guest physical address.
1280 */
1281 if ( !VALID_MFN(mfn = get_mfn_from_gpfn(
1282 v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)) ||
1283 !get_page(mfn_to_page(mfn), v->domain) )
1285 printk("Invalid CR3 value = %lx", v->arch.hvm_vmx.cpu_cr3);
1286 domain_crash_synchronous(); /* need to take a clean path */
1289 #if defined(__x86_64__)
1290 if ( test_bit(VMX_CPU_STATE_LME_ENABLED,
1291 &v->arch.hvm_vmx.cpu_state) &&
1292 !test_bit(VMX_CPU_STATE_PAE_ENABLED,
1293 &v->arch.hvm_vmx.cpu_state) )
1295 HVM_DBG_LOG(DBG_LEVEL_1, "Enabling paging before PAE is enabled\n");
1296 vmx_inject_exception(v, TRAP_gp_fault, 0);
1299 if ( test_bit(VMX_CPU_STATE_LME_ENABLED,
1300 &v->arch.hvm_vmx.cpu_state) )
1302 /* PAE should already be enabled here */
1303 HVM_DBG_LOG(DBG_LEVEL_1, "Enable long mode\n");
1304 set_bit(VMX_CPU_STATE_LMA_ENABLED,
1305 &v->arch.hvm_vmx.cpu_state);
1307 __vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
1308 vm_entry_value |= VM_ENTRY_CONTROLS_IA32E_MODE;
1309 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1311 if ( !shadow_set_guest_paging_levels(v->domain, 4) ) {
1312 printk("Unsupported guest paging levels\n");
1313 domain_crash_synchronous(); /* need to take a clean path */
1316 else
1317 #endif /* __x86_64__ */
1319 #if CONFIG_PAGING_LEVELS >= 3
1320 if ( !shadow_set_guest_paging_levels(v->domain, 2) ) {
1321 printk("Unsupported guest paging levels\n");
1322 domain_crash_synchronous(); /* need to take a clean path */
1324 #endif
1327 /*
1328 * Now arch.guest_table points to machine physical.
1329 */
1330 v->arch.guest_table = mk_pagetable((u64)mfn << PAGE_SHIFT);
1331 update_pagetables(v);
1333 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
1334 (unsigned long) (mfn << PAGE_SHIFT));
1336 __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
1337 /*
1338 * arch->shadow_table should hold the next CR3 for shadow
1339 */
1340 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
1341 v->arch.hvm_vmx.cpu_cr3, mfn);
1344 if ( !((value & X86_CR0_PE) && (value & X86_CR0_PG)) && paging_enabled )
1345 if ( v->arch.hvm_vmx.cpu_cr3 ) {
1346 put_page(mfn_to_page(get_mfn_from_gpfn(
1347 v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)));
1348 v->arch.guest_table = mk_pagetable(0);
1351 /*
1352 * VMX does not implement real-mode virtualization. We emulate
1353 * real-mode by performing a world switch to VMXAssist whenever
1354 * a partition disables the CR0.PE bit.
1355 */
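/*
 * The world switch itself is done by vmx_assist(): VMX_ASSIST_INVOKE saves
 * the current context and loads the vmxassist context, and
 * VMX_ASSIST_RESTORE undoes it once the guest sets CR0.PE again (handled
 * below).
 */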
1356 if ( (value & X86_CR0_PE) == 0 )
1358 if ( value & X86_CR0_PG ) {
1359 /* inject GP here */
1360 vmx_inject_exception(v, TRAP_gp_fault, 0);
1361 return 0;
1362 } else {
1363 /*
1364 * Disable paging here.
1365 * Same as PE == 1 && PG == 0
1366 */
1367 if ( test_bit(VMX_CPU_STATE_LMA_ENABLED,
1368 &v->arch.hvm_vmx.cpu_state) )
1370 clear_bit(VMX_CPU_STATE_LMA_ENABLED,
1371 &v->arch.hvm_vmx.cpu_state);
1372 __vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
1373 vm_entry_value &= ~VM_ENTRY_CONTROLS_IA32E_MODE;
1374 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1378 clear_all_shadow_status(v->domain);
1379 if ( vmx_assist(v, VMX_ASSIST_INVOKE) ) {
1380 set_bit(VMX_CPU_STATE_ASSIST_ENABLED, &v->arch.hvm_vmx.cpu_state);
1381 __vmread(GUEST_RIP, &eip);
1382 HVM_DBG_LOG(DBG_LEVEL_1,
1383 "Transfering control to vmxassist %%eip 0x%lx\n", eip);
1384 return 0; /* do not update eip! */
1386 } else if ( test_bit(VMX_CPU_STATE_ASSIST_ENABLED,
1387 &v->arch.hvm_vmx.cpu_state) )
1389 __vmread(GUEST_RIP, &eip);
1390 HVM_DBG_LOG(DBG_LEVEL_1,
1391 "Enabling CR0.PE at %%eip 0x%lx\n", eip);
1392 if ( vmx_assist(v, VMX_ASSIST_RESTORE) )
1394 clear_bit(VMX_CPU_STATE_ASSIST_ENABLED,
1395 &v->arch.hvm_vmx.cpu_state);
1396 __vmread(GUEST_RIP, &eip);
1397 HVM_DBG_LOG(DBG_LEVEL_1,
1398 "Restoring to %%eip 0x%lx\n", eip);
1399 return 0; /* do not update eip! */
1403 return 1;
1406 #define CASE_GET_REG(REG, reg) \
1407 case REG_ ## REG: value = regs->reg; break
1409 #define CASE_EXTEND_SET_REG \
1410 CASE_EXTEND_REG(S)
1411 #define CASE_EXTEND_GET_REG \
1412 CASE_EXTEND_REG(G)
1414 #ifdef __i386__
1415 #define CASE_EXTEND_REG(T)
1416 #else
1417 #define CASE_EXTEND_REG(T) \
1418 CASE_ ## T ## ET_REG(R8, r8); \
1419 CASE_ ## T ## ET_REG(R9, r9); \
1420 CASE_ ## T ## ET_REG(R10, r10); \
1421 CASE_ ## T ## ET_REG(R11, r11); \
1422 CASE_ ## T ## ET_REG(R12, r12); \
1423 CASE_ ## T ## ET_REG(R13, r13); \
1424 CASE_ ## T ## ET_REG(R14, r14); \
1425 CASE_ ## T ## ET_REG(R15, r15);
1426 #endif
1429 /*
1430 * Write to control registers
1431 */
1432 static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
1434 unsigned long value;
1435 unsigned long old_cr;
1436 struct vcpu *v = current;
1438 switch (gp) {
1439 CASE_GET_REG(EAX, eax);
1440 CASE_GET_REG(ECX, ecx);
1441 CASE_GET_REG(EDX, edx);
1442 CASE_GET_REG(EBX, ebx);
1443 CASE_GET_REG(EBP, ebp);
1444 CASE_GET_REG(ESI, esi);
1445 CASE_GET_REG(EDI, edi);
1446 CASE_EXTEND_GET_REG
1447 case REG_ESP:
1448 __vmread(GUEST_RSP, &value);
1449 break;
1450 default:
1451 printk("invalid gp: %d\n", gp);
1452 __hvm_bug(regs);
1455 HVM_DBG_LOG(DBG_LEVEL_1, "mov_to_cr: CR%d, value = %lx,", cr, value);
1456 HVM_DBG_LOG(DBG_LEVEL_1, "current = %lx,", (unsigned long) current);
1458 switch(cr) {
1459 case 0:
1461 return vmx_set_cr0(value);
1463 case 3:
1465 unsigned long old_base_mfn, mfn;
1467 /*
1468 * If paging is not enabled yet, simply copy the value to CR3.
1469 */
1470 if (!vmx_paging_enabled(v)) {
1471 v->arch.hvm_vmx.cpu_cr3 = value;
1472 break;
1475 /*
1476 * We make a new one if the shadow does not exist.
1477 */
1478 if (value == v->arch.hvm_vmx.cpu_cr3) {
1479 /*
1480 * This is a simple TLB flush, implying the guest has
1481 * removed some translation or changed page attributes.
1482 * We simply invalidate the shadow.
1483 */
1484 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
1485 if (mfn != pagetable_get_pfn(v->arch.guest_table))
1486 __hvm_bug(regs);
1487 shadow_sync_all(v->domain);
1488 } else {
1489 /*
1490 * If different, make a shadow. Check if the PDBR is valid
1491 * first.
1492 */
1493 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
1494 if ( ((value >> PAGE_SHIFT) > v->domain->max_pages ) ||
1495 !VALID_MFN(mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT)) ||
1496 !get_page(mfn_to_page(mfn), v->domain) )
1498 printk("Invalid CR3 value=%lx", value);
1499 domain_crash_synchronous(); /* need to take a clean path */
1501 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1502 v->arch.guest_table = mk_pagetable((u64)mfn << PAGE_SHIFT);
1503 if (old_base_mfn)
1504 put_page(mfn_to_page(old_base_mfn));
1505 /*
1506 * arch.shadow_table should now hold the next CR3 for shadow
1507 */
1508 #if CONFIG_PAGING_LEVELS >= 3
1509 if ( v->domain->arch.ops->guest_paging_levels == PAGING_L3 )
1510 shadow_sync_all(v->domain);
1511 #endif
1513 v->arch.hvm_vmx.cpu_cr3 = value;
1514 update_pagetables(v);
1515 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx",
1516 value);
1517 __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
1519 break;
1521 case 4: /* CR4 */
1523 __vmread(CR4_READ_SHADOW, &old_cr);
1525 if ( value & X86_CR4_PAE && !(old_cr & X86_CR4_PAE) )
1527 set_bit(VMX_CPU_STATE_PAE_ENABLED, &v->arch.hvm_vmx.cpu_state);
1529 if ( vmx_pgbit_test(v) )
1531 /* The guest is 32 bit. */
1532 #if CONFIG_PAGING_LEVELS >= 4
1533 unsigned long mfn, old_base_mfn;
1535 if( !shadow_set_guest_paging_levels(v->domain, 3) )
1537 printk("Unsupported guest paging levels\n");
1538 domain_crash_synchronous(); /* need to take a clean path */
1541 if ( !VALID_MFN(mfn = get_mfn_from_gpfn(
1542 v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)) ||
1543 !get_page(mfn_to_page(mfn), v->domain) )
1545 printk("Invalid CR3 value = %lx", v->arch.hvm_vmx.cpu_cr3);
1546 domain_crash_synchronous(); /* need to take a clean path */
1549 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1550 if ( old_base_mfn )
1551 put_page(mfn_to_page(old_base_mfn));
1553 /*
1554 * Now arch.guest_table points to machine physical.
1555 */
1557 v->arch.guest_table = mk_pagetable((u64)mfn << PAGE_SHIFT);
1558 update_pagetables(v);
1560 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
1561 (unsigned long) (mfn << PAGE_SHIFT));
1563 __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
1565 /*
1566 * arch->shadow_table should hold the next CR3 for shadow
1567 */
1569 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
1570 v->arch.hvm_vmx.cpu_cr3, mfn);
1571 #endif
1573 else
1575 /* The guest is 64 bit. */
1576 #if CONFIG_PAGING_LEVELS >= 4
1577 if ( !shadow_set_guest_paging_levels(v->domain, 4) )
1579 printk("Unsupported guest paging levels\n");
1580 domain_crash_synchronous(); /* need to take a clean path */
1582 #endif
1585 else if ( value & X86_CR4_PAE )
1586 set_bit(VMX_CPU_STATE_PAE_ENABLED, &v->arch.hvm_vmx.cpu_state);
1587 else
1589 if ( test_bit(VMX_CPU_STATE_LMA_ENABLED, &v->arch.hvm_vmx.cpu_state) )
1590 vmx_inject_exception(v, TRAP_gp_fault, 0);
1592 clear_bit(VMX_CPU_STATE_PAE_ENABLED, &v->arch.hvm_vmx.cpu_state);
1595 __vmwrite(GUEST_CR4, value| VMX_CR4_HOST_MASK);
1596 __vmwrite(CR4_READ_SHADOW, value);
1598 /*
1599 * Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates
1600 * all TLB entries except global entries.
1601 */
1602 if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
1603 shadow_sync_all(v->domain);
1605 break;
1607 default:
1608 printk("invalid cr: %d\n", gp);
1609 __hvm_bug(regs);
1612 return 1;
1615 #define CASE_SET_REG(REG, reg) \
1616 case REG_ ## REG: \
1617 regs->reg = value; \
1618 break
1620 /*
1621 * Read from control registers. CR0 and CR4 are read from their shadows, so only CR3 reaches here.
1622 */
1623 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
1625 unsigned long value;
1626 struct vcpu *v = current;
1628 if (cr != 3)
1629 __hvm_bug(regs);
1631 value = (unsigned long) v->arch.hvm_vmx.cpu_cr3;
1633 switch (gp) {
1634 CASE_SET_REG(EAX, eax);
1635 CASE_SET_REG(ECX, ecx);
1636 CASE_SET_REG(EDX, edx);
1637 CASE_SET_REG(EBX, ebx);
1638 CASE_SET_REG(EBP, ebp);
1639 CASE_SET_REG(ESI, esi);
1640 CASE_SET_REG(EDI, edi);
1641 CASE_EXTEND_SET_REG
1642 case REG_ESP:
1643 __vmwrite(GUEST_RSP, value);
1644 regs->esp = value;
1645 break;
1646 default:
1647 printk("invalid gp: %d\n", gp);
1648 __hvm_bug(regs);
1651 HVM_DBG_LOG(DBG_LEVEL_VMMU, "mov_from_cr: CR%d, value = %lx,", cr, value);
1654 static int vmx_cr_access(unsigned long exit_qualification, struct cpu_user_regs *regs)
1656 unsigned int gp, cr;
1657 unsigned long value;
1658 struct vcpu *v = current;
1660 switch (exit_qualification & CONTROL_REG_ACCESS_TYPE) {
1661 case TYPE_MOV_TO_CR:
1662 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
1663 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
1664 TRACE_VMEXIT(1,TYPE_MOV_TO_CR);
1665 TRACE_VMEXIT(2,cr);
1666 TRACE_VMEXIT(3,gp);
1667 return mov_to_cr(gp, cr, regs);
1668 case TYPE_MOV_FROM_CR:
1669 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
1670 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
1671 TRACE_VMEXIT(1,TYPE_MOV_FROM_CR);
1672 TRACE_VMEXIT(2,cr);
1673 TRACE_VMEXIT(3,gp);
1674 mov_from_cr(cr, gp, regs);
1675 break;
1676 case TYPE_CLTS:
1677 TRACE_VMEXIT(1,TYPE_CLTS);
1679 /* We initialise the FPU now, to avoid needing another vmexit. */
1680 setup_fpu(v);
1681 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
1683 __vmread_vcpu(v, GUEST_CR0, &value);
1684 value &= ~X86_CR0_TS; /* clear TS */
1685 __vmwrite(GUEST_CR0, value);
1687 __vmread_vcpu(v, CR0_READ_SHADOW, &value);
1688 value &= ~X86_CR0_TS; /* clear TS */
1689 __vmwrite(CR0_READ_SHADOW, value);
1690 break;
1691 case TYPE_LMSW:
1692 TRACE_VMEXIT(1,TYPE_LMSW);
1693 __vmread_vcpu(v, CR0_READ_SHADOW, &value);
1694 value = (value & ~0xF) |
1695 (((exit_qualification & LMSW_SOURCE_DATA) >> 16) & 0xF);
1696 return vmx_set_cr0(value);
1697 break;
1698 default:
1699 __hvm_bug(regs);
1700 break;
1702 return 1;
1705 static inline void vmx_do_msr_read(struct cpu_user_regs *regs)
1707 u64 msr_content = 0;
1708 struct vcpu *v = current;
1710 HVM_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_read: ecx=%lx, eax=%lx, edx=%lx",
1711 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1712 (unsigned long)regs->edx);
1713 switch (regs->ecx) {
1714 case MSR_IA32_TIME_STAMP_COUNTER:
1716 struct hvm_virpit *vpit;
1718 rdtscll(msr_content);
1719 vpit = &(v->domain->arch.hvm_domain.vpit);
1720 msr_content += vpit->cache_tsc_offset;
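/* cache_tsc_offset is presumably the same guest/host TSC delta that
 * set_guest_time() installs, so RDMSR reads stay consistent with the
 * offset applied on VM entry. */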
1721 break;
1723 case MSR_IA32_SYSENTER_CS:
1724 __vmread(GUEST_SYSENTER_CS, (u32 *)&msr_content);
1725 break;
1726 case MSR_IA32_SYSENTER_ESP:
1727 __vmread(GUEST_SYSENTER_ESP, &msr_content);
1728 break;
1729 case MSR_IA32_SYSENTER_EIP:
1730 __vmread(GUEST_SYSENTER_EIP, &msr_content);
1731 break;
1732 case MSR_IA32_APICBASE:
1733 msr_content = VLAPIC(v) ? VLAPIC(v)->apic_base_msr : 0;
1734 break;
1735 default:
1736 if(long_mode_do_msr_read(regs))
1737 return;
1738 rdmsr_safe(regs->ecx, regs->eax, regs->edx);
1739 break;
1742 regs->eax = msr_content & 0xFFFFFFFF;
1743 regs->edx = msr_content >> 32;
1745 HVM_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_read returns: "
1746 "ecx=%lx, eax=%lx, edx=%lx",
1747 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1748 (unsigned long)regs->edx);
1751 static inline void vmx_do_msr_write(struct cpu_user_regs *regs)
1753 u64 msr_content;
1754 struct vcpu *v = current;
1756 HVM_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_write: ecx=%lx, eax=%lx, edx=%lx",
1757 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1758 (unsigned long)regs->edx);
1760 msr_content = (regs->eax & 0xFFFFFFFF) | ((u64)regs->edx << 32);
1762 switch (regs->ecx) {
1763 case MSR_IA32_TIME_STAMP_COUNTER:
1764 set_guest_time(v, msr_content);
1765 break;
1766 case MSR_IA32_SYSENTER_CS:
1767 __vmwrite(GUEST_SYSENTER_CS, msr_content);
1768 break;
1769 case MSR_IA32_SYSENTER_ESP:
1770 __vmwrite(GUEST_SYSENTER_ESP, msr_content);
1771 break;
1772 case MSR_IA32_SYSENTER_EIP:
1773 __vmwrite(GUEST_SYSENTER_EIP, msr_content);
1774 break;
1775 case MSR_IA32_APICBASE:
1776 vlapic_msr_set(VLAPIC(v), msr_content);
1777 break;
1778 default:
1779 long_mode_do_msr_write(regs);
1780 break;
1783 HVM_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_write returns: "
1784 "ecx=%lx, eax=%lx, edx=%lx",
1785 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1786 (unsigned long)regs->edx);
1789 /*
1790 * Need to use this exit to reschedule
1791 */
1792 void vmx_vmexit_do_hlt(void)
1794 struct vcpu *v=current;
1795 struct hvm_virpit *vpit = &(v->domain->arch.hvm_domain.vpit);
1796 s_time_t next_pit=-1,next_wakeup;
1798 if ( !v->vcpu_id )
1799 next_pit = get_pit_scheduled(v,vpit);
1800 next_wakeup = get_apictime_scheduled(v);
1801 if ( (next_pit != -1 && next_pit < next_wakeup) || next_wakeup == -1 )
1802 next_wakeup = next_pit;
1803 if ( next_wakeup != -1 )
1804 set_timer(&current->arch.hvm_vmx.hlt_timer, next_wakeup);
1805 hvm_safe_block();
1808 static inline void vmx_vmexit_do_extint(struct cpu_user_regs *regs)
1810 unsigned int vector;
1811 int error;
1813 asmlinkage void do_IRQ(struct cpu_user_regs *);
1814 fastcall void smp_apic_timer_interrupt(struct cpu_user_regs *);
1815 fastcall void smp_event_check_interrupt(void);
1816 fastcall void smp_invalidate_interrupt(void);
1817 fastcall void smp_call_function_interrupt(void);
1818 fastcall void smp_spurious_interrupt(struct cpu_user_regs *regs);
1819 fastcall void smp_error_interrupt(struct cpu_user_regs *regs);
1820 #ifdef CONFIG_X86_MCE_P4THERMAL
1821 fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs);
1822 #endif
1824 if ((error = __vmread(VM_EXIT_INTR_INFO, &vector))
1825 && !(vector & INTR_INFO_VALID_MASK))
1826 __hvm_bug(regs);
1828 vector &= 0xff;
1829 local_irq_disable();
1831 switch(vector) {
1832 case LOCAL_TIMER_VECTOR:
1833 smp_apic_timer_interrupt(regs);
1834 break;
1835 case EVENT_CHECK_VECTOR:
1836 smp_event_check_interrupt();
1837 break;
1838 case INVALIDATE_TLB_VECTOR:
1839 smp_invalidate_interrupt();
1840 break;
1841 case CALL_FUNCTION_VECTOR:
1842 smp_call_function_interrupt();
1843 break;
1844 case SPURIOUS_APIC_VECTOR:
1845 smp_spurious_interrupt(regs);
1846 break;
1847 case ERROR_APIC_VECTOR:
1848 smp_error_interrupt(regs);
1849 break;
1850 #ifdef CONFIG_X86_MCE_P4THERMAL
1851 case THERMAL_APIC_VECTOR:
1852 smp_thermal_interrupt(regs);
1853 break;
1854 #endif
1855 default:
1856 regs->entry_vector = vector;
1857 do_IRQ(regs);
1858 break;
1862 #if defined (__x86_64__)
1863 void store_cpu_user_regs(struct cpu_user_regs *regs)
1865 __vmread(GUEST_SS_SELECTOR, &regs->ss);
1866 __vmread(GUEST_RSP, &regs->rsp);
1867 __vmread(GUEST_RFLAGS, &regs->rflags);
1868 __vmread(GUEST_CS_SELECTOR, &regs->cs);
1869 __vmread(GUEST_DS_SELECTOR, &regs->ds);
1870 __vmread(GUEST_ES_SELECTOR, &regs->es);
1871 __vmread(GUEST_RIP, &regs->rip);
1873 #elif defined (__i386__)
1874 void store_cpu_user_regs(struct cpu_user_regs *regs)
1876 __vmread(GUEST_SS_SELECTOR, &regs->ss);
1877 __vmread(GUEST_RSP, &regs->esp);
1878 __vmread(GUEST_RFLAGS, &regs->eflags);
1879 __vmread(GUEST_CS_SELECTOR, &regs->cs);
1880 __vmread(GUEST_DS_SELECTOR, &regs->ds);
1881 __vmread(GUEST_ES_SELECTOR, &regs->es);
1882 __vmread(GUEST_RIP, &regs->eip);
1884 #endif
1886 #ifdef XEN_DEBUGGER
1887 void save_cpu_user_regs(struct cpu_user_regs *regs)
1889 __vmread(GUEST_SS_SELECTOR, &regs->xss);
1890 __vmread(GUEST_RSP, &regs->esp);
1891 __vmread(GUEST_RFLAGS, &regs->eflags);
1892 __vmread(GUEST_CS_SELECTOR, &regs->xcs);
1893 __vmread(GUEST_RIP, &regs->eip);
1895 __vmread(GUEST_GS_SELECTOR, &regs->xgs);
1896 __vmread(GUEST_FS_SELECTOR, &regs->xfs);
1897 __vmread(GUEST_ES_SELECTOR, &regs->xes);
1898 __vmread(GUEST_DS_SELECTOR, &regs->xds);
1901 void restore_cpu_user_regs(struct cpu_user_regs *regs)
1903 __vmwrite(GUEST_SS_SELECTOR, regs->xss);
1904 __vmwrite(GUEST_RSP, regs->esp);
1905 __vmwrite(GUEST_RFLAGS, regs->eflags);
1906 __vmwrite(GUEST_CS_SELECTOR, regs->xcs);
1907 __vmwrite(GUEST_RIP, regs->eip);
1909 __vmwrite(GUEST_GS_SELECTOR, regs->xgs);
1910 __vmwrite(GUEST_FS_SELECTOR, regs->xfs);
1911 __vmwrite(GUEST_ES_SELECTOR, regs->xes);
1912 __vmwrite(GUEST_DS_SELECTOR, regs->xds);
1914 #endif
1916 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs regs)
1918 unsigned int exit_reason, idtv_info_field;
1919 unsigned long exit_qualification, eip, inst_len = 0;
1920 struct vcpu *v = current;
1921 int error;
1923 if ((error = __vmread(VM_EXIT_REASON, &exit_reason)))
1924 __hvm_bug(&regs);
1926 perfc_incra(vmexits, exit_reason);
1928 __vmread(IDT_VECTORING_INFO_FIELD, &idtv_info_field);
1929 if (idtv_info_field & INTR_INFO_VALID_MASK) {
1930 __vmwrite(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field);
1932 __vmread(VM_EXIT_INSTRUCTION_LEN, &inst_len);
1933 if (inst_len >= 1 && inst_len <= 15)
1934 __vmwrite(VM_ENTRY_INSTRUCTION_LEN, inst_len);
1936 if (idtv_info_field & 0x800) { /* valid error code */
1937 unsigned long error_code;
1938 __vmread(IDT_VECTORING_ERROR_CODE, &error_code);
1939 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
1942 HVM_DBG_LOG(DBG_LEVEL_1, "idtv_info_field=%x", idtv_info_field);
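/*
 * The block above re-injects an event that was being delivered through
 * the IDT when this VM exit occurred, so the guest does not lose it: the
 * vectoring information (and error code, if any) is copied straight into
 * the VM-entry interruption fields for the next resume.
 */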
1945 /* don't bother logging H/W interrupts */
1946 if (exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT &&
1947 exit_reason != EXIT_REASON_VMCALL &&
1948 exit_reason != EXIT_REASON_IO_INSTRUCTION)
1949 HVM_DBG_LOG(DBG_LEVEL_0, "exit reason = %x", exit_reason);
1951 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
1952 printk("Failed vm entry\n");
1953 domain_crash_synchronous();
1954 return;
1958 __vmread(GUEST_RIP, &eip);
1959 TRACE_3D(TRC_VMX_VMEXIT, v->domain->domain_id, eip, exit_reason);
1960 TRACE_VMEXIT(0,exit_reason);
1963 switch (exit_reason) {
1964 case EXIT_REASON_EXCEPTION_NMI:
1966 /*
1967 * We don't enable software-interrupt exiting (INT n), so here we can
1968 * get either (1) an exception (e.g. #PG) raised in the guest, or
1969 * (2) an NMI.
1970 */
1971 int error;
1972 unsigned int vector;
1973 unsigned long va;
1975 if ((error = __vmread(VM_EXIT_INTR_INFO, &vector))
1976 || !(vector & INTR_INFO_VALID_MASK))
1977 __hvm_bug(&regs);
1978 vector &= 0xff;
1980 TRACE_VMEXIT(1,vector);
1981 perfc_incra(cause_vector, vector);
1983 TRACE_3D(TRC_VMX_VECTOR, v->domain->domain_id, eip, vector);
1984 switch (vector) {
1985 #ifdef XEN_DEBUGGER
1986 case TRAP_debug:
1988 save_cpu_user_regs(&regs);
1989 pdb_handle_exception(1, &regs, 1);
1990 restore_cpu_user_regs(&regs);
1991 break;
1993 case TRAP_int3:
1995 save_cpu_user_regs(&regs);
1996 pdb_handle_exception(3, &regs, 1);
1997 restore_cpu_user_regs(&regs);
1998 break;
2000 #else
2001 case TRAP_debug:
2003 void store_cpu_user_regs(struct cpu_user_regs *regs);
2005 store_cpu_user_regs(&regs);
2006 __vm_clear_bit(GUEST_PENDING_DBG_EXCEPTIONS, PENDING_DEBUG_EXC_BS);
2008 domain_pause_for_debugger();
2010 break;
2012 #endif
2013 case TRAP_no_device:
2015 vmx_do_no_device_fault();
2016 break;
2018 case TRAP_page_fault:
2020 __vmread(EXIT_QUALIFICATION, &va);
2021 __vmread(VM_EXIT_INTR_ERROR_CODE, &regs.error_code);
2023 TRACE_VMEXIT(3,regs.error_code);
2024 TRACE_VMEXIT(4,va);
2026 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2027 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
2028 (unsigned long)regs.eax, (unsigned long)regs.ebx,
2029 (unsigned long)regs.ecx, (unsigned long)regs.edx,
2030 (unsigned long)regs.esi, (unsigned long)regs.edi);
2031 v->arch.hvm_vcpu.mmio_op.inst_decoder_regs = &regs;
2033 if (!(error = vmx_do_page_fault(va, &regs))) {
2034 /*
2035 * Inject #PG using Interruption-Information Fields
2036 */
2037 vmx_inject_exception(v, TRAP_page_fault, regs.error_code);
2038 v->arch.hvm_vmx.cpu_cr2 = va;
2039 TRACE_3D(TRC_VMX_INT, v->domain->domain_id, TRAP_page_fault, va);
2041 break;
2043 case TRAP_nmi:
2044 do_nmi(&regs);
2045 break;
2046 default:
2047 vmx_reflect_exception(v);
2048 break;
2050 break;
2052 case EXIT_REASON_EXTERNAL_INTERRUPT:
2053 vmx_vmexit_do_extint(&regs);
2054 break;
2055 case EXIT_REASON_PENDING_INTERRUPT:
2056 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2057 MONITOR_CPU_BASED_EXEC_CONTROLS);
2058 v->arch.hvm_vcpu.u.vmx.exec_control = MONITOR_CPU_BASED_EXEC_CONTROLS;
2059 break;
2060 case EXIT_REASON_TASK_SWITCH:
2061 __hvm_bug(&regs);
2062 break;
2063 case EXIT_REASON_CPUID:
2064 vmx_vmexit_do_cpuid(&regs);
2065 __get_instruction_length(inst_len);
2066 __update_guest_eip(inst_len);
2067 break;
2068 case EXIT_REASON_HLT:
2069 __get_instruction_length(inst_len);
2070 __update_guest_eip(inst_len);
2071 vmx_vmexit_do_hlt();
2072 break;
2073 case EXIT_REASON_INVLPG:
2075 unsigned long va;
2077 __vmread(EXIT_QUALIFICATION, &va);
2078 vmx_vmexit_do_invlpg(va);
2079 __get_instruction_length(inst_len);
2080 __update_guest_eip(inst_len);
2081 break;
2083 #if 0 /* keep this for debugging */
2084 case EXIT_REASON_VMCALL:
2085 __get_instruction_length(inst_len);
2086 __vmread(GUEST_RIP, &eip);
2087 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2089 hvm_print_line(v, regs.eax); /* provides the current domain */
2090 __update_guest_eip(inst_len);
2091 break;
2092 #endif
2093 case EXIT_REASON_CR_ACCESS:
2095 __vmread(GUEST_RIP, &eip);
2096 __get_instruction_length(inst_len);
2097 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2099 HVM_DBG_LOG(DBG_LEVEL_1, "eip = %lx, inst_len =%lx, exit_qualification = %lx",
2100 eip, inst_len, exit_qualification);
2101 if (vmx_cr_access(exit_qualification, &regs))
2102 __update_guest_eip(inst_len);
2103 TRACE_VMEXIT(3,regs.error_code);
2104 TRACE_VMEXIT(4,exit_qualification);
2105 break;
2107 case EXIT_REASON_DR_ACCESS:
2108 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2109 vmx_dr_access(exit_qualification, &regs);
2110 __get_instruction_length(inst_len);
2111 __update_guest_eip(inst_len);
2112 break;
2113 case EXIT_REASON_IO_INSTRUCTION:
2114 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2115 __get_instruction_length(inst_len);
2116 vmx_io_instruction(&regs, exit_qualification, inst_len);
2117 TRACE_VMEXIT(4,exit_qualification);
2118 break;
2119 case EXIT_REASON_MSR_READ:
2120 __get_instruction_length(inst_len);
2121 vmx_do_msr_read(&regs);
2122 __update_guest_eip(inst_len);
2123 break;
2124 case EXIT_REASON_MSR_WRITE:
2125 __vmread(GUEST_RIP, &eip);
2126 vmx_do_msr_write(&regs);
2127 __get_instruction_length(inst_len);
2128 __update_guest_eip(inst_len);
2129 break;
2130 case EXIT_REASON_MWAIT_INSTRUCTION:
2131 __hvm_bug(&regs);
2132 break;
2133 case EXIT_REASON_VMCALL:
2134 case EXIT_REASON_VMCLEAR:
2135 case EXIT_REASON_VMLAUNCH:
2136 case EXIT_REASON_VMPTRLD:
2137 case EXIT_REASON_VMPTRST:
2138 case EXIT_REASON_VMREAD:
2139 case EXIT_REASON_VMRESUME:
2140 case EXIT_REASON_VMWRITE:
2141 case EXIT_REASON_VMOFF:
2142 case EXIT_REASON_VMON:
2143 /* Report invalid opcode exception when a VMX guest tries to execute
2144 any of the VMX instructions */
2145 vmx_inject_exception(v, TRAP_invalid_op, VMX_DELIVER_NO_ERROR_CODE);
2146 break;
2148 default:
2149 __hvm_bug(&regs); /* should not happen */
2153 asmlinkage void vmx_load_cr2(void)
2155 struct vcpu *v = current;
2157 local_irq_disable();
2158 #ifdef __i386__
2159 asm volatile("movl %0,%%cr2": :"r" (v->arch.hvm_vmx.cpu_cr2));
2160 #else
2161 asm volatile("movq %0,%%cr2": :"r" (v->arch.hvm_vmx.cpu_cr2));
2162 #endif
2165 asmlinkage void vmx_trace_vmentry (void)
2167 TRACE_5D(TRC_VMENTRY,
2168 trace_values[smp_processor_id()][0],
2169 trace_values[smp_processor_id()][1],
2170 trace_values[smp_processor_id()][2],
2171 trace_values[smp_processor_id()][3],
2172 trace_values[smp_processor_id()][4]);
2173 TRACE_VMEXIT(0,9);
2174 TRACE_VMEXIT(1,9);
2175 TRACE_VMEXIT(2,9);
2176 TRACE_VMEXIT(3,9);
2177 TRACE_VMEXIT(4,9);
2178 return;
2181 asmlinkage void vmx_trace_vmexit (void)
2183 TRACE_3D(TRC_VMEXIT,0,0,0);
2184 return;
2187 /*
2188 * Local variables:
2189 * mode: C
2190 * c-set-style: "BSD"
2191 * c-basic-offset: 4
2192 * tab-width: 4
2193 * indent-tabs-mode: nil
2194 * End:
2195 */