direct-io.hg

view xen/arch/x86/hvm/vmx/vmx.c @ 9698:9c313ff7a0ed

There are instances where we DO NOT want an hvm guest to run an
MP-enabled kernel. In such situations we need a workaround to
guarantee that hvm guests will not detect MP.

For example, in the absence of ACPI and MPS, the installation code in some
Linux distributions keys off the presence of the cpuid edx/HTT bit (indicating
the presence of Hyper-Threading Technology) to determine whether another
logical processor is present and, if so, loads an MP-enabled kernel instead
of a uniprocessor kernel. SMBIOS is also looked at for the same purpose
and presents a potential problem as well. While both approaches to
selecting an MP kernel are debatable (MPS and ACPI have long been the
standard for MP detection), they are something we have to live with and
work around, because making a change in the fully virtualized guest is
not an option.

To solve the problem we need to hide all secondary processors from the hvm
guest. Since the hvm does not surface MPS tables, we only need to deal
with ACPI, cpuid HTT, and possibly SMBIOS; the cpuid part of the fix is
sketched below. (I have not yet had time to look closely at the hvm BIOS
to know whether SMBIOS is also going to be a problem.)
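
What follows is only an illustrative sketch of that cpuid masking, not the
patch itself; the real hunk is in vmx_vmexit_do_cpuid() further down, and
apic_enabled_for_guest is a hypothetical stand-in for the actual
hvm_apic_support()/vlapic_global_enabled() test used there.

    unsigned int eax, ebx, ecx, edx;

    cpuid(1, &eax, &ebx, &ecx, &edx);
    if ( !apic_enabled_for_guest )           /* hypothetical condition */
    {
        clear_bit(X86_FEATURE_APIC, &edx);   /* hide the local APIC         */
        clear_bit(X86_FEATURE_HT, &edx);     /* hide Hyper-Threading        */
        ebx &= 0xFF00FFFF;                   /* logical processor count ... */
        ebx |= 0x00010000;                   /* ... forced to 1             */
    }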

Also fixes a logic problem in the code path where apic=0 was not
being handled correctly (VMX path only).

Signed-off-by: Clyde Griffin <cgriffin@novell.com>
author kaf24@firebug.cl.cam.ac.uk
date Fri Apr 21 09:56:50 2006 +0100 (2006-04-21)
parents 5765497cf75e
children 29e9a0313c09
line source
1 /*
2 * vmx.c: handling VMX architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 *
18 */
20 #include <xen/config.h>
21 #include <xen/init.h>
22 #include <xen/lib.h>
23 #include <xen/trace.h>
24 #include <xen/sched.h>
25 #include <xen/irq.h>
26 #include <xen/softirq.h>
27 #include <xen/domain_page.h>
28 #include <asm/current.h>
29 #include <asm/io.h>
30 #include <asm/shadow.h>
31 #include <asm/regs.h>
32 #include <asm/cpufeature.h>
33 #include <asm/processor.h>
34 #include <asm/types.h>
35 #include <asm/msr.h>
36 #include <asm/spinlock.h>
37 #include <asm/hvm/hvm.h>
38 #include <asm/hvm/support.h>
39 #include <asm/hvm/vmx/vmx.h>
40 #include <asm/hvm/vmx/vmcs.h>
41 #include <asm/shadow.h>
42 #if CONFIG_PAGING_LEVELS >= 3
43 #include <asm/shadow_64.h>
44 #endif
45 #include <public/sched.h>
46 #include <public/hvm/ioreq.h>
47 #include <asm/hvm/vpic.h>
48 #include <asm/hvm/vlapic.h>
50 static unsigned long trace_values[NR_CPUS][4];
51 #define TRACE_VMEXIT(index,value) trace_values[smp_processor_id()][index]=value
53 static void vmx_ctxt_switch_from(struct vcpu *v);
54 static void vmx_ctxt_switch_to(struct vcpu *v);
56 void vmx_final_setup_guest(struct vcpu *v)
57 {
58 v->arch.schedule_tail = arch_vmx_do_launch;
59 v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
60 v->arch.ctxt_switch_to = vmx_ctxt_switch_to;
62 if ( v->vcpu_id == 0 )
63 {
64 struct domain *d = v->domain;
65 struct vcpu *vc;
67 /* Initialize monitor page table */
68 for_each_vcpu(d, vc)
69 vc->arch.monitor_table = mk_pagetable(0);
71 /*
72 * Required to do this once per domain
73 * XXX todo: add a separate function to do these.
74 */
75 memset(&d->shared_info->evtchn_mask[0], 0xff,
76 sizeof(d->shared_info->evtchn_mask));
78 /* Put the domain in shadow mode even though we're going to be using
79 * the shared 1:1 page table initially. It shouldn't hurt */
80 shadow_mode_enable(d,
81 SHM_enable|SHM_refcounts|
82 SHM_translate|SHM_external|SHM_wr_pt_pte);
83 }
84 }
86 static void vmx_relinquish_guest_resources(struct domain *d)
87 {
88 struct vcpu *v;
90 for_each_vcpu ( d, v )
91 {
92 if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
93 continue;
94 vmx_request_clear_vmcs(v);
95 destroy_vmcs(&v->arch.hvm_vmx);
96 free_monitor_pagetable(v);
97 kill_timer(&v->arch.hvm_vmx.hlt_timer);
98 if ( hvm_apic_support(v->domain) && (VLAPIC(v) != NULL) )
99 {
100 kill_timer(&VLAPIC(v)->vlapic_timer);
101 xfree(VLAPIC(v));
102 }
103 }
105 kill_timer(&d->arch.hvm_domain.vpit.time_info.pit_timer);
107 if ( d->arch.hvm_domain.shared_page_va )
108 unmap_domain_page_global(
109 (void *)d->arch.hvm_domain.shared_page_va);
111 shadow_direct_map_clean(d);
112 }
114 #ifdef __x86_64__
116 static struct vmx_msr_state percpu_msr[NR_CPUS];
118 static u32 msr_data_index[VMX_MSR_COUNT] =
119 {
120 MSR_LSTAR, MSR_STAR, MSR_CSTAR,
121 MSR_SYSCALL_MASK, MSR_EFER,
122 };
124 static void vmx_save_segments(struct vcpu *v)
125 {
126 rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.msr_content.shadow_gs);
127 }
129 /*
130 * To avoid MSR save/restore at every VM exit/entry time, we restore
131 * the x86_64 specific MSRs at domain switch time. Since those MSRs are
132 * not modified once set for generic domains, we don't save them,
133 * but simply reset them to the values set at percpu_traps_init().
134 */
135 static void vmx_load_msrs(void)
136 {
137 struct vmx_msr_state *host_state = &percpu_msr[smp_processor_id()];
138 int i;
140 while ( host_state->flags )
141 {
142 i = find_first_set_bit(host_state->flags);
143 wrmsrl(msr_data_index[i], host_state->msr_items[i]);
144 clear_bit(i, &host_state->flags);
145 }
146 }
148 static void vmx_save_init_msrs(void)
149 {
150 struct vmx_msr_state *host_state = &percpu_msr[smp_processor_id()];
151 int i;
153 for ( i = 0; i < VMX_MSR_COUNT; i++ )
154 rdmsrl(msr_data_index[i], host_state->msr_items[i]);
155 }
157 #define CASE_READ_MSR(address) \
158 case MSR_ ## address: \
159 msr_content = msr->msr_items[VMX_INDEX_MSR_ ## address]; \
160 break
162 #define CASE_WRITE_MSR(address) \
163 case MSR_ ## address: \
164 { \
165 msr->msr_items[VMX_INDEX_MSR_ ## address] = msr_content; \
166 if (!test_bit(VMX_INDEX_MSR_ ## address, &msr->flags)) { \
167 set_bit(VMX_INDEX_MSR_ ## address, &msr->flags); \
168 } \
169 wrmsrl(MSR_ ## address, msr_content); \
170 set_bit(VMX_INDEX_MSR_ ## address, &host_state->flags); \
171 } \
172 break
174 #define IS_CANO_ADDRESS(add) 1 /* XXX: canonical-address check is stubbed out */
175 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
176 {
177 u64 msr_content = 0;
178 struct vcpu *v = current;
179 struct vmx_msr_state *msr = &v->arch.hvm_vmx.msr_content;
181 switch ( regs->ecx ) {
182 case MSR_EFER:
183 HVM_DBG_LOG(DBG_LEVEL_2, "EFER msr_content 0x%"PRIx64, msr_content);
184 msr_content = msr->msr_items[VMX_INDEX_MSR_EFER];
186 /* the following code may not be needed */
187 if ( test_bit(VMX_CPU_STATE_LME_ENABLED, &v->arch.hvm_vmx.cpu_state) )
188 msr_content |= EFER_LME;
189 else
190 msr_content &= ~EFER_LME;
192 if ( VMX_LONG_GUEST(v) )
193 msr_content |= EFER_LMA;
194 else
195 msr_content &= ~EFER_LMA;
196 break;
198 case MSR_FS_BASE:
199 if ( !(VMX_LONG_GUEST(v)) )
200 /* XXX should it be GP fault */
201 domain_crash_synchronous();
203 __vmread(GUEST_FS_BASE, &msr_content);
204 break;
206 case MSR_GS_BASE:
207 if ( !(VMX_LONG_GUEST(v)) )
208 domain_crash_synchronous();
210 __vmread(GUEST_GS_BASE, &msr_content);
211 break;
213 case MSR_SHADOW_GS_BASE:
214 msr_content = msr->shadow_gs;
215 break;
217 CASE_READ_MSR(STAR);
218 CASE_READ_MSR(LSTAR);
219 CASE_READ_MSR(CSTAR);
220 CASE_READ_MSR(SYSCALL_MASK);
222 default:
223 return 0;
224 }
226 HVM_DBG_LOG(DBG_LEVEL_2, "msr_content: 0x%"PRIx64, msr_content);
228 regs->eax = msr_content & 0xffffffff;
229 regs->edx = msr_content >> 32;
231 return 1;
232 }
234 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
235 {
236 u64 msr_content = regs->eax | ((u64)regs->edx << 32);
237 struct vcpu *v = current;
238 struct vmx_msr_state *msr = &v->arch.hvm_vmx.msr_content;
239 struct vmx_msr_state *host_state = &percpu_msr[smp_processor_id()];
241 HVM_DBG_LOG(DBG_LEVEL_1, "msr 0x%lx msr_content 0x%"PRIx64"\n",
242 (unsigned long)regs->ecx, msr_content);
244 switch ( regs->ecx ) {
245 case MSR_EFER:
246 /* offending reserved bit will cause #GP */
247 if ( msr_content & ~(EFER_LME | EFER_LMA | EFER_NX | EFER_SCE) )
248 {
249 printk("trying to set reserved bit in EFER\n");
250 vmx_inject_exception(v, TRAP_gp_fault, 0);
251 return 0;
252 }
254 /* LME: 0 -> 1 */
255 if ( msr_content & EFER_LME &&
256 !test_bit(VMX_CPU_STATE_LME_ENABLED, &v->arch.hvm_vmx.cpu_state) )
257 {
258 if ( vmx_paging_enabled(v) ||
259 !test_bit(VMX_CPU_STATE_PAE_ENABLED,
260 &v->arch.hvm_vmx.cpu_state) )
261 {
262 printk("trying to set LME bit when "
263 "in paging mode or PAE bit is not set\n");
264 vmx_inject_exception(v, TRAP_gp_fault, 0);
265 return 0;
266 }
268 set_bit(VMX_CPU_STATE_LME_ENABLED, &v->arch.hvm_vmx.cpu_state);
269 }
271 msr->msr_items[VMX_INDEX_MSR_EFER] = msr_content;
272 break;
274 case MSR_FS_BASE:
275 case MSR_GS_BASE:
276 if ( !(VMX_LONG_GUEST(v)) )
277 domain_crash_synchronous();
279 if ( !IS_CANO_ADDRESS(msr_content) )
280 {
281 HVM_DBG_LOG(DBG_LEVEL_1, "Not a canonical address in MSR write\n");
282 vmx_inject_exception(v, TRAP_gp_fault, 0);
283 return 0;
284 }
286 if ( regs->ecx == MSR_FS_BASE )
287 __vmwrite(GUEST_FS_BASE, msr_content);
288 else
289 __vmwrite(GUEST_GS_BASE, msr_content);
291 break;
293 case MSR_SHADOW_GS_BASE:
294 if ( !(VMX_LONG_GUEST(v)) )
295 domain_crash_synchronous();
297 v->arch.hvm_vmx.msr_content.shadow_gs = msr_content;
298 wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
299 break;
301 CASE_WRITE_MSR(STAR);
302 CASE_WRITE_MSR(LSTAR);
303 CASE_WRITE_MSR(CSTAR);
304 CASE_WRITE_MSR(SYSCALL_MASK);
306 default:
307 return 0;
308 }
310 return 1;
311 }
313 static void vmx_restore_msrs(struct vcpu *v)
314 {
315 int i = 0;
316 struct vmx_msr_state *guest_state;
317 struct vmx_msr_state *host_state;
318 unsigned long guest_flags;
320 guest_state = &v->arch.hvm_vmx.msr_content;
321 host_state = &percpu_msr[smp_processor_id()];
323 wrmsrl(MSR_SHADOW_GS_BASE, guest_state->shadow_gs);
324 guest_flags = guest_state->flags;
325 if (!guest_flags)
326 return;
328 while (guest_flags){
329 i = find_first_set_bit(guest_flags);
331 HVM_DBG_LOG(DBG_LEVEL_2,
332 "restore guest's index %d msr %lx with %lx\n",
333 i, (unsigned long)msr_data_index[i],
334 (unsigned long)guest_state->msr_items[i]);
335 set_bit(i, &host_state->flags);
336 wrmsrl(msr_data_index[i], guest_state->msr_items[i]);
337 clear_bit(i, &guest_flags);
338 }
339 }
340 #else /* __i386__ */
342 #define vmx_save_segments(v) ((void)0)
343 #define vmx_load_msrs() ((void)0)
344 #define vmx_restore_msrs(v) ((void)0)
345 #define vmx_save_init_msrs() ((void)0)
347 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
348 {
349 return 0;
350 }
352 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
353 {
354 return 0;
355 }
357 #endif /* __i386__ */
359 static void vmx_freeze_time(struct vcpu *v)
360 {
361 struct hvm_time_info *time_info = &(v->domain->arch.hvm_domain.vpit.time_info);
363 if ( time_info->first_injected && !v->domain->arch.hvm_domain.guest_time ) {
364 v->domain->arch.hvm_domain.guest_time = get_guest_time(v);
365 time_info->count_advance += (NOW() - time_info->count_point);
366 stop_timer(&(time_info->pit_timer));
367 }
368 }
370 static void vmx_ctxt_switch_from(struct vcpu *v)
371 {
372 vmx_freeze_time(v);
373 vmx_save_segments(v);
374 vmx_load_msrs();
375 }
377 static void vmx_ctxt_switch_to(struct vcpu *v)
378 {
379 vmx_restore_msrs(v);
380 }
382 void stop_vmx(void)
383 {
384 if (read_cr4() & X86_CR4_VMXE)
385 __vmxoff();
386 }
388 int vmx_initialize_guest_resources(struct vcpu *v)
389 {
390 vmx_final_setup_guest(v);
391 return 1;
392 }
394 void vmx_migrate_timers(struct vcpu *v)
395 {
396 struct hvm_time_info *time_info = &v->domain->arch.hvm_domain.vpit.time_info;
398 migrate_timer(&time_info->pit_timer, v->processor);
399 migrate_timer(&v->arch.hvm_vmx.hlt_timer, v->processor);
400 if ( hvm_apic_support(v->domain) && VLAPIC(v))
401 migrate_timer(&(VLAPIC(v)->vlapic_timer), v->processor);
402 }
404 struct vmx_cpu_guest_regs_callback_info {
405 struct vcpu *v;
406 struct cpu_user_regs *regs;
407 unsigned long *crs;
408 };
410 static void vmx_store_cpu_guest_regs(
411 struct vcpu *v, struct cpu_user_regs *regs, unsigned long *crs);
413 static void vmx_load_cpu_guest_regs(
414 struct vcpu *v, struct cpu_user_regs *regs);
416 static void vmx_store_cpu_guest_regs_callback(void *data)
417 {
418 struct vmx_cpu_guest_regs_callback_info *info = data;
419 vmx_store_cpu_guest_regs(info->v, info->regs, info->crs);
420 }
422 static void vmx_load_cpu_guest_regs_callback(void *data)
423 {
424 struct vmx_cpu_guest_regs_callback_info *info = data;
425 vmx_load_cpu_guest_regs(info->v, info->regs);
426 }
428 static void vmx_store_cpu_guest_regs(
429 struct vcpu *v, struct cpu_user_regs *regs, unsigned long *crs)
430 {
431 if ( v != current )
432 {
433 /* Non-current VCPUs must be paused to get a register snapshot. */
434 ASSERT(atomic_read(&v->pausecnt) != 0);
436 if ( v->arch.hvm_vmx.launch_cpu != smp_processor_id() )
437 {
438 /* Get register details from remote CPU. */
439 struct vmx_cpu_guest_regs_callback_info info = {
440 .v = v, .regs = regs, .crs = crs };
441 cpumask_t cpumask = cpumask_of_cpu(v->arch.hvm_vmx.launch_cpu);
442 on_selected_cpus(cpumask, vmx_store_cpu_guest_regs_callback,
443 &info, 1, 1);
444 return;
445 }
447 /* Register details are on this CPU. Load the correct VMCS. */
448 __vmptrld(virt_to_maddr(v->arch.hvm_vmx.vmcs));
449 }
451 ASSERT(v->arch.hvm_vmx.launch_cpu == smp_processor_id());
453 if ( regs != NULL )
454 {
455 #if defined (__x86_64__)
456 __vmread(GUEST_RFLAGS, &regs->rflags);
457 __vmread(GUEST_SS_SELECTOR, &regs->ss);
458 __vmread(GUEST_CS_SELECTOR, &regs->cs);
459 __vmread(GUEST_DS_SELECTOR, &regs->ds);
460 __vmread(GUEST_ES_SELECTOR, &regs->es);
461 __vmread(GUEST_GS_SELECTOR, &regs->gs);
462 __vmread(GUEST_FS_SELECTOR, &regs->fs);
463 __vmread(GUEST_RIP, &regs->rip);
464 __vmread(GUEST_RSP, &regs->rsp);
465 #elif defined (__i386__)
466 __vmread(GUEST_RFLAGS, &regs->eflags);
467 __vmread(GUEST_SS_SELECTOR, &regs->ss);
468 __vmread(GUEST_CS_SELECTOR, &regs->cs);
469 __vmread(GUEST_DS_SELECTOR, &regs->ds);
470 __vmread(GUEST_ES_SELECTOR, &regs->es);
471 __vmread(GUEST_GS_SELECTOR, &regs->gs);
472 __vmread(GUEST_FS_SELECTOR, &regs->fs);
473 __vmread(GUEST_RIP, &regs->eip);
474 __vmread(GUEST_RSP, &regs->esp);
475 #endif
476 }
478 if ( crs != NULL )
479 {
480 __vmread(CR0_READ_SHADOW, &crs[0]);
481 __vmread(GUEST_CR3, &crs[3]);
482 __vmread(CR4_READ_SHADOW, &crs[4]);
483 }
485 /* Reload current VCPU's VMCS if it was temporarily unloaded. */
486 if ( (v != current) && hvm_guest(current) )
487 __vmptrld(virt_to_maddr(current->arch.hvm_vmx.vmcs));
488 }
490 void vmx_load_cpu_guest_regs(struct vcpu *v, struct cpu_user_regs *regs)
491 {
492 if ( v != current )
493 {
494 /* Non-current VCPUs must be paused to set the register snapshot. */
495 ASSERT(atomic_read(&v->pausecnt) != 0);
497 if ( v->arch.hvm_vmx.launch_cpu != smp_processor_id() )
498 {
499 struct vmx_cpu_guest_regs_callback_info info = {
500 .v = v, .regs = regs };
501 cpumask_t cpumask = cpumask_of_cpu(v->arch.hvm_vmx.launch_cpu);
502 on_selected_cpus(cpumask, vmx_load_cpu_guest_regs_callback,
503 &info, 1, 1);
504 return;
505 }
507 /* Register details are on this CPU. Load the correct VMCS. */
508 __vmptrld(virt_to_maddr(v->arch.hvm_vmx.vmcs));
509 }
511 ASSERT(v->arch.hvm_vmx.launch_cpu == smp_processor_id());
513 #if defined (__x86_64__)
514 __vmwrite(GUEST_SS_SELECTOR, regs->ss);
515 __vmwrite(GUEST_DS_SELECTOR, regs->ds);
516 __vmwrite(GUEST_ES_SELECTOR, regs->es);
517 __vmwrite(GUEST_GS_SELECTOR, regs->gs);
518 __vmwrite(GUEST_FS_SELECTOR, regs->fs);
519 __vmwrite(GUEST_RSP, regs->rsp);
521 __vmwrite(GUEST_RFLAGS, regs->rflags);
522 if (regs->rflags & EF_TF)
523 __vm_set_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
524 else
525 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
527 __vmwrite(GUEST_CS_SELECTOR, regs->cs);
528 __vmwrite(GUEST_RIP, regs->rip);
529 #elif defined (__i386__)
530 __vmwrite(GUEST_SS_SELECTOR, regs->ss);
531 __vmwrite(GUEST_DS_SELECTOR, regs->ds);
532 __vmwrite(GUEST_ES_SELECTOR, regs->es);
533 __vmwrite(GUEST_GS_SELECTOR, regs->gs);
534 __vmwrite(GUEST_FS_SELECTOR, regs->fs);
536 __vmwrite(GUEST_RSP, regs->esp);
538 __vmwrite(GUEST_RFLAGS, regs->eflags);
539 if (regs->eflags & EF_TF)
540 __vm_set_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
541 else
542 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
544 __vmwrite(GUEST_CS_SELECTOR, regs->cs);
545 __vmwrite(GUEST_RIP, regs->eip);
546 #endif
548 /* Reload current VCPU's VMCS if it was temporarily unloaded. */
549 if ( (v != current) && hvm_guest(current) )
550 __vmptrld(virt_to_maddr(current->arch.hvm_vmx.vmcs));
551 }
553 int vmx_realmode(struct vcpu *v)
554 {
555 unsigned long rflags;
557 __vmread(GUEST_RFLAGS, &rflags);
558 return rflags & X86_EFLAGS_VM;
559 }
561 int vmx_instruction_length(struct vcpu *v)
562 {
563 unsigned long inst_len;
565 if (__vmread(VM_EXIT_INSTRUCTION_LEN, &inst_len))
566 return 0;
567 return inst_len;
568 }
570 unsigned long vmx_get_ctrl_reg(struct vcpu *v, unsigned int num)
571 {
572 switch ( num )
573 {
574 case 0:
575 return v->arch.hvm_vmx.cpu_cr0;
576 case 2:
577 return v->arch.hvm_vmx.cpu_cr2;
578 case 3:
579 return v->arch.hvm_vmx.cpu_cr3;
580 default:
581 BUG();
582 }
583 return 0; /* dummy */
584 }
586 /* SMP VMX guest support */
587 void vmx_init_ap_context(struct vcpu_guest_context *ctxt,
588 int vcpuid, int trampoline_vector)
589 {
590 int i;
592 memset(ctxt, 0, sizeof(*ctxt));
594 /*
595 * Initial register values:
596 */
597 ctxt->user_regs.eip = VMXASSIST_BASE;
598 ctxt->user_regs.edx = vcpuid;
599 ctxt->user_regs.ebx = trampoline_vector;
601 ctxt->flags = VGCF_HVM_GUEST;
603 /* Virtual IDT is empty at start-of-day. */
604 for ( i = 0; i < 256; i++ )
605 {
606 ctxt->trap_ctxt[i].vector = i;
607 ctxt->trap_ctxt[i].cs = FLAT_KERNEL_CS;
608 }
610 /* No callback handlers. */
611 #if defined(__i386__)
612 ctxt->event_callback_cs = FLAT_KERNEL_CS;
613 ctxt->failsafe_callback_cs = FLAT_KERNEL_CS;
614 #endif
615 }
617 void do_nmi(struct cpu_user_regs *);
619 static int check_vmx_controls(u32 ctrls, u32 msr)
620 {
621 u32 vmx_msr_low, vmx_msr_high;
623 rdmsr(msr, vmx_msr_low, vmx_msr_high);
624 if ( (ctrls < vmx_msr_low) || (ctrls > vmx_msr_high) )
625 {
626 printk("Insufficient VMX capability 0x%x, "
627 "msr=0x%x,low=0x%8x,high=0x%x\n",
628 ctrls, msr, vmx_msr_low, vmx_msr_high);
629 return 0;
630 }
631 return 1;
632 }
634 int start_vmx(void)
635 {
636 struct vmcs_struct *vmcs;
637 u32 ecx;
638 u32 eax, edx;
639 u64 phys_vmcs; /* debugging */
641 /*
642 * Xen does not fill x86_capability words except 0.
643 */
644 ecx = cpuid_ecx(1);
645 boot_cpu_data.x86_capability[4] = ecx;
647 if (!(test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability)))
648 return 0;
650 rdmsr(IA32_FEATURE_CONTROL_MSR, eax, edx);
652 if (eax & IA32_FEATURE_CONTROL_MSR_LOCK) {
653 if ((eax & IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON) == 0x0) {
654 printk("VMX disabled by Feature Control MSR.\n");
655 return 0;
656 }
657 }
658 else {
659 wrmsr(IA32_FEATURE_CONTROL_MSR,
660 IA32_FEATURE_CONTROL_MSR_LOCK |
661 IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON, 0);
662 }
664 if (!check_vmx_controls(MONITOR_PIN_BASED_EXEC_CONTROLS,
665 MSR_IA32_VMX_PINBASED_CTLS_MSR))
666 return 0;
667 if (!check_vmx_controls(MONITOR_CPU_BASED_EXEC_CONTROLS,
668 MSR_IA32_VMX_PROCBASED_CTLS_MSR))
669 return 0;
670 if (!check_vmx_controls(MONITOR_VM_EXIT_CONTROLS,
671 MSR_IA32_VMX_EXIT_CTLS_MSR))
672 return 0;
673 if (!check_vmx_controls(MONITOR_VM_ENTRY_CONTROLS,
674 MSR_IA32_VMX_ENTRY_CTLS_MSR))
675 return 0;
677 set_in_cr4(X86_CR4_VMXE); /* Enable VMXE */
679 if (!(vmcs = alloc_vmcs())) {
680 printk("Failed to allocate VMCS\n");
681 return 0;
682 }
684 phys_vmcs = (u64) virt_to_maddr(vmcs);
686 if (!(__vmxon(phys_vmcs))) {
687 printk("VMXON is done\n");
688 }
690 vmx_save_init_msrs();
692 /* Setup HVM interfaces */
693 hvm_funcs.disable = stop_vmx;
695 hvm_funcs.initialize_guest_resources = vmx_initialize_guest_resources;
696 hvm_funcs.relinquish_guest_resources = vmx_relinquish_guest_resources;
698 hvm_funcs.store_cpu_guest_regs = vmx_store_cpu_guest_regs;
699 hvm_funcs.load_cpu_guest_regs = vmx_load_cpu_guest_regs;
701 hvm_funcs.realmode = vmx_realmode;
702 hvm_funcs.paging_enabled = vmx_paging_enabled;
703 hvm_funcs.instruction_length = vmx_instruction_length;
704 hvm_funcs.get_guest_ctrl_reg = vmx_get_ctrl_reg;
706 hvm_funcs.init_ap_context = vmx_init_ap_context;
708 hvm_enabled = 1;
710 return 1;
711 }
713 /*
714 * Not all cases receive a valid value in the VM-exit instruction length field.
715 */
716 #define __get_instruction_length(len) \
717 __vmread(VM_EXIT_INSTRUCTION_LEN, &(len)); \
718 if ((len) < 1 || (len) > 15) \
719 __hvm_bug(&regs);
721 static void inline __update_guest_eip(unsigned long inst_len)
722 {
723 unsigned long current_eip;
725 __vmread(GUEST_RIP, &current_eip);
726 __vmwrite(GUEST_RIP, current_eip + inst_len);
727 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
728 }
731 static int vmx_do_page_fault(unsigned long va, struct cpu_user_regs *regs)
732 {
733 unsigned long gpa; /* FIXME: PAE */
734 int result;
736 #if 0 /* keep for debugging */
737 {
738 unsigned long eip;
740 __vmread(GUEST_RIP, &eip);
741 HVM_DBG_LOG(DBG_LEVEL_VMMU,
742 "vmx_do_page_fault = 0x%lx, eip = %lx, error_code = %lx",
743 va, eip, (unsigned long)regs->error_code);
744 }
745 #endif
747 if ( !vmx_paging_enabled(current) )
748 {
749 /* construct 1-to-1 direct mapping */
750 if ( shadow_direct_map_fault(va, regs) )
751 return 1;
753 handle_mmio(va, va);
754 TRACE_VMEXIT (2,2);
755 return 1;
756 }
757 gpa = gva_to_gpa(va);
759 /* Use 1:1 page table to identify MMIO address space */
760 if ( mmio_space(gpa) ){
761 struct vcpu *v = current;
762 /* No support for APIC */
763 if (!hvm_apic_support(v->domain) && gpa >= 0xFEC00000) {
764 u32 inst_len;
765 __vmread(VM_EXIT_INSTRUCTION_LEN, &(inst_len));
766 __update_guest_eip(inst_len);
767 return 1;
768 }
769 TRACE_VMEXIT (2,2);
770 handle_mmio(va, gpa);
771 return 1;
772 }
774 result = shadow_fault(va, regs);
775 TRACE_VMEXIT (2,result);
776 #if 0
777 if ( !result )
778 {
779 __vmread(GUEST_RIP, &eip);
780 printk("vmx pgfault to guest va=%lx eip=%lx\n", va, eip);
781 }
782 #endif
784 return result;
785 }
787 static void vmx_do_no_device_fault(void)
788 {
789 unsigned long cr0;
790 struct vcpu *v = current;
792 setup_fpu(current);
793 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
795 /* Disable TS in guest CR0 unless the guest wants the exception too. */
796 __vmread_vcpu(v, CR0_READ_SHADOW, &cr0);
797 if ( !(cr0 & X86_CR0_TS) )
798 {
799 __vmread_vcpu(v, GUEST_CR0, &cr0);
800 cr0 &= ~X86_CR0_TS;
801 __vmwrite(GUEST_CR0, cr0);
802 }
803 }
805 /* Reserved bits: [31:15], [12:11], [9], [6], [2:1] */
806 #define VMX_VCPU_CPUID_L1_RESERVED 0xffff9a46
808 static void vmx_vmexit_do_cpuid(struct cpu_user_regs *regs)
809 {
810 unsigned int input = (unsigned int)regs->eax;
811 unsigned int count = (unsigned int)regs->ecx;
812 unsigned int eax, ebx, ecx, edx;
813 unsigned long eip;
814 struct vcpu *v = current;
816 __vmread(GUEST_RIP, &eip);
818 HVM_DBG_LOG(DBG_LEVEL_3, "(eax) 0x%08lx, (ebx) 0x%08lx, "
819 "(ecx) 0x%08lx, (edx) 0x%08lx, (esi) 0x%08lx, (edi) 0x%08lx",
820 (unsigned long)regs->eax, (unsigned long)regs->ebx,
821 (unsigned long)regs->ecx, (unsigned long)regs->edx,
822 (unsigned long)regs->esi, (unsigned long)regs->edi);
824 if ( input == 4 )
825 cpuid_count(input, count, &eax, &ebx, &ecx, &edx);
826 else
827 cpuid(input, &eax, &ebx, &ecx, &edx);
829 if ( input == 1 )
830 {
831 if ( !hvm_apic_support(v->domain) ||
832 !vlapic_global_enabled((VLAPIC(v))) )
833 {
834 clear_bit(X86_FEATURE_APIC, &edx);
835 /* Since the apic is disabled, avoid any confusion about SMP cpus being available */
836 clear_bit(X86_FEATURE_HT, &edx); /* clear the hyperthread bit */
837 ebx &= 0xFF00FFFF; /* set the logical processor count to 1 */
838 ebx |= 0x00010000;
839 }
842 #if CONFIG_PAGING_LEVELS < 3
843 clear_bit(X86_FEATURE_PAE, &edx);
844 clear_bit(X86_FEATURE_PSE, &edx);
845 clear_bit(X86_FEATURE_PSE36, &edx);
846 #else
847 if ( v->domain->arch.ops->guest_paging_levels == PAGING_L2 )
848 {
849 if ( !v->domain->arch.hvm_domain.pae_enabled )
850 clear_bit(X86_FEATURE_PAE, &edx);
851 clear_bit(X86_FEATURE_PSE, &edx);
852 clear_bit(X86_FEATURE_PSE36, &edx);
853 }
854 #endif
856 /* Unsupportable for virtualised CPUs. */
857 ecx &= ~VMX_VCPU_CPUID_L1_RESERVED; /* mask off reserved bits */
858 clear_bit(X86_FEATURE_VMXE & 31, &ecx);
859 clear_bit(X86_FEATURE_MWAIT & 31, &ecx);
860 }
861 #ifdef __i386__
862 else if ( input == 0x80000001 )
863 {
864 /* Mask feature for Intel ia32e or AMD long mode. */
865 clear_bit(X86_FEATURE_LM & 31, &edx);
866 }
867 #endif
869 regs->eax = (unsigned long) eax;
870 regs->ebx = (unsigned long) ebx;
871 regs->ecx = (unsigned long) ecx;
872 regs->edx = (unsigned long) edx;
874 HVM_DBG_LOG(DBG_LEVEL_3, "eip@%lx, input: 0x%lx, "
875 "output: eax = 0x%08lx, ebx = 0x%08lx, "
876 "ecx = 0x%08lx, edx = 0x%08lx",
877 (unsigned long)eip, (unsigned long)input,
878 (unsigned long)eax, (unsigned long)ebx,
879 (unsigned long)ecx, (unsigned long)edx);
880 }
882 #define CASE_GET_REG_P(REG, reg) \
883 case REG_ ## REG: reg_p = (unsigned long *)&(regs->reg); break
885 static void vmx_dr_access (unsigned long exit_qualification, struct cpu_user_regs *regs)
886 {
887 unsigned int reg;
888 unsigned long *reg_p = 0;
889 struct vcpu *v = current;
890 unsigned long eip;
892 __vmread(GUEST_RIP, &eip);
894 reg = exit_qualification & DEBUG_REG_ACCESS_NUM;
896 HVM_DBG_LOG(DBG_LEVEL_1,
897 "vmx_dr_access : eip=%lx, reg=%d, exit_qualification = %lx",
898 eip, reg, exit_qualification);
900 switch(exit_qualification & DEBUG_REG_ACCESS_REG) {
901 CASE_GET_REG_P(EAX, eax);
902 CASE_GET_REG_P(ECX, ecx);
903 CASE_GET_REG_P(EDX, edx);
904 CASE_GET_REG_P(EBX, ebx);
905 CASE_GET_REG_P(EBP, ebp);
906 CASE_GET_REG_P(ESI, esi);
907 CASE_GET_REG_P(EDI, edi);
908 case REG_ESP:
909 break;
910 default:
911 __hvm_bug(regs);
912 }
914 switch (exit_qualification & DEBUG_REG_ACCESS_TYPE) {
915 case TYPE_MOV_TO_DR:
916 /* don't need to check the range */
917 if (reg != REG_ESP)
918 v->arch.guest_context.debugreg[reg] = *reg_p;
919 else {
920 unsigned long value;
921 __vmread(GUEST_RSP, &value);
922 v->arch.guest_context.debugreg[reg] = value;
923 }
924 break;
925 case TYPE_MOV_FROM_DR:
926 if (reg != REG_ESP)
927 *reg_p = v->arch.guest_context.debugreg[reg];
928 else {
929 __vmwrite(GUEST_RSP, v->arch.guest_context.debugreg[reg]);
930 }
931 break;
932 }
933 }
935 /*
936 * Invalidate the TLB for va. Invalidate the shadow page corresponding
937 * to the address va.
938 */
939 static void vmx_vmexit_do_invlpg(unsigned long va)
940 {
941 unsigned long eip;
942 struct vcpu *v = current;
944 __vmread(GUEST_RIP, &eip);
946 HVM_DBG_LOG(DBG_LEVEL_VMMU, "vmx_vmexit_do_invlpg: eip=%lx, va=%lx",
947 eip, va);
949 /*
950 * We do the safest things first, then try to update the shadow
951 * by copying from the guest.
952 */
953 shadow_invlpg(v, va);
954 }
956 static int check_for_null_selector(unsigned long eip)
957 {
958 unsigned char inst[MAX_INST_LEN];
959 unsigned long sel;
960 int i, inst_len;
961 int inst_copy_from_guest(unsigned char *, unsigned long, int);
963 __vmread(VM_EXIT_INSTRUCTION_LEN, &inst_len);
964 memset(inst, 0, MAX_INST_LEN);
965 if (inst_copy_from_guest(inst, eip, inst_len) != inst_len) {
966 printf("check_for_null_selector: get guest instruction failed\n");
967 domain_crash_synchronous();
968 }
970 for (i = 0; i < inst_len; i++) {
971 switch (inst[i]) {
972 case 0xf3: /* REPZ */
973 case 0xf2: /* REPNZ */
974 case 0xf0: /* LOCK */
975 case 0x66: /* data32 */
976 case 0x67: /* addr32 */
977 continue;
978 case 0x2e: /* CS */
979 __vmread(GUEST_CS_SELECTOR, &sel);
980 break;
981 case 0x36: /* SS */
982 __vmread(GUEST_SS_SELECTOR, &sel);
983 break;
984 case 0x26: /* ES */
985 __vmread(GUEST_ES_SELECTOR, &sel);
986 break;
987 case 0x64: /* FS */
988 __vmread(GUEST_FS_SELECTOR, &sel);
989 break;
990 case 0x65: /* GS */
991 __vmread(GUEST_GS_SELECTOR, &sel);
992 break;
993 case 0x3e: /* DS */
994 /* FALLTHROUGH */
995 default:
996 /* DS is the default */
997 __vmread(GUEST_DS_SELECTOR, &sel);
998 }
999 return sel == 0 ? 1 : 0;
1002 return 0;
1005 extern void send_pio_req(struct cpu_user_regs *regs, unsigned long port,
1006 unsigned long count, int size, long value,
1007 int dir, int pvalid);
1009 static void vmx_io_instruction(struct cpu_user_regs *regs,
1010 unsigned long exit_qualification, unsigned long inst_len)
1012 struct mmio_op *mmio_opp;
1013 unsigned long eip, cs, eflags;
1014 unsigned long port, size, dir;
1015 int vm86;
1017 mmio_opp = &current->arch.hvm_vcpu.mmio_op;
1018 mmio_opp->instr = INSTR_PIO;
1019 mmio_opp->flags = 0;
1021 __vmread(GUEST_RIP, &eip);
1022 __vmread(GUEST_CS_SELECTOR, &cs);
1023 __vmread(GUEST_RFLAGS, &eflags);
1024 vm86 = eflags & X86_EFLAGS_VM ? 1 : 0;
1026 HVM_DBG_LOG(DBG_LEVEL_IO,
1027 "vmx_io_instruction: vm86 %d, eip=%lx:%lx, "
1028 "exit_qualification = %lx",
1029 vm86, cs, eip, exit_qualification);
1031 if (test_bit(6, &exit_qualification))
1032 port = (exit_qualification >> 16) & 0xFFFF;
1033 else
1034 port = regs->edx & 0xffff;
1035 TRACE_VMEXIT(1, port);
1036 size = (exit_qualification & 7) + 1;
1037 dir = test_bit(3, &exit_qualification); /* direction */
1039 if (test_bit(4, &exit_qualification)) { /* string instruction */
1040 unsigned long addr, count = 1;
1041 int sign = regs->eflags & EF_DF ? -1 : 1;
1043 __vmread(GUEST_LINEAR_ADDRESS, &addr);
1045 /*
1046 * In protected mode, guest linear address is invalid if the
1047 * selector is null.
1048 */
1049 if (!vm86 && check_for_null_selector(eip))
1050 addr = dir == IOREQ_WRITE ? regs->esi : regs->edi;
1052 if (test_bit(5, &exit_qualification)) { /* "rep" prefix */
1053 mmio_opp->flags |= REPZ;
1054 count = vm86 ? regs->ecx & 0xFFFF : regs->ecx;
1057 /*
1058 * Handle string pio instructions that cross pages or that
1059 * are unaligned. See the comments in hvm_domain.c/handle_mmio()
1060 */
1061 if ((addr & PAGE_MASK) != ((addr + size - 1) & PAGE_MASK)) {
1062 unsigned long value = 0;
1064 mmio_opp->flags |= OVERLAP;
1065 if (dir == IOREQ_WRITE)
1066 hvm_copy(&value, addr, size, HVM_COPY_IN);
1067 send_pio_req(regs, port, 1, size, value, dir, 0);
1068 } else {
1069 if ((addr & PAGE_MASK) != ((addr + count * size - 1) & PAGE_MASK)) {
1070 if (sign > 0)
1071 count = (PAGE_SIZE - (addr & ~PAGE_MASK)) / size;
1072 else
1073 count = (addr & ~PAGE_MASK) / size;
1074 } else
1075 __update_guest_eip(inst_len);
1077 send_pio_req(regs, port, count, size, addr, dir, 1);
1079 } else {
1080 if (port == 0xe9 && dir == IOREQ_WRITE && size == 1)
1081 hvm_print_line(current, regs->eax); /* guest debug output */
1083 __update_guest_eip(inst_len);
1084 send_pio_req(regs, port, 1, size, regs->eax, dir, 0);
1088 int
1089 vmx_world_save(struct vcpu *v, struct vmx_assist_context *c)
1091 unsigned long inst_len;
1092 int error = 0;
1094 error |= __vmread(VM_EXIT_INSTRUCTION_LEN, &inst_len);
1095 error |= __vmread(GUEST_RIP, &c->eip);
1096 c->eip += inst_len; /* skip transition instruction */
1097 error |= __vmread(GUEST_RSP, &c->esp);
1098 error |= __vmread(GUEST_RFLAGS, &c->eflags);
1100 error |= __vmread(CR0_READ_SHADOW, &c->cr0);
1101 c->cr3 = v->arch.hvm_vmx.cpu_cr3;
1102 error |= __vmread(CR4_READ_SHADOW, &c->cr4);
1104 error |= __vmread(GUEST_IDTR_LIMIT, &c->idtr_limit);
1105 error |= __vmread(GUEST_IDTR_BASE, &c->idtr_base);
1107 error |= __vmread(GUEST_GDTR_LIMIT, &c->gdtr_limit);
1108 error |= __vmread(GUEST_GDTR_BASE, &c->gdtr_base);
1110 error |= __vmread(GUEST_CS_SELECTOR, &c->cs_sel);
1111 error |= __vmread(GUEST_CS_LIMIT, &c->cs_limit);
1112 error |= __vmread(GUEST_CS_BASE, &c->cs_base);
1113 error |= __vmread(GUEST_CS_AR_BYTES, &c->cs_arbytes.bytes);
1115 error |= __vmread(GUEST_DS_SELECTOR, &c->ds_sel);
1116 error |= __vmread(GUEST_DS_LIMIT, &c->ds_limit);
1117 error |= __vmread(GUEST_DS_BASE, &c->ds_base);
1118 error |= __vmread(GUEST_DS_AR_BYTES, &c->ds_arbytes.bytes);
1120 error |= __vmread(GUEST_ES_SELECTOR, &c->es_sel);
1121 error |= __vmread(GUEST_ES_LIMIT, &c->es_limit);
1122 error |= __vmread(GUEST_ES_BASE, &c->es_base);
1123 error |= __vmread(GUEST_ES_AR_BYTES, &c->es_arbytes.bytes);
1125 error |= __vmread(GUEST_SS_SELECTOR, &c->ss_sel);
1126 error |= __vmread(GUEST_SS_LIMIT, &c->ss_limit);
1127 error |= __vmread(GUEST_SS_BASE, &c->ss_base);
1128 error |= __vmread(GUEST_SS_AR_BYTES, &c->ss_arbytes.bytes);
1130 error |= __vmread(GUEST_FS_SELECTOR, &c->fs_sel);
1131 error |= __vmread(GUEST_FS_LIMIT, &c->fs_limit);
1132 error |= __vmread(GUEST_FS_BASE, &c->fs_base);
1133 error |= __vmread(GUEST_FS_AR_BYTES, &c->fs_arbytes.bytes);
1135 error |= __vmread(GUEST_GS_SELECTOR, &c->gs_sel);
1136 error |= __vmread(GUEST_GS_LIMIT, &c->gs_limit);
1137 error |= __vmread(GUEST_GS_BASE, &c->gs_base);
1138 error |= __vmread(GUEST_GS_AR_BYTES, &c->gs_arbytes.bytes);
1140 error |= __vmread(GUEST_TR_SELECTOR, &c->tr_sel);
1141 error |= __vmread(GUEST_TR_LIMIT, &c->tr_limit);
1142 error |= __vmread(GUEST_TR_BASE, &c->tr_base);
1143 error |= __vmread(GUEST_TR_AR_BYTES, &c->tr_arbytes.bytes);
1145 error |= __vmread(GUEST_LDTR_SELECTOR, &c->ldtr_sel);
1146 error |= __vmread(GUEST_LDTR_LIMIT, &c->ldtr_limit);
1147 error |= __vmread(GUEST_LDTR_BASE, &c->ldtr_base);
1148 error |= __vmread(GUEST_LDTR_AR_BYTES, &c->ldtr_arbytes.bytes);
1150 return !error;
1153 int
1154 vmx_world_restore(struct vcpu *v, struct vmx_assist_context *c)
1156 unsigned long mfn, old_cr4, old_base_mfn;
1157 int error = 0;
1159 error |= __vmwrite(GUEST_RIP, c->eip);
1160 error |= __vmwrite(GUEST_RSP, c->esp);
1161 error |= __vmwrite(GUEST_RFLAGS, c->eflags);
1163 error |= __vmwrite(CR0_READ_SHADOW, c->cr0);
1165 if (!vmx_paging_enabled(v)) {
1166 HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
1167 __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table));
1168 goto skip_cr3;
1171 if (c->cr3 == v->arch.hvm_vmx.cpu_cr3) {
1172 /*
1173 * This is a simple TLB flush, implying the guest has
1174 * removed some translation or changed page attributes.
1175 * We simply invalidate the shadow.
1176 */
1177 mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
1178 if (mfn != pagetable_get_pfn(v->arch.guest_table)) {
1179 printk("Invalid CR3 value=%x", c->cr3);
1180 domain_crash_synchronous();
1181 return 0;
1183 shadow_sync_all(v->domain);
1184 } else {
1185 /*
1186 * If different, make a shadow. Check if the PDBR is valid
1187 * first.
1188 */
1189 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %x", c->cr3);
1190 if ((c->cr3 >> PAGE_SHIFT) > v->domain->max_pages) {
1191 printk("Invalid CR3 value=%x", c->cr3);
1192 domain_crash_synchronous();
1193 return 0;
1195 mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
1196 if(!get_page(mfn_to_page(mfn), v->domain))
1197 return 0;
1198 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1199 v->arch.guest_table = mk_pagetable((u64)mfn << PAGE_SHIFT);
1200 if (old_base_mfn)
1201 put_page(mfn_to_page(old_base_mfn));
1202 /*
1203 * arch.shadow_table should now hold the next CR3 for shadow
1204 */
1205 v->arch.hvm_vmx.cpu_cr3 = c->cr3;
1206 update_pagetables(v);
1207 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3);
1208 __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
1211 skip_cr3:
1213 error |= __vmread(CR4_READ_SHADOW, &old_cr4);
1214 error |= __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
1215 error |= __vmwrite(CR4_READ_SHADOW, c->cr4);
1217 error |= __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
1218 error |= __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
1220 error |= __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
1221 error |= __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
1223 error |= __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
1224 error |= __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
1225 error |= __vmwrite(GUEST_CS_BASE, c->cs_base);
1226 error |= __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes.bytes);
1228 error |= __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
1229 error |= __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
1230 error |= __vmwrite(GUEST_DS_BASE, c->ds_base);
1231 error |= __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes.bytes);
1233 error |= __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
1234 error |= __vmwrite(GUEST_ES_LIMIT, c->es_limit);
1235 error |= __vmwrite(GUEST_ES_BASE, c->es_base);
1236 error |= __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes.bytes);
1238 error |= __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
1239 error |= __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
1240 error |= __vmwrite(GUEST_SS_BASE, c->ss_base);
1241 error |= __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes.bytes);
1243 error |= __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
1244 error |= __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
1245 error |= __vmwrite(GUEST_FS_BASE, c->fs_base);
1246 error |= __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes.bytes);
1248 error |= __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
1249 error |= __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
1250 error |= __vmwrite(GUEST_GS_BASE, c->gs_base);
1251 error |= __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes.bytes);
1253 error |= __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
1254 error |= __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
1255 error |= __vmwrite(GUEST_TR_BASE, c->tr_base);
1256 error |= __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes.bytes);
1258 error |= __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
1259 error |= __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
1260 error |= __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
1261 error |= __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes.bytes);
1263 return !error;
1266 enum { VMX_ASSIST_INVOKE = 0, VMX_ASSIST_RESTORE };
1268 int
1269 vmx_assist(struct vcpu *v, int mode)
1271 struct vmx_assist_context c;
1272 u32 magic;
1273 u32 cp;
1275 /* make sure vmxassist exists (this is not an error) */
1276 if (!hvm_copy(&magic, VMXASSIST_MAGIC_OFFSET, sizeof(magic), HVM_COPY_IN))
1277 return 0;
1278 if (magic != VMXASSIST_MAGIC)
1279 return 0;
1281 switch (mode) {
1282 /*
1283 * Transfer control to vmxassist.
1284 * Store the current context in VMXASSIST_OLD_CONTEXT and load
1285 * the new VMXASSIST_NEW_CONTEXT context. This context was created
1286 * by vmxassist and will transfer control to it.
1287 */
1288 case VMX_ASSIST_INVOKE:
1289 /* save the old context */
1290 if (!hvm_copy(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp), HVM_COPY_IN))
1291 goto error;
1292 if (cp != 0) {
1293 if (!vmx_world_save(v, &c))
1294 goto error;
1295 if (!hvm_copy(&c, cp, sizeof(c), HVM_COPY_OUT))
1296 goto error;
1299 /* restore the new context, this should activate vmxassist */
1300 if (!hvm_copy(&cp, VMXASSIST_NEW_CONTEXT, sizeof(cp), HVM_COPY_IN))
1301 goto error;
1302 if (cp != 0) {
1303 if (!hvm_copy(&c, cp, sizeof(c), HVM_COPY_IN))
1304 goto error;
1305 if (!vmx_world_restore(v, &c))
1306 goto error;
1307 return 1;
1309 break;
1311 /*
1312 * Restore the VMXASSIST_OLD_CONTEXT that was saved by VMX_ASSIST_INVOKE
1313 * above.
1314 */
1315 case VMX_ASSIST_RESTORE:
1316 /* save the old context */
1317 if (!hvm_copy(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp), HVM_COPY_IN))
1318 goto error;
1319 if (cp != 0) {
1320 if (!hvm_copy(&c, cp, sizeof(c), HVM_COPY_IN))
1321 goto error;
1322 if (!vmx_world_restore(v, &c))
1323 goto error;
1324 return 1;
1326 break;
1329 error:
1330 printf("Failed to transfer to vmxassist\n");
1331 domain_crash_synchronous();
1332 return 0;
1335 static int vmx_set_cr0(unsigned long value)
1337 struct vcpu *v = current;
1338 unsigned long mfn;
1339 unsigned long eip;
1340 int paging_enabled;
1341 unsigned long vm_entry_value;
1342 unsigned long old_cr0;
1344 /*
1345 * CR0: We don't want to lose PE and PG.
1346 */
1347 __vmread_vcpu(v, CR0_READ_SHADOW, &old_cr0);
1348 paging_enabled = (old_cr0 & X86_CR0_PE) && (old_cr0 & X86_CR0_PG);
1350 /* TS cleared? Then initialise FPU now. */
1351 if ( !(value & X86_CR0_TS) )
1353 setup_fpu(v);
1354 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
1357 __vmwrite(GUEST_CR0, value | X86_CR0_PE | X86_CR0_PG | X86_CR0_NE);
1358 __vmwrite(CR0_READ_SHADOW, value);
1360 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx\n", value);
1362 if ( (value & X86_CR0_PE) && (value & X86_CR0_PG) && !paging_enabled )
1364 /*
1365 * Trying to enable guest paging.
1367 * The guest CR3 must be pointing to a guest physical address.
1367 */
1368 if ( !VALID_MFN(mfn = get_mfn_from_gpfn(
1369 v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)) ||
1370 !get_page(mfn_to_page(mfn), v->domain) )
1372 printk("Invalid CR3 value = %lx", v->arch.hvm_vmx.cpu_cr3);
1373 domain_crash_synchronous(); /* need to take a clean path */
1376 #if defined(__x86_64__)
1377 if ( test_bit(VMX_CPU_STATE_LME_ENABLED,
1378 &v->arch.hvm_vmx.cpu_state) &&
1379 !test_bit(VMX_CPU_STATE_PAE_ENABLED,
1380 &v->arch.hvm_vmx.cpu_state) )
1382 HVM_DBG_LOG(DBG_LEVEL_1, "Enable paging before PAE enabled\n");
1383 vmx_inject_exception(v, TRAP_gp_fault, 0);
1386 if ( test_bit(VMX_CPU_STATE_LME_ENABLED,
1387 &v->arch.hvm_vmx.cpu_state) )
1389 /* At this point PAE should already be enabled */
1390 HVM_DBG_LOG(DBG_LEVEL_1, "Enable long mode\n");
1391 set_bit(VMX_CPU_STATE_LMA_ENABLED,
1392 &v->arch.hvm_vmx.cpu_state);
1394 __vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
1395 vm_entry_value |= VM_ENTRY_CONTROLS_IA32E_MODE;
1396 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1398 if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L4) )
1400 printk("Unsupported guest paging levels\n");
1401 domain_crash_synchronous(); /* need to take a clean path */
1404 else
1405 #endif /* __x86_64__ */
1407 #if CONFIG_PAGING_LEVELS >= 3
1408 /* it's a 32-bit guest, either plain or PAE */
1410 if ( test_bit(VMX_CPU_STATE_PAE_ENABLED,
1411 &v->arch.hvm_vmx.cpu_state) )
1413 /* The guest enables PAE first and then enables PG, so it is
1414 * really a PAE guest */
1415 if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) )
1417 printk("Unsupported guest paging levels\n");
1418 domain_crash_synchronous();
1421 else
1423 if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L2) )
1425 printk("Unsupported guest paging levels\n");
1426 domain_crash_synchronous(); /* need to take a clean path */
1429 #endif
1432 /*
1433 * Now arch.guest_table points to machine physical.
1434 */
1435 v->arch.guest_table = mk_pagetable((u64)mfn << PAGE_SHIFT);
1436 update_pagetables(v);
1438 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
1439 (unsigned long) (mfn << PAGE_SHIFT));
1441 __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
1442 /*
1443 * arch->shadow_table should hold the next CR3 for shadow
1444 */
1445 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
1446 v->arch.hvm_vmx.cpu_cr3, mfn);
1449 if ( !((value & X86_CR0_PE) && (value & X86_CR0_PG)) && paging_enabled )
1450 if ( v->arch.hvm_vmx.cpu_cr3 ) {
1451 put_page(mfn_to_page(get_mfn_from_gpfn(
1452 v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)));
1453 v->arch.guest_table = mk_pagetable(0);
1456 /*
1457 * VMX does not implement real-mode virtualization. We emulate
1458 * real-mode by performing a world switch to VMXAssist whenever
1459 * a partition disables the CR0.PE bit.
1460 */
1461 if ( (value & X86_CR0_PE) == 0 )
1463 if ( value & X86_CR0_PG ) {
1464 /* inject GP here */
1465 vmx_inject_exception(v, TRAP_gp_fault, 0);
1466 return 0;
1467 } else {
1468 /*
1469 * Disable paging here.
1470 * Same as PE == 1 && PG == 0
1471 */
1472 if ( test_bit(VMX_CPU_STATE_LMA_ENABLED,
1473 &v->arch.hvm_vmx.cpu_state) )
1475 clear_bit(VMX_CPU_STATE_LMA_ENABLED,
1476 &v->arch.hvm_vmx.cpu_state);
1477 __vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
1478 vm_entry_value &= ~VM_ENTRY_CONTROLS_IA32E_MODE;
1479 __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
1483 clear_all_shadow_status(v->domain);
1484 if ( vmx_assist(v, VMX_ASSIST_INVOKE) ) {
1485 set_bit(VMX_CPU_STATE_ASSIST_ENABLED, &v->arch.hvm_vmx.cpu_state);
1486 __vmread(GUEST_RIP, &eip);
1487 HVM_DBG_LOG(DBG_LEVEL_1,
1488 "Transferring control to vmxassist %%eip 0x%lx\n", eip);
1489 return 0; /* do not update eip! */
1491 } else if ( test_bit(VMX_CPU_STATE_ASSIST_ENABLED,
1492 &v->arch.hvm_vmx.cpu_state) )
1494 __vmread(GUEST_RIP, &eip);
1495 HVM_DBG_LOG(DBG_LEVEL_1,
1496 "Enabling CR0.PE at %%eip 0x%lx\n", eip);
1497 if ( vmx_assist(v, VMX_ASSIST_RESTORE) )
1499 clear_bit(VMX_CPU_STATE_ASSIST_ENABLED,
1500 &v->arch.hvm_vmx.cpu_state);
1501 __vmread(GUEST_RIP, &eip);
1502 HVM_DBG_LOG(DBG_LEVEL_1,
1503 "Restoring to %%eip 0x%lx\n", eip);
1504 return 0; /* do not update eip! */
1507 else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
1509 /* we should take care of this kind of situation */
1510 clear_all_shadow_status(v->domain);
1511 __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table));
1514 return 1;
1517 #define CASE_GET_REG(REG, reg) \
1518 case REG_ ## REG: value = regs->reg; break
1520 #define CASE_EXTEND_SET_REG \
1521 CASE_EXTEND_REG(S)
1522 #define CASE_EXTEND_GET_REG \
1523 CASE_EXTEND_REG(G)
1525 #ifdef __i386__
1526 #define CASE_EXTEND_REG(T)
1527 #else
1528 #define CASE_EXTEND_REG(T) \
1529 CASE_ ## T ## ET_REG(R8, r8); \
1530 CASE_ ## T ## ET_REG(R9, r9); \
1531 CASE_ ## T ## ET_REG(R10, r10); \
1532 CASE_ ## T ## ET_REG(R11, r11); \
1533 CASE_ ## T ## ET_REG(R12, r12); \
1534 CASE_ ## T ## ET_REG(R13, r13); \
1535 CASE_ ## T ## ET_REG(R14, r14); \
1536 CASE_ ## T ## ET_REG(R15, r15);
1537 #endif
1540 /*
1541 * Write to control registers
1542 */
1543 static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
1545 unsigned long value;
1546 unsigned long old_cr;
1547 struct vcpu *v = current;
1549 switch (gp) {
1550 CASE_GET_REG(EAX, eax);
1551 CASE_GET_REG(ECX, ecx);
1552 CASE_GET_REG(EDX, edx);
1553 CASE_GET_REG(EBX, ebx);
1554 CASE_GET_REG(EBP, ebp);
1555 CASE_GET_REG(ESI, esi);
1556 CASE_GET_REG(EDI, edi);
1557 CASE_EXTEND_GET_REG
1558 case REG_ESP:
1559 __vmread(GUEST_RSP, &value);
1560 break;
1561 default:
1562 printk("invalid gp: %d\n", gp);
1563 __hvm_bug(regs);
1566 HVM_DBG_LOG(DBG_LEVEL_1, "mov_to_cr: CR%d, value = %lx,", cr, value);
1567 HVM_DBG_LOG(DBG_LEVEL_1, "current = %lx,", (unsigned long) current);
1569 switch(cr) {
1570 case 0:
1572 return vmx_set_cr0(value);
1574 case 3:
1576 unsigned long old_base_mfn, mfn;
1578 /*
1579 * If paging is not enabled yet, simply copy the value to CR3.
1580 */
1581 if (!vmx_paging_enabled(v)) {
1582 v->arch.hvm_vmx.cpu_cr3 = value;
1583 break;
1586 /*
1587 * We make a new one if the shadow does not exist.
1588 */
1589 if (value == v->arch.hvm_vmx.cpu_cr3) {
1590 /*
1591 * This is a simple TLB flush, implying the guest has
1592 * removed some translation or changed page attributes.
1593 * We simply invalidate the shadow.
1594 */
1595 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
1596 if (mfn != pagetable_get_pfn(v->arch.guest_table))
1597 __hvm_bug(regs);
1598 shadow_sync_all(v->domain);
1599 } else {
1600 /*
1601 * If different, make a shadow. Check if the PDBR is valid
1602 * first.
1603 */
1604 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
1605 if ( ((value >> PAGE_SHIFT) > v->domain->max_pages ) ||
1606 !VALID_MFN(mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT)) ||
1607 !get_page(mfn_to_page(mfn), v->domain) )
1609 printk("Invalid CR3 value=%lx", value);
1610 domain_crash_synchronous(); /* need to take a clean path */
1612 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1613 v->arch.guest_table = mk_pagetable((u64)mfn << PAGE_SHIFT);
1614 if (old_base_mfn)
1615 put_page(mfn_to_page(old_base_mfn));
1616 /*
1617 * arch.shadow_table should now hold the next CR3 for shadow
1618 */
1619 #if CONFIG_PAGING_LEVELS >= 3
1620 if ( v->domain->arch.ops->guest_paging_levels == PAGING_L3 )
1621 shadow_sync_all(v->domain);
1622 #endif
1624 v->arch.hvm_vmx.cpu_cr3 = value;
1625 update_pagetables(v);
1626 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx",
1627 value);
1628 __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
1630 break;
1632 case 4: /* CR4 */
1634 __vmread(CR4_READ_SHADOW, &old_cr);
1636 if ( value & X86_CR4_PAE && !(old_cr & X86_CR4_PAE) )
1638 set_bit(VMX_CPU_STATE_PAE_ENABLED, &v->arch.hvm_vmx.cpu_state);
1640 if ( vmx_pgbit_test(v) )
1642 /* The guest is a 32-bit PAE guest. */
1643 #if CONFIG_PAGING_LEVELS >= 4
1644 unsigned long mfn, old_base_mfn;
1646 if( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) )
1648 printk("Unsupported guest paging levels\n");
1649 domain_crash_synchronous(); /* need to take a clean path */
1652 if ( !VALID_MFN(mfn = get_mfn_from_gpfn(
1653 v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)) ||
1654 !get_page(mfn_to_page(mfn), v->domain) )
1656 printk("Invalid CR3 value = %lx", v->arch.hvm_vmx.cpu_cr3);
1657 domain_crash_synchronous(); /* need to take a clean path */
1660 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1661 if ( old_base_mfn )
1662 put_page(mfn_to_page(old_base_mfn));
1664 /*
1665 * Now arch.guest_table points to machine physical.
1666 */
1668 v->arch.guest_table = mk_pagetable((u64)mfn << PAGE_SHIFT);
1669 update_pagetables(v);
1671 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
1672 (unsigned long) (mfn << PAGE_SHIFT));
1674 __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
1676 /*
1677 * arch->shadow_table should hold the next CR3 for shadow
1678 */
1680 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
1681 v->arch.hvm_vmx.cpu_cr3, mfn);
1682 #endif
1684 else
1686 /* The guest is a 64 bit or 32-bit PAE guest. */
1687 #if CONFIG_PAGING_LEVELS >= 4
1688 if ( (v->domain->arch.ops != NULL) &&
1689 v->domain->arch.ops->guest_paging_levels == PAGING_L2)
1691 /* The guest first enables PAE without enabling PG;
1692 * it will enable PG after that, so it is a 32-bit PAE
1693 * guest */
1695 if ( !shadow_set_guest_paging_levels(v->domain,
1696 PAGING_L3) )
1698 printk("Unsupported guest paging levels\n");
1699 /* need to take a clean path */
1700 domain_crash_synchronous();
1703 else
1705 if ( !shadow_set_guest_paging_levels(v->domain,
1706 PAGING_L4) )
1708 printk("Unsupported guest paging levels\n");
1709 domain_crash_synchronous();
1712 #endif
1715 else if ( value & X86_CR4_PAE )
1716 set_bit(VMX_CPU_STATE_PAE_ENABLED, &v->arch.hvm_vmx.cpu_state);
1717 else
1719 if ( test_bit(VMX_CPU_STATE_LMA_ENABLED, &v->arch.hvm_vmx.cpu_state) )
1720 vmx_inject_exception(v, TRAP_gp_fault, 0);
1722 clear_bit(VMX_CPU_STATE_PAE_ENABLED, &v->arch.hvm_vmx.cpu_state);
1725 __vmwrite(GUEST_CR4, value| VMX_CR4_HOST_MASK);
1726 __vmwrite(CR4_READ_SHADOW, value);
1728 /*
1729 * Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates
1730 * all TLB entries except global entries.
1731 */
1732 if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
1733 shadow_sync_all(v->domain);
1735 break;
1737 default:
1738 printk("invalid cr: %d\n", cr);
1739 __hvm_bug(regs);
1742 return 1;
1745 #define CASE_SET_REG(REG, reg) \
1746 case REG_ ## REG: \
1747 regs->reg = value; \
1748 break
1750 /*
1751 * Read from control registers. CR0 and CR4 are read from the shadow.
1752 */
1753 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
1755 unsigned long value;
1756 struct vcpu *v = current;
1758 if (cr != 3)
1759 __hvm_bug(regs);
1761 value = (unsigned long) v->arch.hvm_vmx.cpu_cr3;
1763 switch (gp) {
1764 CASE_SET_REG(EAX, eax);
1765 CASE_SET_REG(ECX, ecx);
1766 CASE_SET_REG(EDX, edx);
1767 CASE_SET_REG(EBX, ebx);
1768 CASE_SET_REG(EBP, ebp);
1769 CASE_SET_REG(ESI, esi);
1770 CASE_SET_REG(EDI, edi);
1771 CASE_EXTEND_SET_REG
1772 case REG_ESP:
1773 __vmwrite(GUEST_RSP, value);
1774 regs->esp = value;
1775 break;
1776 default:
1777 printk("invalid gp: %d\n", gp);
1778 __hvm_bug(regs);
1781 HVM_DBG_LOG(DBG_LEVEL_VMMU, "mov_from_cr: CR%d, value = %lx,", cr, value);
1784 static int vmx_cr_access(unsigned long exit_qualification, struct cpu_user_regs *regs)
1786 unsigned int gp, cr;
1787 unsigned long value;
1788 struct vcpu *v = current;
1790 switch (exit_qualification & CONTROL_REG_ACCESS_TYPE) {
1791 case TYPE_MOV_TO_CR:
1792 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
1793 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
1794 TRACE_VMEXIT(1,TYPE_MOV_TO_CR);
1795 TRACE_VMEXIT(2,cr);
1796 TRACE_VMEXIT(3,gp);
1797 return mov_to_cr(gp, cr, regs);
1798 case TYPE_MOV_FROM_CR:
1799 gp = exit_qualification & CONTROL_REG_ACCESS_REG;
1800 cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
1801 TRACE_VMEXIT(1,TYPE_MOV_FROM_CR);
1802 TRACE_VMEXIT(2,cr);
1803 TRACE_VMEXIT(3,gp);
1804 mov_from_cr(cr, gp, regs);
1805 break;
1806 case TYPE_CLTS:
1807 TRACE_VMEXIT(1,TYPE_CLTS);
1809 /* We initialise the FPU now, to avoid needing another vmexit. */
1810 setup_fpu(v);
1811 __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM);
1813 __vmread_vcpu(v, GUEST_CR0, &value);
1814 value &= ~X86_CR0_TS; /* clear TS */
1815 __vmwrite(GUEST_CR0, value);
1817 __vmread_vcpu(v, CR0_READ_SHADOW, &value);
1818 value &= ~X86_CR0_TS; /* clear TS */
1819 __vmwrite(CR0_READ_SHADOW, value);
1820 break;
1821 case TYPE_LMSW:
1822 TRACE_VMEXIT(1,TYPE_LMSW);
1823 __vmread_vcpu(v, CR0_READ_SHADOW, &value);
1824 value = (value & ~0xF) |
1825 (((exit_qualification & LMSW_SOURCE_DATA) >> 16) & 0xF);
1826 return vmx_set_cr0(value);
1827 break;
1828 default:
1829 __hvm_bug(regs);
1830 break;
1832 return 1;
1835 static inline void vmx_do_msr_read(struct cpu_user_regs *regs)
1837 u64 msr_content = 0;
1838 struct vcpu *v = current;
1840 HVM_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_read: ecx=%lx, eax=%lx, edx=%lx",
1841 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1842 (unsigned long)regs->edx);
1843 switch (regs->ecx) {
1844 case MSR_IA32_TIME_STAMP_COUNTER:
1846 struct hvm_time_info *time_info;
1848 rdtscll(msr_content);
1849 time_info = &(v->domain->arch.hvm_domain.vpit.time_info);
1850 msr_content += time_info->cache_tsc_offset;
1851 break;
1853 case MSR_IA32_SYSENTER_CS:
1854 __vmread(GUEST_SYSENTER_CS, (u32 *)&msr_content);
1855 break;
1856 case MSR_IA32_SYSENTER_ESP:
1857 __vmread(GUEST_SYSENTER_ESP, &msr_content);
1858 break;
1859 case MSR_IA32_SYSENTER_EIP:
1860 __vmread(GUEST_SYSENTER_EIP, &msr_content);
1861 break;
1862 case MSR_IA32_APICBASE:
1863 msr_content = VLAPIC(v) ? VLAPIC(v)->apic_base_msr : 0;
1864 break;
1865 default:
1866 if(long_mode_do_msr_read(regs))
1867 return;
1868 rdmsr_safe(regs->ecx, regs->eax, regs->edx);
1869 break;
1872 regs->eax = msr_content & 0xFFFFFFFF;
1873 regs->edx = msr_content >> 32;
1875 HVM_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_read returns: "
1876 "ecx=%lx, eax=%lx, edx=%lx",
1877 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1878 (unsigned long)regs->edx);
1881 static inline void vmx_do_msr_write(struct cpu_user_regs *regs)
1883 u64 msr_content;
1884 struct vcpu *v = current;
1886 HVM_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_write: ecx=%lx, eax=%lx, edx=%lx",
1887 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1888 (unsigned long)regs->edx);
1890 msr_content = (regs->eax & 0xFFFFFFFF) | ((u64)regs->edx << 32);
1892 switch (regs->ecx) {
1893 case MSR_IA32_TIME_STAMP_COUNTER:
1894 set_guest_time(v, msr_content);
1895 break;
1896 case MSR_IA32_SYSENTER_CS:
1897 __vmwrite(GUEST_SYSENTER_CS, msr_content);
1898 break;
1899 case MSR_IA32_SYSENTER_ESP:
1900 __vmwrite(GUEST_SYSENTER_ESP, msr_content);
1901 break;
1902 case MSR_IA32_SYSENTER_EIP:
1903 __vmwrite(GUEST_SYSENTER_EIP, msr_content);
1904 break;
1905 case MSR_IA32_APICBASE:
1906 vlapic_msr_set(VLAPIC(v), msr_content);
1907 break;
1908 default:
1909 long_mode_do_msr_write(regs);
1910 break;
1913 HVM_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_write returns: "
1914 "ecx=%lx, eax=%lx, edx=%lx",
1915 (unsigned long)regs->ecx, (unsigned long)regs->eax,
1916 (unsigned long)regs->edx);
1919 /*
1920 * Need to use this exit to reschedule
1921 */
1922 void vmx_vmexit_do_hlt(void)
1924 struct vcpu *v=current;
1925 struct hvm_virpit *vpit = &(v->domain->arch.hvm_domain.vpit);
1926 s_time_t next_pit=-1,next_wakeup;
1928 if ( !v->vcpu_id )
1929 next_pit = get_pit_scheduled(v,vpit);
1930 next_wakeup = get_apictime_scheduled(v);
1931 if ( (next_pit != -1 && next_pit < next_wakeup) || next_wakeup == -1 )
1932 next_wakeup = next_pit;
1933 if ( next_wakeup != - 1 )
1934 set_timer(&current->arch.hvm_vmx.hlt_timer, next_wakeup);
1935 hvm_safe_block();
1938 static inline void vmx_vmexit_do_extint(struct cpu_user_regs *regs)
1940 unsigned int vector;
1941 int error;
1943 asmlinkage void do_IRQ(struct cpu_user_regs *);
1944 fastcall void smp_apic_timer_interrupt(struct cpu_user_regs *);
1945 fastcall void smp_event_check_interrupt(void);
1946 fastcall void smp_invalidate_interrupt(void);
1947 fastcall void smp_call_function_interrupt(void);
1948 fastcall void smp_spurious_interrupt(struct cpu_user_regs *regs);
1949 fastcall void smp_error_interrupt(struct cpu_user_regs *regs);
1950 #ifdef CONFIG_X86_MCE_P4THERMAL
1951 fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs);
1952 #endif
1954 if ((error = __vmread(VM_EXIT_INTR_INFO, &vector))
1955 || !(vector & INTR_INFO_VALID_MASK))
1956 __hvm_bug(regs);
1958 vector &= INTR_INFO_VECTOR_MASK;
1959 local_irq_disable();
1960 TRACE_VMEXIT(1,vector);
1962 switch(vector) {
1963 case LOCAL_TIMER_VECTOR:
1964 smp_apic_timer_interrupt(regs);
1965 break;
1966 case EVENT_CHECK_VECTOR:
1967 smp_event_check_interrupt();
1968 break;
1969 case INVALIDATE_TLB_VECTOR:
1970 smp_invalidate_interrupt();
1971 break;
1972 case CALL_FUNCTION_VECTOR:
1973 smp_call_function_interrupt();
1974 break;
1975 case SPURIOUS_APIC_VECTOR:
1976 smp_spurious_interrupt(regs);
1977 break;
1978 case ERROR_APIC_VECTOR:
1979 smp_error_interrupt(regs);
1980 break;
1981 #ifdef CONFIG_X86_MCE_P4THERMAL
1982 case THERMAL_APIC_VECTOR:
1983 smp_thermal_interrupt(regs);
1984 break;
1985 #endif
1986 default:
1987 regs->entry_vector = vector;
1988 do_IRQ(regs);
1989 break;
1990 }
1991 }
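/*
 * Fill in the VMCS-resident part of a cpu_user_regs frame: the guest's
 * segment selectors, stack pointer, flags and instruction pointer are
 * read back from the VMCS rather than from the host stack.
 */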
1993 #if defined (__x86_64__)
1994 void store_cpu_user_regs(struct cpu_user_regs *regs)
1995 {
1996 __vmread(GUEST_SS_SELECTOR, &regs->ss);
1997 __vmread(GUEST_RSP, &regs->rsp);
1998 __vmread(GUEST_RFLAGS, &regs->rflags);
1999 __vmread(GUEST_CS_SELECTOR, &regs->cs);
2000 __vmread(GUEST_DS_SELECTOR, &regs->ds);
2001 __vmread(GUEST_ES_SELECTOR, &regs->es);
2002 __vmread(GUEST_RIP, &regs->rip);
2003 }
2004 #elif defined (__i386__)
2005 void store_cpu_user_regs(struct cpu_user_regs *regs)
2006 {
2007 __vmread(GUEST_SS_SELECTOR, &regs->ss);
2008 __vmread(GUEST_RSP, &regs->esp);
2009 __vmread(GUEST_RFLAGS, &regs->eflags);
2010 __vmread(GUEST_CS_SELECTOR, &regs->cs);
2011 __vmread(GUEST_DS_SELECTOR, &regs->ds);
2012 __vmread(GUEST_ES_SELECTOR, &regs->es);
2013 __vmread(GUEST_RIP, &regs->eip);
2014 }
2015 #endif
2017 #ifdef XEN_DEBUGGER
2018 void save_cpu_user_regs(struct cpu_user_regs *regs)
2019 {
2020 __vmread(GUEST_SS_SELECTOR, &regs->xss);
2021 __vmread(GUEST_RSP, &regs->esp);
2022 __vmread(GUEST_RFLAGS, &regs->eflags);
2023 __vmread(GUEST_CS_SELECTOR, &regs->xcs);
2024 __vmread(GUEST_RIP, &regs->eip);
2026 __vmread(GUEST_GS_SELECTOR, &regs->xgs);
2027 __vmread(GUEST_FS_SELECTOR, &regs->xfs);
2028 __vmread(GUEST_ES_SELECTOR, &regs->xes);
2029 __vmread(GUEST_DS_SELECTOR, &regs->xds);
2030 }
2032 void restore_cpu_user_regs(struct cpu_user_regs *regs)
2033 {
2034 __vmwrite(GUEST_SS_SELECTOR, regs->xss);
2035 __vmwrite(GUEST_RSP, regs->esp);
2036 __vmwrite(GUEST_RFLAGS, regs->eflags);
2037 __vmwrite(GUEST_CS_SELECTOR, regs->xcs);
2038 __vmwrite(GUEST_RIP, regs->eip);
2040 __vmwrite(GUEST_GS_SELECTOR, regs->xgs);
2041 __vmwrite(GUEST_FS_SELECTOR, regs->xfs);
2042 __vmwrite(GUEST_ES_SELECTOR, regs->xes);
2043 __vmwrite(GUEST_DS_SELECTOR, regs->xds);
2044 }
2045 #endif
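/*
 * Main VM-exit dispatcher.  'regs' is the guest register frame built by
 * the low-level exit path.  Any event whose delivery was interrupted by
 * the exit is queued again for the next VM entry, and the exit reason is
 * then handled case by case; unexpected reasons end up in __hvm_bug() or
 * crash the domain.
 */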
2047 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs regs)
2048 {
2049 unsigned int exit_reason, idtv_info_field;
2050 unsigned long exit_qualification, eip, inst_len = 0;
2051 struct vcpu *v = current;
2052 int error;
2054 if ((error = __vmread(VM_EXIT_REASON, &exit_reason)))
2055 __hvm_bug(&regs);
2057 perfc_incra(vmexits, exit_reason);
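/*
 * If this exit interrupted the delivery of an event through the guest
 * IDT (the IDT-vectoring information is valid), re-queue that event for
 * the next VM entry, propagating the instruction length and error code
 * when they are present.
 */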
2059 __vmread(IDT_VECTORING_INFO_FIELD, &idtv_info_field);
2060 if (idtv_info_field & INTR_INFO_VALID_MASK) {
2061 __vmwrite(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field);
2063 __vmread(VM_EXIT_INSTRUCTION_LEN, &inst_len);
2064 if (inst_len >= 1 && inst_len <= 15)
2065 __vmwrite(VM_ENTRY_INSTRUCTION_LEN, inst_len);
2067 if (idtv_info_field & 0x800) { /* valid error code */
2068 unsigned long error_code;
2069 __vmread(IDT_VECTORING_ERROR_CODE, &error_code);
2070 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
2071 }
2073 HVM_DBG_LOG(DBG_LEVEL_1, "idtv_info_field=%x", idtv_info_field);
2074 }
2076 /* Don't bother logging H/W interrupt, VMCALL or I/O-instruction exits. */
2077 if (exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT &&
2078 exit_reason != EXIT_REASON_VMCALL &&
2079 exit_reason != EXIT_REASON_IO_INSTRUCTION)
2080 HVM_DBG_LOG(DBG_LEVEL_0, "exit reason = %x", exit_reason);
2082 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
2083 printk("Failed vm entry\n");
2084 domain_crash_synchronous();
2085 return;
2086 }
2088 __vmread(GUEST_RIP, &eip);
2089 TRACE_VMEXIT(0,exit_reason);
2091 switch (exit_reason) {
2092 case EXIT_REASON_EXCEPTION_NMI:
2093 {
2094 /*
2095 * Software-interrupt (INT n) exiting is not enabled, so this exit
2096 * is caused either by (1) a hardware exception (e.g. #PF) raised in
2097 * the guest, or (2) an NMI.
2098 */
2099 int error;
2100 unsigned int vector;
2101 unsigned long va;
2103 if ((error = __vmread(VM_EXIT_INTR_INFO, &vector))
2104 || !(vector & INTR_INFO_VALID_MASK))
2105 __hvm_bug(&regs);
2106 vector &= INTR_INFO_VECTOR_MASK;
2108 TRACE_VMEXIT(1,vector);
2109 perfc_incra(cause_vector, vector);
2111 switch (vector) {
2112 #ifdef XEN_DEBUGGER
2113 case TRAP_debug:
2114 {
2115 save_cpu_user_regs(&regs);
2116 pdb_handle_exception(1, &regs, 1);
2117 restore_cpu_user_regs(&regs);
2118 break;
2119 }
2120 case TRAP_int3:
2121 {
2122 save_cpu_user_regs(&regs);
2123 pdb_handle_exception(3, &regs, 1);
2124 restore_cpu_user_regs(&regs);
2125 break;
2126 }
2127 #else
2128 case TRAP_debug:
2129 {
2130 void store_cpu_user_regs(struct cpu_user_regs *regs);
2132 store_cpu_user_regs(&regs);
2133 __vm_clear_bit(GUEST_PENDING_DBG_EXCEPTIONS, PENDING_DEBUG_EXC_BS);
2135 domain_pause_for_debugger();
2137 break;
2138 }
2139 case TRAP_int3:
2140 {
2141 if ( test_bit(_DOMF_debugging, &v->domain->domain_flags) )
2142 domain_pause_for_debugger();
2143 else
2144 vmx_inject_exception(v, TRAP_int3, VMX_DELIVER_NO_ERROR_CODE);
2145 break;
2146 }
2147 #endif
2148 case TRAP_no_device:
2149 {
2150 vmx_do_no_device_fault();
2151 break;
2152 }
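/*
 * Guest page fault: vmx_do_page_fault() asks the shadow code to resolve
 * the fault.  If it cannot, the fault belongs to the guest and #PF is
 * injected back with the original error code, with the faulting address
 * recorded in the virtual CR2.
 */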
2153 case TRAP_page_fault:
2154 {
2155 __vmread(EXIT_QUALIFICATION, &va);
2156 __vmread(VM_EXIT_INTR_ERROR_CODE, &regs.error_code);
2158 TRACE_VMEXIT(3,regs.error_code);
2159 TRACE_VMEXIT(4,va);
2161 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2162 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
2163 (unsigned long)regs.eax, (unsigned long)regs.ebx,
2164 (unsigned long)regs.ecx, (unsigned long)regs.edx,
2165 (unsigned long)regs.esi, (unsigned long)regs.edi);
2166 v->arch.hvm_vcpu.mmio_op.inst_decoder_regs = &regs;
2168 if (!(error = vmx_do_page_fault(va, &regs))) {
2169 /*
2170 * Inject #PF into the guest using the interruption-information fields.
2171 */
2172 vmx_inject_exception(v, TRAP_page_fault, regs.error_code);
2173 v->arch.hvm_vmx.cpu_cr2 = va;
2174 TRACE_3D(TRC_VMX_INT, v->domain->domain_id, TRAP_page_fault, va);
2175 }
2176 break;
2177 }
2178 case TRAP_nmi:
2179 do_nmi(&regs);
2180 break;
2181 default:
2182 vmx_reflect_exception(v);
2183 break;
2184 }
2185 break;
2186 }
2187 case EXIT_REASON_EXTERNAL_INTERRUPT:
2188 vmx_vmexit_do_extint(&regs);
2189 break;
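/*
 * Interrupt-window exit: the guest is now able to accept an interrupt,
 * so drop back to the baseline execution controls (presumably clearing
 * the interrupt-window request); the injection itself is expected to
 * happen on the VM-entry path.
 */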
2190 case EXIT_REASON_PENDING_INTERRUPT:
2191 __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
2192 MONITOR_CPU_BASED_EXEC_CONTROLS);
2193 v->arch.hvm_vcpu.u.vmx.exec_control = MONITOR_CPU_BASED_EXEC_CONTROLS;
2194 break;
2195 case EXIT_REASON_TASK_SWITCH:
2196 __hvm_bug(&regs);
2197 break;
2198 case EXIT_REASON_CPUID:
2199 vmx_vmexit_do_cpuid(&regs);
2200 __get_instruction_length(inst_len);
2201 __update_guest_eip(inst_len);
2202 break;
2203 case EXIT_REASON_HLT:
2204 __get_instruction_length(inst_len);
2205 __update_guest_eip(inst_len);
2206 vmx_vmexit_do_hlt();
2207 break;
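/*
 * INVLPG: the guest invalidated a linear address, so the corresponding
 * shadow mapping is flushed via vmx_vmexit_do_invlpg().
 */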
2208 case EXIT_REASON_INVLPG:
2209 {
2210 unsigned long va;
2212 __vmread(EXIT_QUALIFICATION, &va);
2213 vmx_vmexit_do_invlpg(va);
2214 __get_instruction_length(inst_len);
2215 __update_guest_eip(inst_len);
2216 break;
2217 }
2218 #if 0 /* keep this for debugging */
2219 case EXIT_REASON_VMCALL:
2220 __get_instruction_length(inst_len);
2221 __vmread(GUEST_RIP, &eip);
2222 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2224 hvm_print_line(v, regs.eax); /* provides the current domain */
2225 __update_guest_eip(inst_len);
2226 break;
2227 #endif
2228 case EXIT_REASON_CR_ACCESS:
2229 {
2230 __vmread(GUEST_RIP, &eip);
2231 __get_instruction_length(inst_len);
2232 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2234 HVM_DBG_LOG(DBG_LEVEL_1, "eip = %lx, inst_len =%lx, exit_qualification = %lx",
2235 eip, inst_len, exit_qualification);
2236 if (vmx_cr_access(exit_qualification, &regs))
2237 __update_guest_eip(inst_len);
2238 TRACE_VMEXIT(3,regs.error_code);
2239 TRACE_VMEXIT(4,exit_qualification);
2240 break;
2241 }
2242 case EXIT_REASON_DR_ACCESS:
2243 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2244 vmx_dr_access(exit_qualification, &regs);
2245 __get_instruction_length(inst_len);
2246 __update_guest_eip(inst_len);
2247 break;
2248 case EXIT_REASON_IO_INSTRUCTION:
2249 __vmread(EXIT_QUALIFICATION, &exit_qualification);
2250 __get_instruction_length(inst_len);
2251 vmx_io_instruction(&regs, exit_qualification, inst_len);
2252 TRACE_VMEXIT(4,exit_qualification);
2253 break;
2254 case EXIT_REASON_MSR_READ:
2255 __get_instruction_length(inst_len);
2256 vmx_do_msr_read(&regs);
2257 __update_guest_eip(inst_len);
2258 break;
2259 case EXIT_REASON_MSR_WRITE:
2260 __vmread(GUEST_RIP, &eip);
2261 vmx_do_msr_write(&regs);
2262 __get_instruction_length(inst_len);
2263 __update_guest_eip(inst_len);
2264 break;
2265 case EXIT_REASON_MWAIT_INSTRUCTION:
2266 __hvm_bug(&regs);
2267 break;
2268 case EXIT_REASON_VMCALL:
2269 case EXIT_REASON_VMCLEAR:
2270 case EXIT_REASON_VMLAUNCH:
2271 case EXIT_REASON_VMPTRLD:
2272 case EXIT_REASON_VMPTRST:
2273 case EXIT_REASON_VMREAD:
2274 case EXIT_REASON_VMRESUME:
2275 case EXIT_REASON_VMWRITE:
2276 case EXIT_REASON_VMOFF:
2277 case EXIT_REASON_VMON:
2278 /* Report an invalid-opcode exception when a VMX guest tries to execute
2279 any of the VMX instructions. */
2280 vmx_inject_exception(v, TRAP_invalid_op, VMX_DELIVER_NO_ERROR_CODE);
2281 break;
2283 default:
2284 __hvm_bug(&regs); /* should not happen */
2285 }
2286 }
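/*
 * CR2 is not context-switched by the VMCS, so the virtual CR2 recorded
 * at page-fault time is loaded into the real register, with interrupts
 * off, on the way back into the guest.
 */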
2288 asmlinkage void vmx_load_cr2(void)
2289 {
2290 struct vcpu *v = current;
2292 local_irq_disable();
2293 #ifdef __i386__
2294 asm volatile("movl %0,%%cr2": :"r" (v->arch.hvm_vmx.cpu_cr2));
2295 #else
2296 asm volatile("movq %0,%%cr2": :"r" (v->arch.hvm_vmx.cpu_cr2));
2297 #endif
2298 }
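/*
 * Trace hooks: the per-CPU trace_values[] slots filled by TRACE_VMEXIT()
 * during exit handling are emitted as a TRC_VMX_VMENTRY record on the
 * next entry and then reset to a sentinel value (9).
 */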
2300 asmlinkage void vmx_trace_vmentry (void)
2301 {
2302 TRACE_5D(TRC_VMX_VMENTRY,
2303 trace_values[smp_processor_id()][0],
2304 trace_values[smp_processor_id()][1],
2305 trace_values[smp_processor_id()][2],
2306 trace_values[smp_processor_id()][3],
2307 trace_values[smp_processor_id()][4]);
2308 TRACE_VMEXIT(0,9);
2309 TRACE_VMEXIT(1,9);
2310 TRACE_VMEXIT(2,9);
2311 TRACE_VMEXIT(3,9);
2312 TRACE_VMEXIT(4,9);
2313 return;
2314 }
2316 asmlinkage void vmx_trace_vmexit (void)
2317 {
2318 TRACE_3D(TRC_VMX_VMEXIT,0,0,0);
2319 return;
2320 }
2322 /*
2323 * Local variables:
2324 * mode: C
2325 * c-set-style: "BSD"
2326 * c-basic-offset: 4
2327 * tab-width: 4
2328 * indent-tabs-mode: nil
2329 * End:
2330 */