ia64/xen-unstable: xen/arch/x86/hvm/svm/svm.c @ 14635:5c52e5ca8459

hvm: Clean up handling of exception intercepts.
Only intercept #DB/#BP if a debugger is attached.
Signed-off-by: Keir Fraser <keir@xensource.com>

author   Keir Fraser <keir@xensource.com>
date     Wed Mar 28 18:47:17 2007 +0100 (2007-03-28)
parents  ffb9dda42946
children 98b049ed2540
1 /*
2 * svm.c: handling SVM architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 * Copyright (c) 2005, AMD Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
17 * Place - Suite 330, Boston, MA 02111-1307 USA.
18 *
19 */
21 #include <xen/config.h>
22 #include <xen/init.h>
23 #include <xen/lib.h>
24 #include <xen/trace.h>
25 #include <xen/sched.h>
26 #include <xen/irq.h>
27 #include <xen/softirq.h>
28 #include <xen/hypercall.h>
29 #include <xen/domain_page.h>
30 #include <asm/current.h>
31 #include <asm/io.h>
32 #include <asm/paging.h>
33 #include <asm/p2m.h>
34 #include <asm/regs.h>
35 #include <asm/cpufeature.h>
36 #include <asm/processor.h>
37 #include <asm/types.h>
38 #include <asm/msr.h>
39 #include <asm/spinlock.h>
40 #include <asm/hvm/hvm.h>
41 #include <asm/hvm/support.h>
42 #include <asm/hvm/io.h>
43 #include <asm/hvm/svm/svm.h>
44 #include <asm/hvm/svm/vmcb.h>
45 #include <asm/hvm/svm/emulate.h>
46 #include <asm/hvm/svm/intr.h>
47 #include <asm/x86_emulate.h>
48 #include <public/sched.h>
49 #include <asm/hvm/vpt.h>
50 #include <asm/hvm/trace.h>
51 #include <asm/hap.h>
53 #define SVM_EXTRA_DEBUG
55 #define set_segment_register(name, value) \
56 __asm__ __volatile__ ( "movw %%ax ,%%" STR(name) "" : : "a" (value) )
58 /* External functions. We should move these to some suitable header file(s) */
60 extern int inst_copy_from_guest(unsigned char *buf, unsigned long guest_eip,
61 int inst_len);
62 extern asmlinkage void do_IRQ(struct cpu_user_regs *);
63 extern void svm_dump_inst(unsigned long eip);
64 extern int svm_dbg_on;
65 void svm_dump_regs(const char *from, struct cpu_user_regs *regs);
67 static int svm_reset_to_realmode(struct vcpu *v,
68 struct cpu_user_regs *regs);
70 /* va of hardware host save area */
71 static void *hsa[NR_CPUS] __read_mostly;
73 /* vmcb used for extended host state */
74 static void *root_vmcb[NR_CPUS] __read_mostly;
76 /* physical address of above for host VMSAVE/VMLOAD */
77 u64 root_vmcb_pa[NR_CPUS] __read_mostly;
79 /* hardware assisted paging bits */
80 extern int opt_hap_enabled;
81 extern int hap_capable_system;
83 static inline void svm_inject_exception(struct vcpu *v, int trap,
84 int ev, int error_code)
85 {
86 eventinj_t event;
87 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
89 if ( trap == TRAP_page_fault )
90 HVMTRACE_2D(PF_INJECT, v, v->arch.hvm_svm.cpu_cr2, error_code);
91 else
92 HVMTRACE_2D(INJ_EXC, v, trap, error_code);
94 event.bytes = 0;
95 event.fields.v = 1;
96 event.fields.type = EVENTTYPE_EXCEPTION;
97 event.fields.vector = trap;
98 event.fields.ev = ev;
99 event.fields.errorcode = error_code;
101 ASSERT(vmcb->eventinj.fields.v == 0);
103 vmcb->eventinj = event;
104 }
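/*
 * Illustrative sketch (not part of changeset 14635): minimal usage of the
 * helper above. #GP pushes an error code, so 'ev' is 1; exceptions without
 * one (e.g. #UD) would pass ev = 0. The function name is an assumption,
 * used only for illustration.
 */
static inline void example_inject_gp0(struct vcpu *v)
{
    svm_inject_exception(v, TRAP_gp_fault, 1 /* error code valid */, 0);
}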
106 static void stop_svm(void)
107 {
108 u32 eax, edx;
109 /* We turn off the EFER_SVME bit. */
110 rdmsr(MSR_EFER, eax, edx);
111 eax &= ~EFER_SVME;
112 wrmsr(MSR_EFER, eax, edx);
113 }
115 static void svm_store_cpu_guest_regs(
116 struct vcpu *v, struct cpu_user_regs *regs, unsigned long *crs)
117 {
118 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
120 if ( regs != NULL )
121 {
122 regs->eip = vmcb->rip;
123 regs->esp = vmcb->rsp;
124 regs->eflags = vmcb->rflags;
125 regs->cs = vmcb->cs.sel;
126 regs->ds = vmcb->ds.sel;
127 regs->es = vmcb->es.sel;
128 regs->ss = vmcb->ss.sel;
129 regs->gs = vmcb->gs.sel;
130 regs->fs = vmcb->fs.sel;
131 }
133 if ( crs != NULL )
134 {
135 /* Returning the guest's regs */
136 crs[0] = v->arch.hvm_svm.cpu_shadow_cr0;
137 crs[2] = v->arch.hvm_svm.cpu_cr2;
138 crs[3] = v->arch.hvm_svm.cpu_cr3;
139 crs[4] = v->arch.hvm_svm.cpu_shadow_cr4;
140 }
141 }
144 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
145 {
146 u64 msr_content = 0;
147 struct vcpu *v = current;
148 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
150 switch ((u32)regs->ecx)
151 {
152 case MSR_EFER:
153 msr_content = v->arch.hvm_svm.cpu_shadow_efer;
154 break;
156 #ifdef __x86_64__
157 case MSR_FS_BASE:
158 msr_content = vmcb->fs.base;
159 goto check_long_mode;
161 case MSR_GS_BASE:
162 msr_content = vmcb->gs.base;
163 goto check_long_mode;
165 case MSR_SHADOW_GS_BASE:
166 msr_content = vmcb->kerngsbase;
167 check_long_mode:
168 if ( !svm_long_mode_enabled(v) )
169 {
170 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
171 return 0;
172 }
173 break;
174 #endif
176 case MSR_STAR:
177 msr_content = vmcb->star;
178 break;
180 case MSR_LSTAR:
181 msr_content = vmcb->lstar;
182 break;
184 case MSR_CSTAR:
185 msr_content = vmcb->cstar;
186 break;
188 case MSR_SYSCALL_MASK:
189 msr_content = vmcb->sfmask;
190 break;
191 default:
192 return 0;
193 }
195 HVM_DBG_LOG(DBG_LEVEL_2, "msr_content: %"PRIx64"\n",
196 msr_content);
198 regs->eax = (u32)(msr_content >> 0);
199 regs->edx = (u32)(msr_content >> 32);
200 return 1;
201 }
203 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
204 {
205 u64 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
206 u32 ecx = regs->ecx;
207 struct vcpu *v = current;
208 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
210 HVM_DBG_LOG(DBG_LEVEL_1, "msr %x msr_content %"PRIx64"\n",
211 ecx, msr_content);
213 switch ( ecx )
214 {
215 case MSR_EFER:
216 /* offending reserved bit will cause #GP */
217 if ( msr_content & ~(EFER_LME | EFER_LMA | EFER_NX | EFER_SCE) )
218 {
219 gdprintk(XENLOG_WARNING, "Trying to set reserved bit in "
220 "EFER: %"PRIx64"\n", msr_content);
221 goto gp_fault;
222 }
224 /*
225 * update the VMCB's EFER with the intended value along with
226 * that crucial EFER.SVME bit =)
227 */
228 vmcb->efer = msr_content | EFER_SVME;
230 #ifdef __x86_64__
232 /*
233 * Check for EFER.LME transitions from 0->1 or 1->0. Do the
234 * sanity checks and then make sure that both EFER.LME and
235 * EFER.LMA are cleared. (EFER.LME can't be set in the vmcb
236 * until the guest also sets CR0.PG, since even if the guest has
237 * paging "disabled", the vmcb's CR0 always has PG set.)
238 */
239 if ( (msr_content & EFER_LME) && !svm_lme_is_set(v) )
240 {
241 /* EFER.LME transition from 0 to 1 */
243 if ( svm_paging_enabled(v) ||
244 !svm_cr4_pae_is_set(v) )
245 {
246 gdprintk(XENLOG_WARNING, "Trying to set LME bit when "
247 "in paging mode or PAE bit is not set\n");
248 goto gp_fault;
249 }
251 vmcb->efer &= ~(EFER_LME | EFER_LMA);
252 }
253 else if ( !(msr_content & EFER_LME) && svm_lme_is_set(v) )
254 {
255 /* EFER.LME transition from 1 to 0 */
257 if ( svm_paging_enabled(v) )
258 {
259 gdprintk(XENLOG_WARNING,
260 "Trying to clear EFER.LME while paging enabled\n");
261 goto gp_fault;
262 }
264 vmcb->efer &= ~(EFER_LME | EFER_LMA);
265 }
267 #endif /* __x86_64__ */
269 /* update the guest EFER's shadow with the intended value */
270 v->arch.hvm_svm.cpu_shadow_efer = msr_content;
272 break;
274 #ifdef __x86_64__
275 case MSR_FS_BASE:
276 case MSR_GS_BASE:
277 case MSR_SHADOW_GS_BASE:
278 if ( !svm_long_mode_enabled(v) )
279 goto gp_fault;
281 if ( !is_canonical_address(msr_content) )
282 goto uncanonical_address;
284 if ( ecx == MSR_FS_BASE )
285 vmcb->fs.base = msr_content;
286 else if ( ecx == MSR_GS_BASE )
287 vmcb->gs.base = msr_content;
288 else
289 vmcb->kerngsbase = msr_content;
290 break;
291 #endif
293 case MSR_STAR:
294 vmcb->star = msr_content;
295 break;
297 case MSR_LSTAR:
298 case MSR_CSTAR:
299 if ( !is_canonical_address(msr_content) )
300 goto uncanonical_address;
302 if ( ecx == MSR_LSTAR )
303 vmcb->lstar = msr_content;
304 else
305 vmcb->cstar = msr_content;
306 break;
308 case MSR_SYSCALL_MASK:
309 vmcb->sfmask = msr_content;
310 break;
312 default:
313 return 0;
314 }
316 return 1;
318 uncanonical_address:
319 HVM_DBG_LOG(DBG_LEVEL_1, "Non-canonical address in MSR write %x\n", ecx);
320 gp_fault:
321 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
322 return 0;
323 }
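/*
 * Illustrative sketch of the EFER write rules enforced above, restated as
 * a standalone predicate (the helper name and parameters are assumptions,
 * not part of this file): reserved bits must be clear, EFER.LME may only
 * change while guest paging is off, and setting LME also requires CR4.PAE.
 */
static inline int example_efer_write_ok(u64 new_efer, u64 old_efer,
                                        int paging_enabled, int cr4_pae)
{
    if ( new_efer & ~(EFER_LME | EFER_LMA | EFER_NX | EFER_SCE) )
        return 0;                               /* reserved bit set      */
    if ( ((new_efer ^ old_efer) & EFER_LME) && paging_enabled )
        return 0;                               /* LME toggled under PG  */
    if ( (new_efer & EFER_LME) && !(old_efer & EFER_LME) && !cr4_pae )
        return 0;                               /* LME 0->1 needs PAE    */
    return 1;
}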
326 #define loaddebug(_v,_reg) \
327 __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
328 #define savedebug(_v,_reg) \
329 __asm__ __volatile__ ("mov %%db" #_reg ",%0" : "=r" ((_v)->debugreg[_reg]))
331 static inline void svm_save_dr(struct vcpu *v)
332 {
333 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
335 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
336 return;
338 /* Clear the DR dirty flag and re-enable intercepts for DR accesses. */
339 v->arch.hvm_vcpu.flag_dr_dirty = 0;
340 v->arch.hvm_svm.vmcb->dr_intercepts = DR_INTERCEPT_ALL_WRITES;
342 savedebug(&v->arch.guest_context, 0);
343 savedebug(&v->arch.guest_context, 1);
344 savedebug(&v->arch.guest_context, 2);
345 savedebug(&v->arch.guest_context, 3);
346 v->arch.guest_context.debugreg[6] = vmcb->dr6;
347 v->arch.guest_context.debugreg[7] = vmcb->dr7;
348 }
351 static inline void __restore_debug_registers(struct vcpu *v)
352 {
353 loaddebug(&v->arch.guest_context, 0);
354 loaddebug(&v->arch.guest_context, 1);
355 loaddebug(&v->arch.guest_context, 2);
356 loaddebug(&v->arch.guest_context, 3);
357 /* DR6 and DR7 are loaded from the VMCB. */
358 }
361 int svm_vmcb_save(struct vcpu *v, struct hvm_hw_cpu *c)
362 {
363 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
365 c->eip = vmcb->rip;
367 #ifdef HVM_DEBUG_SUSPEND
368 printk("%s: eip=0x%"PRIx64".\n",
369 __func__,
370 inst_len, c->eip);
371 #endif
373 c->esp = vmcb->rsp;
374 c->eflags = vmcb->rflags;
376 c->cr0 = v->arch.hvm_svm.cpu_shadow_cr0;
377 c->cr3 = v->arch.hvm_svm.cpu_cr3;
378 c->cr4 = v->arch.hvm_svm.cpu_shadow_cr4;
380 #ifdef HVM_DEBUG_SUSPEND
381 printk("%s: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
382 __func__,
383 c->cr3,
384 c->cr0,
385 c->cr4);
386 #endif
388 c->idtr_limit = vmcb->idtr.limit;
389 c->idtr_base = vmcb->idtr.base;
391 c->gdtr_limit = vmcb->gdtr.limit;
392 c->gdtr_base = vmcb->gdtr.base;
394 c->cs_sel = vmcb->cs.sel;
395 c->cs_limit = vmcb->cs.limit;
396 c->cs_base = vmcb->cs.base;
397 c->cs_arbytes = vmcb->cs.attr.bytes;
399 c->ds_sel = vmcb->ds.sel;
400 c->ds_limit = vmcb->ds.limit;
401 c->ds_base = vmcb->ds.base;
402 c->ds_arbytes = vmcb->ds.attr.bytes;
404 c->es_sel = vmcb->es.sel;
405 c->es_limit = vmcb->es.limit;
406 c->es_base = vmcb->es.base;
407 c->es_arbytes = vmcb->es.attr.bytes;
409 c->ss_sel = vmcb->ss.sel;
410 c->ss_limit = vmcb->ss.limit;
411 c->ss_base = vmcb->ss.base;
412 c->ss_arbytes = vmcb->ss.attr.bytes;
414 c->fs_sel = vmcb->fs.sel;
415 c->fs_limit = vmcb->fs.limit;
416 c->fs_base = vmcb->fs.base;
417 c->fs_arbytes = vmcb->fs.attr.bytes;
419 c->gs_sel = vmcb->gs.sel;
420 c->gs_limit = vmcb->gs.limit;
421 c->gs_base = vmcb->gs.base;
422 c->gs_arbytes = vmcb->gs.attr.bytes;
424 c->tr_sel = vmcb->tr.sel;
425 c->tr_limit = vmcb->tr.limit;
426 c->tr_base = vmcb->tr.base;
427 c->tr_arbytes = vmcb->tr.attr.bytes;
429 c->ldtr_sel = vmcb->ldtr.sel;
430 c->ldtr_limit = vmcb->ldtr.limit;
431 c->ldtr_base = vmcb->ldtr.base;
432 c->ldtr_arbytes = vmcb->ldtr.attr.bytes;
434 c->sysenter_cs = vmcb->sysenter_cs;
435 c->sysenter_esp = vmcb->sysenter_esp;
436 c->sysenter_eip = vmcb->sysenter_eip;
438 return 1;
439 }
442 int svm_vmcb_restore(struct vcpu *v, struct hvm_hw_cpu *c)
443 {
444 unsigned long mfn, old_base_mfn;
445 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
447 vmcb->rip = c->eip;
448 vmcb->rsp = c->esp;
449 vmcb->rflags = c->eflags;
451 v->arch.hvm_svm.cpu_shadow_cr0 = c->cr0;
452 vmcb->cr0 = c->cr0 | X86_CR0_WP | X86_CR0_ET;
453 if ( !paging_mode_hap(v->domain) )
454 vmcb->cr0 |= X86_CR0_PG;
456 #ifdef HVM_DEBUG_SUSPEND
457 printk("%s: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
458 __func__,
459 c->cr3,
460 c->cr0,
461 c->cr4);
462 #endif
464 if ( !svm_paging_enabled(v) )
465 {
466 printk("%s: paging not enabled.", __func__);
467 goto skip_cr3;
468 }
470 if ( c->cr3 == v->arch.hvm_svm.cpu_cr3 )
471 {
472 /*
473 * This is simple TLB flush, implying the guest has
474 * removed some translation or changed page attributes.
475 * We simply invalidate the shadow.
476 */
477 mfn = gmfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT);
478 if ( mfn != pagetable_get_pfn(v->arch.guest_table) )
479 goto bad_cr3;
480 }
481 else
482 {
483 /*
484 * If different, make a shadow. Check if the PDBR is valid
485 * first.
486 */
487 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %"PRIx64"", c->cr3);
488 mfn = gmfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT);
489 if( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
490 goto bad_cr3;
492 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
493 v->arch.guest_table = pagetable_from_pfn(mfn);
494 if (old_base_mfn)
495 put_page(mfn_to_page(old_base_mfn));
496 v->arch.hvm_svm.cpu_cr3 = c->cr3;
497 }
499 skip_cr3:
500 vmcb->cr4 = c->cr4 | SVM_CR4_HOST_MASK;
501 v->arch.hvm_svm.cpu_shadow_cr4 = c->cr4;
503 vmcb->idtr.limit = c->idtr_limit;
504 vmcb->idtr.base = c->idtr_base;
506 vmcb->gdtr.limit = c->gdtr_limit;
507 vmcb->gdtr.base = c->gdtr_base;
509 vmcb->cs.sel = c->cs_sel;
510 vmcb->cs.limit = c->cs_limit;
511 vmcb->cs.base = c->cs_base;
512 vmcb->cs.attr.bytes = c->cs_arbytes;
514 vmcb->ds.sel = c->ds_sel;
515 vmcb->ds.limit = c->ds_limit;
516 vmcb->ds.base = c->ds_base;
517 vmcb->ds.attr.bytes = c->ds_arbytes;
519 vmcb->es.sel = c->es_sel;
520 vmcb->es.limit = c->es_limit;
521 vmcb->es.base = c->es_base;
522 vmcb->es.attr.bytes = c->es_arbytes;
524 vmcb->ss.sel = c->ss_sel;
525 vmcb->ss.limit = c->ss_limit;
526 vmcb->ss.base = c->ss_base;
527 vmcb->ss.attr.bytes = c->ss_arbytes;
529 vmcb->fs.sel = c->fs_sel;
530 vmcb->fs.limit = c->fs_limit;
531 vmcb->fs.base = c->fs_base;
532 vmcb->fs.attr.bytes = c->fs_arbytes;
534 vmcb->gs.sel = c->gs_sel;
535 vmcb->gs.limit = c->gs_limit;
536 vmcb->gs.base = c->gs_base;
537 vmcb->gs.attr.bytes = c->gs_arbytes;
539 vmcb->tr.sel = c->tr_sel;
540 vmcb->tr.limit = c->tr_limit;
541 vmcb->tr.base = c->tr_base;
542 vmcb->tr.attr.bytes = c->tr_arbytes;
544 vmcb->ldtr.sel = c->ldtr_sel;
545 vmcb->ldtr.limit = c->ldtr_limit;
546 vmcb->ldtr.base = c->ldtr_base;
547 vmcb->ldtr.attr.bytes = c->ldtr_arbytes;
549 vmcb->sysenter_cs = c->sysenter_cs;
550 vmcb->sysenter_esp = c->sysenter_esp;
551 vmcb->sysenter_eip = c->sysenter_eip;
553 paging_update_paging_modes(v);
554 return 0;
556 bad_cr3:
557 gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%"PRIx64"", c->cr3);
558 return -EINVAL;
559 }
562 void svm_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
563 {
564 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
566 data->shadow_gs = vmcb->kerngsbase;
567 data->msr_lstar = vmcb->lstar;
568 data->msr_star = vmcb->star;
569 data->msr_cstar = vmcb->cstar;
570 data->msr_syscall_mask = vmcb->sfmask;
571 data->msr_efer = v->arch.hvm_svm.cpu_shadow_efer;
573 data->tsc = hvm_get_guest_time(v);
574 }
577 void svm_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
578 {
579 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
581 vmcb->kerngsbase = data->shadow_gs;
582 vmcb->lstar = data->msr_lstar;
583 vmcb->star = data->msr_star;
584 vmcb->cstar = data->msr_cstar;
585 vmcb->sfmask = data->msr_syscall_mask;
586 v->arch.hvm_svm.cpu_shadow_efer = data->msr_efer;
587 vmcb->efer = data->msr_efer | EFER_SVME;
588 /* VMCB's EFER.LME isn't set unless we're actually in long mode
589 * (see long_mode_do_msr_write()) */
590 if ( !(vmcb->efer & EFER_LMA) )
591 vmcb->efer &= ~EFER_LME;
593 hvm_set_guest_time(v, data->tsc);
594 }
596 void svm_save_vmcb_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
597 {
598 svm_save_cpu_state(v, ctxt);
599 svm_vmcb_save(v, ctxt);
600 }
602 int svm_load_vmcb_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
603 {
604 svm_load_cpu_state(v, ctxt);
605 if (svm_vmcb_restore(v, ctxt)) {
606 printk("svm_vmcb restore failed!\n");
607 domain_crash(v->domain);
608 return -EINVAL;
609 }
611 return 0;
612 }
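/*
 * Sketch of how the save/load pair above round-trips vcpu state through a
 * struct hvm_hw_cpu (illustrative only; the real callers are the common
 * HVM save/restore handlers, and the vcpus involved would not be running):
 */
static int example_copy_vcpu_state(struct vcpu *src, struct vcpu *dst)
{
    struct hvm_hw_cpu ctxt;

    svm_save_vmcb_ctxt(src, &ctxt);          /* MSR state + VMCB fields */
    return svm_load_vmcb_ctxt(dst, &ctxt);   /* 0 on success, -EINVAL   */
}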
615 static inline void svm_restore_dr(struct vcpu *v)
616 {
617 if ( unlikely(v->arch.guest_context.debugreg[7] & 0xFF) )
618 __restore_debug_registers(v);
619 }
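/*
 * Sketch of the lazy debug-register protocol used above (illustrative; in
 * this file the calls are made from svm_ctxt_switch_from()/_to()): DR
 * accesses stay intercepted until the guest first touches a debug register
 * (svm_dr_access(), further down, clears the intercepts and sets
 * flag_dr_dirty); from then on the hardware DRs carry guest values, so
 * they are saved on deschedule and reloaded on reschedule.
 */
static inline void example_dr_context_switch(struct vcpu *prev,
                                             struct vcpu *next)
{
    svm_save_dr(prev);      /* no-op unless prev dirtied its DRs        */
    svm_restore_dr(next);   /* reload only if next has DR7 breakpoints  */
}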
622 static int svm_realmode(struct vcpu *v)
623 {
624 unsigned long cr0 = v->arch.hvm_svm.cpu_shadow_cr0;
625 unsigned long eflags = v->arch.hvm_svm.vmcb->rflags;
627 return (eflags & X86_EFLAGS_VM) || !(cr0 & X86_CR0_PE);
628 }
630 static int svm_guest_x86_mode(struct vcpu *v)
631 {
632 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
634 if ( svm_long_mode_enabled(v) && vmcb->cs.attr.fields.l )
635 return 8;
637 if ( svm_realmode(v) )
638 return 2;
640 return (vmcb->cs.attr.fields.db ? 4 : 2);
641 }
643 void svm_update_host_cr3(struct vcpu *v)
644 {
645 /* SVM doesn't have a HOST_CR3 equivalent to update. */
646 }
648 void svm_update_guest_cr3(struct vcpu *v)
649 {
650 v->arch.hvm_svm.vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3;
651 }
653 static void svm_update_vtpr(struct vcpu *v, unsigned long value)
654 {
655 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
657 vmcb->vintr.fields.tpr = value & 0x0f;
658 }
660 unsigned long svm_get_ctrl_reg(struct vcpu *v, unsigned int num)
661 {
662 switch ( num )
663 {
664 case 0:
665 return v->arch.hvm_svm.cpu_shadow_cr0;
666 case 2:
667 return v->arch.hvm_svm.cpu_cr2;
668 case 3:
669 return v->arch.hvm_svm.cpu_cr3;
670 case 4:
671 return v->arch.hvm_svm.cpu_shadow_cr4;
672 default:
673 BUG();
674 }
675 return 0; /* dummy */
676 }
678 static unsigned long svm_get_segment_base(struct vcpu *v, enum x86_segment seg)
679 {
680 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
681 int long_mode = 0;
683 #ifdef __x86_64__
684 long_mode = vmcb->cs.attr.fields.l && svm_long_mode_enabled(v);
685 #endif
686 switch ( seg )
687 {
688 case x86_seg_cs: return long_mode ? 0 : vmcb->cs.base;
689 case x86_seg_ds: return long_mode ? 0 : vmcb->ds.base;
690 case x86_seg_es: return long_mode ? 0 : vmcb->es.base;
691 case x86_seg_fs: return vmcb->fs.base;
692 case x86_seg_gs: return vmcb->gs.base;
693 case x86_seg_ss: return long_mode ? 0 : vmcb->ss.base;
694 case x86_seg_tr: return vmcb->tr.base;
695 case x86_seg_gdtr: return vmcb->gdtr.base;
696 case x86_seg_idtr: return vmcb->idtr.base;
697 case x86_seg_ldtr: return vmcb->ldtr.base;
698 }
699 BUG();
700 return 0;
701 }
703 static void svm_get_segment_register(struct vcpu *v, enum x86_segment seg,
704 struct segment_register *reg)
705 {
706 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
707 switch ( seg )
708 {
709 case x86_seg_cs: memcpy(reg, &vmcb->cs, sizeof(*reg)); break;
710 case x86_seg_ds: memcpy(reg, &vmcb->ds, sizeof(*reg)); break;
711 case x86_seg_es: memcpy(reg, &vmcb->es, sizeof(*reg)); break;
712 case x86_seg_fs: memcpy(reg, &vmcb->fs, sizeof(*reg)); break;
713 case x86_seg_gs: memcpy(reg, &vmcb->gs, sizeof(*reg)); break;
714 case x86_seg_ss: memcpy(reg, &vmcb->ss, sizeof(*reg)); break;
715 case x86_seg_tr: memcpy(reg, &vmcb->tr, sizeof(*reg)); break;
716 case x86_seg_gdtr: memcpy(reg, &vmcb->gdtr, sizeof(*reg)); break;
717 case x86_seg_idtr: memcpy(reg, &vmcb->idtr, sizeof(*reg)); break;
718 case x86_seg_ldtr: memcpy(reg, &vmcb->ldtr, sizeof(*reg)); break;
719 default: BUG();
720 }
721 }
723 /* Make sure that xen intercepts any FP accesses from current */
724 static void svm_stts(struct vcpu *v)
725 {
726 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
728 /*
729 * If the guest does not have TS enabled then we must cause and handle an
730 * exception on first use of the FPU. If the guest *does* have TS enabled
731 * then this is not necessary: no FPU activity can occur until the guest
732 * clears CR0.TS, and we will initialise the FPU when that happens.
733 */
734 if ( !(v->arch.hvm_svm.cpu_shadow_cr0 & X86_CR0_TS) )
735 {
736 v->arch.hvm_svm.vmcb->exception_intercepts |= 1U << TRAP_no_device;
737 vmcb->cr0 |= X86_CR0_TS;
738 }
739 }
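/*
 * The comment above is the whole lazy-FPU story in one predicate
 * (illustrative restatement; the vmexit-side counterpart is
 * svm_do_no_device_fault(), further down this file): Xen only needs the
 * #NM intercept when the guest itself runs with CR0.TS clear.
 */
static inline int example_need_nm_intercept(struct vcpu *v)
{
    return !(v->arch.hvm_svm.cpu_shadow_cr0 & X86_CR0_TS);
}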
742 static void svm_set_tsc_offset(struct vcpu *v, u64 offset)
743 {
744 v->arch.hvm_svm.vmcb->tsc_offset = offset;
745 }
748 static void svm_init_ap_context(
749 struct vcpu_guest_context *ctxt, int vcpuid, int trampoline_vector)
750 {
751 struct vcpu *v;
752 struct vmcb_struct *vmcb;
753 cpu_user_regs_t *regs;
754 u16 cs_sel;
756 /* We know this is safe because hvm_bringup_ap() does it */
757 v = current->domain->vcpu[vcpuid];
758 vmcb = v->arch.hvm_svm.vmcb;
759 regs = &v->arch.guest_context.user_regs;
761 memset(ctxt, 0, sizeof(*ctxt));
763 /*
764 * We execute the trampoline code in real mode. The trampoline vector
765 * passed to us is page aligned and is the physical frame number for
766 * the code. We will execute this code in real mode.
767 */
768 cs_sel = trampoline_vector << 8;
769 ctxt->user_regs.eip = 0x0;
770 ctxt->user_regs.cs = cs_sel;
772 /*
773 * This is the launch of an AP; set state so that we begin executing
774 * the trampoline code in real-mode.
775 */
776 svm_reset_to_realmode(v, regs);
777 /* Adjust the vmcb's hidden register state. */
778 vmcb->rip = 0;
779 vmcb->cs.sel = cs_sel;
780 vmcb->cs.base = (cs_sel << 4);
781 }
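/*
 * Worked example of the real-mode CS arithmetic above (illustrative): a
 * trampoline_vector of 0x9f means the trampoline occupies page frame 0x9f,
 * so cs_sel = 0x9f << 8 = 0x9f00 and the hidden base becomes
 * 0x9f00 << 4 = 0x9f000 = 0x9f * PAGE_SIZE; with rip = 0 the AP therefore
 * starts executing at the first byte of that page.
 */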
783 static void svm_init_hypercall_page(struct domain *d, void *hypercall_page)
784 {
785 char *p;
786 int i;
788 memset(hypercall_page, 0, PAGE_SIZE);
790 for ( i = 0; i < (PAGE_SIZE / 32); i++ )
791 {
792 p = (char *)(hypercall_page + (i * 32));
793 *(u8 *)(p + 0) = 0xb8; /* mov imm32, %eax */
794 *(u32 *)(p + 1) = i;
795 *(u8 *)(p + 5) = 0x0f; /* vmmcall */
796 *(u8 *)(p + 6) = 0x01;
797 *(u8 *)(p + 7) = 0xd9;
798 *(u8 *)(p + 8) = 0xc3; /* ret */
799 }
801 /* Don't support HYPERVISOR_iret at the moment */
802 *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
803 }
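/*
 * What each 32-byte stub written above disassembles to, using the byte
 * values from the loop body (offsets are within one stub; NN is the
 * hypercall number, shown here for NN < 256):
 *
 *   0:  b8 NN 00 00 00    mov  $NN, %eax
 *   5:  0f 01 d9          vmmcall
 *   8:  c3                ret
 *
 * A guest invokes hypercall NN by calling offset NN*32 into this page;
 * the resulting VMMCALL vmexit finds the hypercall number in %eax. The
 * __HYPERVISOR_iret slot is instead filled with ud2 (0f 0b) above.
 */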
806 int svm_dbg_on = 0;
808 static inline int svm_do_debugout(unsigned long exit_code)
809 {
810 int i;
812 static unsigned long counter = 0;
813 static unsigned long works[] =
814 {
815 VMEXIT_IOIO,
816 VMEXIT_HLT,
817 VMEXIT_CPUID,
818 VMEXIT_DR0_READ,
819 VMEXIT_DR1_READ,
820 VMEXIT_DR2_READ,
821 VMEXIT_DR3_READ,
822 VMEXIT_DR6_READ,
823 VMEXIT_DR7_READ,
824 VMEXIT_DR0_WRITE,
825 VMEXIT_DR1_WRITE,
826 VMEXIT_DR2_WRITE,
827 VMEXIT_DR3_WRITE,
828 VMEXIT_CR0_READ,
829 VMEXIT_CR0_WRITE,
830 VMEXIT_CR3_READ,
831 VMEXIT_CR4_READ,
832 VMEXIT_MSR,
833 VMEXIT_CR0_WRITE,
834 VMEXIT_CR3_WRITE,
835 VMEXIT_CR4_WRITE,
836 VMEXIT_EXCEPTION_PF,
837 VMEXIT_INTR,
838 VMEXIT_INVLPG,
839 VMEXIT_EXCEPTION_NM
840 };
843 #if 0
844 if (svm_dbg_on && exit_code != 0x7B)
845 return 1;
846 #endif
848 counter++;
850 #if 0
851 if ((exit_code == 0x4E
852 || exit_code == VMEXIT_CR0_READ
853 || exit_code == VMEXIT_CR0_WRITE)
854 && counter < 200000)
855 return 0;
857 if ((exit_code == 0x4E) && counter < 500000)
858 return 0;
859 #endif
861 for (i = 0; i < sizeof(works) / sizeof(works[0]); i++)
862 if (exit_code == works[i])
863 return 0;
865 return 1;
866 }
868 static void save_svm_cpu_user_regs(struct vcpu *v, struct cpu_user_regs *ctxt)
869 {
870 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
872 ctxt->eax = vmcb->rax;
873 ctxt->ss = vmcb->ss.sel;
874 ctxt->esp = vmcb->rsp;
875 ctxt->eflags = vmcb->rflags;
876 ctxt->cs = vmcb->cs.sel;
877 ctxt->eip = vmcb->rip;
879 ctxt->gs = vmcb->gs.sel;
880 ctxt->fs = vmcb->fs.sel;
881 ctxt->es = vmcb->es.sel;
882 ctxt->ds = vmcb->ds.sel;
883 }
885 static void svm_load_cpu_guest_regs(struct vcpu *v, struct cpu_user_regs *regs)
886 {
887 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
889 vmcb->rax = regs->eax;
890 vmcb->ss.sel = regs->ss;
891 vmcb->rsp = regs->esp;
892 vmcb->rflags = regs->eflags | 2UL;
893 vmcb->cs.sel = regs->cs;
894 vmcb->rip = regs->eip;
895 }
897 static void svm_ctxt_switch_from(struct vcpu *v)
898 {
899 svm_save_dr(v);
900 }
902 static void svm_ctxt_switch_to(struct vcpu *v)
903 {
904 #ifdef __x86_64__
905 /*
906 * This is required, because VMRUN does consistency check
907 * and some of the DOM0 selectors are pointing to
908 * invalid GDT locations, and cause AMD processors
909 * to shutdown.
910 */
911 set_segment_register(ds, 0);
912 set_segment_register(es, 0);
913 set_segment_register(ss, 0);
914 #endif
915 svm_restore_dr(v);
916 }
918 static void svm_do_resume(struct vcpu *v)
919 {
920 bool_t debug_state = v->domain->debugger_attached;
922 if ( unlikely(v->arch.hvm_vcpu.debug_state_latch != debug_state) )
923 {
924 uint32_t mask = (1U << TRAP_debug) | (1U << TRAP_int3);
925 v->arch.hvm_vcpu.debug_state_latch = debug_state;
926 if ( debug_state )
927 v->arch.hvm_svm.vmcb->exception_intercepts |= mask;
928 else
929 v->arch.hvm_svm.vmcb->exception_intercepts &= ~mask;
930 }
932 if ( v->arch.hvm_svm.launch_core != smp_processor_id() )
933 {
934 v->arch.hvm_svm.launch_core = smp_processor_id();
935 hvm_migrate_timers(v);
936 }
938 hvm_do_resume(v);
939 reset_stack_and_jump(svm_asm_do_resume);
940 }
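/*
 * exception_intercepts is a bitmap indexed by vector number, so the mask
 * above covers exactly #DB (vector 1) and #BP (vector 3), which is the
 * point of this changeset. Illustrative helper showing the same toggle
 * for an arbitrary vector (the helper name is an assumption):
 */
static inline void example_toggle_exception_intercept(struct vcpu *v,
                                                      unsigned int vector,
                                                      int enable)
{
    if ( enable )
        v->arch.hvm_svm.vmcb->exception_intercepts |= (1U << vector);
    else
        v->arch.hvm_svm.vmcb->exception_intercepts &= ~(1U << vector);
}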
942 static int svm_vcpu_initialise(struct vcpu *v)
943 {
944 int rc;
946 v->arch.schedule_tail = svm_do_resume;
947 v->arch.ctxt_switch_from = svm_ctxt_switch_from;
948 v->arch.ctxt_switch_to = svm_ctxt_switch_to;
950 v->arch.hvm_svm.launch_core = -1;
952 if ( (rc = svm_create_vmcb(v)) != 0 )
953 {
954 dprintk(XENLOG_WARNING,
955 "Failed to create VMCB for vcpu %d: err=%d.\n",
956 v->vcpu_id, rc);
957 return rc;
958 }
960 return 0;
961 }
963 static void svm_vcpu_destroy(struct vcpu *v)
964 {
965 svm_destroy_vmcb(v);
966 }
968 static void svm_hvm_inject_exception(
969 unsigned int trapnr, int errcode, unsigned long cr2)
970 {
971 struct vcpu *v = current;
972 if ( trapnr == TRAP_page_fault )
973 v->arch.hvm_svm.vmcb->cr2 = v->arch.hvm_svm.cpu_cr2 = cr2;
974 svm_inject_exception(v, trapnr, (errcode != -1), errcode);
975 }
977 static int svm_event_injection_faulted(struct vcpu *v)
978 {
979 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
980 return vmcb->exitintinfo.fields.v;
981 }
983 static struct hvm_function_table svm_function_table = {
984 .disable = stop_svm,
985 .vcpu_initialise = svm_vcpu_initialise,
986 .vcpu_destroy = svm_vcpu_destroy,
987 .store_cpu_guest_regs = svm_store_cpu_guest_regs,
988 .load_cpu_guest_regs = svm_load_cpu_guest_regs,
989 .save_cpu_ctxt = svm_save_vmcb_ctxt,
990 .load_cpu_ctxt = svm_load_vmcb_ctxt,
991 .paging_enabled = svm_paging_enabled,
992 .long_mode_enabled = svm_long_mode_enabled,
993 .pae_enabled = svm_pae_enabled,
994 .guest_x86_mode = svm_guest_x86_mode,
995 .get_guest_ctrl_reg = svm_get_ctrl_reg,
996 .get_segment_base = svm_get_segment_base,
997 .get_segment_register = svm_get_segment_register,
998 .update_host_cr3 = svm_update_host_cr3,
999 .update_guest_cr3 = svm_update_guest_cr3,
1000 .update_vtpr = svm_update_vtpr,
1001 .stts = svm_stts,
1002 .set_tsc_offset = svm_set_tsc_offset,
1003 .inject_exception = svm_hvm_inject_exception,
1004 .init_ap_context = svm_init_ap_context,
1005 .init_hypercall_page = svm_init_hypercall_page,
1006 .event_injection_faulted = svm_event_injection_faulted
1007 };
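/*
 * Sketch of how the table above is consumed (illustrative): generic HVM
 * code calls through the function table, normally via the global copy
 * installed when start_svm() below passes it to hvm_enable(), so the same
 * call sites also work on VMX. Calling through the static table directly,
 * as here, is only for illustration.
 */
static inline void example_table_dispatch(void)
{
    svm_function_table.inject_exception(TRAP_gp_fault, 0, 0);
}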
1009 void svm_npt_detect(void)
1010 {
1011 u32 eax, ebx, ecx, edx;
1013 /* check CPUID for nested paging support */
1014 cpuid(0x8000000A, &eax, &ebx, &ecx, &edx);
1015 if ( edx & 0x01 ) /* nested paging */
1016 {
1017 hap_capable_system = 1;
1018 }
1019 else if ( opt_hap_enabled )
1020 {
1021 printk(" nested paging is not supported by this CPU.\n");
1022 hap_capable_system = 0; /* no nested paging, we disable flag. */
1023 }
1024 }
1026 int start_svm(void)
1027 {
1028 u32 eax, ecx, edx;
1029 u32 phys_hsa_lo, phys_hsa_hi;
1030 u64 phys_hsa;
1031 int cpu = smp_processor_id();
1033 /* Xen does not fill x86_capability words except 0. */
1034 ecx = cpuid_ecx(0x80000001);
1035 boot_cpu_data.x86_capability[5] = ecx;
1037 if (!(test_bit(X86_FEATURE_SVME, &boot_cpu_data.x86_capability)))
1038 return 0;
1040 /* check whether SVM feature is disabled in BIOS */
1041 rdmsr(MSR_K8_VM_CR, eax, edx);
1042 if ( eax & K8_VMCR_SVME_DISABLE )
1043 {
1044 printk("AMD SVM Extension is disabled in BIOS.\n");
1045 return 0;
1046 }
1048 if (!hsa[cpu])
1049 if (!(hsa[cpu] = alloc_host_save_area()))
1050 return 0;
1052 rdmsr(MSR_EFER, eax, edx);
1053 eax |= EFER_SVME;
1054 wrmsr(MSR_EFER, eax, edx);
1055 printk("AMD SVM Extension is enabled for cpu %d.\n", cpu );
1057 svm_npt_detect();
1059 /* Initialize the HSA for this core */
1060 phys_hsa = (u64) virt_to_maddr(hsa[cpu]);
1061 phys_hsa_lo = (u32) phys_hsa;
1062 phys_hsa_hi = (u32) (phys_hsa >> 32);
1063 wrmsr(MSR_K8_VM_HSAVE_PA, phys_hsa_lo, phys_hsa_hi);
1065 if (!root_vmcb[cpu])
1066 if (!(root_vmcb[cpu] = alloc_vmcb()))
1067 return 0;
1068 root_vmcb_pa[cpu] = virt_to_maddr(root_vmcb[cpu]);
1070 if (cpu == 0)
1071 setup_vmcb_dump();
1073 hvm_enable(&svm_function_table);
1075 return 1;
1076 }
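/*
 * Worked example of the MSR_K8_VM_HSAVE_PA programming above
 * (illustrative): wrmsr takes the 64-bit value as a (lo, hi) pair, so a
 * host save area at physical address 0x123456000 is written as
 * phys_hsa_lo = 0x23456000 and phys_hsa_hi = 0x00000001.
 */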
1078 static int svm_do_nested_pgfault(paddr_t gpa, struct cpu_user_regs *regs)
1079 {
1080 if (mmio_space(gpa)) {
1081 handle_mmio(gpa);
1082 return 1;
1083 }
1085 /* We should not reach here. Otherwise, P2M table is not correct.*/
1086 return 0;
1087 }
1090 static int svm_do_page_fault(unsigned long va, struct cpu_user_regs *regs)
1091 {
1092 HVM_DBG_LOG(DBG_LEVEL_VMMU,
1093 "svm_do_page_fault = 0x%lx, eip = %lx, error_code = %lx",
1094 va, (unsigned long)current->arch.hvm_svm.vmcb->rip,
1095 (unsigned long)regs->error_code);
1096 return paging_fault(va, regs);
1097 }
1100 static void svm_do_no_device_fault(struct vmcb_struct *vmcb)
1101 {
1102 struct vcpu *v = current;
1104 setup_fpu(v);
1105 vmcb->exception_intercepts &= ~(1U << TRAP_no_device);
1107 if ( !(v->arch.hvm_svm.cpu_shadow_cr0 & X86_CR0_TS) )
1108 vmcb->cr0 &= ~X86_CR0_TS;
1109 }
1111 /* Reserved bits ECX: [31:14], [12:4], [2:1]*/
1112 #define SVM_VCPU_CPUID_L1_ECX_RESERVED 0xffffdff6
1113 /* Reserved bits EDX: [31:29], [27], [22:20], [18], [10] */
1114 #define SVM_VCPU_CPUID_L1_EDX_RESERVED 0xe8740400
1116 static void svm_vmexit_do_cpuid(struct vmcb_struct *vmcb,
1117 struct cpu_user_regs *regs)
1119 unsigned long input = regs->eax;
1120 unsigned int eax, ebx, ecx, edx;
1121 struct vcpu *v = current;
1122 int inst_len;
1124 ASSERT(vmcb);
1126 hvm_cpuid(input, &eax, &ebx, &ecx, &edx);
1128 if ( input == 0x00000001 )
1130 /* Clear out reserved bits. */
1131 ecx &= ~SVM_VCPU_CPUID_L1_ECX_RESERVED;
1132 edx &= ~SVM_VCPU_CPUID_L1_EDX_RESERVED;
1134 /* Guest should only see one logical processor.
1135 * See details on page 23 of AMD CPUID Specification.
1136 */
1137 clear_bit(X86_FEATURE_HT & 31, &edx); /* clear the hyperthread bit */
1138 ebx &= 0xFF00FFFF; /* clear the logical processor count when HTT=0 */
1139 ebx |= 0x00010000; /* set to 1 just for precaution */
1141 else if ( input == 0x80000001 )
1143 if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
1144 clear_bit(X86_FEATURE_APIC & 31, &edx);
1146 #if CONFIG_PAGING_LEVELS >= 3
1147 if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
1148 #endif
1149 clear_bit(X86_FEATURE_PAE & 31, &edx);
1151 clear_bit(X86_FEATURE_PSE36 & 31, &edx);
1153 /* Clear the Cmp_Legacy bit
1154 * This bit is supposed to be zero when HTT = 0.
1155 * See details on page 23 of AMD CPUID Specification.
1156 */
1157 clear_bit(X86_FEATURE_CMP_LEGACY & 31, &ecx);
1159 /* Make SVM feature invisible to the guest. */
1160 clear_bit(X86_FEATURE_SVME & 31, &ecx);
1162 /* So far, we do not support 3DNow for the guest. */
1163 clear_bit(X86_FEATURE_3DNOW & 31, &edx);
1164 clear_bit(X86_FEATURE_3DNOWEXT & 31, &edx);
1165 /* no FFXSR instructions feature. */
1166 clear_bit(X86_FEATURE_FFXSR & 31, &edx);
1168 else if ( input == 0x80000007 || input == 0x8000000A )
1170 /* Mask out features of power management and SVM extension. */
1171 eax = ebx = ecx = edx = 0;
1173 else if ( input == 0x80000008 )
1175 /* Make sure the number of CPU cores is 1 when HTT=0 */
1176 ecx &= 0xFFFFFF00;
1179 regs->eax = (unsigned long)eax;
1180 regs->ebx = (unsigned long)ebx;
1181 regs->ecx = (unsigned long)ecx;
1182 regs->edx = (unsigned long)edx;
1184 HVMTRACE_3D(CPUID, v, input,
1185 ((uint64_t)eax << 32) | ebx, ((uint64_t)ecx << 32) | edx);
1187 inst_len = __get_instruction_length(v, INSTR_CPUID, NULL);
1188 ASSERT(inst_len > 0);
1189 __update_guest_eip(vmcb, inst_len);
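/*
 * The "& 31" in the clear_bit() calls above maps Xen's linear feature
 * index (word * 32 + bit, from cpufeature.h) onto a bit position within
 * the single 32-bit CPUID register being edited. Illustrative equivalent
 * without clear_bit() (the helper name is an assumption):
 */
static inline void example_hide_cpuid_feature(unsigned int feature,
                                              unsigned int *reg)
{
    *reg &= ~(1U << (feature & 31));
}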
1192 static inline unsigned long *get_reg_p(
1193 unsigned int gpreg,
1194 struct cpu_user_regs *regs, struct vmcb_struct *vmcb)
1196 unsigned long *reg_p = NULL;
1197 switch (gpreg)
1199 case SVM_REG_EAX:
1200 reg_p = (unsigned long *)&regs->eax;
1201 break;
1202 case SVM_REG_EBX:
1203 reg_p = (unsigned long *)&regs->ebx;
1204 break;
1205 case SVM_REG_ECX:
1206 reg_p = (unsigned long *)&regs->ecx;
1207 break;
1208 case SVM_REG_EDX:
1209 reg_p = (unsigned long *)&regs->edx;
1210 break;
1211 case SVM_REG_EDI:
1212 reg_p = (unsigned long *)&regs->edi;
1213 break;
1214 case SVM_REG_ESI:
1215 reg_p = (unsigned long *)&regs->esi;
1216 break;
1217 case SVM_REG_EBP:
1218 reg_p = (unsigned long *)&regs->ebp;
1219 break;
1220 case SVM_REG_ESP:
1221 reg_p = (unsigned long *)&vmcb->rsp;
1222 break;
1223 #ifdef __x86_64__
1224 case SVM_REG_R8:
1225 reg_p = (unsigned long *)&regs->r8;
1226 break;
1227 case SVM_REG_R9:
1228 reg_p = (unsigned long *)&regs->r9;
1229 break;
1230 case SVM_REG_R10:
1231 reg_p = (unsigned long *)&regs->r10;
1232 break;
1233 case SVM_REG_R11:
1234 reg_p = (unsigned long *)&regs->r11;
1235 break;
1236 case SVM_REG_R12:
1237 reg_p = (unsigned long *)&regs->r12;
1238 break;
1239 case SVM_REG_R13:
1240 reg_p = (unsigned long *)&regs->r13;
1241 break;
1242 case SVM_REG_R14:
1243 reg_p = (unsigned long *)&regs->r14;
1244 break;
1245 case SVM_REG_R15:
1246 reg_p = (unsigned long *)&regs->r15;
1247 break;
1248 #endif
1249 default:
1250 BUG();
1253 return reg_p;
1257 static inline unsigned long get_reg(unsigned int gpreg,
1258 struct cpu_user_regs *regs, struct vmcb_struct *vmcb)
1259 {
1260 unsigned long *gp;
1261 gp = get_reg_p(gpreg, regs, vmcb);
1262 return *gp;
1263 }
1266 static inline void set_reg(unsigned int gpreg, unsigned long value,
1267 struct cpu_user_regs *regs, struct vmcb_struct *vmcb)
1268 {
1269 unsigned long *gp;
1270 gp = get_reg_p(gpreg, regs, vmcb);
1271 *gp = value;
1272 }
1275 static void svm_dr_access(struct vcpu *v, struct cpu_user_regs *regs)
1276 {
1277 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1279 HVMTRACE_0D(DR_WRITE, v);
1281 v->arch.hvm_vcpu.flag_dr_dirty = 1;
1283 __restore_debug_registers(v);
1285 /* allow the guest full access to the debug registers */
1286 vmcb->dr_intercepts = 0;
1287 }
1290 static void svm_get_prefix_info(struct vcpu *v, unsigned int dir,
1291 svm_segment_register_t **seg,
1292 unsigned int *asize)
1294 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1295 unsigned char inst[MAX_INST_LEN];
1296 int i;
1298 memset(inst, 0, MAX_INST_LEN);
1299 if (inst_copy_from_guest(inst, svm_rip2pointer(v), sizeof(inst))
1300 != MAX_INST_LEN)
1302 gdprintk(XENLOG_ERR, "get guest instruction failed\n");
1303 domain_crash(current->domain);
1304 return;
1307 for (i = 0; i < MAX_INST_LEN; i++)
1309 switch (inst[i])
1311 case 0xf3: /* REPZ */
1312 case 0xf2: /* REPNZ */
1313 case 0xf0: /* LOCK */
1314 case 0x66: /* data32 */
1315 #ifdef __x86_64__
1316 /* REX prefixes */
1317 case 0x40:
1318 case 0x41:
1319 case 0x42:
1320 case 0x43:
1321 case 0x44:
1322 case 0x45:
1323 case 0x46:
1324 case 0x47:
1326 case 0x48:
1327 case 0x49:
1328 case 0x4a:
1329 case 0x4b:
1330 case 0x4c:
1331 case 0x4d:
1332 case 0x4e:
1333 case 0x4f:
1334 #endif
1335 continue;
1336 case 0x67: /* addr32 */
1337 *asize ^= 48; /* Switch 16/32 bits */
1338 continue;
1339 case 0x2e: /* CS */
1340 *seg = &vmcb->cs;
1341 continue;
1342 case 0x36: /* SS */
1343 *seg = &vmcb->ss;
1344 continue;
1345 case 0x26: /* ES */
1346 *seg = &vmcb->es;
1347 continue;
1348 case 0x64: /* FS */
1349 *seg = &vmcb->fs;
1350 continue;
1351 case 0x65: /* GS */
1352 *seg = &vmcb->gs;
1353 continue;
1354 case 0x3e: /* DS */
1355 *seg = &vmcb->ds;
1356 continue;
1357 default:
1358 break;
1360 return;
1365 /* Get the address of INS/OUTS instruction */
1366 static inline int svm_get_io_address(
1367 struct vcpu *v, struct cpu_user_regs *regs,
1368 unsigned int size, ioio_info_t info,
1369 unsigned long *count, unsigned long *addr)
1371 unsigned long reg;
1372 unsigned int asize, isize;
1373 int long_mode = 0;
1374 svm_segment_register_t *seg = NULL;
1375 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1377 #ifdef __x86_64__
1378 /* If we're in long mode, we shouldn't check the segment presence & limit */
1379 long_mode = vmcb->cs.attr.fields.l && svm_long_mode_enabled(v);
1380 #endif
1382 /* d field of cs.attr is 1 for 32-bit, 0 for 16 or 64 bit.
1383 * l field combined with EFER_LMA says whether it's 16 or 64 bit.
1384 */
1385 asize = (long_mode)?64:((vmcb->cs.attr.fields.db)?32:16);
1388 /* The ins/outs instructions are single byte, so if we have got more
1389 * than one byte (+ maybe rep-prefix), we have some prefix so we need
1390 * to figure out what it is...
1391 */
1392 isize = vmcb->exitinfo2 - vmcb->rip;
1394 if (info.fields.rep)
1395 isize --;
1397 if (isize > 1)
1398 svm_get_prefix_info(v, info.fields.type, &seg, &asize);
1400 if (info.fields.type == IOREQ_WRITE)
1402 reg = regs->esi;
1403 if (!seg) /* If no prefix, use DS. */
1404 seg = &vmcb->ds;
1405 if (!long_mode && (seg->attr.fields.type & 0xa) == 0x8) {
1406 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1407 return 0;
1410 else
1412 reg = regs->edi;
1413 seg = &vmcb->es; /* Note: This is ALWAYS ES. */
1414 if (!long_mode && (seg->attr.fields.type & 0xa) != 0x2) {
1415 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1416 return 0;
1420 /* If the segment isn't present, give GP fault! */
1421 if (!long_mode && !seg->attr.fields.p)
1423 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1424 return 0;
1427 if (asize == 16)
1429 *addr = (reg & 0xFFFF);
1430 *count = regs->ecx & 0xffff;
1432 else
1434 *addr = reg;
1435 *count = regs->ecx;
1437 if (!info.fields.rep)
1438 *count = 1;
1440 if (!long_mode)
1442 ASSERT(*addr == (u32)*addr);
1443 if ((u32)(*addr + size - 1) < (u32)*addr ||
1444 (seg->attr.fields.type & 0xc) != 0x4 ?
1445 *addr + size - 1 > seg->limit :
1446 *addr <= seg->limit)
1448 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1449 return 0;
1452 /* Check the limit for repeated instructions, as above we checked only
1453 the first instance. Truncate the count if a limit violation would
1454 occur. Note that the checking is not necessary for page granular
1455 segments as transfers crossing page boundaries will be broken up
1456 anyway. */
1457 if (!seg->attr.fields.g && *count > 1)
1459 if ((seg->attr.fields.type & 0xc) != 0x4)
1461 /* expand-up */
1462 if (!(regs->eflags & EF_DF))
1464 if (*addr + *count * size - 1 < *addr ||
1465 *addr + *count * size - 1 > seg->limit)
1466 *count = (seg->limit + 1UL - *addr) / size;
1468 else
1470 if (*count - 1 > *addr / size)
1471 *count = *addr / size + 1;
1474 else
1476 /* expand-down */
1477 if (!(regs->eflags & EF_DF))
1479 if (*count - 1 > -(s32)*addr / size)
1480 *count = -(s32)*addr / size + 1UL;
1482 else
1484 if (*addr < (*count - 1) * size ||
1485 *addr - (*count - 1) * size <= seg->limit)
1486 *count = (*addr - seg->limit - 1) / size + 1;
1489 ASSERT(*count);
1492 *addr += seg->base;
1494 #ifdef __x86_64__
1495 else
1497 if (seg == &vmcb->fs || seg == &vmcb->gs)
1498 *addr += seg->base;
1500 if (!is_canonical_address(*addr) ||
1501 !is_canonical_address(*addr + size - 1))
1503 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1504 return 0;
1506 if (*count > (1UL << 48) / size)
1507 *count = (1UL << 48) / size;
1508 if (!(regs->eflags & EF_DF))
1510 if (*addr + *count * size - 1 < *addr ||
1511 !is_canonical_address(*addr + *count * size - 1))
1512 *count = (*addr & ~((1UL << 48) - 1)) / size;
1514 else
1516 if ((*count - 1) * size > *addr ||
1517 !is_canonical_address(*addr + (*count - 1) * size))
1518 *count = (*addr & ~((1UL << 48) - 1)) / size + 1;
1520 ASSERT(*count);
1522 #endif
1524 return 1;
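/*
 * Worked example of the count truncation above for an expand-up,
 * byte-granular (G=0) data segment with EFLAGS.DF clear (illustrative):
 * limit = 0xfff, *addr = 0xf00, size = 4, requested *count = 0x100.
 * The last byte would land at 0xf00 + 0x100*4 - 1 = 0x12ff, beyond the
 * limit, so the count is clipped to (0xfff + 1 - 0xf00) / 4 = 0x40
 * transfers before the I/O request is issued.
 */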
1528 static void svm_io_instruction(struct vcpu *v)
1530 struct cpu_user_regs *regs;
1531 struct hvm_io_op *pio_opp;
1532 unsigned int port;
1533 unsigned int size, dir, df;
1534 ioio_info_t info;
1535 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1537 ASSERT(vmcb);
1538 pio_opp = &current->arch.hvm_vcpu.io_op;
1539 pio_opp->instr = INSTR_PIO;
1540 pio_opp->flags = 0;
1542 regs = &pio_opp->io_context;
1544 /* Copy current guest state into io instruction state structure. */
1545 memcpy(regs, guest_cpu_user_regs(), HVM_CONTEXT_STACK_BYTES);
1546 hvm_store_cpu_guest_regs(v, regs, NULL);
1548 info.bytes = vmcb->exitinfo1;
1550 port = info.fields.port; /* port used to be addr */
1551 dir = info.fields.type; /* direction */
1552 df = regs->eflags & X86_EFLAGS_DF ? 1 : 0;
1554 if (info.fields.sz32)
1555 size = 4;
1556 else if (info.fields.sz16)
1557 size = 2;
1558 else
1559 size = 1;
1561 if (dir==IOREQ_READ)
1562 HVMTRACE_2D(IO_READ, v, port, size);
1563 else
1564 HVMTRACE_2D(IO_WRITE, v, port, size);
1566 HVM_DBG_LOG(DBG_LEVEL_IO,
1567 "svm_io_instruction: port 0x%x eip=%x:%"PRIx64", "
1568 "exit_qualification = %"PRIx64,
1569 port, vmcb->cs.sel, vmcb->rip, info.bytes);
1571 /* string instruction */
1572 if (info.fields.str)
1574 unsigned long addr, count;
1575 paddr_t paddr;
1576 unsigned long gfn;
1577 int sign = regs->eflags & X86_EFLAGS_DF ? -1 : 1;
1579 if (!svm_get_io_address(v, regs, size, info, &count, &addr))
1581 /* We failed to get a valid address, so don't do the IO operation -
1582 * it would just get worse if we do! Hopefully the guest is handing
1583 * gp-faults...
1584 */
1585 return;
1588 /* "rep" prefix */
1589 if (info.fields.rep)
1591 pio_opp->flags |= REPZ;
1594 /* Translate the address to a physical address */
1595 gfn = paging_gva_to_gfn(v, addr);
1596 if ( gfn == INVALID_GFN )
1598 /* The guest does not have the RAM address mapped.
1599 * Need to send in a page fault */
1600 int errcode = 0;
1601 /* IO read --> memory write */
1602 if ( dir == IOREQ_READ ) errcode |= PFEC_write_access;
1603 svm_hvm_inject_exception(TRAP_page_fault, errcode, addr);
1604 return;
1606 paddr = (paddr_t)gfn << PAGE_SHIFT | (addr & ~PAGE_MASK);
1608 /*
1609 * Handle string pio instructions that cross pages or that
1610 * are unaligned. See the comments in hvm_platform.c/handle_mmio()
1611 */
1612 if ((addr & PAGE_MASK) != ((addr + size - 1) & PAGE_MASK))
1614 unsigned long value = 0;
1616 pio_opp->flags |= OVERLAP;
1617 pio_opp->addr = addr;
1619 if (dir == IOREQ_WRITE) /* OUTS */
1621 if ( hvm_paging_enabled(current) )
1623 int rv = hvm_copy_from_guest_virt(&value, addr, size);
1624 if ( rv != 0 )
1626 /* Failed on the page-spanning copy. Inject PF into
1627 * the guest for the address where we failed. */
1628 addr += size - rv;
1629 gdprintk(XENLOG_DEBUG, "Pagefault reading non-io side "
1630 "of a page-spanning PIO: va=%#lx\n", addr);
1631 svm_hvm_inject_exception(TRAP_page_fault, 0, addr);
1632 return;
1635 else
1636 (void) hvm_copy_from_guest_phys(&value, addr, size);
1637 } else /* dir != IOREQ_WRITE */
1638 /* Remember where to write the result, as a *VA*.
1639 * Must be a VA so we can handle the page overlap
1640 * correctly in hvm_pio_assist() */
1641 pio_opp->addr = addr;
1643 if (count == 1)
1644 regs->eip = vmcb->exitinfo2;
1646 send_pio_req(port, 1, size, value, dir, df, 0);
1648 else
1650 unsigned long last_addr = sign > 0 ? addr + count * size - 1
1651 : addr - (count - 1) * size;
1653 if ((addr & PAGE_MASK) != (last_addr & PAGE_MASK))
1655 if (sign > 0)
1656 count = (PAGE_SIZE - (addr & ~PAGE_MASK)) / size;
1657 else
1658 count = (addr & ~PAGE_MASK) / size + 1;
1660 else
1661 regs->eip = vmcb->exitinfo2;
1663 send_pio_req(port, count, size, paddr, dir, df, 1);
1666 else
1668 /*
1669 * On SVM, the RIP of the instruction following the IN/OUT is saved in
1670 * ExitInfo2
1671 */
1672 regs->eip = vmcb->exitinfo2;
1674 if (port == 0xe9 && dir == IOREQ_WRITE && size == 1)
1675 hvm_print_line(v, regs->eax); /* guest debug output */
1677 send_pio_req(port, 1, size, regs->eax, dir, df, 0);
1681 static int npt_set_cr0(unsigned long value)
1683 struct vcpu *v = current;
1684 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1686 ASSERT(vmcb);
1688 /* ET is reserved and should always be 1. */
1689 value |= X86_CR0_ET;
1691 /* Check whether the guest is about to turn on long mode.
1692 * If it is, set EFER.LME and EFER.LMA. Update the shadow EFER.LMA
1693 * bit too, so svm_long_mode_enabled() will work.
1694 */
1695 if ( (value & X86_CR0_PG) && svm_lme_is_set(v) &&
1696 (vmcb->cr4 & X86_CR4_PAE) && (vmcb->cr0 & X86_CR0_PE) )
1698 v->arch.hvm_svm.cpu_shadow_efer |= EFER_LMA;
1699 vmcb->efer |= EFER_LMA | EFER_LME;
1702 /* Whenever CR0.PG is cleared under long mode, LMA will be cleared
1703 * immediately. We emulate this process for svm_long_mode_enabled().
1704 */
1705 if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
1707 if ( svm_long_mode_enabled(v) )
1709 v->arch.hvm_svm.cpu_shadow_efer &= ~EFER_LMA;
1713 vmcb->cr0 = value | X86_CR0_WP;
1714 v->arch.hvm_svm.cpu_shadow_cr0 = value;
1716 /* TS cleared? Then initialise FPU now. */
1717 if ( !(value & X86_CR0_TS) ) {
1718 setup_fpu(v);
1719 vmcb->exception_intercepts &= ~(1U << TRAP_no_device);
1722 paging_update_paging_modes(v);
1724 return 1;
1727 static int svm_set_cr0(unsigned long value)
1729 struct vcpu *v = current;
1730 unsigned long mfn;
1731 int paging_enabled;
1732 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1733 unsigned long old_base_mfn;
1735 ASSERT(vmcb);
1737 /* We don't want to lose PG. ET is reserved and should always be 1. */
1738 paging_enabled = svm_paging_enabled(v);
1739 value |= X86_CR0_ET;
1740 vmcb->cr0 = value | X86_CR0_PG | X86_CR0_WP;
1741 v->arch.hvm_svm.cpu_shadow_cr0 = value;
1743 /* TS cleared? Then initialise FPU now. */
1744 if ( !(value & X86_CR0_TS) )
1746 setup_fpu(v);
1747 vmcb->exception_intercepts &= ~(1U << TRAP_no_device);
1750 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx\n", value);
1752 if ( ((value & (X86_CR0_PE | X86_CR0_PG)) == (X86_CR0_PE | X86_CR0_PG))
1753 && !paging_enabled )
1755 /* The guest CR3 must be pointing to the guest physical. */
1756 mfn = get_mfn_from_gpfn(v->arch.hvm_svm.cpu_cr3 >> PAGE_SHIFT);
1757 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain))
1759 gdprintk(XENLOG_ERR, "Invalid CR3 value = %lx (mfn=%lx)\n",
1760 v->arch.hvm_svm.cpu_cr3, mfn);
1761 domain_crash(v->domain);
1762 return 0;
1765 #if defined(__x86_64__)
1766 if ( svm_lme_is_set(v) && !svm_cr4_pae_is_set(v) )
1768 HVM_DBG_LOG(DBG_LEVEL_1, "Enable paging before PAE enable\n");
1769 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1772 if ( svm_lme_is_set(v) )
1774 HVM_DBG_LOG(DBG_LEVEL_1, "Enable the Long mode\n");
1775 v->arch.hvm_svm.cpu_shadow_efer |= EFER_LMA;
1776 vmcb->efer |= EFER_LMA | EFER_LME;
1778 #endif /* __x86_64__ */
1780 /* Now arch.guest_table points to machine physical. */
1781 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1782 v->arch.guest_table = pagetable_from_pfn(mfn);
1783 if ( old_base_mfn )
1784 put_page(mfn_to_page(old_base_mfn));
1785 paging_update_paging_modes(v);
1787 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
1788 (unsigned long) (mfn << PAGE_SHIFT));
1791 if ( !((value & X86_CR0_PE) && (value & X86_CR0_PG)) && paging_enabled )
1792 if ( v->arch.hvm_svm.cpu_cr3 ) {
1793 put_page(mfn_to_page(get_mfn_from_gpfn(
1794 v->arch.hvm_svm.cpu_cr3 >> PAGE_SHIFT)));
1795 v->arch.guest_table = pagetable_null();
1798 /*
1799 * SVM implements paged real-mode and when we return to real-mode
1800 * we revert back to the physical mappings that the domain builder
1801 * created.
1802 */
1803 if ((value & X86_CR0_PE) == 0) {
1804 if (value & X86_CR0_PG) {
1805 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1806 return 0;
1808 paging_update_paging_modes(v);
1810 else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
1812 if ( svm_long_mode_enabled(v) )
1814 vmcb->efer &= ~(EFER_LME | EFER_LMA);
1815 v->arch.hvm_svm.cpu_shadow_efer &= ~EFER_LMA;
1817 /* we should take care of this kind of situation */
1818 paging_update_paging_modes(v);
1821 return 1;
1824 //
1825 // nested paging functions
1826 //
1828 static int npt_mov_to_cr(int gpreg, int cr, struct cpu_user_regs *regs)
1830 unsigned long value;
1831 struct vcpu *v = current;
1832 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1833 struct vlapic *vlapic = vcpu_vlapic(v);
1835 ASSERT(vmcb);
1837 value = get_reg(gpreg, regs, vmcb);
1839 switch (cr) {
1840 case 0:
1841 return npt_set_cr0(value);
1843 case 3:
1844 vmcb->cr3 = value;
1845 v->arch.hvm_svm.cpu_cr3 = value;
1846 break;
1848 case 4: /* CR4 */
1849 vmcb->cr4 = value;
1850 v->arch.hvm_svm.cpu_shadow_cr4 = value;
1851 paging_update_paging_modes(v);
1852 break;
1854 case 8:
1855 vlapic_set_reg(vlapic, APIC_TASKPRI, ((value & 0x0F) << 4));
1856 vmcb->vintr.fields.tpr = value & 0x0F;
1857 break;
1859 default:
1860 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
1861 domain_crash(v->domain);
1862 return 0;
1865 return 1;
1868 static void npt_mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
1870 unsigned long value = 0;
1871 struct vcpu *v = current;
1872 struct vmcb_struct *vmcb;
1873 struct vlapic *vlapic = vcpu_vlapic(v);
1875 vmcb = v->arch.hvm_svm.vmcb;
1876 ASSERT(vmcb);
1878 switch(cr) {
1879 case 0:
1880 value = (unsigned long) v->arch.hvm_svm.cpu_shadow_cr0;
1881 break;
1882 case 2:
1883 value = vmcb->cr2;
1884 break;
1885 case 3:
1886 value = (unsigned long) v->arch.hvm_svm.cpu_cr3;
1887 break;
1888 case 4:
1889 value = (unsigned long) v->arch.hvm_svm.cpu_shadow_cr4;
1890 break;
1891 case 8:
1892 value = (unsigned long)vlapic_get_reg(vlapic, APIC_TASKPRI);
1893 value = (value & 0xF0) >> 4;
1894 break;
1895 default:
1896 domain_crash(v->domain);
1897 return;
1900 set_reg(gp, value, regs, vmcb);
1903 /*
1904 * Read from control registers. CR0 and CR4 are read from the shadow.
1905 */
1906 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
1908 unsigned long value = 0;
1909 struct vcpu *v = current;
1910 struct vlapic *vlapic = vcpu_vlapic(v);
1911 struct vmcb_struct *vmcb;
1913 vmcb = v->arch.hvm_svm.vmcb;
1914 ASSERT(vmcb);
1916 switch ( cr )
1918 case 0:
1919 value = v->arch.hvm_svm.cpu_shadow_cr0;
1920 if (svm_dbg_on)
1921 printk("CR0 read =%lx \n", value );
1922 break;
1923 case 2:
1924 value = vmcb->cr2;
1925 break;
1926 case 3:
1927 value = (unsigned long) v->arch.hvm_svm.cpu_cr3;
1928 if (svm_dbg_on)
1929 printk("CR3 read =%lx \n", value );
1930 break;
1931 case 4:
1932 value = (unsigned long) v->arch.hvm_svm.cpu_shadow_cr4;
1933 if (svm_dbg_on)
1934 printk("CR4 read=%lx\n", value);
1935 break;
1936 case 8:
1937 value = (unsigned long)vlapic_get_reg(vlapic, APIC_TASKPRI);
1938 value = (value & 0xF0) >> 4;
1939 break;
1941 default:
1942 domain_crash(v->domain);
1943 return;
1946 HVMTRACE_2D(CR_READ, v, cr, value);
1948 set_reg(gp, value, regs, vmcb);
1950 HVM_DBG_LOG(DBG_LEVEL_VMMU, "mov_from_cr: CR%d, value = %lx,", cr, value);
1954 /*
1955 * Write to control registers
1956 */
1957 static int mov_to_cr(int gpreg, int cr, struct cpu_user_regs *regs)
1959 unsigned long value, old_cr, old_base_mfn, mfn;
1960 struct vcpu *v = current;
1961 struct vlapic *vlapic = vcpu_vlapic(v);
1962 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1964 value = get_reg(gpreg, regs, vmcb);
1966 HVMTRACE_2D(CR_WRITE, v, cr, value);
1968 HVM_DBG_LOG(DBG_LEVEL_1, "mov_to_cr: CR%d, value = %lx,", cr, value);
1969 HVM_DBG_LOG(DBG_LEVEL_1, "current = %lx,", (unsigned long) current);
1971 switch (cr)
1973 case 0:
1974 if (svm_dbg_on)
1975 printk("CR0 write =%lx \n", value );
1976 return svm_set_cr0(value);
1978 case 3:
1979 if (svm_dbg_on)
1980 printk("CR3 write =%lx \n", value );
1981 /* If paging is not enabled yet, simply copy the value to CR3. */
1982 if (!svm_paging_enabled(v)) {
1983 v->arch.hvm_svm.cpu_cr3 = value;
1984 break;
1987 /* We make a new one if the shadow does not exist. */
1988 if (value == v->arch.hvm_svm.cpu_cr3)
1990 /*
1991 * This is simple TLB flush, implying the guest has
1992 * removed some translation or changed page attributes.
1993 * We simply invalidate the shadow.
1994 */
1995 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
1996 if (mfn != pagetable_get_pfn(v->arch.guest_table))
1997 goto bad_cr3;
1998 paging_update_cr3(v);
2000 else
2002 /*
2003 * If different, make a shadow. Check if the PDBR is valid
2004 * first.
2005 */
2006 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
2007 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
2008 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain))
2009 goto bad_cr3;
2011 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
2012 v->arch.guest_table = pagetable_from_pfn(mfn);
2014 if (old_base_mfn)
2015 put_page(mfn_to_page(old_base_mfn));
2017 v->arch.hvm_svm.cpu_cr3 = value;
2018 update_cr3(v);
2019 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
2021 break;
2023 case 4: /* CR4 */
2024 if (svm_dbg_on)
2025 printk( "write cr4=%lx, cr0=%lx\n",
2026 value, v->arch.hvm_svm.cpu_shadow_cr0 );
2027 old_cr = v->arch.hvm_svm.cpu_shadow_cr4;
2028 if ( value & X86_CR4_PAE && !(old_cr & X86_CR4_PAE) )
2030 if ( svm_pgbit_test(v) )
2032 /* The guest is a 32-bit PAE guest. */
2033 #if CONFIG_PAGING_LEVELS >= 3
2034 unsigned long mfn, old_base_mfn;
2035 mfn = get_mfn_from_gpfn(v->arch.hvm_svm.cpu_cr3 >> PAGE_SHIFT);
2036 if ( !mfn_valid(mfn) ||
2037 !get_page(mfn_to_page(mfn), v->domain) )
2038 goto bad_cr3;
2040 /*
2041 * Now arch.guest_table points to machine physical.
2042 */
2044 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
2045 v->arch.guest_table = pagetable_from_pfn(mfn);
2046 if ( old_base_mfn )
2047 put_page(mfn_to_page(old_base_mfn));
2048 paging_update_paging_modes(v);
2050 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
2051 (unsigned long) (mfn << PAGE_SHIFT));
2053 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2054 "Update CR3 value = %lx, mfn = %lx",
2055 v->arch.hvm_svm.cpu_cr3, mfn);
2056 #endif
2059 else if ( !(value & X86_CR4_PAE) )
2061 if ( svm_long_mode_enabled(v) )
2063 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
2067 v->arch.hvm_svm.cpu_shadow_cr4 = value;
2068 vmcb->cr4 = value | SVM_CR4_HOST_MASK;
2070 /*
2071 * Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates
2072 * all TLB entries except global entries.
2073 */
2074 if ((old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE))
2075 paging_update_paging_modes(v);
2076 break;
2078 case 8:
2079 vlapic_set_reg(vlapic, APIC_TASKPRI, ((value & 0x0F) << 4));
2080 vmcb->vintr.fields.tpr = value & 0x0F;
2081 break;
2083 default:
2084 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
2085 domain_crash(v->domain);
2086 return 0;
2089 return 1;
2091 bad_cr3:
2092 gdprintk(XENLOG_ERR, "Invalid CR3\n");
2093 domain_crash(v->domain);
2094 return 0;
2098 #define ARR_SIZE(x) (sizeof(x) / sizeof(x[0]))
2101 static int svm_cr_access(struct vcpu *v, unsigned int cr, unsigned int type,
2102 struct cpu_user_regs *regs)
2104 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
2105 int inst_len = 0;
2106 int index;
2107 unsigned int gpreg;
2108 unsigned long value;
2109 u8 buffer[MAX_INST_LEN];
2110 u8 prefix = 0;
2111 int result = 1;
2112 enum instruction_index list_a[] = {INSTR_MOV2CR, INSTR_CLTS, INSTR_LMSW};
2113 enum instruction_index list_b[] = {INSTR_MOVCR2, INSTR_SMSW};
2114 enum instruction_index match;
2116 ASSERT(vmcb);
2118 inst_copy_from_guest(buffer, svm_rip2pointer(v), sizeof(buffer));
2120 /* get index to first actual instruction byte - as we will need to know
2121 where the prefix lives later on */
2122 index = skip_prefix_bytes(buffer, sizeof(buffer));
2124 if ( type == TYPE_MOV_TO_CR )
2126 inst_len = __get_instruction_length_from_list(
2127 v, list_a, ARR_SIZE(list_a), &buffer[index], &match);
2129 else /* type == TYPE_MOV_FROM_CR */
2131 inst_len = __get_instruction_length_from_list(
2132 v, list_b, ARR_SIZE(list_b), &buffer[index], &match);
2135 ASSERT(inst_len > 0);
2137 inst_len += index;
2139 /* Check for REX prefix - it's ALWAYS the last byte of any prefix bytes */
2140 if (index > 0 && (buffer[index-1] & 0xF0) == 0x40)
2141 prefix = buffer[index-1];
2143 HVM_DBG_LOG(DBG_LEVEL_1, "eip = %lx", (unsigned long) vmcb->rip);
2145 switch (match)
2147 case INSTR_MOV2CR:
2148 gpreg = decode_src_reg(prefix, buffer[index+2]);
2149 if ( paging_mode_hap(v->domain) )
2150 result = npt_mov_to_cr(gpreg, cr, regs);
2151 else
2152 result = mov_to_cr(gpreg, cr, regs);
2153 break;
2155 case INSTR_MOVCR2:
2156 gpreg = decode_src_reg(prefix, buffer[index+2]);
2157 if ( paging_mode_hap(v->domain) )
2158 npt_mov_from_cr(cr, gpreg, regs);
2159 else
2160 mov_from_cr(cr, gpreg, regs);
2161 break;
2163 case INSTR_CLTS:
2164 /* TS being cleared means that it's time to restore fpu state. */
2165 setup_fpu(current);
2166 vmcb->exception_intercepts &= ~(1U << TRAP_no_device);
2167 vmcb->cr0 &= ~X86_CR0_TS; /* clear TS */
2168 v->arch.hvm_svm.cpu_shadow_cr0 &= ~X86_CR0_TS; /* clear TS */
2169 break;
2171 case INSTR_LMSW:
2172 if (svm_dbg_on)
2173 svm_dump_inst(svm_rip2pointer(v));
2175 gpreg = decode_src_reg(prefix, buffer[index+2]);
2176 value = get_reg(gpreg, regs, vmcb) & 0xF;
2178 if (svm_dbg_on)
2179 printk("CR0-LMSW value=%lx, reg=%d, inst_len=%d\n", value, gpreg,
2180 inst_len);
2182 value = (v->arch.hvm_svm.cpu_shadow_cr0 & ~0xF) | value;
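/*
 * Worked example (illustrative only): LMSW replaces only the low four CR0
 * bits (PE, MP, EM, TS).  If cpu_shadow_cr0 is 0x80000031 and the guest
 * source operand is 0x3b, the masked operand is 0xb and the merged value
 * is (0x80000031 & ~0xF) | 0xb == 0x8000003b.
 */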
2184 if (svm_dbg_on)
2185 printk("CR0-LMSW CR0 - New value=%lx\n", value);
2187 if ( paging_mode_hap(v->domain) )
2188 result = npt_set_cr0(value);
2189 else
2190 result = svm_set_cr0(value);
2191 break;
2193 case INSTR_SMSW:
2194 if (svm_dbg_on)
2195 svm_dump_inst(svm_rip2pointer(v));
2196 value = v->arch.hvm_svm.cpu_shadow_cr0;
2197 gpreg = decode_src_reg(prefix, buffer[index+2]);
2198 set_reg(gpreg, value, regs, vmcb);
2200 if (svm_dbg_on)
2201 printk("CR0-SMSW value=%lx, reg=%d, inst_len=%d\n", value, gpreg,
2202 inst_len);
2203 break;
2205 default:
2206 BUG();
2209 ASSERT(inst_len);
2211 __update_guest_eip(vmcb, inst_len);
2213 return result;
2216 static inline void svm_do_msr_access(
2217 struct vcpu *v, struct cpu_user_regs *regs)
2219 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
2220 int inst_len;
2221 u64 msr_content=0;
2222 u32 ecx = regs->ecx, eax, edx;
2224 ASSERT(vmcb);
2226 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x, eax=%x, edx=%x, exitinfo = %lx",
2227 ecx, (u32)regs->eax, (u32)regs->edx,
2228 (unsigned long)vmcb->exitinfo1);
2230 /* Is this a read? EXITINFO1 is 0 for RDMSR and 1 for WRMSR. */
2231 if (vmcb->exitinfo1 == 0)
2233 switch (ecx) {
2234 case MSR_IA32_TIME_STAMP_COUNTER:
2235 msr_content = hvm_get_guest_time(v);
2236 break;
2237 case MSR_IA32_SYSENTER_CS:
2238 msr_content = vmcb->sysenter_cs;
2239 break;
2240 case MSR_IA32_SYSENTER_ESP:
2241 msr_content = vmcb->sysenter_esp;
2242 break;
2243 case MSR_IA32_SYSENTER_EIP:
2244 msr_content = vmcb->sysenter_eip;
2245 break;
2246 case MSR_IA32_APICBASE:
2247 msr_content = vcpu_vlapic(v)->hw.apic_base_msr;
2248 break;
2249 default:
2250 if (long_mode_do_msr_read(regs))
2251 goto done;
2253 if ( rdmsr_hypervisor_regs(ecx, &eax, &edx) ||
2254 rdmsr_safe(ecx, eax, edx) == 0 )
2256 regs->eax = eax;
2257 regs->edx = edx;
2258 goto done;
2260 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
2261 return;
2263 regs->eax = msr_content & 0xFFFFFFFF;
2264 regs->edx = msr_content >> 32;
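/*
 * Example (illustrative only): RDMSR returns the 64-bit MSR value split
 * across EDX:EAX, so msr_content == 0x0000000123456789 ends up as
 * eax == 0x23456789 and edx == 0x00000001.
 */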
2266 done:
2267 HVMTRACE_2D(MSR_READ, v, ecx, msr_content);
2268 HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx",
2269 ecx, (unsigned long)regs->eax, (unsigned long)regs->edx);
2271 inst_len = __get_instruction_length(v, INSTR_RDMSR, NULL);
2273 else
2275 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
2277 HVMTRACE_2D(MSR_WRITE, v, ecx, msr_content);
2279 switch (ecx)
2281 case MSR_IA32_TIME_STAMP_COUNTER:
2282 hvm_set_guest_time(v, msr_content);
2283 pt_reset(v);
2284 break;
2285 case MSR_IA32_SYSENTER_CS:
2286 vmcb->sysenter_cs = msr_content;
2287 break;
2288 case MSR_IA32_SYSENTER_ESP:
2289 vmcb->sysenter_esp = msr_content;
2290 break;
2291 case MSR_IA32_SYSENTER_EIP:
2292 vmcb->sysenter_eip = msr_content;
2293 break;
2294 case MSR_IA32_APICBASE:
2295 vlapic_msr_set(vcpu_vlapic(v), msr_content);
2296 break;
2297 default:
2298 if ( !long_mode_do_msr_write(regs) )
2299 wrmsr_hypervisor_regs(ecx, regs->eax, regs->edx);
2300 break;
2303 inst_len = __get_instruction_length(v, INSTR_WRMSR, NULL);
2306 __update_guest_eip(vmcb, inst_len);
2309 static inline void svm_vmexit_do_hlt(struct vmcb_struct *vmcb)
2311 __update_guest_eip(vmcb, 1);
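/*
 * Informal note (not in the original source): HLT is a single-byte
 * instruction (opcode 0xf4), so rIP can be advanced by exactly one byte
 * here without decoding the guest code.
 */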
2313 /* Check for an interrupt that has not yet been handled, or a newly pending one. */
2314 if ( (vmcb->rflags & X86_EFLAGS_IF) &&
2315 (vmcb->vintr.fields.irq || cpu_has_pending_irq(current)) ) {
2316 HVMTRACE_1D(HLT, current, /*int pending=*/ 1);
2317 return;
2320 HVMTRACE_1D(HLT, current, /*int pending=*/ 0);
2321 hvm_hlt(vmcb->rflags);
2324 static void svm_vmexit_do_invd(struct vcpu *v)
2326 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
2327 int inst_len;
2329 /* Invalidate the cache - we can't really do that safely - maybe we should
2330 * WBINVD, but I think it's just fine to completely ignore it - we should
2331 * have cache snooping that solves it anyway. -- Mats P.
2332 */
2334 /* Tell the user that we did this - just in case someone runs some really
2335 * weird operating system and wants to know why it's not working...
2336 */
2337 printk("INVD instruction intercepted - ignored\n");
2339 inst_len = __get_instruction_length(v, INSTR_INVD, NULL);
2340 __update_guest_eip(vmcb, inst_len);
2343 void svm_handle_invlpg(const short invlpga, struct cpu_user_regs *regs)
2345 struct vcpu *v = current;
2346 u8 opcode[MAX_INST_LEN], prefix, length = MAX_INST_LEN;
2347 unsigned long g_vaddr;
2348 int inst_len;
2349 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
2351 /*
2352 * We don't know in advance how many bytes the INVLPG instruction will
2353 * take, so copy the maximum instruction length here.
2354 */
2355 if (inst_copy_from_guest(opcode, svm_rip2pointer(v), length) < length)
2357 gdprintk(XENLOG_ERR, "Error reading memory %d bytes\n", length);
2358 domain_crash(v->domain);
2359 return;
2362 if (invlpga)
2364 inst_len = __get_instruction_length(v, INSTR_INVLPGA, opcode);
2365 ASSERT(inst_len > 0);
2366 __update_guest_eip(vmcb, inst_len);
2368 /*
2369 * The linear address is implicit in this instruction (it is taken from rAX).
2370 * At the moment, we don't use ecx (ASID) to identify individual guest pages.
2371 */
2372 g_vaddr = regs->eax;
2374 else
2376 /* What about multiple prefix codes? */
2377 prefix = (is_prefix(opcode[0])?opcode[0]:0);
2378 inst_len = __get_instruction_length(v, INSTR_INVLPG, opcode);
2379 ASSERT(inst_len > 0);
2381 inst_len--;
2382 length -= inst_len;
2384 /*
2385 * Decode memory operand of the instruction including ModRM, SIB, and
2386 * displacement to get effective address and length in bytes. Assume
2387 * the system is in either 32- or 64-bit mode.
2388 */
2389 g_vaddr = get_effective_addr_modrm64(regs, prefix, inst_len,
2390 &opcode[inst_len], &length);
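/*
 * Example (illustrative, assuming the standard encoding): "invlpg (%rax)"
 * is 0f 01 38, i.e. opcode 0f 01 /7 with ModRM byte 0x38 (mod=00, reg=/7,
 * rm=rax) and no SIB or displacement, so the memory operand is the single
 * ModRM byte and the effective address is simply the value held in rAX.
 */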
2392 inst_len += length;
2393 __update_guest_eip (vmcb, inst_len);
2396 HVMTRACE_3D(INVLPG, v, (invlpga?1:0), g_vaddr, (invlpga?regs->ecx:0));
2398 paging_invlpg(v, g_vaddr);
2402 /*
2403 * Reset to realmode causes execution to start at 0xF000:0xFFF0 in
2404 * 16-bit realmode. Basically, this mimics a processor reset.
2406 * returns 0 on success, non-zero otherwise
2407 */
2408 static int svm_reset_to_realmode(struct vcpu *v,
2409 struct cpu_user_regs *regs)
2411 struct vmcb_struct *vmcb;
2413 ASSERT(v);
2414 ASSERT(regs);
2416 vmcb = v->arch.hvm_svm.vmcb;
2418 ASSERT(vmcb);
2420 /* Clear the user regs; the relevant VMCB state is re-initialised below. */
2421 memset(regs, 0, sizeof(struct cpu_user_regs));
2423 /* VMCB Control */
2424 vmcb->tsc_offset = 0;
2426 /* VMCB State */
2427 vmcb->cr0 = X86_CR0_ET | X86_CR0_PG | X86_CR0_WP;
2428 v->arch.hvm_svm.cpu_shadow_cr0 = X86_CR0_ET;
2430 vmcb->cr2 = 0;
2431 vmcb->efer = EFER_SVME;
2433 vmcb->cr4 = SVM_CR4_HOST_MASK;
2434 v->arch.hvm_svm.cpu_shadow_cr4 = 0;
2436 if ( paging_mode_hap(v->domain) ) {
2437 vmcb->cr0 = v->arch.hvm_svm.cpu_shadow_cr0;
2438 vmcb->cr4 = v->arch.hvm_svm.cpu_shadow_cr4;
2441 /* This will jump to ROMBIOS */
2442 vmcb->rip = 0xFFF0;
2444 /* Set up the segment registers and all their hidden state. */
2445 vmcb->cs.sel = 0xF000;
2446 vmcb->cs.attr.bytes = 0x089b;
2447 vmcb->cs.limit = 0xffff;
2448 vmcb->cs.base = 0x000F0000;
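/*
 * Informal note: the low byte of these attr.bytes values is the usual
 * descriptor access byte, so 0x9b (CS above) describes a present, accessed,
 * execute/read code segment and 0x93 (the data segments below) a present,
 * accessed, read/write data segment, matching the segment state expected
 * after a processor reset.
 */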
2450 vmcb->ss.sel = 0x00;
2451 vmcb->ss.attr.bytes = 0x0893;
2452 vmcb->ss.limit = 0xffff;
2453 vmcb->ss.base = 0x00;
2455 vmcb->ds.sel = 0x00;
2456 vmcb->ds.attr.bytes = 0x0893;
2457 vmcb->ds.limit = 0xffff;
2458 vmcb->ds.base = 0x00;
2460 vmcb->es.sel = 0x00;
2461 vmcb->es.attr.bytes = 0x0893;
2462 vmcb->es.limit = 0xffff;
2463 vmcb->es.base = 0x00;
2465 vmcb->fs.sel = 0x00;
2466 vmcb->fs.attr.bytes = 0x0893;
2467 vmcb->fs.limit = 0xffff;
2468 vmcb->fs.base = 0x00;
2470 vmcb->gs.sel = 0x00;
2471 vmcb->gs.attr.bytes = 0x0893;
2472 vmcb->gs.limit = 0xffff;
2473 vmcb->gs.base = 0x00;
2475 vmcb->ldtr.sel = 0x00;
2476 vmcb->ldtr.attr.bytes = 0x0000;
2477 vmcb->ldtr.limit = 0x0;
2478 vmcb->ldtr.base = 0x00;
2480 vmcb->gdtr.sel = 0x00;
2481 vmcb->gdtr.attr.bytes = 0x0000;
2482 vmcb->gdtr.limit = 0x0;
2483 vmcb->gdtr.base = 0x00;
2485 vmcb->tr.sel = 0;
2486 vmcb->tr.attr.bytes = 0;
2487 vmcb->tr.limit = 0x0;
2488 vmcb->tr.base = 0;
2490 vmcb->idtr.sel = 0x00;
2491 vmcb->idtr.attr.bytes = 0x0000;
2492 vmcb->idtr.limit = 0x3ff;
2493 vmcb->idtr.base = 0x00;
2495 vmcb->rax = 0;
2496 vmcb->rsp = 0;
2498 return 0;
2502 void svm_dump_inst(unsigned long eip)
2504 u8 opcode[256];
2505 unsigned long ptr;
2506 int len;
2507 int i;
2509 ptr = eip & ~0xff;
2510 len = 0;
2512 if (hvm_copy_from_guest_virt(opcode, ptr, sizeof(opcode)) == 0)
2513 len = sizeof(opcode);
2515 printk("Code bytes around(len=%d) %lx:", len, eip);
2516 for (i = 0; i < len; i++)
2518 if ((i & 0x0f) == 0)
2519 printk("\n%08lx:", ptr+i);
2521 printk("%02x ", opcode[i]);
2524 printk("\n");
2528 void svm_dump_regs(const char *from, struct cpu_user_regs *regs)
2530 struct vcpu *v = current;
2531 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
2532 unsigned long pt = v->arch.hvm_vcpu.hw_cr3;
2534 printk("%s: guest registers from %s:\n", __func__, from);
2535 #if defined (__x86_64__)
2536 printk("rax: %016lx rbx: %016lx rcx: %016lx\n",
2537 regs->rax, regs->rbx, regs->rcx);
2538 printk("rdx: %016lx rsi: %016lx rdi: %016lx\n",
2539 regs->rdx, regs->rsi, regs->rdi);
2540 printk("rbp: %016lx rsp: %016lx r8: %016lx\n",
2541 regs->rbp, regs->rsp, regs->r8);
2542 printk("r9: %016lx r10: %016lx r11: %016lx\n",
2543 regs->r9, regs->r10, regs->r11);
2544 printk("r12: %016lx r13: %016lx r14: %016lx\n",
2545 regs->r12, regs->r13, regs->r14);
2546 printk("r15: %016lx cr0: %016lx cr3: %016lx\n",
2547 regs->r15, v->arch.hvm_svm.cpu_shadow_cr0, vmcb->cr3);
2548 #else
2549 printk("eax: %08x, ebx: %08x, ecx: %08x, edx: %08x\n",
2550 regs->eax, regs->ebx, regs->ecx, regs->edx);
2551 printk("edi: %08x, esi: %08x, ebp: %08x, esp: %08x\n",
2552 regs->edi, regs->esi, regs->ebp, regs->esp);
2553 printk("%s: guest cr0: %lx\n", __func__,
2554 v->arch.hvm_svm.cpu_shadow_cr0);
2555 printk("guest CR3 = %llx\n", vmcb->cr3);
2556 #endif
2557 printk("%s: pt = %lx\n", __func__, pt);
2561 void svm_dump_host_regs(const char *from)
2563 struct vcpu *v = current;
2564 unsigned long pt = pagetable_get_paddr(v->arch.monitor_table);
2565 unsigned long cr3, cr0;
2566 printk("Host registers at %s\n", from);
2568 __asm__ __volatile__ ("\tmov %%cr0,%0\n"
2569 "\tmov %%cr3,%1\n"
2570 : "=r" (cr0), "=r"(cr3));
2571 printk("%s: pt = %lx, cr3 = %lx, cr0 = %lx\n", __func__, pt, cr3, cr0);
2574 #ifdef SVM_EXTRA_DEBUG
2575 static char *exit_reasons[] = {
2576 [VMEXIT_CR0_READ] = "CR0_READ",
2577 [VMEXIT_CR1_READ] = "CR1_READ",
2578 [VMEXIT_CR2_READ] = "CR2_READ",
2579 [VMEXIT_CR3_READ] = "CR3_READ",
2580 [VMEXIT_CR4_READ] = "CR4_READ",
2581 [VMEXIT_CR5_READ] = "CR5_READ",
2582 [VMEXIT_CR6_READ] = "CR6_READ",
2583 [VMEXIT_CR7_READ] = "CR7_READ",
2584 [VMEXIT_CR8_READ] = "CR8_READ",
2585 [VMEXIT_CR9_READ] = "CR9_READ",
2586 [VMEXIT_CR10_READ] = "CR10_READ",
2587 [VMEXIT_CR11_READ] = "CR11_READ",
2588 [VMEXIT_CR12_READ] = "CR12_READ",
2589 [VMEXIT_CR13_READ] = "CR13_READ",
2590 [VMEXIT_CR14_READ] = "CR14_READ",
2591 [VMEXIT_CR15_READ] = "CR15_READ",
2592 [VMEXIT_CR0_WRITE] = "CR0_WRITE",
2593 [VMEXIT_CR1_WRITE] = "CR1_WRITE",
2594 [VMEXIT_CR2_WRITE] = "CR2_WRITE",
2595 [VMEXIT_CR3_WRITE] = "CR3_WRITE",
2596 [VMEXIT_CR4_WRITE] = "CR4_WRITE",
2597 [VMEXIT_CR5_WRITE] = "CR5_WRITE",
2598 [VMEXIT_CR6_WRITE] = "CR6_WRITE",
2599 [VMEXIT_CR7_WRITE] = "CR7_WRITE",
2600 [VMEXIT_CR8_WRITE] = "CR8_WRITE",
2601 [VMEXIT_CR9_WRITE] = "CR9_WRITE",
2602 [VMEXIT_CR10_WRITE] = "CR10_WRITE",
2603 [VMEXIT_CR11_WRITE] = "CR11_WRITE",
2604 [VMEXIT_CR12_WRITE] = "CR12_WRITE",
2605 [VMEXIT_CR13_WRITE] = "CR13_WRITE",
2606 [VMEXIT_CR14_WRITE] = "CR14_WRITE",
2607 [VMEXIT_CR15_WRITE] = "CR15_WRITE",
2608 [VMEXIT_DR0_READ] = "DR0_READ",
2609 [VMEXIT_DR1_READ] = "DR1_READ",
2610 [VMEXIT_DR2_READ] = "DR2_READ",
2611 [VMEXIT_DR3_READ] = "DR3_READ",
2612 [VMEXIT_DR4_READ] = "DR4_READ",
2613 [VMEXIT_DR5_READ] = "DR5_READ",
2614 [VMEXIT_DR6_READ] = "DR6_READ",
2615 [VMEXIT_DR7_READ] = "DR7_READ",
2616 [VMEXIT_DR8_READ] = "DR8_READ",
2617 [VMEXIT_DR9_READ] = "DR9_READ",
2618 [VMEXIT_DR10_READ] = "DR10_READ",
2619 [VMEXIT_DR11_READ] = "DR11_READ",
2620 [VMEXIT_DR12_READ] = "DR12_READ",
2621 [VMEXIT_DR13_READ] = "DR13_READ",
2622 [VMEXIT_DR14_READ] = "DR14_READ",
2623 [VMEXIT_DR15_READ] = "DR15_READ",
2624 [VMEXIT_DR0_WRITE] = "DR0_WRITE",
2625 [VMEXIT_DR1_WRITE] = "DR1_WRITE",
2626 [VMEXIT_DR2_WRITE] = "DR2_WRITE",
2627 [VMEXIT_DR3_WRITE] = "DR3_WRITE",
2628 [VMEXIT_DR4_WRITE] = "DR4_WRITE",
2629 [VMEXIT_DR5_WRITE] = "DR5_WRITE",
2630 [VMEXIT_DR6_WRITE] = "DR6_WRITE",
2631 [VMEXIT_DR7_WRITE] = "DR7_WRITE",
2632 [VMEXIT_DR8_WRITE] = "DR8_WRITE",
2633 [VMEXIT_DR9_WRITE] = "DR9_WRITE",
2634 [VMEXIT_DR10_WRITE] = "DR10_WRITE",
2635 [VMEXIT_DR11_WRITE] = "DR11_WRITE",
2636 [VMEXIT_DR12_WRITE] = "DR12_WRITE",
2637 [VMEXIT_DR13_WRITE] = "DR13_WRITE",
2638 [VMEXIT_DR14_WRITE] = "DR14_WRITE",
2639 [VMEXIT_DR15_WRITE] = "DR15_WRITE",
2640 [VMEXIT_EXCEPTION_DE] = "EXCEPTION_DE",
2641 [VMEXIT_EXCEPTION_DB] = "EXCEPTION_DB",
2642 [VMEXIT_EXCEPTION_NMI] = "EXCEPTION_NMI",
2643 [VMEXIT_EXCEPTION_BP] = "EXCEPTION_BP",
2644 [VMEXIT_EXCEPTION_OF] = "EXCEPTION_OF",
2645 [VMEXIT_EXCEPTION_BR] = "EXCEPTION_BR",
2646 [VMEXIT_EXCEPTION_UD] = "EXCEPTION_UD",
2647 [VMEXIT_EXCEPTION_NM] = "EXCEPTION_NM",
2648 [VMEXIT_EXCEPTION_DF] = "EXCEPTION_DF",
2649 [VMEXIT_EXCEPTION_09] = "EXCEPTION_09",
2650 [VMEXIT_EXCEPTION_TS] = "EXCEPTION_TS",
2651 [VMEXIT_EXCEPTION_NP] = "EXCEPTION_NP",
2652 [VMEXIT_EXCEPTION_SS] = "EXCEPTION_SS",
2653 [VMEXIT_EXCEPTION_GP] = "EXCEPTION_GP",
2654 [VMEXIT_EXCEPTION_PF] = "EXCEPTION_PF",
2655 [VMEXIT_EXCEPTION_15] = "EXCEPTION_15",
2656 [VMEXIT_EXCEPTION_MF] = "EXCEPTION_MF",
2657 [VMEXIT_EXCEPTION_AC] = "EXCEPTION_AC",
2658 [VMEXIT_EXCEPTION_MC] = "EXCEPTION_MC",
2659 [VMEXIT_EXCEPTION_XF] = "EXCEPTION_XF",
2660 [VMEXIT_INTR] = "INTR",
2661 [VMEXIT_NMI] = "NMI",
2662 [VMEXIT_SMI] = "SMI",
2663 [VMEXIT_INIT] = "INIT",
2664 [VMEXIT_VINTR] = "VINTR",
2665 [VMEXIT_CR0_SEL_WRITE] = "CR0_SEL_WRITE",
2666 [VMEXIT_IDTR_READ] = "IDTR_READ",
2667 [VMEXIT_GDTR_READ] = "GDTR_READ",
2668 [VMEXIT_LDTR_READ] = "LDTR_READ",
2669 [VMEXIT_TR_READ] = "TR_READ",
2670 [VMEXIT_IDTR_WRITE] = "IDTR_WRITE",
2671 [VMEXIT_GDTR_WRITE] = "GDTR_WRITE",
2672 [VMEXIT_LDTR_WRITE] = "LDTR_WRITE",
2673 [VMEXIT_TR_WRITE] = "TR_WRITE",
2674 [VMEXIT_RDTSC] = "RDTSC",
2675 [VMEXIT_RDPMC] = "RDPMC",
2676 [VMEXIT_PUSHF] = "PUSHF",
2677 [VMEXIT_POPF] = "POPF",
2678 [VMEXIT_CPUID] = "CPUID",
2679 [VMEXIT_RSM] = "RSM",
2680 [VMEXIT_IRET] = "IRET",
2681 [VMEXIT_SWINT] = "SWINT",
2682 [VMEXIT_INVD] = "INVD",
2683 [VMEXIT_PAUSE] = "PAUSE",
2684 [VMEXIT_HLT] = "HLT",
2685 [VMEXIT_INVLPG] = "INVLPG",
2686 [VMEXIT_INVLPGA] = "INVLPGA",
2687 [VMEXIT_IOIO] = "IOIO",
2688 [VMEXIT_MSR] = "MSR",
2689 [VMEXIT_TASK_SWITCH] = "TASK_SWITCH",
2690 [VMEXIT_FERR_FREEZE] = "FERR_FREEZE",
2691 [VMEXIT_SHUTDOWN] = "SHUTDOWN",
2692 [VMEXIT_VMRUN] = "VMRUN",
2693 [VMEXIT_VMMCALL] = "VMMCALL",
2694 [VMEXIT_VMLOAD] = "VMLOAD",
2695 [VMEXIT_VMSAVE] = "VMSAVE",
2696 [VMEXIT_STGI] = "STGI",
2697 [VMEXIT_CLGI] = "CLGI",
2698 [VMEXIT_SKINIT] = "SKINIT",
2699 [VMEXIT_RDTSCP] = "RDTSCP",
2700 [VMEXIT_ICEBP] = "ICEBP",
2701 [VMEXIT_NPF] = "NPF"
2702 };
2703 #endif /* SVM_EXTRA_DEBUG */
2705 #ifdef SVM_WALK_GUEST_PAGES
2706 void walk_shadow_and_guest_pt(unsigned long gva)
2708 l2_pgentry_t gpde;
2709 l2_pgentry_t spde;
2710 l1_pgentry_t gpte;
2711 l1_pgentry_t spte;
2712 struct vcpu *v = current;
2713 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
2714 paddr_t gpa;
2716 gpa = paging_gva_to_gpa(current, gva);
2717 printk("gva = %lx, gpa=%"PRIpaddr", gCR3=%x\n", gva, gpa, (u32)vmcb->cr3);
2718 if( !svm_paging_enabled(v) || mmio_space(gpa) )
2719 return;
2721 /* let's dump the guest and shadow page info */
2723 __guest_get_l2e(v, gva, &gpde);
2724 printk( "G-PDE = %x, flags=%x\n", gpde.l2, l2e_get_flags(gpde) );
2725 __shadow_get_l2e( v, gva, &spde );
2726 printk( "S-PDE = %x, flags=%x\n", spde.l2, l2e_get_flags(spde) );
2728 if ( unlikely(!(l2e_get_flags(gpde) & _PAGE_PRESENT)) )
2729 return;
2731 spte = l1e_empty();
2733 /* This is actually overkill - we only need to ensure the hl2 is in-sync.*/
2734 shadow_sync_va(v, gva);
2736 gpte.l1 = 0;
2737 __copy_from_user(&gpte, &__linear_l1_table[ l1_linear_offset(gva) ],
2738 sizeof(gpte) );
2739 printk( "G-PTE = %x, flags=%x\n", gpte.l1, l1e_get_flags(gpte) );
2741 BUG(); // need to think about this, and convert usage of
2742 // phys_to_machine_mapping to use pagetable format...
2743 __copy_from_user( &spte, &phys_to_machine_mapping[ l1e_get_pfn( gpte ) ],
2744 sizeof(spte) );
2746 printk( "S-PTE = %x, flags=%x\n", spte.l1, l1e_get_flags(spte));
2748 #endif /* SVM_WALK_GUEST_PAGES */
2751 asmlinkage void svm_vmexit_handler(struct cpu_user_regs *regs)
2753 unsigned int exit_reason;
2754 unsigned long eip;
2755 struct vcpu *v = current;
2756 int do_debug = 0;
2757 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
2759 ASSERT(vmcb);
2761 exit_reason = vmcb->exitcode;
2762 save_svm_cpu_user_regs(v, regs);
2764 HVMTRACE_2D(VMEXIT, v, vmcb->rip, exit_reason);
2766 if (exit_reason == VMEXIT_INVALID)
2768 svm_dump_vmcb(__func__, vmcb);
2769 goto exit_and_crash;
2772 #ifdef SVM_EXTRA_DEBUG
2774 #if defined(__i386__)
2775 #define rip eip
2776 #endif
2778 static unsigned long intercepts_counter = 0;
2780 if (svm_dbg_on && exit_reason == VMEXIT_EXCEPTION_PF)
2782 if (svm_paging_enabled(v) &&
2783 !mmio_space(
2784 paging_gva_to_gfn(current, vmcb->exitinfo2) << PAGE_SHIFT))
2786 printk("I%08ld,ExC=%s(%d),IP=%x:%"PRIx64","
2787 "I1=%"PRIx64",I2=%"PRIx64",INT=%"PRIx64", "
2788 "gpa=%"PRIx64"\n", intercepts_counter,
2789 exit_reasons[exit_reason], exit_reason, regs->cs,
2790 (u64)regs->rip,
2791 (u64)vmcb->exitinfo1,
2792 (u64)vmcb->exitinfo2,
2793 (u64)vmcb->exitintinfo.bytes,
2794 (((u64)paging_gva_to_gfn(current, vmcb->exitinfo2)
2795 << PAGE_SHIFT) | (vmcb->exitinfo2 & ~PAGE_MASK)));
2797 else
2799 printk("I%08ld,ExC=%s(%d),IP=%x:%"PRIx64","
2800 "I1=%"PRIx64",I2=%"PRIx64",INT=%"PRIx64"\n",
2801 intercepts_counter,
2802 exit_reasons[exit_reason], exit_reason, regs->cs,
2803 (u64)regs->rip,
2804 (u64)vmcb->exitinfo1,
2805 (u64)vmcb->exitinfo2,
2806 (u64)vmcb->exitintinfo.bytes );
2809 else if ( svm_dbg_on
2810 && exit_reason != VMEXIT_IOIO
2811 && exit_reason != VMEXIT_INTR)
2814 if (exit_reasons[exit_reason])
2816 printk("I%08ld,ExC=%s(%d),IP=%x:%"PRIx64","
2817 "I1=%"PRIx64",I2=%"PRIx64",INT=%"PRIx64"\n",
2818 intercepts_counter,
2819 exit_reasons[exit_reason], exit_reason, regs->cs,
2820 (u64)regs->rip,
2821 (u64)vmcb->exitinfo1,
2822 (u64)vmcb->exitinfo2,
2823 (u64)vmcb->exitintinfo.bytes);
2825 else
2827 printk("I%08ld,ExC=%d(0x%x),IP=%x:%"PRIx64","
2828 "I1=%"PRIx64",I2=%"PRIx64",INT=%"PRIx64"\n",
2829 intercepts_counter, exit_reason, exit_reason, regs->cs,
2830 (u64)regs->rip,
2831 (u64)vmcb->exitinfo1,
2832 (u64)vmcb->exitinfo2,
2833 (u64)vmcb->exitintinfo.bytes);
2837 #ifdef SVM_WALK_GUEST_PAGES
2838 if( exit_reason == VMEXIT_EXCEPTION_PF
2839 && ( ( vmcb->exitinfo2 == vmcb->rip )
2840 || vmcb->exitintinfo.bytes) )
2842 if ( svm_paging_enabled(v) &&
2843 !mmio_space(gva_to_gpa(vmcb->exitinfo2)) )
2844 walk_shadow_and_guest_pt(vmcb->exitinfo2);
2846 #endif
2848 intercepts_counter++;
2850 #if 0
2851 if (svm_dbg_on)
2852 do_debug = svm_do_debugout(exit_reason);
2853 #endif
2855 if (do_debug)
2857 printk("%s:+ guest_table = 0x%08x, monitor_table = 0x%08x, "
2858 "hw_cr3 = 0x%16lx\n",
2859 __func__,
2860 (int) v->arch.guest_table.pfn,
2861 (int) v->arch.monitor_table.pfn,
2862 (long unsigned int) v->arch.hvm_vcpu.hw_cr3);
2864 svm_dump_vmcb(__func__, vmcb);
2865 svm_dump_regs(__func__, regs);
2866 svm_dump_inst(svm_rip2pointer(v));
2869 #if defined(__i386__)
2870 #undef rip
2871 #endif
2874 #endif /* SVM_EXTRA_DEBUG */
2877 perfc_incra(svmexits, exit_reason);
2878 eip = vmcb->rip;
2880 #ifdef SVM_EXTRA_DEBUG
2881 if (do_debug)
2883 printk("eip = %lx, exit_reason = %d (0x%x)\n",
2884 eip, exit_reason, exit_reason);
2886 #endif /* SVM_EXTRA_DEBUG */
2888 switch (exit_reason)
2890 case VMEXIT_INTR:
2891 /* Asynchronous event, handled when we STGI'd after the VMEXIT. */
2892 HVMTRACE_0D(INTR, v);
2893 break;
2895 case VMEXIT_NMI:
2896 /* Asynchronous event, handled when we STGI'd after the VMEXIT. */
2897 HVMTRACE_0D(NMI, v);
2898 break;
2900 case VMEXIT_SMI:
2901 /* Asynchronous event, handled when we STGI'd after the VMEXIT. */
2902 HVMTRACE_0D(SMI, v);
2903 break;
2905 case VMEXIT_EXCEPTION_DB:
2906 if ( v->domain->debugger_attached )
2907 domain_pause_for_debugger();
2908 else
2909 svm_inject_exception(v, TRAP_debug, 0, 0);
2910 break;
2912 case VMEXIT_EXCEPTION_BP:
2913 if ( v->domain->debugger_attached )
2914 domain_pause_for_debugger();
2915 else
2916 svm_inject_exception(v, TRAP_int3, 0, 0);
2917 break;
2919 case VMEXIT_EXCEPTION_NM:
2920 svm_do_no_device_fault(vmcb);
2921 break;
2923 case VMEXIT_EXCEPTION_PF: {
2924 unsigned long va;
2925 va = vmcb->exitinfo2;
2926 regs->error_code = vmcb->exitinfo1;
2927 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2928 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
2929 (unsigned long)regs->eax, (unsigned long)regs->ebx,
2930 (unsigned long)regs->ecx, (unsigned long)regs->edx,
2931 (unsigned long)regs->esi, (unsigned long)regs->edi);
2933 if ( svm_do_page_fault(va, regs) )
2935 HVMTRACE_2D(PF_XEN, v, va, regs->error_code);
2936 break;
2939 v->arch.hvm_svm.cpu_cr2 = vmcb->cr2 = va;
2940 svm_inject_exception(v, TRAP_page_fault, 1, regs->error_code);
2941 break;
2944 case VMEXIT_VINTR:
2945 vmcb->vintr.fields.irq = 0;
2946 vmcb->general1_intercepts &= ~GENERAL1_INTERCEPT_VINTR;
2947 break;
2949 case VMEXIT_INVD:
2950 svm_vmexit_do_invd(v);
2951 break;
2953 case VMEXIT_GDTR_WRITE:
2954 printk("WRITE to GDTR\n");
2955 break;
2957 case VMEXIT_TASK_SWITCH:
2958 goto exit_and_crash;
2960 case VMEXIT_CPUID:
2961 svm_vmexit_do_cpuid(vmcb, regs);
2962 break;
2964 case VMEXIT_HLT:
2965 svm_vmexit_do_hlt(vmcb);
2966 break;
2968 case VMEXIT_INVLPG:
2969 svm_handle_invlpg(0, regs);
2970 break;
2972 case VMEXIT_INVLPGA:
2973 svm_handle_invlpg(1, regs);
2974 break;
2976 case VMEXIT_VMMCALL: {
2977 int inst_len = __get_instruction_length(v, INSTR_VMCALL, NULL);
2978 ASSERT(inst_len > 0);
2979 HVMTRACE_1D(VMMCALL, v, regs->eax);
2980 __update_guest_eip(vmcb, inst_len);
2981 hvm_do_hypercall(regs);
2982 break;
2985 case VMEXIT_CR0_READ:
2986 svm_cr_access(v, 0, TYPE_MOV_FROM_CR, regs);
2987 break;
2989 case VMEXIT_CR2_READ:
2990 svm_cr_access(v, 2, TYPE_MOV_FROM_CR, regs);
2991 break;
2993 case VMEXIT_CR3_READ:
2994 svm_cr_access(v, 3, TYPE_MOV_FROM_CR, regs);
2995 break;
2997 case VMEXIT_CR4_READ:
2998 svm_cr_access(v, 4, TYPE_MOV_FROM_CR, regs);
2999 break;
3001 case VMEXIT_CR8_READ:
3002 svm_cr_access(v, 8, TYPE_MOV_FROM_CR, regs);
3003 break;
3005 case VMEXIT_CR0_WRITE:
3006 svm_cr_access(v, 0, TYPE_MOV_TO_CR, regs);
3007 break;
3009 case VMEXIT_CR2_WRITE:
3010 svm_cr_access(v, 2, TYPE_MOV_TO_CR, regs);
3011 break;
3013 case VMEXIT_CR3_WRITE:
3014 svm_cr_access(v, 3, TYPE_MOV_TO_CR, regs);
3015 local_flush_tlb();
3016 break;
3018 case VMEXIT_CR4_WRITE:
3019 svm_cr_access(v, 4, TYPE_MOV_TO_CR, regs);
3020 break;
3022 case VMEXIT_CR8_WRITE:
3023 svm_cr_access(v, 8, TYPE_MOV_TO_CR, regs);
3024 break;
3026 case VMEXIT_DR0_WRITE ... VMEXIT_DR7_WRITE:
3027 svm_dr_access(v, regs);
3028 break;
3030 case VMEXIT_IOIO:
3031 svm_io_instruction(v);
3032 break;
3034 case VMEXIT_MSR:
3035 svm_do_msr_access(v, regs);
3036 break;
3038 case VMEXIT_SHUTDOWN:
3039 hvm_triple_fault();
3040 break;
3042 case VMEXIT_VMRUN:
3043 case VMEXIT_VMLOAD:
3044 case VMEXIT_VMSAVE:
3045 case VMEXIT_STGI:
3046 case VMEXIT_CLGI:
3047 case VMEXIT_SKINIT:
3048 /* Report "Invalid opcode" on any VM-operation except VMMCALL */
3049 svm_inject_exception(v, TRAP_invalid_op, 0, 0);
3050 break;
3052 case VMEXIT_NPF:
3053 regs->error_code = vmcb->exitinfo1;
3054 if ( !svm_do_nested_pgfault(vmcb->exitinfo2, regs) ) {
3055 domain_crash(v->domain);
3057 break;
3059 default:
3060 exit_and_crash:
3061 gdprintk(XENLOG_ERR, "unexpected VMEXIT: exit reason = 0x%x, "
3062 "exitinfo1 = %"PRIx64", exitinfo2 = %"PRIx64"\n",
3063 exit_reason,
3064 (u64)vmcb->exitinfo1, (u64)vmcb->exitinfo2);
3065 domain_crash(v->domain);
3066 break;
3069 #ifdef SVM_EXTRA_DEBUG
3070 if (do_debug)
3072 printk("%s: Done switch on vmexit_code\n", __func__);
3073 svm_dump_regs(__func__, regs);
3076 if (do_debug)
3078 printk("vmexit_handler():- guest_table = 0x%08x, "
3079 "monitor_table = 0x%08x, hw_cr3 = 0x%16x\n",
3080 (int)v->arch.guest_table.pfn,
3081 (int)v->arch.monitor_table.pfn,
3082 (int)v->arch.hvm_vcpu.hw_cr3);
3083 printk("svm_vmexit_handler: Returning\n");
3085 #endif
3088 asmlinkage void svm_load_cr2(void)
3090 struct vcpu *v = current;
3092 // this is the last C code before the VMRUN instruction
3093 HVMTRACE_0D(VMENTRY, v);
3095 local_irq_disable();
3096 asm volatile("mov %0,%%cr2": :"r" (v->arch.hvm_svm.cpu_cr2));
3099 /*
3100 * Local variables:
3101 * mode: C
3102 * c-set-style: "BSD"
3103 * c-basic-offset: 4
3104 * tab-width: 4
3105 * indent-tabs-mode: nil
3106 * End:
3107 */