
view xen/arch/x86/hvm/svm/svm.c @ 14437:a20e3ad50ae8

[HVM][SVM] Save/restore: save the guest's versions of CR0 and EFER
and recalculate the vmcb ones on restore
Signed-off-by: Tim Deegan <Tim.Deegan@xensource.com>
author Tim Deegan <Tim.Deegan@xensource.com>
date Fri Mar 16 11:41:10 2007 +0000 (2007-03-16)
parents 1721f90e1422
children ed1e4cc4a5b7
line source
1 /*
2 * svm.c: handling SVM architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 * Copyright (c) 2005, AMD Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
17 * Place - Suite 330, Boston, MA 02111-1307 USA.
18 *
19 */
21 #include <xen/config.h>
22 #include <xen/init.h>
23 #include <xen/lib.h>
24 #include <xen/trace.h>
25 #include <xen/sched.h>
26 #include <xen/irq.h>
27 #include <xen/softirq.h>
28 #include <xen/hypercall.h>
29 #include <xen/domain_page.h>
30 #include <asm/current.h>
31 #include <asm/io.h>
32 #include <asm/paging.h>
33 #include <asm/p2m.h>
34 #include <asm/regs.h>
35 #include <asm/cpufeature.h>
36 #include <asm/processor.h>
37 #include <asm/types.h>
38 #include <asm/msr.h>
39 #include <asm/spinlock.h>
40 #include <asm/hvm/hvm.h>
41 #include <asm/hvm/support.h>
42 #include <asm/hvm/io.h>
43 #include <asm/hvm/svm/svm.h>
44 #include <asm/hvm/svm/vmcb.h>
45 #include <asm/hvm/svm/emulate.h>
46 #include <asm/hvm/svm/intr.h>
47 #include <asm/x86_emulate.h>
48 #include <public/sched.h>
49 #include <asm/hvm/vpt.h>
50 #include <asm/hvm/trace.h>
51 #include <asm/hap.h>
53 #define SVM_EXTRA_DEBUG
55 #define set_segment_register(name, value) \
56 __asm__ __volatile__ ( "movw %%ax ,%%" STR(name) "" : : "a" (value) )
58 /* External functions. We should move these to some suitable header file(s) */
60 extern int inst_copy_from_guest(unsigned char *buf, unsigned long guest_eip,
61 int inst_len);
62 extern asmlinkage void do_IRQ(struct cpu_user_regs *);
63 extern void svm_dump_inst(unsigned long eip);
64 extern int svm_dbg_on;
65 void svm_dump_regs(const char *from, struct cpu_user_regs *regs);
67 static int svm_do_vmmcall_reset_to_realmode(struct vcpu *v,
68 struct cpu_user_regs *regs);
70 /* va of hardware host save area */
71 static void *hsa[NR_CPUS] __read_mostly;
73 /* vmcb used for extended host state */
74 static void *root_vmcb[NR_CPUS] __read_mostly;
76 /* physical address of above for host VMSAVE/VMLOAD */
77 u64 root_vmcb_pa[NR_CPUS] __read_mostly;
79 /* hardware assisted paging bits */
80 extern int opt_hap_enabled;
81 extern int hap_capable_system;
83 static inline void svm_inject_exception(struct vcpu *v, int trap,
84 int ev, int error_code)
85 {
86 eventinj_t event;
87 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
89 if ( trap == TRAP_page_fault )
90 HVMTRACE_2D(PF_INJECT, v, v->arch.hvm_svm.cpu_cr2, error_code);
91 else
92 HVMTRACE_2D(INJ_EXC, v, trap, error_code);
94 event.bytes = 0;
95 event.fields.v = 1;
96 event.fields.type = EVENTTYPE_EXCEPTION;
97 event.fields.vector = trap;
98 event.fields.ev = ev;
99 event.fields.errorcode = error_code;
101 ASSERT(vmcb->eventinj.fields.v == 0);
103 vmcb->eventinj = event;
104 }
106 static void stop_svm(void)
107 {
108 u32 eax, edx;
109 /* We turn off the EFER_SVME bit. */
110 rdmsr(MSR_EFER, eax, edx);
111 eax &= ~EFER_SVME;
112 wrmsr(MSR_EFER, eax, edx);
113 }
115 static void svm_store_cpu_guest_regs(
116 struct vcpu *v, struct cpu_user_regs *regs, unsigned long *crs)
117 {
118 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
120 if ( regs != NULL )
121 {
122 regs->eip = vmcb->rip;
123 regs->esp = vmcb->rsp;
124 regs->eflags = vmcb->rflags;
125 regs->cs = vmcb->cs.sel;
126 regs->ds = vmcb->ds.sel;
127 regs->es = vmcb->es.sel;
128 regs->ss = vmcb->ss.sel;
129 regs->gs = vmcb->gs.sel;
130 regs->fs = vmcb->fs.sel;
131 }
133 if ( crs != NULL )
134 {
135 /* Returning the guest's regs */
136 crs[0] = v->arch.hvm_svm.cpu_shadow_cr0;
137 crs[2] = v->arch.hvm_svm.cpu_cr2;
138 crs[3] = v->arch.hvm_svm.cpu_cr3;
139 crs[4] = v->arch.hvm_svm.cpu_shadow_cr4;
140 }
141 }
144 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
145 {
146 u64 msr_content = 0;
147 struct vcpu *v = current;
148 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
150 switch ((u32)regs->ecx)
151 {
152 case MSR_EFER:
153 msr_content = v->arch.hvm_svm.cpu_shadow_efer;
154 break;
156 #ifdef __x86_64__
157 case MSR_FS_BASE:
158 msr_content = vmcb->fs.base;
159 goto check_long_mode;
161 case MSR_GS_BASE:
162 msr_content = vmcb->gs.base;
163 goto check_long_mode;
165 case MSR_SHADOW_GS_BASE:
166 msr_content = vmcb->kerngsbase;
167 check_long_mode:
168 if ( !svm_long_mode_enabled(v) )
169 {
170 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
171 return 0;
172 }
173 break;
174 #endif
176 case MSR_STAR:
177 msr_content = vmcb->star;
178 break;
180 case MSR_LSTAR:
181 msr_content = vmcb->lstar;
182 break;
184 case MSR_CSTAR:
185 msr_content = vmcb->cstar;
186 break;
188 case MSR_SYSCALL_MASK:
189 msr_content = vmcb->sfmask;
190 break;
191 default:
192 return 0;
193 }
195 HVM_DBG_LOG(DBG_LEVEL_2, "msr_content: %"PRIx64"\n",
196 msr_content);
198 regs->eax = (u32)(msr_content >> 0);
199 regs->edx = (u32)(msr_content >> 32);
200 return 1;
201 }
203 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
204 {
205 u64 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
206 u32 ecx = regs->ecx;
207 struct vcpu *v = current;
208 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
210 HVM_DBG_LOG(DBG_LEVEL_1, "msr %x msr_content %"PRIx64"\n",
211 ecx, msr_content);
213 switch ( ecx )
214 {
215 case MSR_EFER:
216 /* offending reserved bit will cause #GP */
217 if ( msr_content & ~(EFER_LME | EFER_LMA | EFER_NX | EFER_SCE) )
218 {
219 gdprintk(XENLOG_WARNING, "Trying to set reserved bit in "
220 "EFER: %"PRIx64"\n", msr_content);
221 goto gp_fault;
222 }
224 /*
225 * update the VMCB's EFER with the intended value along with
226 * that crucial EFER.SVME bit =)
227 */
228 vmcb->efer = msr_content | EFER_SVME;
230 #ifdef __x86_64__
232 /*
233 * Check for EFER.LME transitions from 0->1 or 1->0. Do the
234 * sanity checks and then make sure that both EFER.LME and
235 * EFER.LMA are cleared. (EFER.LME can't be set in the vmcb
236 * until the guest also sets CR0.PG, since even if the guest has
237 * paging "disabled", the vmcb's CR0 always has PG set.)
238 */
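/* (Note, for cross-reference: the transition into long mode proper is
 * completed later, when the guest sets CR0.PG; svm_set_cr0()/npt_set_cr0()
 * below then set EFER.LME and EFER.LMA in the vmcb and EFER.LMA in the
 * shadow EFER.) */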
239 if ( (msr_content & EFER_LME) && !svm_lme_is_set(v) )
240 {
241 /* EFER.LME transition from 0 to 1 */
243 if ( svm_paging_enabled(v) ||
244 !svm_cr4_pae_is_set(v) )
245 {
246 gdprintk(XENLOG_WARNING, "Trying to set LME bit when "
247 "in paging mode or PAE bit is not set\n");
248 goto gp_fault;
249 }
251 vmcb->efer &= ~(EFER_LME | EFER_LMA);
252 }
253 else if ( !(msr_content & EFER_LME) && svm_lme_is_set(v) )
254 {
255 /* EFER.LME transition from 1 to 0 */
257 if ( svm_paging_enabled(v) )
258 {
259 gdprintk(XENLOG_WARNING,
260 "Trying to clear EFER.LME while paging enabled\n");
261 goto gp_fault;
262 }
264 vmcb->efer &= ~(EFER_LME | EFER_LMA);
265 }
267 #endif /* __x86_64__ */
269 /* update the guest EFER's shadow with the intended value */
270 v->arch.hvm_svm.cpu_shadow_efer = msr_content;
272 break;
274 #ifdef __x86_64__
275 case MSR_FS_BASE:
276 case MSR_GS_BASE:
277 case MSR_SHADOW_GS_BASE:
278 if ( !svm_long_mode_enabled(v) )
279 goto gp_fault;
281 if ( !is_canonical_address(msr_content) )
282 goto uncanonical_address;
284 if ( ecx == MSR_FS_BASE )
285 vmcb->fs.base = msr_content;
286 else if ( ecx == MSR_GS_BASE )
287 vmcb->gs.base = msr_content;
288 else
289 vmcb->kerngsbase = msr_content;
290 break;
291 #endif
293 case MSR_STAR:
294 vmcb->star = msr_content;
295 break;
297 case MSR_LSTAR:
298 case MSR_CSTAR:
299 if ( !is_canonical_address(msr_content) )
300 goto uncanonical_address;
302 if ( ecx == MSR_LSTAR )
303 vmcb->lstar = msr_content;
304 else
305 vmcb->cstar = msr_content;
306 break;
308 case MSR_SYSCALL_MASK:
309 vmcb->sfmask = msr_content;
310 break;
312 default:
313 return 0;
314 }
316 return 1;
318 uncanonical_address:
319 HVM_DBG_LOG(DBG_LEVEL_1, "Non-canonical address for MSR write %x\n", ecx);
320 gp_fault:
321 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
322 return 0;
323 }
326 #define loaddebug(_v,_reg) \
327 __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
328 #define savedebug(_v,_reg) \
329 __asm__ __volatile__ ("mov %%db" #_reg ",%0" : "=r" ((_v)->debugreg[_reg]))
331 static inline void svm_save_dr(struct vcpu *v)
332 {
333 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
335 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
336 return;
338 /* Clear the DR dirty flag and re-enable intercepts for DR accesses. */
339 v->arch.hvm_vcpu.flag_dr_dirty = 0;
340 v->arch.hvm_svm.vmcb->dr_intercepts = DR_INTERCEPT_ALL_WRITES;
342 savedebug(&v->arch.guest_context, 0);
343 savedebug(&v->arch.guest_context, 1);
344 savedebug(&v->arch.guest_context, 2);
345 savedebug(&v->arch.guest_context, 3);
346 v->arch.guest_context.debugreg[6] = vmcb->dr6;
347 v->arch.guest_context.debugreg[7] = vmcb->dr7;
348 }
351 static inline void __restore_debug_registers(struct vcpu *v)
352 {
353 loaddebug(&v->arch.guest_context, 0);
354 loaddebug(&v->arch.guest_context, 1);
355 loaddebug(&v->arch.guest_context, 2);
356 loaddebug(&v->arch.guest_context, 3);
357 /* DR6 and DR7 are loaded from the VMCB. */
358 }
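/* Save/restore of the guest-visible CPU state.  CR0, CR4 and EFER are
 * saved from the guest's shadow copies (cpu_shadow_*) rather than from
 * the vmcb; the vmcb versions are recalculated on restore. */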
361 int svm_vmcb_save(struct vcpu *v, struct hvm_hw_cpu *c)
362 {
363 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
365 c->eip = vmcb->rip;
367 #ifdef HVM_DEBUG_SUSPEND
368 printk("%s: eip=0x%"PRIx64".\n",
369 __func__,
370 c->eip);
371 #endif
373 c->esp = vmcb->rsp;
374 c->eflags = vmcb->rflags;
376 c->cr0 = v->arch.hvm_svm.cpu_shadow_cr0;
377 c->cr3 = v->arch.hvm_svm.cpu_cr3;
378 c->cr4 = v->arch.hvm_svm.cpu_shadow_cr4;
380 #ifdef HVM_DEBUG_SUSPEND
381 printk("%s: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
382 __func__,
383 c->cr3,
384 c->cr0,
385 c->cr4);
386 #endif
388 c->idtr_limit = vmcb->idtr.limit;
389 c->idtr_base = vmcb->idtr.base;
391 c->gdtr_limit = vmcb->gdtr.limit;
392 c->gdtr_base = vmcb->gdtr.base;
394 c->cs_sel = vmcb->cs.sel;
395 c->cs_limit = vmcb->cs.limit;
396 c->cs_base = vmcb->cs.base;
397 c->cs_arbytes = vmcb->cs.attr.bytes;
399 c->ds_sel = vmcb->ds.sel;
400 c->ds_limit = vmcb->ds.limit;
401 c->ds_base = vmcb->ds.base;
402 c->ds_arbytes = vmcb->ds.attr.bytes;
404 c->es_sel = vmcb->es.sel;
405 c->es_limit = vmcb->es.limit;
406 c->es_base = vmcb->es.base;
407 c->es_arbytes = vmcb->es.attr.bytes;
409 c->ss_sel = vmcb->ss.sel;
410 c->ss_limit = vmcb->ss.limit;
411 c->ss_base = vmcb->ss.base;
412 c->ss_arbytes = vmcb->ss.attr.bytes;
414 c->fs_sel = vmcb->fs.sel;
415 c->fs_limit = vmcb->fs.limit;
416 c->fs_base = vmcb->fs.base;
417 c->fs_arbytes = vmcb->fs.attr.bytes;
419 c->gs_sel = vmcb->gs.sel;
420 c->gs_limit = vmcb->gs.limit;
421 c->gs_base = vmcb->gs.base;
422 c->gs_arbytes = vmcb->gs.attr.bytes;
424 c->tr_sel = vmcb->tr.sel;
425 c->tr_limit = vmcb->tr.limit;
426 c->tr_base = vmcb->tr.base;
427 c->tr_arbytes = vmcb->tr.attr.bytes;
429 c->ldtr_sel = vmcb->ldtr.sel;
430 c->ldtr_limit = vmcb->ldtr.limit;
431 c->ldtr_base = vmcb->ldtr.base;
432 c->ldtr_arbytes = vmcb->ldtr.attr.bytes;
434 c->sysenter_cs = vmcb->sysenter_cs;
435 c->sysenter_esp = vmcb->sysenter_esp;
436 c->sysenter_eip = vmcb->sysenter_eip;
438 return 1;
439 }
442 int svm_vmcb_restore(struct vcpu *v, struct hvm_hw_cpu *c)
443 {
444 unsigned long mfn, old_base_mfn;
445 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
447 vmcb->rip = c->eip;
448 vmcb->rsp = c->esp;
449 vmcb->rflags = c->eflags;
451 v->arch.hvm_svm.cpu_shadow_cr0 = c->cr0;
452 vmcb->cr0 = c->cr0 | X86_CR0_WP | X86_CR0_ET;
453 if ( !paging_mode_hap(v->domain) )
454 vmcb->cr0 |= X86_CR0_PG;
456 #ifdef HVM_DEBUG_SUSPEND
457 printk("%s: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
458 __func__,
459 c->cr3,
460 c->cr0,
461 c->cr4);
462 #endif
464 if ( !svm_paging_enabled(v) )
465 {
466 printk("%s: paging not enabled.\n", __func__);
467 goto skip_cr3;
468 }
470 if ( c->cr3 == v->arch.hvm_svm.cpu_cr3 )
471 {
472 /*
473 * This is a simple TLB flush, implying the guest has
474 * removed some translation or changed page attributes.
475 * We simply invalidate the shadow.
476 */
477 mfn = gmfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT);
478 if ( mfn != pagetable_get_pfn(v->arch.guest_table) )
479 goto bad_cr3;
480 }
481 else
482 {
483 /*
484 * If different, make a shadow. Check if the PDBR is valid
485 * first.
486 */
487 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %"PRIx64"", c->cr3);
488 /* current != vcpu, as this is not called by arch_svm_do_launch */
489 mfn = gmfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT);
490 if( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
491 goto bad_cr3;
493 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
494 v->arch.guest_table = pagetable_from_pfn(mfn);
495 if (old_base_mfn)
496 put_page(mfn_to_page(old_base_mfn));
497 v->arch.hvm_svm.cpu_cr3 = c->cr3;
498 }
500 skip_cr3:
501 vmcb->cr4 = c->cr4 | SVM_CR4_HOST_MASK;
502 v->arch.hvm_svm.cpu_shadow_cr4 = c->cr4;
504 vmcb->idtr.limit = c->idtr_limit;
505 vmcb->idtr.base = c->idtr_base;
507 vmcb->gdtr.limit = c->gdtr_limit;
508 vmcb->gdtr.base = c->gdtr_base;
510 vmcb->cs.sel = c->cs_sel;
511 vmcb->cs.limit = c->cs_limit;
512 vmcb->cs.base = c->cs_base;
513 vmcb->cs.attr.bytes = c->cs_arbytes;
515 vmcb->ds.sel = c->ds_sel;
516 vmcb->ds.limit = c->ds_limit;
517 vmcb->ds.base = c->ds_base;
518 vmcb->ds.attr.bytes = c->ds_arbytes;
520 vmcb->es.sel = c->es_sel;
521 vmcb->es.limit = c->es_limit;
522 vmcb->es.base = c->es_base;
523 vmcb->es.attr.bytes = c->es_arbytes;
525 vmcb->ss.sel = c->ss_sel;
526 vmcb->ss.limit = c->ss_limit;
527 vmcb->ss.base = c->ss_base;
528 vmcb->ss.attr.bytes = c->ss_arbytes;
530 vmcb->fs.sel = c->fs_sel;
531 vmcb->fs.limit = c->fs_limit;
532 vmcb->fs.base = c->fs_base;
533 vmcb->fs.attr.bytes = c->fs_arbytes;
535 vmcb->gs.sel = c->gs_sel;
536 vmcb->gs.limit = c->gs_limit;
537 vmcb->gs.base = c->gs_base;
538 vmcb->gs.attr.bytes = c->gs_arbytes;
540 vmcb->tr.sel = c->tr_sel;
541 vmcb->tr.limit = c->tr_limit;
542 vmcb->tr.base = c->tr_base;
543 vmcb->tr.attr.bytes = c->tr_arbytes;
545 vmcb->ldtr.sel = c->ldtr_sel;
546 vmcb->ldtr.limit = c->ldtr_limit;
547 vmcb->ldtr.base = c->ldtr_base;
548 vmcb->ldtr.attr.bytes = c->ldtr_arbytes;
550 vmcb->sysenter_cs = c->sysenter_cs;
551 vmcb->sysenter_esp = c->sysenter_esp;
552 vmcb->sysenter_eip = c->sysenter_eip;
554 paging_update_paging_modes(v);
555 return 0;
557 bad_cr3:
558 gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%"PRIx64"", c->cr3);
559 return -EINVAL;
560 }
563 void svm_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
564 {
565 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
567 data->shadow_gs = vmcb->kerngsbase;
568 data->msr_lstar = vmcb->lstar;
569 data->msr_star = vmcb->star;
570 data->msr_cstar = vmcb->cstar;
571 data->msr_syscall_mask = vmcb->sfmask;
572 data->msr_efer = v->arch.hvm_svm.cpu_shadow_efer;
574 data->tsc = hvm_get_guest_time(v);
575 }
578 void svm_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
579 {
580 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
582 vmcb->kerngsbase = data->shadow_gs;
583 vmcb->lstar = data->msr_lstar;
584 vmcb->star = data->msr_star;
585 vmcb->cstar = data->msr_cstar;
586 vmcb->sfmask = data->msr_syscall_mask;
587 v->arch.hvm_svm.cpu_shadow_efer = data->msr_efer;
588 vmcb->efer = data->msr_efer | EFER_SVME;
589 /* VMCB's EFER.LME isn't set unless we're actually in long mode
590 * (see long_mode_do_msr_write()) */
591 if ( !(vmcb->efer & EFER_LMA) )
592 vmcb->efer &= ~EFER_LME;
594 hvm_set_guest_time(v, data->tsc);
595 }
597 void svm_save_vmcb_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
598 {
599 svm_save_cpu_state(v, ctxt);
600 svm_vmcb_save(v, ctxt);
601 }
603 int svm_load_vmcb_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
604 {
605 svm_load_cpu_state(v, ctxt);
606 if (svm_vmcb_restore(v, ctxt)) {
607 printk("svm_vmcb_restore failed!\n");
608 domain_crash(v->domain);
609 return -EINVAL;
610 }
612 return 0;
613 }
616 static inline void svm_restore_dr(struct vcpu *v)
617 {
618 if ( unlikely(v->arch.guest_context.debugreg[7] & 0xFF) )
619 __restore_debug_registers(v);
620 }
623 static int svm_realmode(struct vcpu *v)
624 {
625 unsigned long cr0 = v->arch.hvm_svm.cpu_shadow_cr0;
626 unsigned long eflags = v->arch.hvm_svm.vmcb->rflags;
628 return (eflags & X86_EFLAGS_VM) || !(cr0 & X86_CR0_PE);
629 }
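/* Guest execution mode, expressed as the default operand size in bytes:
 * 8 for 64-bit long mode, 4 for 32-bit protected mode, 2 for 16-bit
 * protected, virtual-8086 or real mode. */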
631 static int svm_guest_x86_mode(struct vcpu *v)
632 {
633 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
635 if ( svm_long_mode_enabled(v) && vmcb->cs.attr.fields.l )
636 return 8;
638 if ( svm_realmode(v) )
639 return 2;
641 return (vmcb->cs.attr.fields.db ? 4 : 2);
642 }
644 void svm_update_host_cr3(struct vcpu *v)
645 {
646 /* SVM doesn't have a HOST_CR3 equivalent to update. */
647 }
649 void svm_update_guest_cr3(struct vcpu *v)
650 {
651 v->arch.hvm_svm.vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3;
652 }
654 static void svm_update_vtpr(struct vcpu *v, unsigned long value)
655 {
656 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
658 vmcb->vintr.fields.tpr = value & 0x0f;
659 }
661 unsigned long svm_get_ctrl_reg(struct vcpu *v, unsigned int num)
662 {
663 switch ( num )
664 {
665 case 0:
666 return v->arch.hvm_svm.cpu_shadow_cr0;
667 case 2:
668 return v->arch.hvm_svm.cpu_cr2;
669 case 3:
670 return v->arch.hvm_svm.cpu_cr3;
671 case 4:
672 return v->arch.hvm_svm.cpu_shadow_cr4;
673 default:
674 BUG();
675 }
676 return 0; /* dummy */
677 }
679 static unsigned long svm_get_segment_base(struct vcpu *v, enum x86_segment seg)
680 {
681 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
682 int long_mode = 0;
684 #ifdef __x86_64__
685 long_mode = vmcb->cs.attr.fields.l && svm_long_mode_enabled(v);
686 #endif
687 switch ( seg )
688 {
689 case x86_seg_cs: return long_mode ? 0 : vmcb->cs.base;
690 case x86_seg_ds: return long_mode ? 0 : vmcb->ds.base;
691 case x86_seg_es: return long_mode ? 0 : vmcb->es.base;
692 case x86_seg_fs: return vmcb->fs.base;
693 case x86_seg_gs: return vmcb->gs.base;
694 case x86_seg_ss: return long_mode ? 0 : vmcb->ss.base;
695 case x86_seg_tr: return vmcb->tr.base;
696 case x86_seg_gdtr: return vmcb->gdtr.base;
697 case x86_seg_idtr: return vmcb->idtr.base;
698 case x86_seg_ldtr: return vmcb->ldtr.base;
699 }
700 BUG();
701 return 0;
702 }
704 static void svm_get_segment_register(struct vcpu *v, enum x86_segment seg,
705 struct segment_register *reg)
706 {
707 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
708 switch ( seg )
709 {
710 case x86_seg_cs: memcpy(reg, &vmcb->cs, sizeof(*reg)); break;
711 case x86_seg_ds: memcpy(reg, &vmcb->ds, sizeof(*reg)); break;
712 case x86_seg_es: memcpy(reg, &vmcb->es, sizeof(*reg)); break;
713 case x86_seg_fs: memcpy(reg, &vmcb->fs, sizeof(*reg)); break;
714 case x86_seg_gs: memcpy(reg, &vmcb->gs, sizeof(*reg)); break;
715 case x86_seg_ss: memcpy(reg, &vmcb->ss, sizeof(*reg)); break;
716 case x86_seg_tr: memcpy(reg, &vmcb->tr, sizeof(*reg)); break;
717 case x86_seg_gdtr: memcpy(reg, &vmcb->gdtr, sizeof(*reg)); break;
718 case x86_seg_idtr: memcpy(reg, &vmcb->idtr, sizeof(*reg)); break;
719 case x86_seg_ldtr: memcpy(reg, &vmcb->ldtr, sizeof(*reg)); break;
720 default: BUG();
721 }
722 }
724 /* Make sure that xen intercepts any FP accesses from current */
725 static void svm_stts(struct vcpu *v)
726 {
727 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
729 /*
730 * If the guest does not have TS enabled then we must cause and handle an
731 * exception on first use of the FPU. If the guest *does* have TS enabled
732 * then this is not necessary: no FPU activity can occur until the guest
733 * clears CR0.TS, and we will initialise the FPU when that happens.
734 */
735 if ( !(v->arch.hvm_svm.cpu_shadow_cr0 & X86_CR0_TS) )
736 {
737 v->arch.hvm_svm.vmcb->exception_intercepts |= EXCEPTION_BITMAP_NM;
738 vmcb->cr0 |= X86_CR0_TS;
739 }
740 }
743 static void svm_set_tsc_offset(struct vcpu *v, u64 offset)
744 {
745 v->arch.hvm_svm.vmcb->tsc_offset = offset;
746 }
749 static void svm_init_ap_context(
750 struct vcpu_guest_context *ctxt, int vcpuid, int trampoline_vector)
751 {
752 memset(ctxt, 0, sizeof(*ctxt));
754 /*
755 * The trampoline code is executed in real mode. The trampoline vector
756 * passed to us is page aligned and gives the physical frame number of
757 * the code.
758 */
759 ctxt->user_regs.eip = 0x0;
760 ctxt->user_regs.cs = (trampoline_vector << 8);
761 }
763 static void svm_init_hypercall_page(struct domain *d, void *hypercall_page)
764 {
765 char *p;
766 int i;
768 memset(hypercall_page, 0, PAGE_SIZE);
770 for ( i = 0; i < (PAGE_SIZE / 32); i++ )
771 {
772 p = (char *)(hypercall_page + (i * 32));
773 *(u8 *)(p + 0) = 0xb8; /* mov imm32, %eax */
774 *(u32 *)(p + 1) = i;
775 *(u8 *)(p + 5) = 0x0f; /* vmmcall */
776 *(u8 *)(p + 6) = 0x01;
777 *(u8 *)(p + 7) = 0xd9;
778 *(u8 *)(p + 8) = 0xc3; /* ret */
779 }
781 /* Don't support HYPERVISOR_iret at the moment */
782 *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
783 }
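/* For reference, each 32-byte stub written above decodes roughly as:
 *
 *     b8 xx xx xx xx    mov  $<hypercall number>, %eax
 *     0f 01 d9          vmmcall
 *     c3                ret
 *
 * and the HYPERVISOR_iret slot is overwritten with ud2 (0f 0b). */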
786 int svm_dbg_on = 0;
788 static inline int svm_do_debugout(unsigned long exit_code)
789 {
790 int i;
792 static unsigned long counter = 0;
793 static unsigned long works[] =
794 {
795 VMEXIT_IOIO,
796 VMEXIT_HLT,
797 VMEXIT_CPUID,
798 VMEXIT_DR0_READ,
799 VMEXIT_DR1_READ,
800 VMEXIT_DR2_READ,
801 VMEXIT_DR3_READ,
802 VMEXIT_DR6_READ,
803 VMEXIT_DR7_READ,
804 VMEXIT_DR0_WRITE,
805 VMEXIT_DR1_WRITE,
806 VMEXIT_DR2_WRITE,
807 VMEXIT_DR3_WRITE,
808 VMEXIT_CR0_READ,
809 VMEXIT_CR0_WRITE,
810 VMEXIT_CR3_READ,
811 VMEXIT_CR4_READ,
812 VMEXIT_MSR,
813 VMEXIT_CR0_WRITE,
814 VMEXIT_CR3_WRITE,
815 VMEXIT_CR4_WRITE,
816 VMEXIT_EXCEPTION_PF,
817 VMEXIT_INTR,
818 VMEXIT_INVLPG,
819 VMEXIT_EXCEPTION_NM
820 };
823 #if 0
824 if (svm_dbg_on && exit_code != 0x7B)
825 return 1;
826 #endif
828 counter++;
830 #if 0
831 if ((exit_code == 0x4E
832 || exit_code == VMEXIT_CR0_READ
833 || exit_code == VMEXIT_CR0_WRITE)
834 && counter < 200000)
835 return 0;
837 if ((exit_code == 0x4E) && counter < 500000)
838 return 0;
839 #endif
841 for (i = 0; i < sizeof(works) / sizeof(works[0]); i++)
842 if (exit_code == works[i])
843 return 0;
845 return 1;
846 }
848 static void save_svm_cpu_user_regs(struct vcpu *v, struct cpu_user_regs *ctxt)
849 {
850 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
852 ASSERT(vmcb);
854 ctxt->eax = vmcb->rax;
855 ctxt->ss = vmcb->ss.sel;
856 ctxt->esp = vmcb->rsp;
857 ctxt->eflags = vmcb->rflags;
858 ctxt->cs = vmcb->cs.sel;
859 ctxt->eip = vmcb->rip;
861 ctxt->gs = vmcb->gs.sel;
862 ctxt->fs = vmcb->fs.sel;
863 ctxt->es = vmcb->es.sel;
864 ctxt->ds = vmcb->ds.sel;
865 }
867 static void svm_store_cpu_user_regs(struct cpu_user_regs *regs, struct vcpu *v)
868 {
869 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
871 regs->eip = vmcb->rip;
872 regs->esp = vmcb->rsp;
873 regs->eflags = vmcb->rflags;
874 regs->cs = vmcb->cs.sel;
875 regs->ds = vmcb->ds.sel;
876 regs->es = vmcb->es.sel;
877 regs->ss = vmcb->ss.sel;
878 }
880 /* XXX Use svm_load_cpu_guest_regs instead */
881 static void svm_load_cpu_user_regs(struct vcpu *v, struct cpu_user_regs *regs)
882 {
883 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
884 u32 *intercepts = &v->arch.hvm_svm.vmcb->exception_intercepts;
886 /* Write the guest register value into VMCB */
887 vmcb->rax = regs->eax;
888 vmcb->ss.sel = regs->ss;
889 vmcb->rsp = regs->esp;
890 vmcb->rflags = regs->eflags | 2UL;
891 vmcb->cs.sel = regs->cs;
892 vmcb->rip = regs->eip;
893 if (regs->eflags & EF_TF)
894 *intercepts |= EXCEPTION_BITMAP_DB;
895 else
896 *intercepts &= ~EXCEPTION_BITMAP_DB;
897 }
899 static void svm_load_cpu_guest_regs(
900 struct vcpu *v, struct cpu_user_regs *regs)
901 {
902 svm_load_cpu_user_regs(v, regs);
903 }
905 static void arch_svm_do_launch(struct vcpu *v)
906 {
907 svm_do_launch(v);
909 if ( paging_mode_hap(v->domain) ) {
910 v->arch.hvm_svm.vmcb->h_cr3 = pagetable_get_paddr(v->domain->arch.phys_table);
911 }
913 if ( v->vcpu_id != 0 )
914 {
915 cpu_user_regs_t *regs = &current->arch.guest_context.user_regs;
916 u16 cs_sel = regs->cs;
917 /*
918 * This is the launch of an AP; set state so that we begin executing
919 * the trampoline code in real-mode.
920 */
921 svm_do_vmmcall_reset_to_realmode(v, regs);
922 /* Adjust the state to execute the trampoline code.*/
923 v->arch.hvm_svm.vmcb->rip = 0;
924 v->arch.hvm_svm.vmcb->cs.sel= cs_sel;
925 v->arch.hvm_svm.vmcb->cs.base = (cs_sel << 4);
926 }
928 reset_stack_and_jump(svm_asm_do_launch);
929 }
931 static void svm_ctxt_switch_from(struct vcpu *v)
932 {
933 svm_save_dr(v);
934 }
936 static void svm_ctxt_switch_to(struct vcpu *v)
937 {
938 #ifdef __x86_64__
939 /*
940 * This is required because VMRUN performs consistency checks
941 * and some of the DOM0 selectors point to invalid GDT
942 * locations, which would cause AMD processors to shut
943 * down.
944 */
945 set_segment_register(ds, 0);
946 set_segment_register(es, 0);
947 set_segment_register(ss, 0);
948 #endif
949 svm_restore_dr(v);
950 }
952 static int svm_vcpu_initialise(struct vcpu *v)
953 {
954 int rc;
956 v->arch.schedule_tail = arch_svm_do_launch;
957 v->arch.ctxt_switch_from = svm_ctxt_switch_from;
958 v->arch.ctxt_switch_to = svm_ctxt_switch_to;
960 v->arch.hvm_svm.saved_irq_vector = -1;
962 if ( (rc = svm_create_vmcb(v)) != 0 )
963 {
964 dprintk(XENLOG_WARNING,
965 "Failed to create VMCB for vcpu %d: err=%d.\n",
966 v->vcpu_id, rc);
967 return rc;
968 }
970 return 0;
971 }
973 static void svm_vcpu_destroy(struct vcpu *v)
974 {
975 svm_destroy_vmcb(v);
976 }
978 static void svm_hvm_inject_exception(
979 unsigned int trapnr, int errcode, unsigned long cr2)
980 {
981 struct vcpu *v = current;
982 if ( trapnr == TRAP_page_fault )
983 v->arch.hvm_svm.vmcb->cr2 = v->arch.hvm_svm.cpu_cr2 = cr2;
984 svm_inject_exception(v, trapnr, (errcode != -1), errcode);
985 }
987 static int svm_event_injection_faulted(struct vcpu *v)
988 {
989 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
990 return vmcb->exitintinfo.fields.v;
991 }
993 static struct hvm_function_table svm_function_table = {
994 .disable = stop_svm,
995 .vcpu_initialise = svm_vcpu_initialise,
996 .vcpu_destroy = svm_vcpu_destroy,
997 .store_cpu_guest_regs = svm_store_cpu_guest_regs,
998 .load_cpu_guest_regs = svm_load_cpu_guest_regs,
999 .save_cpu_ctxt = svm_save_vmcb_ctxt,
1000 .load_cpu_ctxt = svm_load_vmcb_ctxt,
1001 .paging_enabled = svm_paging_enabled,
1002 .long_mode_enabled = svm_long_mode_enabled,
1003 .pae_enabled = svm_pae_enabled,
1004 .guest_x86_mode = svm_guest_x86_mode,
1005 .get_guest_ctrl_reg = svm_get_ctrl_reg,
1006 .get_segment_base = svm_get_segment_base,
1007 .get_segment_register = svm_get_segment_register,
1008 .update_host_cr3 = svm_update_host_cr3,
1009 .update_guest_cr3 = svm_update_guest_cr3,
1010 .update_vtpr = svm_update_vtpr,
1011 .stts = svm_stts,
1012 .set_tsc_offset = svm_set_tsc_offset,
1013 .inject_exception = svm_hvm_inject_exception,
1014 .init_ap_context = svm_init_ap_context,
1015 .init_hypercall_page = svm_init_hypercall_page,
1016 .event_injection_faulted = svm_event_injection_faulted
1017 };
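/* The table above is registered with the common HVM layer via
 * hvm_enable() in start_svm() below. */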
1019 void svm_npt_detect(void)
1021 u32 eax, ebx, ecx, edx;
1023 /* check CPUID for nested paging support */
1024 cpuid(0x8000000A, &eax, &ebx, &ecx, &edx);
1025 if ( edx & 0x01 ) { /* nested paging */
1026 hap_capable_system = 1;
1028 else if ( opt_hap_enabled ) {
1029 printk(" nested paging is not supported by this CPU.\n");
1030 hap_capable_system = 0; /* no nested paging, we disable flag. */
1034 int start_svm(void)
1036 u32 eax, ecx, edx;
1037 u32 phys_hsa_lo, phys_hsa_hi;
1038 u64 phys_hsa;
1039 int cpu = smp_processor_id();
1041 /* Xen does not fill x86_capability words except 0. */
1042 ecx = cpuid_ecx(0x80000001);
1043 boot_cpu_data.x86_capability[5] = ecx;
1045 if (!(test_bit(X86_FEATURE_SVME, &boot_cpu_data.x86_capability)))
1046 return 0;
1048 /* check whether SVM feature is disabled in BIOS */
1049 rdmsr(MSR_K8_VM_CR, eax, edx);
1050 if ( eax & K8_VMCR_SVME_DISABLE )
1052 printk("AMD SVM Extension is disabled in BIOS.\n");
1053 return 0;
1056 if (!hsa[cpu])
1057 if (!(hsa[cpu] = alloc_host_save_area()))
1058 return 0;
1060 rdmsr(MSR_EFER, eax, edx);
1061 eax |= EFER_SVME;
1062 wrmsr(MSR_EFER, eax, edx);
1063 printk("AMD SVM Extension is enabled for cpu %d.\n", cpu );
1065 svm_npt_detect();
1067 /* Initialize the HSA for this core */
1068 phys_hsa = (u64) virt_to_maddr(hsa[cpu]);
1069 phys_hsa_lo = (u32) phys_hsa;
1070 phys_hsa_hi = (u32) (phys_hsa >> 32);
1071 wrmsr(MSR_K8_VM_HSAVE_PA, phys_hsa_lo, phys_hsa_hi);
1073 if (!root_vmcb[cpu])
1074 if (!(root_vmcb[cpu] = alloc_vmcb()))
1075 return 0;
1076 root_vmcb_pa[cpu] = virt_to_maddr(root_vmcb[cpu]);
1078 if (cpu == 0)
1079 setup_vmcb_dump();
1081 hvm_enable(&svm_function_table);
1083 return 1;
1086 void arch_svm_do_resume(struct vcpu *v)
1088 /* pinning VCPU to a different core? */
1089 if ( v->arch.hvm_svm.launch_core == smp_processor_id()) {
1090 hvm_do_resume( v );
1091 reset_stack_and_jump( svm_asm_do_resume );
1093 else {
1094 if (svm_dbg_on)
1095 printk("VCPU core pinned: %d to %d\n",
1096 v->arch.hvm_svm.launch_core, smp_processor_id() );
1097 v->arch.hvm_svm.launch_core = smp_processor_id();
1098 hvm_migrate_timers( v );
1099 hvm_do_resume( v );
1100 reset_stack_and_jump( svm_asm_do_resume );
1104 static int svm_do_nested_pgfault(paddr_t gpa, struct cpu_user_regs *regs)
1106 if (mmio_space(gpa)) {
1107 handle_mmio(gpa);
1108 return 1;
1111 /* We should not reach here. Otherwise, the P2M table is not correct. */
1112 return 0;
1116 static int svm_do_page_fault(unsigned long va, struct cpu_user_regs *regs)
1118 HVM_DBG_LOG(DBG_LEVEL_VMMU,
1119 "svm_do_page_fault = 0x%lx, eip = %lx, error_code = %lx",
1120 va, (unsigned long)current->arch.hvm_svm.vmcb->rip,
1121 (unsigned long)regs->error_code);
1122 return paging_fault(va, regs);
1126 static void svm_do_no_device_fault(struct vmcb_struct *vmcb)
1128 struct vcpu *v = current;
1130 setup_fpu(v);
1131 vmcb->exception_intercepts &= ~EXCEPTION_BITMAP_NM;
1133 if ( !(v->arch.hvm_svm.cpu_shadow_cr0 & X86_CR0_TS) )
1134 vmcb->cr0 &= ~X86_CR0_TS;
1138 static void svm_do_general_protection_fault(struct vcpu *v,
1139 struct cpu_user_regs *regs)
1141 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1142 unsigned long eip, error_code;
1144 ASSERT(vmcb);
1146 eip = vmcb->rip;
1147 error_code = vmcb->exitinfo1;
1149 if (vmcb->idtr.limit == 0) {
1150 printk("Huh? We got a GP Fault with an invalid IDTR!\n");
1151 svm_dump_vmcb(__func__, vmcb);
1152 svm_dump_regs(__func__, regs);
1153 svm_dump_inst(svm_rip2pointer(v));
1154 domain_crash(v->domain);
1155 return;
1158 HVM_DBG_LOG(DBG_LEVEL_1,
1159 "svm_general_protection_fault: eip = %lx, erro_code = %lx",
1160 eip, error_code);
1162 HVM_DBG_LOG(DBG_LEVEL_1,
1163 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
1164 (unsigned long)regs->eax, (unsigned long)regs->ebx,
1165 (unsigned long)regs->ecx, (unsigned long)regs->edx,
1166 (unsigned long)regs->esi, (unsigned long)regs->edi);
1168 /* Reflect it back into the guest */
1169 svm_inject_exception(v, TRAP_gp_fault, 1, error_code);
1172 /* Reserved bits ECX: [31:14], [12:4], [2:1]*/
1173 #define SVM_VCPU_CPUID_L1_ECX_RESERVED 0xffffdff6
1174 /* Reserved bits EDX: [31:29], [27], [22:20], [18], [10] */
1175 #define SVM_VCPU_CPUID_L1_EDX_RESERVED 0xe8740400
1177 static void svm_vmexit_do_cpuid(struct vmcb_struct *vmcb,
1178 struct cpu_user_regs *regs)
1180 unsigned long input = regs->eax;
1181 unsigned int eax, ebx, ecx, edx;
1182 struct vcpu *v = current;
1183 int inst_len;
1185 ASSERT(vmcb);
1187 hvm_cpuid(input, &eax, &ebx, &ecx, &edx);
1189 if ( input == 0x00000001 )
1191 /* Clear out reserved bits. */
1192 ecx &= ~SVM_VCPU_CPUID_L1_ECX_RESERVED;
1193 edx &= ~SVM_VCPU_CPUID_L1_EDX_RESERVED;
1195 /* Guest should only see one logical processor.
1196 * See details on page 23 of AMD CPUID Specification.
1197 */
1198 clear_bit(X86_FEATURE_HT & 31, &edx); /* clear the hyperthread bit */
1199 ebx &= 0xFF00FFFF; /* clear the logical processor count when HTT=0 */
1200 ebx |= 0x00010000; /* set to 1 just for precaution */
1202 else if ( input == 0x80000001 )
1204 if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
1205 clear_bit(X86_FEATURE_APIC & 31, &edx);
1207 #if CONFIG_PAGING_LEVELS >= 3
1208 if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
1209 #endif
1210 clear_bit(X86_FEATURE_PAE & 31, &edx);
1212 clear_bit(X86_FEATURE_PSE36 & 31, &edx);
1214 /* Clear the Cmp_Legacy bit
1215 * This bit is supposed to be zero when HTT = 0.
1216 * See details on page 23 of AMD CPUID Specification.
1217 */
1218 clear_bit(X86_FEATURE_CMP_LEGACY & 31, &ecx);
1220 /* Make SVM feature invisible to the guest. */
1221 clear_bit(X86_FEATURE_SVME & 31, &ecx);
1223 /* So far, we do not support 3DNow for the guest. */
1224 clear_bit(X86_FEATURE_3DNOW & 31, &edx);
1225 clear_bit(X86_FEATURE_3DNOWEXT & 31, &edx);
1226 /* no FFXSR instructions feature. */
1227 clear_bit(X86_FEATURE_FFXSR & 31, &edx);
1229 else if ( input == 0x80000007 || input == 0x8000000A )
1231 /* Mask out features of power management and SVM extension. */
1232 eax = ebx = ecx = edx = 0;
1234 else if ( input == 0x80000008 )
1236 /* Make sure Number of CPU core is 1 when HTT=0 */
1237 ecx &= 0xFFFFFF00;
1240 regs->eax = (unsigned long)eax;
1241 regs->ebx = (unsigned long)ebx;
1242 regs->ecx = (unsigned long)ecx;
1243 regs->edx = (unsigned long)edx;
1245 HVMTRACE_3D(CPUID, v, input,
1246 ((uint64_t)eax << 32) | ebx, ((uint64_t)ecx << 32) | edx);
1248 inst_len = __get_instruction_length(v, INSTR_CPUID, NULL);
1249 ASSERT(inst_len > 0);
1250 __update_guest_eip(vmcb, inst_len);
1253 static inline unsigned long *get_reg_p(
1254 unsigned int gpreg,
1255 struct cpu_user_regs *regs, struct vmcb_struct *vmcb)
1257 unsigned long *reg_p = NULL;
1258 switch (gpreg)
1260 case SVM_REG_EAX:
1261 reg_p = (unsigned long *)&regs->eax;
1262 break;
1263 case SVM_REG_EBX:
1264 reg_p = (unsigned long *)&regs->ebx;
1265 break;
1266 case SVM_REG_ECX:
1267 reg_p = (unsigned long *)&regs->ecx;
1268 break;
1269 case SVM_REG_EDX:
1270 reg_p = (unsigned long *)&regs->edx;
1271 break;
1272 case SVM_REG_EDI:
1273 reg_p = (unsigned long *)&regs->edi;
1274 break;
1275 case SVM_REG_ESI:
1276 reg_p = (unsigned long *)&regs->esi;
1277 break;
1278 case SVM_REG_EBP:
1279 reg_p = (unsigned long *)&regs->ebp;
1280 break;
1281 case SVM_REG_ESP:
1282 reg_p = (unsigned long *)&vmcb->rsp;
1283 break;
1284 #ifdef __x86_64__
1285 case SVM_REG_R8:
1286 reg_p = (unsigned long *)&regs->r8;
1287 break;
1288 case SVM_REG_R9:
1289 reg_p = (unsigned long *)&regs->r9;
1290 break;
1291 case SVM_REG_R10:
1292 reg_p = (unsigned long *)&regs->r10;
1293 break;
1294 case SVM_REG_R11:
1295 reg_p = (unsigned long *)&regs->r11;
1296 break;
1297 case SVM_REG_R12:
1298 reg_p = (unsigned long *)&regs->r12;
1299 break;
1300 case SVM_REG_R13:
1301 reg_p = (unsigned long *)&regs->r13;
1302 break;
1303 case SVM_REG_R14:
1304 reg_p = (unsigned long *)&regs->r14;
1305 break;
1306 case SVM_REG_R15:
1307 reg_p = (unsigned long *)&regs->r15;
1308 break;
1309 #endif
1310 default:
1311 BUG();
1314 return reg_p;
1318 static inline unsigned long get_reg(unsigned int gpreg,
1319 struct cpu_user_regs *regs, struct vmcb_struct *vmcb)
1321 unsigned long *gp;
1322 gp = get_reg_p(gpreg, regs, vmcb);
1323 return *gp;
1327 static inline void set_reg(unsigned int gpreg, unsigned long value,
1328 struct cpu_user_regs *regs, struct vmcb_struct *vmcb)
1330 unsigned long *gp;
1331 gp = get_reg_p(gpreg, regs, vmcb);
1332 *gp = value;
1336 static void svm_dr_access(struct vcpu *v, struct cpu_user_regs *regs)
1338 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1340 HVMTRACE_0D(DR_WRITE, v);
1342 v->arch.hvm_vcpu.flag_dr_dirty = 1;
1344 __restore_debug_registers(v);
1346 /* allow the guest full access to the debug registers */
1347 vmcb->dr_intercepts = 0;
1351 static void svm_get_prefix_info(struct vcpu *v, unsigned int dir,
1352 svm_segment_register_t **seg,
1353 unsigned int *asize)
1355 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1356 unsigned char inst[MAX_INST_LEN];
1357 int i;
1359 memset(inst, 0, MAX_INST_LEN);
1360 if (inst_copy_from_guest(inst, svm_rip2pointer(v), sizeof(inst))
1361 != MAX_INST_LEN)
1363 gdprintk(XENLOG_ERR, "get guest instruction failed\n");
1364 domain_crash(current->domain);
1365 return;
1368 for (i = 0; i < MAX_INST_LEN; i++)
1370 switch (inst[i])
1372 case 0xf3: /* REPZ */
1373 case 0xf2: /* REPNZ */
1374 case 0xf0: /* LOCK */
1375 case 0x66: /* data32 */
1376 #ifdef __x86_64__
1377 /* REX prefixes */
1378 case 0x40:
1379 case 0x41:
1380 case 0x42:
1381 case 0x43:
1382 case 0x44:
1383 case 0x45:
1384 case 0x46:
1385 case 0x47:
1387 case 0x48:
1388 case 0x49:
1389 case 0x4a:
1390 case 0x4b:
1391 case 0x4c:
1392 case 0x4d:
1393 case 0x4e:
1394 case 0x4f:
1395 #endif
1396 continue;
1397 case 0x67: /* addr32 */
1398 *asize ^= 48; /* Switch 16/32 bits */
1399 continue;
1400 case 0x2e: /* CS */
1401 *seg = &vmcb->cs;
1402 continue;
1403 case 0x36: /* SS */
1404 *seg = &vmcb->ss;
1405 continue;
1406 case 0x26: /* ES */
1407 *seg = &vmcb->es;
1408 continue;
1409 case 0x64: /* FS */
1410 *seg = &vmcb->fs;
1411 continue;
1412 case 0x65: /* GS */
1413 *seg = &vmcb->gs;
1414 continue;
1415 case 0x3e: /* DS */
1416 *seg = &vmcb->ds;
1417 continue;
1418 default:
1419 break;
1421 return;
1426 /* Get the address of INS/OUTS instruction */
1427 static inline int svm_get_io_address(
1428 struct vcpu *v, struct cpu_user_regs *regs,
1429 unsigned int size, ioio_info_t info,
1430 unsigned long *count, unsigned long *addr)
1432 unsigned long reg;
1433 unsigned int asize, isize;
1434 int long_mode = 0;
1435 svm_segment_register_t *seg = NULL;
1436 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1438 #ifdef __x86_64__
1439 /* If we're in long mode, we shouldn't check the segment presence & limit */
1440 long_mode = vmcb->cs.attr.fields.l && svm_long_mode_enabled(v);
1441 #endif
1443 /* d field of cs.attr is 1 for 32-bit, 0 for 16 or 64 bit.
1444 * l field combined with EFER_LMA says whether it's 16 or 64 bit.
1445 */
1446 asize = (long_mode)?64:((vmcb->cs.attr.fields.db)?32:16);
1449 /* The ins/outs instructions are single byte, so if we have got more
1450 * than one byte (+ maybe rep-prefix), we have some prefix so we need
1451 * to figure out what it is...
1452 */
1453 isize = vmcb->exitinfo2 - vmcb->rip;
1455 if (info.fields.rep)
1456 isize --;
1458 if (isize > 1)
1459 svm_get_prefix_info(v, info.fields.type, &seg, &asize);
1461 if (info.fields.type == IOREQ_WRITE)
1463 reg = regs->esi;
1464 if (!seg) /* If no prefix, use DS. */
1465 seg = &vmcb->ds;
1466 if (!long_mode && (seg->attr.fields.type & 0xa) == 0x8) {
1467 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1468 return 0;
1471 else
1473 reg = regs->edi;
1474 seg = &vmcb->es; /* Note: This is ALWAYS ES. */
1475 if (!long_mode && (seg->attr.fields.type & 0xa) != 0x2) {
1476 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1477 return 0;
1481 /* If the segment isn't present, give GP fault! */
1482 if (!long_mode && !seg->attr.fields.p)
1484 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1485 return 0;
1488 if (asize == 16)
1490 *addr = (reg & 0xFFFF);
1491 *count = regs->ecx & 0xffff;
1493 else
1495 *addr = reg;
1496 *count = regs->ecx;
1498 if (!info.fields.rep)
1499 *count = 1;
1501 if (!long_mode)
1503 ASSERT(*addr == (u32)*addr);
1504 if ((u32)(*addr + size - 1) < (u32)*addr ||
1505 (seg->attr.fields.type & 0xc) != 0x4 ?
1506 *addr + size - 1 > seg->limit :
1507 *addr <= seg->limit)
1509 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1510 return 0;
1513 /* Check the limit for repeated instructions, as above we checked only
1514 the first instance. Truncate the count if a limit violation would
1515 occur. Note that the checking is not necessary for page granular
1516 segments as transfers crossing page boundaries will be broken up
1517 anyway. */
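/* Illustrative example: for an expand-up, byte-granular segment with
 * limit 0xffff, *addr 0xff00, size 4 and *count 0x100, the transfer
 * would run past the limit, so *count is truncated below to
 * (0xffff + 1 - 0xff00) / 4 = 0x40. */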
1518 if (!seg->attr.fields.g && *count > 1)
1520 if ((seg->attr.fields.type & 0xc) != 0x4)
1522 /* expand-up */
1523 if (!(regs->eflags & EF_DF))
1525 if (*addr + *count * size - 1 < *addr ||
1526 *addr + *count * size - 1 > seg->limit)
1527 *count = (seg->limit + 1UL - *addr) / size;
1529 else
1531 if (*count - 1 > *addr / size)
1532 *count = *addr / size + 1;
1535 else
1537 /* expand-down */
1538 if (!(regs->eflags & EF_DF))
1540 if (*count - 1 > -(s32)*addr / size)
1541 *count = -(s32)*addr / size + 1UL;
1543 else
1545 if (*addr < (*count - 1) * size ||
1546 *addr - (*count - 1) * size <= seg->limit)
1547 *count = (*addr - seg->limit - 1) / size + 1;
1550 ASSERT(*count);
1553 *addr += seg->base;
1555 #ifdef __x86_64__
1556 else
1558 if (seg == &vmcb->fs || seg == &vmcb->gs)
1559 *addr += seg->base;
1561 if (!is_canonical_address(*addr) ||
1562 !is_canonical_address(*addr + size - 1))
1564 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1565 return 0;
1567 if (*count > (1UL << 48) / size)
1568 *count = (1UL << 48) / size;
1569 if (!(regs->eflags & EF_DF))
1571 if (*addr + *count * size - 1 < *addr ||
1572 !is_canonical_address(*addr + *count * size - 1))
1573 *count = (*addr & ~((1UL << 48) - 1)) / size;
1575 else
1577 if ((*count - 1) * size > *addr ||
1578 !is_canonical_address(*addr + (*count - 1) * size))
1579 *count = (*addr & ~((1UL << 48) - 1)) / size + 1;
1581 ASSERT(*count);
1583 #endif
1585 return 1;
1589 static void svm_io_instruction(struct vcpu *v)
1591 struct cpu_user_regs *regs;
1592 struct hvm_io_op *pio_opp;
1593 unsigned int port;
1594 unsigned int size, dir, df;
1595 ioio_info_t info;
1596 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1598 ASSERT(vmcb);
1599 pio_opp = &current->arch.hvm_vcpu.io_op;
1600 pio_opp->instr = INSTR_PIO;
1601 pio_opp->flags = 0;
1603 regs = &pio_opp->io_context;
1605 /* Copy current guest state into io instruction state structure. */
1606 memcpy(regs, guest_cpu_user_regs(), HVM_CONTEXT_STACK_BYTES);
1607 hvm_store_cpu_guest_regs(v, regs, NULL);
1609 info.bytes = vmcb->exitinfo1;
1611 port = info.fields.port; /* port used to be addr */
1612 dir = info.fields.type; /* direction */
1613 df = regs->eflags & X86_EFLAGS_DF ? 1 : 0;
1615 if (info.fields.sz32)
1616 size = 4;
1617 else if (info.fields.sz16)
1618 size = 2;
1619 else
1620 size = 1;
1622 if (dir==IOREQ_READ)
1623 HVMTRACE_2D(IO_READ, v, port, size);
1624 else
1625 HVMTRACE_2D(IO_WRITE, v, port, size);
1627 HVM_DBG_LOG(DBG_LEVEL_IO,
1628 "svm_io_instruction: port 0x%x eip=%x:%"PRIx64", "
1629 "exit_qualification = %"PRIx64,
1630 port, vmcb->cs.sel, vmcb->rip, info.bytes);
1632 /* string instruction */
1633 if (info.fields.str)
1635 unsigned long addr, count;
1636 paddr_t paddr;
1637 unsigned long gfn;
1638 int sign = regs->eflags & X86_EFLAGS_DF ? -1 : 1;
1640 if (!svm_get_io_address(v, regs, size, info, &count, &addr))
1642 /* We failed to get a valid address, so don't do the IO operation -
1643 * it would just get worse if we do! Hopefully the guest is handling
1644 * gp-faults...
1645 */
1646 return;
1649 /* "rep" prefix */
1650 if (info.fields.rep)
1652 pio_opp->flags |= REPZ;
1655 /* Translate the address to a physical address */
1656 gfn = paging_gva_to_gfn(v, addr);
1657 if ( gfn == INVALID_GFN )
1659 /* The guest does not have the RAM address mapped.
1660 * Need to inject a page fault. */
1661 int errcode = 0;
1662 /* IO read --> memory write */
1663 if ( dir == IOREQ_READ ) errcode |= PFEC_write_access;
1664 svm_hvm_inject_exception(TRAP_page_fault, errcode, addr);
1665 return;
1667 paddr = (paddr_t)gfn << PAGE_SHIFT | (addr & ~PAGE_MASK);
1669 /*
1670 * Handle string pio instructions that cross pages or that
1671 * are unaligned. See the comments in hvm_platform.c/handle_mmio()
1672 */
1673 if ((addr & PAGE_MASK) != ((addr + size - 1) & PAGE_MASK))
1675 unsigned long value = 0;
1677 pio_opp->flags |= OVERLAP;
1678 pio_opp->addr = addr;
1680 if (dir == IOREQ_WRITE) /* OUTS */
1682 if ( hvm_paging_enabled(current) )
1684 int rv = hvm_copy_from_guest_virt(&value, addr, size);
1685 if ( rv != 0 )
1687 /* Failed on the page-spanning copy. Inject PF into
1688 * the guest for the address where we failed. */
1689 addr += size - rv;
1690 gdprintk(XENLOG_DEBUG, "Pagefault reading non-io side "
1691 "of a page-spanning PIO: va=%#lx\n", addr);
1692 svm_hvm_inject_exception(TRAP_page_fault, 0, addr);
1693 return;
1696 else
1697 (void) hvm_copy_from_guest_phys(&value, addr, size);
1698 } else /* dir != IOREQ_WRITE */
1699 /* Remember where to write the result, as a *VA*.
1700 * Must be a VA so we can handle the page overlap
1701 * correctly in hvm_pio_assist() */
1702 pio_opp->addr = addr;
1704 if (count == 1)
1705 regs->eip = vmcb->exitinfo2;
1707 send_pio_req(port, 1, size, value, dir, df, 0);
1709 else
1711 unsigned long last_addr = sign > 0 ? addr + count * size - 1
1712 : addr - (count - 1) * size;
1714 if ((addr & PAGE_MASK) != (last_addr & PAGE_MASK))
1716 if (sign > 0)
1717 count = (PAGE_SIZE - (addr & ~PAGE_MASK)) / size;
1718 else
1719 count = (addr & ~PAGE_MASK) / size + 1;
1721 else
1722 regs->eip = vmcb->exitinfo2;
1724 send_pio_req(port, count, size, paddr, dir, df, 1);
1727 else
1729 /*
1730 * On SVM, the RIP of the instruction following the IN/OUT is saved in
1731 * ExitInfo2
1732 */
1733 regs->eip = vmcb->exitinfo2;
1735 if (port == 0xe9 && dir == IOREQ_WRITE && size == 1)
1736 hvm_print_line(v, regs->eax); /* guest debug output */
1738 send_pio_req(port, 1, size, regs->eax, dir, df, 0);
1742 static int npt_set_cr0(unsigned long value)
1744 struct vcpu *v = current;
1745 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1747 ASSERT(vmcb);
1749 /* ET is reserved and should always be 1. */
1750 value |= X86_CR0_ET;
1752 /* Check whether the guest is about to turn on long mode.
1753 * If it is, set EFER.LME and EFER.LMA. Update the shadow EFER.LMA
1754 * bit too, so svm_long_mode_enabled() will work.
1755 */
1756 if ( (value & X86_CR0_PG) && svm_lme_is_set(v) &&
1757 (vmcb->cr4 & X86_CR4_PAE) && (vmcb->cr0 & X86_CR0_PE) )
1759 v->arch.hvm_svm.cpu_shadow_efer |= EFER_LMA;
1760 vmcb->efer |= EFER_LMA | EFER_LME;
1763 /* Whenever CR0.PG is cleared under long mode, LMA will be cleared
1764 * immediately. We emulate this process for svm_long_mode_enabled().
1765 */
1766 if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
1768 if ( svm_long_mode_enabled(v) )
1770 v->arch.hvm_svm.cpu_shadow_efer &= ~EFER_LMA;
1774 vmcb->cr0 = value | X86_CR0_WP;
1775 v->arch.hvm_svm.cpu_shadow_cr0 = value;
1777 /* TS cleared? Then initialise FPU now. */
1778 if ( !(value & X86_CR0_TS) ) {
1779 setup_fpu(v);
1780 vmcb->exception_intercepts &= ~EXCEPTION_BITMAP_NM;
1783 paging_update_paging_modes(v);
1785 return 1;
1788 static int svm_set_cr0(unsigned long value)
1790 struct vcpu *v = current;
1791 unsigned long mfn;
1792 int paging_enabled;
1793 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1794 unsigned long old_base_mfn;
1796 ASSERT(vmcb);
1798 /* We don't want to lose PG. ET is reserved and should always be 1. */
1799 paging_enabled = svm_paging_enabled(v);
1800 value |= X86_CR0_ET;
1801 vmcb->cr0 = value | X86_CR0_PG | X86_CR0_WP;
1802 v->arch.hvm_svm.cpu_shadow_cr0 = value;
1804 /* TS cleared? Then initialise FPU now. */
1805 if ( !(value & X86_CR0_TS) )
1807 setup_fpu(v);
1808 vmcb->exception_intercepts &= ~EXCEPTION_BITMAP_NM;
1811 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx\n", value);
1813 if ( ((value & (X86_CR0_PE | X86_CR0_PG)) == (X86_CR0_PE | X86_CR0_PG))
1814 && !paging_enabled )
1816 /* The guest CR3 must be pointing to the guest physical. */
1817 mfn = get_mfn_from_gpfn(v->arch.hvm_svm.cpu_cr3 >> PAGE_SHIFT);
1818 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain))
1820 gdprintk(XENLOG_ERR, "Invalid CR3 value = %lx (mfn=%lx)\n",
1821 v->arch.hvm_svm.cpu_cr3, mfn);
1822 domain_crash(v->domain);
1823 return 0;
1826 #if defined(__x86_64__)
1827 if ( svm_lme_is_set(v) && !svm_cr4_pae_is_set(v) )
1829 HVM_DBG_LOG(DBG_LEVEL_1, "Enable paging before PAE enable\n");
1830 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1833 if ( svm_lme_is_set(v) )
1835 HVM_DBG_LOG(DBG_LEVEL_1, "Enable the Long mode\n");
1836 v->arch.hvm_svm.cpu_shadow_efer |= EFER_LMA;
1837 vmcb->efer |= EFER_LMA | EFER_LME;
1839 #endif /* __x86_64__ */
1841 /* Now arch.guest_table points to machine physical. */
1842 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1843 v->arch.guest_table = pagetable_from_pfn(mfn);
1844 if ( old_base_mfn )
1845 put_page(mfn_to_page(old_base_mfn));
1846 paging_update_paging_modes(v);
1848 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
1849 (unsigned long) (mfn << PAGE_SHIFT));
1852 if ( !((value & X86_CR0_PE) && (value & X86_CR0_PG)) && paging_enabled )
1853 if ( v->arch.hvm_svm.cpu_cr3 ) {
1854 put_page(mfn_to_page(get_mfn_from_gpfn(
1855 v->arch.hvm_svm.cpu_cr3 >> PAGE_SHIFT)));
1856 v->arch.guest_table = pagetable_null();
1859 /*
1860 * SVM implements paged real-mode and when we return to real-mode
1861 * we revert back to the physical mappings that the domain builder
1862 * created.
1863 */
1864 if ((value & X86_CR0_PE) == 0) {
1865 if (value & X86_CR0_PG) {
1866 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1867 return 0;
1869 paging_update_paging_modes(v);
1871 else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
1873 if ( svm_long_mode_enabled(v) )
1875 vmcb->efer &= ~(EFER_LME | EFER_LMA);
1876 v->arch.hvm_svm.cpu_shadow_efer &= ~EFER_LMA;
1878 /* we should take care of this kind of situation */
1879 paging_update_paging_modes(v);
1882 return 1;
1885 //
1886 // nested paging functions
1887 //
1889 static int npt_mov_to_cr(int gpreg, int cr, struct cpu_user_regs *regs)
1891 unsigned long value;
1892 struct vcpu *v = current;
1893 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1894 struct vlapic *vlapic = vcpu_vlapic(v);
1896 ASSERT(vmcb);
1898 value = get_reg(gpreg, regs, vmcb);
1900 switch (cr) {
1901 case 0:
1902 return npt_set_cr0(value);
1904 case 3:
1905 vmcb->cr3 = value;
1906 v->arch.hvm_svm.cpu_cr3 = value;
1907 break;
1909 case 4: /* CR4 */
1910 vmcb->cr4 = value;
1911 v->arch.hvm_svm.cpu_shadow_cr4 = value;
1912 paging_update_paging_modes(v);
1913 break;
1915 case 8:
1916 vlapic_set_reg(vlapic, APIC_TASKPRI, ((value & 0x0F) << 4));
1917 vmcb->vintr.fields.tpr = value & 0x0F;
1918 break;
1920 default:
1921 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
1922 domain_crash(v->domain);
1923 return 0;
1926 return 1;
1929 static void npt_mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
1931 unsigned long value = 0;
1932 struct vcpu *v = current;
1933 struct vmcb_struct *vmcb;
1934 struct vlapic *vlapic = vcpu_vlapic(v);
1936 vmcb = v->arch.hvm_svm.vmcb;
1937 ASSERT(vmcb);
1939 switch(cr) {
1940 case 0:
1941 value = (unsigned long) v->arch.hvm_svm.cpu_shadow_cr0;
1942 break;
1943 case 2:
1944 value = vmcb->cr2;
1945 break;
1946 case 3:
1947 value = (unsigned long) v->arch.hvm_svm.cpu_cr3;
1948 break;
1949 case 4:
1950 value = (unsigned long) v->arch.hvm_svm.cpu_shadow_cr4;
1951 break;
1952 case 8:
1953 value = (unsigned long)vlapic_get_reg(vlapic, APIC_TASKPRI);
1954 value = (value & 0xF0) >> 4;
1955 break;
1956 default:
1957 domain_crash(v->domain);
1958 return;
1961 set_reg(gp, value, regs, vmcb);
1964 /*
1965 * Read from control registers. CR0 and CR4 are read from the shadow.
1966 */
1967 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
1969 unsigned long value = 0;
1970 struct vcpu *v = current;
1971 struct vlapic *vlapic = vcpu_vlapic(v);
1972 struct vmcb_struct *vmcb;
1974 vmcb = v->arch.hvm_svm.vmcb;
1975 ASSERT(vmcb);
1977 switch ( cr )
1979 case 0:
1980 value = v->arch.hvm_svm.cpu_shadow_cr0;
1981 if (svm_dbg_on)
1982 printk("CR0 read =%lx \n", value );
1983 break;
1984 case 2:
1985 value = vmcb->cr2;
1986 break;
1987 case 3:
1988 value = (unsigned long) v->arch.hvm_svm.cpu_cr3;
1989 if (svm_dbg_on)
1990 printk("CR3 read =%lx \n", value );
1991 break;
1992 case 4:
1993 value = (unsigned long) v->arch.hvm_svm.cpu_shadow_cr4;
1994 if (svm_dbg_on)
1995 printk("CR4 read=%lx\n", value);
1996 break;
1997 case 8:
1998 value = (unsigned long)vlapic_get_reg(vlapic, APIC_TASKPRI);
1999 value = (value & 0xF0) >> 4;
2000 break;
2002 default:
2003 domain_crash(v->domain);
2004 return;
2007 HVMTRACE_2D(CR_READ, v, cr, value);
2009 set_reg(gp, value, regs, vmcb);
2011 HVM_DBG_LOG(DBG_LEVEL_VMMU, "mov_from_cr: CR%d, value = %lx,", cr, value);
2015 /*
2016 * Write to control registers
2017 */
2018 static int mov_to_cr(int gpreg, int cr, struct cpu_user_regs *regs)
2020 unsigned long value, old_cr, old_base_mfn, mfn;
2021 struct vcpu *v = current;
2022 struct vlapic *vlapic = vcpu_vlapic(v);
2023 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
2025 value = get_reg(gpreg, regs, vmcb);
2027 HVMTRACE_2D(CR_WRITE, v, cr, value);
2029 HVM_DBG_LOG(DBG_LEVEL_1, "mov_to_cr: CR%d, value = %lx,", cr, value);
2030 HVM_DBG_LOG(DBG_LEVEL_1, "current = %lx,", (unsigned long) current);
2032 switch (cr)
2034 case 0:
2035 if (svm_dbg_on)
2036 printk("CR0 write =%lx \n", value );
2037 return svm_set_cr0(value);
2039 case 3:
2040 if (svm_dbg_on)
2041 printk("CR3 write =%lx \n", value );
2042 /* If paging is not enabled yet, simply copy the value to CR3. */
2043 if (!svm_paging_enabled(v)) {
2044 v->arch.hvm_svm.cpu_cr3 = value;
2045 break;
2048 /* We make a new one if the shadow does not exist. */
2049 if (value == v->arch.hvm_svm.cpu_cr3)
2051 /*
2052 * This is a simple TLB flush, implying the guest has
2053 * removed some translation or changed page attributes.
2054 * We simply invalidate the shadow.
2055 */
2056 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
2057 if (mfn != pagetable_get_pfn(v->arch.guest_table))
2058 goto bad_cr3;
2059 paging_update_cr3(v);
2061 else
2063 /*
2064 * If different, make a shadow. Check if the PDBR is valid
2065 * first.
2066 */
2067 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
2068 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
2069 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain))
2070 goto bad_cr3;
2072 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
2073 v->arch.guest_table = pagetable_from_pfn(mfn);
2075 if (old_base_mfn)
2076 put_page(mfn_to_page(old_base_mfn));
2078 v->arch.hvm_svm.cpu_cr3 = value;
2079 update_cr3(v);
2080 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
2082 break;
2084 case 4: /* CR4 */
2085 if (svm_dbg_on)
2086 printk( "write cr4=%lx, cr0=%lx\n",
2087 value, v->arch.hvm_svm.cpu_shadow_cr0 );
2088 old_cr = v->arch.hvm_svm.cpu_shadow_cr4;
2089 if ( value & X86_CR4_PAE && !(old_cr & X86_CR4_PAE) )
2091 if ( svm_pgbit_test(v) )
2093 /* The guest is a 32-bit PAE guest. */
2094 #if CONFIG_PAGING_LEVELS >= 3
2095 unsigned long mfn, old_base_mfn;
2096 mfn = get_mfn_from_gpfn(v->arch.hvm_svm.cpu_cr3 >> PAGE_SHIFT);
2097 if ( !mfn_valid(mfn) ||
2098 !get_page(mfn_to_page(mfn), v->domain) )
2099 goto bad_cr3;
2101 /*
2102 * Now arch.guest_table points to machine physical.
2103 */
2105 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
2106 v->arch.guest_table = pagetable_from_pfn(mfn);
2107 if ( old_base_mfn )
2108 put_page(mfn_to_page(old_base_mfn));
2109 paging_update_paging_modes(v);
2111 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
2112 (unsigned long) (mfn << PAGE_SHIFT));
2114 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2115 "Update CR3 value = %lx, mfn = %lx",
2116 v->arch.hvm_svm.cpu_cr3, mfn);
2117 #endif
2120 else if ( !(value & X86_CR4_PAE) )
2122 if ( svm_long_mode_enabled(v) )
2124 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
2128 v->arch.hvm_svm.cpu_shadow_cr4 = value;
2129 vmcb->cr4 = value | SVM_CR4_HOST_MASK;
2131 /*
2132 * Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates
2133 * all TLB entries except global entries.
2134 */
2135 if ((old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE))
2136 paging_update_paging_modes(v);
2137 break;
2139 case 8:
2140 vlapic_set_reg(vlapic, APIC_TASKPRI, ((value & 0x0F) << 4));
2141 vmcb->vintr.fields.tpr = value & 0x0F;
2142 break;
2144 default:
2145 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
2146 domain_crash(v->domain);
2147 return 0;
2150 return 1;
2152 bad_cr3:
2153 gdprintk(XENLOG_ERR, "Invalid CR3\n");
2154 domain_crash(v->domain);
2155 return 0;
2159 #define ARR_SIZE(x) (sizeof(x) / sizeof(x[0]))
2162 static int svm_cr_access(struct vcpu *v, unsigned int cr, unsigned int type,
2163 struct cpu_user_regs *regs)
2165 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
2166 int inst_len = 0;
2167 int index;
2168 unsigned int gpreg;
2169 unsigned long value;
2170 u8 buffer[MAX_INST_LEN];
2171 u8 prefix = 0;
2172 int result = 1;
2173 enum instruction_index list_a[] = {INSTR_MOV2CR, INSTR_CLTS, INSTR_LMSW};
2174 enum instruction_index list_b[] = {INSTR_MOVCR2, INSTR_SMSW};
2175 enum instruction_index match;
2177 ASSERT(vmcb);
2179 inst_copy_from_guest(buffer, svm_rip2pointer(v), sizeof(buffer));
2181 /* Get the index of the first actual instruction byte; we need to know
2182 where any prefix bytes live later on. */
2183 index = skip_prefix_bytes(buffer, sizeof(buffer));
2185 if ( type == TYPE_MOV_TO_CR )
2187 inst_len = __get_instruction_length_from_list(
2188 v, list_a, ARR_SIZE(list_a), &buffer[index], &match);
2190 else /* type == TYPE_MOV_FROM_CR */
2192 inst_len = __get_instruction_length_from_list(
2193 v, list_b, ARR_SIZE(list_b), &buffer[index], &match);
2196 ASSERT(inst_len > 0);
2198 inst_len += index;
2200 /* Check for a REX prefix (0x40-0x4F): if present, it is always the last of the prefix bytes. */
2201 if (index > 0 && (buffer[index-1] & 0xF0) == 0x40)
2202 prefix = buffer[index-1];
2204 HVM_DBG_LOG(DBG_LEVEL_1, "eip = %lx", (unsigned long) vmcb->rip);
2206 switch (match)
2208 case INSTR_MOV2CR:
2209 gpreg = decode_src_reg(prefix, buffer[index+2]);
2210 if ( paging_mode_hap(v->domain) )
2211 result = npt_mov_to_cr(gpreg, cr, regs);
2212 else
2213 result = mov_to_cr(gpreg, cr, regs);
2214 break;
2216 case INSTR_MOVCR2:
2217 gpreg = decode_src_reg(prefix, buffer[index+2]);
2218 if ( paging_mode_hap(v->domain) )
2219 npt_mov_from_cr(cr, gpreg, regs);
2220 else
2221 mov_from_cr(cr, gpreg, regs);
2222 break;
2224 case INSTR_CLTS:
2225 /* TS being cleared means that it's time to restore fpu state. */
2226 setup_fpu(current);
2227 vmcb->exception_intercepts &= ~EXCEPTION_BITMAP_NM;
2228 vmcb->cr0 &= ~X86_CR0_TS; /* clear TS */
2229 v->arch.hvm_svm.cpu_shadow_cr0 &= ~X86_CR0_TS; /* clear TS */
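/* With the FPU state restored, the #NM intercept can be dropped; it is
 * re-armed when CR0.TS is set again (lazy FPU switching). */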
2230 break;
2232 case INSTR_LMSW:
2233 if (svm_dbg_on)
2234 svm_dump_inst(svm_rip2pointer(v));
2236 gpreg = decode_src_reg(prefix, buffer[index+2]);
2237 value = get_reg(gpreg, regs, vmcb) & 0xF;
2239 if (svm_dbg_on)
2240 printk("CR0-LMSW value=%lx, reg=%d, inst_len=%d\n", value, gpreg,
2241 inst_len);
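/* LMSW only loads CR0 bits 3:0 (PE, MP, EM, TS); the remaining CR0 bits are
 * taken unchanged from the current shadow value, e.g. 'lmsw %ax' with
 * ax == 1 merely sets PE. */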
2243 value = (v->arch.hvm_svm.cpu_shadow_cr0 & ~0xF) | value;
2245 if (svm_dbg_on)
2246 printk("CR0-LMSW CR0 - New value=%lx\n", value);
2248 if ( paging_mode_hap(v->domain) )
2249 result = npt_set_cr0(value);
2250 else
2251 result = svm_set_cr0(value);
2252 break;
2254 case INSTR_SMSW:
2255 if (svm_dbg_on)
2256 svm_dump_inst(svm_rip2pointer(v));
2257 value = v->arch.hvm_svm.cpu_shadow_cr0;
2258 gpreg = decode_src_reg(prefix, buffer[index+2]);
2259 set_reg(gpreg, value, regs, vmcb);
2261 if (svm_dbg_on)
2262 printk("CR0-SMSW value=%lx, reg=%d, inst_len=%d\n", value, gpreg,
2263 inst_len);
2264 break;
2266 default:
2267 BUG();
2270 ASSERT(inst_len);
2272 __update_guest_eip(vmcb, inst_len);
2274 return result;
2277 static inline void svm_do_msr_access(
2278 struct vcpu *v, struct cpu_user_regs *regs)
2280 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
2281 int inst_len;
2282 u64 msr_content=0;
2283 u32 ecx = regs->ecx, eax, edx;
2285 ASSERT(vmcb);
2287 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x, eax=%x, edx=%x, exitinfo = %lx",
2288 ecx, (u32)regs->eax, (u32)regs->edx,
2289 (unsigned long)vmcb->exitinfo1);
2291 /* EXITINFO1 is 0 for RDMSR (read) and 1 for WRMSR (write). */
2292 if (vmcb->exitinfo1 == 0)
2294 switch (ecx) {
2295 case MSR_IA32_TIME_STAMP_COUNTER:
2296 msr_content = hvm_get_guest_time(v);
2297 break;
2298 case MSR_IA32_SYSENTER_CS:
2299 msr_content = vmcb->sysenter_cs;
2300 break;
2301 case MSR_IA32_SYSENTER_ESP:
2302 msr_content = vmcb->sysenter_esp;
2303 break;
2304 case MSR_IA32_SYSENTER_EIP:
2305 msr_content = vmcb->sysenter_eip;
2306 break;
2307 case MSR_IA32_APICBASE:
2308 msr_content = vcpu_vlapic(v)->hw.apic_base_msr;
2309 break;
2310 default:
2311 if (long_mode_do_msr_read(regs))
2312 goto done;
2314 if ( rdmsr_hypervisor_regs(ecx, &eax, &edx) ||
2315 rdmsr_safe(ecx, eax, edx) == 0 )
2317 regs->eax = eax;
2318 regs->edx = edx;
2319 goto done;
2321 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
2322 return;
2324 regs->eax = msr_content & 0xFFFFFFFF;
2325 regs->edx = msr_content >> 32;
2327 done:
2328 HVMTRACE_2D(MSR_READ, v, ecx, msr_content);
2329 HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx",
2330 ecx, (unsigned long)regs->eax, (unsigned long)regs->edx);
2332 inst_len = __get_instruction_length(v, INSTR_RDMSR, NULL);
2334 else
2336 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
2338 HVMTRACE_2D(MSR_WRITE, v, ecx, msr_content);
2340 switch (ecx)
2342 case MSR_IA32_TIME_STAMP_COUNTER:
2343 hvm_set_guest_time(v, msr_content);
2344 pt_reset(v);
2345 break;
2346 case MSR_IA32_SYSENTER_CS:
2347 vmcb->sysenter_cs = msr_content;
2348 break;
2349 case MSR_IA32_SYSENTER_ESP:
2350 vmcb->sysenter_esp = msr_content;
2351 break;
2352 case MSR_IA32_SYSENTER_EIP:
2353 vmcb->sysenter_eip = msr_content;
2354 break;
2355 case MSR_IA32_APICBASE:
2356 vlapic_msr_set(vcpu_vlapic(v), msr_content);
2357 break;
2358 default:
2359 if ( !long_mode_do_msr_write(regs) )
2360 wrmsr_hypervisor_regs(ecx, regs->eax, regs->edx);
2361 break;
2364 inst_len = __get_instruction_length(v, INSTR_WRMSR, NULL);
2367 __update_guest_eip(vmcb, inst_len);
2371 static inline void svm_vmexit_do_hlt(struct vmcb_struct *vmcb)
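/* HLT is a one-byte opcode (0xF4), so advance the guest rIP by one. */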
2373 __update_guest_eip(vmcb, 1);
2375 /* Check for a pending interrupt that has not yet been handled, or a newly arrived one. */
2376 if ( (vmcb->rflags & X86_EFLAGS_IF) &&
2377 (vmcb->vintr.fields.irq || cpu_has_pending_irq(current)) ) {
2378 HVMTRACE_1D(HLT, current, /*int pending=*/ 1);
2379 return;
2382 HVMTRACE_1D(HLT, current, /*int pending=*/ 0);
2383 hvm_hlt(vmcb->rflags);
2387 static void svm_vmexit_do_invd(struct vcpu *v)
2389 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
2390 int inst_len;
2392 /* Invalidating the cache here cannot be done safely.  WBINVD would be an
2393 * option, but it is fine to ignore the request entirely: cache snooping
2394 * keeps the caches coherent anyway. -- Mats P.
2395 */
2397 /* Log that the instruction was ignored, in case someone runs an unusual
2398 * operating system and wants to know why it is not working.
2399 */
2400 printk("INVD instruction intercepted - ignored\n");
2402 inst_len = __get_instruction_length(v, INSTR_INVD, NULL);
2403 __update_guest_eip(vmcb, inst_len);
2409 #ifdef XEN_DEBUGGER
2410 static void svm_debug_save_cpu_user_regs(struct vmcb_struct *vmcb,
2411 struct cpu_user_regs *regs)
2413 regs->eip = vmcb->rip;
2414 regs->esp = vmcb->rsp;
2415 regs->eflags = vmcb->rflags;
2417 regs->xcs = vmcb->cs.sel;
2418 regs->xds = vmcb->ds.sel;
2419 regs->xes = vmcb->es.sel;
2420 regs->xfs = vmcb->fs.sel;
2421 regs->xgs = vmcb->gs.sel;
2422 regs->xss = vmcb->ss.sel;
2426 static void svm_debug_restore_cpu_user_regs(struct cpu_user_regs *regs)
2428 vmcb->ss.sel = regs->xss;
2429 vmcb->rsp = regs->esp;
2430 vmcb->rflags = regs->eflags;
2431 vmcb->cs.sel = regs->xcs;
2432 vmcb->rip = regs->eip;
2434 vmcb->gs.sel = regs->xgs;
2435 vmcb->fs.sel = regs->xfs;
2436 vmcb->es.sel = regs->xes;
2437 vmcb->ds.sel = regs->xds;
2439 #endif
2442 void svm_handle_invlpg(const short invlpga, struct cpu_user_regs *regs)
2444 struct vcpu *v = current;
2445 u8 opcode[MAX_INST_LEN], prefix, length = MAX_INST_LEN;
2446 unsigned long g_vaddr;
2447 int inst_len;
2448 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
2450 /*
2451 * It is not known in advance how many bytes the invlpg instruction
2452 * occupies, so copy the maximum instruction length.
2453 */
2454 if (inst_copy_from_guest(opcode, svm_rip2pointer(v), length) < length)
2456 gdprintk(XENLOG_ERR, "Error reading %d bytes of guest memory\n", length);
2457 domain_crash(v->domain);
2458 return;
2461 if (invlpga)
2463 inst_len = __get_instruction_length(v, INSTR_INVLPGA, opcode);
2464 ASSERT(inst_len > 0);
2465 __update_guest_eip(vmcb, inst_len);
2467 /*
2468 * The address is implicit in this instruction; at the moment we do not
2469 * use ecx (the ASID) to identify individual guest pages.
2470 */
2471 g_vaddr = regs->eax;
2473 else
2475 /* What about multiple prefix codes? */
2476 prefix = (is_prefix(opcode[0])?opcode[0]:0);
2477 inst_len = __get_instruction_length(v, INSTR_INVLPG, opcode);
2478 ASSERT(inst_len > 0);
2480 inst_len--;
2481 length -= inst_len;
2483 /*
2484 * Decode the memory operand of the instruction, including ModRM, SIB and
2485 * displacement, to get the effective address and length in bytes. Assume
2486 * the system is in either 32- or 64-bit mode.
2487 */
2488 g_vaddr = get_effective_addr_modrm64(regs, prefix, inst_len,
2489 &opcode[inst_len], &length);
2491 inst_len += length;
2492 __update_guest_eip (vmcb, inst_len);
2495 HVMTRACE_3D(INVLPG, v, (invlpga?1:0), g_vaddr, (invlpga?regs->ecx:0));
2497 paging_invlpg(v, g_vaddr);
2501 /*
2502 * Reset to real mode causes execution to start at 0xF000:0xFFF0 in
2503 * 16-bit real mode, mimicking a processor reset.
2505 * Returns 0 on success, non-zero otherwise.
2506 */
2507 static int svm_do_vmmcall_reset_to_realmode(struct vcpu *v,
2508 struct cpu_user_regs *regs)
2510 struct vmcb_struct *vmcb;
2512 ASSERT(v);
2513 ASSERT(regs);
2515 vmcb = v->arch.hvm_svm.vmcb;
2517 ASSERT(vmcb);
2519 /* Clear the user regs; the VMCB state is re-initialised field by field below. */
2520 memset(regs, 0, sizeof(struct cpu_user_regs));
2522 /* VMCB Control */
2523 vmcb->tsc_offset = 0;
2525 /* VMCB State */
2526 vmcb->cr0 = X86_CR0_ET | X86_CR0_PG | X86_CR0_WP;
2527 v->arch.hvm_svm.cpu_shadow_cr0 = X86_CR0_ET;
2529 vmcb->cr2 = 0;
2530 vmcb->efer = EFER_SVME;
2532 vmcb->cr4 = SVM_CR4_HOST_MASK;
2533 v->arch.hvm_svm.cpu_shadow_cr4 = 0;
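/* The guest initially sees only CR0.ET and a zero CR4; the VMCB copies keep
 * PG/WP and the SVM_CR4_HOST_MASK bits that the hypervisor's shadow
 * pagetables rely on. */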
2535 if ( paging_mode_hap(v->domain) ) {
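/* With nested paging the hardware runs on the guest's own CR0/CR4 values,
 * so the host-mandated bits above are not forced in. */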
2536 vmcb->cr0 = v->arch.hvm_svm.cpu_shadow_cr0;
2537 vmcb->cr4 = v->arch.hvm_svm.cpu_shadow_cr4;
2540 /* This will jump to ROMBIOS */
2541 vmcb->rip = 0xFFF0;
2543 /* Set up the segment registers and all their hidden state. */
2544 vmcb->cs.sel = 0xF000;
2545 vmcb->cs.attr.bytes = 0x089b;
2546 vmcb->cs.limit = 0xffff;
2547 vmcb->cs.base = 0x000F0000;
2549 vmcb->ss.sel = 0x00;
2550 vmcb->ss.attr.bytes = 0x0893;
2551 vmcb->ss.limit = 0xffff;
2552 vmcb->ss.base = 0x00;
2554 vmcb->ds.sel = 0x00;
2555 vmcb->ds.attr.bytes = 0x0893;
2556 vmcb->ds.limit = 0xffff;
2557 vmcb->ds.base = 0x00;
2559 vmcb->es.sel = 0x00;
2560 vmcb->es.attr.bytes = 0x0893;
2561 vmcb->es.limit = 0xffff;
2562 vmcb->es.base = 0x00;
2564 vmcb->fs.sel = 0x00;
2565 vmcb->fs.attr.bytes = 0x0893;
2566 vmcb->fs.limit = 0xffff;
2567 vmcb->fs.base = 0x00;
2569 vmcb->gs.sel = 0x00;
2570 vmcb->gs.attr.bytes = 0x0893;
2571 vmcb->gs.limit = 0xffff;
2572 vmcb->gs.base = 0x00;
2574 vmcb->ldtr.sel = 0x00;
2575 vmcb->ldtr.attr.bytes = 0x0000;
2576 vmcb->ldtr.limit = 0x0;
2577 vmcb->ldtr.base = 0x00;
2579 vmcb->gdtr.sel = 0x00;
2580 vmcb->gdtr.attr.bytes = 0x0000;
2581 vmcb->gdtr.limit = 0x0;
2582 vmcb->gdtr.base = 0x00;
2584 vmcb->tr.sel = 0;
2585 vmcb->tr.attr.bytes = 0;
2586 vmcb->tr.limit = 0x0;
2587 vmcb->tr.base = 0;
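/* The real-mode IVT is 256 four-byte vectors at physical address 0,
 * hence IDTR base 0 and limit 0x3ff. */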
2589 vmcb->idtr.sel = 0x00;
2590 vmcb->idtr.attr.bytes = 0x0000;
2591 vmcb->idtr.limit = 0x3ff;
2592 vmcb->idtr.base = 0x00;
2594 vmcb->rax = 0;
2595 vmcb->rsp = 0;
2597 return 0;
2601 void svm_dump_inst(unsigned long eip)
2603 u8 opcode[256];
2604 unsigned long ptr;
2605 int len;
2606 int i;
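/* Dump the 256-byte aligned region that contains eip. */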
2608 ptr = eip & ~0xff;
2609 len = 0;
2611 if (hvm_copy_from_guest_virt(opcode, ptr, sizeof(opcode)) == 0)
2612 len = sizeof(opcode);
2614 printk("Code bytes around(len=%d) %lx:", len, eip);
2615 for (i = 0; i < len; i++)
2617 if ((i & 0x0f) == 0)
2618 printk("\n%08lx:", ptr+i);
2620 printk("%02x ", opcode[i]);
2623 printk("\n");
2627 void svm_dump_regs(const char *from, struct cpu_user_regs *regs)
2629 struct vcpu *v = current;
2630 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
2631 unsigned long pt = v->arch.hvm_vcpu.hw_cr3;
2633 printk("%s: guest registers from %s:\n", __func__, from);
2634 #if defined (__x86_64__)
2635 printk("rax: %016lx rbx: %016lx rcx: %016lx\n",
2636 regs->rax, regs->rbx, regs->rcx);
2637 printk("rdx: %016lx rsi: %016lx rdi: %016lx\n",
2638 regs->rdx, regs->rsi, regs->rdi);
2639 printk("rbp: %016lx rsp: %016lx r8: %016lx\n",
2640 regs->rbp, regs->rsp, regs->r8);
2641 printk("r9: %016lx r10: %016lx r11: %016lx\n",
2642 regs->r9, regs->r10, regs->r11);
2643 printk("r12: %016lx r13: %016lx r14: %016lx\n",
2644 regs->r12, regs->r13, regs->r14);
2645 printk("r15: %016lx cr0: %016lx cr3: %016lx\n",
2646 regs->r15, v->arch.hvm_svm.cpu_shadow_cr0, vmcb->cr3);
2647 #else
2648 printk("eax: %08x, ebx: %08x, ecx: %08x, edx: %08x\n",
2649 regs->eax, regs->ebx, regs->ecx, regs->edx);
2650 printk("edi: %08x, esi: %08x, ebp: %08x, esp: %08x\n",
2651 regs->edi, regs->esi, regs->ebp, regs->esp);
2652 printk("%s: guest cr0: %lx\n", __func__,
2653 v->arch.hvm_svm.cpu_shadow_cr0);
2654 printk("guest CR3 = %llx\n", vmcb->cr3);
2655 #endif
2656 printk("%s: pt = %lx\n", __func__, pt);
2660 void svm_dump_host_regs(const char *from)
2662 struct vcpu *v = current;
2663 unsigned long pt = pagetable_get_paddr(v->arch.monitor_table);
2664 unsigned long cr3, cr0;
2665 printk("Host registers at %s\n", from);
2667 __asm__ __volatile__ ("\tmov %%cr0,%0\n"
2668 "\tmov %%cr3,%1\n"
2669 : "=r" (cr0), "=r"(cr3));
2670 printk("%s: pt = %lx, cr3 = %lx, cr0 = %lx\n", __func__, pt, cr3, cr0);
2673 #ifdef SVM_EXTRA_DEBUG
2674 static char *exit_reasons[] = {
2675 [VMEXIT_CR0_READ] = "CR0_READ",
2676 [VMEXIT_CR1_READ] = "CR1_READ",
2677 [VMEXIT_CR2_READ] = "CR2_READ",
2678 [VMEXIT_CR3_READ] = "CR3_READ",
2679 [VMEXIT_CR4_READ] = "CR4_READ",
2680 [VMEXIT_CR5_READ] = "CR5_READ",
2681 [VMEXIT_CR6_READ] = "CR6_READ",
2682 [VMEXIT_CR7_READ] = "CR7_READ",
2683 [VMEXIT_CR8_READ] = "CR8_READ",
2684 [VMEXIT_CR9_READ] = "CR9_READ",
2685 [VMEXIT_CR10_READ] = "CR10_READ",
2686 [VMEXIT_CR11_READ] = "CR11_READ",
2687 [VMEXIT_CR12_READ] = "CR12_READ",
2688 [VMEXIT_CR13_READ] = "CR13_READ",
2689 [VMEXIT_CR14_READ] = "CR14_READ",
2690 [VMEXIT_CR15_READ] = "CR15_READ",
2691 [VMEXIT_CR0_WRITE] = "CR0_WRITE",
2692 [VMEXIT_CR1_WRITE] = "CR1_WRITE",
2693 [VMEXIT_CR2_WRITE] = "CR2_WRITE",
2694 [VMEXIT_CR3_WRITE] = "CR3_WRITE",
2695 [VMEXIT_CR4_WRITE] = "CR4_WRITE",
2696 [VMEXIT_CR5_WRITE] = "CR5_WRITE",
2697 [VMEXIT_CR6_WRITE] = "CR6_WRITE",
2698 [VMEXIT_CR7_WRITE] = "CR7_WRITE",
2699 [VMEXIT_CR8_WRITE] = "CR8_WRITE",
2700 [VMEXIT_CR9_WRITE] = "CR9_WRITE",
2701 [VMEXIT_CR10_WRITE] = "CR10_WRITE",
2702 [VMEXIT_CR11_WRITE] = "CR11_WRITE",
2703 [VMEXIT_CR12_WRITE] = "CR12_WRITE",
2704 [VMEXIT_CR13_WRITE] = "CR13_WRITE",
2705 [VMEXIT_CR14_WRITE] = "CR14_WRITE",
2706 [VMEXIT_CR15_WRITE] = "CR15_WRITE",
2707 [VMEXIT_DR0_READ] = "DR0_READ",
2708 [VMEXIT_DR1_READ] = "DR1_READ",
2709 [VMEXIT_DR2_READ] = "DR2_READ",
2710 [VMEXIT_DR3_READ] = "DR3_READ",
2711 [VMEXIT_DR4_READ] = "DR4_READ",
2712 [VMEXIT_DR5_READ] = "DR5_READ",
2713 [VMEXIT_DR6_READ] = "DR6_READ",
2714 [VMEXIT_DR7_READ] = "DR7_READ",
2715 [VMEXIT_DR8_READ] = "DR8_READ",
2716 [VMEXIT_DR9_READ] = "DR9_READ",
2717 [VMEXIT_DR10_READ] = "DR10_READ",
2718 [VMEXIT_DR11_READ] = "DR11_READ",
2719 [VMEXIT_DR12_READ] = "DR12_READ",
2720 [VMEXIT_DR13_READ] = "DR13_READ",
2721 [VMEXIT_DR14_READ] = "DR14_READ",
2722 [VMEXIT_DR15_READ] = "DR15_READ",
2723 [VMEXIT_DR0_WRITE] = "DR0_WRITE",
2724 [VMEXIT_DR1_WRITE] = "DR1_WRITE",
2725 [VMEXIT_DR2_WRITE] = "DR2_WRITE",
2726 [VMEXIT_DR3_WRITE] = "DR3_WRITE",
2727 [VMEXIT_DR4_WRITE] = "DR4_WRITE",
2728 [VMEXIT_DR5_WRITE] = "DR5_WRITE",
2729 [VMEXIT_DR6_WRITE] = "DR6_WRITE",
2730 [VMEXIT_DR7_WRITE] = "DR7_WRITE",
2731 [VMEXIT_DR8_WRITE] = "DR8_WRITE",
2732 [VMEXIT_DR9_WRITE] = "DR9_WRITE",
2733 [VMEXIT_DR10_WRITE] = "DR10_WRITE",
2734 [VMEXIT_DR11_WRITE] = "DR11_WRITE",
2735 [VMEXIT_DR12_WRITE] = "DR12_WRITE",
2736 [VMEXIT_DR13_WRITE] = "DR13_WRITE",
2737 [VMEXIT_DR14_WRITE] = "DR14_WRITE",
2738 [VMEXIT_DR15_WRITE] = "DR15_WRITE",
2739 [VMEXIT_EXCEPTION_DE] = "EXCEPTION_DE",
2740 [VMEXIT_EXCEPTION_DB] = "EXCEPTION_DB",
2741 [VMEXIT_EXCEPTION_NMI] = "EXCEPTION_NMI",
2742 [VMEXIT_EXCEPTION_BP] = "EXCEPTION_BP",
2743 [VMEXIT_EXCEPTION_OF] = "EXCEPTION_OF",
2744 [VMEXIT_EXCEPTION_BR] = "EXCEPTION_BR",
2745 [VMEXIT_EXCEPTION_UD] = "EXCEPTION_UD",
2746 [VMEXIT_EXCEPTION_NM] = "EXCEPTION_NM",
2747 [VMEXIT_EXCEPTION_DF] = "EXCEPTION_DF",
2748 [VMEXIT_EXCEPTION_09] = "EXCEPTION_09",
2749 [VMEXIT_EXCEPTION_TS] = "EXCEPTION_TS",
2750 [VMEXIT_EXCEPTION_NP] = "EXCEPTION_NP",
2751 [VMEXIT_EXCEPTION_SS] = "EXCEPTION_SS",
2752 [VMEXIT_EXCEPTION_GP] = "EXCEPTION_GP",
2753 [VMEXIT_EXCEPTION_PF] = "EXCEPTION_PF",
2754 [VMEXIT_EXCEPTION_15] = "EXCEPTION_15",
2755 [VMEXIT_EXCEPTION_MF] = "EXCEPTION_MF",
2756 [VMEXIT_EXCEPTION_AC] = "EXCEPTION_AC",
2757 [VMEXIT_EXCEPTION_MC] = "EXCEPTION_MC",
2758 [VMEXIT_EXCEPTION_XF] = "EXCEPTION_XF",
2759 [VMEXIT_INTR] = "INTR",
2760 [VMEXIT_NMI] = "NMI",
2761 [VMEXIT_SMI] = "SMI",
2762 [VMEXIT_INIT] = "INIT",
2763 [VMEXIT_VINTR] = "VINTR",
2764 [VMEXIT_CR0_SEL_WRITE] = "CR0_SEL_WRITE",
2765 [VMEXIT_IDTR_READ] = "IDTR_READ",
2766 [VMEXIT_GDTR_READ] = "GDTR_READ",
2767 [VMEXIT_LDTR_READ] = "LDTR_READ",
2768 [VMEXIT_TR_READ] = "TR_READ",
2769 [VMEXIT_IDTR_WRITE] = "IDTR_WRITE",
2770 [VMEXIT_GDTR_WRITE] = "GDTR_WRITE",
2771 [VMEXIT_LDTR_WRITE] = "LDTR_WRITE",
2772 [VMEXIT_TR_WRITE] = "TR_WRITE",
2773 [VMEXIT_RDTSC] = "RDTSC",
2774 [VMEXIT_RDPMC] = "RDPMC",
2775 [VMEXIT_PUSHF] = "PUSHF",
2776 [VMEXIT_POPF] = "POPF",
2777 [VMEXIT_CPUID] = "CPUID",
2778 [VMEXIT_RSM] = "RSM",
2779 [VMEXIT_IRET] = "IRET",
2780 [VMEXIT_SWINT] = "SWINT",
2781 [VMEXIT_INVD] = "INVD",
2782 [VMEXIT_PAUSE] = "PAUSE",
2783 [VMEXIT_HLT] = "HLT",
2784 [VMEXIT_INVLPG] = "INVLPG",
2785 [VMEXIT_INVLPGA] = "INVLPGA",
2786 [VMEXIT_IOIO] = "IOIO",
2787 [VMEXIT_MSR] = "MSR",
2788 [VMEXIT_TASK_SWITCH] = "TASK_SWITCH",
2789 [VMEXIT_FERR_FREEZE] = "FERR_FREEZE",
2790 [VMEXIT_SHUTDOWN] = "SHUTDOWN",
2791 [VMEXIT_VMRUN] = "VMRUN",
2792 [VMEXIT_VMMCALL] = "VMMCALL",
2793 [VMEXIT_VMLOAD] = "VMLOAD",
2794 [VMEXIT_VMSAVE] = "VMSAVE",
2795 [VMEXIT_STGI] = "STGI",
2796 [VMEXIT_CLGI] = "CLGI",
2797 [VMEXIT_SKINIT] = "SKINIT",
2798 [VMEXIT_RDTSCP] = "RDTSCP",
2799 [VMEXIT_ICEBP] = "ICEBP",
2800 [VMEXIT_NPF] = "NPF"
2801 };
2802 #endif /* SVM_EXTRA_DEBUG */
2804 #ifdef SVM_WALK_GUEST_PAGES
2805 void walk_shadow_and_guest_pt(unsigned long gva)
2807 l2_pgentry_t gpde;
2808 l2_pgentry_t spde;
2809 l1_pgentry_t gpte;
2810 l1_pgentry_t spte;
2811 struct vcpu *v = current;
2812 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
2813 paddr_t gpa;
2815 gpa = paging_gva_to_gpa(current, gva);
2816 printk("gva = %lx, gpa=%"PRIpaddr", gCR3=%x\n", gva, gpa, (u32)vmcb->cr3);
2817 if( !svm_paging_enabled(v) || mmio_space(gpa) )
2818 return;
2820 /* let's dump the guest and shadow page info */
2822 __guest_get_l2e(v, gva, &gpde);
2823 printk( "G-PDE = %x, flags=%x\n", gpde.l2, l2e_get_flags(gpde) );
2824 __shadow_get_l2e( v, gva, &spde );
2825 printk( "S-PDE = %x, flags=%x\n", spde.l2, l2e_get_flags(spde) );
2827 if ( unlikely(!(l2e_get_flags(gpde) & _PAGE_PRESENT)) )
2828 return;
2830 spte = l1e_empty();
2832 /* This is actually overkill - we only need to ensure the hl2 is in sync. */
2833 shadow_sync_va(v, gva);
2835 gpte.l1 = 0;
2836 __copy_from_user(&gpte, &__linear_l1_table[ l1_linear_offset(gva) ],
2837 sizeof(gpte) );
2838 printk( "G-PTE = %x, flags=%x\n", gpte.l1, l1e_get_flags(gpte) );
2840 BUG(); /* Need to think about this, and convert usage of
2841 * phys_to_machine_mapping to use pagetable format... */
2842 __copy_from_user( &spte, &phys_to_machine_mapping[ l1e_get_pfn( gpte ) ],
2843 sizeof(spte) );
2845 printk( "S-PTE = %x, flags=%x\n", spte.l1, l1e_get_flags(spte));
2847 #endif /* SVM_WALK_GUEST_PAGES */
2850 asmlinkage void svm_vmexit_handler(struct cpu_user_regs *regs)
2852 unsigned int exit_reason;
2853 unsigned long eip;
2854 struct vcpu *v = current;
2855 int do_debug = 0;
2856 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
2858 ASSERT(vmcb);
2860 exit_reason = vmcb->exitcode;
2861 save_svm_cpu_user_regs(v, regs);
2863 HVMTRACE_2D(VMEXIT, v, vmcb->rip, exit_reason);
2865 if (exit_reason == VMEXIT_INVALID)
2867 svm_dump_vmcb(__func__, vmcb);
2868 goto exit_and_crash;
2871 #ifdef SVM_EXTRA_DEBUG
2873 #if defined(__i386__)
2874 #define rip eip
2875 #endif
2877 static unsigned long intercepts_counter = 0;
2879 if (svm_dbg_on && exit_reason == VMEXIT_EXCEPTION_PF)
2881 if (svm_paging_enabled(v) &&
2882 !mmio_space(
2883 paging_gva_to_gfn(current, vmcb->exitinfo2) << PAGE_SHIFT))
2885 printk("I%08ld,ExC=%s(%d),IP=%x:%"PRIx64","
2886 "I1=%"PRIx64",I2=%"PRIx64",INT=%"PRIx64", "
2887 "gpa=%"PRIx64"\n", intercepts_counter,
2888 exit_reasons[exit_reason], exit_reason, regs->cs,
2889 (u64)regs->rip,
2890 (u64)vmcb->exitinfo1,
2891 (u64)vmcb->exitinfo2,
2892 (u64)vmcb->exitintinfo.bytes,
2893 (((u64)paging_gva_to_gfn(current, vmcb->exitinfo2)
2894 << PAGE_SHIFT) | (vmcb->exitinfo2 & ~PAGE_MASK)));
2896 else
2898 printk("I%08ld,ExC=%s(%d),IP=%x:%"PRIx64","
2899 "I1=%"PRIx64",I2=%"PRIx64",INT=%"PRIx64"\n",
2900 intercepts_counter,
2901 exit_reasons[exit_reason], exit_reason, regs->cs,
2902 (u64)regs->rip,
2903 (u64)vmcb->exitinfo1,
2904 (u64)vmcb->exitinfo2,
2905 (u64)vmcb->exitintinfo.bytes );
2908 else if ( svm_dbg_on
2909 && exit_reason != VMEXIT_IOIO
2910 && exit_reason != VMEXIT_INTR)
2913 if (exit_reasons[exit_reason])
2915 printk("I%08ld,ExC=%s(%d),IP=%x:%"PRIx64","
2916 "I1=%"PRIx64",I2=%"PRIx64",INT=%"PRIx64"\n",
2917 intercepts_counter,
2918 exit_reasons[exit_reason], exit_reason, regs->cs,
2919 (u64)regs->rip,
2920 (u64)vmcb->exitinfo1,
2921 (u64)vmcb->exitinfo2,
2922 (u64)vmcb->exitintinfo.bytes);
2924 else
2926 printk("I%08ld,ExC=%d(0x%x),IP=%x:%"PRIx64","
2927 "I1=%"PRIx64",I2=%"PRIx64",INT=%"PRIx64"\n",
2928 intercepts_counter, exit_reason, exit_reason, regs->cs,
2929 (u64)regs->rip,
2930 (u64)vmcb->exitinfo1,
2931 (u64)vmcb->exitinfo2,
2932 (u64)vmcb->exitintinfo.bytes);
2936 #ifdef SVM_WALK_GUEST_PAGES
2937 if( exit_reason == VMEXIT_EXCEPTION_PF
2938 && ( ( vmcb->exitinfo2 == vmcb->rip )
2939 || vmcb->exitintinfo.bytes) )
2941 if ( svm_paging_enabled(v) &&
2942 !mmio_space(gva_to_gpa(vmcb->exitinfo2)) )
2943 walk_shadow_and_guest_pt(vmcb->exitinfo2);
2945 #endif
2947 intercepts_counter++;
2949 #if 0
2950 if (svm_dbg_on)
2951 do_debug = svm_do_debugout(exit_reason);
2952 #endif
2954 if (do_debug)
2956 printk("%s:+ guest_table = 0x%08x, monitor_table = 0x%08x, "
2957 "hw_cr3 = 0x%16lx\n",
2958 __func__,
2959 (int) v->arch.guest_table.pfn,
2960 (int) v->arch.monitor_table.pfn,
2961 (long unsigned int) v->arch.hvm_vcpu.hw_cr3);
2963 svm_dump_vmcb(__func__, vmcb);
2964 svm_dump_regs(__func__, regs);
2965 svm_dump_inst(svm_rip2pointer(v));
2968 #if defined(__i386__)
2969 #undef rip
2970 #endif
2973 #endif /* SVM_EXTRA_DEBUG */
2976 perfc_incra(svmexits, exit_reason);
2977 eip = vmcb->rip;
2979 #ifdef SVM_EXTRA_DEBUG
2980 if (do_debug)
2982 printk("eip = %lx, exit_reason = %d (0x%x)\n",
2983 eip, exit_reason, exit_reason);
2985 #endif /* SVM_EXTRA_DEBUG */
2987 switch (exit_reason)
2989 case VMEXIT_EXCEPTION_DB:
2991 #ifdef XEN_DEBUGGER
2992 svm_debug_save_cpu_user_regs(regs);
2993 pdb_handle_exception(1, regs, 1);
2994 svm_debug_restore_cpu_user_regs(regs);
2995 #else
2996 svm_store_cpu_user_regs(regs, v);
2997 domain_pause_for_debugger();
2998 #endif
3000 break;
3002 case VMEXIT_INTR:
3003 /* Asynchronous event, handled when we STGI'd after the VMEXIT. */
3004 HVMTRACE_0D(INTR, v);
3005 break;
3006 case VMEXIT_NMI:
3007 /* Asynchronous event, handled when we STGI'd after the VMEXIT. */
3008 HVMTRACE_0D(NMI, v);
3009 break;
3010 case VMEXIT_SMI:
3011 /* Asynchronous event, handled when we STGI'd after the VMEXIT. */
3012 HVMTRACE_0D(SMI, v);
3013 break;
3015 case VMEXIT_INIT:
3016 BUG(); /* unreachable */
3018 case VMEXIT_EXCEPTION_BP:
3019 #ifdef XEN_DEBUGGER
3020 svm_debug_save_cpu_user_regs(regs);
3021 pdb_handle_exception(3, regs, 1);
3022 svm_debug_restore_cpu_user_regs(regs);
3023 #else
3024 if ( test_bit(_DOMF_debugging, &v->domain->domain_flags) )
3025 domain_pause_for_debugger();
3026 else
3027 svm_inject_exception(v, TRAP_int3, 0, 0);
3028 #endif
3029 break;
3031 case VMEXIT_EXCEPTION_NM:
3032 svm_do_no_device_fault(vmcb);
3033 break;
3035 case VMEXIT_EXCEPTION_GP:
3036 /* This should probably not be trapped in the future */
3037 regs->error_code = vmcb->exitinfo1;
3038 svm_do_general_protection_fault(v, regs);
3039 break;
3041 case VMEXIT_EXCEPTION_PF:
3043 unsigned long va;
3044 va = vmcb->exitinfo2;
3045 regs->error_code = vmcb->exitinfo1;
3046 HVM_DBG_LOG(DBG_LEVEL_VMMU,
3047 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
3048 (unsigned long)regs->eax, (unsigned long)regs->ebx,
3049 (unsigned long)regs->ecx, (unsigned long)regs->edx,
3050 (unsigned long)regs->esi, (unsigned long)regs->edi);
3052 if ( svm_do_page_fault(va, regs) )
3054 HVMTRACE_2D(PF_XEN, v, va, regs->error_code);
3055 break;
3058 v->arch.hvm_svm.cpu_cr2 = vmcb->cr2 = va;
3059 svm_inject_exception(v, TRAP_page_fault, 1, regs->error_code);
3060 break;
3063 case VMEXIT_EXCEPTION_DF:
3064 /* Dump debug info to help diagnose why the guest double-faulted. */
3065 svm_dump_vmcb(__func__, vmcb);
3066 svm_dump_regs(__func__, regs);
3067 svm_dump_inst(svm_rip2pointer(v));
3068 svm_inject_exception(v, TRAP_double_fault, 1, 0);
3069 break;
3071 case VMEXIT_VINTR:
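/* An interrupt window has opened: drop the pending virtual-interrupt
 * request and the VINTR intercept; the interrupt itself is delivered via
 * the normal injection path. */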
3072 vmcb->vintr.fields.irq = 0;
3073 vmcb->general1_intercepts &= ~GENERAL1_INTERCEPT_VINTR;
3074 break;
3076 case VMEXIT_INVD:
3077 svm_vmexit_do_invd(v);
3078 break;
3080 case VMEXIT_GDTR_WRITE:
3081 printk("WRITE to GDTR\n");
3082 break;
3084 case VMEXIT_TASK_SWITCH:
3085 goto exit_and_crash;
3087 case VMEXIT_CPUID:
3088 svm_vmexit_do_cpuid(vmcb, regs);
3089 break;
3091 case VMEXIT_HLT:
3092 svm_vmexit_do_hlt(vmcb);
3093 break;
3095 case VMEXIT_INVLPG:
3096 svm_handle_invlpg(0, regs);
3097 break;
3099 case VMEXIT_INVLPGA:
3100 svm_handle_invlpg(1, regs);
3101 break;
3103 case VMEXIT_VMMCALL: {
3104 int inst_len = __get_instruction_length(v, INSTR_VMCALL, NULL);
3105 ASSERT(inst_len > 0);
3106 HVMTRACE_1D(VMMCALL, v, regs->eax);
3107 __update_guest_eip(vmcb, inst_len);
3108 hvm_do_hypercall(regs);
3109 break;
3112 case VMEXIT_CR0_READ:
3113 svm_cr_access(v, 0, TYPE_MOV_FROM_CR, regs);
3114 break;
3116 case VMEXIT_CR2_READ:
3117 svm_cr_access(v, 2, TYPE_MOV_FROM_CR, regs);
3118 break;
3120 case VMEXIT_CR3_READ:
3121 svm_cr_access(v, 3, TYPE_MOV_FROM_CR, regs);
3122 break;
3124 case VMEXIT_CR4_READ:
3125 svm_cr_access(v, 4, TYPE_MOV_FROM_CR, regs);
3126 break;
3128 case VMEXIT_CR8_READ:
3129 svm_cr_access(v, 8, TYPE_MOV_FROM_CR, regs);
3130 break;
3132 case VMEXIT_CR0_WRITE:
3133 svm_cr_access(v, 0, TYPE_MOV_TO_CR, regs);
3134 break;
3136 case VMEXIT_CR2_WRITE:
3137 svm_cr_access(v, 2, TYPE_MOV_TO_CR, regs);
3138 break;
3140 case VMEXIT_CR3_WRITE:
3141 svm_cr_access(v, 3, TYPE_MOV_TO_CR, regs);
3142 local_flush_tlb();
3143 break;
3145 case VMEXIT_CR4_WRITE:
3146 svm_cr_access(v, 4, TYPE_MOV_TO_CR, regs);
3147 break;
3149 case VMEXIT_CR8_WRITE:
3150 svm_cr_access(v, 8, TYPE_MOV_TO_CR, regs);
3151 break;
3153 case VMEXIT_DR0_WRITE ... VMEXIT_DR7_WRITE:
3154 svm_dr_access(v, regs);
3155 break;
3157 case VMEXIT_IOIO:
3158 svm_io_instruction(v);
3159 break;
3161 case VMEXIT_MSR:
3162 svm_do_msr_access(v, regs);
3163 break;
3165 case VMEXIT_SHUTDOWN:
3166 hvm_triple_fault();
3167 break;
3169 case VMEXIT_NPF:
3171 regs->error_code = vmcb->exitinfo1;
3172 if ( !svm_do_nested_pgfault(vmcb->exitinfo2, regs) ) {
3173 domain_crash(v->domain);
3175 break;
3178 default:
3179 exit_and_crash:
3180 gdprintk(XENLOG_ERR, "unexpected VMEXIT: exit reason = 0x%x, "
3181 "exitinfo1 = %"PRIx64", exitinfo2 = %"PRIx64"\n",
3182 exit_reason,
3183 (u64)vmcb->exitinfo1, (u64)vmcb->exitinfo2);
3184 domain_crash(v->domain);
3185 break;
3188 #ifdef SVM_EXTRA_DEBUG
3189 if (do_debug)
3191 printk("%s: Done switch on vmexit_code\n", __func__);
3192 svm_dump_regs(__func__, regs);
3195 if (do_debug)
3197 printk("vmexit_handler():- guest_table = 0x%08x, "
3198 "monitor_table = 0x%08x, hw_cr3 = 0x%16x\n",
3199 (int)v->arch.guest_table.pfn,
3200 (int)v->arch.monitor_table.pfn,
3201 (int)v->arch.hvm_vcpu.hw_cr3);
3202 printk("svm_vmexit_handler: Returning\n");
3204 #endif
3207 asmlinkage void svm_load_cr2(void)
3209 struct vcpu *v = current;
3211 /* This is the last C code executed before the VMRUN instruction. */
3212 HVMTRACE_0D(VMENTRY, v);
3214 local_irq_disable();
3215 asm volatile("mov %0,%%cr2": :"r" (v->arch.hvm_svm.cpu_cr2));
3218 /*
3219 * Local variables:
3220 * mode: C
3221 * c-set-style: "BSD"
3222 * c-basic-offset: 4
3223 * tab-width: 4
3224 * indent-tabs-mode: nil
3225 * End:
3226 */