xen/arch/x86/hvm/svm/svm.c @ 15708:52e5c110aadb (ia64/xen-unstable)

[HVM] Yet another MCA/MCE MSR.
Signed-off-by: Tim Deegan <Tim.Deegan@xensource.com>

author   Tim Deegan <Tim.Deegan@xensource.com>
date     Fri Aug 03 12:10:35 2007 +0100
parents  0636f262ecd8
children 0f541efbb6d6
1 /*
2 * svm.c: handling SVM architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 * Copyright (c) 2005-2007, Advanced Micro Devices, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
17 * Place - Suite 330, Boston, MA 02111-1307 USA.
18 */
20 #include <xen/config.h>
21 #include <xen/init.h>
22 #include <xen/lib.h>
23 #include <xen/trace.h>
24 #include <xen/sched.h>
25 #include <xen/irq.h>
26 #include <xen/softirq.h>
27 #include <xen/hypercall.h>
28 #include <xen/domain_page.h>
29 #include <asm/current.h>
30 #include <asm/io.h>
31 #include <asm/paging.h>
32 #include <asm/p2m.h>
33 #include <asm/regs.h>
34 #include <asm/cpufeature.h>
35 #include <asm/processor.h>
36 #include <asm/types.h>
37 #include <asm/msr.h>
38 #include <asm/spinlock.h>
39 #include <asm/hvm/hvm.h>
40 #include <asm/hvm/support.h>
41 #include <asm/hvm/io.h>
42 #include <asm/hvm/svm/asid.h>
43 #include <asm/hvm/svm/svm.h>
44 #include <asm/hvm/svm/vmcb.h>
45 #include <asm/hvm/svm/emulate.h>
46 #include <asm/hvm/svm/intr.h>
47 #include <asm/x86_emulate.h>
48 #include <public/sched.h>
49 #include <asm/hvm/vpt.h>
50 #include <asm/hvm/trace.h>
51 #include <asm/hap.h>
53 #define set_segment_register(name, value) \
54 asm volatile ( "movw %%ax ,%%" STR(name) "" : : "a" (value) )
56 enum handler_return { HNDL_done, HNDL_unhandled, HNDL_exception_raised };
58 int inst_copy_from_guest(unsigned char *buf, unsigned long guest_eip,
59 int inst_len);
60 asmlinkage void do_IRQ(struct cpu_user_regs *);
62 static int svm_reset_to_realmode(struct vcpu *v,
63 struct cpu_user_regs *regs);
65 /* va of hardware host save area */
66 static void *hsa[NR_CPUS] __read_mostly;
68 /* vmcb used for extended host state */
69 static void *root_vmcb[NR_CPUS] __read_mostly;
71 /* hardware assisted paging bits */
72 extern int opt_hap_enabled;
74 static void svm_inject_exception(
75 struct vcpu *v, int trap, int ev, int error_code)
76 {
77 eventinj_t event;
78 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
80 if ( trap == TRAP_page_fault )
81 HVMTRACE_2D(PF_INJECT, v, v->arch.hvm_svm.cpu_cr2, error_code);
82 else
83 HVMTRACE_2D(INJ_EXC, v, trap, error_code);
85 event.bytes = 0;
86 event.fields.v = 1;
87 event.fields.type = X86_EVENTTYPE_HW_EXCEPTION;
88 event.fields.vector = trap;
89 event.fields.ev = ev;
90 event.fields.errorcode = error_code;
92 vmcb->eventinj = event;
93 }
95 static void svm_cpu_down(void)
96 {
97 write_efer(read_efer() & ~EFER_SVME);
98 }
100 #ifdef __x86_64__
102 static int svm_lme_is_set(struct vcpu *v)
103 {
104 u64 guest_efer = v->arch.hvm_svm.cpu_shadow_efer;
105 return guest_efer & EFER_LME;
106 }
108 static int svm_long_mode_enabled(struct vcpu *v)
109 {
110 u64 guest_efer = v->arch.hvm_svm.cpu_shadow_efer;
111 return guest_efer & EFER_LMA;
112 }
114 #else /* __i386__ */
116 static int svm_lme_is_set(struct vcpu *v)
117 { return 0; }
118 static int svm_long_mode_enabled(struct vcpu *v)
119 { return 0; }
121 #endif
123 static int svm_cr4_pae_is_set(struct vcpu *v)
124 {
125 unsigned long guest_cr4 = v->arch.hvm_svm.cpu_shadow_cr4;
126 return guest_cr4 & X86_CR4_PAE;
127 }
129 static int svm_paging_enabled(struct vcpu *v)
130 {
131 unsigned long guest_cr0 = v->arch.hvm_svm.cpu_shadow_cr0;
132 return (guest_cr0 & X86_CR0_PE) && (guest_cr0 & X86_CR0_PG);
133 }
135 static int svm_pae_enabled(struct vcpu *v)
136 {
137 unsigned long guest_cr4 = v->arch.hvm_svm.cpu_shadow_cr4;
138 return svm_paging_enabled(v) && (guest_cr4 & X86_CR4_PAE);
139 }
141 static int svm_nx_enabled(struct vcpu *v)
142 {
143 return v->arch.hvm_svm.cpu_shadow_efer & EFER_NX;
144 }
146 static int svm_pgbit_test(struct vcpu *v)
147 {
148 return v->arch.hvm_svm.cpu_shadow_cr0 & X86_CR0_PG;
149 }
151 static void svm_store_cpu_guest_regs(
152 struct vcpu *v, struct cpu_user_regs *regs, unsigned long *crs)
153 {
154 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
156 if ( regs != NULL )
157 {
158 regs->ss = vmcb->ss.sel;
159 regs->esp = vmcb->rsp;
160 regs->eflags = vmcb->rflags;
161 regs->cs = vmcb->cs.sel;
162 regs->eip = vmcb->rip;
163 }
165 if ( crs != NULL )
166 {
167 /* Returning the guest's regs */
168 crs[0] = v->arch.hvm_svm.cpu_shadow_cr0;
169 crs[2] = v->arch.hvm_svm.cpu_cr2;
170 crs[3] = v->arch.hvm_svm.cpu_cr3;
171 crs[4] = v->arch.hvm_svm.cpu_shadow_cr4;
172 }
173 }
175 static enum handler_return long_mode_do_msr_write(struct cpu_user_regs *regs)
176 {
177 u64 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
178 u32 ecx = regs->ecx;
179 struct vcpu *v = current;
180 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
182 HVM_DBG_LOG(DBG_LEVEL_0, "msr %x msr_content %"PRIx64,
183 ecx, msr_content);
185 switch ( ecx )
186 {
187 case MSR_EFER:
188 /* Offending reserved bit will cause #GP. */
189 #ifdef __x86_64__
190 if ( (msr_content & ~(EFER_LME | EFER_LMA | EFER_NX | EFER_SCE)) ||
191 #else
192 if ( (msr_content & ~(EFER_NX | EFER_SCE)) ||
193 #endif
194 (!cpu_has_nx && (msr_content & EFER_NX)) ||
195 (!cpu_has_syscall && (msr_content & EFER_SCE)) )
196 {
197 gdprintk(XENLOG_WARNING, "Trying to set reserved bit in "
198 "EFER: %"PRIx64"\n", msr_content);
199 goto gp_fault;
200 }
202 if ( (msr_content & EFER_LME) && !svm_lme_is_set(v) )
203 {
204 /* EFER.LME transition from 0 to 1. */
205 if ( svm_paging_enabled(v) || !svm_cr4_pae_is_set(v) )
206 {
207 gdprintk(XENLOG_WARNING, "Trying to set LME bit when "
208 "in paging mode or PAE bit is not set\n");
209 goto gp_fault;
210 }
211 }
212 else if ( !(msr_content & EFER_LME) && svm_lme_is_set(v) )
213 {
214 /* EFER.LME transition from 1 to 0. */
215 if ( svm_paging_enabled(v) )
216 {
217 gdprintk(XENLOG_WARNING,
218 "Trying to clear EFER.LME while paging enabled\n");
219 goto gp_fault;
220 }
221 }
223 v->arch.hvm_svm.cpu_shadow_efer = msr_content;
224 vmcb->efer = msr_content | EFER_SVME;
225 if ( !svm_paging_enabled(v) )
226 vmcb->efer &= ~(EFER_LME | EFER_LMA);
228 break;
230 case MSR_K8_MC4_MISC: /* Threshold register */
231 /*
232 * MCA/MCE: Threshold register is reported to be locked, so we ignore
233 * all write accesses. This behaviour matches real HW, so guests should
234 * have no problem with this.
235 */
236 break;
238 default:
239 return HNDL_unhandled;
240 }
242 return HNDL_done;
244 gp_fault:
245 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
246 return HNDL_exception_raised;
247 }
250 #define loaddebug(_v,_reg) \
251 asm volatile ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
252 #define savedebug(_v,_reg) \
253 asm volatile ("mov %%db" #_reg ",%0" : "=r" ((_v)->debugreg[_reg]))
255 static void svm_save_dr(struct vcpu *v)
256 {
257 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
259 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
260 return;
262 /* Clear the DR dirty flag and re-enable intercepts for DR accesses. */
263 v->arch.hvm_vcpu.flag_dr_dirty = 0;
264 v->arch.hvm_svm.vmcb->dr_intercepts = DR_INTERCEPT_ALL_WRITES;
266 savedebug(&v->arch.guest_context, 0);
267 savedebug(&v->arch.guest_context, 1);
268 savedebug(&v->arch.guest_context, 2);
269 savedebug(&v->arch.guest_context, 3);
270 v->arch.guest_context.debugreg[6] = vmcb->dr6;
271 v->arch.guest_context.debugreg[7] = vmcb->dr7;
272 }
275 static void __restore_debug_registers(struct vcpu *v)
276 {
277 loaddebug(&v->arch.guest_context, 0);
278 loaddebug(&v->arch.guest_context, 1);
279 loaddebug(&v->arch.guest_context, 2);
280 loaddebug(&v->arch.guest_context, 3);
281 /* DR6 and DR7 are loaded from the VMCB. */
282 }
285 int svm_vmcb_save(struct vcpu *v, struct hvm_hw_cpu *c)
286 {
287 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
289 c->rip = vmcb->rip;
291 #ifdef HVM_DEBUG_SUSPEND
292 printk("%s: eip=0x%"PRIx64".\n",
293 __func__,
294 c->rip);
295 #endif
297 c->rsp = vmcb->rsp;
298 c->rflags = vmcb->rflags;
300 c->cr0 = v->arch.hvm_svm.cpu_shadow_cr0;
301 c->cr2 = v->arch.hvm_svm.cpu_cr2;
302 c->cr3 = v->arch.hvm_svm.cpu_cr3;
303 c->cr4 = v->arch.hvm_svm.cpu_shadow_cr4;
305 #ifdef HVM_DEBUG_SUSPEND
306 printk("%s: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
307 __func__,
308 c->cr3,
309 c->cr0,
310 c->cr4);
311 #endif
313 c->idtr_limit = vmcb->idtr.limit;
314 c->idtr_base = vmcb->idtr.base;
316 c->gdtr_limit = vmcb->gdtr.limit;
317 c->gdtr_base = vmcb->gdtr.base;
319 c->cs_sel = vmcb->cs.sel;
320 c->cs_limit = vmcb->cs.limit;
321 c->cs_base = vmcb->cs.base;
322 c->cs_arbytes = vmcb->cs.attr.bytes;
324 c->ds_sel = vmcb->ds.sel;
325 c->ds_limit = vmcb->ds.limit;
326 c->ds_base = vmcb->ds.base;
327 c->ds_arbytes = vmcb->ds.attr.bytes;
329 c->es_sel = vmcb->es.sel;
330 c->es_limit = vmcb->es.limit;
331 c->es_base = vmcb->es.base;
332 c->es_arbytes = vmcb->es.attr.bytes;
334 c->ss_sel = vmcb->ss.sel;
335 c->ss_limit = vmcb->ss.limit;
336 c->ss_base = vmcb->ss.base;
337 c->ss_arbytes = vmcb->ss.attr.bytes;
339 c->fs_sel = vmcb->fs.sel;
340 c->fs_limit = vmcb->fs.limit;
341 c->fs_base = vmcb->fs.base;
342 c->fs_arbytes = vmcb->fs.attr.bytes;
344 c->gs_sel = vmcb->gs.sel;
345 c->gs_limit = vmcb->gs.limit;
346 c->gs_base = vmcb->gs.base;
347 c->gs_arbytes = vmcb->gs.attr.bytes;
349 c->tr_sel = vmcb->tr.sel;
350 c->tr_limit = vmcb->tr.limit;
351 c->tr_base = vmcb->tr.base;
352 c->tr_arbytes = vmcb->tr.attr.bytes;
354 c->ldtr_sel = vmcb->ldtr.sel;
355 c->ldtr_limit = vmcb->ldtr.limit;
356 c->ldtr_base = vmcb->ldtr.base;
357 c->ldtr_arbytes = vmcb->ldtr.attr.bytes;
359 c->sysenter_cs = vmcb->sysenter_cs;
360 c->sysenter_esp = vmcb->sysenter_esp;
361 c->sysenter_eip = vmcb->sysenter_eip;
363 c->pending_event = 0;
364 c->error_code = 0;
365 if ( vmcb->eventinj.fields.v &&
366 hvm_event_needs_reinjection(vmcb->eventinj.fields.type,
367 vmcb->eventinj.fields.vector) )
368 {
369 c->pending_event = (uint32_t)vmcb->eventinj.bytes;
370 c->error_code = vmcb->eventinj.fields.errorcode;
371 }
373 return 1;
374 }
377 int svm_vmcb_restore(struct vcpu *v, struct hvm_hw_cpu *c)
378 {
379 unsigned long mfn, old_base_mfn;
380 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
382 vmcb->rip = c->rip;
383 vmcb->rsp = c->rsp;
384 vmcb->rflags = c->rflags;
386 v->arch.hvm_svm.cpu_shadow_cr0 = c->cr0;
387 vmcb->cr0 = c->cr0 | X86_CR0_WP | X86_CR0_ET | X86_CR0_PG;
389 v->arch.hvm_svm.cpu_cr2 = c->cr2;
391 #ifdef HVM_DEBUG_SUSPEND
392 printk("%s: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
393 __func__,
394 c->cr3,
395 c->cr0,
396 c->cr4);
397 #endif
399 if ( !svm_paging_enabled(v) )
400 {
401 printk("%s: paging not enabled.\n", __func__);
402 goto skip_cr3;
403 }
405 if ( c->cr3 == v->arch.hvm_svm.cpu_cr3 )
406 {
407 /*
408 * This is a simple TLB flush, implying the guest has
409 * removed some translation or changed page attributes.
410 * We simply invalidate the shadow.
411 */
412 mfn = gmfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT);
413 if ( mfn != pagetable_get_pfn(v->arch.guest_table) )
414 goto bad_cr3;
415 }
416 else
417 {
418 /*
419 * If different, make a shadow. Check if the PDBR is valid
420 * first.
421 */
422 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %"PRIx64, c->cr3);
423 mfn = gmfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT);
424 if( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
425 goto bad_cr3;
427 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
428 v->arch.guest_table = pagetable_from_pfn(mfn);
429 if (old_base_mfn)
430 put_page(mfn_to_page(old_base_mfn));
431 v->arch.hvm_svm.cpu_cr3 = c->cr3;
432 }
434 skip_cr3:
435 vmcb->cr4 = c->cr4 | HVM_CR4_HOST_MASK;
436 v->arch.hvm_svm.cpu_shadow_cr4 = c->cr4;
438 vmcb->idtr.limit = c->idtr_limit;
439 vmcb->idtr.base = c->idtr_base;
441 vmcb->gdtr.limit = c->gdtr_limit;
442 vmcb->gdtr.base = c->gdtr_base;
444 vmcb->cs.sel = c->cs_sel;
445 vmcb->cs.limit = c->cs_limit;
446 vmcb->cs.base = c->cs_base;
447 vmcb->cs.attr.bytes = c->cs_arbytes;
449 vmcb->ds.sel = c->ds_sel;
450 vmcb->ds.limit = c->ds_limit;
451 vmcb->ds.base = c->ds_base;
452 vmcb->ds.attr.bytes = c->ds_arbytes;
454 vmcb->es.sel = c->es_sel;
455 vmcb->es.limit = c->es_limit;
456 vmcb->es.base = c->es_base;
457 vmcb->es.attr.bytes = c->es_arbytes;
459 vmcb->ss.sel = c->ss_sel;
460 vmcb->ss.limit = c->ss_limit;
461 vmcb->ss.base = c->ss_base;
462 vmcb->ss.attr.bytes = c->ss_arbytes;
463 vmcb->cpl = vmcb->ss.attr.fields.dpl;
465 vmcb->fs.sel = c->fs_sel;
466 vmcb->fs.limit = c->fs_limit;
467 vmcb->fs.base = c->fs_base;
468 vmcb->fs.attr.bytes = c->fs_arbytes;
470 vmcb->gs.sel = c->gs_sel;
471 vmcb->gs.limit = c->gs_limit;
472 vmcb->gs.base = c->gs_base;
473 vmcb->gs.attr.bytes = c->gs_arbytes;
475 vmcb->tr.sel = c->tr_sel;
476 vmcb->tr.limit = c->tr_limit;
477 vmcb->tr.base = c->tr_base;
478 vmcb->tr.attr.bytes = c->tr_arbytes;
480 vmcb->ldtr.sel = c->ldtr_sel;
481 vmcb->ldtr.limit = c->ldtr_limit;
482 vmcb->ldtr.base = c->ldtr_base;
483 vmcb->ldtr.attr.bytes = c->ldtr_arbytes;
485 vmcb->sysenter_cs = c->sysenter_cs;
486 vmcb->sysenter_esp = c->sysenter_esp;
487 vmcb->sysenter_eip = c->sysenter_eip;
489 if ( paging_mode_hap(v->domain) )
490 {
491 vmcb->cr0 = v->arch.hvm_svm.cpu_shadow_cr0;
492 vmcb->cr4 = (v->arch.hvm_svm.cpu_shadow_cr4 |
493 (HVM_CR4_HOST_MASK & ~X86_CR4_PAE));
494 vmcb->cr3 = c->cr3;
495 vmcb->np_enable = 1;
496 vmcb->g_pat = 0x0007040600070406ULL; /* guest PAT */
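/* 0x0007040600070406 is the architectural power-on default PAT: entries
 * 0-3 are WB(06), WT(04), UC-(07), UC(00), repeated for entries 4-7. */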
497 vmcb->h_cr3 = pagetable_get_paddr(v->domain->arch.phys_table);
498 }
500 vmcb->dr6 = c->dr6;
501 vmcb->dr7 = c->dr7;
503 if ( c->pending_valid )
504 {
505 gdprintk(XENLOG_INFO, "Re-injecting 0x%"PRIx32", 0x%"PRIx32"\n",
506 c->pending_event, c->error_code);
508 if ( (c->pending_type == 1) || (c->pending_type > 6) ||
509 (c->pending_reserved != 0) )
510 {
511 gdprintk(XENLOG_ERR, "Invalid pending event 0x%"PRIx32"\n",
512 c->pending_event);
513 return -EINVAL;
514 }
516 if ( hvm_event_needs_reinjection(c->pending_type, c->pending_vector) )
517 {
518 vmcb->eventinj.bytes = c->pending_event;
519 vmcb->eventinj.fields.errorcode = c->error_code;
520 }
521 }
523 paging_update_paging_modes(v);
524 svm_asid_g_update_paging(v);
526 return 0;
528 bad_cr3:
529 gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%"PRIx64"\n", c->cr3);
530 return -EINVAL;
531 }
534 static void svm_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
535 {
536 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
538 data->shadow_gs = vmcb->kerngsbase;
539 data->msr_lstar = vmcb->lstar;
540 data->msr_star = vmcb->star;
541 data->msr_cstar = vmcb->cstar;
542 data->msr_syscall_mask = vmcb->sfmask;
543 data->msr_efer = v->arch.hvm_svm.cpu_shadow_efer;
544 data->msr_flags = -1ULL;
546 data->tsc = hvm_get_guest_time(v);
547 }
550 static void svm_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
551 {
552 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
554 vmcb->kerngsbase = data->shadow_gs;
555 vmcb->lstar = data->msr_lstar;
556 vmcb->star = data->msr_star;
557 vmcb->cstar = data->msr_cstar;
558 vmcb->sfmask = data->msr_syscall_mask;
559 v->arch.hvm_svm.cpu_shadow_efer = data->msr_efer;
560 vmcb->efer = data->msr_efer | EFER_SVME;
561 /* VMCB's EFER.LME isn't set unless we're actually in long mode
562 * (see long_mode_do_msr_write()) */
563 if ( !(vmcb->efer & EFER_LMA) )
564 vmcb->efer &= ~EFER_LME;
566 hvm_set_guest_time(v, data->tsc);
567 }
569 static void svm_save_vmcb_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
570 {
571 svm_save_cpu_state(v, ctxt);
572 svm_vmcb_save(v, ctxt);
573 }
575 static int svm_load_vmcb_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
576 {
577 svm_load_cpu_state(v, ctxt);
578 if (svm_vmcb_restore(v, ctxt)) {
579 printk("svm_vmcb restore failed!\n");
580 domain_crash(v->domain);
581 return -EINVAL;
582 }
584 return 0;
585 }
587 static void svm_restore_dr(struct vcpu *v)
588 {
589 if ( unlikely(v->arch.guest_context.debugreg[7] & 0xFF) )
590 __restore_debug_registers(v);
591 }
593 static int svm_interrupts_enabled(struct vcpu *v, enum hvm_intack type)
594 {
595 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
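/* The VMCB interrupt_shadow flag indicates the guest is in an interrupt
 * shadow (e.g. the instruction after STI or MOV SS), so event delivery
 * must be deferred until it clears. */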
597 if ( type == hvm_intack_nmi )
598 return !vmcb->interrupt_shadow;
600 ASSERT((type == hvm_intack_pic) || (type == hvm_intack_lapic));
601 return !irq_masked(vmcb->rflags) && !vmcb->interrupt_shadow;
602 }
604 static int svm_guest_x86_mode(struct vcpu *v)
605 {
606 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
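/* Returns 0 for real mode, 1 for virtual-8086 mode, 8 for a 64-bit code
 * segment in long mode, and otherwise 4 or 2 for 32- or 16-bit protected
 * mode (taken from cs.db). */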
608 if ( unlikely(!(v->arch.hvm_svm.cpu_shadow_cr0 & X86_CR0_PE)) )
609 return 0;
610 if ( unlikely(vmcb->rflags & X86_EFLAGS_VM) )
611 return 1;
612 if ( svm_long_mode_enabled(v) && likely(vmcb->cs.attr.fields.l) )
613 return 8;
614 return (likely(vmcb->cs.attr.fields.db) ? 4 : 2);
615 }
617 static void svm_update_host_cr3(struct vcpu *v)
618 {
619 /* SVM doesn't have a HOST_CR3 equivalent to update. */
620 }
622 static void svm_update_guest_cr3(struct vcpu *v)
623 {
624 v->arch.hvm_svm.vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3;
625 }
627 static void svm_flush_guest_tlbs(void)
628 {
629 /* Roll over the CPU's ASID generation, so it gets a clean TLB when we
630 * next VMRUN. (If ASIDs are disabled, the whole TLB is flushed on
631 * VMRUN anyway). */
632 svm_asid_inc_generation();
633 }
635 static void svm_update_vtpr(struct vcpu *v, unsigned long value)
636 {
637 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
639 vmcb->vintr.fields.tpr = value & 0x0f;
640 }
642 static unsigned long svm_get_ctrl_reg(struct vcpu *v, unsigned int num)
643 {
644 switch ( num )
645 {
646 case 0:
647 return v->arch.hvm_svm.cpu_shadow_cr0;
648 case 2:
649 return v->arch.hvm_svm.cpu_cr2;
650 case 3:
651 return v->arch.hvm_svm.cpu_cr3;
652 case 4:
653 return v->arch.hvm_svm.cpu_shadow_cr4;
654 default:
655 BUG();
656 }
657 return 0; /* dummy */
658 }
660 static void svm_sync_vmcb(struct vcpu *v)
661 {
662 struct arch_svm_struct *arch_svm = &v->arch.hvm_svm;
664 if ( arch_svm->vmcb_in_sync )
665 return;
667 arch_svm->vmcb_in_sync = 1;
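/* The raw bytes 0f 01 db encode VMSAVE.  It writes back to the VMCB the
 * state that is not saved automatically on #VMEXIT: FS, GS, TR and LDTR
 * (including hidden fields), KernelGSbase, STAR/LSTAR/CSTAR/SFMASK and
 * the SYSENTER MSRs.  Syncing lazily here keeps later reads of those
 * VMCB fields (e.g. in svm_get_segment_base) accurate. */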
669 asm volatile (
670 ".byte 0x0f,0x01,0xdb" /* vmsave */
671 : : "a" (__pa(arch_svm->vmcb)) );
672 }
674 static unsigned long svm_get_segment_base(struct vcpu *v, enum x86_segment seg)
675 {
676 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
677 int long_mode = vmcb->cs.attr.fields.l && svm_long_mode_enabled(v);
679 switch ( seg )
680 {
681 case x86_seg_cs: return long_mode ? 0 : vmcb->cs.base;
682 case x86_seg_ds: return long_mode ? 0 : vmcb->ds.base;
683 case x86_seg_es: return long_mode ? 0 : vmcb->es.base;
684 case x86_seg_fs: svm_sync_vmcb(v); return vmcb->fs.base;
685 case x86_seg_gs: svm_sync_vmcb(v); return vmcb->gs.base;
686 case x86_seg_ss: return long_mode ? 0 : vmcb->ss.base;
687 case x86_seg_tr: svm_sync_vmcb(v); return vmcb->tr.base;
688 case x86_seg_gdtr: return vmcb->gdtr.base;
689 case x86_seg_idtr: return vmcb->idtr.base;
690 case x86_seg_ldtr: svm_sync_vmcb(v); return vmcb->ldtr.base;
691 }
692 BUG();
693 return 0;
694 }
696 static void svm_get_segment_register(struct vcpu *v, enum x86_segment seg,
697 struct segment_register *reg)
698 {
699 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
700 switch ( seg )
701 {
702 case x86_seg_cs:
703 memcpy(reg, &vmcb->cs, sizeof(*reg));
704 break;
705 case x86_seg_ds:
706 memcpy(reg, &vmcb->ds, sizeof(*reg));
707 break;
708 case x86_seg_es:
709 memcpy(reg, &vmcb->es, sizeof(*reg));
710 break;
711 case x86_seg_fs:
712 svm_sync_vmcb(v);
713 memcpy(reg, &vmcb->fs, sizeof(*reg));
714 break;
715 case x86_seg_gs:
716 svm_sync_vmcb(v);
717 memcpy(reg, &vmcb->gs, sizeof(*reg));
718 break;
719 case x86_seg_ss:
720 memcpy(reg, &vmcb->ss, sizeof(*reg));
721 break;
722 case x86_seg_tr:
723 svm_sync_vmcb(v);
724 memcpy(reg, &vmcb->tr, sizeof(*reg));
725 break;
726 case x86_seg_gdtr:
727 memcpy(reg, &vmcb->gdtr, sizeof(*reg));
728 break;
729 case x86_seg_idtr:
730 memcpy(reg, &vmcb->idtr, sizeof(*reg));
731 break;
732 case x86_seg_ldtr:
733 svm_sync_vmcb(v);
734 memcpy(reg, &vmcb->ldtr, sizeof(*reg));
735 break;
736 default: BUG();
737 }
738 }
740 /* Make sure that xen intercepts any FP accesses from current */
741 static void svm_stts(struct vcpu *v)
742 {
743 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
745 /*
746 * If the guest does not have TS enabled then we must cause and handle an
747 * exception on first use of the FPU. If the guest *does* have TS enabled
748 * then this is not necessary: no FPU activity can occur until the guest
749 * clears CR0.TS, and we will initialise the FPU when that happens.
750 */
751 if ( !(v->arch.hvm_svm.cpu_shadow_cr0 & X86_CR0_TS) )
752 {
753 v->arch.hvm_svm.vmcb->exception_intercepts |= 1U << TRAP_no_device;
754 vmcb->cr0 |= X86_CR0_TS;
755 }
756 }
759 static void svm_set_tsc_offset(struct vcpu *v, u64 offset)
760 {
761 v->arch.hvm_svm.vmcb->tsc_offset = offset;
762 }
765 static void svm_init_ap_context(
766 struct vcpu_guest_context *ctxt, int vcpuid, int trampoline_vector)
767 {
768 struct vcpu *v;
769 struct vmcb_struct *vmcb;
770 cpu_user_regs_t *regs;
771 u16 cs_sel;
773 /* We know this is safe because hvm_bringup_ap() does it */
774 v = current->domain->vcpu[vcpuid];
775 vmcb = v->arch.hvm_svm.vmcb;
776 regs = &v->arch.guest_context.user_regs;
778 memset(ctxt, 0, sizeof(*ctxt));
780 /*
781 * We execute the trampoline code in real mode. The trampoline vector
782 * passed to us is page aligned and is the physical frame number for
783 * the code. We will execute this code in real mode.
784 */
785 cs_sel = trampoline_vector << 8;
786 ctxt->user_regs.eip = 0x0;
787 ctxt->user_regs.cs = cs_sel;
789 /*
790 * This is the launch of an AP; set state so that we begin executing
791 * the trampoline code in real-mode.
792 */
793 svm_reset_to_realmode(v, regs);
794 /* Adjust the vmcb's hidden register state. */
795 vmcb->rip = 0;
796 vmcb->cs.sel = cs_sel;
797 vmcb->cs.base = (cs_sel << 4);
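/* Worked example (hypothetical value): trampoline_vector 0x9f gives
 * cs_sel 0x9f00 and cs.base 0x9f000, so the AP starts at the bottom of
 * guest page frame 0x9f, matching real-mode CS:IP = 9f00:0000. */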
798 }
800 static void svm_init_hypercall_page(struct domain *d, void *hypercall_page)
801 {
802 char *p;
803 int i;
805 for ( i = 0; i < (PAGE_SIZE / 32); i++ )
806 {
807 p = (char *)(hypercall_page + (i * 32));
808 *(u8 *)(p + 0) = 0xb8; /* mov imm32, %eax */
809 *(u32 *)(p + 1) = i;
810 *(u8 *)(p + 5) = 0x0f; /* vmmcall */
811 *(u8 *)(p + 6) = 0x01;
812 *(u8 *)(p + 7) = 0xd9;
813 *(u8 *)(p + 8) = 0xc3; /* ret */
814 }
816 /* Don't support HYPERVISOR_iret at the moment */
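/* 0x0b0f stored as a little-endian u16 emits the byte sequence 0f 0b,
 * i.e. UD2, so the unsupported HYPERVISOR_iret entry raises #UD rather
 * than performing a hypercall. */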
817 *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
818 }
820 static void svm_load_cpu_guest_regs(struct vcpu *v, struct cpu_user_regs *regs)
821 {
822 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
824 vmcb->ss.sel = regs->ss;
825 vmcb->rsp = regs->esp;
826 vmcb->rflags = regs->eflags | 2UL;
827 vmcb->cs.sel = regs->cs;
828 vmcb->rip = regs->eip;
829 }
831 static void svm_ctxt_switch_from(struct vcpu *v)
832 {
833 int cpu = smp_processor_id();
835 svm_save_dr(v);
837 svm_sync_vmcb(v);
839 asm volatile (
840 ".byte 0x0f,0x01,0xda" /* vmload */
841 : : "a" (__pa(root_vmcb[cpu])) );
843 #ifdef __x86_64__
844 /* Resume use of ISTs now that the host TR is reinstated. */
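/* Bits 34:32 of the gate's low 8 bytes (the .a word) hold the 3-bit IST
 * index; restoring IST_DF/IST_NMI/IST_MCE re-enables the dedicated
 * stacks that svm_ctxt_switch_to() cleared. */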
845 idt_tables[cpu][TRAP_double_fault].a |= IST_DF << 32;
846 idt_tables[cpu][TRAP_nmi].a |= IST_NMI << 32;
847 idt_tables[cpu][TRAP_machine_check].a |= IST_MCE << 32;
848 #endif
849 }
851 static void svm_ctxt_switch_to(struct vcpu *v)
852 {
853 int cpu = smp_processor_id();
855 #ifdef __x86_64__
856 /*
857 * This is required because VMRUN performs consistency checks, and
858 * some of the DOM0 selectors point to invalid GDT locations, which
859 * would cause AMD processors to shut down.
861 */
862 set_segment_register(ds, 0);
863 set_segment_register(es, 0);
864 set_segment_register(ss, 0);
866 /*
867 * Cannot use ISTs for NMI/#MC/#DF while we are running with the guest TR.
868 * But this doesn't matter: the IST is only req'd to handle SYSCALL/SYSRET.
869 */
870 idt_tables[cpu][TRAP_double_fault].a &= ~(7UL << 32);
871 idt_tables[cpu][TRAP_nmi].a &= ~(7UL << 32);
872 idt_tables[cpu][TRAP_machine_check].a &= ~(7UL << 32);
873 #endif
875 svm_restore_dr(v);
877 asm volatile (
878 ".byte 0x0f,0x01,0xdb" /* vmsave */
879 : : "a" (__pa(root_vmcb[cpu])) );
880 asm volatile (
881 ".byte 0x0f,0x01,0xda" /* vmload */
882 : : "a" (__pa(v->arch.hvm_svm.vmcb)) );
883 }
885 static void svm_do_resume(struct vcpu *v)
886 {
887 bool_t debug_state = v->domain->debugger_attached;
889 if ( unlikely(v->arch.hvm_vcpu.debug_state_latch != debug_state) )
890 {
891 uint32_t mask = (1U << TRAP_debug) | (1U << TRAP_int3);
892 v->arch.hvm_vcpu.debug_state_latch = debug_state;
893 if ( debug_state )
894 v->arch.hvm_svm.vmcb->exception_intercepts |= mask;
895 else
896 v->arch.hvm_svm.vmcb->exception_intercepts &= ~mask;
897 }
899 if ( v->arch.hvm_svm.launch_core != smp_processor_id() )
900 {
901 v->arch.hvm_svm.launch_core = smp_processor_id();
902 hvm_migrate_timers(v);
904 /* Migrating to another ASID domain. Request a new ASID. */
905 svm_asid_init_vcpu(v);
906 }
908 hvm_do_resume(v);
909 reset_stack_and_jump(svm_asm_do_resume);
910 }
912 static int svm_domain_initialise(struct domain *d)
913 {
914 return 0;
915 }
917 static void svm_domain_destroy(struct domain *d)
918 {
919 }
921 static int svm_vcpu_initialise(struct vcpu *v)
922 {
923 int rc;
925 v->arch.schedule_tail = svm_do_resume;
926 v->arch.ctxt_switch_from = svm_ctxt_switch_from;
927 v->arch.ctxt_switch_to = svm_ctxt_switch_to;
929 v->arch.hvm_svm.launch_core = -1;
931 if ( (rc = svm_create_vmcb(v)) != 0 )
932 {
933 dprintk(XENLOG_WARNING,
934 "Failed to create VMCB for vcpu %d: err=%d.\n",
935 v->vcpu_id, rc);
936 return rc;
937 }
939 return 0;
940 }
942 static void svm_vcpu_destroy(struct vcpu *v)
943 {
944 svm_destroy_vmcb(v);
945 }
947 static void svm_hvm_inject_exception(
948 unsigned int trapnr, int errcode, unsigned long cr2)
949 {
950 struct vcpu *v = current;
951 if ( trapnr == TRAP_page_fault )
952 v->arch.hvm_svm.vmcb->cr2 = v->arch.hvm_svm.cpu_cr2 = cr2;
953 svm_inject_exception(v, trapnr, (errcode != -1), errcode);
954 }
956 static int svm_event_pending(struct vcpu *v)
957 {
958 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
959 return vmcb->eventinj.fields.v;
960 }
962 static struct hvm_function_table svm_function_table = {
963 .name = "SVM",
964 .cpu_down = svm_cpu_down,
965 .domain_initialise = svm_domain_initialise,
966 .domain_destroy = svm_domain_destroy,
967 .vcpu_initialise = svm_vcpu_initialise,
968 .vcpu_destroy = svm_vcpu_destroy,
969 .store_cpu_guest_regs = svm_store_cpu_guest_regs,
970 .load_cpu_guest_regs = svm_load_cpu_guest_regs,
971 .save_cpu_ctxt = svm_save_vmcb_ctxt,
972 .load_cpu_ctxt = svm_load_vmcb_ctxt,
973 .paging_enabled = svm_paging_enabled,
974 .long_mode_enabled = svm_long_mode_enabled,
975 .pae_enabled = svm_pae_enabled,
976 .nx_enabled = svm_nx_enabled,
977 .interrupts_enabled = svm_interrupts_enabled,
978 .guest_x86_mode = svm_guest_x86_mode,
979 .get_guest_ctrl_reg = svm_get_ctrl_reg,
980 .get_segment_base = svm_get_segment_base,
981 .get_segment_register = svm_get_segment_register,
982 .update_host_cr3 = svm_update_host_cr3,
983 .update_guest_cr3 = svm_update_guest_cr3,
984 .flush_guest_tlbs = svm_flush_guest_tlbs,
985 .update_vtpr = svm_update_vtpr,
986 .stts = svm_stts,
987 .set_tsc_offset = svm_set_tsc_offset,
988 .inject_exception = svm_hvm_inject_exception,
989 .init_ap_context = svm_init_ap_context,
990 .init_hypercall_page = svm_init_hypercall_page,
991 .event_pending = svm_event_pending
992 };
994 static void svm_npt_detect(void)
995 {
996 u32 eax, ebx, ecx, edx;
998 /* Check CPUID for nested paging support. */
999 cpuid(0x8000000A, &eax, &ebx, &ecx, &edx);
1001 if ( !(edx & 1) && opt_hap_enabled )
1003 printk("SVM: Nested paging is not supported by this CPU.\n");
1004 opt_hap_enabled = 0;
1008 int start_svm(struct cpuinfo_x86 *c)
1010 u32 eax, ecx, edx;
1011 u32 phys_hsa_lo, phys_hsa_hi;
1012 u64 phys_hsa;
1013 int cpu = smp_processor_id();
1015 /* Xen does not fill x86_capability words except 0. */
1016 ecx = cpuid_ecx(0x80000001);
1017 boot_cpu_data.x86_capability[5] = ecx;
1019 if ( !(test_bit(X86_FEATURE_SVME, &boot_cpu_data.x86_capability)) )
1020 return 0;
1022 /* Check whether SVM feature is disabled in BIOS */
1023 rdmsr(MSR_K8_VM_CR, eax, edx);
1024 if ( eax & K8_VMCR_SVME_DISABLE )
1026 printk("AMD SVM Extension is disabled in BIOS.\n");
1027 return 0;
1030 if ( ((hsa[cpu] = alloc_host_save_area()) == NULL) ||
1031 ((root_vmcb[cpu] = alloc_vmcb()) == NULL) )
1032 return 0;
1034 write_efer(read_efer() | EFER_SVME);
1036 svm_npt_detect();
1038 /* Initialize the HSA for this core. */
1039 phys_hsa = (u64) virt_to_maddr(hsa[cpu]);
1040 phys_hsa_lo = (u32) phys_hsa;
1041 phys_hsa_hi = (u32) (phys_hsa >> 32);
1042 wrmsr(MSR_K8_VM_HSAVE_PA, phys_hsa_lo, phys_hsa_hi);
1044 /* Initialize core's ASID handling. */
1045 svm_asid_init(c);
1047 if ( cpu != 0 )
1048 return 1;
1050 setup_vmcb_dump();
1052 hvm_enable(&svm_function_table);
1054 if ( opt_hap_enabled )
1055 printk("SVM: Nested paging enabled.\n");
1057 return 1;
1060 static int svm_do_nested_pgfault(paddr_t gpa, struct cpu_user_regs *regs)
1062 if (mmio_space(gpa)) {
1063 handle_mmio(gpa);
1064 return 1;
1067 paging_mark_dirty(current->domain, get_mfn_from_gpfn(gpa >> PAGE_SHIFT));
1068 return p2m_set_flags(current->domain, gpa, __PAGE_HYPERVISOR|_PAGE_USER);
1071 static void svm_do_no_device_fault(struct vmcb_struct *vmcb)
1073 struct vcpu *v = current;
1075 setup_fpu(v);
1076 vmcb->exception_intercepts &= ~(1U << TRAP_no_device);
1078 if ( !(v->arch.hvm_svm.cpu_shadow_cr0 & X86_CR0_TS) )
1079 vmcb->cr0 &= ~X86_CR0_TS;
1082 /* Reserved bits ECX: [31:14], [12:4], [2:1]*/
1083 #define SVM_VCPU_CPUID_L1_ECX_RESERVED 0xffffdff6
1084 /* Reserved bits EDX: [31:29], [27], [22:20], [18], [10] */
1085 #define SVM_VCPU_CPUID_L1_EDX_RESERVED 0xe8740400
1087 static void svm_vmexit_do_cpuid(struct vmcb_struct *vmcb,
1088 struct cpu_user_regs *regs)
1090 unsigned long input = regs->eax;
1091 unsigned int eax, ebx, ecx, edx;
1092 struct vcpu *v = current;
1093 int inst_len;
1095 hvm_cpuid(input, &eax, &ebx, &ecx, &edx);
1097 if ( input == 0x00000001 )
1099 /* Clear out reserved bits. */
1100 ecx &= ~SVM_VCPU_CPUID_L1_ECX_RESERVED;
1101 edx &= ~SVM_VCPU_CPUID_L1_EDX_RESERVED;
1103 /* Guest should only see one logical processor.
1104 * See details on page 23 of AMD CPUID Specification.
1105 */
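/* X86_FEATURE_* values encode (word * 32) + bit, so "& 31" extracts the
 * bit position within the single 32-bit register being masked here. */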
1106 clear_bit(X86_FEATURE_HT & 31, &edx); /* clear the hyperthread bit */
1107 ebx &= 0xFF00FFFF; /* clear the logical processor count when HTT=0 */
1108 ebx |= 0x00010000; /* set to 1 just for precaution */
1110 else if ( input == 0x80000001 )
1112 if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
1113 clear_bit(X86_FEATURE_APIC & 31, &edx);
1115 #if CONFIG_PAGING_LEVELS >= 3
1116 if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
1117 #endif
1118 clear_bit(X86_FEATURE_PAE & 31, &edx);
1120 clear_bit(X86_FEATURE_PSE36 & 31, &edx);
1122 /* Clear the Cmp_Legacy bit
1123 * This bit is supposed to be zero when HTT = 0.
1124 * See details on page 23 of AMD CPUID Specification.
1125 */
1126 clear_bit(X86_FEATURE_CMP_LEGACY & 31, &ecx);
1128 /* Make SVM feature invisible to the guest. */
1129 clear_bit(X86_FEATURE_SVME & 31, &ecx);
1131 /* So far, we do not support 3DNow for the guest. */
1132 clear_bit(X86_FEATURE_3DNOW & 31, &edx);
1133 clear_bit(X86_FEATURE_3DNOWEXT & 31, &edx);
1134 /* no FFXSR instructions feature. */
1135 clear_bit(X86_FEATURE_FFXSR & 31, &edx);
1137 else if ( input == 0x80000007 || input == 0x8000000A )
1139 /* Mask out features of power management and SVM extension. */
1140 eax = ebx = ecx = edx = 0;
1142 else if ( input == 0x80000008 )
1144 /* Make sure the number of CPU cores is 1 when HTT=0 */
1145 ecx &= 0xFFFFFF00;
1148 regs->eax = (unsigned long)eax;
1149 regs->ebx = (unsigned long)ebx;
1150 regs->ecx = (unsigned long)ecx;
1151 regs->edx = (unsigned long)edx;
1153 HVMTRACE_3D(CPUID, v, input,
1154 ((uint64_t)eax << 32) | ebx, ((uint64_t)ecx << 32) | edx);
1156 inst_len = __get_instruction_length(v, INSTR_CPUID, NULL);
1157 ASSERT(inst_len > 0);
1158 __update_guest_eip(vmcb, inst_len);
1161 static unsigned long *get_reg_p(
1162 unsigned int gpreg,
1163 struct cpu_user_regs *regs, struct vmcb_struct *vmcb)
1165 unsigned long *reg_p = NULL;
1166 switch (gpreg)
1168 case SVM_REG_EAX:
1169 reg_p = (unsigned long *)&regs->eax;
1170 break;
1171 case SVM_REG_EBX:
1172 reg_p = (unsigned long *)&regs->ebx;
1173 break;
1174 case SVM_REG_ECX:
1175 reg_p = (unsigned long *)&regs->ecx;
1176 break;
1177 case SVM_REG_EDX:
1178 reg_p = (unsigned long *)&regs->edx;
1179 break;
1180 case SVM_REG_EDI:
1181 reg_p = (unsigned long *)&regs->edi;
1182 break;
1183 case SVM_REG_ESI:
1184 reg_p = (unsigned long *)&regs->esi;
1185 break;
1186 case SVM_REG_EBP:
1187 reg_p = (unsigned long *)&regs->ebp;
1188 break;
1189 case SVM_REG_ESP:
1190 reg_p = (unsigned long *)&vmcb->rsp;
1191 break;
1192 #ifdef __x86_64__
1193 case SVM_REG_R8:
1194 reg_p = (unsigned long *)&regs->r8;
1195 break;
1196 case SVM_REG_R9:
1197 reg_p = (unsigned long *)&regs->r9;
1198 break;
1199 case SVM_REG_R10:
1200 reg_p = (unsigned long *)&regs->r10;
1201 break;
1202 case SVM_REG_R11:
1203 reg_p = (unsigned long *)&regs->r11;
1204 break;
1205 case SVM_REG_R12:
1206 reg_p = (unsigned long *)&regs->r12;
1207 break;
1208 case SVM_REG_R13:
1209 reg_p = (unsigned long *)&regs->r13;
1210 break;
1211 case SVM_REG_R14:
1212 reg_p = (unsigned long *)&regs->r14;
1213 break;
1214 case SVM_REG_R15:
1215 reg_p = (unsigned long *)&regs->r15;
1216 break;
1217 #endif
1218 default:
1219 BUG();
1222 return reg_p;
1226 static unsigned long get_reg(
1227 unsigned int gpreg, struct cpu_user_regs *regs, struct vmcb_struct *vmcb)
1229 unsigned long *gp;
1230 gp = get_reg_p(gpreg, regs, vmcb);
1231 return *gp;
1235 static void set_reg(
1236 unsigned int gpreg, unsigned long value,
1237 struct cpu_user_regs *regs, struct vmcb_struct *vmcb)
1239 unsigned long *gp;
1240 gp = get_reg_p(gpreg, regs, vmcb);
1241 *gp = value;
1245 static void svm_dr_access(struct vcpu *v, struct cpu_user_regs *regs)
1247 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1249 HVMTRACE_0D(DR_WRITE, v);
1251 v->arch.hvm_vcpu.flag_dr_dirty = 1;
1253 __restore_debug_registers(v);
1255 /* allow the guest full access to the debug registers */
1256 vmcb->dr_intercepts = 0;
1260 static void svm_get_prefix_info(struct vcpu *v, unsigned int dir,
1261 svm_segment_register_t **seg,
1262 unsigned int *asize)
1264 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1265 unsigned char inst[MAX_INST_LEN];
1266 int i;
1268 memset(inst, 0, MAX_INST_LEN);
1269 if (inst_copy_from_guest(inst, svm_rip2pointer(v), sizeof(inst))
1270 != MAX_INST_LEN)
1272 gdprintk(XENLOG_ERR, "get guest instruction failed\n");
1273 domain_crash(current->domain);
1274 return;
1277 for (i = 0; i < MAX_INST_LEN; i++)
1279 switch (inst[i])
1281 case 0xf3: /* REPZ */
1282 case 0xf2: /* REPNZ */
1283 case 0xf0: /* LOCK */
1284 case 0x66: /* data32 */
1285 #ifdef __x86_64__
1286 /* REX prefixes */
1287 case 0x40:
1288 case 0x41:
1289 case 0x42:
1290 case 0x43:
1291 case 0x44:
1292 case 0x45:
1293 case 0x46:
1294 case 0x47:
1296 case 0x48:
1297 case 0x49:
1298 case 0x4a:
1299 case 0x4b:
1300 case 0x4c:
1301 case 0x4d:
1302 case 0x4e:
1303 case 0x4f:
1304 #endif
1305 continue;
1306 case 0x67: /* addr32 */
1307 *asize ^= 48; /* Switch 16/32 bits */
1308 continue;
1309 case 0x2e: /* CS */
1310 *seg = &vmcb->cs;
1311 continue;
1312 case 0x36: /* SS */
1313 *seg = &vmcb->ss;
1314 continue;
1315 case 0x26: /* ES */
1316 *seg = &vmcb->es;
1317 continue;
1318 case 0x64: /* FS */
1319 svm_sync_vmcb(v);
1320 *seg = &vmcb->fs;
1321 continue;
1322 case 0x65: /* GS */
1323 svm_sync_vmcb(v);
1324 *seg = &vmcb->gs;
1325 continue;
1326 case 0x3e: /* DS */
1327 *seg = &vmcb->ds;
1328 continue;
1329 default:
1330 break;
1332 return;
1337 /* Get the address of INS/OUTS instruction */
1338 static int svm_get_io_address(
1339 struct vcpu *v, struct cpu_user_regs *regs,
1340 unsigned int size, ioio_info_t info,
1341 unsigned long *count, unsigned long *addr)
1343 unsigned long reg;
1344 unsigned int asize, isize;
1345 int long_mode = 0;
1346 svm_segment_register_t *seg = NULL;
1347 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1349 /* If we're in long mode, don't check the segment presence & limit */
1350 long_mode = vmcb->cs.attr.fields.l && svm_long_mode_enabled(v);
1352 /* d field of cs.attr is 1 for 32-bit, 0 for 16 or 64 bit.
1353 * l field combined with EFER_LMA says whether it's 16 or 64 bit.
1354 */
1355 asize = (long_mode)?64:((vmcb->cs.attr.fields.db)?32:16);
1358 /* The ins/outs instructions are single byte, so if we have got more
1359 * than one byte (+ maybe rep-prefix), we have some prefix so we need
1360 * to figure out what it is...
1361 */
1362 isize = vmcb->exitinfo2 - vmcb->rip;
1364 if (info.fields.rep)
1365 isize --;
1367 if (isize > 1)
1368 svm_get_prefix_info(v, info.fields.type, &seg, &asize);
1370 if (info.fields.type == IOREQ_WRITE)
1372 reg = regs->esi;
1373 if (!seg) /* If no prefix, use DS. */
1374 seg = &vmcb->ds;
1375 if (!long_mode && (seg->attr.fields.type & 0xa) == 0x8) {
1376 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1377 return 0;
1380 else
1382 reg = regs->edi;
1383 seg = &vmcb->es; /* Note: This is ALWAYS ES. */
1384 if (!long_mode && (seg->attr.fields.type & 0xa) != 0x2) {
1385 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1386 return 0;
1390 /* If the segment isn't present, give GP fault! */
1391 if (!long_mode && !seg->attr.fields.p)
1393 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1394 return 0;
1397 if (asize == 16)
1399 *addr = (reg & 0xFFFF);
1400 *count = regs->ecx & 0xffff;
1402 else
1404 *addr = reg;
1405 *count = regs->ecx;
1407 if (!info.fields.rep)
1408 *count = 1;
1410 if (!long_mode)
1412 ASSERT(*addr == (u32)*addr);
1413 if ((u32)(*addr + size - 1) < (u32)*addr ||
1414 (seg->attr.fields.type & 0xc) != 0x4 ?
1415 *addr + size - 1 > seg->limit :
1416 *addr <= seg->limit)
1418 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1419 return 0;
1422 /* Check the limit for repeated instructions, as above we checked only
1423 the first instance. Truncate the count if a limit violation would
1424 occur. Note that the checking is not necessary for page granular
1425 segments as transfers crossing page boundaries will be broken up
1426 anyway. */
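/* (type & 0xc) == 0x4 identifies an expand-down data segment (bit 3
 * clear = data, bit 2 = expand-down); code and expand-up data segments
 * take the first branch below. */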
1427 if (!seg->attr.fields.g && *count > 1)
1429 if ((seg->attr.fields.type & 0xc) != 0x4)
1431 /* expand-up */
1432 if (!(regs->eflags & EF_DF))
1434 if (*addr + *count * size - 1 < *addr ||
1435 *addr + *count * size - 1 > seg->limit)
1436 *count = (seg->limit + 1UL - *addr) / size;
1438 else
1440 if (*count - 1 > *addr / size)
1441 *count = *addr / size + 1;
1444 else
1446 /* expand-down */
1447 if (!(regs->eflags & EF_DF))
1449 if (*count - 1 > -(s32)*addr / size)
1450 *count = -(s32)*addr / size + 1UL;
1452 else
1454 if (*addr < (*count - 1) * size ||
1455 *addr - (*count - 1) * size <= seg->limit)
1456 *count = (*addr - seg->limit - 1) / size + 1;
1459 ASSERT(*count);
1462 *addr += seg->base;
1464 #ifdef __x86_64__
1465 else
1467 if (seg == &vmcb->fs || seg == &vmcb->gs)
1468 *addr += seg->base;
1470 if (!is_canonical_address(*addr) ||
1471 !is_canonical_address(*addr + size - 1))
1473 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1474 return 0;
1476 if (*count > (1UL << 48) / size)
1477 *count = (1UL << 48) / size;
1478 if (!(regs->eflags & EF_DF))
1480 if (*addr + *count * size - 1 < *addr ||
1481 !is_canonical_address(*addr + *count * size - 1))
1482 *count = (*addr & ~((1UL << 48) - 1)) / size;
1484 else
1486 if ((*count - 1) * size > *addr ||
1487 !is_canonical_address(*addr + (*count - 1) * size))
1488 *count = (*addr & ~((1UL << 48) - 1)) / size + 1;
1490 ASSERT(*count);
1492 #endif
1494 return 1;
1498 static void svm_io_instruction(struct vcpu *v)
1500 struct cpu_user_regs *regs;
1501 struct hvm_io_op *pio_opp;
1502 unsigned int port;
1503 unsigned int size, dir, df;
1504 ioio_info_t info;
1505 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1507 pio_opp = &current->arch.hvm_vcpu.io_op;
1508 pio_opp->instr = INSTR_PIO;
1509 pio_opp->flags = 0;
1511 regs = &pio_opp->io_context;
1513 /* Copy current guest state into io instruction state structure. */
1514 memcpy(regs, guest_cpu_user_regs(), HVM_CONTEXT_STACK_BYTES);
1515 svm_store_cpu_guest_regs(v, regs, NULL);
1517 info.bytes = vmcb->exitinfo1;
1519 port = info.fields.port; /* port used to be addr */
1520 dir = info.fields.type; /* direction */
1521 df = regs->eflags & X86_EFLAGS_DF ? 1 : 0;
1523 if (info.fields.sz32)
1524 size = 4;
1525 else if (info.fields.sz16)
1526 size = 2;
1527 else
1528 size = 1;
1530 if (dir==IOREQ_READ)
1531 HVMTRACE_2D(IO_READ, v, port, size);
1532 else
1533 HVMTRACE_2D(IO_WRITE, v, port, size);
1535 HVM_DBG_LOG(DBG_LEVEL_IO,
1536 "svm_io_instruction: port 0x%x eip=%x:%"PRIx64", "
1537 "exit_qualification = %"PRIx64,
1538 port, vmcb->cs.sel, vmcb->rip, info.bytes);
1540 /* string instruction */
1541 if (info.fields.str)
1543 unsigned long addr, count;
1544 paddr_t paddr;
1545 unsigned long gfn;
1546 int sign = regs->eflags & X86_EFLAGS_DF ? -1 : 1;
1548 if (!svm_get_io_address(v, regs, size, info, &count, &addr))
1550 /* We failed to get a valid address, so don't do the IO operation -
1551 * it would just get worse if we do! Hopefully the guest is handling
1552 * gp-faults...
1553 */
1554 return;
1557 /* "rep" prefix */
1558 if (info.fields.rep)
1560 pio_opp->flags |= REPZ;
1563 /* Translate the address to a physical address */
1564 gfn = paging_gva_to_gfn(v, addr);
1565 if ( gfn == INVALID_GFN )
1567 /* The guest does not have the RAM address mapped.
1568 * Need to send in a page fault */
1569 int errcode = 0;
1570 /* IO read --> memory write */
1571 if ( dir == IOREQ_READ ) errcode |= PFEC_write_access;
1572 svm_hvm_inject_exception(TRAP_page_fault, errcode, addr);
1573 return;
1575 paddr = (paddr_t)gfn << PAGE_SHIFT | (addr & ~PAGE_MASK);
1577 /*
1578 * Handle string pio instructions that cross pages or that
1579 * are unaligned. See the comments in hvm_platform.c/handle_mmio()
1580 */
1581 if ((addr & PAGE_MASK) != ((addr + size - 1) & PAGE_MASK))
1583 unsigned long value = 0;
1585 pio_opp->flags |= OVERLAP;
1586 pio_opp->addr = addr;
1588 if (dir == IOREQ_WRITE) /* OUTS */
1590 if ( hvm_paging_enabled(current) )
1592 int rv = hvm_copy_from_guest_virt(&value, addr, size);
1593 if ( rv != 0 )
1595 /* Failed on the page-spanning copy. Inject PF into
1596 * the guest for the address where we failed. */
1597 addr += size - rv;
1598 gdprintk(XENLOG_DEBUG, "Pagefault reading non-io side "
1599 "of a page-spanning PIO: va=%#lx\n", addr);
1600 svm_hvm_inject_exception(TRAP_page_fault, 0, addr);
1601 return;
1604 else
1605 (void) hvm_copy_from_guest_phys(&value, addr, size);
1606 } else /* dir != IOREQ_WRITE */
1607 /* Remember where to write the result, as a *VA*.
1608 * Must be a VA so we can handle the page overlap
1609 * correctly in hvm_pio_assist() */
1610 pio_opp->addr = addr;
1612 if (count == 1)
1613 regs->eip = vmcb->exitinfo2;
1615 send_pio_req(port, 1, size, value, dir, df, 0);
1617 else
1619 unsigned long last_addr = sign > 0 ? addr + count * size - 1
1620 : addr - (count - 1) * size;
1622 if ((addr & PAGE_MASK) != (last_addr & PAGE_MASK))
1624 if (sign > 0)
1625 count = (PAGE_SIZE - (addr & ~PAGE_MASK)) / size;
1626 else
1627 count = (addr & ~PAGE_MASK) / size + 1;
1629 else
1630 regs->eip = vmcb->exitinfo2;
1632 send_pio_req(port, count, size, paddr, dir, df, 1);
1635 else
1637 /*
1638 * On SVM, the RIP of the instruction following the IN/OUT is saved in
1639 * ExitInfo2
1640 */
1641 regs->eip = vmcb->exitinfo2;
1643 if (port == 0xe9 && dir == IOREQ_WRITE && size == 1)
1644 hvm_print_line(v, regs->eax); /* guest debug output */
1646 send_pio_req(port, 1, size, regs->eax, dir, df, 0);
1650 static int svm_set_cr0(unsigned long value)
1652 struct vcpu *v = current;
1653 unsigned long mfn, old_value = v->arch.hvm_svm.cpu_shadow_cr0;
1654 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1655 unsigned long old_base_mfn;
1657 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx", value);
1659 if ( (u32)value != value )
1661 HVM_DBG_LOG(DBG_LEVEL_1,
1662 "Guest attempts to set upper 32 bits in CR0: %lx",
1663 value);
1664 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1665 return 0;
1668 value &= ~HVM_CR0_GUEST_RESERVED_BITS;
1670 /* ET is reserved and should always be 1. */
1671 value |= X86_CR0_ET;
1673 if ( (value & (X86_CR0_PE|X86_CR0_PG)) == X86_CR0_PG )
1675 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1676 return 0;
1679 /* TS cleared? Then initialise FPU now. */
1680 if ( !(value & X86_CR0_TS) )
1682 setup_fpu(v);
1683 vmcb->exception_intercepts &= ~(1U << TRAP_no_device);
1686 if ( (value & X86_CR0_PG) && !(old_value & X86_CR0_PG) )
1688 if ( svm_lme_is_set(v) )
1690 if ( !svm_cr4_pae_is_set(v) )
1692 HVM_DBG_LOG(DBG_LEVEL_1, "Enable paging before PAE enable");
1693 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1694 return 0;
1696 HVM_DBG_LOG(DBG_LEVEL_1, "Enable the Long mode");
1697 v->arch.hvm_svm.cpu_shadow_efer |= EFER_LMA;
1698 vmcb->efer |= EFER_LMA | EFER_LME;
1701 if ( !paging_mode_hap(v->domain) )
1703 /* The guest CR3 must be pointing to the guest physical. */
1704 mfn = get_mfn_from_gpfn(v->arch.hvm_svm.cpu_cr3 >> PAGE_SHIFT);
1705 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain))
1707 gdprintk(XENLOG_ERR, "Invalid CR3 value = %lx (mfn=%lx)\n",
1708 v->arch.hvm_svm.cpu_cr3, mfn);
1709 domain_crash(v->domain);
1710 return 0;
1713 /* Now arch.guest_table points to machine physical. */
1714 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1715 v->arch.guest_table = pagetable_from_pfn(mfn);
1716 if ( old_base_mfn )
1717 put_page(mfn_to_page(old_base_mfn));
1719 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
1720 v->arch.hvm_svm.cpu_cr3, mfn);
1723 else if ( !(value & X86_CR0_PG) && (old_value & X86_CR0_PG) )
1725 /* When CR0.PG is cleared, LMA is cleared immediately. */
1726 if ( svm_long_mode_enabled(v) )
1728 vmcb->efer &= ~(EFER_LME | EFER_LMA);
1729 v->arch.hvm_svm.cpu_shadow_efer &= ~EFER_LMA;
1732 if ( !paging_mode_hap(v->domain) && v->arch.hvm_svm.cpu_cr3 )
1734 put_page(mfn_to_page(get_mfn_from_gpfn(
1735 v->arch.hvm_svm.cpu_cr3 >> PAGE_SHIFT)));
1736 v->arch.guest_table = pagetable_null();
1740 vmcb->cr0 = v->arch.hvm_svm.cpu_shadow_cr0 = value;
1741 if ( !paging_mode_hap(v->domain) )
1742 vmcb->cr0 |= X86_CR0_PG | X86_CR0_WP;
1744 if ( (value ^ old_value) & X86_CR0_PG )
1746 paging_update_paging_modes(v);
1747 svm_asid_g_update_paging(v);
1750 return 1;
1753 /*
1754 * Read from control registers. CR0 and CR4 are read from the shadow.
1755 */
1756 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
1758 unsigned long value = 0;
1759 struct vcpu *v = current;
1760 struct vlapic *vlapic = vcpu_vlapic(v);
1761 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1763 switch ( cr )
1765 case 0:
1766 value = v->arch.hvm_svm.cpu_shadow_cr0;
1767 break;
1768 case 2:
1769 value = vmcb->cr2;
1770 break;
1771 case 3:
1772 value = (unsigned long)v->arch.hvm_svm.cpu_cr3;
1773 break;
1774 case 4:
1775 value = (unsigned long)v->arch.hvm_svm.cpu_shadow_cr4;
1776 break;
1777 case 8:
1778 value = (unsigned long)vlapic_get_reg(vlapic, APIC_TASKPRI);
1779 value = (value & 0xF0) >> 4;
1780 break;
1782 default:
1783 domain_crash(v->domain);
1784 return;
1787 HVMTRACE_2D(CR_READ, v, cr, value);
1789 set_reg(gp, value, regs, vmcb);
1791 HVM_DBG_LOG(DBG_LEVEL_VMMU, "mov_from_cr: CR%d, value = %lx", cr, value);
1795 /*
1796 * Write to control registers
1797 */
1798 static int mov_to_cr(int gpreg, int cr, struct cpu_user_regs *regs)
1800 unsigned long value, old_cr, old_base_mfn, mfn;
1801 struct vcpu *v = current;
1802 struct vlapic *vlapic = vcpu_vlapic(v);
1803 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1805 value = get_reg(gpreg, regs, vmcb);
1807 HVMTRACE_2D(CR_WRITE, v, cr, value);
1809 HVM_DBG_LOG(DBG_LEVEL_1, "mov_to_cr: CR%d, value = %lx, current = %p",
1810 cr, value, v);
1812 switch ( cr )
1814 case 0:
1815 return svm_set_cr0(value);
1817 case 3:
1818 if ( paging_mode_hap(v->domain) )
1820 vmcb->cr3 = v->arch.hvm_svm.cpu_cr3 = value;
1821 break;
1824 /* If paging is not enabled yet, simply copy the value to CR3. */
1825 if ( !svm_paging_enabled(v) )
1827 v->arch.hvm_svm.cpu_cr3 = value;
1828 break;
1831 /* We make a new one if the shadow does not exist. */
1832 if ( value == v->arch.hvm_svm.cpu_cr3 )
1834 /*
1835 * This is a simple TLB flush, implying the guest has
1836 * removed some translation or changed page attributes.
1837 * We simply invalidate the shadow.
1838 */
1839 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
1840 if ( mfn != pagetable_get_pfn(v->arch.guest_table) )
1841 goto bad_cr3;
1842 paging_update_cr3(v);
1843 /* signal paging update to ASID handler */
1844 svm_asid_g_mov_to_cr3 (v);
1846 else
1848 /*
1849 * If different, make a shadow. Check if the PDBR is valid
1850 * first.
1851 */
1852 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
1853 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
1854 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
1855 goto bad_cr3;
1857 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1858 v->arch.guest_table = pagetable_from_pfn(mfn);
1860 if ( old_base_mfn )
1861 put_page(mfn_to_page(old_base_mfn));
1863 v->arch.hvm_svm.cpu_cr3 = value;
1864 update_cr3(v);
1865 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
1866 /* signal paging update to ASID handler */
1867 svm_asid_g_mov_to_cr3 (v);
1869 break;
1871 case 4: /* CR4 */
1872 if ( value & HVM_CR4_GUEST_RESERVED_BITS )
1874 HVM_DBG_LOG(DBG_LEVEL_1,
1875 "Guest attempts to set reserved bit in CR4: %lx",
1876 value);
1877 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1878 break;
1881 if ( paging_mode_hap(v->domain) )
1883 v->arch.hvm_svm.cpu_shadow_cr4 = value;
1884 vmcb->cr4 = value | (HVM_CR4_HOST_MASK & ~X86_CR4_PAE);
1885 paging_update_paging_modes(v);
1886 /* signal paging update to ASID handler */
1887 svm_asid_g_update_paging (v);
1888 break;
1891 old_cr = v->arch.hvm_svm.cpu_shadow_cr4;
1892 if ( value & X86_CR4_PAE && !(old_cr & X86_CR4_PAE) )
1894 if ( svm_pgbit_test(v) )
1896 #if CONFIG_PAGING_LEVELS >= 3
1897 /* The guest is a 32-bit PAE guest. */
1898 unsigned long mfn, old_base_mfn;
1899 mfn = get_mfn_from_gpfn(v->arch.hvm_svm.cpu_cr3 >> PAGE_SHIFT);
1900 if ( !mfn_valid(mfn) ||
1901 !get_page(mfn_to_page(mfn), v->domain) )
1902 goto bad_cr3;
1904 /*
1905 * Now arch.guest_table points to machine physical.
1906 */
1907 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1908 v->arch.guest_table = pagetable_from_pfn(mfn);
1909 if ( old_base_mfn )
1910 put_page(mfn_to_page(old_base_mfn));
1911 paging_update_paging_modes(v);
1912 /* signal paging update to ASID handler */
1913 svm_asid_g_update_paging (v);
1915 HVM_DBG_LOG(DBG_LEVEL_VMMU,
1916 "Update CR3 value = %lx, mfn = %lx",
1917 v->arch.hvm_svm.cpu_cr3, mfn);
1918 #endif
1921 else if ( !(value & X86_CR4_PAE) )
1923 if ( svm_long_mode_enabled(v) )
1925 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1929 v->arch.hvm_svm.cpu_shadow_cr4 = value;
1930 vmcb->cr4 = value | HVM_CR4_HOST_MASK;
1932 /*
1933 * Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates
1934 * all TLB entries except global entries.
1935 */
1936 if ((old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE))
1938 paging_update_paging_modes(v);
1939 /* signal paging update to ASID handler */
1940 svm_asid_g_update_paging (v);
1942 break;
1944 case 8:
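/* CR8 bits 3:0 map to bits 7:4 of the local APIC TASKPRI register (the
 * task-priority class), and the same value is mirrored into the VMCB
 * V_TPR field. */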
1945 vlapic_set_reg(vlapic, APIC_TASKPRI, ((value & 0x0F) << 4));
1946 vmcb->vintr.fields.tpr = value & 0x0F;
1947 break;
1949 default:
1950 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
1951 domain_crash(v->domain);
1952 return 0;
1955 return 1;
1957 bad_cr3:
1958 gdprintk(XENLOG_ERR, "Invalid CR3\n");
1959 domain_crash(v->domain);
1960 return 0;
1964 #define ARR_SIZE(x) (sizeof(x) / sizeof(x[0]))
1967 static int svm_cr_access(struct vcpu *v, unsigned int cr, unsigned int type,
1968 struct cpu_user_regs *regs)
1970 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1971 int inst_len = 0;
1972 int index, addr_size, i;
1973 unsigned int gpreg, offset;
1974 unsigned long value, addr;
1975 u8 buffer[MAX_INST_LEN];
1976 u8 prefix = 0;
1977 u8 modrm;
1978 enum x86_segment seg;
1979 int result = 1;
1980 enum instruction_index list_a[] = {INSTR_MOV2CR, INSTR_CLTS, INSTR_LMSW};
1981 enum instruction_index list_b[] = {INSTR_MOVCR2, INSTR_SMSW};
1982 enum instruction_index match;
1984 inst_copy_from_guest(buffer, svm_rip2pointer(v), sizeof(buffer));
1986 /* get index to first actual instruction byte - as we will need to know
1987 where the prefix lives later on */
1988 index = skip_prefix_bytes(buffer, sizeof(buffer));
1990 if ( type == TYPE_MOV_TO_CR )
1992 inst_len = __get_instruction_length_from_list(
1993 v, list_a, ARR_SIZE(list_a), &buffer[index], &match);
1995 else /* type == TYPE_MOV_FROM_CR */
1997 inst_len = __get_instruction_length_from_list(
1998 v, list_b, ARR_SIZE(list_b), &buffer[index], &match);
2001 ASSERT(inst_len > 0);
2003 inst_len += index;
2005 /* Check for REX prefix - it's ALWAYS the last byte of any prefix bytes */
2006 if (index > 0 && (buffer[index-1] & 0xF0) == 0x40)
2007 prefix = buffer[index-1];
2009 HVM_DBG_LOG(DBG_LEVEL_1, "eip = %lx", (unsigned long) vmcb->rip);
2011 switch (match)
2013 case INSTR_MOV2CR:
2014 gpreg = decode_src_reg(prefix, buffer[index+2]);
2015 result = mov_to_cr(gpreg, cr, regs);
2016 break;
2018 case INSTR_MOVCR2:
2019 gpreg = decode_src_reg(prefix, buffer[index+2]);
2020 mov_from_cr(cr, gpreg, regs);
2021 break;
2023 case INSTR_CLTS:
2024 /* TS being cleared means that it's time to restore fpu state. */
2025 setup_fpu(current);
2026 vmcb->exception_intercepts &= ~(1U << TRAP_no_device);
2027 vmcb->cr0 &= ~X86_CR0_TS; /* clear TS */
2028 v->arch.hvm_svm.cpu_shadow_cr0 &= ~X86_CR0_TS; /* clear TS */
2029 break;
2031 case INSTR_LMSW:
2032 gpreg = decode_src_reg(prefix, buffer[index+2]);
2033 value = get_reg(gpreg, regs, vmcb) & 0xF;
2034 value = (v->arch.hvm_svm.cpu_shadow_cr0 & ~0xF) | value;
2035 result = svm_set_cr0(value);
2036 break;
2038 case INSTR_SMSW:
2039 value = v->arch.hvm_svm.cpu_shadow_cr0 & 0xFFFF;
2040 modrm = buffer[index+2];
2041 addr_size = svm_guest_x86_mode(v);
2042 if ( addr_size < 2 )
2043 addr_size = 2;
2044 if ( likely((modrm & 0xC0) >> 6 == 3) )
2046 gpreg = decode_src_reg(prefix, modrm);
2047 set_reg(gpreg, value, regs, vmcb);
2049 /*
2050 * For now, only implement decode of the offset mode, since that's the
2051 * only mode observed in a real-world OS. This code is also making the
2052 * assumption that we'll never hit this code in long mode.
2053 */
2054 else if ( (modrm == 0x26) || (modrm == 0x25) )
2056 seg = x86_seg_ds;
2057 i = index;
2058 /* Segment or address size overrides? */
2059 while ( i-- )
2061 switch ( buffer[i] )
2063 case 0x26: seg = x86_seg_es; break;
2064 case 0x2e: seg = x86_seg_cs; break;
2065 case 0x36: seg = x86_seg_ss; break;
2066 case 0x64: seg = x86_seg_fs; break;
2067 case 0x65: seg = x86_seg_gs; break;
2068 case 0x67: addr_size ^= 6; break;
2071 /* Bail unless this really is a seg_base + offset case */
2072 if ( ((modrm == 0x26) && (addr_size == 4)) ||
2073 ((modrm == 0x25) && (addr_size == 2)) )
2075 gdprintk(XENLOG_ERR, "SMSW emulation at guest address: "
2076 "%lx failed due to unhandled addressing mode."
2077 "ModRM byte was: %x \n", svm_rip2pointer(v), modrm);
2078 domain_crash(v->domain);
2080 inst_len += addr_size;
2081 offset = *((unsigned int *)(void *)&buffer[index + 3]);
2082 offset = ( addr_size == 4 ) ? offset : ( offset & 0xFFFF );
2083 addr = hvm_get_segment_base(v, seg);
2084 addr += offset;
2085 hvm_copy_to_guest_virt(addr,&value,2);
2087 else
2089 gdprintk(XENLOG_ERR, "SMSW emulation at guest address: %lx "
2090 "failed due to unhandled addressing mode!"
2091 "ModRM byte was: %x \n", svm_rip2pointer(v), modrm);
2092 domain_crash(v->domain);
2094 break;
2096 default:
2097 BUG();
2100 ASSERT(inst_len);
2102 __update_guest_eip(vmcb, inst_len);
2104 return result;
2107 static void svm_do_msr_access(
2108 struct vcpu *v, struct cpu_user_regs *regs)
2110 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
2111 int inst_len;
2112 u64 msr_content=0;
2113 u32 ecx = regs->ecx, eax, edx;
2115 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x, eax=%x, edx=%x, exitinfo = %lx",
2116 ecx, (u32)regs->eax, (u32)regs->edx,
2117 (unsigned long)vmcb->exitinfo1);
2119 /* exitinfo1 == 0 indicates a read (RDMSR); otherwise this is a WRMSR. */
2120 if (vmcb->exitinfo1 == 0)
2122 switch (ecx) {
2123 case MSR_IA32_TIME_STAMP_COUNTER:
2124 msr_content = hvm_get_guest_time(v);
2125 break;
2127 case MSR_IA32_APICBASE:
2128 msr_content = vcpu_vlapic(v)->hw.apic_base_msr;
2129 break;
2131 case MSR_EFER:
2132 msr_content = v->arch.hvm_svm.cpu_shadow_efer;
2133 break;
2135 case MSR_K8_MC4_MISC: /* Threshold register */
2136 /*
2137 * MCA/MCE: We report that the threshold register is unavailable
2138 * for OS use (locked by the BIOS).
2139 */
2140 msr_content = 1ULL << 61; /* MC4_MISC.Locked */
2141 break;
2143 case MSR_IA32_EBC_FREQUENCY_ID:
2144 /*
2145 * This Intel-only register may be accessed if this HVM guest
2146 * has been migrated from an Intel host. The value zero is not
2147 * particularly meaningful, but at least avoids the guest crashing!
2148 */
2149 msr_content = 0;
2150 break;
2152 case MSR_K8_VM_HSAVE_PA:
2153 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
2154 break;
2156 case MSR_IA32_MCG_STATUS:
2157 case MSR_IA32_MC0_STATUS:
2158 case MSR_K8_MC1_STATUS:
2159 case MSR_K8_MC2_STATUS:
2160 case MSR_K8_MC3_STATUS:
2161 case MSR_K8_MC4_STATUS:
2162 case MSR_K8_MC5_STATUS:
2163 /* No point in letting the guest see real MCEs */
2164 msr_content = 0;
2165 break;
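/*
 * Anything else: give the Xen-reserved hypervisor MSR range a chance
 * first, then fall back to a real rdmsr_safe(); if both fail, inject #GP
 * just as hardware would for an unimplemented MSR.
 */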
2167 default:
2168 if ( rdmsr_hypervisor_regs(ecx, &eax, &edx) ||
2169 rdmsr_safe(ecx, eax, edx) == 0 )
2171 regs->eax = eax;
2172 regs->edx = edx;
2173 goto done;
2175 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
2176 return;
2178 regs->eax = msr_content & 0xFFFFFFFF;
2179 regs->edx = msr_content >> 32;
2181 done:
2182 HVMTRACE_2D(MSR_READ, v, ecx, msr_content);
2183 HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx",
2184 ecx, (unsigned long)regs->eax, (unsigned long)regs->edx);
2186 inst_len = __get_instruction_length(v, INSTR_RDMSR, NULL);
2188 else
2190 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
2192 HVMTRACE_2D(MSR_WRITE, v, ecx, msr_content);
2194 switch (ecx)
2196 case MSR_IA32_TIME_STAMP_COUNTER:
2197 hvm_set_guest_time(v, msr_content);
2198 pt_reset(v);
2199 break;
2201 case MSR_IA32_APICBASE:
2202 vlapic_msr_set(vcpu_vlapic(v), msr_content);
2203 break;
2205 case MSR_K8_VM_HSAVE_PA:
2206 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
2207 break;
2209 default:
2210 switch ( long_mode_do_msr_write(regs) )
2212 case HNDL_unhandled:
2213 wrmsr_hypervisor_regs(ecx, regs->eax, regs->edx);
2214 break;
2215 case HNDL_exception_raised:
2216 return;
2217 case HNDL_done:
2218 break;
2220 break;
2223 inst_len = __get_instruction_length(v, INSTR_WRMSR, NULL);
2226 __update_guest_eip(vmcb, inst_len);
2229 static void svm_vmexit_do_hlt(struct vmcb_struct *vmcb)
2231 enum hvm_intack type = hvm_vcpu_has_pending_irq(current);
2233 __update_guest_eip(vmcb, 1);
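/*
 * HLT is a single-byte opcode, so rIP is advanced by one unconditionally;
 * the instruction is treated as complete before deciding whether to block.
 */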
2235 /* Do not block if an event injection is still pending, or if an interrupt is pending and deliverable. */
2236 if ( vmcb->eventinj.fields.v ||
2237 ((type != hvm_intack_none) && svm_interrupts_enabled(current, type)) )
2239 HVMTRACE_1D(HLT, current, /*int pending=*/ 1);
2240 return;
2243 HVMTRACE_1D(HLT, current, /*int pending=*/ 0);
2244 hvm_hlt(vmcb->rflags);
2247 static void svm_vmexit_do_invd(struct vcpu *v)
2249 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
2250 int inst_len;
2252 /* Invalidate the cache: we cannot really do that safely from here. A
2253 * WBINVD would be an option, but it is fine to ignore the instruction,
2254 * since cache snooping keeps things coherent anyway. -- Mats P.
2255 */
2257 /* Log that the instruction was ignored, in case someone runs an unusual
2258 * operating system that relies on it and wants to know why it is not working.
2259 */
2260 gdprintk(XENLOG_WARNING, "INVD instruction intercepted - ignored\n");
2262 inst_len = __get_instruction_length(v, INSTR_INVD, NULL);
2263 __update_guest_eip(vmcb, inst_len);
2266 void svm_handle_invlpg(const short invlpga, struct cpu_user_regs *regs)
2268 struct vcpu *v = current;
2269 u8 opcode[MAX_INST_LEN], prefix, length = MAX_INST_LEN;
2270 unsigned long g_vaddr;
2271 int inst_len;
2272 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
2274 /*
2275 * It is not known in advance how many bytes the invlpg instruction
2276 * occupies, so copy the maximum instruction length here.
2277 */
2278 if (inst_copy_from_guest(opcode, svm_rip2pointer(v), length) < length)
2280 gdprintk(XENLOG_ERR, "Error reading %d bytes of guest memory\n", length);
2281 domain_crash(v->domain);
2282 return;
2285 if (invlpga)
2287 inst_len = __get_instruction_length(v, INSTR_INVLPGA, opcode);
2288 ASSERT(inst_len > 0);
2289 __update_guest_eip(vmcb, inst_len);
2291 /*
2292 * The address is implicit in this instruction (rAX). At the moment we do
2293 * not use ecx (the ASID) to identify individual guest pages.
2294 */
2295 g_vaddr = regs->eax;
2297 else
2299 /* What about multiple prefix codes? */
2300 prefix = (is_prefix(opcode[0]) ? opcode[0] : 0);
2301 inst_len = __get_instruction_length(v, INSTR_INVLPG, opcode);
2302 ASSERT(inst_len > 0);
2304 inst_len--;
2305 length -= inst_len;
2307 /*
2308 * Decode memory operand of the instruction including ModRM, SIB, and
2309 * displacement to get the effective address and length in bytes. Assume
2310 * the system is in either 32- or 64-bit mode.
2311 */
2312 g_vaddr = get_effective_addr_modrm64(regs, prefix, inst_len,
2313 &opcode[inst_len], &length);
2315 inst_len += length;
2316 __update_guest_eip(vmcb, inst_len);
2319 HVMTRACE_3D(INVLPG, v, (invlpga?1:0), g_vaddr, (invlpga?regs->ecx:0));
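/*
 * Editor's note: two invalidations follow. paging_invlpg() asks the
 * paging code (shadow or HAP) to drop any mapping Xen holds for the
 * address, and svm_asid_g_invlpg() informs the ASID logic so the hardware
 * TLB entry tagged with the guest's ASID is handled as well.
 */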
2321 paging_invlpg(v, g_vaddr);
2322 /* Signal the invlpg to the ASID handler. */
2323 svm_asid_g_invlpg(v, g_vaddr);
2327 /*
2328 * Reset to real mode causes execution to start at 0xF000:0xFFF0 in
2329 * 16-bit real mode, mimicking a processor reset.
2331 * Returns 0 on success, non-zero otherwise.
2332 */
2333 static int svm_reset_to_realmode(struct vcpu *v,
2334 struct cpu_user_regs *regs)
2336 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
2338 /* Clear the user regs; the VMCB state is re-initialised field by field below. */
2339 memset(regs, 0, sizeof(struct cpu_user_regs));
2341 /* VMCB State */
2342 vmcb->cr0 = X86_CR0_ET | X86_CR0_PG | X86_CR0_WP;
2343 v->arch.hvm_svm.cpu_shadow_cr0 = X86_CR0_ET;
2345 vmcb->cr2 = 0;
2346 vmcb->efer = EFER_SVME;
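/*
 * Editor's note: EFER.SVME must remain set in the VMCB EFER for the guest
 * state to be valid under VMRUN; the EFER value the guest actually sees is
 * tracked separately in cpu_shadow_efer.
 */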
2348 vmcb->cr4 = HVM_CR4_HOST_MASK;
2349 v->arch.hvm_svm.cpu_shadow_cr4 = 0;
2351 if ( paging_mode_hap(v->domain) ) {
2352 vmcb->cr0 = v->arch.hvm_svm.cpu_shadow_cr0;
2353 vmcb->cr4 = v->arch.hvm_svm.cpu_shadow_cr4 |
2354 (HVM_CR4_HOST_MASK & ~X86_CR4_PAE);
2357 /* This will jump to ROMBIOS */
2358 vmcb->rip = 0xFFF0;
2360 /* Set up the segment registers and their hidden (cached) state. */
2361 vmcb->cs.sel = 0xF000;
2362 vmcb->cs.attr.bytes = 0x089b;
2363 vmcb->cs.limit = 0xffff;
2364 vmcb->cs.base = 0x000F0000;
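/*
 * Editor's note: CS.base 0xF0000 plus rIP 0xFFF0 makes the first fetch
 * come from physical address 0xFFFF0, the reset-vector alias at the top
 * of the ROMBIOS image below 1MB.
 */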
2366 vmcb->ss.sel = 0x00;
2367 vmcb->ss.attr.bytes = 0x0893;
2368 vmcb->ss.limit = 0xffff;
2369 vmcb->ss.base = 0x00;
2371 vmcb->ds.sel = 0x00;
2372 vmcb->ds.attr.bytes = 0x0893;
2373 vmcb->ds.limit = 0xffff;
2374 vmcb->ds.base = 0x00;
2376 vmcb->es.sel = 0x00;
2377 vmcb->es.attr.bytes = 0x0893;
2378 vmcb->es.limit = 0xffff;
2379 vmcb->es.base = 0x00;
2381 vmcb->fs.sel = 0x00;
2382 vmcb->fs.attr.bytes = 0x0893;
2383 vmcb->fs.limit = 0xffff;
2384 vmcb->fs.base = 0x00;
2386 vmcb->gs.sel = 0x00;
2387 vmcb->gs.attr.bytes = 0x0893;
2388 vmcb->gs.limit = 0xffff;
2389 vmcb->gs.base = 0x00;
2391 vmcb->ldtr.sel = 0x00;
2392 vmcb->ldtr.attr.bytes = 0x0000;
2393 vmcb->ldtr.limit = 0x0;
2394 vmcb->ldtr.base = 0x00;
2396 vmcb->gdtr.sel = 0x00;
2397 vmcb->gdtr.attr.bytes = 0x0000;
2398 vmcb->gdtr.limit = 0x0;
2399 vmcb->gdtr.base = 0x00;
2401 vmcb->tr.sel = 0;
2402 vmcb->tr.attr.bytes = 0;
2403 vmcb->tr.limit = 0x0;
2404 vmcb->tr.base = 0;
2406 vmcb->idtr.sel = 0x00;
2407 vmcb->idtr.attr.bytes = 0x0000;
2408 vmcb->idtr.limit = 0x3ff;
2409 vmcb->idtr.base = 0x00;
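/*
 * An IDTR limit of 0x3ff covers the 256 four-byte real-mode IVT entries
 * at physical addresses 0x000-0x3FF, matching the architectural state
 * after a hardware reset.
 */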
2411 vmcb->rax = 0;
2412 vmcb->rsp = 0;
2414 return 0;
2417 asmlinkage void svm_vmexit_handler(struct cpu_user_regs *regs)
2419 unsigned int exit_reason;
2420 unsigned long eip;
2421 struct vcpu *v = current;
2422 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
2423 eventinj_t eventinj;
2424 int inst_len, rc;
2426 exit_reason = vmcb->exitcode;
2428 HVMTRACE_2D(VMEXIT, v, vmcb->rip, exit_reason);
2430 if ( unlikely(exit_reason == VMEXIT_INVALID) )
2432 svm_dump_vmcb(__func__, vmcb);
2433 goto exit_and_crash;
2436 perfc_incra(svmexits, exit_reason);
2437 eip = vmcb->rip;
2439 /* Event delivery caused this intercept? Queue for redelivery. */
2440 eventinj = vmcb->exitintinfo;
2441 if ( unlikely(eventinj.fields.v) &&
2442 hvm_event_needs_reinjection(eventinj.fields.type,
2443 eventinj.fields.vector) )
2444 vmcb->eventinj = eventinj;
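/*
 * The copy above re-queues an event whose delivery was cut short by this
 * #VMEXIT: exitintinfo describes it, and writing it back into eventinj
 * makes the processor retry the delivery on the next VMRUN.
 */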
2446 switch ( exit_reason )
2448 case VMEXIT_INTR:
2449 /* Asynchronous event, handled when we STGI'd after the VMEXIT. */
2450 HVMTRACE_0D(INTR, v);
2451 break;
2453 case VMEXIT_NMI:
2454 /* Asynchronous event, handled when we STGI'd after the VMEXIT. */
2455 HVMTRACE_0D(NMI, v);
2456 break;
2458 case VMEXIT_SMI:
2459 /* Asynchronous event, handled when we STGI'd after the VMEXIT. */
2460 HVMTRACE_0D(SMI, v);
2461 break;
2463 case VMEXIT_EXCEPTION_DB:
2464 if ( !v->domain->debugger_attached )
2465 goto exit_and_crash;
2466 domain_pause_for_debugger();
2467 break;
2469 case VMEXIT_EXCEPTION_BP:
2470 if ( !v->domain->debugger_attached )
2471 goto exit_and_crash;
2472 /* AMD Vol2, 15.11: INT3, INTO, BOUND intercepts do not update RIP. */
2473 inst_len = __get_instruction_length(v, INSTR_INT3, NULL);
2474 __update_guest_eip(vmcb, inst_len);
2475 domain_pause_for_debugger();
2476 break;
2478 case VMEXIT_EXCEPTION_NM:
2479 svm_do_no_device_fault(vmcb);
2480 break;
2482 case VMEXIT_EXCEPTION_PF: {
2483 unsigned long va;
2484 va = vmcb->exitinfo2;
2485 regs->error_code = vmcb->exitinfo1;
2486 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2487 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
2488 (unsigned long)regs->eax, (unsigned long)regs->ebx,
2489 (unsigned long)regs->ecx, (unsigned long)regs->edx,
2490 (unsigned long)regs->esi, (unsigned long)regs->edi);
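/*
 * paging_fault() returning non-zero means Xen handled the fault itself
 * (e.g. a shadow page table fixup) and the guest must not see it;
 * otherwise CR2 is set and the #PF is reflected with the original error
 * code taken from exitinfo1.
 */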
2492 if ( paging_fault(va, regs) )
2494 HVMTRACE_2D(PF_XEN, v, va, regs->error_code);
2495 break;
2498 v->arch.hvm_svm.cpu_cr2 = vmcb->cr2 = va;
2499 svm_inject_exception(v, TRAP_page_fault, 1, regs->error_code);
2500 break;
2503 case VMEXIT_EXCEPTION_MC:
2504 HVMTRACE_0D(MCE, v);
2505 svm_store_cpu_guest_regs(v, regs, NULL);
2506 do_machine_check(regs);
2507 break;
2509 case VMEXIT_VINTR:
2510 vmcb->vintr.fields.irq = 0;
2511 vmcb->general1_intercepts &= ~GENERAL1_INTERCEPT_VINTR;
2512 break;
2514 case VMEXIT_INVD:
2515 svm_vmexit_do_invd(v);
2516 break;
2518 case VMEXIT_GDTR_WRITE:
2519 printk("WRITE to GDTR\n");
2520 break;
2522 case VMEXIT_TASK_SWITCH:
2523 goto exit_and_crash;
2525 case VMEXIT_CPUID:
2526 svm_vmexit_do_cpuid(vmcb, regs);
2527 break;
2529 case VMEXIT_HLT:
2530 svm_vmexit_do_hlt(vmcb);
2531 break;
2533 case VMEXIT_INVLPG:
2534 svm_handle_invlpg(0, regs);
2535 break;
2537 case VMEXIT_INVLPGA:
2538 svm_handle_invlpg(1, regs);
2539 break;
2541 case VMEXIT_VMMCALL:
2542 inst_len = __get_instruction_length(v, INSTR_VMCALL, NULL);
2543 ASSERT(inst_len > 0);
2544 HVMTRACE_1D(VMMCALL, v, regs->eax);
2545 rc = hvm_do_hypercall(regs);
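/*
 * rIP is only advanced when the hypercall was not preempted: a preempted
 * VMMCALL is re-executed when the vcpu is next scheduled, so the
 * hypercall can continue.
 */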
2546 if ( rc != HVM_HCALL_preempted )
2548 __update_guest_eip(vmcb, inst_len);
2549 if ( rc == HVM_HCALL_invalidate )
2550 send_invalidate_req();
2552 break;
2554 case VMEXIT_CR0_READ:
2555 svm_cr_access(v, 0, TYPE_MOV_FROM_CR, regs);
2556 break;
2558 case VMEXIT_CR2_READ:
2559 svm_cr_access(v, 2, TYPE_MOV_FROM_CR, regs);
2560 break;
2562 case VMEXIT_CR3_READ:
2563 svm_cr_access(v, 3, TYPE_MOV_FROM_CR, regs);
2564 break;
2566 case VMEXIT_CR4_READ:
2567 svm_cr_access(v, 4, TYPE_MOV_FROM_CR, regs);
2568 break;
2570 case VMEXIT_CR8_READ:
2571 svm_cr_access(v, 8, TYPE_MOV_FROM_CR, regs);
2572 break;
2574 case VMEXIT_CR0_WRITE:
2575 svm_cr_access(v, 0, TYPE_MOV_TO_CR, regs);
2576 break;
2578 case VMEXIT_CR2_WRITE:
2579 svm_cr_access(v, 2, TYPE_MOV_TO_CR, regs);
2580 break;
2582 case VMEXIT_CR3_WRITE:
2583 svm_cr_access(v, 3, TYPE_MOV_TO_CR, regs);
2584 local_flush_tlb();
2585 break;
2587 case VMEXIT_CR4_WRITE:
2588 svm_cr_access(v, 4, TYPE_MOV_TO_CR, regs);
2589 break;
2591 case VMEXIT_CR8_WRITE:
2592 svm_cr_access(v, 8, TYPE_MOV_TO_CR, regs);
2593 break;
2595 case VMEXIT_DR0_WRITE ... VMEXIT_DR7_WRITE:
2596 svm_dr_access(v, regs);
2597 break;
2599 case VMEXIT_IOIO:
2600 svm_io_instruction(v);
2601 break;
2603 case VMEXIT_MSR:
2604 svm_do_msr_access(v, regs);
2605 break;
2607 case VMEXIT_SHUTDOWN:
2608 hvm_triple_fault();
2609 break;
2611 case VMEXIT_VMRUN:
2612 case VMEXIT_VMLOAD:
2613 case VMEXIT_VMSAVE:
2614 case VMEXIT_STGI:
2615 case VMEXIT_CLGI:
2616 case VMEXIT_SKINIT:
2617 /* Report "Invalid opcode" on any VM-operation except VMMCALL */
2618 svm_inject_exception(v, TRAP_invalid_op, 0, 0);
2619 break;
2621 case VMEXIT_NPF:
2622 regs->error_code = vmcb->exitinfo1;
2623 if ( !svm_do_nested_pgfault(vmcb->exitinfo2, regs) )
2624 domain_crash(v->domain);
2625 break;
2627 default:
2628 exit_and_crash:
2629 gdprintk(XENLOG_ERR, "unexpected VMEXIT: exit reason = 0x%x, "
2630 "exitinfo1 = %"PRIx64", exitinfo2 = %"PRIx64"\n",
2631 exit_reason,
2632 (u64)vmcb->exitinfo1, (u64)vmcb->exitinfo2);
2633 domain_crash(v->domain);
2634 break;
2638 asmlinkage void svm_trace_vmentry(void)
2640 struct vcpu *v = current;
2642 /* This is the last C code before the VMRUN instruction. */
2643 HVMTRACE_0D(VMENTRY, v);
2646 /*
2647 * Local variables:
2648 * mode: C
2649 * c-set-style: "BSD"
2650 * c-basic-offset: 4
2651 * tab-width: 4
2652 * indent-tabs-mode: nil
2653 * End:
2654 */