ia64/xen-unstable

xen/arch/x86/hvm/svm/svm.c @ 15675:66147ca8f9c4

hvm: Define common (across VMX and SVM) set of event types.
Signed-off-by: Keir Fraser <keir@xensource.com>
author kfraser@localhost.localdomain
date Tue Jul 31 10:11:47 2007 +0100 (2007-07-31)
parents 9174a8cfb578
children 0636f262ecd8
1 /*
2 * svm.c: handling SVM architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 * Copyright (c) 2005-2007, Advanced Micro Devices, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
17 * Place - Suite 330, Boston, MA 02111-1307 USA.
18 */
20 #include <xen/config.h>
21 #include <xen/init.h>
22 #include <xen/lib.h>
23 #include <xen/trace.h>
24 #include <xen/sched.h>
25 #include <xen/irq.h>
26 #include <xen/softirq.h>
27 #include <xen/hypercall.h>
28 #include <xen/domain_page.h>
29 #include <asm/current.h>
30 #include <asm/io.h>
31 #include <asm/paging.h>
32 #include <asm/p2m.h>
33 #include <asm/regs.h>
34 #include <asm/cpufeature.h>
35 #include <asm/processor.h>
36 #include <asm/types.h>
37 #include <asm/msr.h>
38 #include <asm/spinlock.h>
39 #include <asm/hvm/hvm.h>
40 #include <asm/hvm/support.h>
41 #include <asm/hvm/io.h>
42 #include <asm/hvm/svm/asid.h>
43 #include <asm/hvm/svm/svm.h>
44 #include <asm/hvm/svm/vmcb.h>
45 #include <asm/hvm/svm/emulate.h>
46 #include <asm/hvm/svm/intr.h>
47 #include <asm/x86_emulate.h>
48 #include <public/sched.h>
49 #include <asm/hvm/vpt.h>
50 #include <asm/hvm/trace.h>
51 #include <asm/hap.h>
53 #define set_segment_register(name, value) \
54 asm volatile ( "movw %%ax ,%%" STR(name) "" : : "a" (value) )
56 enum handler_return { HNDL_done, HNDL_unhandled, HNDL_exception_raised };
58 int inst_copy_from_guest(unsigned char *buf, unsigned long guest_eip,
59 int inst_len);
60 asmlinkage void do_IRQ(struct cpu_user_regs *);
62 static int svm_reset_to_realmode(struct vcpu *v,
63 struct cpu_user_regs *regs);
65 /* va of hardware host save area */
66 static void *hsa[NR_CPUS] __read_mostly;
68 /* vmcb used for extended host state */
69 static void *root_vmcb[NR_CPUS] __read_mostly;
71 /* hardware assisted paging bits */
72 extern int opt_hap_enabled;
74 static void svm_inject_exception(
75 struct vcpu *v, int trap, int ev, int error_code)
76 {
77 eventinj_t event;
78 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
80 if ( trap == TRAP_page_fault )
81 HVMTRACE_2D(PF_INJECT, v, v->arch.hvm_svm.cpu_cr2, error_code);
82 else
83 HVMTRACE_2D(INJ_EXC, v, trap, error_code);
85 event.bytes = 0;
86 event.fields.v = 1;
87 event.fields.type = X86_EVENTTYPE_HW_EXCEPTION;
88 event.fields.vector = trap;
89 event.fields.ev = ev;
90 event.fields.errorcode = error_code;
92 ASSERT(vmcb->eventinj.fields.v == 0);
94 vmcb->eventinj = event;
95 }
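/*
 * Note: the eventinj_t fields filled in above map onto the VMCB EVENTINJ
 * field described in the AMD64 Architecture Programmer's Manual, Vol. 2:
 * bits 7:0 vector, 10:8 type, bit 11 error-code-valid, bit 31 valid,
 * bits 63:32 error code.  X86_EVENTTYPE_HW_EXCEPTION is the common
 * (VMX/SVM) encoding for a hardware exception (type 3 on SVM).
 */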
97 static void svm_cpu_down(void)
98 {
99 write_efer(read_efer() & ~EFER_SVME);
100 }
102 #ifdef __x86_64__
104 static int svm_lme_is_set(struct vcpu *v)
105 {
106 u64 guest_efer = v->arch.hvm_svm.cpu_shadow_efer;
107 return guest_efer & EFER_LME;
108 }
110 static int svm_long_mode_enabled(struct vcpu *v)
111 {
112 u64 guest_efer = v->arch.hvm_svm.cpu_shadow_efer;
113 return guest_efer & EFER_LMA;
114 }
116 #else /* __i386__ */
118 static int svm_lme_is_set(struct vcpu *v)
119 { return 0; }
120 static int svm_long_mode_enabled(struct vcpu *v)
121 { return 0; }
123 #endif
125 static int svm_cr4_pae_is_set(struct vcpu *v)
126 {
127 unsigned long guest_cr4 = v->arch.hvm_svm.cpu_shadow_cr4;
128 return guest_cr4 & X86_CR4_PAE;
129 }
131 static int svm_paging_enabled(struct vcpu *v)
132 {
133 unsigned long guest_cr0 = v->arch.hvm_svm.cpu_shadow_cr0;
134 return (guest_cr0 & X86_CR0_PE) && (guest_cr0 & X86_CR0_PG);
135 }
137 static int svm_pae_enabled(struct vcpu *v)
138 {
139 unsigned long guest_cr4 = v->arch.hvm_svm.cpu_shadow_cr4;
140 return svm_paging_enabled(v) && (guest_cr4 & X86_CR4_PAE);
141 }
143 static int svm_nx_enabled(struct vcpu *v)
144 {
145 return v->arch.hvm_svm.cpu_shadow_efer & EFER_NX;
146 }
148 static int svm_pgbit_test(struct vcpu *v)
149 {
150 return v->arch.hvm_svm.cpu_shadow_cr0 & X86_CR0_PG;
151 }
153 static void svm_store_cpu_guest_regs(
154 struct vcpu *v, struct cpu_user_regs *regs, unsigned long *crs)
155 {
156 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
158 if ( regs != NULL )
159 {
160 regs->ss = vmcb->ss.sel;
161 regs->esp = vmcb->rsp;
162 regs->eflags = vmcb->rflags;
163 regs->cs = vmcb->cs.sel;
164 regs->eip = vmcb->rip;
165 }
167 if ( crs != NULL )
168 {
169 /* Returning the guest's regs */
170 crs[0] = v->arch.hvm_svm.cpu_shadow_cr0;
171 crs[2] = v->arch.hvm_svm.cpu_cr2;
172 crs[3] = v->arch.hvm_svm.cpu_cr3;
173 crs[4] = v->arch.hvm_svm.cpu_shadow_cr4;
174 }
175 }
177 static enum handler_return long_mode_do_msr_write(struct cpu_user_regs *regs)
178 {
179 u64 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
180 u32 ecx = regs->ecx;
181 struct vcpu *v = current;
182 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
184 HVM_DBG_LOG(DBG_LEVEL_0, "msr %x msr_content %"PRIx64,
185 ecx, msr_content);
187 switch ( ecx )
188 {
189 case MSR_EFER:
190 /* Offending reserved bit will cause #GP. */
191 #ifdef __x86_64__
192 if ( (msr_content & ~(EFER_LME | EFER_LMA | EFER_NX | EFER_SCE)) ||
193 #else
194 if ( (msr_content & ~(EFER_NX | EFER_SCE)) ||
195 #endif
196 (!cpu_has_nx && (msr_content & EFER_NX)) ||
197 (!cpu_has_syscall && (msr_content & EFER_SCE)) )
198 {
199 gdprintk(XENLOG_WARNING, "Trying to set reserved bit in "
200 "EFER: %"PRIx64"\n", msr_content);
201 goto gp_fault;
202 }
204 if ( (msr_content & EFER_LME) && !svm_lme_is_set(v) )
205 {
206 /* EFER.LME transition from 0 to 1. */
207 if ( svm_paging_enabled(v) || !svm_cr4_pae_is_set(v) )
208 {
209 gdprintk(XENLOG_WARNING, "Trying to set LME bit when "
210 "in paging mode or PAE bit is not set\n");
211 goto gp_fault;
212 }
213 }
214 else if ( !(msr_content & EFER_LME) && svm_lme_is_set(v) )
215 {
216 /* EFER.LME transition from 1 to 0. */
217 if ( svm_paging_enabled(v) )
218 {
219 gdprintk(XENLOG_WARNING,
220 "Trying to clear EFER.LME while paging enabled\n");
221 goto gp_fault;
222 }
223 }
225 v->arch.hvm_svm.cpu_shadow_efer = msr_content;
226 vmcb->efer = msr_content | EFER_SVME;
227 if ( !svm_paging_enabled(v) )
228 vmcb->efer &= ~(EFER_LME | EFER_LMA);
230 break;
232 case MSR_K8_MC4_MISC: /* Threshold register */
233 /*
234 * MCA/MCE: Threshold register is reported to be locked, so we ignore
235 * all write accesses. This behaviour matches real HW, so guests should
236 * have no problem with this.
237 */
238 break;
240 default:
241 return HNDL_unhandled;
242 }
244 return HNDL_done;
246 gp_fault:
247 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
248 return HNDL_exception_raised;
249 }
252 #define loaddebug(_v,_reg) \
253 asm volatile ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
254 #define savedebug(_v,_reg) \
255 asm volatile ("mov %%db" #_reg ",%0" : "=r" ((_v)->debugreg[_reg]))
257 static void svm_save_dr(struct vcpu *v)
258 {
259 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
261 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
262 return;
264 /* Clear the DR dirty flag and re-enable intercepts for DR accesses. */
265 v->arch.hvm_vcpu.flag_dr_dirty = 0;
266 v->arch.hvm_svm.vmcb->dr_intercepts = DR_INTERCEPT_ALL_WRITES;
268 savedebug(&v->arch.guest_context, 0);
269 savedebug(&v->arch.guest_context, 1);
270 savedebug(&v->arch.guest_context, 2);
271 savedebug(&v->arch.guest_context, 3);
272 v->arch.guest_context.debugreg[6] = vmcb->dr6;
273 v->arch.guest_context.debugreg[7] = vmcb->dr7;
274 }
277 static void __restore_debug_registers(struct vcpu *v)
278 {
279 loaddebug(&v->arch.guest_context, 0);
280 loaddebug(&v->arch.guest_context, 1);
281 loaddebug(&v->arch.guest_context, 2);
282 loaddebug(&v->arch.guest_context, 3);
283 /* DR6 and DR7 are loaded from the VMCB. */
284 }
287 int svm_vmcb_save(struct vcpu *v, struct hvm_hw_cpu *c)
288 {
289 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
291 c->rip = vmcb->rip;
293 #ifdef HVM_DEBUG_SUSPEND
294 printk("%s: rip=0x%"PRIx64".\n",
295 __func__,
296 c->rip);
297 #endif
299 c->rsp = vmcb->rsp;
300 c->rflags = vmcb->rflags;
302 c->cr0 = v->arch.hvm_svm.cpu_shadow_cr0;
303 c->cr2 = v->arch.hvm_svm.cpu_cr2;
304 c->cr3 = v->arch.hvm_svm.cpu_cr3;
305 c->cr4 = v->arch.hvm_svm.cpu_shadow_cr4;
307 #ifdef HVM_DEBUG_SUSPEND
308 printk("%s: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
309 __func__,
310 c->cr3,
311 c->cr0,
312 c->cr4);
313 #endif
315 c->idtr_limit = vmcb->idtr.limit;
316 c->idtr_base = vmcb->idtr.base;
318 c->gdtr_limit = vmcb->gdtr.limit;
319 c->gdtr_base = vmcb->gdtr.base;
321 c->cs_sel = vmcb->cs.sel;
322 c->cs_limit = vmcb->cs.limit;
323 c->cs_base = vmcb->cs.base;
324 c->cs_arbytes = vmcb->cs.attr.bytes;
326 c->ds_sel = vmcb->ds.sel;
327 c->ds_limit = vmcb->ds.limit;
328 c->ds_base = vmcb->ds.base;
329 c->ds_arbytes = vmcb->ds.attr.bytes;
331 c->es_sel = vmcb->es.sel;
332 c->es_limit = vmcb->es.limit;
333 c->es_base = vmcb->es.base;
334 c->es_arbytes = vmcb->es.attr.bytes;
336 c->ss_sel = vmcb->ss.sel;
337 c->ss_limit = vmcb->ss.limit;
338 c->ss_base = vmcb->ss.base;
339 c->ss_arbytes = vmcb->ss.attr.bytes;
341 c->fs_sel = vmcb->fs.sel;
342 c->fs_limit = vmcb->fs.limit;
343 c->fs_base = vmcb->fs.base;
344 c->fs_arbytes = vmcb->fs.attr.bytes;
346 c->gs_sel = vmcb->gs.sel;
347 c->gs_limit = vmcb->gs.limit;
348 c->gs_base = vmcb->gs.base;
349 c->gs_arbytes = vmcb->gs.attr.bytes;
351 c->tr_sel = vmcb->tr.sel;
352 c->tr_limit = vmcb->tr.limit;
353 c->tr_base = vmcb->tr.base;
354 c->tr_arbytes = vmcb->tr.attr.bytes;
356 c->ldtr_sel = vmcb->ldtr.sel;
357 c->ldtr_limit = vmcb->ldtr.limit;
358 c->ldtr_base = vmcb->ldtr.base;
359 c->ldtr_arbytes = vmcb->ldtr.attr.bytes;
361 c->sysenter_cs = vmcb->sysenter_cs;
362 c->sysenter_esp = vmcb->sysenter_esp;
363 c->sysenter_eip = vmcb->sysenter_eip;
365 /* Save any event/interrupt that was being injected when we last exited. */
366 if ( vmcb->exitintinfo.fields.v )
367 {
368 c->pending_event = vmcb->exitintinfo.bytes & 0xffffffff;
369 c->error_code = vmcb->exitintinfo.fields.errorcode;
370 }
371 else if ( vmcb->eventinj.fields.v )
372 {
373 c->pending_event = vmcb->eventinj.bytes & 0xffffffff;
374 c->error_code = vmcb->eventinj.fields.errorcode;
375 }
376 else
377 {
378 c->pending_event = 0;
379 c->error_code = 0;
380 }
382 return 1;
383 }
386 int svm_vmcb_restore(struct vcpu *v, struct hvm_hw_cpu *c)
387 {
388 unsigned long mfn, old_base_mfn;
389 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
391 vmcb->rip = c->rip;
392 vmcb->rsp = c->rsp;
393 vmcb->rflags = c->rflags;
395 v->arch.hvm_svm.cpu_shadow_cr0 = c->cr0;
396 vmcb->cr0 = c->cr0 | X86_CR0_WP | X86_CR0_ET | X86_CR0_PG;
398 v->arch.hvm_svm.cpu_cr2 = c->cr2;
400 #ifdef HVM_DEBUG_SUSPEND
401 printk("%s: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
402 __func__,
403 c->cr3,
404 c->cr0,
405 c->cr4);
406 #endif
408 if ( !svm_paging_enabled(v) )
409 {
410 printk("%s: paging not enabled.\n", __func__);
411 goto skip_cr3;
412 }
414 if ( c->cr3 == v->arch.hvm_svm.cpu_cr3 )
415 {
416 /*
417 * This is a simple TLB flush, implying the guest has
418 * removed some translation or changed page attributes.
419 * We simply invalidate the shadow.
420 */
421 mfn = gmfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT);
422 if ( mfn != pagetable_get_pfn(v->arch.guest_table) )
423 goto bad_cr3;
424 }
425 else
426 {
427 /*
428 * If different, make a shadow. Check if the PDBR is valid
429 * first.
430 */
431 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %"PRIx64, c->cr3);
432 mfn = gmfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT);
433 if( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
434 goto bad_cr3;
436 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
437 v->arch.guest_table = pagetable_from_pfn(mfn);
438 if (old_base_mfn)
439 put_page(mfn_to_page(old_base_mfn));
440 v->arch.hvm_svm.cpu_cr3 = c->cr3;
441 }
443 skip_cr3:
444 vmcb->cr4 = c->cr4 | HVM_CR4_HOST_MASK;
445 v->arch.hvm_svm.cpu_shadow_cr4 = c->cr4;
447 vmcb->idtr.limit = c->idtr_limit;
448 vmcb->idtr.base = c->idtr_base;
450 vmcb->gdtr.limit = c->gdtr_limit;
451 vmcb->gdtr.base = c->gdtr_base;
453 vmcb->cs.sel = c->cs_sel;
454 vmcb->cs.limit = c->cs_limit;
455 vmcb->cs.base = c->cs_base;
456 vmcb->cs.attr.bytes = c->cs_arbytes;
458 vmcb->ds.sel = c->ds_sel;
459 vmcb->ds.limit = c->ds_limit;
460 vmcb->ds.base = c->ds_base;
461 vmcb->ds.attr.bytes = c->ds_arbytes;
463 vmcb->es.sel = c->es_sel;
464 vmcb->es.limit = c->es_limit;
465 vmcb->es.base = c->es_base;
466 vmcb->es.attr.bytes = c->es_arbytes;
468 vmcb->ss.sel = c->ss_sel;
469 vmcb->ss.limit = c->ss_limit;
470 vmcb->ss.base = c->ss_base;
471 vmcb->ss.attr.bytes = c->ss_arbytes;
472 vmcb->cpl = vmcb->ss.attr.fields.dpl;
474 vmcb->fs.sel = c->fs_sel;
475 vmcb->fs.limit = c->fs_limit;
476 vmcb->fs.base = c->fs_base;
477 vmcb->fs.attr.bytes = c->fs_arbytes;
479 vmcb->gs.sel = c->gs_sel;
480 vmcb->gs.limit = c->gs_limit;
481 vmcb->gs.base = c->gs_base;
482 vmcb->gs.attr.bytes = c->gs_arbytes;
484 vmcb->tr.sel = c->tr_sel;
485 vmcb->tr.limit = c->tr_limit;
486 vmcb->tr.base = c->tr_base;
487 vmcb->tr.attr.bytes = c->tr_arbytes;
489 vmcb->ldtr.sel = c->ldtr_sel;
490 vmcb->ldtr.limit = c->ldtr_limit;
491 vmcb->ldtr.base = c->ldtr_base;
492 vmcb->ldtr.attr.bytes = c->ldtr_arbytes;
494 vmcb->sysenter_cs = c->sysenter_cs;
495 vmcb->sysenter_esp = c->sysenter_esp;
496 vmcb->sysenter_eip = c->sysenter_eip;
498 /* update VMCB for nested paging restore */
499 if ( paging_mode_hap(v->domain) ) {
500 vmcb->cr0 = v->arch.hvm_svm.cpu_shadow_cr0;
501 vmcb->cr4 = v->arch.hvm_svm.cpu_shadow_cr4 |
502 (HVM_CR4_HOST_MASK & ~X86_CR4_PAE);
503 vmcb->cr3 = c->cr3;
504 vmcb->np_enable = 1;
505 vmcb->g_pat = 0x0007040600070406ULL; /* guest PAT */
506 vmcb->h_cr3 = pagetable_get_paddr(v->domain->arch.phys_table);
507 }
509 vmcb->dr6 = c->dr6;
510 vmcb->dr7 = c->dr7;
512 if ( c->pending_valid )
513 {
514 gdprintk(XENLOG_INFO, "Re-injecting 0x%"PRIx32", 0x%"PRIx32"\n",
515 c->pending_event, c->error_code);
517 /* VMX uses a different type for #OF and #BP; fold into "Exception" */
518 if ( c->pending_type == 6 )
519 c->pending_type = 3;
520 /* Sanity check */
521 if ( c->pending_type == 1 || c->pending_type > 4
522 || c->pending_reserved != 0 )
523 {
524 gdprintk(XENLOG_ERR, "Invalid pending event 0x%"PRIx32"\n",
525 c->pending_event);
526 return -EINVAL;
527 }
528 /* Put this pending event in exitintinfo and svm_intr_assist()
529 * will reinject it when we return to the guest. */
530 vmcb->exitintinfo.bytes = c->pending_event;
531 vmcb->exitintinfo.fields.errorcode = c->error_code;
532 }
534 paging_update_paging_modes(v);
535 /* signal paging update to ASID handler */
536 svm_asid_g_update_paging (v);
538 return 0;
540 bad_cr3:
541 gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%"PRIx64"\n", c->cr3);
542 return -EINVAL;
543 }
546 static void svm_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
547 {
548 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
550 data->shadow_gs = vmcb->kerngsbase;
551 data->msr_lstar = vmcb->lstar;
552 data->msr_star = vmcb->star;
553 data->msr_cstar = vmcb->cstar;
554 data->msr_syscall_mask = vmcb->sfmask;
555 data->msr_efer = v->arch.hvm_svm.cpu_shadow_efer;
556 data->msr_flags = -1ULL;
558 data->tsc = hvm_get_guest_time(v);
559 }
562 static void svm_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
563 {
564 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
566 vmcb->kerngsbase = data->shadow_gs;
567 vmcb->lstar = data->msr_lstar;
568 vmcb->star = data->msr_star;
569 vmcb->cstar = data->msr_cstar;
570 vmcb->sfmask = data->msr_syscall_mask;
571 v->arch.hvm_svm.cpu_shadow_efer = data->msr_efer;
572 vmcb->efer = data->msr_efer | EFER_SVME;
573 /* VMCB's EFER.LME isn't set unless we're actually in long mode
574 * (see long_mode_do_msr_write()) */
575 if ( !(vmcb->efer & EFER_LMA) )
576 vmcb->efer &= ~EFER_LME;
578 hvm_set_guest_time(v, data->tsc);
579 }
581 static void svm_save_vmcb_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
582 {
583 svm_save_cpu_state(v, ctxt);
584 svm_vmcb_save(v, ctxt);
585 }
587 static int svm_load_vmcb_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
588 {
589 svm_load_cpu_state(v, ctxt);
590 if (svm_vmcb_restore(v, ctxt)) {
591 printk("svm_vmcb_restore() failed!\n");
592 domain_crash(v->domain);
593 return -EINVAL;
594 }
596 return 0;
597 }
599 static void svm_restore_dr(struct vcpu *v)
600 {
601 if ( unlikely(v->arch.guest_context.debugreg[7] & 0xFF) )
602 __restore_debug_registers(v);
603 }
605 static int svm_interrupts_enabled(struct vcpu *v, enum hvm_intack type)
606 {
607 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
609 if ( type == hvm_intack_nmi )
610 return !vmcb->interrupt_shadow;
612 ASSERT((type == hvm_intack_pic) || (type == hvm_intack_lapic));
613 return !irq_masked(vmcb->rflags) && !vmcb->interrupt_shadow;
614 }
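/*
 * The return value of svm_guest_x86_mode() below encodes the guest's
 * execution mode: 0 for real mode, 1 for vm86 mode, and 2, 4 or 8 for
 * 16-, 32- and 64-bit protected mode respectively.
 */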
616 static int svm_guest_x86_mode(struct vcpu *v)
617 {
618 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
620 if ( unlikely(!(v->arch.hvm_svm.cpu_shadow_cr0 & X86_CR0_PE)) )
621 return 0;
622 if ( unlikely(vmcb->rflags & X86_EFLAGS_VM) )
623 return 1;
624 if ( svm_long_mode_enabled(v) && likely(vmcb->cs.attr.fields.l) )
625 return 8;
626 return (likely(vmcb->cs.attr.fields.db) ? 4 : 2);
627 }
629 static void svm_update_host_cr3(struct vcpu *v)
630 {
631 /* SVM doesn't have a HOST_CR3 equivalent to update. */
632 }
634 static void svm_update_guest_cr3(struct vcpu *v)
635 {
636 v->arch.hvm_svm.vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3;
637 }
639 static void svm_flush_guest_tlbs(void)
640 {
641 /* Roll over the CPU's ASID generation, so it gets a clean TLB when we
642 * next VMRUN. (If ASIDs are disabled, the whole TLB is flushed on
643 * VMRUN anyway). */
644 svm_asid_inc_generation();
645 }
647 static void svm_update_vtpr(struct vcpu *v, unsigned long value)
648 {
649 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
651 vmcb->vintr.fields.tpr = value & 0x0f;
652 }
654 static unsigned long svm_get_ctrl_reg(struct vcpu *v, unsigned int num)
655 {
656 switch ( num )
657 {
658 case 0:
659 return v->arch.hvm_svm.cpu_shadow_cr0;
660 case 2:
661 return v->arch.hvm_svm.cpu_cr2;
662 case 3:
663 return v->arch.hvm_svm.cpu_cr3;
664 case 4:
665 return v->arch.hvm_svm.cpu_shadow_cr4;
666 default:
667 BUG();
668 }
669 return 0; /* dummy */
670 }
672 static void svm_sync_vmcb(struct vcpu *v)
673 {
674 struct arch_svm_struct *arch_svm = &v->arch.hvm_svm;
676 if ( arch_svm->vmcb_in_sync )
677 return;
679 arch_svm->vmcb_in_sync = 1;
681 asm volatile (
682 ".byte 0x0f,0x01,0xdb" /* vmsave */
683 : : "a" (__pa(arch_svm->vmcb)) );
684 }
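/*
 * svm_sync_vmcb() executes VMSAVE with rAX pointing at the guest VMCB, so
 * the state maintained by VMLOAD/VMSAVE rather than by VMRUN/#VMEXIT
 * (FS, GS, TR, LDTR, KernelGSBase, STAR/LSTAR/CSTAR/SFMASK and the
 * SYSENTER MSRs) is written back to the VMCB before we read it.  This is
 * why only those segments are synced in the accessors below.
 */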
686 static unsigned long svm_get_segment_base(struct vcpu *v, enum x86_segment seg)
687 {
688 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
689 int long_mode = vmcb->cs.attr.fields.l && svm_long_mode_enabled(v);
691 switch ( seg )
692 {
693 case x86_seg_cs: return long_mode ? 0 : vmcb->cs.base;
694 case x86_seg_ds: return long_mode ? 0 : vmcb->ds.base;
695 case x86_seg_es: return long_mode ? 0 : vmcb->es.base;
696 case x86_seg_fs: svm_sync_vmcb(v); return vmcb->fs.base;
697 case x86_seg_gs: svm_sync_vmcb(v); return vmcb->gs.base;
698 case x86_seg_ss: return long_mode ? 0 : vmcb->ss.base;
699 case x86_seg_tr: svm_sync_vmcb(v); return vmcb->tr.base;
700 case x86_seg_gdtr: return vmcb->gdtr.base;
701 case x86_seg_idtr: return vmcb->idtr.base;
702 case x86_seg_ldtr: svm_sync_vmcb(v); return vmcb->ldtr.base;
703 }
704 BUG();
705 return 0;
706 }
708 static void svm_get_segment_register(struct vcpu *v, enum x86_segment seg,
709 struct segment_register *reg)
710 {
711 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
712 switch ( seg )
713 {
714 case x86_seg_cs:
715 memcpy(reg, &vmcb->cs, sizeof(*reg));
716 break;
717 case x86_seg_ds:
718 memcpy(reg, &vmcb->ds, sizeof(*reg));
719 break;
720 case x86_seg_es:
721 memcpy(reg, &vmcb->es, sizeof(*reg));
722 break;
723 case x86_seg_fs:
724 svm_sync_vmcb(v);
725 memcpy(reg, &vmcb->fs, sizeof(*reg));
726 break;
727 case x86_seg_gs:
728 svm_sync_vmcb(v);
729 memcpy(reg, &vmcb->gs, sizeof(*reg));
730 break;
731 case x86_seg_ss:
732 memcpy(reg, &vmcb->ss, sizeof(*reg));
733 break;
734 case x86_seg_tr:
735 svm_sync_vmcb(v);
736 memcpy(reg, &vmcb->tr, sizeof(*reg));
737 break;
738 case x86_seg_gdtr:
739 memcpy(reg, &vmcb->gdtr, sizeof(*reg));
740 break;
741 case x86_seg_idtr:
742 memcpy(reg, &vmcb->idtr, sizeof(*reg));
743 break;
744 case x86_seg_ldtr:
745 svm_sync_vmcb(v);
746 memcpy(reg, &vmcb->ldtr, sizeof(*reg));
747 break;
748 default: BUG();
749 }
750 }
752 /* Make sure that xen intercepts any FP accesses from current */
753 static void svm_stts(struct vcpu *v)
754 {
755 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
757 /*
758 * If the guest does not have TS enabled then we must cause and handle an
759 * exception on first use of the FPU. If the guest *does* have TS enabled
760 * then this is not necessary: no FPU activity can occur until the guest
761 * clears CR0.TS, and we will initialise the FPU when that happens.
762 */
763 if ( !(v->arch.hvm_svm.cpu_shadow_cr0 & X86_CR0_TS) )
764 {
765 v->arch.hvm_svm.vmcb->exception_intercepts |= 1U << TRAP_no_device;
766 vmcb->cr0 |= X86_CR0_TS;
767 }
768 }
771 static void svm_set_tsc_offset(struct vcpu *v, u64 offset)
772 {
773 v->arch.hvm_svm.vmcb->tsc_offset = offset;
774 }
777 static void svm_init_ap_context(
778 struct vcpu_guest_context *ctxt, int vcpuid, int trampoline_vector)
779 {
780 struct vcpu *v;
781 struct vmcb_struct *vmcb;
782 cpu_user_regs_t *regs;
783 u16 cs_sel;
785 /* We know this is safe because hvm_bringup_ap() does it */
786 v = current->domain->vcpu[vcpuid];
787 vmcb = v->arch.hvm_svm.vmcb;
788 regs = &v->arch.guest_context.user_regs;
790 memset(ctxt, 0, sizeof(*ctxt));
792 /*
793 * We execute the trampoline code in real mode. The trampoline vector
794 * passed to us is page aligned and is the physical frame number for
795 * the code.
796 */
797 cs_sel = trampoline_vector << 8;
798 ctxt->user_regs.eip = 0x0;
799 ctxt->user_regs.cs = cs_sel;
801 /*
802 * This is the launch of an AP; set state so that we begin executing
803 * the trampoline code in real-mode.
804 */
805 svm_reset_to_realmode(v, regs);
806 /* Adjust the vmcb's hidden register state. */
807 vmcb->rip = 0;
808 vmcb->cs.sel = cs_sel;
809 vmcb->cs.base = (cs_sel << 4);
810 }
812 static void svm_init_hypercall_page(struct domain *d, void *hypercall_page)
813 {
814 char *p;
815 int i;
817 for ( i = 0; i < (PAGE_SIZE / 32); i++ )
818 {
819 p = (char *)(hypercall_page + (i * 32));
820 *(u8 *)(p + 0) = 0xb8; /* mov imm32, %eax */
821 *(u32 *)(p + 1) = i;
822 *(u8 *)(p + 5) = 0x0f; /* vmmcall */
823 *(u8 *)(p + 6) = 0x01;
824 *(u8 *)(p + 7) = 0xd9;
825 *(u8 *)(p + 8) = 0xc3; /* ret */
826 }
828 /* Don't support HYPERVISOR_iret at the moment */
829 *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
830 }
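/*
 * Each 32-byte hypercall stub written above therefore decodes as:
 *     b8 nn 00 00 00    mov  $nn, %eax      ; hypercall number
 *     0f 01 d9          vmmcall
 *     c3                ret
 * with the __HYPERVISOR_iret slot replaced by 0f 0b (ud2).
 */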
832 static void svm_load_cpu_guest_regs(struct vcpu *v, struct cpu_user_regs *regs)
833 {
834 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
836 vmcb->ss.sel = regs->ss;
837 vmcb->rsp = regs->esp;
838 vmcb->rflags = regs->eflags | 2UL;
839 vmcb->cs.sel = regs->cs;
840 vmcb->rip = regs->eip;
841 }
843 static void svm_ctxt_switch_from(struct vcpu *v)
844 {
845 int cpu = smp_processor_id();
847 svm_save_dr(v);
849 svm_sync_vmcb(v);
851 asm volatile (
852 ".byte 0x0f,0x01,0xda" /* vmload */
853 : : "a" (__pa(root_vmcb[cpu])) );
855 #ifdef __x86_64__
856 /* Resume use of ISTs now that the host TR is reinstated. */
857 idt_tables[cpu][TRAP_double_fault].a |= IST_DF << 32;
858 idt_tables[cpu][TRAP_nmi].a |= IST_NMI << 32;
859 idt_tables[cpu][TRAP_machine_check].a |= IST_MCE << 32;
860 #endif
861 }
863 static void svm_ctxt_switch_to(struct vcpu *v)
864 {
865 int cpu = smp_processor_id();
867 #ifdef __x86_64__
868 /*
869 * This is required because VMRUN performs consistency checks,
870 * and some of the Dom0 selectors point to invalid GDT
871 * locations, which would cause AMD processors to shut
872 * down.
873 */
874 set_segment_register(ds, 0);
875 set_segment_register(es, 0);
876 set_segment_register(ss, 0);
878 /*
879 * Cannot use ISTs for NMI/#MC/#DF while we are running with the guest TR.
880 * But this doesn't matter: the IST is only req'd to handle SYSCALL/SYSRET.
881 */
882 idt_tables[cpu][TRAP_double_fault].a &= ~(7UL << 32);
883 idt_tables[cpu][TRAP_nmi].a &= ~(7UL << 32);
884 idt_tables[cpu][TRAP_machine_check].a &= ~(7UL << 32);
885 #endif
887 svm_restore_dr(v);
889 asm volatile (
890 ".byte 0x0f,0x01,0xdb" /* vmsave */
891 : : "a" (__pa(root_vmcb[cpu])) );
892 asm volatile (
893 ".byte 0x0f,0x01,0xda" /* vmload */
894 : : "a" (__pa(v->arch.hvm_svm.vmcb)) );
895 }
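/*
 * The ".byte 0x0f,0x01,0xda/0xdb" sequences above are hand-encoded VMLOAD
 * and VMSAVE instructions (presumably spelled out as raw bytes so the file
 * builds with assemblers that predate the SVM mnemonics); both take the
 * physical address of a VMCB in rAX.
 */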
897 static void svm_do_resume(struct vcpu *v)
898 {
899 bool_t debug_state = v->domain->debugger_attached;
901 if ( unlikely(v->arch.hvm_vcpu.debug_state_latch != debug_state) )
902 {
903 uint32_t mask = (1U << TRAP_debug) | (1U << TRAP_int3);
904 v->arch.hvm_vcpu.debug_state_latch = debug_state;
905 if ( debug_state )
906 v->arch.hvm_svm.vmcb->exception_intercepts |= mask;
907 else
908 v->arch.hvm_svm.vmcb->exception_intercepts &= ~mask;
909 }
911 if ( v->arch.hvm_svm.launch_core != smp_processor_id() )
912 {
913 v->arch.hvm_svm.launch_core = smp_processor_id();
914 hvm_migrate_timers(v);
916 /* Migrating to another ASID domain. Request a new ASID. */
917 svm_asid_init_vcpu(v);
918 }
920 hvm_do_resume(v);
921 reset_stack_and_jump(svm_asm_do_resume);
922 }
924 static int svm_domain_initialise(struct domain *d)
925 {
926 return 0;
927 }
929 static void svm_domain_destroy(struct domain *d)
930 {
931 }
933 static int svm_vcpu_initialise(struct vcpu *v)
934 {
935 int rc;
937 v->arch.schedule_tail = svm_do_resume;
938 v->arch.ctxt_switch_from = svm_ctxt_switch_from;
939 v->arch.ctxt_switch_to = svm_ctxt_switch_to;
941 v->arch.hvm_svm.launch_core = -1;
943 if ( (rc = svm_create_vmcb(v)) != 0 )
944 {
945 dprintk(XENLOG_WARNING,
946 "Failed to create VMCB for vcpu %d: err=%d.\n",
947 v->vcpu_id, rc);
948 return rc;
949 }
951 return 0;
952 }
954 static void svm_vcpu_destroy(struct vcpu *v)
955 {
956 svm_destroy_vmcb(v);
957 }
959 static void svm_hvm_inject_exception(
960 unsigned int trapnr, int errcode, unsigned long cr2)
961 {
962 struct vcpu *v = current;
963 if ( trapnr == TRAP_page_fault )
964 v->arch.hvm_svm.vmcb->cr2 = v->arch.hvm_svm.cpu_cr2 = cr2;
965 svm_inject_exception(v, trapnr, (errcode != -1), errcode);
966 }
968 static int svm_event_injection_faulted(struct vcpu *v)
969 {
970 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
971 return vmcb->exitintinfo.fields.v;
972 }
974 static struct hvm_function_table svm_function_table = {
975 .name = "SVM",
976 .cpu_down = svm_cpu_down,
977 .domain_initialise = svm_domain_initialise,
978 .domain_destroy = svm_domain_destroy,
979 .vcpu_initialise = svm_vcpu_initialise,
980 .vcpu_destroy = svm_vcpu_destroy,
981 .store_cpu_guest_regs = svm_store_cpu_guest_regs,
982 .load_cpu_guest_regs = svm_load_cpu_guest_regs,
983 .save_cpu_ctxt = svm_save_vmcb_ctxt,
984 .load_cpu_ctxt = svm_load_vmcb_ctxt,
985 .paging_enabled = svm_paging_enabled,
986 .long_mode_enabled = svm_long_mode_enabled,
987 .pae_enabled = svm_pae_enabled,
988 .nx_enabled = svm_nx_enabled,
989 .interrupts_enabled = svm_interrupts_enabled,
990 .guest_x86_mode = svm_guest_x86_mode,
991 .get_guest_ctrl_reg = svm_get_ctrl_reg,
992 .get_segment_base = svm_get_segment_base,
993 .get_segment_register = svm_get_segment_register,
994 .update_host_cr3 = svm_update_host_cr3,
995 .update_guest_cr3 = svm_update_guest_cr3,
996 .flush_guest_tlbs = svm_flush_guest_tlbs,
997 .update_vtpr = svm_update_vtpr,
998 .stts = svm_stts,
999 .set_tsc_offset = svm_set_tsc_offset,
1000 .inject_exception = svm_hvm_inject_exception,
1001 .init_ap_context = svm_init_ap_context,
1002 .init_hypercall_page = svm_init_hypercall_page,
1003 .event_injection_faulted = svm_event_injection_faulted
1004 };
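/*
 * This table is handed to the generic HVM layer via hvm_enable() in
 * start_svm() below; from then on the hvm_funcs dispatch table points at
 * these SVM-specific implementations.
 */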
1006 static void svm_npt_detect(void)
1007 {
1008 u32 eax, ebx, ecx, edx;
1010 /* Check CPUID for nested paging support. */
1011 cpuid(0x8000000A, &eax, &ebx, &ecx, &edx);
1013 if ( !(edx & 1) && opt_hap_enabled )
1014 {
1015 printk("SVM: Nested paging is not supported by this CPU.\n");
1016 opt_hap_enabled = 0;
1017 }
1018 }
1020 int start_svm(struct cpuinfo_x86 *c)
1021 {
1022 u32 eax, ecx, edx;
1023 u32 phys_hsa_lo, phys_hsa_hi;
1024 u64 phys_hsa;
1025 int cpu = smp_processor_id();
1027 /* Xen does not fill x86_capability words except 0. */
1028 ecx = cpuid_ecx(0x80000001);
1029 boot_cpu_data.x86_capability[5] = ecx;
1031 if ( !(test_bit(X86_FEATURE_SVME, &boot_cpu_data.x86_capability)) )
1032 return 0;
1034 /* Check whether SVM feature is disabled in BIOS */
1035 rdmsr(MSR_K8_VM_CR, eax, edx);
1036 if ( eax & K8_VMCR_SVME_DISABLE )
1037 {
1038 printk("AMD SVM Extension is disabled in BIOS.\n");
1039 return 0;
1040 }
1042 if ( ((hsa[cpu] = alloc_host_save_area()) == NULL) ||
1043 ((root_vmcb[cpu] = alloc_vmcb()) == NULL) )
1044 return 0;
1046 write_efer(read_efer() | EFER_SVME);
1048 svm_npt_detect();
1050 /* Initialize the HSA for this core. */
1051 phys_hsa = (u64) virt_to_maddr(hsa[cpu]);
1052 phys_hsa_lo = (u32) phys_hsa;
1053 phys_hsa_hi = (u32) (phys_hsa >> 32);
1054 wrmsr(MSR_K8_VM_HSAVE_PA, phys_hsa_lo, phys_hsa_hi);
1056 /* Initialize core's ASID handling. */
1057 svm_asid_init(c);
1059 if ( cpu != 0 )
1060 return 1;
1062 setup_vmcb_dump();
1064 hvm_enable(&svm_function_table);
1066 if ( opt_hap_enabled )
1067 printk("SVM: Nested paging enabled.\n");
1069 return 1;
1070 }
1072 static int svm_do_nested_pgfault(paddr_t gpa, struct cpu_user_regs *regs)
1073 {
1074 if (mmio_space(gpa)) {
1075 handle_mmio(gpa);
1076 return 1;
1077 }
1079 paging_mark_dirty(current->domain, get_mfn_from_gpfn(gpa >> PAGE_SHIFT));
1080 return p2m_set_flags(current->domain, gpa, __PAGE_HYPERVISOR|_PAGE_USER);
1081 }
1083 static void svm_do_no_device_fault(struct vmcb_struct *vmcb)
1084 {
1085 struct vcpu *v = current;
1087 setup_fpu(v);
1088 vmcb->exception_intercepts &= ~(1U << TRAP_no_device);
1090 if ( !(v->arch.hvm_svm.cpu_shadow_cr0 & X86_CR0_TS) )
1091 vmcb->cr0 &= ~X86_CR0_TS;
1092 }
1094 /* Reserved bits ECX: [31:14], [12:4], [2:1]*/
1095 #define SVM_VCPU_CPUID_L1_ECX_RESERVED 0xffffdff6
1096 /* Reserved bits EDX: [31:29], [27], [22:20], [18], [10] */
1097 #define SVM_VCPU_CPUID_L1_EDX_RESERVED 0xe8740400
1099 static void svm_vmexit_do_cpuid(struct vmcb_struct *vmcb,
1100 struct cpu_user_regs *regs)
1102 unsigned long input = regs->eax;
1103 unsigned int eax, ebx, ecx, edx;
1104 struct vcpu *v = current;
1105 int inst_len;
1107 hvm_cpuid(input, &eax, &ebx, &ecx, &edx);
1109 if ( input == 0x00000001 )
1111 /* Clear out reserved bits. */
1112 ecx &= ~SVM_VCPU_CPUID_L1_ECX_RESERVED;
1113 edx &= ~SVM_VCPU_CPUID_L1_EDX_RESERVED;
1115 /* Guest should only see one logical processor.
1116 * See details on page 23 of AMD CPUID Specification.
1117 */
1118 clear_bit(X86_FEATURE_HT & 31, &edx); /* clear the hyperthread bit */
1119 ebx &= 0xFF00FFFF; /* clear the logical processor count when HTT=0 */
1120 ebx |= 0x00010000; /* set to 1 just for precaution */
1122 else if ( input == 0x80000001 )
1124 if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
1125 clear_bit(X86_FEATURE_APIC & 31, &edx);
1127 #if CONFIG_PAGING_LEVELS >= 3
1128 if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
1129 #endif
1130 clear_bit(X86_FEATURE_PAE & 31, &edx);
1132 clear_bit(X86_FEATURE_PSE36 & 31, &edx);
1134 /* Clear the Cmp_Legacy bit
1135 * This bit is supposed to be zero when HTT = 0.
1136 * See details on page 23 of AMD CPUID Specification.
1137 */
1138 clear_bit(X86_FEATURE_CMP_LEGACY & 31, &ecx);
1140 /* Make SVM feature invisible to the guest. */
1141 clear_bit(X86_FEATURE_SVME & 31, &ecx);
1143 /* So far, we do not support 3DNow for the guest. */
1144 clear_bit(X86_FEATURE_3DNOW & 31, &edx);
1145 clear_bit(X86_FEATURE_3DNOWEXT & 31, &edx);
1146 /* no FFXSR instructions feature. */
1147 clear_bit(X86_FEATURE_FFXSR & 31, &edx);
1149 else if ( input == 0x80000007 || input == 0x8000000A )
1151 /* Mask out features of power management and SVM extension. */
1152 eax = ebx = ecx = edx = 0;
1154 else if ( input == 0x80000008 )
1156 /* Make sure the number of CPU cores is 1 when HTT=0 */
1157 ecx &= 0xFFFFFF00;
1160 regs->eax = (unsigned long)eax;
1161 regs->ebx = (unsigned long)ebx;
1162 regs->ecx = (unsigned long)ecx;
1163 regs->edx = (unsigned long)edx;
1165 HVMTRACE_3D(CPUID, v, input,
1166 ((uint64_t)eax << 32) | ebx, ((uint64_t)ecx << 32) | edx);
1168 inst_len = __get_instruction_length(v, INSTR_CPUID, NULL);
1169 ASSERT(inst_len > 0);
1170 __update_guest_eip(vmcb, inst_len);
1173 static unsigned long *get_reg_p(
1174 unsigned int gpreg,
1175 struct cpu_user_regs *regs, struct vmcb_struct *vmcb)
1177 unsigned long *reg_p = NULL;
1178 switch (gpreg)
1180 case SVM_REG_EAX:
1181 reg_p = (unsigned long *)&regs->eax;
1182 break;
1183 case SVM_REG_EBX:
1184 reg_p = (unsigned long *)&regs->ebx;
1185 break;
1186 case SVM_REG_ECX:
1187 reg_p = (unsigned long *)&regs->ecx;
1188 break;
1189 case SVM_REG_EDX:
1190 reg_p = (unsigned long *)&regs->edx;
1191 break;
1192 case SVM_REG_EDI:
1193 reg_p = (unsigned long *)&regs->edi;
1194 break;
1195 case SVM_REG_ESI:
1196 reg_p = (unsigned long *)&regs->esi;
1197 break;
1198 case SVM_REG_EBP:
1199 reg_p = (unsigned long *)&regs->ebp;
1200 break;
1201 case SVM_REG_ESP:
1202 reg_p = (unsigned long *)&vmcb->rsp;
1203 break;
1204 #ifdef __x86_64__
1205 case SVM_REG_R8:
1206 reg_p = (unsigned long *)&regs->r8;
1207 break;
1208 case SVM_REG_R9:
1209 reg_p = (unsigned long *)&regs->r9;
1210 break;
1211 case SVM_REG_R10:
1212 reg_p = (unsigned long *)&regs->r10;
1213 break;
1214 case SVM_REG_R11:
1215 reg_p = (unsigned long *)&regs->r11;
1216 break;
1217 case SVM_REG_R12:
1218 reg_p = (unsigned long *)&regs->r12;
1219 break;
1220 case SVM_REG_R13:
1221 reg_p = (unsigned long *)&regs->r13;
1222 break;
1223 case SVM_REG_R14:
1224 reg_p = (unsigned long *)&regs->r14;
1225 break;
1226 case SVM_REG_R15:
1227 reg_p = (unsigned long *)&regs->r15;
1228 break;
1229 #endif
1230 default:
1231 BUG();
1234 return reg_p;
1238 static unsigned long get_reg(
1239 unsigned int gpreg, struct cpu_user_regs *regs, struct vmcb_struct *vmcb)
1240 {
1241 unsigned long *gp;
1242 gp = get_reg_p(gpreg, regs, vmcb);
1243 return *gp;
1244 }
1247 static void set_reg(
1248 unsigned int gpreg, unsigned long value,
1249 struct cpu_user_regs *regs, struct vmcb_struct *vmcb)
1250 {
1251 unsigned long *gp;
1252 gp = get_reg_p(gpreg, regs, vmcb);
1253 *gp = value;
1254 }
1257 static void svm_dr_access(struct vcpu *v, struct cpu_user_regs *regs)
1258 {
1259 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1261 HVMTRACE_0D(DR_WRITE, v);
1263 v->arch.hvm_vcpu.flag_dr_dirty = 1;
1265 __restore_debug_registers(v);
1267 /* allow the guest full access to the debug registers */
1268 vmcb->dr_intercepts = 0;
1269 }
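/*
 * Debug registers are handled lazily: DR accesses are intercepted until the
 * guest first touches a debug register (above), at which point the real DRs
 * are loaded, the intercepts are dropped, and flag_dr_dirty is set so that
 * svm_save_dr() saves the registers and re-arms the intercepts on the next
 * context switch away from this vCPU.
 */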
1272 static void svm_get_prefix_info(struct vcpu *v, unsigned int dir,
1273 svm_segment_register_t **seg,
1274 unsigned int *asize)
1276 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1277 unsigned char inst[MAX_INST_LEN];
1278 int i;
1280 memset(inst, 0, MAX_INST_LEN);
1281 if (inst_copy_from_guest(inst, svm_rip2pointer(v), sizeof(inst))
1282 != MAX_INST_LEN)
1284 gdprintk(XENLOG_ERR, "get guest instruction failed\n");
1285 domain_crash(current->domain);
1286 return;
1289 for (i = 0; i < MAX_INST_LEN; i++)
1291 switch (inst[i])
1293 case 0xf3: /* REPZ */
1294 case 0xf2: /* REPNZ */
1295 case 0xf0: /* LOCK */
1296 case 0x66: /* data32 */
1297 #ifdef __x86_64__
1298 /* REX prefixes */
1299 case 0x40:
1300 case 0x41:
1301 case 0x42:
1302 case 0x43:
1303 case 0x44:
1304 case 0x45:
1305 case 0x46:
1306 case 0x47:
1308 case 0x48:
1309 case 0x49:
1310 case 0x4a:
1311 case 0x4b:
1312 case 0x4c:
1313 case 0x4d:
1314 case 0x4e:
1315 case 0x4f:
1316 #endif
1317 continue;
1318 case 0x67: /* addr32 */
1319 *asize ^= 48; /* Switch 16/32 bits */
1320 continue;
1321 case 0x2e: /* CS */
1322 *seg = &vmcb->cs;
1323 continue;
1324 case 0x36: /* SS */
1325 *seg = &vmcb->ss;
1326 continue;
1327 case 0x26: /* ES */
1328 *seg = &vmcb->es;
1329 continue;
1330 case 0x64: /* FS */
1331 svm_sync_vmcb(v);
1332 *seg = &vmcb->fs;
1333 continue;
1334 case 0x65: /* GS */
1335 svm_sync_vmcb(v);
1336 *seg = &vmcb->gs;
1337 continue;
1338 case 0x3e: /* DS */
1339 *seg = &vmcb->ds;
1340 continue;
1341 default:
1342 break;
1344 return;
1349 /* Get the address of INS/OUTS instruction */
1350 static int svm_get_io_address(
1351 struct vcpu *v, struct cpu_user_regs *regs,
1352 unsigned int size, ioio_info_t info,
1353 unsigned long *count, unsigned long *addr)
1355 unsigned long reg;
1356 unsigned int asize, isize;
1357 int long_mode = 0;
1358 svm_segment_register_t *seg = NULL;
1359 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1361 /* If we're in long mode, don't check the segment presence & limit */
1362 long_mode = vmcb->cs.attr.fields.l && svm_long_mode_enabled(v);
1364 /* d field of cs.attr is 1 for 32-bit, 0 for 16 or 64 bit.
1365 * l field combined with EFER_LMA says whether it's 16 or 64 bit.
1366 */
1367 asize = (long_mode)?64:((vmcb->cs.attr.fields.db)?32:16);
1370 /* The INS/OUTS opcodes are a single byte, so if we fetched more than
1371 * one byte (plus an optional REP prefix), the instruction carries other
1372 * prefixes that we need to decode.
1373 */
1374 isize = vmcb->exitinfo2 - vmcb->rip;
1376 if (info.fields.rep)
1377 isize --;
1379 if (isize > 1)
1380 svm_get_prefix_info(v, info.fields.type, &seg, &asize);
1382 if (info.fields.type == IOREQ_WRITE)
1384 reg = regs->esi;
1385 if (!seg) /* If no prefix, use DS. */
1386 seg = &vmcb->ds;
1387 if (!long_mode && (seg->attr.fields.type & 0xa) == 0x8) {
1388 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1389 return 0;
1392 else
1394 reg = regs->edi;
1395 seg = &vmcb->es; /* Note: This is ALWAYS ES. */
1396 if (!long_mode && (seg->attr.fields.type & 0xa) != 0x2) {
1397 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1398 return 0;
1402 /* If the segment isn't present, give GP fault! */
1403 if (!long_mode && !seg->attr.fields.p)
1405 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1406 return 0;
1409 if (asize == 16)
1411 *addr = (reg & 0xFFFF);
1412 *count = regs->ecx & 0xffff;
1414 else
1416 *addr = reg;
1417 *count = regs->ecx;
1419 if (!info.fields.rep)
1420 *count = 1;
1422 if (!long_mode)
1424 ASSERT(*addr == (u32)*addr);
1425 if ((u32)(*addr + size - 1) < (u32)*addr ||
1426 (seg->attr.fields.type & 0xc) != 0x4 ?
1427 *addr + size - 1 > seg->limit :
1428 *addr <= seg->limit)
1430 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1431 return 0;
1434 /* Check the limit for repeated instructions, as above we checked only
1435 the first instance. Truncate the count if a limit violation would
1436 occur. Note that the checking is not necessary for page granular
1437 segments as transfers crossing page boundaries will be broken up
1438 anyway. */
1439 if (!seg->attr.fields.g && *count > 1)
1441 if ((seg->attr.fields.type & 0xc) != 0x4)
1443 /* expand-up */
1444 if (!(regs->eflags & EF_DF))
1446 if (*addr + *count * size - 1 < *addr ||
1447 *addr + *count * size - 1 > seg->limit)
1448 *count = (seg->limit + 1UL - *addr) / size;
1450 else
1452 if (*count - 1 > *addr / size)
1453 *count = *addr / size + 1;
1456 else
1458 /* expand-down */
1459 if (!(regs->eflags & EF_DF))
1461 if (*count - 1 > -(s32)*addr / size)
1462 *count = -(s32)*addr / size + 1UL;
1464 else
1466 if (*addr < (*count - 1) * size ||
1467 *addr - (*count - 1) * size <= seg->limit)
1468 *count = (*addr - seg->limit - 1) / size + 1;
1471 ASSERT(*count);
1474 *addr += seg->base;
1476 #ifdef __x86_64__
1477 else
1479 if (seg == &vmcb->fs || seg == &vmcb->gs)
1480 *addr += seg->base;
1482 if (!is_canonical_address(*addr) ||
1483 !is_canonical_address(*addr + size - 1))
1485 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1486 return 0;
1488 if (*count > (1UL << 48) / size)
1489 *count = (1UL << 48) / size;
1490 if (!(regs->eflags & EF_DF))
1492 if (*addr + *count * size - 1 < *addr ||
1493 !is_canonical_address(*addr + *count * size - 1))
1494 *count = (*addr & ~((1UL << 48) - 1)) / size;
1496 else
1498 if ((*count - 1) * size > *addr ||
1499 !is_canonical_address(*addr + (*count - 1) * size))
1500 *count = (*addr & ~((1UL << 48) - 1)) / size + 1;
1502 ASSERT(*count);
1504 #endif
1506 return 1;
1510 static void svm_io_instruction(struct vcpu *v)
1512 struct cpu_user_regs *regs;
1513 struct hvm_io_op *pio_opp;
1514 unsigned int port;
1515 unsigned int size, dir, df;
1516 ioio_info_t info;
1517 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1519 pio_opp = &current->arch.hvm_vcpu.io_op;
1520 pio_opp->instr = INSTR_PIO;
1521 pio_opp->flags = 0;
1523 regs = &pio_opp->io_context;
1525 /* Copy current guest state into io instruction state structure. */
1526 memcpy(regs, guest_cpu_user_regs(), HVM_CONTEXT_STACK_BYTES);
1527 svm_store_cpu_guest_regs(v, regs, NULL);
1529 info.bytes = vmcb->exitinfo1;
1531 port = info.fields.port; /* port used to be addr */
1532 dir = info.fields.type; /* direction */
1533 df = regs->eflags & X86_EFLAGS_DF ? 1 : 0;
1535 if (info.fields.sz32)
1536 size = 4;
1537 else if (info.fields.sz16)
1538 size = 2;
1539 else
1540 size = 1;
1542 if (dir==IOREQ_READ)
1543 HVMTRACE_2D(IO_READ, v, port, size);
1544 else
1545 HVMTRACE_2D(IO_WRITE, v, port, size);
1547 HVM_DBG_LOG(DBG_LEVEL_IO,
1548 "svm_io_instruction: port 0x%x eip=%x:%"PRIx64", "
1549 "exit_qualification = %"PRIx64,
1550 port, vmcb->cs.sel, vmcb->rip, info.bytes);
1552 /* string instruction */
1553 if (info.fields.str)
1555 unsigned long addr, count;
1556 paddr_t paddr;
1557 unsigned long gfn;
1558 int sign = regs->eflags & X86_EFLAGS_DF ? -1 : 1;
1560 if (!svm_get_io_address(v, regs, size, info, &count, &addr))
1562 /* We failed to get a valid address, so don't do the IO operation -
1563 * it would just make things worse if we did! Hopefully the guest is
1564 * handling the injected #GP fault...
1565 */
1566 return;
1569 /* "rep" prefix */
1570 if (info.fields.rep)
1572 pio_opp->flags |= REPZ;
1575 /* Translate the address to a physical address */
1576 gfn = paging_gva_to_gfn(v, addr);
1577 if ( gfn == INVALID_GFN )
1579 /* The guest does not have the RAM address mapped.
1580 * Need to send in a page fault */
1581 int errcode = 0;
1582 /* IO read --> memory write */
1583 if ( dir == IOREQ_READ ) errcode |= PFEC_write_access;
1584 svm_hvm_inject_exception(TRAP_page_fault, errcode, addr);
1585 return;
1587 paddr = (paddr_t)gfn << PAGE_SHIFT | (addr & ~PAGE_MASK);
1589 /*
1590 * Handle string pio instructions that cross pages or that
1591 * are unaligned. See the comments in hvm_platform.c/handle_mmio()
1592 */
1593 if ((addr & PAGE_MASK) != ((addr + size - 1) & PAGE_MASK))
1595 unsigned long value = 0;
1597 pio_opp->flags |= OVERLAP;
1598 pio_opp->addr = addr;
1600 if (dir == IOREQ_WRITE) /* OUTS */
1602 if ( hvm_paging_enabled(current) )
1604 int rv = hvm_copy_from_guest_virt(&value, addr, size);
1605 if ( rv != 0 )
1607 /* Failed on the page-spanning copy. Inject PF into
1608 * the guest for the address where we failed. */
1609 addr += size - rv;
1610 gdprintk(XENLOG_DEBUG, "Pagefault reading non-io side "
1611 "of a page-spanning PIO: va=%#lx\n", addr);
1612 svm_hvm_inject_exception(TRAP_page_fault, 0, addr);
1613 return;
1616 else
1617 (void) hvm_copy_from_guest_phys(&value, addr, size);
1618 } else /* dir != IOREQ_WRITE */
1619 /* Remember where to write the result, as a *VA*.
1620 * Must be a VA so we can handle the page overlap
1621 * correctly in hvm_pio_assist() */
1622 pio_opp->addr = addr;
1624 if (count == 1)
1625 regs->eip = vmcb->exitinfo2;
1627 send_pio_req(port, 1, size, value, dir, df, 0);
1629 else
1631 unsigned long last_addr = sign > 0 ? addr + count * size - 1
1632 : addr - (count - 1) * size;
1634 if ((addr & PAGE_MASK) != (last_addr & PAGE_MASK))
1636 if (sign > 0)
1637 count = (PAGE_SIZE - (addr & ~PAGE_MASK)) / size;
1638 else
1639 count = (addr & ~PAGE_MASK) / size + 1;
1641 else
1642 regs->eip = vmcb->exitinfo2;
1644 send_pio_req(port, count, size, paddr, dir, df, 1);
1647 else
1649 /*
1650 * On SVM, the RIP of the instruction following the IN/OUT is saved in
1651 * ExitInfo2
1652 */
1653 regs->eip = vmcb->exitinfo2;
1655 if (port == 0xe9 && dir == IOREQ_WRITE && size == 1)
1656 hvm_print_line(v, regs->eax); /* guest debug output */
1658 send_pio_req(port, 1, size, regs->eax, dir, df, 0);
1662 static int svm_set_cr0(unsigned long value)
1664 struct vcpu *v = current;
1665 unsigned long mfn, old_value = v->arch.hvm_svm.cpu_shadow_cr0;
1666 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1667 unsigned long old_base_mfn;
1669 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx", value);
1671 if ( (u32)value != value )
1673 HVM_DBG_LOG(DBG_LEVEL_1,
1674 "Guest attempts to set upper 32 bits in CR0: %lx",
1675 value);
1676 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1677 return 0;
1680 value &= ~HVM_CR0_GUEST_RESERVED_BITS;
1682 /* ET is reserved and should always be 1. */
1683 value |= X86_CR0_ET;
1685 if ( (value & (X86_CR0_PE|X86_CR0_PG)) == X86_CR0_PG )
1687 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1688 return 0;
1691 /* TS cleared? Then initialise FPU now. */
1692 if ( !(value & X86_CR0_TS) )
1694 setup_fpu(v);
1695 vmcb->exception_intercepts &= ~(1U << TRAP_no_device);
1698 if ( (value & X86_CR0_PG) && !(old_value & X86_CR0_PG) )
1700 if ( svm_lme_is_set(v) )
1702 if ( !svm_cr4_pae_is_set(v) )
1704 HVM_DBG_LOG(DBG_LEVEL_1, "Enable paging before PAE enable");
1705 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1706 return 0;
1708 HVM_DBG_LOG(DBG_LEVEL_1, "Enable the Long mode");
1709 v->arch.hvm_svm.cpu_shadow_efer |= EFER_LMA;
1710 vmcb->efer |= EFER_LMA | EFER_LME;
1713 if ( !paging_mode_hap(v->domain) )
1715 /* The guest CR3 must point to a valid guest-physical page frame. */
1716 mfn = get_mfn_from_gpfn(v->arch.hvm_svm.cpu_cr3 >> PAGE_SHIFT);
1717 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain))
1719 gdprintk(XENLOG_ERR, "Invalid CR3 value = %lx (mfn=%lx)\n",
1720 v->arch.hvm_svm.cpu_cr3, mfn);
1721 domain_crash(v->domain);
1722 return 0;
1725 /* Now arch.guest_table points to machine physical. */
1726 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1727 v->arch.guest_table = pagetable_from_pfn(mfn);
1728 if ( old_base_mfn )
1729 put_page(mfn_to_page(old_base_mfn));
1731 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
1732 v->arch.hvm_svm.cpu_cr3, mfn);
1735 else if ( !(value & X86_CR0_PG) && (old_value & X86_CR0_PG) )
1737 /* When CR0.PG is cleared, LMA is cleared immediately. */
1738 if ( svm_long_mode_enabled(v) )
1740 vmcb->efer &= ~(EFER_LME | EFER_LMA);
1741 v->arch.hvm_svm.cpu_shadow_efer &= ~EFER_LMA;
1744 if ( !paging_mode_hap(v->domain) && v->arch.hvm_svm.cpu_cr3 )
1746 put_page(mfn_to_page(get_mfn_from_gpfn(
1747 v->arch.hvm_svm.cpu_cr3 >> PAGE_SHIFT)));
1748 v->arch.guest_table = pagetable_null();
1752 vmcb->cr0 = v->arch.hvm_svm.cpu_shadow_cr0 = value;
1753 if ( !paging_mode_hap(v->domain) )
1754 vmcb->cr0 |= X86_CR0_PG | X86_CR0_WP;
1756 if ( (value ^ old_value) & X86_CR0_PG )
1758 paging_update_paging_modes(v);
1759 svm_asid_g_update_paging(v);
1762 return 1;
1765 /*
1766 * Read from control registers. CR0 and CR4 are read from the shadow.
1767 */
1768 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
1770 unsigned long value = 0;
1771 struct vcpu *v = current;
1772 struct vlapic *vlapic = vcpu_vlapic(v);
1773 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1775 switch ( cr )
1777 case 0:
1778 value = v->arch.hvm_svm.cpu_shadow_cr0;
1779 break;
1780 case 2:
1781 value = vmcb->cr2;
1782 break;
1783 case 3:
1784 value = (unsigned long)v->arch.hvm_svm.cpu_cr3;
1785 break;
1786 case 4:
1787 value = (unsigned long)v->arch.hvm_svm.cpu_shadow_cr4;
1788 break;
1789 case 8:
1790 value = (unsigned long)vlapic_get_reg(vlapic, APIC_TASKPRI);
1791 value = (value & 0xF0) >> 4;
1792 break;
1794 default:
1795 domain_crash(v->domain);
1796 return;
1799 HVMTRACE_2D(CR_READ, v, cr, value);
1801 set_reg(gp, value, regs, vmcb);
1803 HVM_DBG_LOG(DBG_LEVEL_VMMU, "mov_from_cr: CR%d, value = %lx", cr, value);
1807 /*
1808 * Write to control registers
1809 */
1810 static int mov_to_cr(int gpreg, int cr, struct cpu_user_regs *regs)
1812 unsigned long value, old_cr, old_base_mfn, mfn;
1813 struct vcpu *v = current;
1814 struct vlapic *vlapic = vcpu_vlapic(v);
1815 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1817 value = get_reg(gpreg, regs, vmcb);
1819 HVMTRACE_2D(CR_WRITE, v, cr, value);
1821 HVM_DBG_LOG(DBG_LEVEL_1, "mov_to_cr: CR%d, value = %lx, current = %p",
1822 cr, value, v);
1824 switch ( cr )
1826 case 0:
1827 return svm_set_cr0(value);
1829 case 3:
1830 if ( paging_mode_hap(v->domain) )
1832 vmcb->cr3 = v->arch.hvm_svm.cpu_cr3 = value;
1833 break;
1836 /* If paging is not enabled yet, simply copy the value to CR3. */
1837 if ( !svm_paging_enabled(v) )
1839 v->arch.hvm_svm.cpu_cr3 = value;
1840 break;
1843 /* We make a new one if the shadow does not exist. */
1844 if ( value == v->arch.hvm_svm.cpu_cr3 )
1846 /*
1847 * This is a simple TLB flush, implying the guest has
1848 * removed some translation or changed page attributes.
1849 * We simply invalidate the shadow.
1850 */
1851 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
1852 if ( mfn != pagetable_get_pfn(v->arch.guest_table) )
1853 goto bad_cr3;
1854 paging_update_cr3(v);
1855 /* signal paging update to ASID handler */
1856 svm_asid_g_mov_to_cr3 (v);
1858 else
1860 /*
1861 * If different, make a shadow. Check if the PDBR is valid
1862 * first.
1863 */
1864 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
1865 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
1866 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
1867 goto bad_cr3;
1869 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1870 v->arch.guest_table = pagetable_from_pfn(mfn);
1872 if ( old_base_mfn )
1873 put_page(mfn_to_page(old_base_mfn));
1875 v->arch.hvm_svm.cpu_cr3 = value;
1876 update_cr3(v);
1877 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
1878 /* signal paging update to ASID handler */
1879 svm_asid_g_mov_to_cr3 (v);
1881 break;
1883 case 4: /* CR4 */
1884 if ( value & HVM_CR4_GUEST_RESERVED_BITS )
1886 HVM_DBG_LOG(DBG_LEVEL_1,
1887 "Guest attempts to set reserved bit in CR4: %lx",
1888 value);
1889 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1890 break;
1893 if ( paging_mode_hap(v->domain) )
1895 v->arch.hvm_svm.cpu_shadow_cr4 = value;
1896 vmcb->cr4 = value | (HVM_CR4_HOST_MASK & ~X86_CR4_PAE);
1897 paging_update_paging_modes(v);
1898 /* signal paging update to ASID handler */
1899 svm_asid_g_update_paging (v);
1900 break;
1903 old_cr = v->arch.hvm_svm.cpu_shadow_cr4;
1904 if ( value & X86_CR4_PAE && !(old_cr & X86_CR4_PAE) )
1906 if ( svm_pgbit_test(v) )
1908 #if CONFIG_PAGING_LEVELS >= 3
1909 /* The guest is a 32-bit PAE guest. */
1910 unsigned long mfn, old_base_mfn;
1911 mfn = get_mfn_from_gpfn(v->arch.hvm_svm.cpu_cr3 >> PAGE_SHIFT);
1912 if ( !mfn_valid(mfn) ||
1913 !get_page(mfn_to_page(mfn), v->domain) )
1914 goto bad_cr3;
1916 /*
1917 * Now arch.guest_table points to machine physical.
1918 */
1919 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1920 v->arch.guest_table = pagetable_from_pfn(mfn);
1921 if ( old_base_mfn )
1922 put_page(mfn_to_page(old_base_mfn));
1923 paging_update_paging_modes(v);
1924 /* signal paging update to ASID handler */
1925 svm_asid_g_update_paging (v);
1927 HVM_DBG_LOG(DBG_LEVEL_VMMU,
1928 "Update CR3 value = %lx, mfn = %lx",
1929 v->arch.hvm_svm.cpu_cr3, mfn);
1930 #endif
1933 else if ( !(value & X86_CR4_PAE) )
1935 if ( svm_long_mode_enabled(v) )
1937 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1941 v->arch.hvm_svm.cpu_shadow_cr4 = value;
1942 vmcb->cr4 = value | HVM_CR4_HOST_MASK;
1944 /*
1945 * Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates
1946 * all TLB entries except global entries.
1947 */
1948 if ((old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE))
1950 paging_update_paging_modes(v);
1951 /* signal paging update to ASID handler */
1952 svm_asid_g_update_paging (v);
1954 break;
1956 case 8:
1957 vlapic_set_reg(vlapic, APIC_TASKPRI, ((value & 0x0F) << 4));
1958 vmcb->vintr.fields.tpr = value & 0x0F;
1959 break;
1961 default:
1962 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
1963 domain_crash(v->domain);
1964 return 0;
1967 return 1;
1969 bad_cr3:
1970 gdprintk(XENLOG_ERR, "Invalid CR3\n");
1971 domain_crash(v->domain);
1972 return 0;
1976 #define ARR_SIZE(x) (sizeof(x) / sizeof(x[0]))
1979 static int svm_cr_access(struct vcpu *v, unsigned int cr, unsigned int type,
1980 struct cpu_user_regs *regs)
1982 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1983 int inst_len = 0;
1984 int index,addr_size,i;
1985 unsigned int gpreg,offset;
1986 unsigned long value,addr;
1987 u8 buffer[MAX_INST_LEN];
1988 u8 prefix = 0;
1989 u8 modrm;
1990 enum x86_segment seg;
1991 int result = 1;
1992 enum instruction_index list_a[] = {INSTR_MOV2CR, INSTR_CLTS, INSTR_LMSW};
1993 enum instruction_index list_b[] = {INSTR_MOVCR2, INSTR_SMSW};
1994 enum instruction_index match;
1996 inst_copy_from_guest(buffer, svm_rip2pointer(v), sizeof(buffer));
1998 /* get index to first actual instruction byte - as we will need to know
1999 where the prefix lives later on */
2000 index = skip_prefix_bytes(buffer, sizeof(buffer));
2002 if ( type == TYPE_MOV_TO_CR )
2004 inst_len = __get_instruction_length_from_list(
2005 v, list_a, ARR_SIZE(list_a), &buffer[index], &match);
2007 else /* type == TYPE_MOV_FROM_CR */
2009 inst_len = __get_instruction_length_from_list(
2010 v, list_b, ARR_SIZE(list_b), &buffer[index], &match);
2013 ASSERT(inst_len > 0);
2015 inst_len += index;
2017 /* Check for REX prefix - it's ALWAYS the last byte of any prefix bytes */
2018 if (index > 0 && (buffer[index-1] & 0xF0) == 0x40)
2019 prefix = buffer[index-1];
2021 HVM_DBG_LOG(DBG_LEVEL_1, "eip = %lx", (unsigned long) vmcb->rip);
2023 switch (match)
2025 case INSTR_MOV2CR:
2026 gpreg = decode_src_reg(prefix, buffer[index+2]);
2027 result = mov_to_cr(gpreg, cr, regs);
2028 break;
2030 case INSTR_MOVCR2:
2031 gpreg = decode_src_reg(prefix, buffer[index+2]);
2032 mov_from_cr(cr, gpreg, regs);
2033 break;
2035 case INSTR_CLTS:
2036 /* TS being cleared means that it's time to restore fpu state. */
2037 setup_fpu(current);
2038 vmcb->exception_intercepts &= ~(1U << TRAP_no_device);
2039 vmcb->cr0 &= ~X86_CR0_TS; /* clear TS */
2040 v->arch.hvm_svm.cpu_shadow_cr0 &= ~X86_CR0_TS; /* clear TS */
2041 break;
2043 case INSTR_LMSW:
2044 gpreg = decode_src_reg(prefix, buffer[index+2]);
2045 value = get_reg(gpreg, regs, vmcb) & 0xF;
2046 value = (v->arch.hvm_svm.cpu_shadow_cr0 & ~0xF) | value;
2047 result = svm_set_cr0(value);
2048 break;
2050 case INSTR_SMSW:
2051 value = v->arch.hvm_svm.cpu_shadow_cr0 & 0xFFFF;
2052 modrm = buffer[index+2];
2053 addr_size = svm_guest_x86_mode(v);
2054 if ( addr_size < 2 )
2055 addr_size = 2;
2056 if ( likely((modrm & 0xC0) >> 6 == 3) )
2058 gpreg = decode_src_reg(prefix, modrm);
2059 set_reg(gpreg, value, regs, vmcb);
2061 /*
2062 * For now, only implement decode of the offset mode, since that's the
2063 * only mode observed in a real-world OS. This code is also making the
2064 * assumption that we'll never hit this code in long mode.
2065 */
2066 else if ( (modrm == 0x26) || (modrm == 0x25) )
2068 seg = x86_seg_ds;
2069 i = index;
2070 /* Segment or address size overrides? */
2071 while ( i-- )
2073 switch ( buffer[i] )
2075 case 0x26: seg = x86_seg_es; break;
2076 case 0x2e: seg = x86_seg_cs; break;
2077 case 0x36: seg = x86_seg_ss; break;
2078 case 0x64: seg = x86_seg_fs; break;
2079 case 0x65: seg = x86_seg_gs; break;
2080 case 0x67: addr_size ^= 6; break;
2083 /* Bail unless this really is a seg_base + offset case */
2084 if ( ((modrm == 0x26) && (addr_size == 4)) ||
2085 ((modrm == 0x25) && (addr_size == 2)) )
2087 gdprintk(XENLOG_ERR, "SMSW emulation at guest address: "
2088 "%lx failed due to unhandled addressing mode."
2089 "ModRM byte was: %x \n", svm_rip2pointer(v), modrm);
2090 domain_crash(v->domain);
2092 inst_len += addr_size;
2093 offset = *(( unsigned int *) ( void *) &buffer[index + 3]);
2094 offset = ( addr_size == 4 ) ? offset : ( offset & 0xFFFF );
2095 addr = hvm_get_segment_base(v, seg);
2096 addr += offset;
2097 hvm_copy_to_guest_virt(addr,&value,2);
2099 else
2101 gdprintk(XENLOG_ERR, "SMSW emulation at guest address: %lx "
2102 "failed due to unhandled addressing mode!"
2103 "ModRM byte was: %x \n", svm_rip2pointer(v), modrm);
2104 domain_crash(v->domain);
2105 }
2106 break;
2108 default:
2109 BUG();
2110 }
2112 ASSERT(inst_len);
2114 __update_guest_eip(vmcb, inst_len);
2116 return result;
2117 }
2119 static void svm_do_msr_access(
2120 struct vcpu *v, struct cpu_user_regs *regs)
2121 {
2122 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
2123 int inst_len;
2124 u64 msr_content=0;
2125 u32 ecx = regs->ecx, eax, edx;
2127 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x, eax=%x, edx=%x, exitinfo = %lx",
2128 ecx, (u32)regs->eax, (u32)regs->edx,
2129 (unsigned long)vmcb->exitinfo1);
2131 /* is it a read? */
2132 if (vmcb->exitinfo1 == 0)
2133 {
2134 switch (ecx) {
2135 case MSR_IA32_TIME_STAMP_COUNTER:
2136 msr_content = hvm_get_guest_time(v);
2137 break;
2139 case MSR_IA32_APICBASE:
2140 msr_content = vcpu_vlapic(v)->hw.apic_base_msr;
2141 break;
2143 case MSR_EFER:
2144 msr_content = v->arch.hvm_svm.cpu_shadow_efer;
2145 break;
2147 case MSR_K8_MC4_MISC: /* Threshold register */
2148 /*
2149 * MCA/MCE: We report that the threshold register is unavailable
2150 * for OS use (locked by the BIOS).
2151 */
2152 msr_content = 1ULL << 61; /* MC4_MISC.Locked */
2153 break;
2155 case MSR_IA32_EBC_FREQUENCY_ID:
2156 /*
2157 * This Intel-only register may be accessed if this HVM guest
2158 * has been migrated from an Intel host. The value zero is not
2159 * particularly meaningful, but at least avoids the guest crashing!
2160 */
2161 msr_content = 0;
2162 break;
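/*
 * Nested SVM is not exposed to guests, so the host save area MSR is
 * treated as inaccessible and a #GP fault is injected instead.
 */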
2164 case MSR_K8_VM_HSAVE_PA:
2165 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
2166 break;
2168 case MSR_IA32_MCG_STATUS:
2169 case MSR_IA32_MC0_STATUS:
2170 case MSR_K8_MC1_STATUS:
2171 case MSR_K8_MC2_STATUS:
2172 case MSR_K8_MC3_STATUS:
2173 case MSR_K8_MC4_STATUS:
2174 /* No point in letting the guest see real MCEs */
2175 msr_content = 0;
2176 break;
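/*
 * Unrecognised MSRs fall through to the default case: first try the
 * Xen-defined hypervisor MSR range, then a fault-safe RDMSR on the
 * host; only if both fail is #GP injected into the guest.
 */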
2178 default:
2179 if ( rdmsr_hypervisor_regs(ecx, &eax, &edx) ||
2180 rdmsr_safe(ecx, eax, edx) == 0 )
2181 {
2182 regs->eax = eax;
2183 regs->edx = edx;
2184 goto done;
2185 }
2186 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
2187 return;
2188 }
2189 regs->eax = msr_content & 0xFFFFFFFF;
2190 regs->edx = msr_content >> 32;
2192 done:
2193 HVMTRACE_2D(MSR_READ, v, ecx, msr_content);
2194 HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx",
2195 ecx, (unsigned long)regs->eax, (unsigned long)regs->edx);
2197 inst_len = __get_instruction_length(v, INSTR_RDMSR, NULL);
2198 }
2199 else
2200 {
2201 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
2203 HVMTRACE_2D(MSR_WRITE, v, ecx, msr_content);
2205 switch (ecx)
2206 {
2207 case MSR_IA32_TIME_STAMP_COUNTER:
2208 hvm_set_guest_time(v, msr_content);
2209 pt_reset(v);
2210 break;
2212 case MSR_IA32_APICBASE:
2213 vlapic_msr_set(vcpu_vlapic(v), msr_content);
2214 break;
2216 case MSR_K8_VM_HSAVE_PA:
2217 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
2218 break;
2220 default:
2221 switch ( long_mode_do_msr_write(regs) )
2222 {
2223 case HNDL_unhandled:
2224 wrmsr_hypervisor_regs(ecx, regs->eax, regs->edx);
2225 break;
2226 case HNDL_exception_raised:
2227 return;
2228 case HNDL_done:
2229 break;
2230 }
2231 break;
2232 }
2234 inst_len = __get_instruction_length(v, INSTR_WRMSR, NULL);
2235 }
2237 __update_guest_eip(vmcb, inst_len);
2238 }
2240 static void svm_vmexit_do_hlt(struct vmcb_struct *vmcb)
2241 {
2242 enum hvm_intack type = hvm_vcpu_has_pending_irq(current);
2244 __update_guest_eip(vmcb, 1);
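/*
 * HLT is a single-byte opcode (0xF4), so RIP is simply advanced by one.
 * If an event is already pending injection, or an unmasked interrupt is
 * waiting, the vcpu is not actually halted: returning here lets the
 * guest service the interrupt immediately instead of sleeping on it.
 */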
2246 /* Check for interrupt not handled or new interrupt. */
2247 if ( vmcb->eventinj.fields.v ||
2248 ((type != hvm_intack_none) && svm_interrupts_enabled(current, type)) )
2249 {
2250 HVMTRACE_1D(HLT, current, /*int pending=*/ 1);
2251 return;
2252 }
2254 HVMTRACE_1D(HLT, current, /*int pending=*/ 0);
2255 hvm_hlt(vmcb->rflags);
2256 }
2258 static void svm_vmexit_do_invd(struct vcpu *v)
2259 {
2260 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
2261 int inst_len;
2263 /* Invalidate the cache - we can't really do that safely - maybe we should
2264 * WBINVD, but I think it's just fine to completely ignore it - we should
2265 * have cache-snooping that solves it anyways. -- Mats P.
2266 */
2268 /* Tell the user that we did this - just in case someone runs some really
2269 * weird operating system and wants to know why it's not working...
2270 */
2271 gdprintk(XENLOG_WARNING, "INVD instruction intercepted - ignored\n");
2273 inst_len = __get_instruction_length(v, INSTR_INVD, NULL);
2274 __update_guest_eip(vmcb, inst_len);
2275 }
2277 void svm_handle_invlpg(const short invlpga, struct cpu_user_regs *regs)
2278 {
2279 struct vcpu *v = current;
2280 u8 opcode[MAX_INST_LEN], prefix, length = MAX_INST_LEN;
2281 unsigned long g_vaddr;
2282 int inst_len;
2283 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
2285 /*
2286 * Unknown how many bytes the invlpg instruction will take. Use the
2287 * maximum instruction length here
2288 */
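/*
 * An x86 instruction is architecturally limited to 15 bytes, so
 * fetching the maximum instruction length is always sufficient; the
 * real length is only known once the instruction has been decoded.
 */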
2289 if (inst_copy_from_guest(opcode, svm_rip2pointer(v), length) < length)
2290 {
2291 gdprintk(XENLOG_ERR, "Error reading memory %d bytes\n", length);
2292 domain_crash(v->domain);
2293 return;
2294 }
2296 if (invlpga)
2297 {
2298 inst_len = __get_instruction_length(v, INSTR_INVLPGA, opcode);
2299 ASSERT(inst_len > 0);
2300 __update_guest_eip(vmcb, inst_len);
2302 /*
2303 * The address is implicit on this instruction. At the moment, we don't
2304 * use ecx (ASID) to identify individual guests pages
2305 */
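/*
 * INVLPGA takes its linear address in rAX and an ASID in ECX; only the
 * address is used here, the ASID operand is ignored.
 */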
2306 g_vaddr = regs->eax;
2307 }
2308 else
2309 {
2310 /* What about multiple prefix codes? */
2311 prefix = (is_prefix(opcode[0])?opcode[0]:0);
2312 inst_len = __get_instruction_length(v, INSTR_INVLPG, opcode);
2313 ASSERT(inst_len > 0);
2315 inst_len--;
2316 length -= inst_len;
2318 /*
2319 * Decode memory operand of the instruction including ModRM, SIB, and
2320 * displacement to get effective address and length in bytes. Assume
2321 * the system in either 32- or 64-bit mode.
2322 */
2323 g_vaddr = get_effective_addr_modrm64(regs, prefix, inst_len,
2324 &opcode[inst_len], &length);
2326 inst_len += length;
2327 __update_guest_eip (vmcb, inst_len);
2328 }
2330 HVMTRACE_3D(INVLPG, v, (invlpga?1:0), g_vaddr, (invlpga?regs->ecx:0));
2332 paging_invlpg(v, g_vaddr);
2333 /* signal invlpg to ASID handler */
2334 svm_asid_g_invlpg (v, g_vaddr);
2335 }
2338 /*
2339 * Reset to realmode causes execution to start at 0xF000:0xFFF0 in
2340 * 16-bit realmode. Basically, this mimics a processor reset.
2342 * returns 0 on success, non-zero otherwise
2343 */
2344 static int svm_reset_to_realmode(struct vcpu *v,
2345 struct cpu_user_regs *regs)
2346 {
2347 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
2349 /* clear the vmcb and user regs */
2350 memset(regs, 0, sizeof(struct cpu_user_regs));
2352 /* VMCB State */
2353 vmcb->cr0 = X86_CR0_ET | X86_CR0_PG | X86_CR0_WP;
2354 v->arch.hvm_svm.cpu_shadow_cr0 = X86_CR0_ET;
2356 vmcb->cr2 = 0;
2357 vmcb->efer = EFER_SVME;
2359 vmcb->cr4 = HVM_CR4_HOST_MASK;
2360 v->arch.hvm_svm.cpu_shadow_cr4 = 0;
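/*
 * With hardware-assisted paging the guest's own (shadow) CR0/CR4 values
 * can be installed directly, since paging no longer has to be forced on
 * for Xen's shadow page tables.
 */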
2362 if ( paging_mode_hap(v->domain) ) {
2363 vmcb->cr0 = v->arch.hvm_svm.cpu_shadow_cr0;
2364 vmcb->cr4 = v->arch.hvm_svm.cpu_shadow_cr4 |
2365 (HVM_CR4_HOST_MASK & ~X86_CR4_PAE);
2366 }
2368 /* This will jump to ROMBIOS */
2369 vmcb->rip = 0xFFF0;
2371 /* setup the segment registers and all their hidden states */
2372 vmcb->cs.sel = 0xF000;
2373 vmcb->cs.attr.bytes = 0x089b;
2374 vmcb->cs.limit = 0xffff;
2375 vmcb->cs.base = 0x000F0000;
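/*
 * CS.base 0xF0000 with RIP 0xFFF0 makes the first fetch come from
 * physical address 0xFFFF0, the architectural reset vector at the top
 * of the ROM BIOS.
 */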
2377 vmcb->ss.sel = 0x00;
2378 vmcb->ss.attr.bytes = 0x0893;
2379 vmcb->ss.limit = 0xffff;
2380 vmcb->ss.base = 0x00;
2382 vmcb->ds.sel = 0x00;
2383 vmcb->ds.attr.bytes = 0x0893;
2384 vmcb->ds.limit = 0xffff;
2385 vmcb->ds.base = 0x00;
2387 vmcb->es.sel = 0x00;
2388 vmcb->es.attr.bytes = 0x0893;
2389 vmcb->es.limit = 0xffff;
2390 vmcb->es.base = 0x00;
2392 vmcb->fs.sel = 0x00;
2393 vmcb->fs.attr.bytes = 0x0893;
2394 vmcb->fs.limit = 0xffff;
2395 vmcb->fs.base = 0x00;
2397 vmcb->gs.sel = 0x00;
2398 vmcb->gs.attr.bytes = 0x0893;
2399 vmcb->gs.limit = 0xffff;
2400 vmcb->gs.base = 0x00;
2402 vmcb->ldtr.sel = 0x00;
2403 vmcb->ldtr.attr.bytes = 0x0000;
2404 vmcb->ldtr.limit = 0x0;
2405 vmcb->ldtr.base = 0x00;
2407 vmcb->gdtr.sel = 0x00;
2408 vmcb->gdtr.attr.bytes = 0x0000;
2409 vmcb->gdtr.limit = 0x0;
2410 vmcb->gdtr.base = 0x00;
2412 vmcb->tr.sel = 0;
2413 vmcb->tr.attr.bytes = 0;
2414 vmcb->tr.limit = 0x0;
2415 vmcb->tr.base = 0;
2417 vmcb->idtr.sel = 0x00;
2418 vmcb->idtr.attr.bytes = 0x0000;
2419 vmcb->idtr.limit = 0x3ff;
2420 vmcb->idtr.base = 0x00;
2422 vmcb->rax = 0;
2423 vmcb->rsp = 0;
2425 return 0;
2426 }
2428 asmlinkage void svm_vmexit_handler(struct cpu_user_regs *regs)
2429 {
2430 unsigned int exit_reason;
2431 unsigned long eip;
2432 struct vcpu *v = current;
2433 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
2434 int inst_len, rc;
2436 exit_reason = vmcb->exitcode;
2438 HVMTRACE_2D(VMEXIT, v, vmcb->rip, exit_reason);
2440 if ( unlikely(exit_reason == VMEXIT_INVALID) )
2441 {
2442 svm_dump_vmcb(__func__, vmcb);
2443 goto exit_and_crash;
2444 }
2446 perfc_incra(svmexits, exit_reason);
2447 eip = vmcb->rip;
2449 switch ( exit_reason )
2450 {
2451 case VMEXIT_INTR:
2452 /* Asynchronous event, handled when we STGI'd after the VMEXIT. */
2453 HVMTRACE_0D(INTR, v);
2454 break;
2456 case VMEXIT_NMI:
2457 /* Asynchronous event, handled when we STGI'd after the VMEXIT. */
2458 HVMTRACE_0D(NMI, v);
2459 break;
2461 case VMEXIT_SMI:
2462 /* Asynchronous event, handled when we STGI'd after the VMEXIT. */
2463 HVMTRACE_0D(SMI, v);
2464 break;
2466 case VMEXIT_EXCEPTION_DB:
2467 if ( !v->domain->debugger_attached )
2468 goto exit_and_crash;
2469 domain_pause_for_debugger();
2470 break;
2472 case VMEXIT_EXCEPTION_BP:
2473 if ( !v->domain->debugger_attached )
2474 goto exit_and_crash;
2475 /* AMD Vol2, 15.11: INT3, INTO, BOUND intercepts do not update RIP. */
2476 inst_len = __get_instruction_length(v, INSTR_INT3, NULL);
2477 __update_guest_eip(vmcb, inst_len);
2478 domain_pause_for_debugger();
2479 break;
2481 case VMEXIT_EXCEPTION_NM:
2482 svm_do_no_device_fault(vmcb);
2483 break;
2485 case VMEXIT_EXCEPTION_PF: {
2486 unsigned long va;
2487 va = vmcb->exitinfo2;
2488 regs->error_code = vmcb->exitinfo1;
2489 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2490 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
2491 (unsigned long)regs->eax, (unsigned long)regs->ebx,
2492 (unsigned long)regs->ecx, (unsigned long)regs->edx,
2493 (unsigned long)regs->esi, (unsigned long)regs->edi);
2495 if ( paging_fault(va, regs) )
2496 {
2497 HVMTRACE_2D(PF_XEN, v, va, regs->error_code);
2498 break;
2499 }
2501 v->arch.hvm_svm.cpu_cr2 = vmcb->cr2 = va;
2502 svm_inject_exception(v, TRAP_page_fault, 1, regs->error_code);
2503 break;
2504 }
2506 case VMEXIT_EXCEPTION_MC:
2507 HVMTRACE_0D(MCE, v);
2508 svm_store_cpu_guest_regs(v, regs, NULL);
2509 do_machine_check(regs);
2510 break;
2512 case VMEXIT_VINTR:
2513 vmcb->vintr.fields.irq = 0;
2514 vmcb->general1_intercepts &= ~GENERAL1_INTERCEPT_VINTR;
2515 break;
2517 case VMEXIT_INVD:
2518 svm_vmexit_do_invd(v);
2519 break;
2521 case VMEXIT_GDTR_WRITE:
2522 printk("WRITE to GDTR\n");
2523 break;
2525 case VMEXIT_TASK_SWITCH:
2526 goto exit_and_crash;
2528 case VMEXIT_CPUID:
2529 svm_vmexit_do_cpuid(vmcb, regs);
2530 break;
2532 case VMEXIT_HLT:
2533 svm_vmexit_do_hlt(vmcb);
2534 break;
2536 case VMEXIT_INVLPG:
2537 svm_handle_invlpg(0, regs);
2538 break;
2540 case VMEXIT_INVLPGA:
2541 svm_handle_invlpg(1, regs);
2542 break;
2544 case VMEXIT_VMMCALL:
2545 inst_len = __get_instruction_length(v, INSTR_VMCALL, NULL);
2546 ASSERT(inst_len > 0);
2547 HVMTRACE_1D(VMMCALL, v, regs->eax);
2548 rc = hvm_do_hypercall(regs);
2549 if ( rc != HVM_HCALL_preempted )
2550 {
2551 __update_guest_eip(vmcb, inst_len);
2552 if ( rc == HVM_HCALL_invalidate )
2553 send_invalidate_req();
2554 }
2555 break;
2557 case VMEXIT_CR0_READ:
2558 svm_cr_access(v, 0, TYPE_MOV_FROM_CR, regs);
2559 break;
2561 case VMEXIT_CR2_READ:
2562 svm_cr_access(v, 2, TYPE_MOV_FROM_CR, regs);
2563 break;
2565 case VMEXIT_CR3_READ:
2566 svm_cr_access(v, 3, TYPE_MOV_FROM_CR, regs);
2567 break;
2569 case VMEXIT_CR4_READ:
2570 svm_cr_access(v, 4, TYPE_MOV_FROM_CR, regs);
2571 break;
2573 case VMEXIT_CR8_READ:
2574 svm_cr_access(v, 8, TYPE_MOV_FROM_CR, regs);
2575 break;
2577 case VMEXIT_CR0_WRITE:
2578 svm_cr_access(v, 0, TYPE_MOV_TO_CR, regs);
2579 break;
2581 case VMEXIT_CR2_WRITE:
2582 svm_cr_access(v, 2, TYPE_MOV_TO_CR, regs);
2583 break;
2585 case VMEXIT_CR3_WRITE:
2586 svm_cr_access(v, 3, TYPE_MOV_TO_CR, regs);
2587 local_flush_tlb();
2588 break;
2590 case VMEXIT_CR4_WRITE:
2591 svm_cr_access(v, 4, TYPE_MOV_TO_CR, regs);
2592 break;
2594 case VMEXIT_CR8_WRITE:
2595 svm_cr_access(v, 8, TYPE_MOV_TO_CR, regs);
2596 break;
2598 case VMEXIT_DR0_WRITE ... VMEXIT_DR7_WRITE:
2599 svm_dr_access(v, regs);
2600 break;
2602 case VMEXIT_IOIO:
2603 svm_io_instruction(v);
2604 break;
2606 case VMEXIT_MSR:
2607 svm_do_msr_access(v, regs);
2608 break;
2610 case VMEXIT_SHUTDOWN:
2611 hvm_triple_fault();
2612 break;
2614 case VMEXIT_VMRUN:
2615 case VMEXIT_VMLOAD:
2616 case VMEXIT_VMSAVE:
2617 case VMEXIT_STGI:
2618 case VMEXIT_CLGI:
2619 case VMEXIT_SKINIT:
2620 /* Report "Invalid opcode" on any VM-operation except VMMCALL */
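/*
 * This matches what a guest would see on a processor with EFER.SVME
 * clear: the SVM instructions raise #UD when SVM is not enabled.
 */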
2621 svm_inject_exception(v, TRAP_invalid_op, 0, 0);
2622 break;
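/*
 * For nested page faults, exitinfo1 carries a page-fault-style error
 * code and exitinfo2 the faulting guest physical address.
 */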
2624 case VMEXIT_NPF:
2625 regs->error_code = vmcb->exitinfo1;
2626 if ( !svm_do_nested_pgfault(vmcb->exitinfo2, regs) )
2627 domain_crash(v->domain);
2628 break;
2630 default:
2631 exit_and_crash:
2632 gdprintk(XENLOG_ERR, "unexpected VMEXIT: exit reason = 0x%x, "
2633 "exitinfo1 = %"PRIx64", exitinfo2 = %"PRIx64"\n",
2634 exit_reason,
2635 (u64)vmcb->exitinfo1, (u64)vmcb->exitinfo2);
2636 domain_crash(v->domain);
2637 break;
2638 }
2639 }
2641 asmlinkage void svm_trace_vmentry(void)
2642 {
2643 struct vcpu *v = current;
2645 /* This is the last C code before the VMRUN instruction. */
2646 HVMTRACE_0D(VMENTRY, v);
2647 }
2649 /*
2650 * Local variables:
2651 * mode: C
2652 * c-set-style: "BSD"
2653 * c-basic-offset: 4
2654 * tab-width: 4
2655 * indent-tabs-mode: nil
2656 * End:
2657 */