direct-io.hg

view xen/arch/x86/hvm/svm/svm.c @ 15454:83cbda5c1e1b

x86-64: bump STACK_SIZE to 32 so that trampoline and IST stacks fit
without undue squeezing.
Signed-off-by: Jan Beulich <jbeulich@novell.com>
author kfraser@localhost.localdomain
date Tue Jul 03 11:41:25 2007 +0100 (2007-07-03)
1 /*
2 * svm.c: handling SVM architecture-related VM exits
3 * Copyright (c) 2004, Intel Corporation.
4 * Copyright (c) 2005-2007, Advanced Micro Devices, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
17 * Place - Suite 330, Boston, MA 02111-1307 USA.
18 */
20 #include <xen/config.h>
21 #include <xen/init.h>
22 #include <xen/lib.h>
23 #include <xen/trace.h>
24 #include <xen/sched.h>
25 #include <xen/irq.h>
26 #include <xen/softirq.h>
27 #include <xen/hypercall.h>
28 #include <xen/domain_page.h>
29 #include <asm/current.h>
30 #include <asm/io.h>
31 #include <asm/paging.h>
32 #include <asm/p2m.h>
33 #include <asm/regs.h>
34 #include <asm/cpufeature.h>
35 #include <asm/processor.h>
36 #include <asm/types.h>
37 #include <asm/msr.h>
38 #include <asm/spinlock.h>
39 #include <asm/hvm/hvm.h>
40 #include <asm/hvm/support.h>
41 #include <asm/hvm/io.h>
42 #include <asm/hvm/svm/asid.h>
43 #include <asm/hvm/svm/svm.h>
44 #include <asm/hvm/svm/vmcb.h>
45 #include <asm/hvm/svm/emulate.h>
46 #include <asm/hvm/svm/intr.h>
47 #include <asm/x86_emulate.h>
48 #include <public/sched.h>
49 #include <asm/hvm/vpt.h>
50 #include <asm/hvm/trace.h>
51 #include <asm/hap.h>
53 #define set_segment_register(name, value) \
54 asm volatile ( "movw %%ax ,%%" STR(name) "" : : "a" (value) )
56 int inst_copy_from_guest(unsigned char *buf, unsigned long guest_eip,
57 int inst_len);
58 asmlinkage void do_IRQ(struct cpu_user_regs *);
60 static int svm_reset_to_realmode(struct vcpu *v,
61 struct cpu_user_regs *regs);
63 /* va of hardware host save area */
64 static void *hsa[NR_CPUS] __read_mostly;
66 /* vmcb used for extended host state */
67 static void *root_vmcb[NR_CPUS] __read_mostly;
69 /* hardware assisted paging bits */
70 extern int opt_hap_enabled;
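/* Queue an exception in the VMCB's EVENTINJ field; the processor delivers
 * it to the guest on the next VMRUN. */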
72 static inline void svm_inject_exception(struct vcpu *v, int trap,
73 int ev, int error_code)
74 {
75 eventinj_t event;
76 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
78 if ( trap == TRAP_page_fault )
79 HVMTRACE_2D(PF_INJECT, v, v->arch.hvm_svm.cpu_cr2, error_code);
80 else
81 HVMTRACE_2D(INJ_EXC, v, trap, error_code);
83 event.bytes = 0;
84 event.fields.v = 1;
85 event.fields.type = EVENTTYPE_EXCEPTION;
86 event.fields.vector = trap;
87 event.fields.ev = ev;
88 event.fields.errorcode = error_code;
90 ASSERT(vmcb->eventinj.fields.v == 0);
92 vmcb->eventinj = event;
93 }
95 static void stop_svm(void)
96 {
97 /* We turn off the EFER_SVME bit. */
98 write_efer(read_efer() & ~EFER_SVME);
99 }
101 static void svm_store_cpu_guest_regs(
102 struct vcpu *v, struct cpu_user_regs *regs, unsigned long *crs)
103 {
104 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
106 if ( regs != NULL )
107 {
108 regs->ss = vmcb->ss.sel;
109 regs->esp = vmcb->rsp;
110 regs->eflags = vmcb->rflags;
111 regs->cs = vmcb->cs.sel;
112 regs->eip = vmcb->rip;
113 }
115 if ( crs != NULL )
116 {
117 /* Returning the guest's regs */
118 crs[0] = v->arch.hvm_svm.cpu_shadow_cr0;
119 crs[2] = v->arch.hvm_svm.cpu_cr2;
120 crs[3] = v->arch.hvm_svm.cpu_cr3;
121 crs[4] = v->arch.hvm_svm.cpu_shadow_cr4;
122 }
123 }
125 static inline int long_mode_do_msr_write(struct cpu_user_regs *regs)
126 {
127 u64 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
128 u32 ecx = regs->ecx;
129 struct vcpu *v = current;
130 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
132 HVM_DBG_LOG(DBG_LEVEL_0, "msr %x msr_content %"PRIx64,
133 ecx, msr_content);
135 switch ( ecx )
136 {
137 case MSR_EFER:
138 /* Offending reserved bit will cause #GP. */
139 #ifdef __x86_64__
140 if ( (msr_content & ~(EFER_LME | EFER_LMA | EFER_NX | EFER_SCE)) ||
141 #else
142 if ( (msr_content & ~(EFER_NX | EFER_SCE)) ||
143 #endif
144 (!cpu_has_nx && (msr_content & EFER_NX)) ||
145 (!cpu_has_syscall && (msr_content & EFER_SCE)) )
146 {
147 gdprintk(XENLOG_WARNING, "Trying to set reserved bit in "
148 "EFER: %"PRIx64"\n", msr_content);
149 goto gp_fault;
150 }
152 #ifdef __x86_64__
153 if ( (msr_content & EFER_LME) && !svm_lme_is_set(v) )
154 {
155 /* EFER.LME transition from 0 to 1. */
156 if ( svm_paging_enabled(v) || !svm_cr4_pae_is_set(v) )
157 {
158 gdprintk(XENLOG_WARNING, "Trying to set LME bit when "
159 "in paging mode or PAE bit is not set\n");
160 goto gp_fault;
161 }
162 }
163 else if ( !(msr_content & EFER_LME) && svm_lme_is_set(v) )
164 {
165 /* EFER.LME transition from 1 to 0. */
166 if ( svm_paging_enabled(v) )
167 {
168 gdprintk(XENLOG_WARNING,
169 "Trying to clear EFER.LME while paging enabled\n");
170 goto gp_fault;
171 }
172 }
173 #endif /* __x86_64__ */
175 v->arch.hvm_svm.cpu_shadow_efer = msr_content;
176 vmcb->efer = msr_content | EFER_SVME;
177 if ( !svm_paging_enabled(v) )
178 vmcb->efer &= ~(EFER_LME | EFER_LMA);
180 break;
182 case MSR_K8_MC4_MISC: /* Threshold register */
183 /*
184 * MCA/MCE: Threshold register is reported to be locked, so we ignore
185 * all write accesses. This behaviour matches real HW, so guests should
186 * have no problem with this.
187 */
188 break;
190 default:
191 return 0;
192 }
194 return 1;
196 gp_fault:
197 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
198 return 0;
199 }
202 #define loaddebug(_v,_reg) \
203 asm volatile ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
204 #define savedebug(_v,_reg) \
205 asm volatile ("mov %%db" #_reg ",%0" : "=r" ((_v)->debugreg[_reg]))
207 static inline void svm_save_dr(struct vcpu *v)
208 {
209 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
211 if ( !v->arch.hvm_vcpu.flag_dr_dirty )
212 return;
214 /* Clear the DR dirty flag and re-enable intercepts for DR accesses. */
215 v->arch.hvm_vcpu.flag_dr_dirty = 0;
216 v->arch.hvm_svm.vmcb->dr_intercepts = DR_INTERCEPT_ALL_WRITES;
218 savedebug(&v->arch.guest_context, 0);
219 savedebug(&v->arch.guest_context, 1);
220 savedebug(&v->arch.guest_context, 2);
221 savedebug(&v->arch.guest_context, 3);
222 v->arch.guest_context.debugreg[6] = vmcb->dr6;
223 v->arch.guest_context.debugreg[7] = vmcb->dr7;
224 }
227 static inline void __restore_debug_registers(struct vcpu *v)
228 {
229 loaddebug(&v->arch.guest_context, 0);
230 loaddebug(&v->arch.guest_context, 1);
231 loaddebug(&v->arch.guest_context, 2);
232 loaddebug(&v->arch.guest_context, 3);
233 /* DR6 and DR7 are loaded from the VMCB. */
234 }
237 int svm_vmcb_save(struct vcpu *v, struct hvm_hw_cpu *c)
238 {
239 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
241 c->rip = vmcb->rip;
243 #ifdef HVM_DEBUG_SUSPEND
244 printk("%s: eip=0x%"PRIx64".\n",
245 __func__,
246 c->rip);
247 #endif
249 c->rsp = vmcb->rsp;
250 c->rflags = vmcb->rflags;
252 c->cr0 = v->arch.hvm_svm.cpu_shadow_cr0;
253 c->cr2 = v->arch.hvm_svm.cpu_cr2;
254 c->cr3 = v->arch.hvm_svm.cpu_cr3;
255 c->cr4 = v->arch.hvm_svm.cpu_shadow_cr4;
257 #ifdef HVM_DEBUG_SUSPEND
258 printk("%s: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
259 __func__,
260 c->cr3,
261 c->cr0,
262 c->cr4);
263 #endif
265 c->idtr_limit = vmcb->idtr.limit;
266 c->idtr_base = vmcb->idtr.base;
268 c->gdtr_limit = vmcb->gdtr.limit;
269 c->gdtr_base = vmcb->gdtr.base;
271 c->cs_sel = vmcb->cs.sel;
272 c->cs_limit = vmcb->cs.limit;
273 c->cs_base = vmcb->cs.base;
274 c->cs_arbytes = vmcb->cs.attr.bytes;
276 c->ds_sel = vmcb->ds.sel;
277 c->ds_limit = vmcb->ds.limit;
278 c->ds_base = vmcb->ds.base;
279 c->ds_arbytes = vmcb->ds.attr.bytes;
281 c->es_sel = vmcb->es.sel;
282 c->es_limit = vmcb->es.limit;
283 c->es_base = vmcb->es.base;
284 c->es_arbytes = vmcb->es.attr.bytes;
286 c->ss_sel = vmcb->ss.sel;
287 c->ss_limit = vmcb->ss.limit;
288 c->ss_base = vmcb->ss.base;
289 c->ss_arbytes = vmcb->ss.attr.bytes;
291 c->fs_sel = vmcb->fs.sel;
292 c->fs_limit = vmcb->fs.limit;
293 c->fs_base = vmcb->fs.base;
294 c->fs_arbytes = vmcb->fs.attr.bytes;
296 c->gs_sel = vmcb->gs.sel;
297 c->gs_limit = vmcb->gs.limit;
298 c->gs_base = vmcb->gs.base;
299 c->gs_arbytes = vmcb->gs.attr.bytes;
301 c->tr_sel = vmcb->tr.sel;
302 c->tr_limit = vmcb->tr.limit;
303 c->tr_base = vmcb->tr.base;
304 c->tr_arbytes = vmcb->tr.attr.bytes;
306 c->ldtr_sel = vmcb->ldtr.sel;
307 c->ldtr_limit = vmcb->ldtr.limit;
308 c->ldtr_base = vmcb->ldtr.base;
309 c->ldtr_arbytes = vmcb->ldtr.attr.bytes;
311 c->sysenter_cs = vmcb->sysenter_cs;
312 c->sysenter_esp = vmcb->sysenter_esp;
313 c->sysenter_eip = vmcb->sysenter_eip;
315 /* Save any event/interrupt that was being injected when we last exited. */
316 if ( vmcb->exitintinfo.fields.v )
317 {
318 c->pending_event = vmcb->exitintinfo.bytes & 0xffffffff;
319 c->error_code = vmcb->exitintinfo.fields.errorcode;
320 }
321 else if ( vmcb->eventinj.fields.v )
322 {
323 c->pending_event = vmcb->eventinj.bytes & 0xffffffff;
324 c->error_code = vmcb->eventinj.fields.errorcode;
325 }
326 else
327 {
328 c->pending_event = 0;
329 c->error_code = 0;
330 }
332 return 1;
333 }
336 int svm_vmcb_restore(struct vcpu *v, struct hvm_hw_cpu *c)
337 {
338 unsigned long mfn, old_base_mfn;
339 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
341 vmcb->rip = c->rip;
342 vmcb->rsp = c->rsp;
343 vmcb->rflags = c->rflags;
345 v->arch.hvm_svm.cpu_shadow_cr0 = c->cr0;
346 vmcb->cr0 = c->cr0 | X86_CR0_WP | X86_CR0_ET | X86_CR0_PG;
348 v->arch.hvm_svm.cpu_cr2 = c->cr2;
350 #ifdef HVM_DEBUG_SUSPEND
351 printk("%s: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
352 __func__,
353 c->cr3,
354 c->cr0,
355 c->cr4);
356 #endif
358 if ( !svm_paging_enabled(v) )
359 {
360 printk("%s: paging not enabled.\n", __func__);
361 goto skip_cr3;
362 }
364 if ( c->cr3 == v->arch.hvm_svm.cpu_cr3 )
365 {
366 /*
367 * This is a simple TLB flush, implying the guest has
368 * removed some translation or changed page attributes.
369 * We simply invalidate the shadow.
370 */
371 mfn = gmfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT);
372 if ( mfn != pagetable_get_pfn(v->arch.guest_table) )
373 goto bad_cr3;
374 }
375 else
376 {
377 /*
378 * If different, make a shadow. Check if the PDBR is valid
379 * first.
380 */
381 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %"PRIx64, c->cr3);
382 mfn = gmfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT);
383 if( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
384 goto bad_cr3;
386 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
387 v->arch.guest_table = pagetable_from_pfn(mfn);
388 if (old_base_mfn)
389 put_page(mfn_to_page(old_base_mfn));
390 v->arch.hvm_svm.cpu_cr3 = c->cr3;
391 }
393 skip_cr3:
394 vmcb->cr4 = c->cr4 | HVM_CR4_HOST_MASK;
395 v->arch.hvm_svm.cpu_shadow_cr4 = c->cr4;
397 vmcb->idtr.limit = c->idtr_limit;
398 vmcb->idtr.base = c->idtr_base;
400 vmcb->gdtr.limit = c->gdtr_limit;
401 vmcb->gdtr.base = c->gdtr_base;
403 vmcb->cs.sel = c->cs_sel;
404 vmcb->cs.limit = c->cs_limit;
405 vmcb->cs.base = c->cs_base;
406 vmcb->cs.attr.bytes = c->cs_arbytes;
408 vmcb->ds.sel = c->ds_sel;
409 vmcb->ds.limit = c->ds_limit;
410 vmcb->ds.base = c->ds_base;
411 vmcb->ds.attr.bytes = c->ds_arbytes;
413 vmcb->es.sel = c->es_sel;
414 vmcb->es.limit = c->es_limit;
415 vmcb->es.base = c->es_base;
416 vmcb->es.attr.bytes = c->es_arbytes;
418 vmcb->ss.sel = c->ss_sel;
419 vmcb->ss.limit = c->ss_limit;
420 vmcb->ss.base = c->ss_base;
421 vmcb->ss.attr.bytes = c->ss_arbytes;
422 vmcb->cpl = vmcb->ss.attr.fields.dpl;
424 vmcb->fs.sel = c->fs_sel;
425 vmcb->fs.limit = c->fs_limit;
426 vmcb->fs.base = c->fs_base;
427 vmcb->fs.attr.bytes = c->fs_arbytes;
429 vmcb->gs.sel = c->gs_sel;
430 vmcb->gs.limit = c->gs_limit;
431 vmcb->gs.base = c->gs_base;
432 vmcb->gs.attr.bytes = c->gs_arbytes;
434 vmcb->tr.sel = c->tr_sel;
435 vmcb->tr.limit = c->tr_limit;
436 vmcb->tr.base = c->tr_base;
437 vmcb->tr.attr.bytes = c->tr_arbytes;
439 vmcb->ldtr.sel = c->ldtr_sel;
440 vmcb->ldtr.limit = c->ldtr_limit;
441 vmcb->ldtr.base = c->ldtr_base;
442 vmcb->ldtr.attr.bytes = c->ldtr_arbytes;
444 vmcb->sysenter_cs = c->sysenter_cs;
445 vmcb->sysenter_esp = c->sysenter_esp;
446 vmcb->sysenter_eip = c->sysenter_eip;
448 /* update VMCB for nested paging restore */
449 if ( paging_mode_hap(v->domain) ) {
450 vmcb->cr0 = v->arch.hvm_svm.cpu_shadow_cr0;
451 vmcb->cr4 = v->arch.hvm_svm.cpu_shadow_cr4 |
452 (HVM_CR4_HOST_MASK & ~X86_CR4_PAE);
453 vmcb->cr3 = c->cr3;
454 vmcb->np_enable = 1;
455 vmcb->g_pat = 0x0007040600070406ULL; /* guest PAT */
456 vmcb->h_cr3 = pagetable_get_paddr(v->domain->arch.phys_table);
457 }
459 vmcb->dr6 = c->dr6;
460 vmcb->dr7 = c->dr7;
462 if ( c->pending_valid )
463 {
464 gdprintk(XENLOG_INFO, "Re-injecting 0x%"PRIx32", 0x%"PRIx32"\n",
465 c->pending_event, c->error_code);
467 /* VMX uses a different type for #OF and #BP; fold into "Exception" */
468 if ( c->pending_type == 6 )
469 c->pending_type = 3;
470 /* Sanity check */
471 if ( c->pending_type == 1 || c->pending_type > 4
472 || c->pending_reserved != 0 )
473 {
474 gdprintk(XENLOG_ERR, "Invalid pending event 0x%"PRIx32"\n",
475 c->pending_event);
476 return -EINVAL;
477 }
478 /* Put this pending event in exitintinfo and svm_intr_assist()
479 * will reinject it when we return to the guest. */
480 vmcb->exitintinfo.bytes = c->pending_event;
481 vmcb->exitintinfo.fields.errorcode = c->error_code;
482 }
484 paging_update_paging_modes(v);
485 /* signal paging update to ASID handler */
486 svm_asid_g_update_paging (v);
488 return 0;
490 bad_cr3:
491 gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%"PRIx64"\n", c->cr3);
492 return -EINVAL;
493 }
496 static void svm_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
497 {
498 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
500 data->shadow_gs = vmcb->kerngsbase;
501 data->msr_lstar = vmcb->lstar;
502 data->msr_star = vmcb->star;
503 data->msr_cstar = vmcb->cstar;
504 data->msr_syscall_mask = vmcb->sfmask;
505 data->msr_efer = v->arch.hvm_svm.cpu_shadow_efer;
506 data->msr_flags = -1ULL;
508 data->tsc = hvm_get_guest_time(v);
509 }
512 static void svm_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
513 {
514 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
516 vmcb->kerngsbase = data->shadow_gs;
517 vmcb->lstar = data->msr_lstar;
518 vmcb->star = data->msr_star;
519 vmcb->cstar = data->msr_cstar;
520 vmcb->sfmask = data->msr_syscall_mask;
521 v->arch.hvm_svm.cpu_shadow_efer = data->msr_efer;
522 vmcb->efer = data->msr_efer | EFER_SVME;
523 /* VMCB's EFER.LME isn't set unless we're actually in long mode
524 * (see long_mode_do_msr_write()) */
525 if ( !(vmcb->efer & EFER_LMA) )
526 vmcb->efer &= ~EFER_LME;
528 hvm_set_guest_time(v, data->tsc);
529 }
531 static void svm_save_vmcb_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
532 {
533 svm_save_cpu_state(v, ctxt);
534 svm_vmcb_save(v, ctxt);
535 }
537 static int svm_load_vmcb_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
538 {
539 svm_load_cpu_state(v, ctxt);
540 if (svm_vmcb_restore(v, ctxt)) {
541 printk("svm_vmcb restore failed!\n");
542 domain_crash(v->domain);
543 return -EINVAL;
544 }
546 return 0;
547 }
549 static inline void svm_restore_dr(struct vcpu *v)
550 {
551 if ( unlikely(v->arch.guest_context.debugreg[7] & 0xFF) )
552 __restore_debug_registers(v);
553 }
555 static int svm_interrupts_enabled(struct vcpu *v, enum hvm_intack type)
556 {
557 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
559 if ( type == hvm_intack_nmi )
560 return !vmcb->interrupt_shadow;
562 ASSERT((type == hvm_intack_pic) || (type == hvm_intack_lapic));
563 return !irq_masked(vmcb->rflags) && !vmcb->interrupt_shadow;
564 }
566 static int svm_guest_x86_mode(struct vcpu *v)
567 {
568 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
570 if ( unlikely(!(v->arch.hvm_svm.cpu_shadow_cr0 & X86_CR0_PE)) )
571 return 0;
572 if ( unlikely(vmcb->rflags & X86_EFLAGS_VM) )
573 return 1;
574 if ( svm_long_mode_enabled(v) && likely(vmcb->cs.attr.fields.l) )
575 return 8;
576 return (likely(vmcb->cs.attr.fields.db) ? 4 : 2);
577 }
579 static void svm_update_host_cr3(struct vcpu *v)
580 {
581 /* SVM doesn't have a HOST_CR3 equivalent to update. */
582 }
584 static void svm_update_guest_cr3(struct vcpu *v)
585 {
586 v->arch.hvm_svm.vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3;
587 }
589 static void svm_flush_guest_tlbs(void)
590 {
591 /* Roll over the CPU's ASID generation, so it gets a clean TLB when we
592 * next VMRUN. (If ASIDs are disabled, the whole TLB is flushed on
593 * VMRUN anyway). */
594 svm_asid_inc_generation();
595 }
597 static void svm_update_vtpr(struct vcpu *v, unsigned long value)
598 {
599 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
601 vmcb->vintr.fields.tpr = value & 0x0f;
602 }
604 static unsigned long svm_get_ctrl_reg(struct vcpu *v, unsigned int num)
605 {
606 switch ( num )
607 {
608 case 0:
609 return v->arch.hvm_svm.cpu_shadow_cr0;
610 case 2:
611 return v->arch.hvm_svm.cpu_cr2;
612 case 3:
613 return v->arch.hvm_svm.cpu_cr3;
614 case 4:
615 return v->arch.hvm_svm.cpu_shadow_cr4;
616 default:
617 BUG();
618 }
619 return 0; /* dummy */
620 }
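/* VMSAVE (hand-encoded as 0f 01 db) copies the processor's hidden segment
 * state (FS, GS, TR, LDTR) and the SYSCALL/SYSENTER MSRs back into the
 * VMCB, so callers that follow can read up-to-date values from it. */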
622 static void svm_sync_vmcb(struct vcpu *v)
623 {
624 struct arch_svm_struct *arch_svm = &v->arch.hvm_svm;
626 if ( arch_svm->vmcb_in_sync )
627 return;
629 arch_svm->vmcb_in_sync = 1;
631 asm volatile (
632 ".byte 0x0f,0x01,0xdb" /* vmsave */
633 : : "a" (__pa(arch_svm->vmcb)) );
634 }
636 static unsigned long svm_get_segment_base(struct vcpu *v, enum x86_segment seg)
637 {
638 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
639 int long_mode = 0;
641 #ifdef __x86_64__
642 long_mode = vmcb->cs.attr.fields.l && svm_long_mode_enabled(v);
643 #endif
644 switch ( seg )
645 {
646 case x86_seg_cs: return long_mode ? 0 : vmcb->cs.base;
647 case x86_seg_ds: return long_mode ? 0 : vmcb->ds.base;
648 case x86_seg_es: return long_mode ? 0 : vmcb->es.base;
649 case x86_seg_fs: svm_sync_vmcb(v); return vmcb->fs.base;
650 case x86_seg_gs: svm_sync_vmcb(v); return vmcb->gs.base;
651 case x86_seg_ss: return long_mode ? 0 : vmcb->ss.base;
652 case x86_seg_tr: svm_sync_vmcb(v); return vmcb->tr.base;
653 case x86_seg_gdtr: return vmcb->gdtr.base;
654 case x86_seg_idtr: return vmcb->idtr.base;
655 case x86_seg_ldtr: svm_sync_vmcb(v); return vmcb->ldtr.base;
656 }
657 BUG();
658 return 0;
659 }
661 static void svm_get_segment_register(struct vcpu *v, enum x86_segment seg,
662 struct segment_register *reg)
663 {
664 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
665 switch ( seg )
666 {
667 case x86_seg_cs:
668 memcpy(reg, &vmcb->cs, sizeof(*reg));
669 break;
670 case x86_seg_ds:
671 memcpy(reg, &vmcb->ds, sizeof(*reg));
672 break;
673 case x86_seg_es:
674 memcpy(reg, &vmcb->es, sizeof(*reg));
675 break;
676 case x86_seg_fs:
677 svm_sync_vmcb(v);
678 memcpy(reg, &vmcb->fs, sizeof(*reg));
679 break;
680 case x86_seg_gs:
681 svm_sync_vmcb(v);
682 memcpy(reg, &vmcb->gs, sizeof(*reg));
683 break;
684 case x86_seg_ss:
685 memcpy(reg, &vmcb->ss, sizeof(*reg));
686 break;
687 case x86_seg_tr:
688 svm_sync_vmcb(v);
689 memcpy(reg, &vmcb->tr, sizeof(*reg));
690 break;
691 case x86_seg_gdtr:
692 memcpy(reg, &vmcb->gdtr, sizeof(*reg));
693 break;
694 case x86_seg_idtr:
695 memcpy(reg, &vmcb->idtr, sizeof(*reg));
696 break;
697 case x86_seg_ldtr:
698 svm_sync_vmcb(v);
699 memcpy(reg, &vmcb->ldtr, sizeof(*reg));
700 break;
701 default: BUG();
702 }
703 }
705 /* Make sure that Xen intercepts any FP accesses from the current vcpu. */
706 static void svm_stts(struct vcpu *v)
707 {
708 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
710 /*
711 * If the guest does not have TS enabled then we must cause and handle an
712 * exception on first use of the FPU. If the guest *does* have TS enabled
713 * then this is not necessary: no FPU activity can occur until the guest
714 * clears CR0.TS, and we will initialise the FPU when that happens.
715 */
716 if ( !(v->arch.hvm_svm.cpu_shadow_cr0 & X86_CR0_TS) )
717 {
718 v->arch.hvm_svm.vmcb->exception_intercepts |= 1U << TRAP_no_device;
719 vmcb->cr0 |= X86_CR0_TS;
720 }
721 }
724 static void svm_set_tsc_offset(struct vcpu *v, u64 offset)
725 {
726 v->arch.hvm_svm.vmcb->tsc_offset = offset;
727 }
730 static void svm_init_ap_context(
731 struct vcpu_guest_context *ctxt, int vcpuid, int trampoline_vector)
732 {
733 struct vcpu *v;
734 struct vmcb_struct *vmcb;
735 cpu_user_regs_t *regs;
736 u16 cs_sel;
738 /* We know this is safe because hvm_bringup_ap() does it */
739 v = current->domain->vcpu[vcpuid];
740 vmcb = v->arch.hvm_svm.vmcb;
741 regs = &v->arch.guest_context.user_regs;
743 memset(ctxt, 0, sizeof(*ctxt));
745 /*
746 * We execute the trampoline code in real mode. The trampoline vector
747 * passed to us is page aligned and is the physical frame number for
748 * the code.
749 */
750 cs_sel = trampoline_vector << 8;
751 ctxt->user_regs.eip = 0x0;
752 ctxt->user_regs.cs = cs_sel;
754 /*
755 * This is the launch of an AP; set state so that we begin executing
756 * the trampoline code in real-mode.
757 */
758 svm_reset_to_realmode(v, regs);
759 /* Adjust the vmcb's hidden register state. */
760 vmcb->rip = 0;
761 vmcb->cs.sel = cs_sel;
762 vmcb->cs.base = (cs_sel << 4);
763 }
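/* Each 32-byte hypercall stub built below is "mov $<hypercall#>,%eax ;
 * vmmcall ; ret", with vmmcall hand-encoded as 0f 01 d9. */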
765 static void svm_init_hypercall_page(struct domain *d, void *hypercall_page)
766 {
767 char *p;
768 int i;
770 for ( i = 0; i < (PAGE_SIZE / 32); i++ )
771 {
772 p = (char *)(hypercall_page + (i * 32));
773 *(u8 *)(p + 0) = 0xb8; /* mov imm32, %eax */
774 *(u32 *)(p + 1) = i;
775 *(u8 *)(p + 5) = 0x0f; /* vmmcall */
776 *(u8 *)(p + 6) = 0x01;
777 *(u8 *)(p + 7) = 0xd9;
778 *(u8 *)(p + 8) = 0xc3; /* ret */
779 }
781 /* Don't support HYPERVISOR_iret at the moment */
782 *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
783 }
785 static void svm_load_cpu_guest_regs(struct vcpu *v, struct cpu_user_regs *regs)
786 {
787 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
789 vmcb->ss.sel = regs->ss;
790 vmcb->rsp = regs->esp;
791 vmcb->rflags = regs->eflags | 2UL;
792 vmcb->cs.sel = regs->cs;
793 vmcb->rip = regs->eip;
794 }
796 static void svm_ctxt_switch_from(struct vcpu *v)
797 {
798 int cpu = smp_processor_id();
800 svm_save_dr(v);
802 svm_sync_vmcb(v);
804 asm volatile (
805 ".byte 0x0f,0x01,0xda" /* vmload */
806 : : "a" (__pa(root_vmcb[cpu])) );
808 #ifdef __x86_64__
809 /* Resume use of ISTs now that the host TR is reinstated. */
810 idt_tables[cpu][TRAP_double_fault].a |= IST_DF << 32;
811 idt_tables[cpu][TRAP_nmi].a |= IST_NMI << 32;
812 idt_tables[cpu][TRAP_machine_check].a |= IST_MCE << 32;
813 #endif
814 }
816 static void svm_ctxt_switch_to(struct vcpu *v)
817 {
818 int cpu = smp_processor_id();
820 #ifdef __x86_64__
821 /*
822 * This is required because VMRUN performs consistency checks
823 * and some of the DOM0 selectors point to invalid
824 * GDT locations, which would cause AMD processors
825 * to shut down.
826 */
827 set_segment_register(ds, 0);
828 set_segment_register(es, 0);
829 set_segment_register(ss, 0);
831 /*
832 * Cannot use ISTs for NMI/#MC/#DF while we are running with the guest TR.
833 * But this doesn't matter: the IST is only req'd to handle SYSCALL/SYSRET.
834 */
835 idt_tables[cpu][TRAP_double_fault].a &= ~(7UL << 32);
836 idt_tables[cpu][TRAP_nmi].a &= ~(7UL << 32);
837 idt_tables[cpu][TRAP_machine_check].a &= ~(7UL << 32);
838 #endif
840 svm_restore_dr(v);
842 asm volatile (
843 ".byte 0x0f,0x01,0xdb" /* vmsave */
844 : : "a" (__pa(root_vmcb[cpu])) );
845 asm volatile (
846 ".byte 0x0f,0x01,0xda" /* vmload */
847 : : "a" (__pa(v->arch.hvm_svm.vmcb)) );
848 }
850 static void svm_do_resume(struct vcpu *v)
851 {
852 bool_t debug_state = v->domain->debugger_attached;
854 if ( unlikely(v->arch.hvm_vcpu.debug_state_latch != debug_state) )
855 {
856 uint32_t mask = (1U << TRAP_debug) | (1U << TRAP_int3);
857 v->arch.hvm_vcpu.debug_state_latch = debug_state;
858 if ( debug_state )
859 v->arch.hvm_svm.vmcb->exception_intercepts |= mask;
860 else
861 v->arch.hvm_svm.vmcb->exception_intercepts &= ~mask;
862 }
864 if ( v->arch.hvm_svm.launch_core != smp_processor_id() )
865 {
866 v->arch.hvm_svm.launch_core = smp_processor_id();
867 hvm_migrate_timers(v);
869 /* Migrating to another ASID domain. Request a new ASID. */
870 svm_asid_init_vcpu(v);
871 }
873 hvm_do_resume(v);
874 reset_stack_and_jump(svm_asm_do_resume);
875 }
877 static int svm_domain_initialise(struct domain *d)
878 {
879 return 0;
880 }
882 static void svm_domain_destroy(struct domain *d)
883 {
884 }
886 static int svm_vcpu_initialise(struct vcpu *v)
887 {
888 int rc;
890 v->arch.schedule_tail = svm_do_resume;
891 v->arch.ctxt_switch_from = svm_ctxt_switch_from;
892 v->arch.ctxt_switch_to = svm_ctxt_switch_to;
894 v->arch.hvm_svm.launch_core = -1;
896 if ( (rc = svm_create_vmcb(v)) != 0 )
897 {
898 dprintk(XENLOG_WARNING,
899 "Failed to create VMCB for vcpu %d: err=%d.\n",
900 v->vcpu_id, rc);
901 return rc;
902 }
904 return 0;
905 }
907 static void svm_vcpu_destroy(struct vcpu *v)
908 {
909 svm_destroy_vmcb(v);
910 }
912 static void svm_hvm_inject_exception(
913 unsigned int trapnr, int errcode, unsigned long cr2)
914 {
915 struct vcpu *v = current;
916 if ( trapnr == TRAP_page_fault )
917 v->arch.hvm_svm.vmcb->cr2 = v->arch.hvm_svm.cpu_cr2 = cr2;
918 svm_inject_exception(v, trapnr, (errcode != -1), errcode);
919 }
921 static int svm_event_injection_faulted(struct vcpu *v)
922 {
923 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
924 return vmcb->exitintinfo.fields.v;
925 }
927 static struct hvm_function_table svm_function_table = {
928 .name = "SVM",
929 .disable = stop_svm,
930 .domain_initialise = svm_domain_initialise,
931 .domain_destroy = svm_domain_destroy,
932 .vcpu_initialise = svm_vcpu_initialise,
933 .vcpu_destroy = svm_vcpu_destroy,
934 .store_cpu_guest_regs = svm_store_cpu_guest_regs,
935 .load_cpu_guest_regs = svm_load_cpu_guest_regs,
936 .save_cpu_ctxt = svm_save_vmcb_ctxt,
937 .load_cpu_ctxt = svm_load_vmcb_ctxt,
938 .paging_enabled = svm_paging_enabled,
939 .long_mode_enabled = svm_long_mode_enabled,
940 .pae_enabled = svm_pae_enabled,
941 .nx_enabled = svm_nx_enabled,
942 .interrupts_enabled = svm_interrupts_enabled,
943 .guest_x86_mode = svm_guest_x86_mode,
944 .get_guest_ctrl_reg = svm_get_ctrl_reg,
945 .get_segment_base = svm_get_segment_base,
946 .get_segment_register = svm_get_segment_register,
947 .update_host_cr3 = svm_update_host_cr3,
948 .update_guest_cr3 = svm_update_guest_cr3,
949 .flush_guest_tlbs = svm_flush_guest_tlbs,
950 .update_vtpr = svm_update_vtpr,
951 .stts = svm_stts,
952 .set_tsc_offset = svm_set_tsc_offset,
953 .inject_exception = svm_hvm_inject_exception,
954 .init_ap_context = svm_init_ap_context,
955 .init_hypercall_page = svm_init_hypercall_page,
956 .event_injection_faulted = svm_event_injection_faulted
957 };
959 static void svm_npt_detect(void)
960 {
961 u32 eax, ebx, ecx, edx;
963 /* Check CPUID for nested paging support. */
964 cpuid(0x8000000A, &eax, &ebx, &ecx, &edx);
966 if ( !(edx & 1) && opt_hap_enabled )
967 {
968 printk("SVM: Nested paging is not supported by this CPU.\n");
969 opt_hap_enabled = 0;
970 }
971 }
973 int start_svm(struct cpuinfo_x86 *c)
974 {
975 u32 eax, ecx, edx;
976 u32 phys_hsa_lo, phys_hsa_hi;
977 u64 phys_hsa;
978 int cpu = smp_processor_id();
980 /* Xen does not fill x86_capability words except 0. */
981 ecx = cpuid_ecx(0x80000001);
982 boot_cpu_data.x86_capability[5] = ecx;
984 if ( !(test_bit(X86_FEATURE_SVME, &boot_cpu_data.x86_capability)) )
985 return 0;
987 /* Check whether SVM feature is disabled in BIOS */
988 rdmsr(MSR_K8_VM_CR, eax, edx);
989 if ( eax & K8_VMCR_SVME_DISABLE )
990 {
991 printk("AMD SVM Extension is disabled in BIOS.\n");
992 return 0;
993 }
995 if ( ((hsa[cpu] = alloc_host_save_area()) == NULL) ||
996 ((root_vmcb[cpu] = alloc_vmcb()) == NULL) )
997 return 0;
999 write_efer(read_efer() | EFER_SVME);
1001 svm_npt_detect();
1003 /* Initialize the HSA for this core. */
1004 phys_hsa = (u64) virt_to_maddr(hsa[cpu]);
1005 phys_hsa_lo = (u32) phys_hsa;
1006 phys_hsa_hi = (u32) (phys_hsa >> 32);
1007 wrmsr(MSR_K8_VM_HSAVE_PA, phys_hsa_lo, phys_hsa_hi);
1009 /* Initialize core's ASID handling. */
1010 svm_asid_init(c);
1012 if ( cpu != 0 )
1013 return 1;
1015 setup_vmcb_dump();
1017 hvm_enable(&svm_function_table);
1019 if ( opt_hap_enabled )
1020 printk("SVM: Nested paging enabled.\n");
1022 return 1;
1023 }
1025 static int svm_do_nested_pgfault(paddr_t gpa, struct cpu_user_regs *regs)
1026 {
1027 if (mmio_space(gpa)) {
1028 handle_mmio(gpa);
1029 return 1;
1030 }
1032 paging_mark_dirty(current->domain, get_mfn_from_gpfn(gpa >> PAGE_SHIFT));
1033 return p2m_set_flags(current->domain, gpa, __PAGE_HYPERVISOR|_PAGE_USER);
1034 }
1036 static void svm_do_no_device_fault(struct vmcb_struct *vmcb)
1037 {
1038 struct vcpu *v = current;
1040 setup_fpu(v);
1041 vmcb->exception_intercepts &= ~(1U << TRAP_no_device);
1043 if ( !(v->arch.hvm_svm.cpu_shadow_cr0 & X86_CR0_TS) )
1044 vmcb->cr0 &= ~X86_CR0_TS;
1045 }
1047 /* Reserved bits ECX: [31:14], [12:4], [2:1]*/
1048 #define SVM_VCPU_CPUID_L1_ECX_RESERVED 0xffffdff6
1049 /* Reserved bits EDX: [31:29], [27], [22:20], [18], [10] */
1050 #define SVM_VCPU_CPUID_L1_EDX_RESERVED 0xe8740400
1052 static void svm_vmexit_do_cpuid(struct vmcb_struct *vmcb,
1053 struct cpu_user_regs *regs)
1055 unsigned long input = regs->eax;
1056 unsigned int eax, ebx, ecx, edx;
1057 struct vcpu *v = current;
1058 int inst_len;
1060 hvm_cpuid(input, &eax, &ebx, &ecx, &edx);
1062 if ( input == 0x00000001 )
1064 /* Clear out reserved bits. */
1065 ecx &= ~SVM_VCPU_CPUID_L1_ECX_RESERVED;
1066 edx &= ~SVM_VCPU_CPUID_L1_EDX_RESERVED;
1068 /* Guest should only see one logical processor.
1069 * See details on page 23 of AMD CPUID Specification.
1070 */
1071 clear_bit(X86_FEATURE_HT & 31, &edx); /* clear the hyperthread bit */
1072 ebx &= 0xFF00FFFF; /* clear the logical processor count when HTT=0 */
1073 ebx |= 0x00010000; /* set to 1 just for precaution */
1075 else if ( input == 0x80000001 )
1077 if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
1078 clear_bit(X86_FEATURE_APIC & 31, &edx);
1080 #if CONFIG_PAGING_LEVELS >= 3
1081 if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
1082 #endif
1083 clear_bit(X86_FEATURE_PAE & 31, &edx);
1085 clear_bit(X86_FEATURE_PSE36 & 31, &edx);
1087 /* Clear the Cmp_Legacy bit
1088 * This bit is supposed to be zero when HTT = 0.
1089 * See details on page 23 of AMD CPUID Specification.
1090 */
1091 clear_bit(X86_FEATURE_CMP_LEGACY & 31, &ecx);
1093 /* Make SVM feature invisible to the guest. */
1094 clear_bit(X86_FEATURE_SVME & 31, &ecx);
1096 /* So far, we do not support 3DNow for the guest. */
1097 clear_bit(X86_FEATURE_3DNOW & 31, &edx);
1098 clear_bit(X86_FEATURE_3DNOWEXT & 31, &edx);
1099 /* no FFXSR instructions feature. */
1100 clear_bit(X86_FEATURE_FFXSR & 31, &edx);
1102 else if ( input == 0x80000007 || input == 0x8000000A )
1104 /* Mask out features of power management and SVM extension. */
1105 eax = ebx = ecx = edx = 0;
1107 else if ( input == 0x80000008 )
1109 /* Make sure the number of CPU cores is 1 when HTT=0 */
1110 ecx &= 0xFFFFFF00;
1113 regs->eax = (unsigned long)eax;
1114 regs->ebx = (unsigned long)ebx;
1115 regs->ecx = (unsigned long)ecx;
1116 regs->edx = (unsigned long)edx;
1118 HVMTRACE_3D(CPUID, v, input,
1119 ((uint64_t)eax << 32) | ebx, ((uint64_t)ecx << 32) | edx);
1121 inst_len = __get_instruction_length(v, INSTR_CPUID, NULL);
1122 ASSERT(inst_len > 0);
1123 __update_guest_eip(vmcb, inst_len);
1126 static inline unsigned long *get_reg_p(
1127 unsigned int gpreg,
1128 struct cpu_user_regs *regs, struct vmcb_struct *vmcb)
1129 {
1130 unsigned long *reg_p = NULL;
1131 switch (gpreg)
1132 {
1133 case SVM_REG_EAX:
1134 reg_p = (unsigned long *)&regs->eax;
1135 break;
1136 case SVM_REG_EBX:
1137 reg_p = (unsigned long *)&regs->ebx;
1138 break;
1139 case SVM_REG_ECX:
1140 reg_p = (unsigned long *)&regs->ecx;
1141 break;
1142 case SVM_REG_EDX:
1143 reg_p = (unsigned long *)&regs->edx;
1144 break;
1145 case SVM_REG_EDI:
1146 reg_p = (unsigned long *)&regs->edi;
1147 break;
1148 case SVM_REG_ESI:
1149 reg_p = (unsigned long *)&regs->esi;
1150 break;
1151 case SVM_REG_EBP:
1152 reg_p = (unsigned long *)&regs->ebp;
1153 break;
1154 case SVM_REG_ESP:
1155 reg_p = (unsigned long *)&vmcb->rsp;
1156 break;
1157 #ifdef __x86_64__
1158 case SVM_REG_R8:
1159 reg_p = (unsigned long *)&regs->r8;
1160 break;
1161 case SVM_REG_R9:
1162 reg_p = (unsigned long *)&regs->r9;
1163 break;
1164 case SVM_REG_R10:
1165 reg_p = (unsigned long *)&regs->r10;
1166 break;
1167 case SVM_REG_R11:
1168 reg_p = (unsigned long *)&regs->r11;
1169 break;
1170 case SVM_REG_R12:
1171 reg_p = (unsigned long *)&regs->r12;
1172 break;
1173 case SVM_REG_R13:
1174 reg_p = (unsigned long *)&regs->r13;
1175 break;
1176 case SVM_REG_R14:
1177 reg_p = (unsigned long *)&regs->r14;
1178 break;
1179 case SVM_REG_R15:
1180 reg_p = (unsigned long *)&regs->r15;
1181 break;
1182 #endif
1183 default:
1184 BUG();
1185 }
1187 return reg_p;
1188 }
1191 static inline unsigned long get_reg(
1192 unsigned int gpreg, struct cpu_user_regs *regs, struct vmcb_struct *vmcb)
1193 {
1194 unsigned long *gp;
1195 gp = get_reg_p(gpreg, regs, vmcb);
1196 return *gp;
1197 }
1200 static inline void set_reg(
1201 unsigned int gpreg, unsigned long value,
1202 struct cpu_user_regs *regs, struct vmcb_struct *vmcb)
1203 {
1204 unsigned long *gp;
1205 gp = get_reg_p(gpreg, regs, vmcb);
1206 *gp = value;
1207 }
1210 static void svm_dr_access(struct vcpu *v, struct cpu_user_regs *regs)
1211 {
1212 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1214 HVMTRACE_0D(DR_WRITE, v);
1216 v->arch.hvm_vcpu.flag_dr_dirty = 1;
1218 __restore_debug_registers(v);
1220 /* allow the guest full access to the debug registers */
1221 vmcb->dr_intercepts = 0;
1222 }
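/* Scan the instruction bytes at the guest RIP for prefixes that change the
 * segment or address size of an INS/OUTS before it is emulated. */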
1225 static void svm_get_prefix_info(struct vcpu *v, unsigned int dir,
1226 svm_segment_register_t **seg,
1227 unsigned int *asize)
1229 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1230 unsigned char inst[MAX_INST_LEN];
1231 int i;
1233 memset(inst, 0, MAX_INST_LEN);
1234 if (inst_copy_from_guest(inst, svm_rip2pointer(v), sizeof(inst))
1235 != MAX_INST_LEN)
1237 gdprintk(XENLOG_ERR, "get guest instruction failed\n");
1238 domain_crash(current->domain);
1239 return;
1242 for (i = 0; i < MAX_INST_LEN; i++)
1244 switch (inst[i])
1246 case 0xf3: /* REPZ */
1247 case 0xf2: /* REPNZ */
1248 case 0xf0: /* LOCK */
1249 case 0x66: /* data32 */
1250 #ifdef __x86_64__
1251 /* REX prefixes */
1252 case 0x40:
1253 case 0x41:
1254 case 0x42:
1255 case 0x43:
1256 case 0x44:
1257 case 0x45:
1258 case 0x46:
1259 case 0x47:
1261 case 0x48:
1262 case 0x49:
1263 case 0x4a:
1264 case 0x4b:
1265 case 0x4c:
1266 case 0x4d:
1267 case 0x4e:
1268 case 0x4f:
1269 #endif
1270 continue;
1271 case 0x67: /* addr32 */
1272 *asize ^= 48; /* Switch 16/32 bits */
1273 continue;
1274 case 0x2e: /* CS */
1275 *seg = &vmcb->cs;
1276 continue;
1277 case 0x36: /* SS */
1278 *seg = &vmcb->ss;
1279 continue;
1280 case 0x26: /* ES */
1281 *seg = &vmcb->es;
1282 continue;
1283 case 0x64: /* FS */
1284 svm_sync_vmcb(v);
1285 *seg = &vmcb->fs;
1286 continue;
1287 case 0x65: /* GS */
1288 svm_sync_vmcb(v);
1289 *seg = &vmcb->gs;
1290 continue;
1291 case 0x3e: /* DS */
1292 *seg = &vmcb->ds;
1293 continue;
1294 default:
1295 break;
1297 return;
1302 /* Get the address of INS/OUTS instruction */
1303 static inline int svm_get_io_address(
1304 struct vcpu *v, struct cpu_user_regs *regs,
1305 unsigned int size, ioio_info_t info,
1306 unsigned long *count, unsigned long *addr)
1308 unsigned long reg;
1309 unsigned int asize, isize;
1310 int long_mode = 0;
1311 svm_segment_register_t *seg = NULL;
1312 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1314 #ifdef __x86_64__
1315 /* If we're in long mode, we shouldn't check the segment presence & limit */
1316 long_mode = vmcb->cs.attr.fields.l && svm_long_mode_enabled(v);
1317 #endif
1319 /* d field of cs.attr is 1 for 32-bit, 0 for 16 or 64 bit.
1320 * l field combined with EFER_LMA says whether it's 16 or 64 bit.
1321 */
1322 asize = (long_mode)?64:((vmcb->cs.attr.fields.db)?32:16);
1325 /* The ins/outs instructions are single byte, so if we have got more
1326 * than one byte (+ maybe rep-prefix), we have some prefix so we need
1327 * to figure out what it is...
1328 */
1329 isize = vmcb->exitinfo2 - vmcb->rip;
1331 if (info.fields.rep)
1332 isize --;
1334 if (isize > 1)
1335 svm_get_prefix_info(v, info.fields.type, &seg, &asize);
1337 if (info.fields.type == IOREQ_WRITE)
1339 reg = regs->esi;
1340 if (!seg) /* If no prefix, use DS. */
1341 seg = &vmcb->ds;
1342 if (!long_mode && (seg->attr.fields.type & 0xa) == 0x8) {
1343 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1344 return 0;
1347 else
1349 reg = regs->edi;
1350 seg = &vmcb->es; /* Note: This is ALWAYS ES. */
1351 if (!long_mode && (seg->attr.fields.type & 0xa) != 0x2) {
1352 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1353 return 0;
1357 /* If the segment isn't present, give GP fault! */
1358 if (!long_mode && !seg->attr.fields.p)
1360 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1361 return 0;
1364 if (asize == 16)
1366 *addr = (reg & 0xFFFF);
1367 *count = regs->ecx & 0xffff;
1369 else
1371 *addr = reg;
1372 *count = regs->ecx;
1374 if (!info.fields.rep)
1375 *count = 1;
1377 if (!long_mode)
1379 ASSERT(*addr == (u32)*addr);
1380 if ((u32)(*addr + size - 1) < (u32)*addr ||
1381 (seg->attr.fields.type & 0xc) != 0x4 ?
1382 *addr + size - 1 > seg->limit :
1383 *addr <= seg->limit)
1385 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1386 return 0;
1389 /* Check the limit for repeated instructions, as above we checked only
1390 the first instance. Truncate the count if a limit violation would
1391 occur. Note that the checking is not necessary for page granular
1392 segments as transfers crossing page boundaries will be broken up
1393 anyway. */
1394 if (!seg->attr.fields.g && *count > 1)
1396 if ((seg->attr.fields.type & 0xc) != 0x4)
1398 /* expand-up */
1399 if (!(regs->eflags & EF_DF))
1401 if (*addr + *count * size - 1 < *addr ||
1402 *addr + *count * size - 1 > seg->limit)
1403 *count = (seg->limit + 1UL - *addr) / size;
1405 else
1407 if (*count - 1 > *addr / size)
1408 *count = *addr / size + 1;
1411 else
1413 /* expand-down */
1414 if (!(regs->eflags & EF_DF))
1416 if (*count - 1 > -(s32)*addr / size)
1417 *count = -(s32)*addr / size + 1UL;
1419 else
1421 if (*addr < (*count - 1) * size ||
1422 *addr - (*count - 1) * size <= seg->limit)
1423 *count = (*addr - seg->limit - 1) / size + 1;
1426 ASSERT(*count);
1429 *addr += seg->base;
1431 #ifdef __x86_64__
1432 else
1434 if (seg == &vmcb->fs || seg == &vmcb->gs)
1435 *addr += seg->base;
1437 if (!is_canonical_address(*addr) ||
1438 !is_canonical_address(*addr + size - 1))
1440 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1441 return 0;
1443 if (*count > (1UL << 48) / size)
1444 *count = (1UL << 48) / size;
1445 if (!(regs->eflags & EF_DF))
1447 if (*addr + *count * size - 1 < *addr ||
1448 !is_canonical_address(*addr + *count * size - 1))
1449 *count = (*addr & ~((1UL << 48) - 1)) / size;
1451 else
1453 if ((*count - 1) * size > *addr ||
1454 !is_canonical_address(*addr + (*count - 1) * size))
1455 *count = (*addr & ~((1UL << 48) - 1)) / size + 1;
1457 ASSERT(*count);
1459 #endif
1461 return 1;
1465 static void svm_io_instruction(struct vcpu *v)
1467 struct cpu_user_regs *regs;
1468 struct hvm_io_op *pio_opp;
1469 unsigned int port;
1470 unsigned int size, dir, df;
1471 ioio_info_t info;
1472 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1474 pio_opp = &current->arch.hvm_vcpu.io_op;
1475 pio_opp->instr = INSTR_PIO;
1476 pio_opp->flags = 0;
1478 regs = &pio_opp->io_context;
1480 /* Copy current guest state into io instruction state structure. */
1481 memcpy(regs, guest_cpu_user_regs(), HVM_CONTEXT_STACK_BYTES);
1482 svm_store_cpu_guest_regs(v, regs, NULL);
1484 info.bytes = vmcb->exitinfo1;
1486 port = info.fields.port; /* port used to be addr */
1487 dir = info.fields.type; /* direction */
1488 df = regs->eflags & X86_EFLAGS_DF ? 1 : 0;
1490 if (info.fields.sz32)
1491 size = 4;
1492 else if (info.fields.sz16)
1493 size = 2;
1494 else
1495 size = 1;
1497 if (dir==IOREQ_READ)
1498 HVMTRACE_2D(IO_READ, v, port, size);
1499 else
1500 HVMTRACE_2D(IO_WRITE, v, port, size);
1502 HVM_DBG_LOG(DBG_LEVEL_IO,
1503 "svm_io_instruction: port 0x%x eip=%x:%"PRIx64", "
1504 "exit_qualification = %"PRIx64,
1505 port, vmcb->cs.sel, vmcb->rip, info.bytes);
1507 /* string instruction */
1508 if (info.fields.str)
1510 unsigned long addr, count;
1511 paddr_t paddr;
1512 unsigned long gfn;
1513 int sign = regs->eflags & X86_EFLAGS_DF ? -1 : 1;
1515 if (!svm_get_io_address(v, regs, size, info, &count, &addr))
1517 /* We failed to get a valid address, so don't do the IO operation -
1518 * it would just get worse if we do! Hopefully the guest is handing
1519 * gp-faults...
1520 */
1521 return;
1524 /* "rep" prefix */
1525 if (info.fields.rep)
1527 pio_opp->flags |= REPZ;
1530 /* Translate the address to a physical address */
1531 gfn = paging_gva_to_gfn(v, addr);
1532 if ( gfn == INVALID_GFN )
1534 /* The guest does not have the RAM address mapped.
1535 * Need to send in a page fault */
1536 int errcode = 0;
1537 /* IO read --> memory write */
1538 if ( dir == IOREQ_READ ) errcode |= PFEC_write_access;
1539 svm_hvm_inject_exception(TRAP_page_fault, errcode, addr);
1540 return;
1542 paddr = (paddr_t)gfn << PAGE_SHIFT | (addr & ~PAGE_MASK);
1544 /*
1545 * Handle string pio instructions that cross pages or that
1546 * are unaligned. See the comments in hvm_platform.c/handle_mmio()
1547 */
1548 if ((addr & PAGE_MASK) != ((addr + size - 1) & PAGE_MASK))
1550 unsigned long value = 0;
1552 pio_opp->flags |= OVERLAP;
1553 pio_opp->addr = addr;
1555 if (dir == IOREQ_WRITE) /* OUTS */
1557 if ( hvm_paging_enabled(current) )
1559 int rv = hvm_copy_from_guest_virt(&value, addr, size);
1560 if ( rv != 0 )
1562 /* Failed on the page-spanning copy. Inject PF into
1563 * the guest for the address where we failed. */
1564 addr += size - rv;
1565 gdprintk(XENLOG_DEBUG, "Pagefault reading non-io side "
1566 "of a page-spanning PIO: va=%#lx\n", addr);
1567 svm_hvm_inject_exception(TRAP_page_fault, 0, addr);
1568 return;
1571 else
1572 (void) hvm_copy_from_guest_phys(&value, addr, size);
1573 } else /* dir != IOREQ_WRITE */
1574 /* Remember where to write the result, as a *VA*.
1575 * Must be a VA so we can handle the page overlap
1576 * correctly in hvm_pio_assist() */
1577 pio_opp->addr = addr;
1579 if (count == 1)
1580 regs->eip = vmcb->exitinfo2;
1582 send_pio_req(port, 1, size, value, dir, df, 0);
1584 else
1586 unsigned long last_addr = sign > 0 ? addr + count * size - 1
1587 : addr - (count - 1) * size;
1589 if ((addr & PAGE_MASK) != (last_addr & PAGE_MASK))
1591 if (sign > 0)
1592 count = (PAGE_SIZE - (addr & ~PAGE_MASK)) / size;
1593 else
1594 count = (addr & ~PAGE_MASK) / size + 1;
1596 else
1597 regs->eip = vmcb->exitinfo2;
1599 send_pio_req(port, count, size, paddr, dir, df, 1);
1602 else
1604 /*
1605 * On SVM, the RIP of the intruction following the IN/OUT is saved in
1606 * ExitInfo2
1607 */
1608 regs->eip = vmcb->exitinfo2;
1610 if (port == 0xe9 && dir == IOREQ_WRITE && size == 1)
1611 hvm_print_line(v, regs->eax); /* guest debug output */
1613 send_pio_req(port, 1, size, regs->eax, dir, df, 0);
1617 static int svm_set_cr0(unsigned long value)
1619 struct vcpu *v = current;
1620 unsigned long mfn, old_value = v->arch.hvm_svm.cpu_shadow_cr0;
1621 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1622 unsigned long old_base_mfn;
1624 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx", value);
1626 /* ET is reserved and should always be 1. */
1627 value |= X86_CR0_ET;
1629 if ( (value & (X86_CR0_PE|X86_CR0_PG)) == X86_CR0_PG )
1631 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1632 return 0;
1635 /* TS cleared? Then initialise FPU now. */
1636 if ( !(value & X86_CR0_TS) )
1638 setup_fpu(v);
1639 vmcb->exception_intercepts &= ~(1U << TRAP_no_device);
1642 if ( (value & X86_CR0_PG) && !(old_value & X86_CR0_PG) )
1644 #if defined(__x86_64__)
1645 if ( svm_lme_is_set(v) )
1647 if ( !svm_cr4_pae_is_set(v) )
1649 HVM_DBG_LOG(DBG_LEVEL_1, "Enable paging before PAE enable");
1650 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1651 return 0;
1653 HVM_DBG_LOG(DBG_LEVEL_1, "Enable the Long mode");
1654 v->arch.hvm_svm.cpu_shadow_efer |= EFER_LMA;
1655 vmcb->efer |= EFER_LMA | EFER_LME;
1657 #endif /* __x86_64__ */
1659 if ( !paging_mode_hap(v->domain) )
1661 /* The guest CR3 must be pointing to the guest physical. */
1662 mfn = get_mfn_from_gpfn(v->arch.hvm_svm.cpu_cr3 >> PAGE_SHIFT);
1663 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain))
1665 gdprintk(XENLOG_ERR, "Invalid CR3 value = %lx (mfn=%lx)\n",
1666 v->arch.hvm_svm.cpu_cr3, mfn);
1667 domain_crash(v->domain);
1668 return 0;
1671 /* Now arch.guest_table points to machine physical. */
1672 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1673 v->arch.guest_table = pagetable_from_pfn(mfn);
1674 if ( old_base_mfn )
1675 put_page(mfn_to_page(old_base_mfn));
1677 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
1678 (unsigned long) (mfn << PAGE_SHIFT));
1681 else if ( !(value & X86_CR0_PG) && (old_value & X86_CR0_PG) )
1683 /* When CR0.PG is cleared, LMA is cleared immediately. */
1684 if ( svm_long_mode_enabled(v) )
1686 vmcb->efer &= ~(EFER_LME | EFER_LMA);
1687 v->arch.hvm_svm.cpu_shadow_efer &= ~EFER_LMA;
1690 if ( !paging_mode_hap(v->domain) && v->arch.hvm_svm.cpu_cr3 )
1692 put_page(mfn_to_page(get_mfn_from_gpfn(
1693 v->arch.hvm_svm.cpu_cr3 >> PAGE_SHIFT)));
1694 v->arch.guest_table = pagetable_null();
1698 vmcb->cr0 = v->arch.hvm_svm.cpu_shadow_cr0 = value;
1699 if ( !paging_mode_hap(v->domain) )
1700 vmcb->cr0 |= X86_CR0_PG | X86_CR0_WP;
1702 if ( (value ^ old_value) & X86_CR0_PG )
1704 paging_update_paging_modes(v);
1705 /* signal paging update to ASID handler */
1706 svm_asid_g_update_paging (v);
1709 return 1;
1712 /*
1713 * Read from control registers. CR0 and CR4 are read from the shadow.
1714 */
1715 static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
1717 unsigned long value = 0;
1718 struct vcpu *v = current;
1719 struct vlapic *vlapic = vcpu_vlapic(v);
1720 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1722 switch ( cr )
1724 case 0:
1725 value = v->arch.hvm_svm.cpu_shadow_cr0;
1726 break;
1727 case 2:
1728 value = vmcb->cr2;
1729 break;
1730 case 3:
1731 value = (unsigned long)v->arch.hvm_svm.cpu_cr3;
1732 break;
1733 case 4:
1734 value = (unsigned long)v->arch.hvm_svm.cpu_shadow_cr4;
1735 break;
1736 case 8:
1737 value = (unsigned long)vlapic_get_reg(vlapic, APIC_TASKPRI);
1738 value = (value & 0xF0) >> 4;
1739 break;
1741 default:
1742 domain_crash(v->domain);
1743 return;
1746 HVMTRACE_2D(CR_READ, v, cr, value);
1748 set_reg(gp, value, regs, vmcb);
1750 HVM_DBG_LOG(DBG_LEVEL_VMMU, "mov_from_cr: CR%d, value = %lx", cr, value);
1754 /*
1755 * Write to control registers
1756 */
1757 static int mov_to_cr(int gpreg, int cr, struct cpu_user_regs *regs)
1759 unsigned long value, old_cr, old_base_mfn, mfn;
1760 struct vcpu *v = current;
1761 struct vlapic *vlapic = vcpu_vlapic(v);
1762 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1764 value = get_reg(gpreg, regs, vmcb);
1766 HVMTRACE_2D(CR_WRITE, v, cr, value);
1768 HVM_DBG_LOG(DBG_LEVEL_1, "mov_to_cr: CR%d, value = %lx, current = %p",
1769 cr, value, v);
1771 switch ( cr )
1773 case 0:
1774 return svm_set_cr0(value);
1776 case 3:
1777 if ( paging_mode_hap(v->domain) )
1779 vmcb->cr3 = v->arch.hvm_svm.cpu_cr3 = value;
1780 break;
1783 /* If paging is not enabled yet, simply copy the value to CR3. */
1784 if ( !svm_paging_enabled(v) )
1786 v->arch.hvm_svm.cpu_cr3 = value;
1787 break;
1790 /* We make a new one if the shadow does not exist. */
1791 if ( value == v->arch.hvm_svm.cpu_cr3 )
1793 /*
1794 * This is a simple TLB flush, implying the guest has
1795 * removed some translation or changed page attributes.
1796 * We simply invalidate the shadow.
1797 */
1798 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
1799 if ( mfn != pagetable_get_pfn(v->arch.guest_table) )
1800 goto bad_cr3;
1801 paging_update_cr3(v);
1802 /* signal paging update to ASID handler */
1803 svm_asid_g_mov_to_cr3 (v);
1805 else
1807 /*
1808 * If different, make a shadow. Check if the PDBR is valid
1809 * first.
1810 */
1811 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
1812 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
1813 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
1814 goto bad_cr3;
1816 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1817 v->arch.guest_table = pagetable_from_pfn(mfn);
1819 if ( old_base_mfn )
1820 put_page(mfn_to_page(old_base_mfn));
1822 v->arch.hvm_svm.cpu_cr3 = value;
1823 update_cr3(v);
1824 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
1825 /* signal paging update to ASID handler */
1826 svm_asid_g_mov_to_cr3 (v);
1828 break;
1830 case 4: /* CR4 */
1831 if ( value & HVM_CR4_GUEST_RESERVED_BITS )
1833 HVM_DBG_LOG(DBG_LEVEL_1,
1834 "Guest attempts to set reserved bit in CR4: %lx",
1835 value);
1836 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1837 break;
1840 if ( paging_mode_hap(v->domain) )
1842 v->arch.hvm_svm.cpu_shadow_cr4 = value;
1843 vmcb->cr4 = value | (HVM_CR4_HOST_MASK & ~X86_CR4_PAE);
1844 paging_update_paging_modes(v);
1845 /* signal paging update to ASID handler */
1846 svm_asid_g_update_paging (v);
1847 break;
1850 old_cr = v->arch.hvm_svm.cpu_shadow_cr4;
1851 if ( value & X86_CR4_PAE && !(old_cr & X86_CR4_PAE) )
1853 if ( svm_pgbit_test(v) )
1855 /* The guest is a 32-bit PAE guest. */
1856 #if CONFIG_PAGING_LEVELS >= 3
1857 unsigned long mfn, old_base_mfn;
1858 mfn = get_mfn_from_gpfn(v->arch.hvm_svm.cpu_cr3 >> PAGE_SHIFT);
1859 if ( !mfn_valid(mfn) ||
1860 !get_page(mfn_to_page(mfn), v->domain) )
1861 goto bad_cr3;
1863 /*
1864 * Now arch.guest_table points to machine physical.
1865 */
1867 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
1868 v->arch.guest_table = pagetable_from_pfn(mfn);
1869 if ( old_base_mfn )
1870 put_page(mfn_to_page(old_base_mfn));
1871 paging_update_paging_modes(v);
1872 /* signal paging update to ASID handler */
1873 svm_asid_g_update_paging (v);
1875 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
1876 (unsigned long) (mfn << PAGE_SHIFT));
1878 HVM_DBG_LOG(DBG_LEVEL_VMMU,
1879 "Update CR3 value = %lx, mfn = %lx",
1880 v->arch.hvm_svm.cpu_cr3, mfn);
1881 #endif
1884 else if ( !(value & X86_CR4_PAE) )
1886 if ( svm_long_mode_enabled(v) )
1888 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
1892 v->arch.hvm_svm.cpu_shadow_cr4 = value;
1893 vmcb->cr4 = value | HVM_CR4_HOST_MASK;
1895 /*
1896 * Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates
1897 * all TLB entries except global entries.
1898 */
1899 if ((old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE))
1901 paging_update_paging_modes(v);
1902 /* signal paging update to ASID handler */
1903 svm_asid_g_update_paging (v);
1905 break;
1907 case 8:
1908 vlapic_set_reg(vlapic, APIC_TASKPRI, ((value & 0x0F) << 4));
1909 vmcb->vintr.fields.tpr = value & 0x0F;
1910 break;
1912 default:
1913 gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
1914 domain_crash(v->domain);
1915 return 0;
1918 return 1;
1920 bad_cr3:
1921 gdprintk(XENLOG_ERR, "Invalid CR3\n");
1922 domain_crash(v->domain);
1923 return 0;
1927 #define ARR_SIZE(x) (sizeof(x) / sizeof(x[0]))
1930 static int svm_cr_access(struct vcpu *v, unsigned int cr, unsigned int type,
1931 struct cpu_user_regs *regs)
1933 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1934 int inst_len = 0;
1935 int index,addr_size,i;
1936 unsigned int gpreg,offset;
1937 unsigned long value,addr;
1938 u8 buffer[MAX_INST_LEN];
1939 u8 prefix = 0;
1940 u8 modrm;
1941 enum x86_segment seg;
1942 int result = 1;
1943 enum instruction_index list_a[] = {INSTR_MOV2CR, INSTR_CLTS, INSTR_LMSW};
1944 enum instruction_index list_b[] = {INSTR_MOVCR2, INSTR_SMSW};
1945 enum instruction_index match;
1947 inst_copy_from_guest(buffer, svm_rip2pointer(v), sizeof(buffer));
1949 /* get index to first actual instruction byte - as we will need to know
1950 where the prefix lives later on */
1951 index = skip_prefix_bytes(buffer, sizeof(buffer));
1953 if ( type == TYPE_MOV_TO_CR )
1955 inst_len = __get_instruction_length_from_list(
1956 v, list_a, ARR_SIZE(list_a), &buffer[index], &match);
1958 else /* type == TYPE_MOV_FROM_CR */
1960 inst_len = __get_instruction_length_from_list(
1961 v, list_b, ARR_SIZE(list_b), &buffer[index], &match);
1964 ASSERT(inst_len > 0);
1966 inst_len += index;
1968 /* Check for REX prefix - it's ALWAYS the last byte of any prefix bytes */
1969 if (index > 0 && (buffer[index-1] & 0xF0) == 0x40)
1970 prefix = buffer[index-1];
1972 HVM_DBG_LOG(DBG_LEVEL_1, "eip = %lx", (unsigned long) vmcb->rip);
1974 switch (match)
1976 case INSTR_MOV2CR:
1977 gpreg = decode_src_reg(prefix, buffer[index+2]);
1978 result = mov_to_cr(gpreg, cr, regs);
1979 break;
1981 case INSTR_MOVCR2:
1982 gpreg = decode_src_reg(prefix, buffer[index+2]);
1983 mov_from_cr(cr, gpreg, regs);
1984 break;
1986 case INSTR_CLTS:
1987 /* TS being cleared means that it's time to restore fpu state. */
1988 setup_fpu(current);
1989 vmcb->exception_intercepts &= ~(1U << TRAP_no_device);
1990 vmcb->cr0 &= ~X86_CR0_TS; /* clear TS */
1991 v->arch.hvm_svm.cpu_shadow_cr0 &= ~X86_CR0_TS; /* clear TS */
1992 break;
1994 case INSTR_LMSW:
1995 gpreg = decode_src_reg(prefix, buffer[index+2]);
1996 value = get_reg(gpreg, regs, vmcb) & 0xF;
1997 value = (v->arch.hvm_svm.cpu_shadow_cr0 & ~0xF) | value;
1998 result = svm_set_cr0(value);
1999 break;
2001 case INSTR_SMSW:
2002 value = v->arch.hvm_svm.cpu_shadow_cr0 & 0xFFFF;
2003 modrm = buffer[index+2];
2004 addr_size = svm_guest_x86_mode(v);
2005 if ( addr_size < 2 )
2006 addr_size = 2;
2007 if ( likely((modrm & 0xC0) >> 6 == 3) )
2009 gpreg = decode_src_reg(prefix, modrm);
2010 set_reg(gpreg, value, regs, vmcb);
2012 /*
2013 * For now, only implement decode of the offset mode, since that's the
2014 * only mode observed in a real-world OS. This code is also making the
2015 * assumption that we'll never hit this code in long mode.
2016 */
2017 else if ( (modrm == 0x26) || (modrm == 0x25) )
2019 seg = x86_seg_ds;
2020 i = index;
2021 /* Segment or address size overrides? */
2022 while ( i-- )
2024 switch ( buffer[i] )
2026 case 0x26: seg = x86_seg_es; break;
2027 case 0x2e: seg = x86_seg_cs; break;
2028 case 0x36: seg = x86_seg_ss; break;
2029 case 0x64: seg = x86_seg_fs; break;
2030 case 0x65: seg = x86_seg_gs; break;
2031 case 0x67: addr_size ^= 6; break;
2034 /* Bail unless this really is a seg_base + offset case */
2035 if ( ((modrm == 0x26) && (addr_size == 4)) ||
2036 ((modrm == 0x25) && (addr_size == 2)) )
2038 gdprintk(XENLOG_ERR, "SMSW emulation at guest address: "
2039 "%lx failed due to unhandled addressing mode."
2040 "ModRM byte was: %x \n", svm_rip2pointer(v), modrm);
2041 domain_crash(v->domain);
2043 inst_len += addr_size;
2044 offset = *(( unsigned int *) ( void *) &buffer[index + 3]);
2045 offset = ( addr_size == 4 ) ? offset : ( offset & 0xFFFF );
2046 addr = hvm_get_segment_base(v, seg);
2047 addr += offset;
2048 hvm_copy_to_guest_virt(addr,&value,2);
2050 else
2052 gdprintk(XENLOG_ERR, "SMSW emulation at guest address: %lx "
2053 "failed due to unhandled addressing mode!"
2054 "ModRM byte was: %x \n", svm_rip2pointer(v), modrm);
2055 domain_crash(v->domain);
2057 break;
2059 default:
2060 BUG();
2063 ASSERT(inst_len);
2065 __update_guest_eip(vmcb, inst_len);
2067 return result;
2070 static inline void svm_do_msr_access(
2071 struct vcpu *v, struct cpu_user_regs *regs)
2073 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
2074 int inst_len;
2075 u64 msr_content=0;
2076 u32 ecx = regs->ecx, eax, edx;
2078 HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x, eax=%x, edx=%x, exitinfo = %lx",
2079 ecx, (u32)regs->eax, (u32)regs->edx,
2080 (unsigned long)vmcb->exitinfo1);
2082 /* is it a read? */
2083 if (vmcb->exitinfo1 == 0)
2085 switch (ecx) {
2086 case MSR_IA32_TIME_STAMP_COUNTER:
2087 msr_content = hvm_get_guest_time(v);
2088 break;
2090 case MSR_IA32_APICBASE:
2091 msr_content = vcpu_vlapic(v)->hw.apic_base_msr;
2092 break;
2094 case MSR_EFER:
2095 msr_content = v->arch.hvm_svm.cpu_shadow_efer;
2096 break;
2098 case MSR_K8_MC4_MISC: /* Threshold register */
2099 /*
2100 * MCA/MCE: We report that the threshold register is unavailable
2101 * for OS use (locked by the BIOS).
2102 */
2103 msr_content = 1ULL << 61; /* MC4_MISC.Locked */
2104 break;
2106 case MSR_IA32_EBC_FREQUENCY_ID:
2107 /*
2108 * This Intel-only register may be accessed if this HVM guest
2109 * has been migrated from an Intel host. The value zero is not
2110 * particularly meaningful, but at least avoids the guest crashing!
2111 */
2112 msr_content = 0;
2113 break;
2115 case MSR_K8_VM_HSAVE_PA:
2116 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
2117 break;
2119 default:
2120 if ( rdmsr_hypervisor_regs(ecx, &eax, &edx) ||
2121 rdmsr_safe(ecx, eax, edx) == 0 )
2123 regs->eax = eax;
2124 regs->edx = edx;
2125 goto done;
2127 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
2128 return;
2130 regs->eax = msr_content & 0xFFFFFFFF;
2131 regs->edx = msr_content >> 32;
2133 done:
2134 HVMTRACE_2D(MSR_READ, v, ecx, msr_content);
2135 HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx",
2136 ecx, (unsigned long)regs->eax, (unsigned long)regs->edx);
2138 inst_len = __get_instruction_length(v, INSTR_RDMSR, NULL);
2140 else
2142 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
2144 HVMTRACE_2D(MSR_WRITE, v, ecx, msr_content);
2146 switch (ecx)
2148 case MSR_IA32_TIME_STAMP_COUNTER:
2149 hvm_set_guest_time(v, msr_content);
2150 pt_reset(v);
2151 break;
2153 case MSR_IA32_APICBASE:
2154 vlapic_msr_set(vcpu_vlapic(v), msr_content);
2155 break;
2157 case MSR_K8_VM_HSAVE_PA:
2158 svm_inject_exception(v, TRAP_gp_fault, 1, 0);
2159 break;
2161 default:
2162 if ( !long_mode_do_msr_write(regs) )
2163 wrmsr_hypervisor_regs(ecx, regs->eax, regs->edx);
2164 break;
2167 inst_len = __get_instruction_length(v, INSTR_WRMSR, NULL);
2170 __update_guest_eip(vmcb, inst_len);
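/*
 * Illustrative sketch (not part of svm.c): RDMSR returns a 64-bit MSR value
 * split across EDX:EAX and WRMSR recombines it, which is exactly what the
 * handler above does with regs->eax/regs->edx.  Plain C, no Xen internals.
 */
#include <stdint.h>

static void msr_value_to_eax_edx(uint64_t msr_content,
                                 uint32_t *eax, uint32_t *edx)
{
    *eax = (uint32_t)(msr_content & 0xFFFFFFFFu);  /* Low 32 bits.  */
    *edx = (uint32_t)(msr_content >> 32);          /* High 32 bits. */
}

static uint64_t eax_edx_to_msr_value(uint32_t eax, uint32_t edx)
{
    return (uint64_t)eax | ((uint64_t)edx << 32);
}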
2173 static inline void svm_vmexit_do_hlt(struct vmcb_struct *vmcb)
2175 enum hvm_intack type = hvm_vcpu_has_pending_irq(current);
2177 __update_guest_eip(vmcb, 1);
2179 /* Do not halt if an event is still pending injection, or a new interrupt is deliverable. */
2180 if ( vmcb->eventinj.fields.v ||
2181 ((type != hvm_intack_none) && svm_interrupts_enabled(current, type)) )
2183 HVMTRACE_1D(HLT, current, /*int pending=*/ 1);
2184 return;
2187 HVMTRACE_1D(HLT, current, /*int pending=*/ 0);
2188 hvm_hlt(vmcb->rflags);
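/*
 * Illustrative sketch (not part of svm.c): the HLT intercept above only
 * blocks the vcpu when nothing is deliverable.  A hypothetical predicate;
 * the two flags stand in for vmcb->eventinj.fields.v and the
 * hvm_vcpu_has_pending_irq()/svm_interrupts_enabled() pair.
 */
static int hlt_should_block(int event_pending_injection, int irq_deliverable)
{
    /* If either is set, the handler returns immediately after advancing RIP. */
    return !event_pending_injection && !irq_deliverable;
}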
2191 static void svm_vmexit_do_invd(struct vcpu *v)
2193 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
2194 int inst_len;
2196 /* We cannot safely invalidate the cache here. We could fall back to
2197 * WBINVD, but simply ignoring the instruction is acceptable because
2198 * cache snooping keeps the caches coherent anyway. -- Mats P.
2199 */
2201 /* Log the intercept so that anyone running an OS that relies on INVD
2202 * can see why the instruction has no effect.
2203 */
2204 gdprintk(XENLOG_WARNING, "INVD instruction intercepted - ignored\n");
2206 inst_len = __get_instruction_length(v, INSTR_INVD, NULL);
2207 __update_guest_eip(vmcb, inst_len);
2210 void svm_handle_invlpg(const short invlpga, struct cpu_user_regs *regs)
2212 struct vcpu *v = current;
2213 u8 opcode[MAX_INST_LEN], prefix, length = MAX_INST_LEN;
2214 unsigned long g_vaddr;
2215 int inst_len;
2216 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
2218 /*
2219 * We do not know in advance how many bytes the invlpg instruction
2220 * occupies, so copy the maximum instruction length from the guest.
2221 */
2222 if (inst_copy_from_guest(opcode, svm_rip2pointer(v), length) < length)
2224 gdprintk(XENLOG_ERR, "Error reading %d bytes of guest memory\n", length);
2225 domain_crash(v->domain);
2226 return;
2229 if (invlpga)
2231 inst_len = __get_instruction_length(v, INSTR_INVLPGA, opcode);
2232 ASSERT(inst_len > 0);
2233 __update_guest_eip(vmcb, inst_len);
2235 /*
2236 * The address is implicit for this instruction. At the moment, we do
2237 * not use ecx (ASID) to identify individual guest pages.
2238 */
2239 g_vaddr = regs->eax;
2241 else
2243 /* What about multiple prefix codes? */
2244 prefix = (is_prefix(opcode[0]) ? opcode[0] : 0);
2245 inst_len = __get_instruction_length(v, INSTR_INVLPG, opcode);
2246 ASSERT(inst_len > 0);
2248 inst_len--;
2249 length -= inst_len;
2251 /*
2252 * Decode the memory operand of the instruction, including ModRM, SIB,
2253 * and displacement, to get the effective address and its length in
2254 * bytes. Assume the system is in either 32- or 64-bit mode.
2255 */
2256 g_vaddr = get_effective_addr_modrm64(regs, prefix, inst_len,
2257 &opcode[inst_len], &length);
2259 inst_len += length;
2260 __update_guest_eip(vmcb, inst_len);
2263 HVMTRACE_3D(INVLPG, v, (invlpga ? 1 : 0), g_vaddr, (invlpga ? regs->ecx : 0));
2265 paging_invlpg(v, g_vaddr);
2266 /* Signal invlpg to the ASID handler. */
2267 svm_asid_g_invlpg(v, g_vaddr);
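/*
 * Illustrative sketch (not part of svm.c): the INVLPG path above scans for a
 * possible prefix byte, then resolves the memory operand to a linear address
 * as "segment base + effective address".  The helpers here are hypothetical
 * stand-ins for is_prefix() and get_effective_addr_modrm64().
 */
#include <stdint.h>

static int byte_is_x86_prefix(uint8_t b)
{
    /* Segment overrides, operand/address-size overrides, LOCK, REPNE/REP. */
    switch ( b )
    {
    case 0x26: case 0x2e: case 0x36: case 0x3e:
    case 0x64: case 0x65: case 0x66: case 0x67:
    case 0xf0: case 0xf2: case 0xf3:
        return 1;
    default:
        return 0;
    }
}

static uint64_t invlpg_linear_address(uint64_t seg_base, uint64_t effective_addr)
{
    /* The linear address handed to paging_invlpg()/svm_asid_g_invlpg(). */
    return seg_base + effective_addr;
}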
2271 /*
2272 * Reset to realmode causes execution to start at 0xF000:0xFFF0 in
2273 * 16-bit realmode. This mimics a processor reset.
2275 * Returns 0 on success, non-zero otherwise.
2276 */
2277 static int svm_reset_to_realmode(struct vcpu *v,
2278 struct cpu_user_regs *regs)
2280 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
2282 /* Clear the guest register state; the VMCB state is re-initialised below. */
2283 memset(regs, 0, sizeof(struct cpu_user_regs));
2285 /* VMCB Control */
2286 vmcb->tsc_offset = 0;
2288 /* VMCB State */
2289 vmcb->cr0 = X86_CR0_ET | X86_CR0_PG | X86_CR0_WP;
2290 v->arch.hvm_svm.cpu_shadow_cr0 = X86_CR0_ET;
2292 vmcb->cr2 = 0;
2293 vmcb->efer = EFER_SVME;
2295 vmcb->cr4 = HVM_CR4_HOST_MASK;
2296 v->arch.hvm_svm.cpu_shadow_cr4 = 0;
2298 if ( paging_mode_hap(v->domain) ) {
2299 vmcb->cr0 = v->arch.hvm_svm.cpu_shadow_cr0;
2300 vmcb->cr4 = v->arch.hvm_svm.cpu_shadow_cr4 |
2301 (HVM_CR4_HOST_MASK & ~X86_CR4_PAE);
2304 /* This will jump to ROMBIOS */
2305 vmcb->rip = 0xFFF0;
2307 /* Set up the segment registers and all their hidden state. */
2308 vmcb->cs.sel = 0xF000;
2309 vmcb->cs.attr.bytes = 0x089b;
2310 vmcb->cs.limit = 0xffff;
2311 vmcb->cs.base = 0x000F0000;
2313 vmcb->ss.sel = 0x00;
2314 vmcb->ss.attr.bytes = 0x0893;
2315 vmcb->ss.limit = 0xffff;
2316 vmcb->ss.base = 0x00;
2318 vmcb->ds.sel = 0x00;
2319 vmcb->ds.attr.bytes = 0x0893;
2320 vmcb->ds.limit = 0xffff;
2321 vmcb->ds.base = 0x00;
2323 vmcb->es.sel = 0x00;
2324 vmcb->es.attr.bytes = 0x0893;
2325 vmcb->es.limit = 0xffff;
2326 vmcb->es.base = 0x00;
2328 vmcb->fs.sel = 0x00;
2329 vmcb->fs.attr.bytes = 0x0893;
2330 vmcb->fs.limit = 0xffff;
2331 vmcb->fs.base = 0x00;
2333 vmcb->gs.sel = 0x00;
2334 vmcb->gs.attr.bytes = 0x0893;
2335 vmcb->gs.limit = 0xffff;
2336 vmcb->gs.base = 0x00;
2338 vmcb->ldtr.sel = 0x00;
2339 vmcb->ldtr.attr.bytes = 0x0000;
2340 vmcb->ldtr.limit = 0x0;
2341 vmcb->ldtr.base = 0x00;
2343 vmcb->gdtr.sel = 0x00;
2344 vmcb->gdtr.attr.bytes = 0x0000;
2345 vmcb->gdtr.limit = 0x0;
2346 vmcb->gdtr.base = 0x00;
2348 vmcb->tr.sel = 0;
2349 vmcb->tr.attr.bytes = 0;
2350 vmcb->tr.limit = 0x0;
2351 vmcb->tr.base = 0;
2353 vmcb->idtr.sel = 0x00;
2354 vmcb->idtr.attr.bytes = 0x0000;
2355 vmcb->idtr.limit = 0x3ff;
2356 vmcb->idtr.base = 0x00;
2358 vmcb->rax = 0;
2359 vmcb->rsp = 0;
2361 return 0;
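/*
 * Illustrative sketch (not part of svm.c): with the real-mode state set up
 * above, the first instruction is fetched from CS.base + RIP, i.e.
 * 0x000F0000 + 0xFFF0 = 0x000FFFF0, the classic reset vector just below the
 * 1MB boundary where the ROMBIOS entry point lives.
 */
#include <stdint.h>

static uint32_t realmode_reset_fetch_address(void)
{
    const uint32_t cs_base = 0x000F0000;  /* vmcb->cs.base above. */
    const uint32_t rip     = 0xFFF0;      /* vmcb->rip above.     */
    return cs_base + rip;                 /* == 0x000FFFF0        */
}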
2364 asmlinkage void svm_vmexit_handler(struct cpu_user_regs *regs)
2366 unsigned int exit_reason;
2367 unsigned long eip;
2368 struct vcpu *v = current;
2369 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
2370 int inst_len, rc;
2372 exit_reason = vmcb->exitcode;
2374 HVMTRACE_2D(VMEXIT, v, vmcb->rip, exit_reason);
2376 if ( unlikely(exit_reason == VMEXIT_INVALID) )
2378 svm_dump_vmcb(__func__, vmcb);
2379 goto exit_and_crash;
2382 perfc_incra(svmexits, exit_reason);
2383 eip = vmcb->rip;
2385 switch ( exit_reason )
2387 case VMEXIT_INTR:
2388 /* Asynchronous event, handled when we STGI'd after the VMEXIT. */
2389 HVMTRACE_0D(INTR, v);
2390 break;
2392 case VMEXIT_NMI:
2393 /* Asynchronous event, handled when we STGI'd after the VMEXIT. */
2394 HVMTRACE_0D(NMI, v);
2395 break;
2397 case VMEXIT_SMI:
2398 /* Asynchronous event, handled when we STGI'd after the VMEXIT. */
2399 HVMTRACE_0D(SMI, v);
2400 break;
2402 case VMEXIT_EXCEPTION_DB:
2403 if ( !v->domain->debugger_attached )
2404 goto exit_and_crash;
2405 domain_pause_for_debugger();
2406 break;
2408 case VMEXIT_EXCEPTION_BP:
2409 if ( !v->domain->debugger_attached )
2410 goto exit_and_crash;
2411 /* AMD Vol2, 15.11: INT3, INTO, BOUND intercepts do not update RIP. */
2412 inst_len = __get_instruction_length(v, INSTR_INT3, NULL);
2413 __update_guest_eip(vmcb, inst_len);
2414 domain_pause_for_debugger();
2415 break;
2417 case VMEXIT_EXCEPTION_NM:
2418 svm_do_no_device_fault(vmcb);
2419 break;
2421 case VMEXIT_EXCEPTION_PF: {
2422 unsigned long va;
2423 va = vmcb->exitinfo2;
2424 regs->error_code = vmcb->exitinfo1;
2425 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2426 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
2427 (unsigned long)regs->eax, (unsigned long)regs->ebx,
2428 (unsigned long)regs->ecx, (unsigned long)regs->edx,
2429 (unsigned long)regs->esi, (unsigned long)regs->edi);
2431 if ( paging_fault(va, regs) )
2433 HVMTRACE_2D(PF_XEN, v, va, regs->error_code);
2434 break;
2437 v->arch.hvm_svm.cpu_cr2 = vmcb->cr2 = va;
2438 svm_inject_exception(v, TRAP_page_fault, 1, regs->error_code);
2439 break;
2442 case VMEXIT_EXCEPTION_MC:
2443 HVMTRACE_0D(MCE, v);
2444 svm_store_cpu_guest_regs(v, regs, NULL);
2445 do_machine_check(regs);
2446 break;
2448 case VMEXIT_VINTR:
2449 vmcb->vintr.fields.irq = 0;
2450 vmcb->general1_intercepts &= ~GENERAL1_INTERCEPT_VINTR;
2451 break;
2453 case VMEXIT_INVD:
2454 svm_vmexit_do_invd(v);
2455 break;
2457 case VMEXIT_GDTR_WRITE:
2458 printk("WRITE to GDTR\n");
2459 break;
2461 case VMEXIT_TASK_SWITCH:
2462 goto exit_and_crash;
2464 case VMEXIT_CPUID:
2465 svm_vmexit_do_cpuid(vmcb, regs);
2466 break;
2468 case VMEXIT_HLT:
2469 svm_vmexit_do_hlt(vmcb);
2470 break;
2472 case VMEXIT_INVLPG:
2473 svm_handle_invlpg(0, regs);
2474 break;
2476 case VMEXIT_INVLPGA:
2477 svm_handle_invlpg(1, regs);
2478 break;
2480 case VMEXIT_VMMCALL:
2481 inst_len = __get_instruction_length(v, INSTR_VMCALL, NULL);
2482 ASSERT(inst_len > 0);
2483 HVMTRACE_1D(VMMCALL, v, regs->eax);
2484 rc = hvm_do_hypercall(regs);
2485 if ( rc != HVM_HCALL_preempted )
2487 __update_guest_eip(vmcb, inst_len);
2488 if ( rc == HVM_HCALL_invalidate )
2489 send_invalidate_req();
2491 break;
2493 case VMEXIT_CR0_READ:
2494 svm_cr_access(v, 0, TYPE_MOV_FROM_CR, regs);
2495 break;
2497 case VMEXIT_CR2_READ:
2498 svm_cr_access(v, 2, TYPE_MOV_FROM_CR, regs);
2499 break;
2501 case VMEXIT_CR3_READ:
2502 svm_cr_access(v, 3, TYPE_MOV_FROM_CR, regs);
2503 break;
2505 case VMEXIT_CR4_READ:
2506 svm_cr_access(v, 4, TYPE_MOV_FROM_CR, regs);
2507 break;
2509 case VMEXIT_CR8_READ:
2510 svm_cr_access(v, 8, TYPE_MOV_FROM_CR, regs);
2511 break;
2513 case VMEXIT_CR0_WRITE:
2514 svm_cr_access(v, 0, TYPE_MOV_TO_CR, regs);
2515 break;
2517 case VMEXIT_CR2_WRITE:
2518 svm_cr_access(v, 2, TYPE_MOV_TO_CR, regs);
2519 break;
2521 case VMEXIT_CR3_WRITE:
2522 svm_cr_access(v, 3, TYPE_MOV_TO_CR, regs);
2523 local_flush_tlb();
2524 break;
2526 case VMEXIT_CR4_WRITE:
2527 svm_cr_access(v, 4, TYPE_MOV_TO_CR, regs);
2528 break;
2530 case VMEXIT_CR8_WRITE:
2531 svm_cr_access(v, 8, TYPE_MOV_TO_CR, regs);
2532 break;
2534 case VMEXIT_DR0_WRITE ... VMEXIT_DR7_WRITE:
2535 svm_dr_access(v, regs);
2536 break;
2538 case VMEXIT_IOIO:
2539 svm_io_instruction(v);
2540 break;
2542 case VMEXIT_MSR:
2543 svm_do_msr_access(v, regs);
2544 break;
2546 case VMEXIT_SHUTDOWN:
2547 hvm_triple_fault();
2548 break;
2550 case VMEXIT_VMRUN:
2551 case VMEXIT_VMLOAD:
2552 case VMEXIT_VMSAVE:
2553 case VMEXIT_STGI:
2554 case VMEXIT_CLGI:
2555 case VMEXIT_SKINIT:
2556 /* Report "Invalid opcode" on any VM-operation except VMMCALL */
2557 svm_inject_exception(v, TRAP_invalid_op, 0, 0);
2558 break;
2560 case VMEXIT_NPF:
2561 regs->error_code = vmcb->exitinfo1;
2562 if ( !svm_do_nested_pgfault(vmcb->exitinfo2, regs) )
2563 domain_crash(v->domain);
2564 break;
2566 default:
2567 exit_and_crash:
2568 gdprintk(XENLOG_ERR, "unexpected VMEXIT: exit reason = 0x%x, "
2569 "exitinfo1 = %"PRIx64", exitinfo2 = %"PRIx64"\n",
2570 exit_reason,
2571 (u64)vmcb->exitinfo1, (u64)vmcb->exitinfo2);
2572 domain_crash(v->domain);
2573 break;
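/*
 * Illustrative sketch (not part of svm.c): for VMEXIT_EXCEPTION_PF and
 * VMEXIT_NPF above, exitinfo1 carries the page-fault error code and
 * exitinfo2 carries the faulting address (guest-linear for #PF,
 * guest-physical for nested faults).  A hypothetical container showing how
 * the handler consumes the two fields.
 */
#include <stdint.h>

struct fault_exit_info {
    uint64_t error_code;   /* Taken from vmcb->exitinfo1. */
    uint64_t fault_addr;   /* Taken from vmcb->exitinfo2. */
};

static struct fault_exit_info decode_fault_exitinfo(uint64_t exitinfo1,
                                                    uint64_t exitinfo2)
{
    struct fault_exit_info fi;
    fi.error_code = exitinfo1;  /* Copied into regs->error_code above.      */
    fi.fault_addr = exitinfo2;  /* va for #PF, guest-physical page for NPF. */
    return fi;
}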
2577 asmlinkage void svm_trace_vmentry(void)
2579 struct vcpu *v = current;
2581 /* This is the last C code before the VMRUN instruction. */
2582 HVMTRACE_0D(VMENTRY, v);
2585 /*
2586 * Local variables:
2587 * mode: C
2588 * c-set-style: "BSD"
2589 * c-basic-offset: 4
2590 * tab-width: 4
2591 * indent-tabs-mode: nil
2592 * End:
2593 */