ia64/xen-unstable: xen/arch/x86/hvm/hvm.c @ changeset 16197:b3fa9b58a102

hvm, vt-d: Add a memory cache-attribute pinning domctl for HVM guests.
Use it to pin virtual framebuffer VRAM as attribute WB, even if the
guest tries to map it with other attributes.
Signed-off-by: Disheng Su <disheng.su@intel.com>
Author: Keir Fraser <keir@xensource.com>
Date:   Tue Oct 23 14:38:47 2007 +0100
1 /*
2 * hvm.c: Common hardware virtual machine abstractions.
3 *
4 * Copyright (c) 2004, Intel Corporation.
5 * Copyright (c) 2005, International Business Machines Corporation.
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms and conditions of the GNU General Public License,
9 * version 2, as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 *
16 * You should have received a copy of the GNU General Public License along with
17 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
18 * Place - Suite 330, Boston, MA 02111-1307 USA.
19 */
21 #include <xen/config.h>
22 #include <xen/init.h>
23 #include <xen/lib.h>
24 #include <xen/trace.h>
25 #include <xen/sched.h>
26 #include <xen/irq.h>
27 #include <xen/softirq.h>
28 #include <xen/domain.h>
29 #include <xen/domain_page.h>
30 #include <xen/hypercall.h>
31 #include <xen/guest_access.h>
32 #include <xen/event.h>
33 #include <asm/current.h>
34 #include <asm/e820.h>
35 #include <asm/io.h>
36 #include <asm/paging.h>
37 #include <asm/regs.h>
38 #include <asm/cpufeature.h>
39 #include <asm/processor.h>
40 #include <asm/types.h>
41 #include <asm/msr.h>
42 #include <asm/mc146818rtc.h>
43 #include <asm/spinlock.h>
44 #include <asm/hvm/hvm.h>
45 #include <asm/hvm/vpt.h>
46 #include <asm/hvm/support.h>
47 #include <asm/hvm/cacheattr.h>
48 #include <public/sched.h>
49 #include <public/hvm/ioreq.h>
50 #include <public/version.h>
51 #include <public/memory.h>
53 /* Xen command-line option to disable hardware-assisted paging */
54 static int opt_hap_disabled;
55 invbool_param("hap", opt_hap_disabled);
57 int hvm_enabled __read_mostly;
59 unsigned int opt_hvm_debug_level __read_mostly;
60 integer_param("hvm_debug", opt_hvm_debug_level);
62 struct hvm_function_table hvm_funcs __read_mostly;
64 /* I/O permission bitmap is globally shared by all HVM guests. */
65 char __attribute__ ((__section__ (".bss.page_aligned")))
66 hvm_io_bitmap[3*PAGE_SIZE];
68 void hvm_enable(struct hvm_function_table *fns)
69 {
70 BUG_ON(hvm_enabled);
71 printk("HVM: %s enabled\n", fns->name);
73 /*
74 * Allow direct access to the PC debug port (it is often used for I/O
75 * delays, but the vmexits simply slow things down).
76 */
77 memset(hvm_io_bitmap, ~0, sizeof(hvm_io_bitmap));
78 clear_bit(0x80, hvm_io_bitmap);
80 hvm_funcs = *fns;
81 hvm_enabled = 1;
83 if ( hvm_funcs.hap_supported )
84 {
85 if ( opt_hap_disabled )
86 hvm_funcs.hap_supported = 0;
87 printk("HVM: Hardware Assisted Paging %sabled\n",
88 hvm_funcs.hap_supported ? "en" : "dis");
89 }
90 }
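/*
 * Guest time is modelled as host TSC plus a per-VCPU offset:
 * hvm_set_guest_time() computes cache_tsc_offset = gtime - host_tsc and
 * pushes it to hardware via hvm_funcs.set_tsc_offset(), while
 * hvm_get_guest_time() applies the cached offset to a fresh TSC read.
 */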
92 void hvm_set_guest_time(struct vcpu *v, u64 gtime)
93 {
94 u64 host_tsc;
96 rdtscll(host_tsc);
98 v->arch.hvm_vcpu.cache_tsc_offset = gtime - host_tsc;
99 hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset);
100 }
102 u64 hvm_get_guest_time(struct vcpu *v)
103 {
104 u64 host_tsc;
106 rdtscll(host_tsc);
107 return host_tsc + v->arch.hvm_vcpu.cache_tsc_offset;
108 }
110 void hvm_migrate_timers(struct vcpu *v)
111 {
112 rtc_migrate_timers(v);
113 hpet_migrate_timers(v);
114 pt_migrate(v);
115 }
117 void hvm_do_resume(struct vcpu *v)
118 {
119 ioreq_t *p;
121 if ( !v->fpu_dirtied )
122 hvm_funcs.stts(v);
124 pt_thaw_time(v);
126 /* NB. Optimised for common case (p->state == STATE_IOREQ_NONE). */
127 p = &get_ioreq(v)->vp_ioreq;
128 while ( p->state != STATE_IOREQ_NONE )
129 {
130 switch ( p->state )
131 {
132 case STATE_IORESP_READY: /* IORESP_READY -> NONE */
133 hvm_io_assist();
134 break;
135 case STATE_IOREQ_READY: /* IOREQ_{READY,INPROCESS} -> IORESP_READY */
136 case STATE_IOREQ_INPROCESS:
137 wait_on_xen_event_channel(v->arch.hvm_vcpu.xen_port,
138 (p->state != STATE_IOREQ_READY) &&
139 (p->state != STATE_IOREQ_INPROCESS));
140 break;
141 default:
142 gdprintk(XENLOG_ERR, "Weird HVM iorequest state %d.\n", p->state);
143 domain_crash_synchronous();
144 }
145 }
146 }
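/*
 * Ioreq page lifecycle: hvm_init_ioreq_page() leaves the domain paused until
 * hvm_set_ioreq_page() installs a writable global mapping of the guest frame
 * nominated via HVM_PARAM_IOREQ_PFN/HVM_PARAM_BUFIOREQ_PFN and unpauses it;
 * hvm_destroy_ioreq_page() drops the mapping and page reference on teardown.
 */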
148 static void hvm_init_ioreq_page(
149 struct domain *d, struct hvm_ioreq_page *iorp)
150 {
151 memset(iorp, 0, sizeof(*iorp));
152 spin_lock_init(&iorp->lock);
153 domain_pause(d);
154 }
156 static void hvm_destroy_ioreq_page(
157 struct domain *d, struct hvm_ioreq_page *iorp)
158 {
159 spin_lock(&iorp->lock);
161 ASSERT(d->is_dying);
163 if ( iorp->va != NULL )
164 {
165 unmap_domain_page_global(iorp->va);
166 put_page_and_type(iorp->page);
167 iorp->va = NULL;
168 }
170 spin_unlock(&iorp->lock);
171 }
173 static int hvm_set_ioreq_page(
174 struct domain *d, struct hvm_ioreq_page *iorp, unsigned long gmfn)
175 {
176 struct page_info *page;
177 p2m_type_t p2mt;
178 unsigned long mfn;
179 void *va;
181 mfn = mfn_x(gfn_to_mfn(d, gmfn, &p2mt));
182 if ( !p2m_is_ram(p2mt) )
183 return -EINVAL;
184 ASSERT(mfn_valid(mfn));
186 page = mfn_to_page(mfn);
187 if ( !get_page_and_type(page, d, PGT_writable_page) )
188 return -EINVAL;
190 va = map_domain_page_global(mfn);
191 if ( va == NULL )
192 {
193 put_page_and_type(page);
194 return -ENOMEM;
195 }
197 spin_lock(&iorp->lock);
199 if ( (iorp->va != NULL) || d->is_dying )
200 {
201 spin_unlock(&iorp->lock);
202 unmap_domain_page_global(va);
203 put_page_and_type(mfn_to_page(mfn));
204 return -EINVAL;
205 }
207 iorp->va = va;
208 iorp->page = page;
210 spin_unlock(&iorp->lock);
212 domain_unpause(d);
214 return 0;
215 }
217 int hvm_domain_initialise(struct domain *d)
218 {
219 int rc;
221 if ( !hvm_enabled )
222 {
223 gdprintk(XENLOG_WARNING, "Attempt to create an HVM guest "
224 "on a non-VT/AMD-V platform.\n");
225 return -EINVAL;
226 }
228 spin_lock_init(&d->arch.hvm_domain.pbuf_lock);
229 spin_lock_init(&d->arch.hvm_domain.irq_lock);
230 spin_lock_init(&d->arch.hvm_domain.uc_lock);
232 hvm_init_cacheattr_region_list(d);
234 rc = paging_enable(d, PG_refcounts|PG_translate|PG_external);
235 if ( rc != 0 )
236 goto fail1;
238 vpic_init(d);
240 rc = vioapic_init(d);
241 if ( rc != 0 )
242 goto fail1;
244 hvm_init_ioreq_page(d, &d->arch.hvm_domain.ioreq);
245 hvm_init_ioreq_page(d, &d->arch.hvm_domain.buf_ioreq);
247 rc = hvm_funcs.domain_initialise(d);
248 if ( rc != 0 )
249 goto fail2;
251 return 0;
253 fail2:
254 vioapic_deinit(d);
255 fail1:
256 hvm_destroy_cacheattr_region_list(d);
257 return rc;
258 }
260 void hvm_domain_relinquish_resources(struct domain *d)
261 {
262 hvm_destroy_ioreq_page(d, &d->arch.hvm_domain.ioreq);
263 hvm_destroy_ioreq_page(d, &d->arch.hvm_domain.buf_ioreq);
265 pit_deinit(d);
266 rtc_deinit(d);
267 pmtimer_deinit(d);
268 hpet_deinit(d);
269 }
271 void hvm_domain_destroy(struct domain *d)
272 {
273 hvm_funcs.domain_destroy(d);
274 vioapic_deinit(d);
275 hvm_destroy_cacheattr_region_list(d);
276 }
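/*
 * Per-VCPU register state is saved/restored for migration by the two
 * handlers below, registered via HVM_REGISTER_SAVE_RESTORE(CPU, ...): the
 * vendor-specific VMCS/VMCB bits go through hvm_funcs, and everything else
 * is copied to/from the architectural vcpu_guest_context.
 */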
278 static int hvm_save_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
279 {
280 struct vcpu *v;
281 struct hvm_hw_cpu ctxt;
282 struct vcpu_guest_context *vc;
284 for_each_vcpu(d, v)
285 {
286 /* We don't need to save state for a vcpu that is down; the restore
287 * code will leave it down if there is nothing saved. */
288 if ( test_bit(_VPF_down, &v->pause_flags) )
289 continue;
291 /* Architecture-specific vmcs/vmcb bits */
292 hvm_funcs.save_cpu_ctxt(v, &ctxt);
294 /* Other vcpu register state */
295 vc = &v->arch.guest_context;
296 if ( v->fpu_initialised )
297 memcpy(ctxt.fpu_regs, &vc->fpu_ctxt, sizeof(ctxt.fpu_regs));
298 else
299 memset(ctxt.fpu_regs, 0, sizeof(ctxt.fpu_regs));
300 ctxt.rax = vc->user_regs.eax;
301 ctxt.rbx = vc->user_regs.ebx;
302 ctxt.rcx = vc->user_regs.ecx;
303 ctxt.rdx = vc->user_regs.edx;
304 ctxt.rbp = vc->user_regs.ebp;
305 ctxt.rsi = vc->user_regs.esi;
306 ctxt.rdi = vc->user_regs.edi;
307 ctxt.rsp = vc->user_regs.esp;
308 ctxt.rip = vc->user_regs.eip;
309 ctxt.rflags = vc->user_regs.eflags;
310 #ifdef __x86_64__
311 ctxt.r8 = vc->user_regs.r8;
312 ctxt.r9 = vc->user_regs.r9;
313 ctxt.r10 = vc->user_regs.r10;
314 ctxt.r11 = vc->user_regs.r11;
315 ctxt.r12 = vc->user_regs.r12;
316 ctxt.r13 = vc->user_regs.r13;
317 ctxt.r14 = vc->user_regs.r14;
318 ctxt.r15 = vc->user_regs.r15;
319 #endif
320 ctxt.dr0 = vc->debugreg[0];
321 ctxt.dr1 = vc->debugreg[1];
322 ctxt.dr2 = vc->debugreg[2];
323 ctxt.dr3 = vc->debugreg[3];
324 ctxt.dr6 = vc->debugreg[6];
325 ctxt.dr7 = vc->debugreg[7];
327 if ( hvm_save_entry(CPU, v->vcpu_id, h, &ctxt) != 0 )
328 return 1;
329 }
330 return 0;
331 }
333 static int hvm_load_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
334 {
335 int vcpuid, rc;
336 struct vcpu *v;
337 struct hvm_hw_cpu ctxt;
338 struct vcpu_guest_context *vc;
340 /* Which vcpu is this? */
341 vcpuid = hvm_load_instance(h);
342 if ( vcpuid >= MAX_VIRT_CPUS || (v = d->vcpu[vcpuid]) == NULL )
343 {
344 gdprintk(XENLOG_ERR, "HVM restore: domain has no vcpu %u\n", vcpuid);
345 return -EINVAL;
346 }
347 vc = &v->arch.guest_context;
349 /* Need to init this vcpu before loading its contents */
350 LOCK_BIGLOCK(d);
351 rc = v->is_initialised ? 0 : boot_vcpu(d, vcpuid, vc);
352 UNLOCK_BIGLOCK(d);
353 if ( rc != 0 )
354 return rc;
356 if ( hvm_load_entry(CPU, h, &ctxt) != 0 )
357 return -EINVAL;
359 /* Sanity check some control registers. */
360 if ( (ctxt.cr0 & HVM_CR0_GUEST_RESERVED_BITS) ||
361 !(ctxt.cr0 & X86_CR0_ET) ||
362 ((ctxt.cr0 & (X86_CR0_PE|X86_CR0_PG)) == X86_CR0_PG) )
363 {
364 gdprintk(XENLOG_ERR, "HVM restore: bad CR0 0x%"PRIx64"\n",
365 ctxt.cr0);
366 return -EINVAL;
367 }
369 if ( ctxt.cr4 & HVM_CR4_GUEST_RESERVED_BITS )
370 {
371 gdprintk(XENLOG_ERR, "HVM restore: bad CR4 0x%"PRIx64"\n",
372 ctxt.cr4);
373 return -EINVAL;
374 }
376 if ( (ctxt.msr_efer & ~(EFER_FFXSE | EFER_LME | EFER_LMA |
377 EFER_NX | EFER_SCE)) ||
378 ((sizeof(long) != 8) && (ctxt.msr_efer & EFER_LME)) ||
379 (!cpu_has_nx && (ctxt.msr_efer & EFER_NX)) ||
380 (!cpu_has_syscall && (ctxt.msr_efer & EFER_SCE)) ||
381 (!cpu_has_ffxsr && (ctxt.msr_efer & EFER_FFXSE)) ||
382 ((ctxt.msr_efer & (EFER_LME|EFER_LMA)) == EFER_LMA) )
383 {
384 gdprintk(XENLOG_ERR, "HVM restore: bad EFER 0x%"PRIx64"\n",
385 ctxt.msr_efer);
386 return -EINVAL;
387 }
389 /* Architecture-specific vmcs/vmcb bits */
390 if ( hvm_funcs.load_cpu_ctxt(v, &ctxt) < 0 )
391 return -EINVAL;
393 /* Other vcpu register state */
394 memcpy(&vc->fpu_ctxt, ctxt.fpu_regs, sizeof(ctxt.fpu_regs));
395 vc->user_regs.eax = ctxt.rax;
396 vc->user_regs.ebx = ctxt.rbx;
397 vc->user_regs.ecx = ctxt.rcx;
398 vc->user_regs.edx = ctxt.rdx;
399 vc->user_regs.ebp = ctxt.rbp;
400 vc->user_regs.esi = ctxt.rsi;
401 vc->user_regs.edi = ctxt.rdi;
402 vc->user_regs.esp = ctxt.rsp;
403 vc->user_regs.eip = ctxt.rip;
404 vc->user_regs.eflags = ctxt.rflags | 2;
405 #ifdef __x86_64__
406 vc->user_regs.r8 = ctxt.r8;
407 vc->user_regs.r9 = ctxt.r9;
408 vc->user_regs.r10 = ctxt.r10;
409 vc->user_regs.r11 = ctxt.r11;
410 vc->user_regs.r12 = ctxt.r12;
411 vc->user_regs.r13 = ctxt.r13;
412 vc->user_regs.r14 = ctxt.r14;
413 vc->user_regs.r15 = ctxt.r15;
414 #endif
415 vc->debugreg[0] = ctxt.dr0;
416 vc->debugreg[1] = ctxt.dr1;
417 vc->debugreg[2] = ctxt.dr2;
418 vc->debugreg[3] = ctxt.dr3;
419 vc->debugreg[6] = ctxt.dr6;
420 vc->debugreg[7] = ctxt.dr7;
422 vc->flags = VGCF_online;
423 v->fpu_initialised = 1;
425 /* Auxiliary processors should be woken immediately. */
426 if ( test_and_clear_bit(_VPF_down, &v->pause_flags) )
427 vcpu_wake(v);
429 return 0;
430 }
432 HVM_REGISTER_SAVE_RESTORE(CPU, hvm_save_cpu_ctxt, hvm_load_cpu_ctxt,
433 1, HVMSR_PER_VCPU);
435 extern int reset_vmsr(struct mtrr_state *m, u64 *p);
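/*
 * VCPU bring-up order below: vlapic, then vendor (VMX/SVM) state, then the
 * ioreq event channel, then MTRR/PAT defaults via reset_vmsr().  VCPU0
 * additionally owns the platform timers (PIT, RTC, PM timer, HPET) and
 * starts with its guest TSC at zero.
 */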
437 int hvm_vcpu_initialise(struct vcpu *v)
438 {
439 int rc;
441 if ( (rc = vlapic_init(v)) != 0 )
442 goto fail1;
444 if ( (rc = hvm_funcs.vcpu_initialise(v)) != 0 )
445 goto fail2;
447 /* Create ioreq event channel. */
448 rc = alloc_unbound_xen_event_channel(v, 0);
449 if ( rc < 0 )
450 goto fail3;
452 /* Register ioreq event channel. */
453 v->arch.hvm_vcpu.xen_port = rc;
454 spin_lock(&v->domain->arch.hvm_domain.ioreq.lock);
455 if ( v->domain->arch.hvm_domain.ioreq.va != NULL )
456 get_ioreq(v)->vp_eport = v->arch.hvm_vcpu.xen_port;
457 spin_unlock(&v->domain->arch.hvm_domain.ioreq.lock);
459 spin_lock_init(&v->arch.hvm_vcpu.tm_lock);
460 INIT_LIST_HEAD(&v->arch.hvm_vcpu.tm_list);
462 rc = reset_vmsr(&v->arch.hvm_vcpu.mtrr, &v->arch.hvm_vcpu.pat_cr);
463 if ( rc != 0 )
464 goto fail3;
466 v->arch.guest_context.user_regs.eflags = 2;
468 if ( v->vcpu_id == 0 )
469 {
470 /* NB. All these really belong in hvm_domain_initialise(). */
471 pit_init(v, cpu_khz);
472 rtc_init(v, RTC_PORT(0));
473 pmtimer_init(v);
474 hpet_init(v);
476 /* Init guest TSC to start from zero. */
477 hvm_set_guest_time(v, 0);
479 /* Can start up without SIPI-SIPI or setvcpucontext domctl. */
480 v->is_initialised = 1;
481 clear_bit(_VPF_down, &v->pause_flags);
482 }
484 return 0;
486 fail3:
487 hvm_funcs.vcpu_destroy(v);
488 fail2:
489 vlapic_destroy(v);
490 fail1:
491 return rc;
492 }
494 void hvm_vcpu_destroy(struct vcpu *v)
495 {
496 vlapic_destroy(v);
497 hvm_funcs.vcpu_destroy(v);
499 /* Event channel is already freed by evtchn_destroy(). */
500 /*free_xen_event_channel(v, v->arch.hvm_vcpu.xen_port);*/
501 }
504 void hvm_vcpu_reset(struct vcpu *v)
505 {
506 vcpu_pause(v);
508 vlapic_reset(vcpu_vlapic(v));
510 hvm_funcs.vcpu_initialise(v);
512 set_bit(_VPF_down, &v->pause_flags);
513 clear_bit(_VPF_blocked, &v->pause_flags);
514 v->fpu_initialised = 0;
515 v->fpu_dirtied = 0;
516 v->is_initialised = 0;
518 vcpu_unpause(v);
519 }
521 static void hvm_vcpu_down(void)
522 {
523 struct vcpu *v = current;
524 struct domain *d = v->domain;
525 int online_count = 0;
527 gdprintk(XENLOG_INFO, "DOM%d/VCPU%d: going offline.\n",
528 d->domain_id, v->vcpu_id);
530 /* Doesn't halt us immediately, but we'll never return to guest context. */
531 set_bit(_VPF_down, &v->pause_flags);
532 vcpu_sleep_nosync(v);
534 /* Any other VCPUs online? ... */
535 LOCK_BIGLOCK(d);
536 for_each_vcpu ( d, v )
537 if ( !test_bit(_VPF_down, &v->pause_flags) )
538 online_count++;
539 UNLOCK_BIGLOCK(d);
541 /* ... Shut down the domain if not. */
542 if ( online_count == 0 )
543 {
544 gdprintk(XENLOG_INFO, "DOM%d: all CPUs offline -- powering off.\n",
545 d->domain_id);
546 domain_shutdown(d, SHUTDOWN_poweroff);
547 }
548 }
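/*
 * hvm_send_assist_req() hands the current I/O request to the device model:
 * the ioreq state moves NONE -> READY and the event channel is notified;
 * the VCPU then blocks until hvm_do_resume() sees IORESP_READY and calls
 * hvm_io_assist() to complete the emulation.
 */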
550 void hvm_send_assist_req(struct vcpu *v)
551 {
552 ioreq_t *p;
554 if ( unlikely(!vcpu_start_shutdown_deferral(v)) )
555 return; /* implicitly bins the i/o operation */
557 p = &get_ioreq(v)->vp_ioreq;
558 if ( unlikely(p->state != STATE_IOREQ_NONE) )
559 {
560 /* This indicates a bug in the device model. Crash the domain. */
561 gdprintk(XENLOG_ERR, "Device model set bad IO state %d.\n", p->state);
562 domain_crash_synchronous();
563 }
565 prepare_wait_on_xen_event_channel(v->arch.hvm_vcpu.xen_port);
567 /*
568 * Following happens /after/ blocking and setting up ioreq contents.
569 * prepare_wait_on_xen_event_channel() is an implicit barrier.
570 */
571 p->state = STATE_IOREQ_READY;
572 notify_via_xen_event_channel(v->arch.hvm_vcpu.xen_port);
573 }
575 void hvm_hlt(unsigned long rflags)
576 {
577 /*
578 * If we halt with interrupts disabled, that's a pretty sure sign that we
579 * want to shut down. In a real processor, NMIs are the only way to break
580 * out of this.
581 */
582 if ( unlikely(!(rflags & X86_EFLAGS_IF)) )
583 return hvm_vcpu_down();
585 do_sched_op_compat(SCHEDOP_block, 0);
586 }
588 void hvm_triple_fault(void)
589 {
590 struct vcpu *v = current;
591 gdprintk(XENLOG_INFO, "Triple fault on VCPU%d - "
592 "invoking HVM system reset.\n", v->vcpu_id);
593 domain_shutdown(v->domain, SHUTDOWN_reboot);
594 }
596 int hvm_set_efer(uint64_t value)
597 {
598 struct vcpu *v = current;
600 value &= ~EFER_LMA;
602 if ( (value & ~(EFER_FFXSE | EFER_LME | EFER_NX | EFER_SCE)) ||
603 ((sizeof(long) != 8) && (value & EFER_LME)) ||
604 (!cpu_has_nx && (value & EFER_NX)) ||
605 (!cpu_has_syscall && (value & EFER_SCE)) ||
606 (!cpu_has_ffxsr && (value & EFER_FFXSE)) )
607 {
608 gdprintk(XENLOG_WARNING, "Trying to set reserved bit in "
609 "EFER: %"PRIx64"\n", value);
610 hvm_inject_exception(TRAP_gp_fault, 0, 0);
611 return 0;
612 }
614 if ( ((value ^ v->arch.hvm_vcpu.guest_efer) & EFER_LME) &&
615 hvm_paging_enabled(v) )
616 {
617 gdprintk(XENLOG_WARNING,
618 "Trying to change EFER.LME with paging enabled\n");
619 hvm_inject_exception(TRAP_gp_fault, 0, 0);
620 return 0;
621 }
623 value |= v->arch.hvm_vcpu.guest_efer & EFER_LMA;
624 v->arch.hvm_vcpu.guest_efer = value;
625 hvm_update_guest_efer(v);
627 return 1;
628 }
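/*
 * The helpers below support "no fill" (UC) cache mode, which matters only
 * when a physical device is assigned to the domain: when any VCPU sets
 * CR0.CD, hvm_set_cr0() flushes the host caches and blows the shadow
 * pagetables; normal caching is restored only once every initialised VCPU
 * has left no-fill mode with identical MTRR/PAT state.
 */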
630 extern void shadow_blow_tables_per_domain(struct domain *d);
631 extern bool_t mtrr_pat_not_equal(struct vcpu *vd, struct vcpu *vs);
633 /* Exit UC mode only if all VCPUs agree on MTRR/PAT and are not in no_fill. */
634 static bool_t domain_exit_uc_mode(struct vcpu *v)
635 {
636 struct domain *d = v->domain;
637 struct vcpu *vs;
639 for_each_vcpu ( d, vs )
640 {
641 if ( (vs == v) || !vs->is_initialised )
642 continue;
643 if ( (vs->arch.hvm_vcpu.cache_mode == NO_FILL_CACHE_MODE) ||
644 mtrr_pat_not_equal(vs, v) )
645 return 0;
646 }
648 return 1;
649 }
651 static void local_flush_cache(void *info)
652 {
653 wbinvd();
654 }
656 int hvm_set_cr0(unsigned long value)
657 {
658 struct vcpu *v = current;
659 p2m_type_t p2mt;
660 unsigned long gfn, mfn, old_value = v->arch.hvm_vcpu.guest_cr[0];
662 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx", value);
664 if ( (u32)value != value )
665 {
666 HVM_DBG_LOG(DBG_LEVEL_1,
667 "Guest attempts to set upper 32 bits in CR0: %lx",
668 value);
669 hvm_inject_exception(TRAP_gp_fault, 0, 0);
670 return 0;
671 }
673 value &= ~HVM_CR0_GUEST_RESERVED_BITS;
675 /* ET is reserved and should always be 1. */
676 value |= X86_CR0_ET;
678 if ( (value & (X86_CR0_PE|X86_CR0_PG)) == X86_CR0_PG )
679 {
680 hvm_inject_exception(TRAP_gp_fault, 0, 0);
681 return 0;
682 }
684 if ( (value & X86_CR0_PG) && !(old_value & X86_CR0_PG) )
685 {
686 if ( v->arch.hvm_vcpu.guest_efer & EFER_LME )
687 {
688 if ( !(v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PAE) )
689 {
690 HVM_DBG_LOG(DBG_LEVEL_1, "Enable paging before PAE enable");
691 hvm_inject_exception(TRAP_gp_fault, 0, 0);
692 return 0;
693 }
694 HVM_DBG_LOG(DBG_LEVEL_1, "Enabling long mode");
695 v->arch.hvm_vcpu.guest_efer |= EFER_LMA;
696 hvm_update_guest_efer(v);
697 }
699 if ( !paging_mode_hap(v->domain) )
700 {
701 /* The guest CR3 must be pointing to the guest physical. */
702 gfn = v->arch.hvm_vcpu.guest_cr[3]>>PAGE_SHIFT;
703 mfn = mfn_x(gfn_to_mfn_current(gfn, &p2mt));
704 if ( !p2m_is_ram(p2mt) || !mfn_valid(mfn) ||
705 !get_page(mfn_to_page(mfn), v->domain))
706 {
707 gdprintk(XENLOG_ERR, "Invalid CR3 value = %lx (mfn=%lx)\n",
708 v->arch.hvm_vcpu.guest_cr[3], mfn);
709 domain_crash(v->domain);
710 return 0;
711 }
713 /* Now arch.guest_table points to machine physical. */
714 v->arch.guest_table = pagetable_from_pfn(mfn);
716 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
717 v->arch.hvm_vcpu.guest_cr[3], mfn);
718 }
719 }
720 else if ( !(value & X86_CR0_PG) && (old_value & X86_CR0_PG) )
721 {
722 /* When CR0.PG is cleared, LMA is cleared immediately. */
723 if ( hvm_long_mode_enabled(v) )
724 {
725 v->arch.hvm_vcpu.guest_efer &= ~EFER_LMA;
726 hvm_update_guest_efer(v);
727 }
729 if ( !paging_mode_hap(v->domain) )
730 {
731 put_page(pagetable_get_page(v->arch.guest_table));
732 v->arch.guest_table = pagetable_null();
733 }
734 }
736 if ( !list_empty(&(domain_hvm_iommu(v->domain)->pdev_list)) )
737 {
738 if ( (value & X86_CR0_CD) && !(value & X86_CR0_NW) )
739 {
740 /* Entering no fill cache mode. */
741 spin_lock(&v->domain->arch.hvm_domain.uc_lock);
742 v->arch.hvm_vcpu.cache_mode = NO_FILL_CACHE_MODE;
744 if ( !v->domain->arch.hvm_domain.is_in_uc_mode )
745 {
746 /* Flush physical caches. */
747 on_each_cpu(local_flush_cache, NULL, 1, 1);
748 /* Shadow pagetables must recognise UC mode. */
749 v->domain->arch.hvm_domain.is_in_uc_mode = 1;
750 shadow_blow_tables_per_domain(v->domain);
751 }
752 spin_unlock(&v->domain->arch.hvm_domain.uc_lock);
753 }
754 else if ( !(value & (X86_CR0_CD | X86_CR0_NW)) &&
755 (v->arch.hvm_vcpu.cache_mode == NO_FILL_CACHE_MODE) )
756 {
757 /* Exit from no fill cache mode. */
758 spin_lock(&v->domain->arch.hvm_domain.uc_lock);
759 v->arch.hvm_vcpu.cache_mode = NORMAL_CACHE_MODE;
761 if ( domain_exit_uc_mode(v) )
762 {
763 /* Shadow pagetables must recognise normal caching mode. */
764 v->domain->arch.hvm_domain.is_in_uc_mode = 0;
765 shadow_blow_tables_per_domain(v->domain);
766 }
767 spin_unlock(&v->domain->arch.hvm_domain.uc_lock);
768 }
769 }
771 v->arch.hvm_vcpu.guest_cr[0] = value;
772 hvm_update_guest_cr(v, 0);
774 if ( (value ^ old_value) & X86_CR0_PG )
775 paging_update_paging_modes(v);
777 return 1;
778 }
780 int hvm_set_cr3(unsigned long value)
781 {
782 unsigned long mfn;
783 p2m_type_t p2mt;
784 struct vcpu *v = current;
786 if ( hvm_paging_enabled(v) && !paging_mode_hap(v->domain) &&
787 (value != v->arch.hvm_vcpu.guest_cr[3]) )
788 {
789 /* Shadow-mode CR3 change. Check PDBR and update refcounts. */
790 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
791 mfn = mfn_x(gfn_to_mfn_current(value >> PAGE_SHIFT, &p2mt));
792 if ( !p2m_is_ram(p2mt) || !mfn_valid(mfn) ||
793 !get_page(mfn_to_page(mfn), v->domain) )
794 goto bad_cr3;
796 put_page(pagetable_get_page(v->arch.guest_table));
797 v->arch.guest_table = pagetable_from_pfn(mfn);
799 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
800 }
802 v->arch.hvm_vcpu.guest_cr[3] = value;
803 paging_update_cr3(v);
804 return 1;
806 bad_cr3:
807 gdprintk(XENLOG_ERR, "Invalid CR3\n");
808 domain_crash(v->domain);
809 return 0;
810 }
812 int hvm_set_cr4(unsigned long value)
813 {
814 struct vcpu *v = current;
815 unsigned long old_cr;
817 if ( value & HVM_CR4_GUEST_RESERVED_BITS )
818 {
819 HVM_DBG_LOG(DBG_LEVEL_1,
820 "Guest attempts to set reserved bit in CR4: %lx",
821 value);
822 goto gpf;
823 }
825 if ( !(value & X86_CR4_PAE) && hvm_long_mode_enabled(v) )
826 {
827 HVM_DBG_LOG(DBG_LEVEL_1, "Guest cleared CR4.PAE while "
828 "EFER.LMA is set");
829 goto gpf;
830 }
832 old_cr = v->arch.hvm_vcpu.guest_cr[4];
833 v->arch.hvm_vcpu.guest_cr[4] = value;
834 hvm_update_guest_cr(v, 4);
836 /* Modifying CR4.{PSE,PAE,PGE} invalidates all TLB entries, inc. Global. */
837 if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
838 paging_update_paging_modes(v);
840 return 1;
842 gpf:
843 hvm_inject_exception(TRAP_gp_fault, 0, 0);
844 return 0;
845 }
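/*
 * hvm_virtual_to_linear_addr() applies segmentation: outside long mode it
 * checks the access type against the segment type and the offset against
 * the segment limit before adding the base.  For an expand-down data
 * segment with limit LIMIT, valid offsets run from LIMIT+1 up to 0xFFFF
 * (or 0xFFFFFFFF when attr.db is set), hence the "offset <= limit" fault
 * check below.  In long mode only the FS/GS bases apply and the result
 * must be canonical.
 */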
847 int hvm_virtual_to_linear_addr(
848 enum x86_segment seg,
849 struct segment_register *reg,
850 unsigned long offset,
851 unsigned int bytes,
852 enum hvm_access_type access_type,
853 unsigned int addr_size,
854 unsigned long *linear_addr)
855 {
856 unsigned long addr = offset;
857 uint32_t last_byte;
859 if ( addr_size != 64 )
860 {
861 /*
862 * COMPATIBILITY MODE: Apply segment checks and add base.
863 */
865 switch ( access_type )
866 {
867 case hvm_access_read:
868 if ( (reg->attr.fields.type & 0xa) == 0x8 )
869 goto gpf; /* execute-only code segment */
870 break;
871 case hvm_access_write:
872 if ( (reg->attr.fields.type & 0xa) != 0x2 )
873 goto gpf; /* not a writable data segment */
874 break;
875 default:
876 break;
877 }
879 last_byte = offset + bytes - 1;
881 /* Is this a grows-down data segment? Special limit check if so. */
882 if ( (reg->attr.fields.type & 0xc) == 0x4 )
883 {
884 /* Is upper limit 0xFFFF or 0xFFFFFFFF? */
885 if ( !reg->attr.fields.db )
886 last_byte = (uint16_t)last_byte;
888 /* Check first byte and last byte against respective bounds. */
889 if ( (offset <= reg->limit) || (last_byte < offset) )
890 goto gpf;
891 }
892 else if ( (last_byte > reg->limit) || (last_byte < offset) )
893 goto gpf; /* last byte is beyond limit or wraps 0xFFFFFFFF */
895 /*
896 * Hardware truncates to 32 bits in compatibility mode.
897 * It does not truncate to 16 bits in 16-bit address-size mode.
898 */
899 addr = (uint32_t)(addr + reg->base);
900 }
901 else
902 {
903 /*
904 * LONG MODE: FS and GS add segment base. Addresses must be canonical.
905 */
907 if ( (seg == x86_seg_fs) || (seg == x86_seg_gs) )
908 addr += reg->base;
910 if ( !is_canonical_address(addr) )
911 goto gpf;
912 }
914 *linear_addr = addr;
915 return 1;
917 gpf:
918 return 0;
919 }
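/*
 * hvm_map()/hvm_unmap() give the hypervisor a temporary mapping of a single
 * guest page: the range must not cross a page boundary, a #PF is injected
 * into the guest if translation fails, and the page is marked dirty because
 * callers may write to it (e.g. descriptor Accessed bits, TSS fields).
 */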
921 static void *hvm_map(unsigned long va, int size)
922 {
923 unsigned long gfn, mfn;
924 p2m_type_t p2mt;
926 if ( ((va & ~PAGE_MASK) + size) > PAGE_SIZE )
927 {
928 hvm_inject_exception(TRAP_page_fault, PFEC_write_access,
929 (va + PAGE_SIZE - 1) & PAGE_MASK);
930 return NULL;
931 }
933 gfn = paging_gva_to_gfn(current, va);
934 mfn = mfn_x(gfn_to_mfn_current(gfn, &p2mt));
935 if ( !p2m_is_ram(p2mt) )
936 {
937 hvm_inject_exception(TRAP_page_fault, PFEC_write_access, va);
938 return NULL;
939 }
941 ASSERT(mfn_valid(mfn));
943 paging_mark_dirty(current->domain, mfn);
945 return (char *)map_domain_page(mfn) + (va & ~PAGE_MASK);
946 }
948 static void hvm_unmap(void *p)
949 {
950 if ( p )
951 unmap_domain_page(p);
952 }
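/*
 * hvm_load_segment_selector() emulates a protected-mode selector load for
 * the task-switch path: it validates the descriptor (presence, type,
 * DPL/RPL/CPL), sets the Accessed bit in the in-memory descriptor with
 * cmpxchg(), and injects #TS/#NP on failure.
 */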
954 static int hvm_load_segment_selector(
955 struct vcpu *v, enum x86_segment seg, uint16_t sel)
956 {
956 {
957 struct segment_register desctab, cs, segr;
958 struct desc_struct *pdesc, desc;
959 u8 dpl, rpl, cpl;
960 int fault_type = TRAP_invalid_tss;
962 /* NULL selector? */
963 if ( (sel & 0xfffc) == 0 )
964 {
965 if ( (seg == x86_seg_cs) || (seg == x86_seg_ss) )
966 goto fail;
967 memset(&segr, 0, sizeof(segr));
968 hvm_set_segment_register(v, seg, &segr);
969 return 0;
970 }
972 /* LDT descriptor must be in the GDT. */
973 if ( (seg == x86_seg_ldtr) && (sel & 4) )
974 goto fail;
976 hvm_get_segment_register(v, x86_seg_cs, &cs);
977 hvm_get_segment_register(
978 v, (sel & 4) ? x86_seg_ldtr : x86_seg_gdtr, &desctab);
980 /* Check against descriptor table limit. */
981 if ( ((sel & 0xfff8) + 7) > desctab.limit )
982 goto fail;
984 pdesc = hvm_map(desctab.base + (sel & 0xfff8), 8);
985 if ( pdesc == NULL )
986 goto hvm_map_fail;
988 do {
989 desc = *pdesc;
991 /* Segment present in memory? */
992 if ( !(desc.b & (1u<<15)) )
993 {
994 fault_type = TRAP_no_segment;
995 goto unmap_and_fail;
996 }
998 /* LDT descriptor is a system segment. All others are code/data. */
999 if ( (desc.b & (1u<<12)) == ((seg == x86_seg_ldtr) << 12) )
1000 goto unmap_and_fail;
1002 dpl = (desc.b >> 13) & 3;
1003 rpl = sel & 3;
1004 cpl = cs.sel & 3;
1006 switch ( seg )
1007 {
1008 case x86_seg_cs:
1009 /* Code segment? */
1010 if ( !(desc.b & (1u<<11)) )
1011 goto unmap_and_fail;
1012 /* Non-conforming segment: check DPL against RPL. */
1013 if ( ((desc.b & (6u<<9)) != 6) && (dpl != rpl) )
1014 goto unmap_and_fail;
1015 break;
1016 case x86_seg_ss:
1017 /* Writable data segment? */
1018 if ( (desc.b & (5u<<9)) != (1u<<9) )
1019 goto unmap_and_fail;
1020 if ( (dpl != cpl) || (dpl != rpl) )
1021 goto unmap_and_fail;
1022 break;
1023 case x86_seg_ldtr:
1024 /* LDT system segment? */
1025 if ( (desc.b & (15u<<8)) != (2u<<8) )
1026 goto unmap_and_fail;
1027 goto skip_accessed_flag;
1028 default:
1029 /* Readable code or data segment? */
1030 if ( (desc.b & (5u<<9)) == (4u<<9) )
1031 goto unmap_and_fail;
1032 /* Non-conforming segment: check DPL against RPL and CPL. */
1033 if ( ((desc.b & (6u<<9)) != 6) && ((dpl < cpl) || (dpl < rpl)) )
1034 goto unmap_and_fail;
1035 break;
1036 }
1037 } while ( !(desc.b & 0x100) && /* Ensure Accessed flag is set */
1038 (cmpxchg(&pdesc->b, desc.b, desc.b | 0x100) != desc.b) );
1040 /* Force the Accessed flag in our local copy. */
1041 desc.b |= 0x100;
1043 skip_accessed_flag:
1044 hvm_unmap(pdesc);
1046 segr.base = (((desc.b << 0) & 0xff000000u) |
1047 ((desc.b << 16) & 0x00ff0000u) |
1048 ((desc.a >> 16) & 0x0000ffffu));
1049 segr.attr.bytes = (((desc.b >> 8) & 0x00ffu) |
1050 ((desc.b >> 12) & 0x0f00u));
1051 segr.limit = (desc.b & 0x000f0000u) | (desc.a & 0x0000ffffu);
1052 if ( segr.attr.fields.g )
1053 segr.limit = (segr.limit << 12) | 0xfffu;
1054 segr.sel = sel;
1055 hvm_set_segment_register(v, seg, &segr);
1057 return 0;
1059 unmap_and_fail:
1060 hvm_unmap(pdesc);
1061 fail:
1062 hvm_inject_exception(fault_type, sel & 0xfffc, 0);
1063 hvm_map_fail:
1064 return 1;
1065 }
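/*
 * hvm_task_switch() emulates a hardware task switch through a legacy 32-bit
 * TSS: the anonymous struct below mirrors the hardware TSS layout, the
 * outgoing register state is written back to the old TSS, and the incoming
 * TSS supplies CR3, general registers and segment selectors for the new
 * task.
 */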
1067 void hvm_task_switch(
1068 uint16_t tss_sel, enum hvm_task_switch_reason taskswitch_reason,
1069 int32_t errcode)
1070 {
1071 struct vcpu *v = current;
1072 struct cpu_user_regs *regs = guest_cpu_user_regs();
1073 struct segment_register gdt, tr, prev_tr, segr;
1074 struct desc_struct *optss_desc = NULL, *nptss_desc = NULL, tss_desc;
1075 unsigned long eflags;
1076 int exn_raised;
1077 struct {
1078 u16 back_link,__blh;
1079 u32 esp0;
1080 u16 ss0, _0;
1081 u32 esp1;
1082 u16 ss1, _1;
1083 u32 esp2;
1084 u16 ss2, _2;
1085 u32 cr3, eip, eflags, eax, ecx, edx, ebx, esp, ebp, esi, edi;
1086 u16 es, _3, cs, _4, ss, _5, ds, _6, fs, _7, gs, _8, ldt, _9;
1087 u16 trace, iomap;
1088 } *ptss, tss;
1090 hvm_get_segment_register(v, x86_seg_gdtr, &gdt);
1091 hvm_get_segment_register(v, x86_seg_tr, &prev_tr);
1093 if ( ((tss_sel & 0xfff8) + 7) > gdt.limit )
1094 {
1095 hvm_inject_exception((taskswitch_reason == TSW_iret) ?
1096 TRAP_invalid_tss : TRAP_gp_fault,
1097 tss_sel & 0xfff8, 0);
1098 goto out;
1099 }
1101 optss_desc = hvm_map(gdt.base + (prev_tr.sel & 0xfff8), 8);
1102 if ( optss_desc == NULL )
1103 goto out;
1105 nptss_desc = hvm_map(gdt.base + (tss_sel & 0xfff8), 8);
1106 if ( nptss_desc == NULL )
1107 goto out;
1109 tss_desc = *nptss_desc;
1110 tr.sel = tss_sel;
1111 tr.base = (((tss_desc.b << 0) & 0xff000000u) |
1112 ((tss_desc.b << 16) & 0x00ff0000u) |
1113 ((tss_desc.a >> 16) & 0x0000ffffu));
1114 tr.attr.bytes = (((tss_desc.b >> 8) & 0x00ffu) |
1115 ((tss_desc.b >> 12) & 0x0f00u));
1116 tr.limit = (tss_desc.b & 0x000f0000u) | (tss_desc.a & 0x0000ffffu);
1117 if ( tr.attr.fields.g )
1118 tr.limit = (tr.limit << 12) | 0xfffu;
1120 if ( !tr.attr.fields.p )
1121 {
1122 hvm_inject_exception(TRAP_no_segment, tss_sel & 0xfff8, 0);
1123 goto out;
1124 }
1126 if ( tr.attr.fields.type != ((taskswitch_reason == TSW_iret) ? 0xb : 0x9) )
1127 {
1128 hvm_inject_exception(
1129 (taskswitch_reason == TSW_iret) ? TRAP_invalid_tss : TRAP_gp_fault,
1130 tss_sel & 0xfff8, 0);
1131 goto out;
1132 }
1134 if ( !tr.attr.fields.g && (tr.limit < (sizeof(tss)-1)) )
1135 {
1136 hvm_inject_exception(TRAP_invalid_tss, tss_sel & 0xfff8, 0);
1137 goto out;
1138 }
1140 ptss = hvm_map(prev_tr.base, sizeof(tss));
1141 if ( ptss == NULL )
1142 goto out;
1144 eflags = regs->eflags;
1145 if ( taskswitch_reason == TSW_iret )
1146 eflags &= ~X86_EFLAGS_NT;
1148 ptss->cr3 = v->arch.hvm_vcpu.guest_cr[3];
1149 ptss->eip = regs->eip;
1150 ptss->eflags = eflags;
1151 ptss->eax = regs->eax;
1152 ptss->ecx = regs->ecx;
1153 ptss->edx = regs->edx;
1154 ptss->ebx = regs->ebx;
1155 ptss->esp = regs->esp;
1156 ptss->ebp = regs->ebp;
1157 ptss->esi = regs->esi;
1158 ptss->edi = regs->edi;
1160 hvm_get_segment_register(v, x86_seg_es, &segr);
1161 ptss->es = segr.sel;
1162 hvm_get_segment_register(v, x86_seg_cs, &segr);
1163 ptss->cs = segr.sel;
1164 hvm_get_segment_register(v, x86_seg_ss, &segr);
1165 ptss->ss = segr.sel;
1166 hvm_get_segment_register(v, x86_seg_ds, &segr);
1167 ptss->ds = segr.sel;
1168 hvm_get_segment_register(v, x86_seg_fs, &segr);
1169 ptss->fs = segr.sel;
1170 hvm_get_segment_register(v, x86_seg_gs, &segr);
1171 ptss->gs = segr.sel;
1172 hvm_get_segment_register(v, x86_seg_ldtr, &segr);
1173 ptss->ldt = segr.sel;
1175 hvm_unmap(ptss);
1177 ptss = hvm_map(tr.base, sizeof(tss));
1178 if ( ptss == NULL )
1179 goto out;
1181 if ( !hvm_set_cr3(ptss->cr3) )
1182 {
1183 hvm_unmap(ptss);
1184 goto out;
1185 }
1187 regs->eip = ptss->eip;
1188 regs->eflags = ptss->eflags | 2;
1189 regs->eax = ptss->eax;
1190 regs->ecx = ptss->ecx;
1191 regs->edx = ptss->edx;
1192 regs->ebx = ptss->ebx;
1193 regs->esp = ptss->esp;
1194 regs->ebp = ptss->ebp;
1195 regs->esi = ptss->esi;
1196 regs->edi = ptss->edi;
1198 if ( (taskswitch_reason == TSW_call_or_int) )
1199 {
1200 regs->eflags |= X86_EFLAGS_NT;
1201 ptss->back_link = prev_tr.sel;
1202 }
1204 exn_raised = 0;
1205 if ( hvm_load_segment_selector(v, x86_seg_es, ptss->es) ||
1206 hvm_load_segment_selector(v, x86_seg_cs, ptss->cs) ||
1207 hvm_load_segment_selector(v, x86_seg_ss, ptss->ss) ||
1208 hvm_load_segment_selector(v, x86_seg_ds, ptss->ds) ||
1209 hvm_load_segment_selector(v, x86_seg_fs, ptss->fs) ||
1210 hvm_load_segment_selector(v, x86_seg_gs, ptss->gs) ||
1211 hvm_load_segment_selector(v, x86_seg_ldtr, ptss->ldt) )
1212 exn_raised = 1;
1214 if ( (ptss->trace & 1) && !exn_raised )
1215 hvm_inject_exception(TRAP_debug, tss_sel & 0xfff8, 0);
1217 hvm_unmap(ptss);
1219 tr.attr.fields.type = 0xb; /* busy 32-bit tss */
1220 hvm_set_segment_register(v, x86_seg_tr, &tr);
1222 v->arch.hvm_vcpu.guest_cr[0] |= X86_CR0_TS;
1223 hvm_update_guest_cr(v, 0);
1225 if ( (taskswitch_reason == TSW_iret) ||
1226 (taskswitch_reason == TSW_jmp) )
1227 clear_bit(41, optss_desc); /* clear B flag of old task */
1229 if ( taskswitch_reason != TSW_iret )
1230 set_bit(41, nptss_desc); /* set B flag of new task */
1232 if ( errcode >= 0 )
1233 {
1234 struct segment_register reg;
1235 unsigned long linear_addr;
1236 regs->esp -= 4;
1237 hvm_get_segment_register(current, x86_seg_ss, &reg);
1238 /* Todo: do not ignore access faults here. */
1239 if ( hvm_virtual_to_linear_addr(x86_seg_ss, &reg, regs->esp,
1240 4, hvm_access_write, 32,
1241 &linear_addr) )
1242 hvm_copy_to_guest_virt(linear_addr, &errcode, 4);
1243 }
1245 out:
1246 hvm_unmap(optss_desc);
1247 hvm_unmap(nptss_desc);
1248 }
1250 /*
1251 * __hvm_copy():
1252 * @buf = hypervisor buffer
1253 * @addr = guest address to copy to/from
1254 * @size = number of bytes to copy
1255 * @dir = copy *to* guest (TRUE) or *from* guest (FALSE)?
1256 * @virt = addr is *virtual* (TRUE) or *guest physical* (FALSE)?
1257 * Returns number of bytes failed to copy (0 == complete success).
1258 */
1259 static int __hvm_copy(void *buf, paddr_t addr, int size, int dir, int virt)
1260 {
1261 unsigned long gfn, mfn;
1262 p2m_type_t p2mt;
1263 char *p;
1264 int count, todo;
1266 todo = size;
1267 while ( todo > 0 )
1268 {
1269 count = min_t(int, PAGE_SIZE - (addr & ~PAGE_MASK), todo);
1271 if ( virt )
1272 gfn = paging_gva_to_gfn(current, addr);
1273 else
1274 gfn = addr >> PAGE_SHIFT;
1276 mfn = mfn_x(gfn_to_mfn_current(gfn, &p2mt));
1278 if ( !p2m_is_ram(p2mt) )
1279 return todo;
1280 ASSERT(mfn_valid(mfn));
1282 p = (char *)map_domain_page(mfn) + (addr & ~PAGE_MASK);
1284 if ( dir )
1285 {
1286 memcpy(p, buf, count); /* dir == TRUE: *to* guest */
1287 paging_mark_dirty(current->domain, mfn);
1288 }
1289 else
1290 memcpy(buf, p, count); /* dir == FALSE: *from* guest */
1292 unmap_domain_page(p);
1294 addr += count;
1295 buf += count;
1296 todo -= count;
1297 }
1299 return 0;
1300 }
1302 int hvm_copy_to_guest_phys(paddr_t paddr, void *buf, int size)
1303 {
1304 return __hvm_copy(buf, paddr, size, 1, 0);
1305 }
1307 int hvm_copy_from_guest_phys(void *buf, paddr_t paddr, int size)
1308 {
1309 return __hvm_copy(buf, paddr, size, 0, 0);
1310 }
1312 int hvm_copy_to_guest_virt(unsigned long vaddr, void *buf, int size)
1313 {
1314 return __hvm_copy(buf, vaddr, size, 1, 1);
1315 }
1317 int hvm_copy_from_guest_virt(void *buf, unsigned long vaddr, int size)
1318 {
1319 return __hvm_copy(buf, vaddr, size, 0, 1);
1320 }
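/*
 * The four wrappers above return the number of bytes that could NOT be
 * copied (0 means complete success), mirroring __hvm_copy().  A minimal,
 * purely illustrative caller sketch (the variable names here are
 * hypothetical and not part of this file):
 *
 *     uint32_t magic = 0xfeedface;
 *     if ( hvm_copy_to_guest_phys(gpa, &magic, sizeof(magic)) != 0 )
 *         gdprintk(XENLOG_WARNING, "copy to gpa %lx failed\n",
 *                  (unsigned long)gpa);
 */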
1323 /* HVM specific printbuf. Mostly used for hvmloader chit-chat. */
1324 void hvm_print_line(struct vcpu *v, const char c)
1325 {
1326 struct hvm_domain *hd = &v->domain->arch.hvm_domain;
1328 spin_lock(&hd->pbuf_lock);
1329 hd->pbuf[hd->pbuf_idx++] = c;
1330 if ( (hd->pbuf_idx == (sizeof(hd->pbuf) - 2)) || (c == '\n') )
1331 {
1332 if ( c != '\n' )
1333 hd->pbuf[hd->pbuf_idx++] = '\n';
1334 hd->pbuf[hd->pbuf_idx] = '\0';
1335 printk(XENLOG_G_DEBUG "HVM%u: %s", v->domain->domain_id, hd->pbuf);
1336 hd->pbuf_idx = 0;
1337 }
1338 spin_unlock(&hd->pbuf_lock);
1339 }
1341 void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
1342 unsigned int *ecx, unsigned int *edx)
1343 {
1344 struct vcpu *v = current;
1346 if ( cpuid_hypervisor_leaves(input, eax, ebx, ecx, edx) )
1347 return;
1349 cpuid(input, eax, ebx, ecx, edx);
1351 switch ( input )
1352 {
1353 case 0x00000001:
1354 __clear_bit(X86_FEATURE_MWAIT & 31, ecx);
1356 if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
1357 __clear_bit(X86_FEATURE_APIC & 31, edx);
1359 #if CONFIG_PAGING_LEVELS >= 3
1360 if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
1361 #endif
1362 __clear_bit(X86_FEATURE_PAE & 31, edx);
1363 __clear_bit(X86_FEATURE_PSE36 & 31, edx);
1364 break;
1366 case 0x80000001:
1367 #if CONFIG_PAGING_LEVELS >= 3
1368 if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
1369 #endif
1370 __clear_bit(X86_FEATURE_NX & 31, edx);
1371 #ifdef __i386__
1372 /* Mask feature for Intel ia32e or AMD long mode. */
1373 __clear_bit(X86_FEATURE_LAHF_LM & 31, ecx);
1374 __clear_bit(X86_FEATURE_LM & 31, edx);
1375 __clear_bit(X86_FEATURE_SYSCALL & 31, edx);
1376 #endif
1377 break;
1378 }
1379 }
1381 static long hvm_grant_table_op(
1382 unsigned int cmd, XEN_GUEST_HANDLE(void) uop, unsigned int count)
1383 {
1384 if ( cmd != GNTTABOP_query_size )
1385 return -ENOSYS; /* all other commands need auditing */
1386 return do_grant_table_op(cmd, uop, count);
1387 }
1389 typedef unsigned long hvm_hypercall_t(
1390 unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
1392 #define HYPERCALL(x) \
1393 [ __HYPERVISOR_ ## x ] = (hvm_hypercall_t *) do_ ## x
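/*
 * Hypercall dispatch: the HYPERCALL() macro above populates the 32-bit and
 * 64-bit tables below, indexed by the hypercall number the guest passes in
 * EAX/RAX; entries left NULL make hvm_do_hypercall() return -ENOSYS.
 */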
1395 #if defined(__i386__)
1397 static hvm_hypercall_t *hvm_hypercall32_table[NR_hypercalls] = {
1398 HYPERCALL(memory_op),
1399 [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
1400 HYPERCALL(xen_version),
1401 HYPERCALL(grant_table_op),
1402 HYPERCALL(event_channel_op),
1403 HYPERCALL(sched_op),
1404 HYPERCALL(hvm_op)
1405 };
1407 #else /* defined(__x86_64__) */
1409 static long do_memory_op_compat32(int cmd, XEN_GUEST_HANDLE(void) arg)
1410 {
1411 extern long do_add_to_physmap(struct xen_add_to_physmap *xatp);
1412 long rc;
1414 switch ( cmd )
1415 {
1416 case XENMEM_add_to_physmap:
1417 {
1418 struct {
1419 domid_t domid;
1420 uint32_t space;
1421 uint32_t idx;
1422 uint32_t gpfn;
1423 } u;
1424 struct xen_add_to_physmap h;
1426 if ( copy_from_guest(&u, arg, 1) )
1427 return -EFAULT;
1429 h.domid = u.domid;
1430 h.space = u.space;
1431 h.idx = u.idx;
1432 h.gpfn = u.gpfn;
1434 this_cpu(guest_handles_in_xen_space) = 1;
1435 rc = do_memory_op(cmd, guest_handle_from_ptr(&h, void));
1436 this_cpu(guest_handles_in_xen_space) = 0;
1438 break;
1439 }
1441 default:
1442 gdprintk(XENLOG_WARNING, "memory_op %d.\n", cmd);
1443 rc = -ENOSYS;
1444 break;
1445 }
1447 return rc;
1448 }
1450 static hvm_hypercall_t *hvm_hypercall64_table[NR_hypercalls] = {
1451 HYPERCALL(memory_op),
1452 [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
1453 HYPERCALL(xen_version),
1454 HYPERCALL(grant_table_op),
1455 HYPERCALL(event_channel_op),
1456 HYPERCALL(sched_op),
1457 HYPERCALL(hvm_op)
1458 };
1460 static hvm_hypercall_t *hvm_hypercall32_table[NR_hypercalls] = {
1461 [ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)do_memory_op_compat32,
1462 [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
1463 HYPERCALL(xen_version),
1464 HYPERCALL(grant_table_op),
1465 HYPERCALL(event_channel_op),
1466 HYPERCALL(sched_op),
1467 HYPERCALL(hvm_op)
1468 };
1470 #endif /* defined(__x86_64__) */
1472 int hvm_do_hypercall(struct cpu_user_regs *regs)
1473 {
1474 int flush, mode = hvm_guest_x86_mode(current);
1475 uint32_t eax = regs->eax;
1477 switch ( mode )
1478 {
1479 #ifdef __x86_64__
1480 case 8:
1481 #endif
1482 case 4:
1483 case 2:
1484 if ( unlikely(ring_3(regs)) )
1485 {
1486 default:
1487 regs->eax = -EPERM;
1488 return HVM_HCALL_completed;
1489 }
1490 case 0:
1491 break;
1492 }
1494 if ( (eax >= NR_hypercalls) || !hvm_hypercall32_table[eax] )
1495 {
1496 regs->eax = -ENOSYS;
1497 return HVM_HCALL_completed;
1498 }
1500 /*
1501 * NB. In future flush only on decrease_reservation.
1502 * For now we also need to flush when pages are added, as qemu-dm is not
1503 * yet capable of faulting pages into an existing valid mapcache bucket.
1504 */
1505 flush = ((eax == __HYPERVISOR_memory_op) ||
1506 (eax == __HYPERVISOR_grant_table_op)); /* needed ? */
1507 this_cpu(hc_preempted) = 0;
1509 #ifdef __x86_64__
1510 if ( mode == 8 )
1511 {
1512 HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u(%lx, %lx, %lx, %lx, %lx)", eax,
1513 regs->rdi, regs->rsi, regs->rdx, regs->r10, regs->r8);
1515 regs->rax = hvm_hypercall64_table[eax](regs->rdi,
1516 regs->rsi,
1517 regs->rdx,
1518 regs->r10,
1519 regs->r8);
1520 }
1521 else
1522 #endif
1523 {
1524 HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u(%x, %x, %x, %x, %x)", eax,
1525 (uint32_t)regs->ebx, (uint32_t)regs->ecx,
1526 (uint32_t)regs->edx, (uint32_t)regs->esi,
1527 (uint32_t)regs->edi);
1529 regs->eax = hvm_hypercall32_table[eax]((uint32_t)regs->ebx,
1530 (uint32_t)regs->ecx,
1531 (uint32_t)regs->edx,
1532 (uint32_t)regs->esi,
1533 (uint32_t)regs->edi);
1534 }
1536 HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u -> %lx",
1537 eax, (unsigned long)regs->eax);
1539 return (this_cpu(hc_preempted) ? HVM_HCALL_preempted :
1540 flush ? HVM_HCALL_invalidate : HVM_HCALL_completed);
1541 }
1543 static void hvm_latch_shinfo_size(struct domain *d)
1544 {
1545 /*
1546 * Called from operations which are among the very first executed by
1547 * PV drivers on initialisation or after save/restore. These are sensible
1548 * points at which to sample the execution mode of the guest and latch
1549 * 32- or 64-bit format for shared state.
1550 */
1551 if ( current->domain == d )
1552 d->arch.has_32bit_shinfo = (hvm_guest_x86_mode(current) != 8);
1553 }
1555 /* Initialise a hypercall transfer page for a VMX domain using
1556 paravirtualised drivers. */
1557 void hvm_hypercall_page_initialise(struct domain *d,
1558 void *hypercall_page)
1559 {
1560 hvm_latch_shinfo_size(d);
1561 hvm_funcs.init_hypercall_page(d, hypercall_page);
1562 }
1565 /*
1566 * only called in HVM domain BSP context
1567 * when booting, vcpuid is always equal to apic_id
1568 */
1569 int hvm_bringup_ap(int vcpuid, int trampoline_vector)
1570 {
1571 struct vcpu *v;
1572 struct domain *d = current->domain;
1573 struct vcpu_guest_context *ctxt;
1574 int rc = 0;
1576 BUG_ON(!is_hvm_domain(d));
1578 if ( (v = d->vcpu[vcpuid]) == NULL )
1579 return -ENOENT;
1581 if ( (ctxt = xmalloc(struct vcpu_guest_context)) == NULL )
1582 {
1583 gdprintk(XENLOG_ERR,
1584 "Failed to allocate memory in hvm_bringup_ap.\n");
1585 return -ENOMEM;
1586 }
1588 hvm_init_ap_context(ctxt, vcpuid, trampoline_vector);
1590 /* Sync AP's TSC with BSP's. */
1591 v->arch.hvm_vcpu.cache_tsc_offset =
1592 v->domain->vcpu[0]->arch.hvm_vcpu.cache_tsc_offset;
1593 hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset);
1595 LOCK_BIGLOCK(d);
1596 rc = -EEXIST;
1597 if ( !v->is_initialised )
1598 rc = boot_vcpu(d, vcpuid, ctxt);
1599 UNLOCK_BIGLOCK(d);
1601 if ( rc != 0 )
1602 {
1603 gdprintk(XENLOG_ERR,
1604 "AP %d bringup failed in boot_vcpu %x.\n", vcpuid, rc);
1605 goto out;
1606 }
1608 if ( test_and_clear_bit(_VPF_down, &v->pause_flags) )
1609 vcpu_wake(v);
1610 gdprintk(XENLOG_INFO, "AP %d bringup succeeded.\n", vcpuid);
1612 out:
1613 xfree(ctxt);
1614 return rc;
1615 }
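/*
 * The hvmop_*() handlers below back the HVMOP_* hypercalls dispatched from
 * do_hvm_op().  Most are restricted to privileged (device model/toolstack)
 * callers via IS_PRIV() and the XSM hooks and operate on the target domain
 * under an RCU lock; hvmop_flush_tlb_all() instead acts on the calling
 * domain itself.
 */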
1617 static int hvmop_set_pci_intx_level(
1618 XEN_GUEST_HANDLE(xen_hvm_set_pci_intx_level_t) uop)
1619 {
1620 struct xen_hvm_set_pci_intx_level op;
1621 struct domain *d;
1622 int rc;
1624 if ( copy_from_guest(&op, uop, 1) )
1625 return -EFAULT;
1627 if ( !IS_PRIV(current->domain) )
1628 return -EPERM;
1630 if ( (op.domain > 0) || (op.bus > 0) || (op.device > 31) || (op.intx > 3) )
1631 return -EINVAL;
1633 d = rcu_lock_domain_by_id(op.domid);
1634 if ( d == NULL )
1635 return -ESRCH;
1637 rc = -EINVAL;
1638 if ( !is_hvm_domain(d) )
1639 goto out;
1641 rc = xsm_hvm_set_pci_intx_level(d);
1642 if ( rc )
1643 goto out;
1645 rc = 0;
1646 switch ( op.level )
1647 {
1648 case 0:
1649 hvm_pci_intx_deassert(d, op.device, op.intx);
1650 break;
1651 case 1:
1652 hvm_pci_intx_assert(d, op.device, op.intx);
1653 break;
1654 default:
1655 rc = -EINVAL;
1656 break;
1657 }
1659 out:
1660 rcu_unlock_domain(d);
1661 return rc;
1662 }
1664 static int hvmop_set_isa_irq_level(
1665 XEN_GUEST_HANDLE(xen_hvm_set_isa_irq_level_t) uop)
1666 {
1667 struct xen_hvm_set_isa_irq_level op;
1668 struct domain *d;
1669 int rc;
1671 if ( copy_from_guest(&op, uop, 1) )
1672 return -EFAULT;
1674 if ( !IS_PRIV(current->domain) )
1675 return -EPERM;
1677 if ( op.isa_irq > 15 )
1678 return -EINVAL;
1680 d = rcu_lock_domain_by_id(op.domid);
1681 if ( d == NULL )
1682 return -ESRCH;
1684 rc = -EINVAL;
1685 if ( !is_hvm_domain(d) )
1686 goto out;
1688 rc = xsm_hvm_set_isa_irq_level(d);
1689 if ( rc )
1690 goto out;
1692 rc = 0;
1693 switch ( op.level )
1694 {
1695 case 0:
1696 hvm_isa_irq_deassert(d, op.isa_irq);
1697 break;
1698 case 1:
1699 hvm_isa_irq_assert(d, op.isa_irq);
1700 break;
1701 default:
1702 rc = -EINVAL;
1703 break;
1704 }
1706 out:
1707 rcu_unlock_domain(d);
1708 return rc;
1709 }
1711 static int hvmop_set_pci_link_route(
1712 XEN_GUEST_HANDLE(xen_hvm_set_pci_link_route_t) uop)
1713 {
1714 struct xen_hvm_set_pci_link_route op;
1715 struct domain *d;
1716 int rc;
1718 if ( copy_from_guest(&op, uop, 1) )
1719 return -EFAULT;
1721 if ( !IS_PRIV(current->domain) )
1722 return -EPERM;
1724 if ( (op.link > 3) || (op.isa_irq > 15) )
1725 return -EINVAL;
1727 d = rcu_lock_domain_by_id(op.domid);
1728 if ( d == NULL )
1729 return -ESRCH;
1731 rc = -EINVAL;
1732 if ( !is_hvm_domain(d) )
1733 goto out;
1735 rc = xsm_hvm_set_pci_link_route(d);
1736 if ( rc )
1737 goto out;
1739 rc = 0;
1740 hvm_set_pci_link_route(d, op.link, op.isa_irq);
1742 out:
1743 rcu_unlock_domain(d);
1744 return rc;
1745 }
1747 static int hvmop_flush_tlb_all(void)
1748 {
1749 struct domain *d = current->domain;
1750 struct vcpu *v;
1752 /* Avoid deadlock if more than one vcpu tries this at the same time. */
1753 if ( !spin_trylock(&d->hypercall_deadlock_mutex) )
1754 return -EAGAIN;
1756 /* Pause all other vcpus. */
1757 for_each_vcpu ( d, v )
1758 if ( v != current )
1759 vcpu_pause_nosync(v);
1761 /* Now that all VCPUs are signalled to deschedule, we wait... */
1762 for_each_vcpu ( d, v )
1763 if ( v != current )
1764 while ( !vcpu_runnable(v) && v->is_running )
1765 cpu_relax();
1767 /* All other vcpus are paused, safe to unlock now. */
1768 spin_unlock(&d->hypercall_deadlock_mutex);
1770 /* Flush paging-mode soft state (e.g., va->gfn cache; PAE PDPE cache). */
1771 for_each_vcpu ( d, v )
1772 paging_update_cr3(v);
1774 /* Flush all dirty TLBs. */
1775 flush_tlb_mask(d->domain_dirty_cpumask);
1777 /* Done. */
1778 for_each_vcpu ( d, v )
1779 if ( v != current )
1780 vcpu_unpause(v);
1782 return 0;
1783 }
1785 long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE(void) arg)
1787 {
1788 long rc = 0;
1790 switch ( op )
1791 {
1792 case HVMOP_set_param:
1793 case HVMOP_get_param:
1794 {
1795 struct xen_hvm_param a;
1796 struct hvm_ioreq_page *iorp;
1797 struct domain *d;
1798 struct vcpu *v;
1800 if ( copy_from_guest(&a, arg, 1) )
1801 return -EFAULT;
1803 if ( a.index >= HVM_NR_PARAMS )
1804 return -EINVAL;
1806 if ( a.domid == DOMID_SELF )
1807 d = rcu_lock_current_domain();
1808 else if ( IS_PRIV(current->domain) )
1809 d = rcu_lock_domain_by_id(a.domid);
1810 else
1811 return -EPERM;
1813 if ( d == NULL )
1814 return -ESRCH;
1816 rc = -EINVAL;
1817 if ( !is_hvm_domain(d) )
1818 goto param_fail;
1820 rc = xsm_hvm_param(d, op);
1821 if ( rc )
1822 goto param_fail;
1824 if ( op == HVMOP_set_param )
1825 {
1826 switch ( a.index )
1827 {
1828 case HVM_PARAM_IOREQ_PFN:
1829 iorp = &d->arch.hvm_domain.ioreq;
1830 rc = hvm_set_ioreq_page(d, iorp, a.value);
1831 spin_lock(&iorp->lock);
1832 if ( (rc == 0) && (iorp->va != NULL) )
1833 /* Initialise evtchn port info if VCPUs already created. */
1834 for_each_vcpu ( d, v )
1835 get_ioreq(v)->vp_eport = v->arch.hvm_vcpu.xen_port;
1836 spin_unlock(&iorp->lock);
1837 break;
1838 case HVM_PARAM_BUFIOREQ_PFN:
1839 iorp = &d->arch.hvm_domain.buf_ioreq;
1840 rc = hvm_set_ioreq_page(d, iorp, a.value);
1841 break;
1842 case HVM_PARAM_CALLBACK_IRQ:
1843 hvm_set_callback_via(d, a.value);
1844 hvm_latch_shinfo_size(d);
1845 break;
1846 }
1847 d->arch.hvm_domain.params[a.index] = a.value;
1848 rc = 0;
1849 }
1850 else
1851 {
1852 a.value = d->arch.hvm_domain.params[a.index];
1853 rc = copy_to_guest(arg, &a, 1) ? -EFAULT : 0;
1854 }
1856 HVM_DBG_LOG(DBG_LEVEL_HCALL, "%s param %u = %"PRIx64,
1857 op == HVMOP_set_param ? "set" : "get",
1858 a.index, a.value);
1860 param_fail:
1861 rcu_unlock_domain(d);
1862 break;
1863 }
1865 case HVMOP_set_pci_intx_level:
1866 rc = hvmop_set_pci_intx_level(
1867 guest_handle_cast(arg, xen_hvm_set_pci_intx_level_t));
1868 break;
1870 case HVMOP_set_isa_irq_level:
1871 rc = hvmop_set_isa_irq_level(
1872 guest_handle_cast(arg, xen_hvm_set_isa_irq_level_t));
1873 break;
1875 case HVMOP_set_pci_link_route:
1876 rc = hvmop_set_pci_link_route(
1877 guest_handle_cast(arg, xen_hvm_set_pci_link_route_t));
1878 break;
1880 case HVMOP_flush_tlbs:
1881 rc = guest_handle_is_null(arg) ? hvmop_flush_tlb_all() : -ENOSYS;
1882 break;
1884 default:
1885 {
1886 gdprintk(XENLOG_WARNING, "Bad HVM op %ld.\n", op);
1887 rc = -ENOSYS;
1888 break;
1889 }
1890 }
1892 if ( rc == -EAGAIN )
1893 rc = hypercall_create_continuation(
1894 __HYPERVISOR_hvm_op, "lh", op, arg);
1896 return rc;
1897 }
1899 /*
1900 * Local variables:
1901 * mode: C
1902 * c-set-style: "BSD"
1903 * c-basic-offset: 4
1904 * tab-width: 4
1905 * indent-tabs-mode: nil
1906 * End:
1907 */