xen/arch/x86/hvm/hvm.c @ changeset 16121:ac37f61f6908 (ia64/xen-unstable)

svm: allow guest to use EFER.FFXSE

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Keir Fraser <keir@xensource.com>

Author:   Keir Fraser <keir@xensource.com>
Date:     Fri Oct 12 11:00:26 2007 +0100
Parent:   960a05895a4a
Children: 723b9837db1b

/*
 * hvm.c: Common hardware virtual machine abstractions.
 *
 * Copyright (c) 2004, Intel Corporation.
 * Copyright (c) 2005, International Business Machines Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 * Place - Suite 330, Boston, MA 02111-1307 USA.
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/trace.h>
#include <xen/sched.h>
#include <xen/irq.h>
#include <xen/softirq.h>
#include <xen/domain.h>
#include <xen/domain_page.h>
#include <xen/hypercall.h>
#include <xen/guest_access.h>
#include <xen/event.h>
#include <asm/current.h>
#include <asm/e820.h>
#include <asm/io.h>
#include <asm/paging.h>
#include <asm/regs.h>
#include <asm/cpufeature.h>
#include <asm/processor.h>
#include <asm/types.h>
#include <asm/msr.h>
#include <asm/mc146818rtc.h>
#include <asm/spinlock.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/vpt.h>
#include <asm/hvm/support.h>
#include <public/sched.h>
#include <public/hvm/ioreq.h>
#include <public/version.h>
#include <public/memory.h>

/* Xen command-line option to disable hardware-assisted paging */
static int opt_hap_disabled;
invbool_param("hap", opt_hap_disabled);

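/*
 * "hap" is registered as an inverted boolean option: booting Xen with
 * "hap=0" (or another boolean-off spelling accepted by the command-line
 * parser) sets opt_hap_disabled and turns hardware-assisted paging off in
 * hvm_enable() even when the processor supports it.
 */
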
int hvm_enabled __read_mostly;

unsigned int opt_hvm_debug_level __read_mostly;
integer_param("hvm_debug", opt_hvm_debug_level);

struct hvm_function_table hvm_funcs __read_mostly;

/* I/O permission bitmap is globally shared by all HVM guests. */
char __attribute__ ((__section__ (".bss.page_aligned")))
    hvm_io_bitmap[3*PAGE_SIZE];

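/*
 * Three pages are reserved above so that the single shared bitmap is
 * large enough for either vendor's format: SVM's 12kB IOPM as well as
 * VMX's pair of 4kB I/O bitmaps.
 */
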
void hvm_enable(struct hvm_function_table *fns)
{
    BUG_ON(hvm_enabled);
    printk("HVM: %s enabled\n", fns->name);

    /*
     * Allow direct access to the PC debug port (it is often used for I/O
     * delays, but the vmexits simply slow things down).
     */
    memset(hvm_io_bitmap, ~0, sizeof(hvm_io_bitmap));
    clear_bit(0x80, hvm_io_bitmap);

    hvm_funcs = *fns;
    hvm_enabled = 1;

    if ( hvm_funcs.hap_supported )
    {
        if ( opt_hap_disabled )
            hvm_funcs.hap_supported = 0;
        printk("HVM: Hardware Assisted Paging %sabled\n",
               hvm_funcs.hap_supported ? "en" : "dis");
    }
}

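/*
 * Guest time is modelled as the host TSC plus a per-vcpu offset
 * (cache_tsc_offset).  hvm_set_guest_time() recomputes the offset so the
 * guest's clock reads 'gtime' now; hvm_get_guest_time() applies it.
 */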
void hvm_set_guest_time(struct vcpu *v, u64 gtime)
{
    u64 host_tsc;

    rdtscll(host_tsc);

    v->arch.hvm_vcpu.cache_tsc_offset = gtime - host_tsc;
    hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset);
}

u64 hvm_get_guest_time(struct vcpu *v)
{
    u64 host_tsc;

    rdtscll(host_tsc);
    return host_tsc + v->arch.hvm_vcpu.cache_tsc_offset;
}

void hvm_migrate_timers(struct vcpu *v)
{
    rtc_migrate_timers(v);
    hpet_migrate_timers(v);
    pt_migrate(v);
}

void hvm_do_resume(struct vcpu *v)
{
    ioreq_t *p;

    if ( !v->fpu_dirtied )
        hvm_funcs.stts(v);

    pt_thaw_time(v);

    /* NB. Optimised for common case (p->state == STATE_IOREQ_NONE). */
    p = &get_ioreq(v)->vp_ioreq;
    while ( p->state != STATE_IOREQ_NONE )
    {
        switch ( p->state )
        {
        case STATE_IORESP_READY: /* IORESP_READY -> NONE */
            hvm_io_assist();
            break;
        case STATE_IOREQ_READY:  /* IOREQ_{READY,INPROCESS} -> IORESP_READY */
        case STATE_IOREQ_INPROCESS:
            wait_on_xen_event_channel(v->arch.hvm_vcpu.xen_port,
                                      (p->state != STATE_IOREQ_READY) &&
                                      (p->state != STATE_IOREQ_INPROCESS));
            break;
        default:
            gdprintk(XENLOG_ERR, "Weird HVM iorequest state %d.\n", p->state);
            domain_crash_synchronous();
        }
    }
}

static void hvm_init_ioreq_page(
    struct domain *d, struct hvm_ioreq_page *iorp)
{
    memset(iorp, 0, sizeof(*iorp));
    spin_lock_init(&iorp->lock);
    domain_pause(d);
}

static void hvm_destroy_ioreq_page(
    struct domain *d, struct hvm_ioreq_page *iorp)
{
    spin_lock(&iorp->lock);

    ASSERT(d->is_dying);

    if ( iorp->va != NULL )
    {
        unmap_domain_page_global(iorp->va);
        put_page_and_type(iorp->page);
        iorp->va = NULL;
    }

    spin_unlock(&iorp->lock);
}

static int hvm_set_ioreq_page(
    struct domain *d, struct hvm_ioreq_page *iorp, unsigned long gmfn)
{
    struct page_info *page;
    p2m_type_t p2mt;
    unsigned long mfn;
    void *va;

    mfn = mfn_x(gfn_to_mfn(d, gmfn, &p2mt));
    if ( !p2m_is_ram(p2mt) )
        return -EINVAL;
    ASSERT(mfn_valid(mfn));

    page = mfn_to_page(mfn);
    if ( !get_page_and_type(page, d, PGT_writable_page) )
        return -EINVAL;

    va = map_domain_page_global(mfn);
    if ( va == NULL )
    {
        put_page_and_type(page);
        return -ENOMEM;
    }

    spin_lock(&iorp->lock);

    if ( (iorp->va != NULL) || d->is_dying )
    {
        spin_unlock(&iorp->lock);
        unmap_domain_page_global(va);
        put_page_and_type(mfn_to_page(mfn));
        return -EINVAL;
    }

    iorp->va = va;
    iorp->page = page;

    spin_unlock(&iorp->lock);

    domain_unpause(d);

    return 0;
}

int hvm_domain_initialise(struct domain *d)
{
    int rc;

    if ( !hvm_enabled )
    {
        gdprintk(XENLOG_WARNING, "Attempt to create a HVM guest "
                 "on a non-VT/AMDV platform.\n");
        return -EINVAL;
    }

    spin_lock_init(&d->arch.hvm_domain.pbuf_lock);
    spin_lock_init(&d->arch.hvm_domain.irq_lock);

    rc = paging_enable(d, PG_refcounts|PG_translate|PG_external);
    if ( rc != 0 )
        return rc;

    vpic_init(d);

    rc = vioapic_init(d);
    if ( rc != 0 )
        return rc;

    hvm_init_ioreq_page(d, &d->arch.hvm_domain.ioreq);
    hvm_init_ioreq_page(d, &d->arch.hvm_domain.buf_ioreq);

    return hvm_funcs.domain_initialise(d);
}

void hvm_domain_relinquish_resources(struct domain *d)
{
    hvm_destroy_ioreq_page(d, &d->arch.hvm_domain.ioreq);
    hvm_destroy_ioreq_page(d, &d->arch.hvm_domain.buf_ioreq);

    pit_deinit(d);
    rtc_deinit(d);
    pmtimer_deinit(d);
    hpet_deinit(d);
}

void hvm_domain_destroy(struct domain *d)
{
    hvm_funcs.domain_destroy(d);
    vioapic_deinit(d);
}

static int hvm_save_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
{
    struct vcpu *v;
    struct hvm_hw_cpu ctxt;
    struct vcpu_guest_context *vc;

    for_each_vcpu(d, v)
    {
        /* We don't need to save state for a vcpu that is down; the restore
         * code will leave it down if there is nothing saved. */
        if ( test_bit(_VPF_down, &v->pause_flags) )
            continue;

        /* Architecture-specific vmcs/vmcb bits */
        hvm_funcs.save_cpu_ctxt(v, &ctxt);

        /* Other vcpu register state */
        vc = &v->arch.guest_context;
        if ( v->fpu_initialised )
            memcpy(ctxt.fpu_regs, &vc->fpu_ctxt, sizeof(ctxt.fpu_regs));
        else
            memset(ctxt.fpu_regs, 0, sizeof(ctxt.fpu_regs));
        ctxt.rax = vc->user_regs.eax;
        ctxt.rbx = vc->user_regs.ebx;
        ctxt.rcx = vc->user_regs.ecx;
        ctxt.rdx = vc->user_regs.edx;
        ctxt.rbp = vc->user_regs.ebp;
        ctxt.rsi = vc->user_regs.esi;
        ctxt.rdi = vc->user_regs.edi;
        ctxt.rsp = vc->user_regs.esp;
        ctxt.rip = vc->user_regs.eip;
        ctxt.rflags = vc->user_regs.eflags;
#ifdef __x86_64__
        ctxt.r8  = vc->user_regs.r8;
        ctxt.r9  = vc->user_regs.r9;
        ctxt.r10 = vc->user_regs.r10;
        ctxt.r11 = vc->user_regs.r11;
        ctxt.r12 = vc->user_regs.r12;
        ctxt.r13 = vc->user_regs.r13;
        ctxt.r14 = vc->user_regs.r14;
        ctxt.r15 = vc->user_regs.r15;
#endif
        ctxt.dr0 = vc->debugreg[0];
        ctxt.dr1 = vc->debugreg[1];
        ctxt.dr2 = vc->debugreg[2];
        ctxt.dr3 = vc->debugreg[3];
        ctxt.dr6 = vc->debugreg[6];
        ctxt.dr7 = vc->debugreg[7];

        if ( hvm_save_entry(CPU, v->vcpu_id, h, &ctxt) != 0 )
            return 1;
    }
    return 0;
}

static int hvm_load_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
{
    int vcpuid, rc;
    struct vcpu *v;
    struct hvm_hw_cpu ctxt;
    struct vcpu_guest_context *vc;

    /* Which vcpu is this? */
    vcpuid = hvm_load_instance(h);
    if ( vcpuid > MAX_VIRT_CPUS || (v = d->vcpu[vcpuid]) == NULL )
    {
        gdprintk(XENLOG_ERR, "HVM restore: domain has no vcpu %u\n", vcpuid);
        return -EINVAL;
    }
    vc = &v->arch.guest_context;

    /* Need to init this vcpu before loading its contents */
    rc = 0;
    LOCK_BIGLOCK(d);
    if ( !v->is_initialised )
        rc = boot_vcpu(d, vcpuid, vc);
    UNLOCK_BIGLOCK(d);
    if ( rc != 0 )
        return rc; /* Drop the big lock before bailing out. */

    if ( hvm_load_entry(CPU, h, &ctxt) != 0 )
        return -EINVAL;

    /* Sanity check some control registers. */
    if ( (ctxt.cr0 & HVM_CR0_GUEST_RESERVED_BITS) ||
         !(ctxt.cr0 & X86_CR0_ET) ||
         ((ctxt.cr0 & (X86_CR0_PE|X86_CR0_PG)) == X86_CR0_PG) )
    {
        gdprintk(XENLOG_ERR, "HVM restore: bad CR0 0x%"PRIx64"\n",
                 ctxt.cr0);
        return -EINVAL;
    }

    if ( ctxt.cr4 & HVM_CR4_GUEST_RESERVED_BITS )
    {
        gdprintk(XENLOG_ERR, "HVM restore: bad CR4 0x%"PRIx64"\n",
                 ctxt.cr4);
        return -EINVAL;
    }

    if ( (ctxt.msr_efer & ~(EFER_FFXSE | EFER_LME | EFER_LMA |
                            EFER_NX | EFER_SCE)) ||
         ((sizeof(long) != 8) && (ctxt.msr_efer & EFER_LME)) ||
         (!cpu_has_nx && (ctxt.msr_efer & EFER_NX)) ||
         (!cpu_has_syscall && (ctxt.msr_efer & EFER_SCE)) ||
         (!cpu_has_ffxsr && (ctxt.msr_efer & EFER_FFXSE)) ||
         ((ctxt.msr_efer & (EFER_LME|EFER_LMA)) == EFER_LMA) )
    {
        gdprintk(XENLOG_ERR, "HVM restore: bad EFER 0x%"PRIx64"\n",
                 ctxt.msr_efer);
        return -EINVAL;
    }

    /* Architecture-specific vmcs/vmcb bits */
    if ( hvm_funcs.load_cpu_ctxt(v, &ctxt) < 0 )
        return -EINVAL;

    /* Other vcpu register state */
    memcpy(&vc->fpu_ctxt, ctxt.fpu_regs, sizeof(ctxt.fpu_regs));
    vc->user_regs.eax = ctxt.rax;
    vc->user_regs.ebx = ctxt.rbx;
    vc->user_regs.ecx = ctxt.rcx;
    vc->user_regs.edx = ctxt.rdx;
    vc->user_regs.ebp = ctxt.rbp;
    vc->user_regs.esi = ctxt.rsi;
    vc->user_regs.edi = ctxt.rdi;
    vc->user_regs.esp = ctxt.rsp;
    vc->user_regs.eip = ctxt.rip;
    vc->user_regs.eflags = ctxt.rflags | 2;
#ifdef __x86_64__
    vc->user_regs.r8  = ctxt.r8;
    vc->user_regs.r9  = ctxt.r9;
    vc->user_regs.r10 = ctxt.r10;
    vc->user_regs.r11 = ctxt.r11;
    vc->user_regs.r12 = ctxt.r12;
    vc->user_regs.r13 = ctxt.r13;
    vc->user_regs.r14 = ctxt.r14;
    vc->user_regs.r15 = ctxt.r15;
#endif
    vc->debugreg[0] = ctxt.dr0;
    vc->debugreg[1] = ctxt.dr1;
    vc->debugreg[2] = ctxt.dr2;
    vc->debugreg[3] = ctxt.dr3;
    vc->debugreg[6] = ctxt.dr6;
    vc->debugreg[7] = ctxt.dr7;

    vc->flags = VGCF_online;
    v->fpu_initialised = 1;

    /* Auxiliary processors should be woken immediately. */
    if ( test_and_clear_bit(_VPF_down, &v->pause_flags) )
        vcpu_wake(v);

    return 0;
}

HVM_REGISTER_SAVE_RESTORE(CPU, hvm_save_cpu_ctxt, hvm_load_cpu_ctxt,
                          1, HVMSR_PER_VCPU);

int hvm_vcpu_initialise(struct vcpu *v)
{
    int rc;

    if ( (rc = vlapic_init(v)) != 0 )
        return rc;

    if ( (rc = hvm_funcs.vcpu_initialise(v)) != 0 )
    {
        vlapic_destroy(v);
        return rc;
    }

    /* Create ioreq event channel. */
    rc = alloc_unbound_xen_event_channel(v, 0);
    if ( rc < 0 )
    {
        hvm_funcs.vcpu_destroy(v);
        vlapic_destroy(v);
        return rc;
    }

    /* Register ioreq event channel. */
    v->arch.hvm_vcpu.xen_port = rc;
    spin_lock(&v->domain->arch.hvm_domain.ioreq.lock);
    if ( v->domain->arch.hvm_domain.ioreq.va != NULL )
        get_ioreq(v)->vp_eport = v->arch.hvm_vcpu.xen_port;
    spin_unlock(&v->domain->arch.hvm_domain.ioreq.lock);

    spin_lock_init(&v->arch.hvm_vcpu.tm_lock);
    INIT_LIST_HEAD(&v->arch.hvm_vcpu.tm_list);

    v->arch.guest_context.user_regs.eflags = 2;

    if ( v->vcpu_id == 0 )
    {
        /* NB. All these really belong in hvm_domain_initialise(). */
        pit_init(v, cpu_khz);
        rtc_init(v, RTC_PORT(0));
        pmtimer_init(v);
        hpet_init(v);

        /* Init guest TSC to start from zero. */
        hvm_set_guest_time(v, 0);

        /* Can start up without SIPI-SIPI or setvcpucontext domctl. */
        v->is_initialised = 1;
        clear_bit(_VPF_down, &v->pause_flags);
    }

    return 0;
}

void hvm_vcpu_destroy(struct vcpu *v)
{
    vlapic_destroy(v);
    hvm_funcs.vcpu_destroy(v);

    /* Event channel is already freed by evtchn_destroy(). */
    /*free_xen_event_channel(v, v->arch.hvm_vcpu.xen_port);*/
}

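/*
 * Return a VCPU to its post-INIT state: reset the virtual local APIC,
 * re-run the vendor-specific initialisation, and mark the VCPU offline
 * until it is brought up again (e.g. by a SIPI from the BSP).
 */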
void hvm_vcpu_reset(struct vcpu *v)
{
    vcpu_pause(v);

    vlapic_reset(vcpu_vlapic(v));

    hvm_funcs.vcpu_initialise(v);

    set_bit(_VPF_down, &v->pause_flags);
    clear_bit(_VPF_blocked, &v->pause_flags);
    v->fpu_initialised = 0;
    v->fpu_dirtied = 0;
    v->is_initialised = 0;

    vcpu_unpause(v);
}

static void hvm_vcpu_down(void)
{
    struct vcpu *v = current;
    struct domain *d = v->domain;
    int online_count = 0;

    gdprintk(XENLOG_INFO, "DOM%d/VCPU%d: going offline.\n",
             d->domain_id, v->vcpu_id);

    /* Doesn't halt us immediately, but we'll never return to guest context. */
    set_bit(_VPF_down, &v->pause_flags);
    vcpu_sleep_nosync(v);

    /* Any other VCPUs online? ... */
    LOCK_BIGLOCK(d);
    for_each_vcpu ( d, v )
        if ( !test_bit(_VPF_down, &v->pause_flags) )
            online_count++;
    UNLOCK_BIGLOCK(d);

    /* ... Shut down the domain if not. */
    if ( online_count == 0 )
    {
        gdprintk(XENLOG_INFO, "DOM%d: all CPUs offline -- powering off.\n",
                 d->domain_id);
        domain_shutdown(d, SHUTDOWN_poweroff);
    }
}

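/*
 * Hand the pending I/O request to the device model: mark the ioreq READY
 * and kick the per-vcpu event channel.  The vcpu will then wait in
 * hvm_do_resume() for the response before re-entering the guest.
 */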
void hvm_send_assist_req(struct vcpu *v)
{
    ioreq_t *p;

    if ( unlikely(!vcpu_start_shutdown_deferral(v)) )
        return; /* implicitly bins the i/o operation */

    p = &get_ioreq(v)->vp_ioreq;
    if ( unlikely(p->state != STATE_IOREQ_NONE) )
    {
        /* This indicates a bug in the device model. Crash the domain. */
        gdprintk(XENLOG_ERR, "Device model set bad IO state %d.\n", p->state);
        domain_crash_synchronous();
    }

    prepare_wait_on_xen_event_channel(v->arch.hvm_vcpu.xen_port);

    /*
     * Following happens /after/ blocking and setting up ioreq contents.
     * prepare_wait_on_xen_event_channel() is an implicit barrier.
     */
    p->state = STATE_IOREQ_READY;
    notify_via_xen_event_channel(v->arch.hvm_vcpu.xen_port);
}

void hvm_hlt(unsigned long rflags)
{
    /*
     * If we halt with interrupts disabled, that's a pretty sure sign that we
     * want to shut down. In a real processor, NMIs are the only way to break
     * out of this.
     */
    if ( unlikely(!(rflags & X86_EFLAGS_IF)) )
        return hvm_vcpu_down();

    do_sched_op_compat(SCHEDOP_block, 0);
}

void hvm_triple_fault(void)
{
    struct vcpu *v = current;
    gdprintk(XENLOG_INFO, "Triple fault on VCPU%d - "
             "invoking HVM system reset.\n", v->vcpu_id);
    domain_shutdown(v->domain, SHUTDOWN_reboot);
}

int hvm_set_efer(uint64_t value)
{
    struct vcpu *v = current;

    value &= ~EFER_LMA;

    if ( (value & ~(EFER_FFXSE | EFER_LME | EFER_NX | EFER_SCE)) ||
         ((sizeof(long) != 8) && (value & EFER_LME)) ||
         (!cpu_has_nx && (value & EFER_NX)) ||
         (!cpu_has_syscall && (value & EFER_SCE)) ||
         (!cpu_has_ffxsr && (value & EFER_FFXSE)) )
    {
        gdprintk(XENLOG_WARNING, "Trying to set reserved bit in "
                 "EFER: %"PRIx64"\n", value);
        hvm_inject_exception(TRAP_gp_fault, 0, 0);
        return 0;
    }

    if ( ((value ^ v->arch.hvm_vcpu.guest_efer) & EFER_LME) &&
         hvm_paging_enabled(v) )
    {
        gdprintk(XENLOG_WARNING,
                 "Trying to change EFER.LME with paging enabled\n");
        hvm_inject_exception(TRAP_gp_fault, 0, 0);
        return 0;
    }

    value |= v->arch.hvm_vcpu.guest_efer & EFER_LMA;
    v->arch.hvm_vcpu.guest_efer = value;
    hvm_update_guest_efer(v);

    return 1;
}

int hvm_set_cr0(unsigned long value)
{
    struct vcpu *v = current;
    p2m_type_t p2mt;
    unsigned long gfn, mfn, old_value = v->arch.hvm_vcpu.guest_cr[0];

    HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx", value);

    if ( (u32)value != value )
    {
        HVM_DBG_LOG(DBG_LEVEL_1,
                    "Guest attempts to set upper 32 bits in CR0: %lx",
                    value);
        hvm_inject_exception(TRAP_gp_fault, 0, 0);
        return 0;
    }

    value &= ~HVM_CR0_GUEST_RESERVED_BITS;

    /* ET is reserved and should always be 1. */
    value |= X86_CR0_ET;

    if ( (value & (X86_CR0_PE|X86_CR0_PG)) == X86_CR0_PG )
    {
        hvm_inject_exception(TRAP_gp_fault, 0, 0);
        return 0;
    }

    if ( (value & X86_CR0_PG) && !(old_value & X86_CR0_PG) )
    {
        if ( v->arch.hvm_vcpu.guest_efer & EFER_LME )
        {
            if ( !(v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PAE) )
            {
                HVM_DBG_LOG(DBG_LEVEL_1, "Enable paging before PAE enable");
                hvm_inject_exception(TRAP_gp_fault, 0, 0);
                return 0;
            }
            HVM_DBG_LOG(DBG_LEVEL_1, "Enabling long mode");
            v->arch.hvm_vcpu.guest_efer |= EFER_LMA;
            hvm_update_guest_efer(v);
        }

        if ( !paging_mode_hap(v->domain) )
        {
            /* The guest CR3 must be pointing to the guest physical. */
            gfn = v->arch.hvm_vcpu.guest_cr[3]>>PAGE_SHIFT;
            mfn = mfn_x(gfn_to_mfn_current(gfn, &p2mt));
            if ( !p2m_is_ram(p2mt) || !mfn_valid(mfn) ||
                 !get_page(mfn_to_page(mfn), v->domain))
            {
                gdprintk(XENLOG_ERR, "Invalid CR3 value = %lx (mfn=%lx)\n",
                         v->arch.hvm_vcpu.guest_cr[3], mfn);
                domain_crash(v->domain);
                return 0;
            }

            /* Now arch.guest_table points to machine physical. */
            v->arch.guest_table = pagetable_from_pfn(mfn);

            HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
                        v->arch.hvm_vcpu.guest_cr[3], mfn);
        }
    }
    else if ( !(value & X86_CR0_PG) && (old_value & X86_CR0_PG) )
    {
        /* When CR0.PG is cleared, LMA is cleared immediately. */
        if ( hvm_long_mode_enabled(v) )
        {
            v->arch.hvm_vcpu.guest_efer &= ~EFER_LMA;
            hvm_update_guest_efer(v);
        }

        if ( !paging_mode_hap(v->domain) )
        {
            put_page(pagetable_get_page(v->arch.guest_table));
            v->arch.guest_table = pagetable_null();
        }
    }

    v->arch.hvm_vcpu.guest_cr[0] = value;
    hvm_update_guest_cr(v, 0);

    if ( (value ^ old_value) & X86_CR0_PG )
        paging_update_paging_modes(v);

    return 1;
}

int hvm_set_cr3(unsigned long value)
{
    unsigned long mfn;
    p2m_type_t p2mt;
    struct vcpu *v = current;

    if ( hvm_paging_enabled(v) && !paging_mode_hap(v->domain) &&
         (value != v->arch.hvm_vcpu.guest_cr[3]) )
    {
        /* Shadow-mode CR3 change. Check PDBR and update refcounts. */
        HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
        mfn = mfn_x(gfn_to_mfn_current(value >> PAGE_SHIFT, &p2mt));
        if ( !p2m_is_ram(p2mt) || !mfn_valid(mfn) ||
             !get_page(mfn_to_page(mfn), v->domain) )
            goto bad_cr3;

        put_page(pagetable_get_page(v->arch.guest_table));
        v->arch.guest_table = pagetable_from_pfn(mfn);

        HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
    }

    v->arch.hvm_vcpu.guest_cr[3] = value;
    paging_update_cr3(v);
    return 1;

 bad_cr3:
    gdprintk(XENLOG_ERR, "Invalid CR3\n");
    domain_crash(v->domain);
    return 0;
}

int hvm_set_cr4(unsigned long value)
{
    struct vcpu *v = current;
    unsigned long old_cr;

    if ( value & HVM_CR4_GUEST_RESERVED_BITS )
    {
        HVM_DBG_LOG(DBG_LEVEL_1,
                    "Guest attempts to set reserved bit in CR4: %lx",
                    value);
        goto gpf;
    }

    if ( !(value & X86_CR4_PAE) && hvm_long_mode_enabled(v) )
    {
        HVM_DBG_LOG(DBG_LEVEL_1, "Guest cleared CR4.PAE while "
                    "EFER.LMA is set");
        goto gpf;
    }

    old_cr = v->arch.hvm_vcpu.guest_cr[4];
    v->arch.hvm_vcpu.guest_cr[4] = value;
    hvm_update_guest_cr(v, 4);

    /* Modifying CR4.{PSE,PAE,PGE} invalidates all TLB entries, inc. Global. */
    if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
        paging_update_paging_modes(v);

    return 1;

 gpf:
    hvm_inject_exception(TRAP_gp_fault, 0, 0);
    return 0;
}

int hvm_virtual_to_linear_addr(
    enum x86_segment seg,
    struct segment_register *reg,
    unsigned long offset,
    unsigned int bytes,
    enum hvm_access_type access_type,
    unsigned int addr_size,
    unsigned long *linear_addr)
{
    unsigned long addr = offset;
    uint32_t last_byte;

    if ( addr_size != 64 )
    {
        /*
         * COMPATIBILITY MODE: Apply segment checks and add base.
         */

        switch ( access_type )
        {
        case hvm_access_read:
            if ( (reg->attr.fields.type & 0xa) == 0x8 )
                goto gpf; /* execute-only code segment */
            break;
        case hvm_access_write:
            if ( (reg->attr.fields.type & 0xa) != 0x2 )
                goto gpf; /* not a writable data segment */
            break;
        default:
            break;
        }

        last_byte = offset + bytes - 1;

        /* Is this a grows-down data segment? Special limit check if so. */
        if ( (reg->attr.fields.type & 0xc) == 0x4 )
        {
            /* Is upper limit 0xFFFF or 0xFFFFFFFF? */
            if ( !reg->attr.fields.db )
                last_byte = (uint16_t)last_byte;

            /* Check first byte and last byte against respective bounds. */
            if ( (offset <= reg->limit) || (last_byte < offset) )
                goto gpf;
        }
        else if ( (last_byte > reg->limit) || (last_byte < offset) )
            goto gpf; /* last byte is beyond limit or wraps 0xFFFFFFFF */

        /*
         * Hardware truncates to 32 bits in compatibility mode.
         * It does not truncate to 16 bits in 16-bit address-size mode.
         */
        addr = (uint32_t)(addr + reg->base);
    }
    else
    {
        /*
         * LONG MODE: FS and GS add segment base. Addresses must be canonical.
         */

        if ( (seg == x86_seg_fs) || (seg == x86_seg_gs) )
            addr += reg->base;

        if ( !is_canonical_address(addr) )
            goto gpf;
    }

    *linear_addr = addr;
    return 1;

 gpf:
    return 0;
}

static void *hvm_map(unsigned long va, int size)
{
    unsigned long gfn, mfn;
    p2m_type_t p2mt;

    if ( ((va & ~PAGE_MASK) + size) > PAGE_SIZE )
    {
        hvm_inject_exception(TRAP_page_fault, PFEC_write_access,
                             (va + PAGE_SIZE - 1) & PAGE_MASK);
        return NULL;
    }

    gfn = paging_gva_to_gfn(current, va);
    mfn = mfn_x(gfn_to_mfn_current(gfn, &p2mt));
    if ( !p2m_is_ram(p2mt) )
    {
        hvm_inject_exception(TRAP_page_fault, PFEC_write_access, va);
        return NULL;
    }

    ASSERT(mfn_valid(mfn));

    paging_mark_dirty(current->domain, mfn);

    return (char *)map_domain_page(mfn) + (va & ~PAGE_MASK);
}

static void hvm_unmap(void *p)
{
    if ( p )
        unmap_domain_page(p);
}

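/*
 * Load a segment register on behalf of the task-switch emulation below.
 * Performs the usual protected-mode descriptor checks (present bit,
 * segment type, DPL/RPL/CPL) and sets the Accessed bit, injecting #TS or
 * #NP on failure.  Returns 0 on success, non-zero if a fault was raised
 * or the descriptor could not be mapped.
 */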
static int hvm_load_segment_selector(
    struct vcpu *v, enum x86_segment seg, uint16_t sel)
{
    struct segment_register desctab, cs, segr;
    struct desc_struct *pdesc, desc;
    u8 dpl, rpl, cpl;
    int fault_type = TRAP_invalid_tss;

    /* NULL selector? */
    if ( (sel & 0xfffc) == 0 )
    {
        if ( (seg == x86_seg_cs) || (seg == x86_seg_ss) )
            goto fail;
        memset(&segr, 0, sizeof(segr));
        hvm_set_segment_register(v, seg, &segr);
        return 0;
    }

    /* LDT descriptor must be in the GDT. */
    if ( (seg == x86_seg_ldtr) && (sel & 4) )
        goto fail;

    hvm_get_segment_register(v, x86_seg_cs, &cs);
    hvm_get_segment_register(
        v, (sel & 4) ? x86_seg_ldtr : x86_seg_gdtr, &desctab);

    /* Check against descriptor table limit. */
    if ( ((sel & 0xfff8) + 7) > desctab.limit )
        goto fail;

    pdesc = hvm_map(desctab.base + (sel & 0xfff8), 8);
    if ( pdesc == NULL )
        goto hvm_map_fail;

    do {
        desc = *pdesc;

        /* Segment present in memory? */
        if ( !(desc.b & (1u<<15)) )
        {
            fault_type = TRAP_no_segment;
            goto unmap_and_fail;
        }

        /* LDT descriptor is a system segment. All others are code/data. */
        if ( (desc.b & (1u<<12)) == ((seg == x86_seg_ldtr) << 12) )
            goto unmap_and_fail;

        dpl = (desc.b >> 13) & 3;
        rpl = sel & 3;
        cpl = cs.sel & 3;

        switch ( seg )
        {
        case x86_seg_cs:
            /* Code segment? */
            if ( !(desc.b & (1u<<11)) )
                goto unmap_and_fail;
            /* Non-conforming segment: check DPL against RPL. */
            if ( ((desc.b & (6u<<9)) != 6) && (dpl != rpl) )
                goto unmap_and_fail;
            break;
        case x86_seg_ss:
            /* Writable data segment? */
            if ( (desc.b & (5u<<9)) != (1u<<9) )
                goto unmap_and_fail;
            if ( (dpl != cpl) || (dpl != rpl) )
                goto unmap_and_fail;
            break;
        case x86_seg_ldtr:
            /* LDT system segment? */
            if ( (desc.b & (15u<<8)) != (2u<<8) )
                goto unmap_and_fail;
            goto skip_accessed_flag;
        default:
            /* Readable code or data segment? */
            if ( (desc.b & (5u<<9)) == (4u<<9) )
                goto unmap_and_fail;
            /* Non-conforming segment: check DPL against RPL and CPL. */
            if ( ((desc.b & (6u<<9)) != 6) && ((dpl < cpl) || (dpl < rpl)) )
                goto unmap_and_fail;
            break;
        }
    } while ( !(desc.b & 0x100) && /* Ensure Accessed flag is set */
              (cmpxchg(&pdesc->b, desc.b, desc.b | 0x100) != desc.b) );

    /* Force the Accessed flag in our local copy. */
    desc.b |= 0x100;

 skip_accessed_flag:
    hvm_unmap(pdesc);

    segr.base = (((desc.b << 0) & 0xff000000u) |
                 ((desc.b << 16) & 0x00ff0000u) |
                 ((desc.a >> 16) & 0x0000ffffu));
    segr.attr.bytes = (((desc.b >> 8) & 0x00ffu) |
                       ((desc.b >> 12) & 0x0f00u));
    segr.limit = (desc.b & 0x000f0000u) | (desc.a & 0x0000ffffu);
    if ( segr.attr.fields.g )
        segr.limit = (segr.limit << 12) | 0xfffu;
    segr.sel = sel;
    hvm_set_segment_register(v, seg, &segr);

    return 0;

 unmap_and_fail:
    hvm_unmap(pdesc);
 fail:
    hvm_inject_exception(fault_type, sel & 0xfffc, 0);
 hvm_map_fail:
    return 1;
}

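/*
 * Emulate a (32-bit) hardware task switch: save the outgoing register
 * state into the previous TSS, load the new TSS, reload the segment
 * registers through hvm_load_segment_selector(), mark the new task busy
 * and set CR0.TS.  Any error code is pushed onto the new task's stack.
 */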
void hvm_task_switch(
    uint16_t tss_sel, enum hvm_task_switch_reason taskswitch_reason,
    int32_t errcode)
{
    struct vcpu *v = current;
    struct cpu_user_regs *regs = guest_cpu_user_regs();
    struct segment_register gdt, tr, prev_tr, segr;
    struct desc_struct *optss_desc = NULL, *nptss_desc = NULL, tss_desc;
    unsigned long eflags;
    int exn_raised;
    struct {
        u16 back_link,__blh;
        u32 esp0;
        u16 ss0, _0;
        u32 esp1;
        u16 ss1, _1;
        u32 esp2;
        u16 ss2, _2;
        u32 cr3, eip, eflags, eax, ecx, edx, ebx, esp, ebp, esi, edi;
        u16 es, _3, cs, _4, ss, _5, ds, _6, fs, _7, gs, _8, ldt, _9;
        u16 trace, iomap;
    } *ptss, tss;

    hvm_get_segment_register(v, x86_seg_gdtr, &gdt);
    hvm_get_segment_register(v, x86_seg_tr, &prev_tr);

    if ( ((tss_sel & 0xfff8) + 7) > gdt.limit )
    {
        hvm_inject_exception((taskswitch_reason == TSW_iret) ?
                             TRAP_invalid_tss : TRAP_gp_fault,
                             tss_sel & 0xfff8, 0);
        goto out;
    }

    optss_desc = hvm_map(gdt.base + (prev_tr.sel & 0xfff8), 8);
    if ( optss_desc == NULL )
        goto out;

    nptss_desc = hvm_map(gdt.base + (tss_sel & 0xfff8), 8);
    if ( nptss_desc == NULL )
        goto out;

    tss_desc = *nptss_desc;
    tr.sel = tss_sel;
    tr.base = (((tss_desc.b << 0) & 0xff000000u) |
               ((tss_desc.b << 16) & 0x00ff0000u) |
               ((tss_desc.a >> 16) & 0x0000ffffu));
    tr.attr.bytes = (((tss_desc.b >> 8) & 0x00ffu) |
                     ((tss_desc.b >> 12) & 0x0f00u));
    tr.limit = (tss_desc.b & 0x000f0000u) | (tss_desc.a & 0x0000ffffu);
    if ( tr.attr.fields.g )
        tr.limit = (tr.limit << 12) | 0xfffu;

    if ( !tr.attr.fields.p )
    {
        hvm_inject_exception(TRAP_no_segment, tss_sel & 0xfff8, 0);
        goto out;
    }

    if ( tr.attr.fields.type != ((taskswitch_reason == TSW_iret) ? 0xb : 0x9) )
    {
        hvm_inject_exception(
            (taskswitch_reason == TSW_iret) ? TRAP_invalid_tss : TRAP_gp_fault,
            tss_sel & 0xfff8, 0);
        goto out;
    }

    if ( !tr.attr.fields.g && (tr.limit < (sizeof(tss)-1)) )
    {
        hvm_inject_exception(TRAP_invalid_tss, tss_sel & 0xfff8, 0);
        goto out;
    }

    ptss = hvm_map(prev_tr.base, sizeof(tss));
    if ( ptss == NULL )
        goto out;

    eflags = regs->eflags;
    if ( taskswitch_reason == TSW_iret )
        eflags &= ~X86_EFLAGS_NT;

    ptss->cr3 = v->arch.hvm_vcpu.guest_cr[3];
    ptss->eip = regs->eip;
    ptss->eflags = eflags;
    ptss->eax = regs->eax;
    ptss->ecx = regs->ecx;
    ptss->edx = regs->edx;
    ptss->ebx = regs->ebx;
    ptss->esp = regs->esp;
    ptss->ebp = regs->ebp;
    ptss->esi = regs->esi;
    ptss->edi = regs->edi;

    hvm_get_segment_register(v, x86_seg_es, &segr);
    ptss->es = segr.sel;
    hvm_get_segment_register(v, x86_seg_cs, &segr);
    ptss->cs = segr.sel;
    hvm_get_segment_register(v, x86_seg_ss, &segr);
    ptss->ss = segr.sel;
    hvm_get_segment_register(v, x86_seg_ds, &segr);
    ptss->ds = segr.sel;
    hvm_get_segment_register(v, x86_seg_fs, &segr);
    ptss->fs = segr.sel;
    hvm_get_segment_register(v, x86_seg_gs, &segr);
    ptss->gs = segr.sel;
    hvm_get_segment_register(v, x86_seg_ldtr, &segr);
    ptss->ldt = segr.sel;

    hvm_unmap(ptss);

    ptss = hvm_map(tr.base, sizeof(tss));
    if ( ptss == NULL )
        goto out;

    if ( !hvm_set_cr3(ptss->cr3) )
    {
        hvm_unmap(ptss);
        goto out;
    }

    regs->eip = ptss->eip;
    regs->eflags = ptss->eflags | 2;
    regs->eax = ptss->eax;
    regs->ecx = ptss->ecx;
    regs->edx = ptss->edx;
    regs->ebx = ptss->ebx;
    regs->esp = ptss->esp;
    regs->ebp = ptss->ebp;
    regs->esi = ptss->esi;
    regs->edi = ptss->edi;

    if ( (taskswitch_reason == TSW_call_or_int) )
    {
        regs->eflags |= X86_EFLAGS_NT;
        ptss->back_link = prev_tr.sel;
    }

    exn_raised = 0;
    if ( hvm_load_segment_selector(v, x86_seg_es, ptss->es) ||
         hvm_load_segment_selector(v, x86_seg_cs, ptss->cs) ||
         hvm_load_segment_selector(v, x86_seg_ss, ptss->ss) ||
         hvm_load_segment_selector(v, x86_seg_ds, ptss->ds) ||
         hvm_load_segment_selector(v, x86_seg_fs, ptss->fs) ||
         hvm_load_segment_selector(v, x86_seg_gs, ptss->gs) ||
         hvm_load_segment_selector(v, x86_seg_ldtr, ptss->ldt) )
        exn_raised = 1;

    if ( (ptss->trace & 1) && !exn_raised )
        hvm_inject_exception(TRAP_debug, tss_sel & 0xfff8, 0);

    hvm_unmap(ptss);

    tr.attr.fields.type = 0xb; /* busy 32-bit tss */
    hvm_set_segment_register(v, x86_seg_tr, &tr);

    v->arch.hvm_vcpu.guest_cr[0] |= X86_CR0_TS;
    hvm_update_guest_cr(v, 0);

    if ( (taskswitch_reason == TSW_iret) ||
         (taskswitch_reason == TSW_jmp) )
        clear_bit(41, optss_desc); /* clear B flag of old task */

    if ( taskswitch_reason != TSW_iret )
        set_bit(41, nptss_desc); /* set B flag of new task */

    if ( errcode >= 0 )
    {
        struct segment_register reg;
        unsigned long linear_addr;
        regs->esp -= 4;
        hvm_get_segment_register(current, x86_seg_ss, &reg);
        /* Todo: do not ignore access faults here. */
        if ( hvm_virtual_to_linear_addr(x86_seg_ss, &reg, regs->esp,
                                        4, hvm_access_write, 32,
                                        &linear_addr) )
            hvm_copy_to_guest_virt(linear_addr, &errcode, 4);
    }

 out:
    hvm_unmap(optss_desc);
    hvm_unmap(nptss_desc);
}

/*
 * __hvm_copy():
 *  @buf  = hypervisor buffer
 *  @addr = guest address to copy to/from
 *  @size = number of bytes to copy
 *  @dir  = copy *to* guest (TRUE) or *from* guest (FALSE)?
 *  @virt = addr is *virtual* (TRUE) or *guest physical* (FALSE)?
 * Returns number of bytes failed to copy (0 == complete success).
 */
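/*
 * Callers normally go through the four wrappers below rather than calling
 * __hvm_copy() directly; e.g. hvm_task_switch() pushes an error code with
 *     hvm_copy_to_guest_virt(linear_addr, &errcode, 4);
 * A zero return from any of them means every byte was copied.
 */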
static int __hvm_copy(void *buf, paddr_t addr, int size, int dir, int virt)
{
    unsigned long gfn, mfn;
    p2m_type_t p2mt;
    char *p;
    int count, todo;

    todo = size;
    while ( todo > 0 )
    {
        count = min_t(int, PAGE_SIZE - (addr & ~PAGE_MASK), todo);

        if ( virt )
            gfn = paging_gva_to_gfn(current, addr);
        else
            gfn = addr >> PAGE_SHIFT;

        mfn = mfn_x(gfn_to_mfn_current(gfn, &p2mt));

        if ( !p2m_is_ram(p2mt) )
            return todo;
        ASSERT(mfn_valid(mfn));

        p = (char *)map_domain_page(mfn) + (addr & ~PAGE_MASK);

        if ( dir )
        {
            memcpy(p, buf, count); /* dir == TRUE: *to* guest */
            paging_mark_dirty(current->domain, mfn);
        }
        else
            memcpy(buf, p, count); /* dir == FALSE: *from* guest */

        unmap_domain_page(p);

        addr += count;
        buf  += count;
        todo -= count;
    }

    return 0;
}

int hvm_copy_to_guest_phys(paddr_t paddr, void *buf, int size)
{
    return __hvm_copy(buf, paddr, size, 1, 0);
}

int hvm_copy_from_guest_phys(void *buf, paddr_t paddr, int size)
{
    return __hvm_copy(buf, paddr, size, 0, 0);
}

int hvm_copy_to_guest_virt(unsigned long vaddr, void *buf, int size)
{
    return __hvm_copy(buf, vaddr, size, 1, 1);
}

int hvm_copy_from_guest_virt(void *buf, unsigned long vaddr, int size)
{
    return __hvm_copy(buf, vaddr, size, 0, 1);
}

/* HVM specific printbuf. Mostly used for hvmloader chit-chat. */
void hvm_print_line(struct vcpu *v, const char c)
{
    struct hvm_domain *hd = &v->domain->arch.hvm_domain;

    spin_lock(&hd->pbuf_lock);
    hd->pbuf[hd->pbuf_idx++] = c;
    if ( (hd->pbuf_idx == (sizeof(hd->pbuf) - 2)) || (c == '\n') )
    {
        if ( c != '\n' )
            hd->pbuf[hd->pbuf_idx++] = '\n';
        hd->pbuf[hd->pbuf_idx] = '\0';
        printk(XENLOG_G_DEBUG "HVM%u: %s", v->domain->domain_id, hd->pbuf);
        hd->pbuf_idx = 0;
    }
    spin_unlock(&hd->pbuf_lock);
}

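/*
 * CPUID as seen by the guest: hypervisor leaves are handled first; for
 * the remaining leaves the host values are filtered, e.g. MWAIT is
 * hidden, APIC is hidden while the vLAPIC is hardware-disabled, and
 * PAE/PSE36/NX (and long mode on 32-bit builds) are masked according to
 * the domain's PAE setting and the hypervisor build.
 */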
void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
               unsigned int *ecx, unsigned int *edx)
{
    if ( !cpuid_hypervisor_leaves(input, eax, ebx, ecx, edx) )
    {
        cpuid(input, eax, ebx, ecx, edx);

        if ( input == 0x00000001 )
        {
            struct vcpu *v = current;

            clear_bit(X86_FEATURE_MWAIT & 31, ecx);

            if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
                clear_bit(X86_FEATURE_APIC & 31, edx);

#if CONFIG_PAGING_LEVELS >= 3
            if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
#endif
                clear_bit(X86_FEATURE_PAE & 31, edx);
            clear_bit(X86_FEATURE_PSE36 & 31, edx);
        }
        else if ( input == 0x80000001 )
        {
#if CONFIG_PAGING_LEVELS >= 3
            struct vcpu *v = current;
            if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
#endif
                clear_bit(X86_FEATURE_NX & 31, edx);
#ifdef __i386__
            /* Mask feature for Intel ia32e or AMD long mode. */
            clear_bit(X86_FEATURE_LAHF_LM & 31, ecx);

            clear_bit(X86_FEATURE_LM & 31, edx);
            clear_bit(X86_FEATURE_SYSCALL & 31, edx);
#endif
        }
    }
}

static long hvm_grant_table_op(
    unsigned int cmd, XEN_GUEST_HANDLE(void) uop, unsigned int count)
{
    if ( cmd != GNTTABOP_query_size )
        return -ENOSYS; /* all other commands need auditing */
    return do_grant_table_op(cmd, uop, count);
}

typedef unsigned long hvm_hypercall_t(
    unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);

#define HYPERCALL(x)                                        \
    [ __HYPERVISOR_ ## x ] = (hvm_hypercall_t *) do_ ## x

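/*
 * For example, HYPERCALL(xen_version) expands to
 *     [ __HYPERVISOR_xen_version ] = (hvm_hypercall_t *) do_xen_version
 * i.e. a designated initialiser wiring a hypercall number to its handler.
 */
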
#if defined(__i386__)

static hvm_hypercall_t *hvm_hypercall32_table[NR_hypercalls] = {
    HYPERCALL(memory_op),
    [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
    HYPERCALL(xen_version),
    HYPERCALL(grant_table_op),
    HYPERCALL(event_channel_op),
    HYPERCALL(sched_op),
    HYPERCALL(hvm_op)
};

#else /* defined(__x86_64__) */

static long do_memory_op_compat32(int cmd, XEN_GUEST_HANDLE(void) arg)
{
    extern long do_add_to_physmap(struct xen_add_to_physmap *xatp);
    long rc;

    switch ( cmd )
    {
    case XENMEM_add_to_physmap:
    {
        struct {
            domid_t domid;
            uint32_t space;
            uint32_t idx;
            uint32_t gpfn;
        } u;
        struct xen_add_to_physmap h;

        if ( copy_from_guest(&u, arg, 1) )
            return -EFAULT;

        h.domid = u.domid;
        h.space = u.space;
        h.idx = u.idx;
        h.gpfn = u.gpfn;

        this_cpu(guest_handles_in_xen_space) = 1;
        rc = do_memory_op(cmd, guest_handle_from_ptr(&h, void));
        this_cpu(guest_handles_in_xen_space) = 0;

        break;
    }

    default:
        gdprintk(XENLOG_WARNING, "memory_op %d.\n", cmd);
        rc = -ENOSYS;
        break;
    }

    return rc;
}

static hvm_hypercall_t *hvm_hypercall64_table[NR_hypercalls] = {
    HYPERCALL(memory_op),
    [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
    HYPERCALL(xen_version),
    HYPERCALL(grant_table_op),
    HYPERCALL(event_channel_op),
    HYPERCALL(sched_op),
    HYPERCALL(hvm_op)
};

static hvm_hypercall_t *hvm_hypercall32_table[NR_hypercalls] = {
    [ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)do_memory_op_compat32,
    [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
    HYPERCALL(xen_version),
    HYPERCALL(grant_table_op),
    HYPERCALL(event_channel_op),
    HYPERCALL(sched_op),
    HYPERCALL(hvm_op)
};

#endif /* defined(__x86_64__) */

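/*
 * Dispatch a hypercall made by an HVM guest.  Note the deliberate
 * fall-through in the mode switch below: a call issued from guest ring 3
 * drops into the default case and fails with -EPERM.
 */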
int hvm_do_hypercall(struct cpu_user_regs *regs)
{
    int flush, mode = hvm_guest_x86_mode(current);
    uint32_t eax = regs->eax;

    switch ( mode )
    {
#ifdef __x86_64__
    case 8:
#endif
    case 4:
    case 2:
        if ( unlikely(ring_3(regs)) )
        {
    default:
            regs->eax = -EPERM;
            return HVM_HCALL_completed;
        }
    case 0:
        break;
    }

    if ( (eax >= NR_hypercalls) || !hvm_hypercall32_table[eax] )
    {
        regs->eax = -ENOSYS;
        return HVM_HCALL_completed;
    }

    /*
     * NB. In future flush only on decrease_reservation.
     * For now we also need to flush when pages are added, as qemu-dm is not
     * yet capable of faulting pages into an existing valid mapcache bucket.
     */
    flush = ((eax == __HYPERVISOR_memory_op) ||
             (eax == __HYPERVISOR_grant_table_op)); /* needed ? */
    this_cpu(hc_preempted) = 0;

#ifdef __x86_64__
    if ( mode == 8 )
    {
        HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u(%lx, %lx, %lx, %lx, %lx)", eax,
                    regs->rdi, regs->rsi, regs->rdx, regs->r10, regs->r8);

        regs->rax = hvm_hypercall64_table[eax](regs->rdi,
                                               regs->rsi,
                                               regs->rdx,
                                               regs->r10,
                                               regs->r8);
    }
    else
#endif
    {
        HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u(%x, %x, %x, %x, %x)", eax,
                    (uint32_t)regs->ebx, (uint32_t)regs->ecx,
                    (uint32_t)regs->edx, (uint32_t)regs->esi,
                    (uint32_t)regs->edi);

        regs->eax = hvm_hypercall32_table[eax]((uint32_t)regs->ebx,
                                               (uint32_t)regs->ecx,
                                               (uint32_t)regs->edx,
                                               (uint32_t)regs->esi,
                                               (uint32_t)regs->edi);
    }

    HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u -> %lx",
                eax, (unsigned long)regs->eax);

    return (this_cpu(hc_preempted) ? HVM_HCALL_preempted :
            flush ? HVM_HCALL_invalidate : HVM_HCALL_completed);
}

static void hvm_latch_shinfo_size(struct domain *d)
{
    /*
     * Called from operations which are among the very first executed by
     * PV drivers on initialisation or after save/restore. These are sensible
     * points at which to sample the execution mode of the guest and latch
     * 32- or 64-bit format for shared state.
     */
    if ( current->domain == d )
        d->arch.has_32bit_shinfo = (hvm_guest_x86_mode(current) != 8);
}

/* Initialise a hypercall transfer page for a VMX domain using
   paravirtualised drivers. */
void hvm_hypercall_page_initialise(struct domain *d,
                                   void *hypercall_page)
{
    hvm_latch_shinfo_size(d);
    hvm_funcs.init_hypercall_page(d, hypercall_page);
}

/*
 * Only called in HVM domain BSP context.
 * When booting, vcpuid is always equal to apic_id.
 */
int hvm_bringup_ap(int vcpuid, int trampoline_vector)
{
    struct vcpu *v;
    struct domain *d = current->domain;
    struct vcpu_guest_context *ctxt;
    int rc = 0;

    BUG_ON(!is_hvm_domain(d));

    if ( (v = d->vcpu[vcpuid]) == NULL )
        return -ENOENT;

    if ( (ctxt = xmalloc(struct vcpu_guest_context)) == NULL )
    {
        gdprintk(XENLOG_ERR,
                 "Failed to allocate memory in hvm_bringup_ap.\n");
        return -ENOMEM;
    }

    hvm_init_ap_context(ctxt, vcpuid, trampoline_vector);

    /* Sync AP's TSC with BSP's. */
    v->arch.hvm_vcpu.cache_tsc_offset =
        v->domain->vcpu[0]->arch.hvm_vcpu.cache_tsc_offset;
    hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset);

    LOCK_BIGLOCK(d);
    rc = -EEXIST;
    if ( !v->is_initialised )
        rc = boot_vcpu(d, vcpuid, ctxt);
    UNLOCK_BIGLOCK(d);

    if ( rc != 0 )
    {
        gdprintk(XENLOG_ERR,
                 "AP %d bringup failed in boot_vcpu %x.\n", vcpuid, rc);
        goto out;
    }

    if ( test_and_clear_bit(_VPF_down, &v->pause_flags) )
        vcpu_wake(v);
    gdprintk(XENLOG_INFO, "AP %d bringup succeeded.\n", vcpuid);

 out:
    xfree(ctxt);
    return rc;
}

static int hvmop_set_pci_intx_level(
    XEN_GUEST_HANDLE(xen_hvm_set_pci_intx_level_t) uop)
{
    struct xen_hvm_set_pci_intx_level op;
    struct domain *d;
    int rc;

    if ( copy_from_guest(&op, uop, 1) )
        return -EFAULT;

    if ( !IS_PRIV(current->domain) )
        return -EPERM;

    if ( (op.domain > 0) || (op.bus > 0) || (op.device > 31) || (op.intx > 3) )
        return -EINVAL;

    d = rcu_lock_domain_by_id(op.domid);
    if ( d == NULL )
        return -ESRCH;

    rc = -EINVAL;
    if ( !is_hvm_domain(d) )
        goto out;

    rc = xsm_hvm_set_pci_intx_level(d);
    if ( rc )
        goto out;

    rc = 0;
    switch ( op.level )
    {
    case 0:
        hvm_pci_intx_deassert(d, op.device, op.intx);
        break;
    case 1:
        hvm_pci_intx_assert(d, op.device, op.intx);
        break;
    default:
        rc = -EINVAL;
        break;
    }

 out:
    rcu_unlock_domain(d);
    return rc;
}

static int hvmop_set_isa_irq_level(
    XEN_GUEST_HANDLE(xen_hvm_set_isa_irq_level_t) uop)
{
    struct xen_hvm_set_isa_irq_level op;
    struct domain *d;
    int rc;

    if ( copy_from_guest(&op, uop, 1) )
        return -EFAULT;

    if ( !IS_PRIV(current->domain) )
        return -EPERM;

    if ( op.isa_irq > 15 )
        return -EINVAL;

    d = rcu_lock_domain_by_id(op.domid);
    if ( d == NULL )
        return -ESRCH;

    rc = -EINVAL;
    if ( !is_hvm_domain(d) )
        goto out;

    rc = xsm_hvm_set_isa_irq_level(d);
    if ( rc )
        goto out;

    rc = 0;
    switch ( op.level )
    {
    case 0:
        hvm_isa_irq_deassert(d, op.isa_irq);
        break;
    case 1:
        hvm_isa_irq_assert(d, op.isa_irq);
        break;
    default:
        rc = -EINVAL;
        break;
    }

 out:
    rcu_unlock_domain(d);
    return rc;
}

static int hvmop_set_pci_link_route(
    XEN_GUEST_HANDLE(xen_hvm_set_pci_link_route_t) uop)
{
    struct xen_hvm_set_pci_link_route op;
    struct domain *d;
    int rc;

    if ( copy_from_guest(&op, uop, 1) )
        return -EFAULT;

    if ( !IS_PRIV(current->domain) )
        return -EPERM;

    if ( (op.link > 3) || (op.isa_irq > 15) )
        return -EINVAL;

    d = rcu_lock_domain_by_id(op.domid);
    if ( d == NULL )
        return -ESRCH;

    rc = -EINVAL;
    if ( !is_hvm_domain(d) )
        goto out;

    rc = xsm_hvm_set_pci_link_route(d);
    if ( rc )
        goto out;

    rc = 0;
    hvm_set_pci_link_route(d, op.link, op.isa_irq);

 out:
    rcu_unlock_domain(d);
    return rc;
}

static int hvmop_flush_tlb_all(void)
{
    struct domain *d = current->domain;
    struct vcpu *v;

    /* Avoid deadlock if more than one vcpu tries this at the same time. */
    if ( !spin_trylock(&d->hypercall_deadlock_mutex) )
        return -EAGAIN;

    /* Pause all other vcpus. */
    for_each_vcpu ( d, v )
        if ( v != current )
            vcpu_pause_nosync(v);

    /* Now that all VCPUs are signalled to deschedule, we wait... */
    for_each_vcpu ( d, v )
        if ( v != current )
            while ( !vcpu_runnable(v) && v->is_running )
                cpu_relax();

    /* All other vcpus are paused, safe to unlock now. */
    spin_unlock(&d->hypercall_deadlock_mutex);

    /* Flush paging-mode soft state (e.g., va->gfn cache; PAE PDPE cache). */
    for_each_vcpu ( d, v )
        paging_update_cr3(v);

    /* Flush all dirty TLBs. */
    flush_tlb_mask(d->domain_dirty_cpumask);

    /* Done. */
    for_each_vcpu ( d, v )
        if ( v != current )
            vcpu_unpause(v);

    return 0;
}

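/*
 * Top-level handler for the hvm_op hypercall: HVM parameter get/set plus
 * the interrupt-level, PCI-link-routing and TLB-flush sub-ops.  -EAGAIN
 * from a sub-op is turned into a hypercall continuation.
 */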
long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE(void) arg)
{
    long rc = 0;

    switch ( op )
    {
    case HVMOP_set_param:
    case HVMOP_get_param:
    {
        struct xen_hvm_param a;
        struct hvm_ioreq_page *iorp;
        struct domain *d;
        struct vcpu *v;

        if ( copy_from_guest(&a, arg, 1) )
            return -EFAULT;

        if ( a.index >= HVM_NR_PARAMS )
            return -EINVAL;

        if ( a.domid == DOMID_SELF )
            d = rcu_lock_current_domain();
        else if ( IS_PRIV(current->domain) )
            d = rcu_lock_domain_by_id(a.domid);
        else
            return -EPERM;

        if ( d == NULL )
            return -ESRCH;

        rc = -EINVAL;
        if ( !is_hvm_domain(d) )
            goto param_fail;

        rc = xsm_hvm_param(d, op);
        if ( rc )
            goto param_fail;

        if ( op == HVMOP_set_param )
        {
            switch ( a.index )
            {
            case HVM_PARAM_IOREQ_PFN:
                iorp = &d->arch.hvm_domain.ioreq;
                rc = hvm_set_ioreq_page(d, iorp, a.value);
                spin_lock(&iorp->lock);
                if ( (rc == 0) && (iorp->va != NULL) )
                    /* Initialise evtchn port info if VCPUs already created. */
                    for_each_vcpu ( d, v )
                        get_ioreq(v)->vp_eport = v->arch.hvm_vcpu.xen_port;
                spin_unlock(&iorp->lock);
                break;
            case HVM_PARAM_BUFIOREQ_PFN:
                iorp = &d->arch.hvm_domain.buf_ioreq;
                rc = hvm_set_ioreq_page(d, iorp, a.value);
                break;
            case HVM_PARAM_CALLBACK_IRQ:
                hvm_set_callback_via(d, a.value);
                hvm_latch_shinfo_size(d);
                break;
            }
            d->arch.hvm_domain.params[a.index] = a.value;
            rc = 0;
        }
        else
        {
            a.value = d->arch.hvm_domain.params[a.index];
            rc = copy_to_guest(arg, &a, 1) ? -EFAULT : 0;
        }

        HVM_DBG_LOG(DBG_LEVEL_HCALL, "%s param %u = %"PRIx64,
                    op == HVMOP_set_param ? "set" : "get",
                    a.index, a.value);

    param_fail:
        rcu_unlock_domain(d);
        break;
    }

    case HVMOP_set_pci_intx_level:
        rc = hvmop_set_pci_intx_level(
            guest_handle_cast(arg, xen_hvm_set_pci_intx_level_t));
        break;

    case HVMOP_set_isa_irq_level:
        rc = hvmop_set_isa_irq_level(
            guest_handle_cast(arg, xen_hvm_set_isa_irq_level_t));
        break;

    case HVMOP_set_pci_link_route:
        rc = hvmop_set_pci_link_route(
            guest_handle_cast(arg, xen_hvm_set_pci_link_route_t));
        break;

    case HVMOP_flush_tlbs:
        rc = guest_handle_is_null(arg) ? hvmop_flush_tlb_all() : -ENOSYS;
        break;

    default:
    {
        gdprintk(XENLOG_WARNING, "Bad HVM op %ld.\n", op);
        rc = -ENOSYS;
        break;
    }
    }

    if ( rc == -EAGAIN )
        rc = hypercall_create_continuation(
            __HYPERVISOR_hvm_op, "lh", op, arg);

    return rc;
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */