ia64/xen-unstable

view xen/arch/x86/hvm/hvm.c @ 16029:772674585a1a

hvm: Avoid the need for an ugly setcpucontext() in the HVM domain builder
by pre-setting vcpu0 to runnable inside Xen and having the builder
insert a JMP instruction to reach the hvmloader entry point from
address 0x0.
Signed-off-by: Keir Fraser <keir@xensource.com>
author Keir Fraser <keir@xensource.com>
date Mon Oct 01 15:12:05 2007 +0100 (2007-10-01)
parents 9eff4c97053b
children 60c898eeb17b
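The change above relies on a simple trick: with vcpu0 left runnable and started at guest address 0x0, the builder only has to place a 5-byte near JMP at 0x0 that lands on the hvmloader entry point. The sketch below is an editorial illustration of that encoding, not code from this changeset; the page-zero buffer and the 0x100000 entry address are assumptions supplied by the caller.

/* Illustrative sketch (not from this changeset): write "jmp rel32" at guest
 * address 0x0 so a vcpu started there falls through into hvmloader. */
#include <stdint.h>
#include <string.h>

static void write_entry_jump(uint8_t *guest_page0, uint32_t hvmloader_entry)
{
    /* 0xE9 = JMP rel32; the displacement is relative to the next
     * instruction, which for a jump placed at 0x0 starts at 0x5. */
    uint32_t rel32 = hvmloader_entry - 5;

    guest_page0[0] = 0xe9;
    memcpy(&guest_page0[1], &rel32, sizeof(rel32)); /* little-endian imm32 */
}

For an assumed entry point of 0x100000 this emits the bytes E9 FB FF 0F 00 at guest addresses 0x0-0x4.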
line source
1 /*
2 * hvm.c: Common hardware virtual machine abstractions.
3 *
4 * Copyright (c) 2004, Intel Corporation.
5 * Copyright (c) 2005, International Business Machines Corporation.
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms and conditions of the GNU General Public License,
9 * version 2, as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 *
16 * You should have received a copy of the GNU General Public License along with
17 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
18 * Place - Suite 330, Boston, MA 02111-1307 USA.
19 */
21 #include <xen/config.h>
22 #include <xen/init.h>
23 #include <xen/lib.h>
24 #include <xen/trace.h>
25 #include <xen/sched.h>
26 #include <xen/irq.h>
27 #include <xen/softirq.h>
28 #include <xen/domain.h>
29 #include <xen/domain_page.h>
30 #include <xen/hypercall.h>
31 #include <xen/guest_access.h>
32 #include <xen/event.h>
33 #include <asm/current.h>
34 #include <asm/e820.h>
35 #include <asm/io.h>
36 #include <asm/paging.h>
37 #include <asm/regs.h>
38 #include <asm/cpufeature.h>
39 #include <asm/processor.h>
40 #include <asm/types.h>
41 #include <asm/msr.h>
42 #include <asm/mc146818rtc.h>
43 #include <asm/spinlock.h>
44 #include <asm/hvm/hvm.h>
45 #include <asm/hvm/vpt.h>
46 #include <asm/hvm/support.h>
47 #include <public/sched.h>
48 #include <public/hvm/ioreq.h>
49 #include <public/version.h>
50 #include <public/memory.h>
52 /* Xen command-line option to disable hardware-assisted paging */
53 static int opt_hap_disabled;
54 invbool_param("hap", opt_hap_disabled);
56 int hvm_enabled __read_mostly;
58 unsigned int opt_hvm_debug_level __read_mostly;
59 integer_param("hvm_debug", opt_hvm_debug_level);
61 struct hvm_function_table hvm_funcs __read_mostly;
63 /* I/O permission bitmap is globally shared by all HVM guests. */
64 char __attribute__ ((__section__ (".bss.page_aligned")))
65 hvm_io_bitmap[3*PAGE_SIZE];
67 void hvm_enable(struct hvm_function_table *fns)
68 {
69 BUG_ON(hvm_enabled);
70 printk("HVM: %s enabled\n", fns->name);
72 /*
73 * Allow direct access to the PC debug port (it is often used for I/O
74 * delays, but the vmexits simply slow things down).
75 */
76 memset(hvm_io_bitmap, ~0, sizeof(hvm_io_bitmap));
77 clear_bit(0x80, hvm_io_bitmap);
79 hvm_funcs = *fns;
80 hvm_enabled = 1;
82 if ( hvm_funcs.hap_supported )
83 {
84 if ( opt_hap_disabled )
85 hvm_funcs.hap_supported = 0;
86 printk("HVM: Hardware Assisted Paging %sabled\n",
87 hvm_funcs.hap_supported ? "en" : "dis");
88 }
89 }
91 void hvm_set_guest_time(struct vcpu *v, u64 gtime)
92 {
93 u64 host_tsc;
95 rdtscll(host_tsc);
97 v->arch.hvm_vcpu.cache_tsc_offset = gtime - host_tsc;
98 hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset);
99 }
101 u64 hvm_get_guest_time(struct vcpu *v)
102 {
103 u64 host_tsc;
105 rdtscll(host_tsc);
106 return host_tsc + v->arch.hvm_vcpu.cache_tsc_offset;
107 }
109 void hvm_migrate_timers(struct vcpu *v)
110 {
111 rtc_migrate_timers(v);
112 hpet_migrate_timers(v);
113 pt_migrate(v);
114 }
116 void hvm_do_resume(struct vcpu *v)
117 {
118 ioreq_t *p;
120 if ( !v->fpu_dirtied )
121 hvm_funcs.stts(v);
123 pt_thaw_time(v);
125 /* NB. Optimised for common case (p->state == STATE_IOREQ_NONE). */
126 p = &get_ioreq(v)->vp_ioreq;
127 while ( p->state != STATE_IOREQ_NONE )
128 {
129 switch ( p->state )
130 {
131 case STATE_IORESP_READY: /* IORESP_READY -> NONE */
132 hvm_io_assist();
133 break;
134 case STATE_IOREQ_READY: /* IOREQ_{READY,INPROCESS} -> IORESP_READY */
135 case STATE_IOREQ_INPROCESS:
136 wait_on_xen_event_channel(v->arch.hvm_vcpu.xen_port,
137 (p->state != STATE_IOREQ_READY) &&
138 (p->state != STATE_IOREQ_INPROCESS));
139 break;
140 default:
141 gdprintk(XENLOG_ERR, "Weird HVM iorequest state %d.\n", p->state);
142 domain_crash_synchronous();
143 }
144 }
145 }
147 static void hvm_init_ioreq_page(
148 struct domain *d, struct hvm_ioreq_page *iorp)
149 {
150 memset(iorp, 0, sizeof(*iorp));
151 spin_lock_init(&iorp->lock);
152 domain_pause(d);
153 }
155 static void hvm_destroy_ioreq_page(
156 struct domain *d, struct hvm_ioreq_page *iorp)
157 {
158 spin_lock(&iorp->lock);
160 ASSERT(d->is_dying);
162 if ( iorp->va != NULL )
163 {
164 unmap_domain_page_global(iorp->va);
165 put_page_and_type(iorp->page);
166 iorp->va = NULL;
167 }
169 spin_unlock(&iorp->lock);
170 }
172 static int hvm_set_ioreq_page(
173 struct domain *d, struct hvm_ioreq_page *iorp, unsigned long gmfn)
174 {
175 struct page_info *page;
176 p2m_type_t p2mt;
177 unsigned long mfn;
178 void *va;
180 mfn = mfn_x(gfn_to_mfn(d, gmfn, &p2mt));
181 if ( !p2m_is_ram(p2mt) )
182 return -EINVAL;
183 ASSERT(mfn_valid(mfn));
185 page = mfn_to_page(mfn);
186 if ( !get_page_and_type(page, d, PGT_writable_page) )
187 return -EINVAL;
189 va = map_domain_page_global(mfn);
190 if ( va == NULL )
191 {
192 put_page_and_type(page);
193 return -ENOMEM;
194 }
196 spin_lock(&iorp->lock);
198 if ( (iorp->va != NULL) || d->is_dying )
199 {
200 spin_unlock(&iorp->lock);
201 unmap_domain_page_global(va);
202 put_page_and_type(mfn_to_page(mfn));
203 return -EINVAL;
204 }
206 iorp->va = va;
207 iorp->page = page;
209 spin_unlock(&iorp->lock);
211 domain_unpause(d);
213 return 0;
214 }
216 int hvm_domain_initialise(struct domain *d)
217 {
218 int rc;
220 if ( !hvm_enabled )
221 {
222 gdprintk(XENLOG_WARNING, "Attempt to create a HVM guest "
223 "on a non-VT/AMDV platform.\n");
224 return -EINVAL;
225 }
227 spin_lock_init(&d->arch.hvm_domain.pbuf_lock);
228 spin_lock_init(&d->arch.hvm_domain.irq_lock);
230 rc = paging_enable(d, PG_refcounts|PG_translate|PG_external);
231 if ( rc != 0 )
232 return rc;
234 vpic_init(d);
235 vioapic_init(d);
237 hvm_init_ioreq_page(d, &d->arch.hvm_domain.ioreq);
238 hvm_init_ioreq_page(d, &d->arch.hvm_domain.buf_ioreq);
240 return hvm_funcs.domain_initialise(d);
241 }
243 void hvm_domain_relinquish_resources(struct domain *d)
244 {
245 hvm_destroy_ioreq_page(d, &d->arch.hvm_domain.ioreq);
246 hvm_destroy_ioreq_page(d, &d->arch.hvm_domain.buf_ioreq);
248 pit_deinit(d);
249 rtc_deinit(d);
250 pmtimer_deinit(d);
251 hpet_deinit(d);
252 }
254 void hvm_domain_destroy(struct domain *d)
255 {
256 hvm_funcs.domain_destroy(d);
257 }
259 static int hvm_save_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
260 {
261 struct vcpu *v;
262 struct hvm_hw_cpu ctxt;
263 struct vcpu_guest_context *vc;
265 for_each_vcpu(d, v)
266 {
267 /* We don't need to save state for a vcpu that is down; the restore
268 * code will leave it down if there is nothing saved. */
269 if ( test_bit(_VPF_down, &v->pause_flags) )
270 continue;
272 /* Architecture-specific vmcs/vmcb bits */
273 hvm_funcs.save_cpu_ctxt(v, &ctxt);
275 /* Other vcpu register state */
276 vc = &v->arch.guest_context;
277 if ( v->fpu_initialised )
278 memcpy(ctxt.fpu_regs, &vc->fpu_ctxt, sizeof(ctxt.fpu_regs));
279 else
280 memset(ctxt.fpu_regs, 0, sizeof(ctxt.fpu_regs));
281 ctxt.rax = vc->user_regs.eax;
282 ctxt.rbx = vc->user_regs.ebx;
283 ctxt.rcx = vc->user_regs.ecx;
284 ctxt.rdx = vc->user_regs.edx;
285 ctxt.rbp = vc->user_regs.ebp;
286 ctxt.rsi = vc->user_regs.esi;
287 ctxt.rdi = vc->user_regs.edi;
288 ctxt.rsp = vc->user_regs.esp;
289 ctxt.rip = vc->user_regs.eip;
290 ctxt.rflags = vc->user_regs.eflags;
291 #ifdef __x86_64__
292 ctxt.r8 = vc->user_regs.r8;
293 ctxt.r9 = vc->user_regs.r9;
294 ctxt.r10 = vc->user_regs.r10;
295 ctxt.r11 = vc->user_regs.r11;
296 ctxt.r12 = vc->user_regs.r12;
297 ctxt.r13 = vc->user_regs.r13;
298 ctxt.r14 = vc->user_regs.r14;
299 ctxt.r15 = vc->user_regs.r15;
300 #endif
301 ctxt.dr0 = vc->debugreg[0];
302 ctxt.dr1 = vc->debugreg[1];
303 ctxt.dr2 = vc->debugreg[2];
304 ctxt.dr3 = vc->debugreg[3];
305 ctxt.dr6 = vc->debugreg[6];
306 ctxt.dr7 = vc->debugreg[7];
308 if ( hvm_save_entry(CPU, v->vcpu_id, h, &ctxt) != 0 )
309 return 1;
310 }
311 return 0;
312 }
314 static int hvm_load_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
315 {
316 int vcpuid, rc;
317 struct vcpu *v;
318 struct hvm_hw_cpu ctxt;
319 struct vcpu_guest_context *vc;
321 /* Which vcpu is this? */
322 vcpuid = hvm_load_instance(h);
323 if ( vcpuid >= MAX_VIRT_CPUS || (v = d->vcpu[vcpuid]) == NULL )
324 {
325 gdprintk(XENLOG_ERR, "HVM restore: domain has no vcpu %u\n", vcpuid);
326 return -EINVAL;
327 }
328 vc = &v->arch.guest_context;
330 /* Need to init this vcpu before loading its contents */
331 LOCK_BIGLOCK(d);
332 if ( !v->is_initialised )
333 if ( (rc = boot_vcpu(d, vcpuid, vc)) != 0 )
334 return rc;
335 UNLOCK_BIGLOCK(d);
337 if ( hvm_load_entry(CPU, h, &ctxt) != 0 )
338 return -EINVAL;
340 /* Sanity check some control registers. */
341 if ( (ctxt.cr0 & HVM_CR0_GUEST_RESERVED_BITS) ||
342 !(ctxt.cr0 & X86_CR0_ET) ||
343 ((ctxt.cr0 & (X86_CR0_PE|X86_CR0_PG)) == X86_CR0_PG) )
344 {
345 gdprintk(XENLOG_ERR, "HVM restore: bad CR0 0x%"PRIx64"\n",
346 ctxt.cr0);
347 return -EINVAL;
348 }
350 if ( ctxt.cr4 & HVM_CR4_GUEST_RESERVED_BITS )
351 {
352 gdprintk(XENLOG_ERR, "HVM restore: bad CR4 0x%"PRIx64"\n",
353 ctxt.cr4);
354 return -EINVAL;
355 }
357 if ( (ctxt.msr_efer & ~(EFER_LME | EFER_NX | EFER_SCE)) ||
358 ((sizeof(long) != 8) && (ctxt.msr_efer & EFER_LME)) ||
359 (!cpu_has_nx && (ctxt.msr_efer & EFER_NX)) ||
360 (!cpu_has_syscall && (ctxt.msr_efer & EFER_SCE)) ||
361 ((ctxt.msr_efer & (EFER_LME|EFER_LMA)) == EFER_LMA) )
362 {
363 gdprintk(XENLOG_ERR, "HVM restore: bad EFER 0x%"PRIx64"\n",
364 ctxt.msr_efer);
365 return -EINVAL;
366 }
368 /* Architecture-specific vmcs/vmcb bits */
369 if ( hvm_funcs.load_cpu_ctxt(v, &ctxt) < 0 )
370 return -EINVAL;
372 /* Other vcpu register state */
373 memcpy(&vc->fpu_ctxt, ctxt.fpu_regs, sizeof(ctxt.fpu_regs));
374 vc->user_regs.eax = ctxt.rax;
375 vc->user_regs.ebx = ctxt.rbx;
376 vc->user_regs.ecx = ctxt.rcx;
377 vc->user_regs.edx = ctxt.rdx;
378 vc->user_regs.ebp = ctxt.rbp;
379 vc->user_regs.esi = ctxt.rsi;
380 vc->user_regs.edi = ctxt.rdi;
381 vc->user_regs.esp = ctxt.rsp;
382 vc->user_regs.eip = ctxt.rip;
383 vc->user_regs.eflags = ctxt.rflags | 2;
384 #ifdef __x86_64__
385 vc->user_regs.r8 = ctxt.r8;
386 vc->user_regs.r9 = ctxt.r9;
387 vc->user_regs.r10 = ctxt.r10;
388 vc->user_regs.r11 = ctxt.r11;
389 vc->user_regs.r12 = ctxt.r12;
390 vc->user_regs.r13 = ctxt.r13;
391 vc->user_regs.r14 = ctxt.r14;
392 vc->user_regs.r15 = ctxt.r15;
393 #endif
394 vc->debugreg[0] = ctxt.dr0;
395 vc->debugreg[1] = ctxt.dr1;
396 vc->debugreg[2] = ctxt.dr2;
397 vc->debugreg[3] = ctxt.dr3;
398 vc->debugreg[6] = ctxt.dr6;
399 vc->debugreg[7] = ctxt.dr7;
401 vc->flags = VGCF_online;
402 v->fpu_initialised = 1;
404 /* Auxiliary processors should be woken immediately. */
405 if ( test_and_clear_bit(_VPF_down, &v->pause_flags) )
406 vcpu_wake(v);
408 return 0;
409 }
411 HVM_REGISTER_SAVE_RESTORE(CPU, hvm_save_cpu_ctxt, hvm_load_cpu_ctxt,
412 1, HVMSR_PER_VCPU);
414 int hvm_vcpu_initialise(struct vcpu *v)
415 {
416 int rc;
418 if ( (rc = vlapic_init(v)) != 0 )
419 return rc;
421 if ( (rc = hvm_funcs.vcpu_initialise(v)) != 0 )
422 {
423 vlapic_destroy(v);
424 return rc;
425 }
427 /* Create ioreq event channel. */
428 rc = alloc_unbound_xen_event_channel(v, 0);
429 if ( rc < 0 )
430 {
431 hvm_funcs.vcpu_destroy(v);
432 vlapic_destroy(v);
433 return rc;
434 }
436 /* Register ioreq event channel. */
437 v->arch.hvm_vcpu.xen_port = rc;
438 spin_lock(&v->domain->arch.hvm_domain.ioreq.lock);
439 if ( v->domain->arch.hvm_domain.ioreq.va != NULL )
440 get_ioreq(v)->vp_eport = v->arch.hvm_vcpu.xen_port;
441 spin_unlock(&v->domain->arch.hvm_domain.ioreq.lock);
443 spin_lock_init(&v->arch.hvm_vcpu.tm_lock);
444 INIT_LIST_HEAD(&v->arch.hvm_vcpu.tm_list);
446 v->arch.guest_context.user_regs.eflags = 2;
448 if ( v->vcpu_id == 0 )
449 {
450 /* NB. All these really belong in hvm_domain_initialise(). */
451 pit_init(v, cpu_khz);
452 rtc_init(v, RTC_PORT(0));
453 pmtimer_init(v);
454 hpet_init(v);
456 /* Init guest TSC to start from zero. */
457 hvm_set_guest_time(v, 0);
459 /* Can start up without SIPI-SIPI or setvcpucontext domctl. */
460 v->is_initialised = 1;
461 clear_bit(_VPF_down, &v->pause_flags);
462 }
464 return 0;
465 }
467 void hvm_vcpu_destroy(struct vcpu *v)
468 {
469 vlapic_destroy(v);
470 hvm_funcs.vcpu_destroy(v);
472 /* Event channel is already freed by evtchn_destroy(). */
473 /*free_xen_event_channel(v, v->arch.hvm_vcpu.xen_port);*/
474 }
477 void hvm_vcpu_reset(struct vcpu *v)
478 {
479 vcpu_pause(v);
481 vlapic_reset(vcpu_vlapic(v));
483 hvm_funcs.vcpu_initialise(v);
485 set_bit(_VPF_down, &v->pause_flags);
486 clear_bit(_VPF_blocked, &v->pause_flags);
487 v->fpu_initialised = 0;
488 v->fpu_dirtied = 0;
489 v->is_initialised = 0;
491 vcpu_unpause(v);
492 }
494 static void hvm_vcpu_down(void)
495 {
496 struct vcpu *v = current;
497 struct domain *d = v->domain;
498 int online_count = 0;
500 gdprintk(XENLOG_INFO, "DOM%d/VCPU%d: going offline.\n",
501 d->domain_id, v->vcpu_id);
503 /* Doesn't halt us immediately, but we'll never return to guest context. */
504 set_bit(_VPF_down, &v->pause_flags);
505 vcpu_sleep_nosync(v);
507 /* Any other VCPUs online? ... */
508 LOCK_BIGLOCK(d);
509 for_each_vcpu ( d, v )
510 if ( !test_bit(_VPF_down, &v->pause_flags) )
511 online_count++;
512 UNLOCK_BIGLOCK(d);
514 /* ... Shut down the domain if not. */
515 if ( online_count == 0 )
516 {
517 gdprintk(XENLOG_INFO, "DOM%d: all CPUs offline -- powering off.\n",
518 d->domain_id);
519 domain_shutdown(d, SHUTDOWN_poweroff);
520 }
521 }
523 void hvm_send_assist_req(struct vcpu *v)
524 {
525 ioreq_t *p;
527 if ( unlikely(!vcpu_start_shutdown_deferral(v)) )
528 return; /* implicitly bins the i/o operation */
530 p = &get_ioreq(v)->vp_ioreq;
531 if ( unlikely(p->state != STATE_IOREQ_NONE) )
532 {
533 /* This indicates a bug in the device model. Crash the domain. */
534 gdprintk(XENLOG_ERR, "Device model set bad IO state %d.\n", p->state);
535 domain_crash_synchronous();
536 }
538 prepare_wait_on_xen_event_channel(v->arch.hvm_vcpu.xen_port);
540 /*
541 * Following happens /after/ blocking and setting up ioreq contents.
542 * prepare_wait_on_xen_event_channel() is an implicit barrier.
543 */
544 p->state = STATE_IOREQ_READY;
545 notify_via_xen_event_channel(v->arch.hvm_vcpu.xen_port);
546 }
548 void hvm_hlt(unsigned long rflags)
549 {
550 /*
551 * If we halt with interrupts disabled, that's a pretty sure sign that we
552 * want to shut down. In a real processor, NMIs are the only way to break
553 * out of this.
554 */
555 if ( unlikely(!(rflags & X86_EFLAGS_IF)) )
556 return hvm_vcpu_down();
558 do_sched_op_compat(SCHEDOP_block, 0);
559 }
561 void hvm_triple_fault(void)
562 {
563 struct vcpu *v = current;
564 gdprintk(XENLOG_INFO, "Triple fault on VCPU%d - "
565 "invoking HVM system reset.\n", v->vcpu_id);
566 domain_shutdown(v->domain, SHUTDOWN_reboot);
567 }
569 int hvm_set_efer(uint64_t value)
570 {
571 struct vcpu *v = current;
573 value &= ~EFER_LMA;
575 if ( (value & ~(EFER_LME | EFER_NX | EFER_SCE)) ||
576 ((sizeof(long) != 8) && (value & EFER_LME)) ||
577 (!cpu_has_nx && (value & EFER_NX)) ||
578 (!cpu_has_syscall && (value & EFER_SCE)) )
579 {
580 gdprintk(XENLOG_WARNING, "Trying to set reserved bit in "
581 "EFER: %"PRIx64"\n", value);
582 hvm_inject_exception(TRAP_gp_fault, 0, 0);
583 return 0;
584 }
586 if ( ((value ^ v->arch.hvm_vcpu.guest_efer) & EFER_LME) &&
587 hvm_paging_enabled(v) )
588 {
589 gdprintk(XENLOG_WARNING,
590 "Trying to change EFER.LME with paging enabled\n");
591 hvm_inject_exception(TRAP_gp_fault, 0, 0);
592 return 0;
593 }
595 value |= v->arch.hvm_vcpu.guest_efer & EFER_LMA;
596 v->arch.hvm_vcpu.guest_efer = value;
597 hvm_update_guest_efer(v);
599 return 1;
600 }
602 int hvm_set_cr0(unsigned long value)
603 {
604 struct vcpu *v = current;
605 p2m_type_t p2mt;
606 unsigned long gfn, mfn, old_value = v->arch.hvm_vcpu.guest_cr[0];
608 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx", value);
610 if ( (u32)value != value )
611 {
612 HVM_DBG_LOG(DBG_LEVEL_1,
613 "Guest attempts to set upper 32 bits in CR0: %lx",
614 value);
615 hvm_inject_exception(TRAP_gp_fault, 0, 0);
616 return 0;
617 }
619 value &= ~HVM_CR0_GUEST_RESERVED_BITS;
621 /* ET is reserved and should always be 1. */
622 value |= X86_CR0_ET;
624 if ( (value & (X86_CR0_PE|X86_CR0_PG)) == X86_CR0_PG )
625 {
626 hvm_inject_exception(TRAP_gp_fault, 0, 0);
627 return 0;
628 }
630 if ( (value & X86_CR0_PG) && !(old_value & X86_CR0_PG) )
631 {
632 if ( v->arch.hvm_vcpu.guest_efer & EFER_LME )
633 {
634 if ( !(v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PAE) )
635 {
636 HVM_DBG_LOG(DBG_LEVEL_1, "Enable paging before PAE enable");
637 hvm_inject_exception(TRAP_gp_fault, 0, 0);
638 return 0;
639 }
640 HVM_DBG_LOG(DBG_LEVEL_1, "Enabling long mode");
641 v->arch.hvm_vcpu.guest_efer |= EFER_LMA;
642 hvm_update_guest_efer(v);
643 }
645 if ( !paging_mode_hap(v->domain) )
646 {
647 /* The guest CR3 must be pointing to the guest physical. */
648 gfn = v->arch.hvm_vcpu.guest_cr[3]>>PAGE_SHIFT;
649 mfn = mfn_x(gfn_to_mfn_current(gfn, &p2mt));
650 if ( !p2m_is_ram(p2mt) || !mfn_valid(mfn) ||
651 !get_page(mfn_to_page(mfn), v->domain))
652 {
653 gdprintk(XENLOG_ERR, "Invalid CR3 value = %lx (mfn=%lx)\n",
654 v->arch.hvm_vcpu.guest_cr[3], mfn);
655 domain_crash(v->domain);
656 return 0;
657 }
659 /* Now arch.guest_table points to machine physical. */
660 v->arch.guest_table = pagetable_from_pfn(mfn);
662 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
663 v->arch.hvm_vcpu.guest_cr[3], mfn);
664 }
665 }
666 else if ( !(value & X86_CR0_PG) && (old_value & X86_CR0_PG) )
667 {
668 /* When CR0.PG is cleared, LMA is cleared immediately. */
669 if ( hvm_long_mode_enabled(v) )
670 {
671 v->arch.hvm_vcpu.guest_efer &= ~EFER_LMA;
672 hvm_update_guest_efer(v);
673 }
675 if ( !paging_mode_hap(v->domain) )
676 {
677 put_page(pagetable_get_page(v->arch.guest_table));
678 v->arch.guest_table = pagetable_null();
679 }
680 }
682 v->arch.hvm_vcpu.guest_cr[0] = value;
683 hvm_update_guest_cr(v, 0);
685 if ( (value ^ old_value) & X86_CR0_PG )
686 paging_update_paging_modes(v);
688 return 1;
689 }
691 int hvm_set_cr3(unsigned long value)
692 {
693 unsigned long mfn;
694 p2m_type_t p2mt;
695 struct vcpu *v = current;
697 if ( hvm_paging_enabled(v) && !paging_mode_hap(v->domain) &&
698 (value != v->arch.hvm_vcpu.guest_cr[3]) )
699 {
700 /* Shadow-mode CR3 change. Check PDBR and update refcounts. */
701 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
702 mfn = mfn_x(gfn_to_mfn_current(value >> PAGE_SHIFT, &p2mt));
703 if ( !p2m_is_ram(p2mt) || !mfn_valid(mfn) ||
704 !get_page(mfn_to_page(mfn), v->domain) )
705 goto bad_cr3;
707 put_page(pagetable_get_page(v->arch.guest_table));
708 v->arch.guest_table = pagetable_from_pfn(mfn);
710 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
711 }
713 v->arch.hvm_vcpu.guest_cr[3] = value;
714 paging_update_cr3(v);
715 return 1;
717 bad_cr3:
718 gdprintk(XENLOG_ERR, "Invalid CR3\n");
719 domain_crash(v->domain);
720 return 0;
721 }
723 int hvm_set_cr4(unsigned long value)
724 {
725 struct vcpu *v = current;
726 unsigned long old_cr;
728 if ( value & HVM_CR4_GUEST_RESERVED_BITS )
729 {
730 HVM_DBG_LOG(DBG_LEVEL_1,
731 "Guest attempts to set reserved bit in CR4: %lx",
732 value);
733 goto gpf;
734 }
736 if ( !(value & X86_CR4_PAE) && hvm_long_mode_enabled(v) )
737 {
738 HVM_DBG_LOG(DBG_LEVEL_1, "Guest cleared CR4.PAE while "
739 "EFER.LMA is set");
740 goto gpf;
741 }
743 old_cr = v->arch.hvm_vcpu.guest_cr[4];
744 v->arch.hvm_vcpu.guest_cr[4] = value;
745 hvm_update_guest_cr(v, 4);
747 /* Modifying CR4.{PSE,PAE,PGE} invalidates all TLB entries, inc. Global. */
748 if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
749 paging_update_paging_modes(v);
751 return 1;
753 gpf:
754 hvm_inject_exception(TRAP_gp_fault, 0, 0);
755 return 0;
756 }
758 int hvm_virtual_to_linear_addr(
759 enum x86_segment seg,
760 struct segment_register *reg,
761 unsigned long offset,
762 unsigned int bytes,
763 enum hvm_access_type access_type,
764 unsigned int addr_size,
765 unsigned long *linear_addr)
766 {
767 unsigned long addr = offset;
768 uint32_t last_byte;
770 if ( addr_size != 64 )
771 {
772 /*
773 * COMPATIBILITY MODE: Apply segment checks and add base.
774 */
776 switch ( access_type )
777 {
778 case hvm_access_read:
779 if ( (reg->attr.fields.type & 0xa) == 0x8 )
780 goto gpf; /* execute-only code segment */
781 break;
782 case hvm_access_write:
783 if ( (reg->attr.fields.type & 0xa) != 0x2 )
784 goto gpf; /* not a writable data segment */
785 break;
786 default:
787 break;
788 }
790 last_byte = offset + bytes - 1;
792 /* Is this a grows-down data segment? Special limit check if so. */
793 if ( (reg->attr.fields.type & 0xc) == 0x4 )
794 {
795 /* Is upper limit 0xFFFF or 0xFFFFFFFF? */
796 if ( !reg->attr.fields.db )
797 last_byte = (uint16_t)last_byte;
799 /* Check first byte and last byte against respective bounds. */
800 if ( (offset <= reg->limit) || (last_byte < offset) )
801 goto gpf;
802 }
803 else if ( (last_byte > reg->limit) || (last_byte < offset) )
804 goto gpf; /* last byte is beyond limit or wraps 0xFFFFFFFF */
806 /*
807 * Hardware truncates to 32 bits in compatibility mode.
808 * It does not truncate to 16 bits in 16-bit address-size mode.
809 */
810 addr = (uint32_t)(addr + reg->base);
811 }
812 else
813 {
814 /*
815 * LONG MODE: FS and GS add segment base. Addresses must be canonical.
816 */
818 if ( (seg == x86_seg_fs) || (seg == x86_seg_gs) )
819 addr += reg->base;
821 if ( !is_canonical_address(addr) )
822 goto gpf;
823 }
825 *linear_addr = addr;
826 return 1;
828 gpf:
829 return 0;
830 }
832 static void *hvm_map(unsigned long va, int size)
833 {
834 unsigned long gfn, mfn;
835 p2m_type_t p2mt;
837 if ( ((va & ~PAGE_MASK) + size) > PAGE_SIZE )
838 {
839 hvm_inject_exception(TRAP_page_fault, PFEC_write_access,
840 (va + PAGE_SIZE - 1) & PAGE_MASK);
841 return NULL;
842 }
844 gfn = paging_gva_to_gfn(current, va);
845 mfn = mfn_x(gfn_to_mfn_current(gfn, &p2mt));
846 if ( !p2m_is_ram(p2mt) )
847 {
848 hvm_inject_exception(TRAP_page_fault, PFEC_write_access, va);
849 return NULL;
850 }
852 ASSERT(mfn_valid(mfn));
854 paging_mark_dirty(current->domain, mfn);
856 return (char *)map_domain_page(mfn) + (va & ~PAGE_MASK);
857 }
859 static void hvm_unmap(void *p)
860 {
861 if ( p )
862 unmap_domain_page(p);
863 }
865 static int hvm_load_segment_selector(
866 struct vcpu *v, enum x86_segment seg, uint16_t sel)
867 {
868 struct segment_register desctab, cs, segr;
869 struct desc_struct *pdesc, desc;
870 u8 dpl, rpl, cpl;
871 int fault_type = TRAP_invalid_tss;
873 /* NULL selector? */
874 if ( (sel & 0xfffc) == 0 )
875 {
876 if ( (seg == x86_seg_cs) || (seg == x86_seg_ss) )
877 goto fail;
878 memset(&segr, 0, sizeof(segr));
879 hvm_set_segment_register(v, seg, &segr);
880 return 0;
881 }
883 /* LDT descriptor must be in the GDT. */
884 if ( (seg == x86_seg_ldtr) && (sel & 4) )
885 goto fail;
887 hvm_get_segment_register(v, x86_seg_cs, &cs);
888 hvm_get_segment_register(
889 v, (sel & 4) ? x86_seg_ldtr : x86_seg_gdtr, &desctab);
891 /* Check against descriptor table limit. */
892 if ( ((sel & 0xfff8) + 7) > desctab.limit )
893 goto fail;
895 pdesc = hvm_map(desctab.base + (sel & 0xfff8), 8);
896 if ( pdesc == NULL )
897 goto hvm_map_fail;
899 do {
900 desc = *pdesc;
902 /* Segment present in memory? */
903 if ( !(desc.b & (1u<<15)) )
904 {
905 fault_type = TRAP_no_segment;
906 goto unmap_and_fail;
907 }
909 /* LDT descriptor is a system segment. All others are code/data. */
910 if ( (desc.b & (1u<<12)) == ((seg == x86_seg_ldtr) << 12) )
911 goto unmap_and_fail;
913 dpl = (desc.b >> 13) & 3;
914 rpl = sel & 3;
915 cpl = cs.sel & 3;
917 switch ( seg )
918 {
919 case x86_seg_cs:
920 /* Code segment? */
921 if ( !(desc.b & (1u<<11)) )
922 goto unmap_and_fail;
923 /* Non-conforming segment: check DPL against RPL. */
924 if ( ((desc.b & (6u<<9)) != 6) && (dpl != rpl) )
925 goto unmap_and_fail;
926 break;
927 case x86_seg_ss:
928 /* Writable data segment? */
929 if ( (desc.b & (5u<<9)) != (1u<<9) )
930 goto unmap_and_fail;
931 if ( (dpl != cpl) || (dpl != rpl) )
932 goto unmap_and_fail;
933 break;
934 case x86_seg_ldtr:
935 /* LDT system segment? */
936 if ( (desc.b & (15u<<8)) != (2u<<8) )
937 goto unmap_and_fail;
938 goto skip_accessed_flag;
939 default:
940 /* Readable code or data segment? */
941 if ( (desc.b & (5u<<9)) == (4u<<9) )
942 goto unmap_and_fail;
943 /* Non-conforming segment: check DPL against RPL and CPL. */
944 if ( ((desc.b & (6u<<9)) != 6) && ((dpl < cpl) || (dpl < rpl)) )
945 goto unmap_and_fail;
946 break;
947 }
948 } while ( !(desc.b & 0x100) && /* Ensure Accessed flag is set */
949 (cmpxchg(&pdesc->b, desc.b, desc.b | 0x100) != desc.b) );
951 /* Force the Accessed flag in our local copy. */
952 desc.b |= 0x100;
954 skip_accessed_flag:
955 hvm_unmap(pdesc);
957 segr.base = (((desc.b << 0) & 0xff000000u) |
958 ((desc.b << 16) & 0x00ff0000u) |
959 ((desc.a >> 16) & 0x0000ffffu));
960 segr.attr.bytes = (((desc.b >> 8) & 0x00ffu) |
961 ((desc.b >> 12) & 0x0f00u));
962 segr.limit = (desc.b & 0x000f0000u) | (desc.a & 0x0000ffffu);
963 if ( segr.attr.fields.g )
964 segr.limit = (segr.limit << 12) | 0xfffu;
965 segr.sel = sel;
966 hvm_set_segment_register(v, seg, &segr);
968 return 0;
970 unmap_and_fail:
971 hvm_unmap(pdesc);
972 fail:
973 hvm_inject_exception(fault_type, sel & 0xfffc, 0);
974 hvm_map_fail:
975 return 1;
976 }
978 void hvm_task_switch(
979 uint16_t tss_sel, enum hvm_task_switch_reason taskswitch_reason,
980 int32_t errcode)
981 {
982 struct vcpu *v = current;
983 struct cpu_user_regs *regs = guest_cpu_user_regs();
984 struct segment_register gdt, tr, prev_tr, segr;
985 struct desc_struct *optss_desc = NULL, *nptss_desc = NULL, tss_desc;
986 unsigned long eflags;
987 int exn_raised;
988 struct {
989 u16 back_link,__blh;
990 u32 esp0;
991 u16 ss0, _0;
992 u32 esp1;
993 u16 ss1, _1;
994 u32 esp2;
995 u16 ss2, _2;
996 u32 cr3, eip, eflags, eax, ecx, edx, ebx, esp, ebp, esi, edi;
997 u16 es, _3, cs, _4, ss, _5, ds, _6, fs, _7, gs, _8, ldt, _9;
998 u16 trace, iomap;
999 } *ptss, tss;
1001 hvm_get_segment_register(v, x86_seg_gdtr, &gdt);
1002 hvm_get_segment_register(v, x86_seg_tr, &prev_tr);
1004 if ( ((tss_sel & 0xfff8) + 7) > gdt.limit )
1005 {
1006 hvm_inject_exception((taskswitch_reason == TSW_iret) ?
1007 TRAP_invalid_tss : TRAP_gp_fault,
1008 tss_sel & 0xfff8, 0);
1009 goto out;
1010 }
1012 optss_desc = hvm_map(gdt.base + (prev_tr.sel & 0xfff8), 8);
1013 if ( optss_desc == NULL )
1014 goto out;
1016 nptss_desc = hvm_map(gdt.base + (tss_sel & 0xfff8), 8);
1017 if ( nptss_desc == NULL )
1018 goto out;
1020 tss_desc = *nptss_desc;
1021 tr.sel = tss_sel;
1022 tr.base = (((tss_desc.b << 0) & 0xff000000u) |
1023 ((tss_desc.b << 16) & 0x00ff0000u) |
1024 ((tss_desc.a >> 16) & 0x0000ffffu));
1025 tr.attr.bytes = (((tss_desc.b >> 8) & 0x00ffu) |
1026 ((tss_desc.b >> 12) & 0x0f00u));
1027 tr.limit = (tss_desc.b & 0x000f0000u) | (tss_desc.a & 0x0000ffffu);
1028 if ( tr.attr.fields.g )
1029 tr.limit = (tr.limit << 12) | 0xfffu;
1031 if ( !tr.attr.fields.p )
1032 {
1033 hvm_inject_exception(TRAP_no_segment, tss_sel & 0xfff8, 0);
1034 goto out;
1035 }
1037 if ( tr.attr.fields.type != ((taskswitch_reason == TSW_iret) ? 0xb : 0x9) )
1038 {
1039 hvm_inject_exception(
1040 (taskswitch_reason == TSW_iret) ? TRAP_invalid_tss : TRAP_gp_fault,
1041 tss_sel & 0xfff8, 0);
1042 goto out;
1043 }
1045 if ( !tr.attr.fields.g && (tr.limit < (sizeof(tss)-1)) )
1046 {
1047 hvm_inject_exception(TRAP_invalid_tss, tss_sel & 0xfff8, 0);
1048 goto out;
1049 }
1051 ptss = hvm_map(prev_tr.base, sizeof(tss));
1052 if ( ptss == NULL )
1053 goto out;
1055 eflags = regs->eflags;
1056 if ( taskswitch_reason == TSW_iret )
1057 eflags &= ~X86_EFLAGS_NT;
1059 ptss->cr3 = v->arch.hvm_vcpu.guest_cr[3];
1060 ptss->eip = regs->eip;
1061 ptss->eflags = eflags;
1062 ptss->eax = regs->eax;
1063 ptss->ecx = regs->ecx;
1064 ptss->edx = regs->edx;
1065 ptss->ebx = regs->ebx;
1066 ptss->esp = regs->esp;
1067 ptss->ebp = regs->ebp;
1068 ptss->esi = regs->esi;
1069 ptss->edi = regs->edi;
1071 hvm_get_segment_register(v, x86_seg_es, &segr);
1072 ptss->es = segr.sel;
1073 hvm_get_segment_register(v, x86_seg_cs, &segr);
1074 ptss->cs = segr.sel;
1075 hvm_get_segment_register(v, x86_seg_ss, &segr);
1076 ptss->ss = segr.sel;
1077 hvm_get_segment_register(v, x86_seg_ds, &segr);
1078 ptss->ds = segr.sel;
1079 hvm_get_segment_register(v, x86_seg_fs, &segr);
1080 ptss->fs = segr.sel;
1081 hvm_get_segment_register(v, x86_seg_gs, &segr);
1082 ptss->gs = segr.sel;
1083 hvm_get_segment_register(v, x86_seg_ldtr, &segr);
1084 ptss->ldt = segr.sel;
1086 hvm_unmap(ptss);
1088 ptss = hvm_map(tr.base, sizeof(tss));
1089 if ( ptss == NULL )
1090 goto out;
1092 if ( !hvm_set_cr3(ptss->cr3) )
1093 {
1094 hvm_unmap(ptss);
1095 goto out;
1096 }
1098 regs->eip = ptss->eip;
1099 regs->eflags = ptss->eflags | 2;
1100 regs->eax = ptss->eax;
1101 regs->ecx = ptss->ecx;
1102 regs->edx = ptss->edx;
1103 regs->ebx = ptss->ebx;
1104 regs->esp = ptss->esp;
1105 regs->ebp = ptss->ebp;
1106 regs->esi = ptss->esi;
1107 regs->edi = ptss->edi;
1109 if ( (taskswitch_reason == TSW_call_or_int) )
1110 {
1111 regs->eflags |= X86_EFLAGS_NT;
1112 ptss->back_link = prev_tr.sel;
1113 }
1115 exn_raised = 0;
1116 if ( hvm_load_segment_selector(v, x86_seg_es, ptss->es) ||
1117 hvm_load_segment_selector(v, x86_seg_cs, ptss->cs) ||
1118 hvm_load_segment_selector(v, x86_seg_ss, ptss->ss) ||
1119 hvm_load_segment_selector(v, x86_seg_ds, ptss->ds) ||
1120 hvm_load_segment_selector(v, x86_seg_fs, ptss->fs) ||
1121 hvm_load_segment_selector(v, x86_seg_gs, ptss->gs) ||
1122 hvm_load_segment_selector(v, x86_seg_ldtr, ptss->ldt) )
1123 exn_raised = 1;
1125 if ( (ptss->trace & 1) && !exn_raised )
1126 hvm_inject_exception(TRAP_debug, tss_sel & 0xfff8, 0);
1128 hvm_unmap(ptss);
1130 tr.attr.fields.type = 0xb; /* busy 32-bit tss */
1131 hvm_set_segment_register(v, x86_seg_tr, &tr);
1133 v->arch.hvm_vcpu.guest_cr[0] |= X86_CR0_TS;
1134 hvm_update_guest_cr(v, 0);
1136 if ( (taskswitch_reason == TSW_iret) ||
1137 (taskswitch_reason == TSW_jmp) )
1138 clear_bit(41, optss_desc); /* clear B flag of old task */
1140 if ( taskswitch_reason != TSW_iret )
1141 set_bit(41, nptss_desc); /* set B flag of new task */
1143 if ( errcode >= 0 )
1144 {
1145 struct segment_register reg;
1146 unsigned long linear_addr;
1147 regs->esp -= 4;
1148 hvm_get_segment_register(current, x86_seg_ss, &reg);
1149 /* Todo: do not ignore access faults here. */
1150 if ( hvm_virtual_to_linear_addr(x86_seg_ss, &reg, regs->esp,
1151 4, hvm_access_write, 32,
1152 &linear_addr) )
1153 hvm_copy_to_guest_virt(linear_addr, &errcode, 4);
1154 }
1156 out:
1157 hvm_unmap(optss_desc);
1158 hvm_unmap(nptss_desc);
1159 }
1161 /*
1162 * __hvm_copy():
1163 * @buf = hypervisor buffer
1164 * @addr = guest address to copy to/from
1165 * @size = number of bytes to copy
1166 * @dir = copy *to* guest (TRUE) or *from* guest (FALSE)?
1167 * @virt = addr is *virtual* (TRUE) or *guest physical* (FALSE)?
1168 * Returns number of bytes failed to copy (0 == complete success).
1169 */
1170 static int __hvm_copy(void *buf, paddr_t addr, int size, int dir, int virt)
1171 {
1172 unsigned long gfn, mfn;
1173 p2m_type_t p2mt;
1174 char *p;
1175 int count, todo;
1177 todo = size;
1178 while ( todo > 0 )
1179 {
1180 count = min_t(int, PAGE_SIZE - (addr & ~PAGE_MASK), todo);
1182 if ( virt )
1183 gfn = paging_gva_to_gfn(current, addr);
1184 else
1185 gfn = addr >> PAGE_SHIFT;
1187 mfn = mfn_x(gfn_to_mfn_current(gfn, &p2mt));
1189 if ( !p2m_is_ram(p2mt) )
1190 return todo;
1191 ASSERT(mfn_valid(mfn));
1193 p = (char *)map_domain_page(mfn) + (addr & ~PAGE_MASK);
1195 if ( dir )
1196 {
1197 memcpy(p, buf, count); /* dir == TRUE: *to* guest */
1198 paging_mark_dirty(current->domain, mfn);
1199 }
1200 else
1201 memcpy(buf, p, count); /* dir == FALSE: *from guest */
1203 unmap_domain_page(p);
1205 addr += count;
1206 buf += count;
1207 todo -= count;
1208 }
1210 return 0;
1211 }
1213 int hvm_copy_to_guest_phys(paddr_t paddr, void *buf, int size)
1214 {
1215 return __hvm_copy(buf, paddr, size, 1, 0);
1216 }
1218 int hvm_copy_from_guest_phys(void *buf, paddr_t paddr, int size)
1219 {
1220 return __hvm_copy(buf, paddr, size, 0, 0);
1221 }
1223 int hvm_copy_to_guest_virt(unsigned long vaddr, void *buf, int size)
1224 {
1225 return __hvm_copy(buf, vaddr, size, 1, 1);
1226 }
1228 int hvm_copy_from_guest_virt(void *buf, unsigned long vaddr, int size)
1229 {
1230 return __hvm_copy(buf, vaddr, size, 0, 1);
1231 }
1234 /* HVM specific printbuf. Mostly used for hvmloader chit-chat. */
1235 void hvm_print_line(struct vcpu *v, const char c)
1236 {
1237 struct hvm_domain *hd = &v->domain->arch.hvm_domain;
1239 spin_lock(&hd->pbuf_lock);
1240 hd->pbuf[hd->pbuf_idx++] = c;
1241 if ( (hd->pbuf_idx == (sizeof(hd->pbuf) - 2)) || (c == '\n') )
1242 {
1243 if ( c != '\n' )
1244 hd->pbuf[hd->pbuf_idx++] = '\n';
1245 hd->pbuf[hd->pbuf_idx] = '\0';
1246 printk(XENLOG_G_DEBUG "HVM%u: %s", v->domain->domain_id, hd->pbuf);
1247 hd->pbuf_idx = 0;
1248 }
1249 spin_unlock(&hd->pbuf_lock);
1250 }
1252 void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
1253 unsigned int *ecx, unsigned int *edx)
1254 {
1255 if ( !cpuid_hypervisor_leaves(input, eax, ebx, ecx, edx) )
1256 {
1257 cpuid(input, eax, ebx, ecx, edx);
1259 if ( input == 0x00000001 )
1260 {
1261 struct vcpu *v = current;
1263 clear_bit(X86_FEATURE_MWAIT & 31, ecx);
1265 if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
1266 clear_bit(X86_FEATURE_APIC & 31, edx);
1268 #if CONFIG_PAGING_LEVELS >= 3
1269 if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
1270 #endif
1271 clear_bit(X86_FEATURE_PAE & 31, edx);
1272 clear_bit(X86_FEATURE_PSE36 & 31, edx);
1273 }
1274 else if ( input == 0x80000001 )
1275 {
1276 #if CONFIG_PAGING_LEVELS >= 3
1277 struct vcpu *v = current;
1278 if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
1279 #endif
1280 clear_bit(X86_FEATURE_NX & 31, edx);
1281 #ifdef __i386__
1282 /* Mask feature for Intel ia32e or AMD long mode. */
1283 clear_bit(X86_FEATURE_LAHF_LM & 31, ecx);
1285 clear_bit(X86_FEATURE_LM & 31, edx);
1286 clear_bit(X86_FEATURE_SYSCALL & 31, edx);
1287 #endif
1288 }
1289 }
1290 }
1292 static long hvm_grant_table_op(
1293 unsigned int cmd, XEN_GUEST_HANDLE(void) uop, unsigned int count)
1294 {
1295 if ( cmd != GNTTABOP_query_size )
1296 return -ENOSYS; /* all other commands need auditing */
1297 return do_grant_table_op(cmd, uop, count);
1298 }
1300 typedef unsigned long hvm_hypercall_t(
1301 unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
1303 #define HYPERCALL(x) \
1304 [ __HYPERVISOR_ ## x ] = (hvm_hypercall_t *) do_ ## x
1306 #if defined(__i386__)
1308 static hvm_hypercall_t *hvm_hypercall32_table[NR_hypercalls] = {
1309 HYPERCALL(memory_op),
1310 [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
1311 HYPERCALL(xen_version),
1312 HYPERCALL(grant_table_op),
1313 HYPERCALL(event_channel_op),
1314 HYPERCALL(sched_op),
1315 HYPERCALL(hvm_op)
1316 };
1318 #else /* defined(__x86_64__) */
1320 static long do_memory_op_compat32(int cmd, XEN_GUEST_HANDLE(void) arg)
1321 {
1322 extern long do_add_to_physmap(struct xen_add_to_physmap *xatp);
1323 long rc;
1325 switch ( cmd )
1326 {
1327 case XENMEM_add_to_physmap:
1328 {
1329 struct {
1330 domid_t domid;
1331 uint32_t space;
1332 uint32_t idx;
1333 uint32_t gpfn;
1334 } u;
1335 struct xen_add_to_physmap h;
1337 if ( copy_from_guest(&u, arg, 1) )
1338 return -EFAULT;
1340 h.domid = u.domid;
1341 h.space = u.space;
1342 h.idx = u.idx;
1343 h.gpfn = u.gpfn;
1345 this_cpu(guest_handles_in_xen_space) = 1;
1346 rc = do_memory_op(cmd, guest_handle_from_ptr(&h, void));
1347 this_cpu(guest_handles_in_xen_space) = 0;
1349 break;
1350 }
1352 default:
1353 gdprintk(XENLOG_WARNING, "memory_op %d.\n", cmd);
1354 rc = -ENOSYS;
1355 break;
1356 }
1358 return rc;
1359 }
1361 static hvm_hypercall_t *hvm_hypercall64_table[NR_hypercalls] = {
1362 HYPERCALL(memory_op),
1363 [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
1364 HYPERCALL(xen_version),
1365 HYPERCALL(grant_table_op),
1366 HYPERCALL(event_channel_op),
1367 HYPERCALL(sched_op),
1368 HYPERCALL(hvm_op)
1369 };
1371 static hvm_hypercall_t *hvm_hypercall32_table[NR_hypercalls] = {
1372 [ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)do_memory_op_compat32,
1373 [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
1374 HYPERCALL(xen_version),
1375 HYPERCALL(grant_table_op),
1376 HYPERCALL(event_channel_op),
1377 HYPERCALL(sched_op),
1378 HYPERCALL(hvm_op)
1379 };
1381 #endif /* defined(__x86_64__) */
1383 int hvm_do_hypercall(struct cpu_user_regs *regs)
1384 {
1385 int flush, mode = hvm_guest_x86_mode(current);
1386 uint32_t eax = regs->eax;
1388 switch ( mode )
1389 {
1390 #ifdef __x86_64__
1391 case 8:
1392 #endif
1393 case 4:
1394 case 2:
1395 if ( unlikely(ring_3(regs)) )
1396 {
1397 default:
1398 regs->eax = -EPERM;
1399 return HVM_HCALL_completed;
1400 }
1401 case 0:
1402 break;
1403 }
1405 if ( (eax >= NR_hypercalls) || !hvm_hypercall32_table[eax] )
1406 {
1407 regs->eax = -ENOSYS;
1408 return HVM_HCALL_completed;
1409 }
1411 /*
1412 * NB. In future flush only on decrease_reservation.
1413 * For now we also need to flush when pages are added, as qemu-dm is not
1414 * yet capable of faulting pages into an existing valid mapcache bucket.
1415 */
1416 flush = ((eax == __HYPERVISOR_memory_op) ||
1417 (eax == __HYPERVISOR_grant_table_op)); /* needed ? */
1418 this_cpu(hc_preempted) = 0;
1420 #ifdef __x86_64__
1421 if ( mode == 8 )
1422 {
1423 HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u(%lx, %lx, %lx, %lx, %lx)", eax,
1424 regs->rdi, regs->rsi, regs->rdx, regs->r10, regs->r8);
1426 regs->rax = hvm_hypercall64_table[eax](regs->rdi,
1427 regs->rsi,
1428 regs->rdx,
1429 regs->r10,
1430 regs->r8);
1431 }
1432 else
1433 #endif
1434 {
1435 HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u(%x, %x, %x, %x, %x)", eax,
1436 (uint32_t)regs->ebx, (uint32_t)regs->ecx,
1437 (uint32_t)regs->edx, (uint32_t)regs->esi,
1438 (uint32_t)regs->edi);
1440 regs->eax = hvm_hypercall32_table[eax]((uint32_t)regs->ebx,
1441 (uint32_t)regs->ecx,
1442 (uint32_t)regs->edx,
1443 (uint32_t)regs->esi,
1444 (uint32_t)regs->edi);
1445 }
1447 HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u -> %lx",
1448 eax, (unsigned long)regs->eax);
1450 return (this_cpu(hc_preempted) ? HVM_HCALL_preempted :
1451 flush ? HVM_HCALL_invalidate : HVM_HCALL_completed);
1452 }
1454 static void hvm_latch_shinfo_size(struct domain *d)
1455 {
1456 /*
1457 * Called from operations which are among the very first executed by
1458 * PV drivers on initialisation or after save/restore. These are sensible
1459 * points at which to sample the execution mode of the guest and latch
1460 * 32- or 64-bit format for shared state.
1461 */
1462 if ( current->domain == d )
1463 d->arch.has_32bit_shinfo = (hvm_guest_x86_mode(current) != 8);
1464 }
1466 /* Initialise a hypercall transfer page for a VMX domain using
1467 paravirtualised drivers. */
1468 void hvm_hypercall_page_initialise(struct domain *d,
1469 void *hypercall_page)
1470 {
1471 hvm_latch_shinfo_size(d);
1472 hvm_funcs.init_hypercall_page(d, hypercall_page);
1473 }
1476 /*
1477 * only called in HVM domain BSP context
1478 * when booting, vcpuid is always equal to apic_id
1479 */
1480 int hvm_bringup_ap(int vcpuid, int trampoline_vector)
1481 {
1482 struct vcpu *v;
1483 struct domain *d = current->domain;
1484 struct vcpu_guest_context *ctxt;
1485 int rc = 0;
1487 BUG_ON(!is_hvm_domain(d));
1489 if ( (v = d->vcpu[vcpuid]) == NULL )
1490 return -ENOENT;
1492 if ( (ctxt = xmalloc(struct vcpu_guest_context)) == NULL )
1493 {
1494 gdprintk(XENLOG_ERR,
1495 "Failed to allocate memory in hvm_bringup_ap.\n");
1496 return -ENOMEM;
1497 }
1499 hvm_init_ap_context(ctxt, vcpuid, trampoline_vector);
1501 /* Sync AP's TSC with BSP's. */
1502 v->arch.hvm_vcpu.cache_tsc_offset =
1503 v->domain->vcpu[0]->arch.hvm_vcpu.cache_tsc_offset;
1504 hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset);
1506 LOCK_BIGLOCK(d);
1507 rc = -EEXIST;
1508 if ( !v->is_initialised )
1509 rc = boot_vcpu(d, vcpuid, ctxt);
1510 UNLOCK_BIGLOCK(d);
1512 if ( rc != 0 )
1513 {
1514 gdprintk(XENLOG_ERR,
1515 "AP %d bringup failed in boot_vcpu %x.\n", vcpuid, rc);
1516 goto out;
1517 }
1519 if ( test_and_clear_bit(_VPF_down, &v->pause_flags) )
1520 vcpu_wake(v);
1521 gdprintk(XENLOG_INFO, "AP %d bringup succeeded.\n", vcpuid);
1523 out:
1524 xfree(ctxt);
1525 return rc;
1526 }
1528 static int hvmop_set_pci_intx_level(
1529 XEN_GUEST_HANDLE(xen_hvm_set_pci_intx_level_t) uop)
1530 {
1531 struct xen_hvm_set_pci_intx_level op;
1532 struct domain *d;
1533 int rc;
1535 if ( copy_from_guest(&op, uop, 1) )
1536 return -EFAULT;
1538 if ( !IS_PRIV(current->domain) )
1539 return -EPERM;
1541 if ( (op.domain > 0) || (op.bus > 0) || (op.device > 31) || (op.intx > 3) )
1542 return -EINVAL;
1544 d = rcu_lock_domain_by_id(op.domid);
1545 if ( d == NULL )
1546 return -ESRCH;
1548 rc = -EINVAL;
1549 if ( !is_hvm_domain(d) )
1550 goto out;
1552 rc = xsm_hvm_set_pci_intx_level(d);
1553 if ( rc )
1554 goto out;
1556 rc = 0;
1557 switch ( op.level )
1558 {
1559 case 0:
1560 hvm_pci_intx_deassert(d, op.device, op.intx);
1561 break;
1562 case 1:
1563 hvm_pci_intx_assert(d, op.device, op.intx);
1564 break;
1565 default:
1566 rc = -EINVAL;
1567 break;
1568 }
1570 out:
1571 rcu_unlock_domain(d);
1572 return rc;
1573 }
1575 static int hvmop_set_isa_irq_level(
1576 XEN_GUEST_HANDLE(xen_hvm_set_isa_irq_level_t) uop)
1577 {
1578 struct xen_hvm_set_isa_irq_level op;
1579 struct domain *d;
1580 int rc;
1582 if ( copy_from_guest(&op, uop, 1) )
1583 return -EFAULT;
1585 if ( !IS_PRIV(current->domain) )
1586 return -EPERM;
1588 if ( op.isa_irq > 15 )
1589 return -EINVAL;
1591 d = rcu_lock_domain_by_id(op.domid);
1592 if ( d == NULL )
1593 return -ESRCH;
1595 rc = -EINVAL;
1596 if ( !is_hvm_domain(d) )
1597 goto out;
1599 rc = xsm_hvm_set_isa_irq_level(d);
1600 if ( rc )
1601 goto out;
1603 rc = 0;
1604 switch ( op.level )
1605 {
1606 case 0:
1607 hvm_isa_irq_deassert(d, op.isa_irq);
1608 break;
1609 case 1:
1610 hvm_isa_irq_assert(d, op.isa_irq);
1611 break;
1612 default:
1613 rc = -EINVAL;
1614 break;
1615 }
1617 out:
1618 rcu_unlock_domain(d);
1619 return rc;
1620 }
1622 static int hvmop_set_pci_link_route(
1623 XEN_GUEST_HANDLE(xen_hvm_set_pci_link_route_t) uop)
1624 {
1625 struct xen_hvm_set_pci_link_route op;
1626 struct domain *d;
1627 int rc;
1629 if ( copy_from_guest(&op, uop, 1) )
1630 return -EFAULT;
1632 if ( !IS_PRIV(current->domain) )
1633 return -EPERM;
1635 if ( (op.link > 3) || (op.isa_irq > 15) )
1636 return -EINVAL;
1638 d = rcu_lock_domain_by_id(op.domid);
1639 if ( d == NULL )
1640 return -ESRCH;
1642 rc = -EINVAL;
1643 if ( !is_hvm_domain(d) )
1644 goto out;
1646 rc = xsm_hvm_set_pci_link_route(d);
1647 if ( rc )
1648 goto out;
1650 rc = 0;
1651 hvm_set_pci_link_route(d, op.link, op.isa_irq);
1653 out:
1654 rcu_unlock_domain(d);
1655 return rc;
1656 }
1658 static int hvmop_flush_tlb_all(void)
1659 {
1660 flush_tlb_mask(current->domain->domain_dirty_cpumask);
1661 return 0;
1662 }
1664 long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE(void) arg)
1666 {
1667 long rc = 0;
1669 switch ( op )
1670 {
1671 case HVMOP_set_param:
1672 case HVMOP_get_param:
1673 {
1674 struct xen_hvm_param a;
1675 struct hvm_ioreq_page *iorp;
1676 struct domain *d;
1677 struct vcpu *v;
1679 if ( copy_from_guest(&a, arg, 1) )
1680 return -EFAULT;
1682 if ( a.index >= HVM_NR_PARAMS )
1683 return -EINVAL;
1685 if ( a.domid == DOMID_SELF )
1686 d = rcu_lock_current_domain();
1687 else if ( IS_PRIV(current->domain) )
1688 d = rcu_lock_domain_by_id(a.domid);
1689 else
1690 return -EPERM;
1692 if ( d == NULL )
1693 return -ESRCH;
1695 rc = -EINVAL;
1696 if ( !is_hvm_domain(d) )
1697 goto param_fail;
1699 rc = xsm_hvm_param(d, op);
1700 if ( rc )
1701 goto param_fail;
1703 if ( op == HVMOP_set_param )
1704 {
1705 switch ( a.index )
1706 {
1707 case HVM_PARAM_IOREQ_PFN:
1708 iorp = &d->arch.hvm_domain.ioreq;
1709 rc = hvm_set_ioreq_page(d, iorp, a.value);
1710 spin_lock(&iorp->lock);
1711 if ( (rc == 0) && (iorp->va != NULL) )
1712 /* Initialise evtchn port info if VCPUs already created. */
1713 for_each_vcpu ( d, v )
1714 get_ioreq(v)->vp_eport = v->arch.hvm_vcpu.xen_port;
1715 spin_unlock(&iorp->lock);
1716 break;
1717 case HVM_PARAM_BUFIOREQ_PFN:
1718 iorp = &d->arch.hvm_domain.buf_ioreq;
1719 rc = hvm_set_ioreq_page(d, iorp, a.value);
1720 break;
1721 case HVM_PARAM_CALLBACK_IRQ:
1722 hvm_set_callback_via(d, a.value);
1723 hvm_latch_shinfo_size(d);
1724 break;
1725 }
1726 d->arch.hvm_domain.params[a.index] = a.value;
1727 rc = 0;
1728 }
1729 else
1730 {
1731 a.value = d->arch.hvm_domain.params[a.index];
1732 rc = copy_to_guest(arg, &a, 1) ? -EFAULT : 0;
1733 }
1735 HVM_DBG_LOG(DBG_LEVEL_HCALL, "%s param %u = %"PRIx64,
1736 op == HVMOP_set_param ? "set" : "get",
1737 a.index, a.value);
1739 param_fail:
1740 rcu_unlock_domain(d);
1741 break;
1742 }
1744 case HVMOP_set_pci_intx_level:
1745 rc = hvmop_set_pci_intx_level(
1746 guest_handle_cast(arg, xen_hvm_set_pci_intx_level_t));
1747 break;
1749 case HVMOP_set_isa_irq_level:
1750 rc = hvmop_set_isa_irq_level(
1751 guest_handle_cast(arg, xen_hvm_set_isa_irq_level_t));
1752 break;
1754 case HVMOP_set_pci_link_route:
1755 rc = hvmop_set_pci_link_route(
1756 guest_handle_cast(arg, xen_hvm_set_pci_link_route_t));
1757 break;
1759 case HVMOP_flush_tlbs:
1760 rc = guest_handle_is_null(arg) ? hvmop_flush_tlb_all() : -ENOSYS;
1761 break;
1763 default:
1764 {
1765 gdprintk(XENLOG_WARNING, "Bad HVM op %ld.\n", op);
1766 rc = -ENOSYS;
1767 break;
1768 }
1769 }
1771 return rc;
1772 }
1774 /*
1775 * Local variables:
1776 * mode: C
1777 * c-set-style: "BSD"
1778 * c-basic-offset: 4
1779 * tab-width: 4
1780 * indent-tabs-mode: nil
1781 * End:
1782 */
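As a closing usage note on the HVMOP_set_param / HVMOP_get_param path implemented by do_hvm_op() above: this is how the toolstack and device model hand Xen values such as the ioreq page frame. The sketch below is editorial and not part of hvm.c; it assumes the libxc wrappers xc_set_hvm_param()/xc_get_hvm_param() of this era and an already opened control-interface handle, both of which live outside this file.

/* Editorial sketch: set the ioreq page for an HVM domain and read it back.
 * Assumes the era's libxc wrappers (xc_set_hvm_param/xc_get_hvm_param) and
 * an open handle; error handling is reduced to the minimum. */
#include <xenctrl.h>
#include <xen/hvm/params.h>

static int set_and_check_ioreq_pfn(int xc_handle, uint32_t domid,
                                   unsigned long ioreq_pfn)
{
    unsigned long readback = 0;

    /* Reaches hvm_set_ioreq_page() via the HVMOP_set_param case above. */
    if ( xc_set_hvm_param(xc_handle, domid, HVM_PARAM_IOREQ_PFN, ioreq_pfn) )
        return -1;

    if ( xc_get_hvm_param(xc_handle, domid, HVM_PARAM_IOREQ_PFN, &readback) )
        return -1;

    return (readback == ioreq_pfn) ? 0 : -1;
}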