ia64/xen-unstable

view xen/arch/x86/hvm/hvm.c @ 15812:86a154e1ef5d

[HVM] Shadow: don't shadow the p2m table.
For HVM vcpus with paging disabled, we used to shadow the p2m table,
and skip the p2m lookup to go from gfn to mfn. Instead, we now
provide a simple pagetable that gives a one-to-one mapping of 4GB, and
shadow that, making the translations from gfn to mfn via the p2m.
This removes the paging-disabled special-case code from the shadow
fault handler, and allows us to expand the p2m interface, since all HVM
translations now go through the same p2m lookups.
Signed-off-by: Tim Deegan <Tim.Deegan@xensource.com>
author Tim Deegan <Tim.Deegan@xensource.com>
date Fri Aug 31 11:06:22 2007 +0100 (2007-08-31)
parents c398dad9d50a
children 96f64f4c42f0
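
The description above can be made concrete with a short, self-contained C sketch (an editor's illustration only, not code from this changeset): it models the "one-to-one mapping of 4GB" as PAE-style page directories whose 2MB superpage entries map each guest-physical address to itself. Every identifier in it (identity_pd, build_identity_4gb, translate) is hypothetical; in Xen the equivalent table is built inside the shadow/monitor pagetable machinery, and the real gfn-to-mfn translation still goes through the p2m, exactly as the changeset notes.

/* Illustrative sketch, not part of hvm.c: a 1:1 mapping of the low 4GB
 * expressed as 4 page-directory pages of 512 entries, each entry a 2MB
 * superpage whose frame equals its own guest-physical address. */
#include <stdint.h>
#include <stdio.h>

#define ENTRIES_PER_PD   512
#define NR_PDS           4            /* 4 * 512 * 2MB = 4GB */
#define SUPERPAGE_SHIFT  21           /* 2MB pages */
#define PTE_PRESENT      (1ULL << 0)
#define PTE_RW           (1ULL << 1)
#define PTE_PSE          (1ULL << 7)  /* superpage bit */

static uint64_t identity_pd[NR_PDS][ENTRIES_PER_PD];

static void build_identity_4gb(void)
{
    for (int pd = 0; pd < NR_PDS; pd++)
        for (int i = 0; i < ENTRIES_PER_PD; i++) {
            /* Frame address equals the guest-physical address it maps. */
            uint64_t gpa = ((uint64_t)pd * ENTRIES_PER_PD + i) << SUPERPAGE_SHIFT;
            identity_pd[pd][i] = gpa | PTE_PRESENT | PTE_RW | PTE_PSE;
        }
}

/* Walk the toy table: assumes gpa < 4GB; returns the same address (1:1). */
static uint64_t translate(uint64_t gpa)
{
    unsigned int idx = gpa >> SUPERPAGE_SHIFT;
    uint64_t pde = identity_pd[idx / ENTRIES_PER_PD][idx % ENTRIES_PER_PD];
    return (pde & ~0x1FFFFFULL) | (gpa & 0x1FFFFFULL);
}

int main(void)
{
    build_identity_4gb();
    printf("0x00100000 -> 0x%08llx\n", (unsigned long long)translate(0x00100000));
    printf("0xfee00000 -> 0x%08llx\n", (unsigned long long)translate(0xfee00000));
    return 0;
}

Compiled standalone, the two sample lookups print their own addresses back, which is all an identity table guarantees; the interesting work (gfn to mfn) happens afterwards in the p2m lookup.
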
line source
1 /*
2 * hvm.c: Common hardware virtual machine abstractions.
3 *
4 * Copyright (c) 2004, Intel Corporation.
5 * Copyright (c) 2005, International Business Machines Corporation.
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms and conditions of the GNU General Public License,
9 * version 2, as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 *
16 * You should have received a copy of the GNU General Public License along with
17 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
18 * Place - Suite 330, Boston, MA 02111-1307 USA.
19 */
21 #include <xen/config.h>
22 #include <xen/init.h>
23 #include <xen/lib.h>
24 #include <xen/trace.h>
25 #include <xen/sched.h>
26 #include <xen/irq.h>
27 #include <xen/softirq.h>
28 #include <xen/domain.h>
29 #include <xen/domain_page.h>
30 #include <xen/hypercall.h>
31 #include <xen/guest_access.h>
32 #include <xen/event.h>
33 #include <asm/current.h>
34 #include <asm/e820.h>
35 #include <asm/io.h>
36 #include <asm/paging.h>
37 #include <asm/regs.h>
38 #include <asm/cpufeature.h>
39 #include <asm/processor.h>
40 #include <asm/types.h>
41 #include <asm/msr.h>
42 #include <asm/mc146818rtc.h>
43 #include <asm/spinlock.h>
44 #include <asm/hvm/hvm.h>
45 #include <asm/hvm/vpt.h>
46 #include <asm/hvm/support.h>
47 #include <public/sched.h>
48 #include <public/hvm/ioreq.h>
49 #include <public/version.h>
50 #include <public/memory.h>
52 int hvm_enabled __read_mostly;
54 unsigned int opt_hvm_debug_level __read_mostly;
55 integer_param("hvm_debug", opt_hvm_debug_level);
57 struct hvm_function_table hvm_funcs __read_mostly;
59 /* I/O permission bitmap is globally shared by all HVM guests. */
60 char __attribute__ ((__section__ (".bss.page_aligned")))
61 hvm_io_bitmap[3*PAGE_SIZE];
63 void hvm_enable(struct hvm_function_table *fns)
64 {
65 BUG_ON(hvm_enabled);
66 printk("HVM: %s enabled\n", fns->name);
68 /*
69 * Allow direct access to the PC debug port (it is often used for I/O
70 * delays, but the vmexits simply slow things down).
71 */
72 memset(hvm_io_bitmap, ~0, sizeof(hvm_io_bitmap));
73 clear_bit(0x80, hvm_io_bitmap);
75 hvm_funcs = *fns;
76 hvm_enabled = 1;
77 }
79 void hvm_set_guest_time(struct vcpu *v, u64 gtime)
80 {
81 u64 host_tsc;
83 rdtscll(host_tsc);
85 v->arch.hvm_vcpu.cache_tsc_offset = gtime - host_tsc;
86 hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset);
87 }
89 u64 hvm_get_guest_time(struct vcpu *v)
90 {
91 u64 host_tsc;
93 rdtscll(host_tsc);
94 return host_tsc + v->arch.hvm_vcpu.cache_tsc_offset;
95 }
97 void hvm_migrate_timers(struct vcpu *v)
98 {
99 rtc_migrate_timers(v);
100 hpet_migrate_timers(v);
101 pt_migrate(v);
102 }
104 void hvm_do_resume(struct vcpu *v)
105 {
106 ioreq_t *p;
108 if ( !v->fpu_dirtied )
109 hvm_funcs.stts(v);
111 pt_thaw_time(v);
113 /* NB. Optimised for common case (p->state == STATE_IOREQ_NONE). */
114 p = &get_ioreq(v)->vp_ioreq;
115 while ( p->state != STATE_IOREQ_NONE )
116 {
117 switch ( p->state )
118 {
119 case STATE_IORESP_READY: /* IORESP_READY -> NONE */
120 hvm_io_assist();
121 break;
122 case STATE_IOREQ_READY: /* IOREQ_{READY,INPROCESS} -> IORESP_READY */
123 case STATE_IOREQ_INPROCESS:
124 wait_on_xen_event_channel(v->arch.hvm_vcpu.xen_port,
125 (p->state != STATE_IOREQ_READY) &&
126 (p->state != STATE_IOREQ_INPROCESS));
127 break;
128 default:
129 gdprintk(XENLOG_ERR, "Weird HVM iorequest state %d.\n", p->state);
130 domain_crash_synchronous();
131 }
132 }
133 }
135 static void hvm_init_ioreq_page(
136 struct domain *d, struct hvm_ioreq_page *iorp)
137 {
138 memset(iorp, 0, sizeof(*iorp));
139 spin_lock_init(&iorp->lock);
140 domain_pause(d);
141 }
143 static void hvm_destroy_ioreq_page(
144 struct domain *d, struct hvm_ioreq_page *iorp)
145 {
146 spin_lock(&iorp->lock);
148 ASSERT(d->is_dying);
150 if ( iorp->va != NULL )
151 {
152 unmap_domain_page_global(iorp->va);
153 put_page_and_type(iorp->page);
154 iorp->va = NULL;
155 }
157 spin_unlock(&iorp->lock);
158 }
160 static int hvm_set_ioreq_page(
161 struct domain *d, struct hvm_ioreq_page *iorp, unsigned long gmfn)
162 {
163 struct page_info *page;
164 unsigned long mfn;
165 void *va;
167 mfn = gmfn_to_mfn(d, gmfn);
168 if ( !mfn_valid(mfn) )
169 return -EINVAL;
171 page = mfn_to_page(mfn);
172 if ( !get_page_and_type(page, d, PGT_writable_page) )
173 return -EINVAL;
175 va = map_domain_page_global(mfn);
176 if ( va == NULL )
177 {
178 put_page_and_type(page);
179 return -ENOMEM;
180 }
182 spin_lock(&iorp->lock);
184 if ( (iorp->va != NULL) || d->is_dying )
185 {
186 spin_unlock(&iorp->lock);
187 unmap_domain_page_global(va);
188 put_page_and_type(mfn_to_page(mfn));
189 return -EINVAL;
190 }
192 iorp->va = va;
193 iorp->page = page;
195 spin_unlock(&iorp->lock);
197 domain_unpause(d);
199 return 0;
200 }
202 int hvm_domain_initialise(struct domain *d)
203 {
204 int rc;
206 if ( !hvm_enabled )
207 {
208 gdprintk(XENLOG_WARNING, "Attempt to create a HVM guest "
209 "on a non-VT/AMDV platform.\n");
210 return -EINVAL;
211 }
213 spin_lock_init(&d->arch.hvm_domain.pbuf_lock);
214 spin_lock_init(&d->arch.hvm_domain.irq_lock);
216 rc = paging_enable(d, PG_refcounts|PG_translate|PG_external);
217 if ( rc != 0 )
218 return rc;
220 vpic_init(d);
221 vioapic_init(d);
223 hvm_init_ioreq_page(d, &d->arch.hvm_domain.ioreq);
224 hvm_init_ioreq_page(d, &d->arch.hvm_domain.buf_ioreq);
226 return hvm_funcs.domain_initialise(d);
227 }
229 void hvm_domain_relinquish_resources(struct domain *d)
230 {
231 hvm_destroy_ioreq_page(d, &d->arch.hvm_domain.ioreq);
232 hvm_destroy_ioreq_page(d, &d->arch.hvm_domain.buf_ioreq);
234 pit_deinit(d);
235 rtc_deinit(d);
236 pmtimer_deinit(d);
237 hpet_deinit(d);
238 }
240 void hvm_domain_destroy(struct domain *d)
241 {
242 hvm_funcs.domain_destroy(d);
243 }
245 static int hvm_save_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
246 {
247 struct vcpu *v;
248 struct hvm_hw_cpu ctxt;
249 struct vcpu_guest_context *vc;
251 for_each_vcpu(d, v)
252 {
253 /* We don't need to save state for a vcpu that is down; the restore
254 * code will leave it down if there is nothing saved. */
255 if ( test_bit(_VPF_down, &v->pause_flags) )
256 continue;
258 /* Architecture-specific vmcs/vmcb bits */
259 hvm_funcs.save_cpu_ctxt(v, &ctxt);
261 /* Other vcpu register state */
262 vc = &v->arch.guest_context;
263 if ( v->fpu_initialised )
264 memcpy(ctxt.fpu_regs, &vc->fpu_ctxt, sizeof(ctxt.fpu_regs));
265 else
266 memset(ctxt.fpu_regs, 0, sizeof(ctxt.fpu_regs));
267 ctxt.rax = vc->user_regs.eax;
268 ctxt.rbx = vc->user_regs.ebx;
269 ctxt.rcx = vc->user_regs.ecx;
270 ctxt.rdx = vc->user_regs.edx;
271 ctxt.rbp = vc->user_regs.ebp;
272 ctxt.rsi = vc->user_regs.esi;
273 ctxt.rdi = vc->user_regs.edi;
274 /* %rsp handled by arch-specific call above */
275 #ifdef __x86_64__
276 ctxt.r8 = vc->user_regs.r8;
277 ctxt.r9 = vc->user_regs.r9;
278 ctxt.r10 = vc->user_regs.r10;
279 ctxt.r11 = vc->user_regs.r11;
280 ctxt.r12 = vc->user_regs.r12;
281 ctxt.r13 = vc->user_regs.r13;
282 ctxt.r14 = vc->user_regs.r14;
283 ctxt.r15 = vc->user_regs.r15;
284 #endif
285 ctxt.dr0 = vc->debugreg[0];
286 ctxt.dr1 = vc->debugreg[1];
287 ctxt.dr2 = vc->debugreg[2];
288 ctxt.dr3 = vc->debugreg[3];
289 ctxt.dr6 = vc->debugreg[6];
290 ctxt.dr7 = vc->debugreg[7];
292 if ( hvm_save_entry(CPU, v->vcpu_id, h, &ctxt) != 0 )
293 return 1;
294 }
295 return 0;
296 }
298 static int hvm_load_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
299 {
300 int vcpuid, rc;
301 struct vcpu *v;
302 struct hvm_hw_cpu ctxt;
303 struct vcpu_guest_context *vc;
305 /* Which vcpu is this? */
306 vcpuid = hvm_load_instance(h);
307 if ( vcpuid > MAX_VIRT_CPUS || (v = d->vcpu[vcpuid]) == NULL )
308 {
309 gdprintk(XENLOG_ERR, "HVM restore: domain has no vcpu %u\n", vcpuid);
310 return -EINVAL;
311 }
312 vc = &v->arch.guest_context;
314 /* Need to init this vcpu before loading its contents */
315 LOCK_BIGLOCK(d);
316 if ( !v->is_initialised )
317 if ( (rc = boot_vcpu(d, vcpuid, vc)) != 0 )
318 return rc;
319 UNLOCK_BIGLOCK(d);
321 if ( hvm_load_entry(CPU, h, &ctxt) != 0 )
322 return -EINVAL;
324 /* Architecture-specific vmcs/vmcb bits */
325 if ( hvm_funcs.load_cpu_ctxt(v, &ctxt) < 0 )
326 return -EINVAL;
328 /* Other vcpu register state */
329 memcpy(&vc->fpu_ctxt, ctxt.fpu_regs, sizeof(ctxt.fpu_regs));
330 vc->user_regs.eax = ctxt.rax;
331 vc->user_regs.ebx = ctxt.rbx;
332 vc->user_regs.ecx = ctxt.rcx;
333 vc->user_regs.edx = ctxt.rdx;
334 vc->user_regs.ebp = ctxt.rbp;
335 vc->user_regs.esi = ctxt.rsi;
336 vc->user_regs.edi = ctxt.rdi;
337 vc->user_regs.esp = ctxt.rsp;
338 #ifdef __x86_64__
339 vc->user_regs.r8 = ctxt.r8;
340 vc->user_regs.r9 = ctxt.r9;
341 vc->user_regs.r10 = ctxt.r10;
342 vc->user_regs.r11 = ctxt.r11;
343 vc->user_regs.r12 = ctxt.r12;
344 vc->user_regs.r13 = ctxt.r13;
345 vc->user_regs.r14 = ctxt.r14;
346 vc->user_regs.r15 = ctxt.r15;
347 #endif
348 vc->debugreg[0] = ctxt.dr0;
349 vc->debugreg[1] = ctxt.dr1;
350 vc->debugreg[2] = ctxt.dr2;
351 vc->debugreg[3] = ctxt.dr3;
352 vc->debugreg[6] = ctxt.dr6;
353 vc->debugreg[7] = ctxt.dr7;
355 vc->flags = VGCF_online;
356 v->fpu_initialised = 1;
358 /* Auxiliary processors should be woken immediately. */
359 if ( test_and_clear_bit(_VPF_down, &v->pause_flags) )
360 vcpu_wake(v);
362 return 0;
363 }
365 HVM_REGISTER_SAVE_RESTORE(CPU, hvm_save_cpu_ctxt, hvm_load_cpu_ctxt,
366 1, HVMSR_PER_VCPU);
368 int hvm_vcpu_initialise(struct vcpu *v)
369 {
370 int rc;
372 if ( (rc = vlapic_init(v)) != 0 )
373 return rc;
375 if ( (rc = hvm_funcs.vcpu_initialise(v)) != 0 )
376 {
377 vlapic_destroy(v);
378 return rc;
379 }
381 /* Create ioreq event channel. */
382 rc = alloc_unbound_xen_event_channel(v, 0);
383 if ( rc < 0 )
384 {
385 hvm_funcs.vcpu_destroy(v);
386 vlapic_destroy(v);
387 return rc;
388 }
390 /* Register ioreq event channel. */
391 v->arch.hvm_vcpu.xen_port = rc;
392 spin_lock(&v->domain->arch.hvm_domain.ioreq.lock);
393 if ( v->domain->arch.hvm_domain.ioreq.va != NULL )
394 get_ioreq(v)->vp_eport = v->arch.hvm_vcpu.xen_port;
395 spin_unlock(&v->domain->arch.hvm_domain.ioreq.lock);
397 spin_lock_init(&v->arch.hvm_vcpu.tm_lock);
398 INIT_LIST_HEAD(&v->arch.hvm_vcpu.tm_list);
400 if ( v->vcpu_id == 0 )
401 {
402 /* NB. All these really belong in hvm_domain_initialise(). */
403 pit_init(v, cpu_khz);
404 rtc_init(v, RTC_PORT(0));
405 pmtimer_init(v);
406 hpet_init(v);
408 /* Init guest TSC to start from zero. */
409 hvm_set_guest_time(v, 0);
410 }
412 return 0;
413 }
415 void hvm_vcpu_destroy(struct vcpu *v)
416 {
417 vlapic_destroy(v);
418 hvm_funcs.vcpu_destroy(v);
420 /* Event channel is already freed by evtchn_destroy(). */
421 /*free_xen_event_channel(v, v->arch.hvm_vcpu.xen_port);*/
422 }
425 void hvm_vcpu_reset(struct vcpu *v)
426 {
427 vcpu_pause(v);
429 vlapic_reset(vcpu_vlapic(v));
431 hvm_funcs.vcpu_initialise(v);
433 set_bit(_VPF_down, &v->pause_flags);
434 clear_bit(_VPF_blocked, &v->pause_flags);
435 v->fpu_initialised = 0;
436 v->fpu_dirtied = 0;
437 v->is_initialised = 0;
439 vcpu_unpause(v);
440 }
442 static void hvm_vcpu_down(void)
443 {
444 struct vcpu *v = current;
445 struct domain *d = v->domain;
446 int online_count = 0;
448 gdprintk(XENLOG_INFO, "DOM%d/VCPU%d: going offline.\n",
449 d->domain_id, v->vcpu_id);
451 /* Doesn't halt us immediately, but we'll never return to guest context. */
452 set_bit(_VPF_down, &v->pause_flags);
453 vcpu_sleep_nosync(v);
455 /* Any other VCPUs online? ... */
456 LOCK_BIGLOCK(d);
457 for_each_vcpu ( d, v )
458 if ( !test_bit(_VPF_down, &v->pause_flags) )
459 online_count++;
460 UNLOCK_BIGLOCK(d);
462 /* ... Shut down the domain if not. */
463 if ( online_count == 0 )
464 {
465 gdprintk(XENLOG_INFO, "DOM%d: all CPUs offline -- powering off.\n",
466 d->domain_id);
467 domain_shutdown(d, SHUTDOWN_poweroff);
468 }
469 }
471 void hvm_send_assist_req(struct vcpu *v)
472 {
473 ioreq_t *p;
475 if ( unlikely(!vcpu_start_shutdown_deferral(v)) )
476 return; /* implicitly bins the i/o operation */
478 p = &get_ioreq(v)->vp_ioreq;
479 if ( unlikely(p->state != STATE_IOREQ_NONE) )
480 {
481 /* This indicates a bug in the device model. Crash the domain. */
482 gdprintk(XENLOG_ERR, "Device model set bad IO state %d.\n", p->state);
483 domain_crash_synchronous();
484 }
486 prepare_wait_on_xen_event_channel(v->arch.hvm_vcpu.xen_port);
488 /*
489 * Following happens /after/ blocking and setting up ioreq contents.
490 * prepare_wait_on_xen_event_channel() is an implicit barrier.
491 */
492 p->state = STATE_IOREQ_READY;
493 notify_via_xen_event_channel(v->arch.hvm_vcpu.xen_port);
494 }
496 void hvm_hlt(unsigned long rflags)
497 {
498 /*
499 * If we halt with interrupts disabled, that's a pretty sure sign that we
500 * want to shut down. In a real processor, NMIs are the only way to break
501 * out of this.
502 */
503 if ( unlikely(!(rflags & X86_EFLAGS_IF)) )
504 return hvm_vcpu_down();
506 do_sched_op_compat(SCHEDOP_block, 0);
507 }
509 void hvm_triple_fault(void)
510 {
511 struct vcpu *v = current;
512 gdprintk(XENLOG_INFO, "Triple fault on VCPU%d - "
513 "invoking HVM system reset.\n", v->vcpu_id);
514 domain_shutdown(v->domain, SHUTDOWN_reboot);
515 }
517 int hvm_set_cr0(unsigned long value)
518 {
519 struct vcpu *v = current;
520 unsigned long mfn, old_value = v->arch.hvm_vcpu.guest_cr[0];
522 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx", value);
524 if ( (u32)value != value )
525 {
526 HVM_DBG_LOG(DBG_LEVEL_1,
527 "Guest attempts to set upper 32 bits in CR0: %lx",
528 value);
529 hvm_inject_exception(TRAP_gp_fault, 0, 0);
530 return 0;
531 }
533 value &= ~HVM_CR0_GUEST_RESERVED_BITS;
535 /* ET is reserved and should always be 1. */
536 value |= X86_CR0_ET;
538 if ( (value & (X86_CR0_PE|X86_CR0_PG)) == X86_CR0_PG )
539 {
540 hvm_inject_exception(TRAP_gp_fault, 0, 0);
541 return 0;
542 }
544 if ( (value & X86_CR0_PG) && !(old_value & X86_CR0_PG) )
545 {
546 if ( v->arch.hvm_vcpu.guest_efer & EFER_LME )
547 {
548 if ( !(v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PAE) )
549 {
550 HVM_DBG_LOG(DBG_LEVEL_1, "Enable paging before PAE enable");
551 hvm_inject_exception(TRAP_gp_fault, 0, 0);
552 return 0;
553 }
554 HVM_DBG_LOG(DBG_LEVEL_1, "Enabling long mode");
555 v->arch.hvm_vcpu.guest_efer |= EFER_LMA;
556 hvm_update_guest_efer(v);
557 }
559 if ( !paging_mode_hap(v->domain) )
560 {
561 /* The guest CR3 must be pointing to the guest physical. */
562 mfn = get_mfn_from_gpfn(v->arch.hvm_vcpu.guest_cr[3]>>PAGE_SHIFT);
563 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain))
564 {
565 gdprintk(XENLOG_ERR, "Invalid CR3 value = %lx (mfn=%lx)\n",
566 v->arch.hvm_vcpu.guest_cr[3], mfn);
567 domain_crash(v->domain);
568 return 0;
569 }
571 /* Now arch.guest_table points to machine physical. */
572 v->arch.guest_table = pagetable_from_pfn(mfn);
574 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
575 v->arch.hvm_vcpu.guest_cr[3], mfn);
576 }
577 }
578 else if ( !(value & X86_CR0_PG) && (old_value & X86_CR0_PG) )
579 {
580 /* When CR0.PG is cleared, LMA is cleared immediately. */
581 if ( hvm_long_mode_enabled(v) )
582 {
583 v->arch.hvm_vcpu.guest_efer &= ~EFER_LMA;
584 hvm_update_guest_efer(v);
585 }
587 if ( !paging_mode_hap(v->domain) )
588 {
589 put_page(mfn_to_page(get_mfn_from_gpfn(
590 v->arch.hvm_vcpu.guest_cr[3] >> PAGE_SHIFT)));
591 v->arch.guest_table = pagetable_null();
592 }
593 }
595 v->arch.hvm_vcpu.guest_cr[0] = value;
596 hvm_update_guest_cr(v, 0);
598 if ( (value ^ old_value) & X86_CR0_PG )
599 paging_update_paging_modes(v);
601 return 1;
602 }
604 int hvm_set_cr3(unsigned long value)
605 {
606 unsigned long old_base_mfn, mfn;
607 struct vcpu *v = current;
609 if ( paging_mode_hap(v->domain) || !hvm_paging_enabled(v) )
610 {
611 /* Nothing to do. */
612 }
613 else if ( value == v->arch.hvm_vcpu.guest_cr[3] )
614 {
615 /* Shadow-mode TLB flush. Invalidate the shadow. */
616 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
617 if ( mfn != pagetable_get_pfn(v->arch.guest_table) )
618 goto bad_cr3;
619 }
620 else
621 {
622 /* Shadow-mode CR3 change. Check PDBR and then make a new shadow. */
623 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
624 mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
625 if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
626 goto bad_cr3;
628 old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
629 v->arch.guest_table = pagetable_from_pfn(mfn);
631 if ( old_base_mfn )
632 put_page(mfn_to_page(old_base_mfn));
634 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
635 }
637 v->arch.hvm_vcpu.guest_cr[3] = value;
638 paging_update_cr3(v);
639 return 1;
641 bad_cr3:
642 gdprintk(XENLOG_ERR, "Invalid CR3\n");
643 domain_crash(v->domain);
644 return 0;
645 }
647 int hvm_set_cr4(unsigned long value)
648 {
649 struct vcpu *v = current;
650 unsigned long old_cr;
652 if ( value & HVM_CR4_GUEST_RESERVED_BITS )
653 {
654 HVM_DBG_LOG(DBG_LEVEL_1,
655 "Guest attempts to set reserved bit in CR4: %lx",
656 value);
657 goto gpf;
658 }
660 if ( !(value & X86_CR4_PAE) && hvm_long_mode_enabled(v) )
661 {
662 HVM_DBG_LOG(DBG_LEVEL_1, "Guest cleared CR4.PAE while "
663 "EFER.LMA is set");
664 goto gpf;
665 }
667 old_cr = v->arch.hvm_vcpu.guest_cr[4];
668 v->arch.hvm_vcpu.guest_cr[4] = value;
669 hvm_update_guest_cr(v, 4);
671 /* Modifying CR4.{PSE,PAE,PGE} invalidates all TLB entries, inc. Global. */
672 if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
673 paging_update_paging_modes(v);
675 return 1;
677 gpf:
678 hvm_inject_exception(TRAP_gp_fault, 0, 0);
679 return 0;
680 }
682 /*
683 * __hvm_copy():
684 * @buf = hypervisor buffer
685 * @addr = guest address to copy to/from
686 * @size = number of bytes to copy
687 * @dir = copy *to* guest (TRUE) or *from* guest (FALSE)?
688 * @virt = addr is *virtual* (TRUE) or *guest physical* (FALSE)?
689 * Returns number of bytes failed to copy (0 == complete success).
690 */
691 static int __hvm_copy(void *buf, paddr_t addr, int size, int dir, int virt)
692 {
693 unsigned long gfn, mfn;
694 char *p;
695 int count, todo;
697 todo = size;
698 while ( todo > 0 )
699 {
700 count = min_t(int, PAGE_SIZE - (addr & ~PAGE_MASK), todo);
702 if ( virt )
703 gfn = paging_gva_to_gfn(current, addr);
704 else
705 gfn = addr >> PAGE_SHIFT;
707 mfn = get_mfn_from_gpfn(gfn);
709 if ( mfn == INVALID_MFN )
710 return todo;
712 p = (char *)map_domain_page(mfn) + (addr & ~PAGE_MASK);
714 if ( dir )
715 {
716 memcpy(p, buf, count); /* dir == TRUE: *to* guest */
717 paging_mark_dirty(current->domain, mfn);
718 }
719 else
720 memcpy(buf, p, count); /* dir == FALSE: *from* guest */
722 unmap_domain_page(p);
724 addr += count;
725 buf += count;
726 todo -= count;
727 }
729 return 0;
730 }
732 int hvm_copy_to_guest_phys(paddr_t paddr, void *buf, int size)
733 {
734 return __hvm_copy(buf, paddr, size, 1, 0);
735 }
737 int hvm_copy_from_guest_phys(void *buf, paddr_t paddr, int size)
738 {
739 return __hvm_copy(buf, paddr, size, 0, 0);
740 }
742 int hvm_copy_to_guest_virt(unsigned long vaddr, void *buf, int size)
743 {
744 return __hvm_copy(buf, vaddr, size, 1, 1);
745 }
747 int hvm_copy_from_guest_virt(void *buf, unsigned long vaddr, int size)
748 {
749 return __hvm_copy(buf, vaddr, size, 0, 1);
750 }
753 /* HVM specific printbuf. Mostly used for hvmloader chit-chat. */
754 void hvm_print_line(struct vcpu *v, const char c)
755 {
756 struct hvm_domain *hd = &v->domain->arch.hvm_domain;
758 spin_lock(&hd->pbuf_lock);
759 hd->pbuf[hd->pbuf_idx++] = c;
760 if ( (hd->pbuf_idx == (sizeof(hd->pbuf) - 2)) || (c == '\n') )
761 {
762 if ( c != '\n' )
763 hd->pbuf[hd->pbuf_idx++] = '\n';
764 hd->pbuf[hd->pbuf_idx] = '\0';
765 printk(XENLOG_G_DEBUG "HVM%u: %s", v->domain->domain_id, hd->pbuf);
766 hd->pbuf_idx = 0;
767 }
768 spin_unlock(&hd->pbuf_lock);
769 }
771 void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
772 unsigned int *ecx, unsigned int *edx)
773 {
774 if ( !cpuid_hypervisor_leaves(input, eax, ebx, ecx, edx) )
775 {
776 cpuid(input, eax, ebx, ecx, edx);
778 if ( input == 0x00000001 )
779 {
780 struct vcpu *v = current;
782 clear_bit(X86_FEATURE_MWAIT & 31, ecx);
784 if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
785 clear_bit(X86_FEATURE_APIC & 31, edx);
787 #if CONFIG_PAGING_LEVELS >= 3
788 if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
789 #endif
790 clear_bit(X86_FEATURE_PAE & 31, edx);
791 clear_bit(X86_FEATURE_PSE36 & 31, edx);
792 }
793 else if ( input == 0x80000001 )
794 {
795 #if CONFIG_PAGING_LEVELS >= 3
796 struct vcpu *v = current;
797 if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
798 #endif
799 clear_bit(X86_FEATURE_NX & 31, edx);
800 #ifdef __i386__
801 /* Mask feature for Intel ia32e or AMD long mode. */
802 clear_bit(X86_FEATURE_LAHF_LM & 31, ecx);
804 clear_bit(X86_FEATURE_LM & 31, edx);
805 clear_bit(X86_FEATURE_SYSCALL & 31, edx);
806 #endif
807 }
808 }
809 }
811 static long hvm_grant_table_op(
812 unsigned int cmd, XEN_GUEST_HANDLE(void) uop, unsigned int count)
813 {
814 if ( cmd != GNTTABOP_query_size )
815 return -ENOSYS; /* all other commands need auditing */
816 return do_grant_table_op(cmd, uop, count);
817 }
819 typedef unsigned long hvm_hypercall_t(
820 unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
822 #define HYPERCALL(x) \
823 [ __HYPERVISOR_ ## x ] = (hvm_hypercall_t *) do_ ## x
825 #if defined(__i386__)
827 static hvm_hypercall_t *hvm_hypercall32_table[NR_hypercalls] = {
828 HYPERCALL(memory_op),
829 [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
830 HYPERCALL(xen_version),
831 HYPERCALL(grant_table_op),
832 HYPERCALL(event_channel_op),
833 HYPERCALL(sched_op),
834 HYPERCALL(hvm_op)
835 };
837 #else /* defined(__x86_64__) */
839 static long do_memory_op_compat32(int cmd, XEN_GUEST_HANDLE(void) arg)
840 {
841 extern long do_add_to_physmap(struct xen_add_to_physmap *xatp);
842 long rc;
844 switch ( cmd )
845 {
846 case XENMEM_add_to_physmap:
847 {
848 struct {
849 domid_t domid;
850 uint32_t space;
851 uint32_t idx;
852 uint32_t gpfn;
853 } u;
854 struct xen_add_to_physmap h;
856 if ( copy_from_guest(&u, arg, 1) )
857 return -EFAULT;
859 h.domid = u.domid;
860 h.space = u.space;
861 h.idx = u.idx;
862 h.gpfn = u.gpfn;
864 this_cpu(guest_handles_in_xen_space) = 1;
865 rc = do_memory_op(cmd, guest_handle_from_ptr(&h, void));
866 this_cpu(guest_handles_in_xen_space) = 0;
868 break;
869 }
871 default:
872 gdprintk(XENLOG_WARNING, "memory_op %d.\n", cmd);
873 rc = -ENOSYS;
874 break;
875 }
877 return rc;
878 }
880 static hvm_hypercall_t *hvm_hypercall64_table[NR_hypercalls] = {
881 HYPERCALL(memory_op),
882 [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
883 HYPERCALL(xen_version),
884 HYPERCALL(grant_table_op),
885 HYPERCALL(event_channel_op),
886 HYPERCALL(sched_op),
887 HYPERCALL(hvm_op)
888 };
890 static hvm_hypercall_t *hvm_hypercall32_table[NR_hypercalls] = {
891 [ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)do_memory_op_compat32,
892 [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
893 HYPERCALL(xen_version),
894 HYPERCALL(grant_table_op),
895 HYPERCALL(event_channel_op),
896 HYPERCALL(sched_op),
897 HYPERCALL(hvm_op)
898 };
900 #endif /* defined(__x86_64__) */
902 int hvm_do_hypercall(struct cpu_user_regs *regs)
903 {
904 int flush, mode = hvm_guest_x86_mode(current);
905 uint32_t eax = regs->eax;
907 switch ( mode )
908 {
909 #ifdef __x86_64__
910 case 8:
911 #endif
912 case 4:
913 case 2:
914 hvm_store_cpu_guest_regs(current, regs, NULL);
915 if ( unlikely(ring_3(regs)) )
916 {
917 default:
918 regs->eax = -EPERM;
919 return HVM_HCALL_completed;
920 }
921 case 0:
922 break;
923 }
925 if ( (eax >= NR_hypercalls) || !hvm_hypercall32_table[eax] )
926 {
927 regs->eax = -ENOSYS;
928 return HVM_HCALL_completed;
929 }
931 /*
932 * NB. In future flush only on decrease_reservation.
933 * For now we also need to flush when pages are added, as qemu-dm is not
934 * yet capable of faulting pages into an existing valid mapcache bucket.
935 */
936 flush = ((eax == __HYPERVISOR_memory_op) ||
937 (eax == __HYPERVISOR_grant_table_op)); /* needed ? */
938 this_cpu(hc_preempted) = 0;
940 #ifdef __x86_64__
941 if ( mode == 8 )
942 {
943 HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u(%lx, %lx, %lx, %lx, %lx)", eax,
944 regs->rdi, regs->rsi, regs->rdx, regs->r10, regs->r8);
946 regs->rax = hvm_hypercall64_table[eax](regs->rdi,
947 regs->rsi,
948 regs->rdx,
949 regs->r10,
950 regs->r8);
951 }
952 else
953 #endif
954 {
955 HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u(%x, %x, %x, %x, %x)", eax,
956 (uint32_t)regs->ebx, (uint32_t)regs->ecx,
957 (uint32_t)regs->edx, (uint32_t)regs->esi,
958 (uint32_t)regs->edi);
960 regs->eax = hvm_hypercall32_table[eax]((uint32_t)regs->ebx,
961 (uint32_t)regs->ecx,
962 (uint32_t)regs->edx,
963 (uint32_t)regs->esi,
964 (uint32_t)regs->edi);
965 }
967 HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u -> %lx",
968 eax, (unsigned long)regs->eax);
970 return (this_cpu(hc_preempted) ? HVM_HCALL_preempted :
971 flush ? HVM_HCALL_invalidate : HVM_HCALL_completed);
972 }
974 static void hvm_latch_shinfo_size(struct domain *d)
975 {
976 /*
977 * Called from operations which are among the very first executed by
978 * PV drivers on initialisation or after save/restore. These are sensible
979 * points at which to sample the execution mode of the guest and latch
980 * 32- or 64-bit format for shared state.
981 */
982 if ( current->domain == d )
983 d->arch.has_32bit_shinfo = (hvm_guest_x86_mode(current) != 8);
984 }
986 /* Initialise a hypercall transfer page for a VMX domain using
987 paravirtualised drivers. */
988 void hvm_hypercall_page_initialise(struct domain *d,
989 void *hypercall_page)
990 {
991 hvm_latch_shinfo_size(d);
992 hvm_funcs.init_hypercall_page(d, hypercall_page);
993 }
996 /*
997 * only called in HVM domain BSP context
998 * when booting, vcpuid is always equal to apic_id
999 */
1000 int hvm_bringup_ap(int vcpuid, int trampoline_vector)
1001 {
1002 struct vcpu *v;
1003 struct domain *d = current->domain;
1004 struct vcpu_guest_context *ctxt;
1005 int rc = 0;
1007 BUG_ON(!is_hvm_domain(d));
1009 if ( (v = d->vcpu[vcpuid]) == NULL )
1010 return -ENOENT;
1012 if ( (ctxt = xmalloc(struct vcpu_guest_context)) == NULL )
1013 {
1014 gdprintk(XENLOG_ERR,
1015 "Failed to allocate memory in hvm_bringup_ap.\n");
1016 return -ENOMEM;
1017 }
1019 hvm_init_ap_context(ctxt, vcpuid, trampoline_vector);
1021 /* Sync AP's TSC with BSP's. */
1022 v->arch.hvm_vcpu.cache_tsc_offset =
1023 v->domain->vcpu[0]->arch.hvm_vcpu.cache_tsc_offset;
1024 hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset);
1026 LOCK_BIGLOCK(d);
1027 rc = -EEXIST;
1028 if ( !v->is_initialised )
1029 rc = boot_vcpu(d, vcpuid, ctxt);
1030 UNLOCK_BIGLOCK(d);
1032 if ( rc != 0 )
1033 {
1034 gdprintk(XENLOG_ERR,
1035 "AP %d bringup failed in boot_vcpu %x.\n", vcpuid, rc);
1036 goto out;
1037 }
1039 if ( test_and_clear_bit(_VPF_down, &v->pause_flags) )
1040 vcpu_wake(v);
1041 gdprintk(XENLOG_INFO, "AP %d bringup succeeded.\n", vcpuid);
1043 out:
1044 xfree(ctxt);
1045 return rc;
1046 }
1048 static int hvmop_set_pci_intx_level(
1049 XEN_GUEST_HANDLE(xen_hvm_set_pci_intx_level_t) uop)
1050 {
1051 struct xen_hvm_set_pci_intx_level op;
1052 struct domain *d;
1053 int rc;
1055 if ( copy_from_guest(&op, uop, 1) )
1056 return -EFAULT;
1058 if ( !IS_PRIV(current->domain) )
1059 return -EPERM;
1061 if ( (op.domain > 0) || (op.bus > 0) || (op.device > 31) || (op.intx > 3) )
1062 return -EINVAL;
1064 d = rcu_lock_domain_by_id(op.domid);
1065 if ( d == NULL )
1066 return -ESRCH;
1068 rc = -EINVAL;
1069 if ( !is_hvm_domain(d) )
1070 goto out;
1072 rc = 0;
1073 switch ( op.level )
1074 {
1075 case 0:
1076 hvm_pci_intx_deassert(d, op.device, op.intx);
1077 break;
1078 case 1:
1079 hvm_pci_intx_assert(d, op.device, op.intx);
1080 break;
1081 default:
1082 rc = -EINVAL;
1083 break;
1084 }
1086 out:
1087 rcu_unlock_domain(d);
1088 return rc;
1089 }
1091 static int hvmop_set_isa_irq_level(
1092 XEN_GUEST_HANDLE(xen_hvm_set_isa_irq_level_t) uop)
1093 {
1094 struct xen_hvm_set_isa_irq_level op;
1095 struct domain *d;
1096 int rc;
1098 if ( copy_from_guest(&op, uop, 1) )
1099 return -EFAULT;
1101 if ( !IS_PRIV(current->domain) )
1102 return -EPERM;
1104 if ( op.isa_irq > 15 )
1105 return -EINVAL;
1107 d = rcu_lock_domain_by_id(op.domid);
1108 if ( d == NULL )
1109 return -ESRCH;
1111 rc = -EINVAL;
1112 if ( !is_hvm_domain(d) )
1113 goto out;
1115 rc = 0;
1116 switch ( op.level )
1117 {
1118 case 0:
1119 hvm_isa_irq_deassert(d, op.isa_irq);
1120 break;
1121 case 1:
1122 hvm_isa_irq_assert(d, op.isa_irq);
1123 break;
1124 default:
1125 rc = -EINVAL;
1126 break;
1127 }
1129 out:
1130 rcu_unlock_domain(d);
1131 return rc;
1132 }
1134 static int hvmop_set_pci_link_route(
1135 XEN_GUEST_HANDLE(xen_hvm_set_pci_link_route_t) uop)
1136 {
1137 struct xen_hvm_set_pci_link_route op;
1138 struct domain *d;
1139 int rc;
1141 if ( copy_from_guest(&op, uop, 1) )
1142 return -EFAULT;
1144 if ( !IS_PRIV(current->domain) )
1145 return -EPERM;
1147 if ( (op.link > 3) || (op.isa_irq > 15) )
1148 return -EINVAL;
1150 d = rcu_lock_domain_by_id(op.domid);
1151 if ( d == NULL )
1152 return -ESRCH;
1154 rc = -EINVAL;
1155 if ( !is_hvm_domain(d) )
1156 goto out;
1158 rc = 0;
1159 hvm_set_pci_link_route(d, op.link, op.isa_irq);
1161 out:
1162 rcu_unlock_domain(d);
1163 return rc;
1164 }
1166 static int hvmop_flush_tlb_all(void)
1167 {
1168 flush_tlb_mask(current->domain->domain_dirty_cpumask);
1169 return 0;
1170 }
1172 long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE(void) arg)
1173 {
1175 long rc = 0;
1177 switch ( op )
1178 {
1179 case HVMOP_set_param:
1180 case HVMOP_get_param:
1181 {
1182 struct xen_hvm_param a;
1183 struct hvm_ioreq_page *iorp;
1184 struct domain *d;
1185 struct vcpu *v;
1187 if ( copy_from_guest(&a, arg, 1) )
1188 return -EFAULT;
1190 if ( a.index >= HVM_NR_PARAMS )
1191 return -EINVAL;
1193 if ( a.domid == DOMID_SELF )
1194 d = rcu_lock_current_domain();
1195 else if ( IS_PRIV(current->domain) )
1196 d = rcu_lock_domain_by_id(a.domid);
1197 else
1198 return -EPERM;
1200 if ( d == NULL )
1201 return -ESRCH;
1203 rc = -EINVAL;
1204 if ( !is_hvm_domain(d) )
1205 goto param_fail;
1207 if ( op == HVMOP_set_param )
1208 {
1209 switch ( a.index )
1210 {
1211 case HVM_PARAM_IOREQ_PFN:
1212 iorp = &d->arch.hvm_domain.ioreq;
1213 rc = hvm_set_ioreq_page(d, iorp, a.value);
1214 spin_lock(&iorp->lock);
1215 if ( (rc == 0) && (iorp->va != NULL) )
1216 /* Initialise evtchn port info if VCPUs already created. */
1217 for_each_vcpu ( d, v )
1218 get_ioreq(v)->vp_eport = v->arch.hvm_vcpu.xen_port;
1219 spin_unlock(&iorp->lock);
1220 break;
1221 case HVM_PARAM_BUFIOREQ_PFN:
1222 iorp = &d->arch.hvm_domain.buf_ioreq;
1223 rc = hvm_set_ioreq_page(d, iorp, a.value);
1224 break;
1225 case HVM_PARAM_CALLBACK_IRQ:
1226 hvm_set_callback_via(d, a.value);
1227 hvm_latch_shinfo_size(d);
1228 break;
1229 }
1230 d->arch.hvm_domain.params[a.index] = a.value;
1231 rc = 0;
1232 }
1233 else
1234 {
1235 a.value = d->arch.hvm_domain.params[a.index];
1236 rc = copy_to_guest(arg, &a, 1) ? -EFAULT : 0;
1237 }
1239 HVM_DBG_LOG(DBG_LEVEL_HCALL, "%s param %u = %"PRIx64,
1240 op == HVMOP_set_param ? "set" : "get",
1241 a.index, a.value);
1243 param_fail:
1244 rcu_unlock_domain(d);
1245 break;
1246 }
1248 case HVMOP_set_pci_intx_level:
1249 rc = hvmop_set_pci_intx_level(
1250 guest_handle_cast(arg, xen_hvm_set_pci_intx_level_t));
1251 break;
1253 case HVMOP_set_isa_irq_level:
1254 rc = hvmop_set_isa_irq_level(
1255 guest_handle_cast(arg, xen_hvm_set_isa_irq_level_t));
1256 break;
1258 case HVMOP_set_pci_link_route:
1259 rc = hvmop_set_pci_link_route(
1260 guest_handle_cast(arg, xen_hvm_set_pci_link_route_t));
1261 break;
1263 case HVMOP_flush_tlbs:
1264 rc = guest_handle_is_null(arg) ? hvmop_flush_tlb_all() : -ENOSYS;
1265 break;
1267 default:
1268 {
1269 gdprintk(XENLOG_WARNING, "Bad HVM op %ld.\n", op);
1270 rc = -ENOSYS;
1271 break;
1272 }
1273 }
1275 return rc;
1276 }
1278 /*
1279 * Local variables:
1280 * mode: C
1281 * c-set-style: "BSD"
1282 * c-basic-offset: 4
1283 * tab-width: 4
1284 * indent-tabs-mode: nil
1285 * End:
1286 */