ia64/xen-unstable

view xen/arch/x86/hvm/hvm.c @ 15798:c398dad9d50a

hvm: Provide an HVMOP_flush_tlbs to flush VCPU TLBs.
From: Peter Johnston <pjohnston@xensource.com>
Signed-off-by: Keir Fraser <keir@xensource.com>
author:   kfraser@localhost.localdomain
date:     Thu Aug 30 09:57:09 2007 +0100 (2007-08-30)
parents:  08e962b8597c
children: 86a154e1ef5d
/*
 * hvm.c: Common hardware virtual machine abstractions.
 *
 * Copyright (c) 2004, Intel Corporation.
 * Copyright (c) 2005, International Business Machines Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 * Place - Suite 330, Boston, MA 02111-1307 USA.
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/trace.h>
#include <xen/sched.h>
#include <xen/irq.h>
#include <xen/softirq.h>
#include <xen/domain.h>
#include <xen/domain_page.h>
#include <xen/hypercall.h>
#include <xen/guest_access.h>
#include <xen/event.h>
#include <asm/current.h>
#include <asm/e820.h>
#include <asm/io.h>
#include <asm/paging.h>
#include <asm/regs.h>
#include <asm/cpufeature.h>
#include <asm/processor.h>
#include <asm/types.h>
#include <asm/msr.h>
#include <asm/mc146818rtc.h>
#include <asm/spinlock.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/vpt.h>
#include <asm/hvm/support.h>
#include <public/sched.h>
#include <public/hvm/ioreq.h>
#include <public/version.h>
#include <public/memory.h>
int hvm_enabled __read_mostly;

unsigned int opt_hvm_debug_level __read_mostly;
integer_param("hvm_debug", opt_hvm_debug_level);

struct hvm_function_table hvm_funcs __read_mostly;

/* I/O permission bitmap is globally shared by all HVM guests. */
char __attribute__ ((__section__ (".bss.page_aligned")))
    hvm_io_bitmap[3*PAGE_SIZE];

void hvm_enable(struct hvm_function_table *fns)
{
    BUG_ON(hvm_enabled);
    printk("HVM: %s enabled\n", fns->name);

    /*
     * Allow direct access to the PC debug port (it is often used for I/O
     * delays, but the vmexits simply slow things down).
     */
    memset(hvm_io_bitmap, ~0, sizeof(hvm_io_bitmap));
    clear_bit(0x80, hvm_io_bitmap);

    hvm_funcs   = *fns;
    hvm_enabled = 1;
}

void hvm_set_guest_time(struct vcpu *v, u64 gtime)
{
    u64 host_tsc;

    rdtscll(host_tsc);

    v->arch.hvm_vcpu.cache_tsc_offset = gtime - host_tsc;
    hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset);
}

u64 hvm_get_guest_time(struct vcpu *v)
{
    u64 host_tsc;

    rdtscll(host_tsc);
    return host_tsc + v->arch.hvm_vcpu.cache_tsc_offset;
}
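
/*
 * Note on the two helpers above: the per-vcpu cache_tsc_offset is defined so
 * that guest_time == host_tsc + cache_tsc_offset. hvm_set_guest_time()
 * recomputes the offset from a fresh host TSC read and pushes it into the
 * VMCS/VMCB via hvm_funcs.set_tsc_offset(); hvm_get_guest_time() simply
 * re-applies the cached offset to a new host TSC read.
 */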
void hvm_migrate_timers(struct vcpu *v)
{
    rtc_migrate_timers(v);
    hpet_migrate_timers(v);
    pt_migrate(v);
}

void hvm_do_resume(struct vcpu *v)
{
    ioreq_t *p;

    if ( !v->fpu_dirtied )
        hvm_funcs.stts(v);

    pt_thaw_time(v);

    /* NB. Optimised for common case (p->state == STATE_IOREQ_NONE). */
    p = &get_ioreq(v)->vp_ioreq;
    while ( p->state != STATE_IOREQ_NONE )
    {
        switch ( p->state )
        {
        case STATE_IORESP_READY: /* IORESP_READY -> NONE */
            hvm_io_assist();
            break;
        case STATE_IOREQ_READY:  /* IOREQ_{READY,INPROCESS} -> IORESP_READY */
        case STATE_IOREQ_INPROCESS:
            wait_on_xen_event_channel(v->arch.hvm_vcpu.xen_port,
                                      (p->state != STATE_IOREQ_READY) &&
                                      (p->state != STATE_IOREQ_INPROCESS));
            break;
        default:
            gdprintk(XENLOG_ERR, "Weird HVM iorequest state %d.\n", p->state);
            domain_crash_synchronous();
        }
    }
}
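
/*
 * The loop above drives the ioreq state machine (as reflected by the case
 * labels, not an external spec):
 *
 *   NONE -> READY             set by hvm_send_assist_req() below
 *   READY -> INPROCESS        device model has picked the request up
 *   INPROCESS -> IORESP_READY device model has completed the request
 *   IORESP_READY -> NONE      hvm_io_assist() consumes the response
 *
 * The vcpu blocks on its xen_port event channel while the request is in the
 * READY/INPROCESS states, so all outstanding I/O is finished before the vcpu
 * re-enters the guest.
 */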
static void hvm_init_ioreq_page(
    struct domain *d, struct hvm_ioreq_page *iorp)
{
    memset(iorp, 0, sizeof(*iorp));
    spin_lock_init(&iorp->lock);
    domain_pause(d);
}

static void hvm_destroy_ioreq_page(
    struct domain *d, struct hvm_ioreq_page *iorp)
{
    spin_lock(&iorp->lock);

    ASSERT(d->is_dying);

    if ( iorp->va != NULL )
    {
        unmap_domain_page_global(iorp->va);
        put_page_and_type(iorp->page);
        iorp->va = NULL;
    }

    spin_unlock(&iorp->lock);
}

static int hvm_set_ioreq_page(
    struct domain *d, struct hvm_ioreq_page *iorp, unsigned long gmfn)
{
    struct page_info *page;
    unsigned long mfn;
    void *va;

    mfn = gmfn_to_mfn(d, gmfn);
    if ( !mfn_valid(mfn) )
        return -EINVAL;

    page = mfn_to_page(mfn);
    if ( !get_page_and_type(page, d, PGT_writable_page) )
        return -EINVAL;

    va = map_domain_page_global(mfn);
    if ( va == NULL )
    {
        put_page_and_type(page);
        return -ENOMEM;
    }

    spin_lock(&iorp->lock);

    if ( (iorp->va != NULL) || d->is_dying )
    {
        spin_unlock(&iorp->lock);
        unmap_domain_page_global(va);
        put_page_and_type(mfn_to_page(mfn));
        return -EINVAL;
    }

    iorp->va = va;
    iorp->page = page;

    spin_unlock(&iorp->lock);

    domain_unpause(d);

    return 0;
}

int hvm_domain_initialise(struct domain *d)
{
    int rc;

    if ( !hvm_enabled )
    {
        gdprintk(XENLOG_WARNING, "Attempt to create a HVM guest "
                 "on a non-VT/AMDV platform.\n");
        return -EINVAL;
    }

    spin_lock_init(&d->arch.hvm_domain.pbuf_lock);
    spin_lock_init(&d->arch.hvm_domain.irq_lock);

    rc = paging_enable(d, PG_refcounts|PG_translate|PG_external);
    if ( rc != 0 )
        return rc;

    vpic_init(d);
    vioapic_init(d);

    hvm_init_ioreq_page(d, &d->arch.hvm_domain.ioreq);
    hvm_init_ioreq_page(d, &d->arch.hvm_domain.buf_ioreq);

    return hvm_funcs.domain_initialise(d);
}

void hvm_domain_relinquish_resources(struct domain *d)
{
    hvm_destroy_ioreq_page(d, &d->arch.hvm_domain.ioreq);
    hvm_destroy_ioreq_page(d, &d->arch.hvm_domain.buf_ioreq);

    pit_deinit(d);
    rtc_deinit(d);
    pmtimer_deinit(d);
    hpet_deinit(d);
}

void hvm_domain_destroy(struct domain *d)
{
    hvm_funcs.domain_destroy(d);
}
static int hvm_save_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
{
    struct vcpu *v;
    struct hvm_hw_cpu ctxt;
    struct vcpu_guest_context *vc;

    for_each_vcpu(d, v)
    {
        /* We don't need to save state for a vcpu that is down; the restore
         * code will leave it down if there is nothing saved. */
        if ( test_bit(_VPF_down, &v->pause_flags) )
            continue;

        /* Architecture-specific vmcs/vmcb bits */
        hvm_funcs.save_cpu_ctxt(v, &ctxt);

        /* Other vcpu register state */
        vc = &v->arch.guest_context;
        if ( v->fpu_initialised )
            memcpy(ctxt.fpu_regs, &vc->fpu_ctxt, sizeof(ctxt.fpu_regs));
        else
            memset(ctxt.fpu_regs, 0, sizeof(ctxt.fpu_regs));
        ctxt.rax = vc->user_regs.eax;
        ctxt.rbx = vc->user_regs.ebx;
        ctxt.rcx = vc->user_regs.ecx;
        ctxt.rdx = vc->user_regs.edx;
        ctxt.rbp = vc->user_regs.ebp;
        ctxt.rsi = vc->user_regs.esi;
        ctxt.rdi = vc->user_regs.edi;
        /* %rsp handled by arch-specific call above */
#ifdef __x86_64__
        ctxt.r8  = vc->user_regs.r8;
        ctxt.r9  = vc->user_regs.r9;
        ctxt.r10 = vc->user_regs.r10;
        ctxt.r11 = vc->user_regs.r11;
        ctxt.r12 = vc->user_regs.r12;
        ctxt.r13 = vc->user_regs.r13;
        ctxt.r14 = vc->user_regs.r14;
        ctxt.r15 = vc->user_regs.r15;
#endif
        ctxt.dr0 = vc->debugreg[0];
        ctxt.dr1 = vc->debugreg[1];
        ctxt.dr2 = vc->debugreg[2];
        ctxt.dr3 = vc->debugreg[3];
        ctxt.dr6 = vc->debugreg[6];
        ctxt.dr7 = vc->debugreg[7];

        if ( hvm_save_entry(CPU, v->vcpu_id, h, &ctxt) != 0 )
            return 1;
    }
    return 0;
}

static int hvm_load_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
{
    int vcpuid, rc;
    struct vcpu *v;
    struct hvm_hw_cpu ctxt;
    struct vcpu_guest_context *vc;

    /* Which vcpu is this? */
    vcpuid = hvm_load_instance(h);
    if ( vcpuid > MAX_VIRT_CPUS || (v = d->vcpu[vcpuid]) == NULL )
    {
        gdprintk(XENLOG_ERR, "HVM restore: domain has no vcpu %u\n", vcpuid);
        return -EINVAL;
    }
    vc = &v->arch.guest_context;

    /* Need to init this vcpu before loading its contents */
    LOCK_BIGLOCK(d);
    if ( !v->is_initialised )
        if ( (rc = boot_vcpu(d, vcpuid, vc)) != 0 )
            return rc;
    UNLOCK_BIGLOCK(d);

    if ( hvm_load_entry(CPU, h, &ctxt) != 0 )
        return -EINVAL;

    /* Architecture-specific vmcs/vmcb bits */
    if ( hvm_funcs.load_cpu_ctxt(v, &ctxt) < 0 )
        return -EINVAL;

    /* Other vcpu register state */
    memcpy(&vc->fpu_ctxt, ctxt.fpu_regs, sizeof(ctxt.fpu_regs));
    vc->user_regs.eax = ctxt.rax;
    vc->user_regs.ebx = ctxt.rbx;
    vc->user_regs.ecx = ctxt.rcx;
    vc->user_regs.edx = ctxt.rdx;
    vc->user_regs.ebp = ctxt.rbp;
    vc->user_regs.esi = ctxt.rsi;
    vc->user_regs.edi = ctxt.rdi;
    vc->user_regs.esp = ctxt.rsp;
#ifdef __x86_64__
    vc->user_regs.r8  = ctxt.r8;
    vc->user_regs.r9  = ctxt.r9;
    vc->user_regs.r10 = ctxt.r10;
    vc->user_regs.r11 = ctxt.r11;
    vc->user_regs.r12 = ctxt.r12;
    vc->user_regs.r13 = ctxt.r13;
    vc->user_regs.r14 = ctxt.r14;
    vc->user_regs.r15 = ctxt.r15;
#endif
    vc->debugreg[0] = ctxt.dr0;
    vc->debugreg[1] = ctxt.dr1;
    vc->debugreg[2] = ctxt.dr2;
    vc->debugreg[3] = ctxt.dr3;
    vc->debugreg[6] = ctxt.dr6;
    vc->debugreg[7] = ctxt.dr7;

    vc->flags = VGCF_online;
    v->fpu_initialised = 1;

    /* Auxiliary processors should be woken immediately. */
    if ( test_and_clear_bit(_VPF_down, &v->pause_flags) )
        vcpu_wake(v);

    return 0;
}

HVM_REGISTER_SAVE_RESTORE(CPU, hvm_save_cpu_ctxt, hvm_load_cpu_ctxt,
                          1, HVMSR_PER_VCPU);
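
/*
 * The registration above wires the CPU record into the generic HVM
 * save/restore machinery: hvm_save_cpu_ctxt() emits one record per vcpu
 * (HVMSR_PER_VCPU, one instance each), and hvm_load_cpu_ctxt() consumes
 * them, booting any vcpu that has not yet been initialised before loading
 * its register state.
 */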
int hvm_vcpu_initialise(struct vcpu *v)
{
    int rc;

    if ( (rc = vlapic_init(v)) != 0 )
        return rc;

    if ( (rc = hvm_funcs.vcpu_initialise(v)) != 0 )
    {
        vlapic_destroy(v);
        return rc;
    }

    /* Create ioreq event channel. */
    rc = alloc_unbound_xen_event_channel(v, 0);
    if ( rc < 0 )
    {
        hvm_funcs.vcpu_destroy(v);
        vlapic_destroy(v);
        return rc;
    }

    /* Register ioreq event channel. */
    v->arch.hvm_vcpu.xen_port = rc;
    spin_lock(&v->domain->arch.hvm_domain.ioreq.lock);
    if ( v->domain->arch.hvm_domain.ioreq.va != NULL )
        get_ioreq(v)->vp_eport = v->arch.hvm_vcpu.xen_port;
    spin_unlock(&v->domain->arch.hvm_domain.ioreq.lock);

    spin_lock_init(&v->arch.hvm_vcpu.tm_lock);
    INIT_LIST_HEAD(&v->arch.hvm_vcpu.tm_list);

    if ( v->vcpu_id == 0 )
    {
        /* NB. All these really belong in hvm_domain_initialise(). */
        pit_init(v, cpu_khz);
        rtc_init(v, RTC_PORT(0));
        pmtimer_init(v);
        hpet_init(v);

        /* Init guest TSC to start from zero. */
        hvm_set_guest_time(v, 0);
    }

    return 0;
}

void hvm_vcpu_destroy(struct vcpu *v)
{
    vlapic_destroy(v);
    hvm_funcs.vcpu_destroy(v);

    /* Event channel is already freed by evtchn_destroy(). */
    /*free_xen_event_channel(v, v->arch.hvm_vcpu.xen_port);*/
}
void hvm_vcpu_reset(struct vcpu *v)
{
    vcpu_pause(v);

    vlapic_reset(vcpu_vlapic(v));

    hvm_funcs.vcpu_initialise(v);

    set_bit(_VPF_down, &v->pause_flags);
    clear_bit(_VPF_blocked, &v->pause_flags);
    v->fpu_initialised = 0;
    v->fpu_dirtied = 0;
    v->is_initialised = 0;

    vcpu_unpause(v);
}

static void hvm_vcpu_down(void)
{
    struct vcpu *v = current;
    struct domain *d = v->domain;
    int online_count = 0;

    gdprintk(XENLOG_INFO, "DOM%d/VCPU%d: going offline.\n",
             d->domain_id, v->vcpu_id);

    /* Doesn't halt us immediately, but we'll never return to guest context. */
    set_bit(_VPF_down, &v->pause_flags);
    vcpu_sleep_nosync(v);

    /* Any other VCPUs online? ... */
    LOCK_BIGLOCK(d);
    for_each_vcpu ( d, v )
        if ( !test_bit(_VPF_down, &v->pause_flags) )
            online_count++;
    UNLOCK_BIGLOCK(d);

    /* ... Shut down the domain if not. */
    if ( online_count == 0 )
    {
        gdprintk(XENLOG_INFO, "DOM%d: all CPUs offline -- powering off.\n",
                 d->domain_id);
        domain_shutdown(d, SHUTDOWN_poweroff);
    }
}

void hvm_send_assist_req(struct vcpu *v)
{
    ioreq_t *p;

    if ( unlikely(!vcpu_start_shutdown_deferral(v)) )
        return; /* implicitly bins the i/o operation */

    p = &get_ioreq(v)->vp_ioreq;
    if ( unlikely(p->state != STATE_IOREQ_NONE) )
    {
        /* This indicates a bug in the device model. Crash the domain. */
        gdprintk(XENLOG_ERR, "Device model set bad IO state %d.\n", p->state);
        domain_crash_synchronous();
    }

    prepare_wait_on_xen_event_channel(v->arch.hvm_vcpu.xen_port);

    /*
     * Following happens /after/ blocking and setting up ioreq contents.
     * prepare_wait_on_xen_event_channel() is an implicit barrier.
     */
    p->state = STATE_IOREQ_READY;
    notify_via_xen_event_channel(v->arch.hvm_vcpu.xen_port);
}
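
/*
 * hvm_send_assist_req() is the submitting half of the protocol completed in
 * hvm_do_resume() above: the vcpu prepares to wait on its xen_port event
 * channel, publishes the request by moving it from STATE_IOREQ_NONE to
 * STATE_IOREQ_READY, and then notifies the device model over that channel.
 */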
void hvm_hlt(unsigned long rflags)
{
    /*
     * If we halt with interrupts disabled, that's a pretty sure sign that we
     * want to shut down. In a real processor, NMIs are the only way to break
     * out of this.
     */
    if ( unlikely(!(rflags & X86_EFLAGS_IF)) )
        return hvm_vcpu_down();

    do_sched_op_compat(SCHEDOP_block, 0);
}

void hvm_triple_fault(void)
{
    struct vcpu *v = current;
    gdprintk(XENLOG_INFO, "Triple fault on VCPU%d - "
             "invoking HVM system reset.\n", v->vcpu_id);
    domain_shutdown(v->domain, SHUTDOWN_reboot);
}

int hvm_set_cr0(unsigned long value)
{
    struct vcpu *v = current;
    unsigned long mfn, old_base_mfn, old_value = v->arch.hvm_vcpu.guest_cr[0];

    HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx", value);

    if ( (u32)value != value )
    {
        HVM_DBG_LOG(DBG_LEVEL_1,
                    "Guest attempts to set upper 32 bits in CR0: %lx",
                    value);
        hvm_inject_exception(TRAP_gp_fault, 0, 0);
        return 0;
    }

    value &= ~HVM_CR0_GUEST_RESERVED_BITS;

    /* ET is reserved and should always be 1. */
    value |= X86_CR0_ET;

    if ( (value & (X86_CR0_PE|X86_CR0_PG)) == X86_CR0_PG )
    {
        hvm_inject_exception(TRAP_gp_fault, 0, 0);
        return 0;
    }

    if ( (value & X86_CR0_PG) && !(old_value & X86_CR0_PG) )
    {
        if ( v->arch.hvm_vcpu.guest_efer & EFER_LME )
        {
            if ( !(v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PAE) )
            {
                HVM_DBG_LOG(DBG_LEVEL_1, "Enable paging before PAE enable");
                hvm_inject_exception(TRAP_gp_fault, 0, 0);
                return 0;
            }
            HVM_DBG_LOG(DBG_LEVEL_1, "Enabling long mode");
            v->arch.hvm_vcpu.guest_efer |= EFER_LMA;
            hvm_update_guest_efer(v);
        }

        if ( !paging_mode_hap(v->domain) )
        {
            /* The guest CR3 must be pointing to the guest physical. */
            mfn = get_mfn_from_gpfn(v->arch.hvm_vcpu.guest_cr[3]>>PAGE_SHIFT);
            if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain))
            {
                gdprintk(XENLOG_ERR, "Invalid CR3 value = %lx (mfn=%lx)\n",
                         v->arch.hvm_vcpu.guest_cr[3], mfn);
                domain_crash(v->domain);
                return 0;
            }

            /* Now arch.guest_table points to machine physical. */
            old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
            v->arch.guest_table = pagetable_from_pfn(mfn);
            if ( old_base_mfn )
                put_page(mfn_to_page(old_base_mfn));

            HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
                        v->arch.hvm_vcpu.guest_cr[3], mfn);
        }
    }
    else if ( !(value & X86_CR0_PG) && (old_value & X86_CR0_PG) )
    {
        /* When CR0.PG is cleared, LMA is cleared immediately. */
        if ( hvm_long_mode_enabled(v) )
        {
            v->arch.hvm_vcpu.guest_efer &= ~EFER_LMA;
            hvm_update_guest_efer(v);
        }

        if ( !paging_mode_hap(v->domain) )
        {
            put_page(mfn_to_page(get_mfn_from_gpfn(
                v->arch.hvm_vcpu.guest_cr[3] >> PAGE_SHIFT)));
            v->arch.guest_table = pagetable_null();
        }
    }

    v->arch.hvm_vcpu.guest_cr[0] = value;
    hvm_update_guest_cr(v, 0);

    if ( (value ^ old_value) & X86_CR0_PG )
        paging_update_paging_modes(v);

    return 1;
}
int hvm_set_cr3(unsigned long value)
{
    unsigned long old_base_mfn, mfn;
    struct vcpu *v = current;

    if ( paging_mode_hap(v->domain) || !hvm_paging_enabled(v) )
    {
        /* Nothing to do. */
    }
    else if ( value == v->arch.hvm_vcpu.guest_cr[3] )
    {
        /* Shadow-mode TLB flush. Invalidate the shadow. */
        mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
        if ( mfn != pagetable_get_pfn(v->arch.guest_table) )
            goto bad_cr3;
    }
    else
    {
        /* Shadow-mode CR3 change. Check PDBR and then make a new shadow. */
        HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
        mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
        if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
            goto bad_cr3;

        old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
        v->arch.guest_table = pagetable_from_pfn(mfn);

        if ( old_base_mfn )
            put_page(mfn_to_page(old_base_mfn));

        HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
    }

    v->arch.hvm_vcpu.guest_cr[3] = value;
    paging_update_cr3(v);
    return 1;

 bad_cr3:
    gdprintk(XENLOG_ERR, "Invalid CR3\n");
    domain_crash(v->domain);
    return 0;
}

int hvm_set_cr4(unsigned long value)
{
    struct vcpu *v = current;
    unsigned long old_cr;

    if ( value & HVM_CR4_GUEST_RESERVED_BITS )
    {
        HVM_DBG_LOG(DBG_LEVEL_1,
                    "Guest attempts to set reserved bit in CR4: %lx",
                    value);
        goto gpf;
    }

    if ( !(value & X86_CR4_PAE) && hvm_long_mode_enabled(v) )
    {
        HVM_DBG_LOG(DBG_LEVEL_1, "Guest cleared CR4.PAE while "
                    "EFER.LMA is set");
        goto gpf;
    }

    old_cr = v->arch.hvm_vcpu.guest_cr[4];
    v->arch.hvm_vcpu.guest_cr[4] = value;
    hvm_update_guest_cr(v, 4);

    /* Modifying CR4.{PSE,PAE,PGE} invalidates all TLB entries, inc. Global. */
    if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
        paging_update_paging_modes(v);

    return 1;

 gpf:
    hvm_inject_exception(TRAP_gp_fault, 0, 0);
    return 0;
}
/*
 * __hvm_copy():
 *  @buf  = hypervisor buffer
 *  @addr = guest address to copy to/from
 *  @size = number of bytes to copy
 *  @dir  = copy *to* guest (TRUE) or *from* guest (FALSE)?
 *  @virt = addr is *virtual* (TRUE) or *guest physical* (FALSE)?
 * Returns number of bytes failed to copy (0 == complete success).
 */
static int __hvm_copy(void *buf, paddr_t addr, int size, int dir, int virt)
{
    unsigned long gfn, mfn;
    char *p;
    int count, todo;

    todo = size;
    while ( todo > 0 )
    {
        count = min_t(int, PAGE_SIZE - (addr & ~PAGE_MASK), todo);

        if ( virt )
            gfn = paging_gva_to_gfn(current, addr);
        else
            gfn = addr >> PAGE_SHIFT;

        mfn = get_mfn_from_gpfn(gfn);

        if ( mfn == INVALID_MFN )
            return todo;

        p = (char *)map_domain_page(mfn) + (addr & ~PAGE_MASK);

        if ( dir )
        {
            memcpy(p, buf, count); /* dir == TRUE:  *to* guest */
            paging_mark_dirty(current->domain, mfn);
        }
        else
            memcpy(buf, p, count); /* dir == FALSE: *from* guest */

        unmap_domain_page(p);

        addr += count;
        buf  += count;
        todo -= count;
    }

    return 0;
}

int hvm_copy_to_guest_phys(paddr_t paddr, void *buf, int size)
{
    return __hvm_copy(buf, paddr, size, 1, 0);
}

int hvm_copy_from_guest_phys(void *buf, paddr_t paddr, int size)
{
    return __hvm_copy(buf, paddr, size, 0, 0);
}

int hvm_copy_to_guest_virt(unsigned long vaddr, void *buf, int size)
{
    return __hvm_copy(buf, vaddr, size, 1, 1);
}

int hvm_copy_from_guest_virt(void *buf, unsigned long vaddr, int size)
{
    return __hvm_copy(buf, vaddr, size, 0, 1);
}
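
/*
 * Usage sketch for the wrappers above (illustrative only; 'insn' and
 * 'guest_rip' are hypothetical caller-side names, not symbols in this file):
 *
 *     char insn[16];
 *     if ( hvm_copy_from_guest_virt(insn, guest_rip, sizeof(insn)) != 0 )
 *         ... non-zero return is the number of bytes left uncopied ...
 *
 * Each wrapper just fixes the dir/virt flags of __hvm_copy() for one of the
 * four direction/address-type combinations.
 */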
/* HVM specific printbuf. Mostly used for hvmloader chit-chat. */
void hvm_print_line(struct vcpu *v, const char c)
{
    struct hvm_domain *hd = &v->domain->arch.hvm_domain;

    spin_lock(&hd->pbuf_lock);
    hd->pbuf[hd->pbuf_idx++] = c;
    if ( (hd->pbuf_idx == (sizeof(hd->pbuf) - 2)) || (c == '\n') )
    {
        if ( c != '\n' )
            hd->pbuf[hd->pbuf_idx++] = '\n';
        hd->pbuf[hd->pbuf_idx] = '\0';
        printk(XENLOG_G_DEBUG "HVM%u: %s", v->domain->domain_id, hd->pbuf);
        hd->pbuf_idx = 0;
    }
    spin_unlock(&hd->pbuf_lock);
}
void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
               unsigned int *ecx, unsigned int *edx)
{
    if ( !cpuid_hypervisor_leaves(input, eax, ebx, ecx, edx) )
    {
        cpuid(input, eax, ebx, ecx, edx);

        if ( input == 0x00000001 )
        {
            struct vcpu *v = current;

            clear_bit(X86_FEATURE_MWAIT & 31, ecx);

            if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
                clear_bit(X86_FEATURE_APIC & 31, edx);

#if CONFIG_PAGING_LEVELS >= 3
            if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
#endif
                clear_bit(X86_FEATURE_PAE & 31, edx);
            clear_bit(X86_FEATURE_PSE36 & 31, edx);
        }
        else if ( input == 0x80000001 )
        {
#if CONFIG_PAGING_LEVELS >= 3
            struct vcpu *v = current;
            if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
#endif
                clear_bit(X86_FEATURE_NX & 31, edx);
#ifdef __i386__
            /* Mask feature for Intel ia32e or AMD long mode. */
            clear_bit(X86_FEATURE_LAHF_LM & 31, ecx);

            clear_bit(X86_FEATURE_LM & 31, edx);
            clear_bit(X86_FEATURE_SYSCALL & 31, edx);
#endif
        }
    }
}

static long hvm_grant_table_op(
    unsigned int cmd, XEN_GUEST_HANDLE(void) uop, unsigned int count)
{
    if ( cmd != GNTTABOP_query_size )
        return -ENOSYS; /* all other commands need auditing */
    return do_grant_table_op(cmd, uop, count);
}
typedef unsigned long hvm_hypercall_t(
    unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);

#define HYPERCALL(x)                                        \
    [ __HYPERVISOR_ ## x ] = (hvm_hypercall_t *) do_ ## x

#if defined(__i386__)

static hvm_hypercall_t *hvm_hypercall32_table[NR_hypercalls] = {
    HYPERCALL(memory_op),
    [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
    HYPERCALL(xen_version),
    HYPERCALL(grant_table_op),
    HYPERCALL(event_channel_op),
    HYPERCALL(sched_op),
    HYPERCALL(hvm_op)
};

#else /* defined(__x86_64__) */

static long do_memory_op_compat32(int cmd, XEN_GUEST_HANDLE(void) arg)
{
    extern long do_add_to_physmap(struct xen_add_to_physmap *xatp);
    long rc;

    switch ( cmd )
    {
    case XENMEM_add_to_physmap:
    {
        struct {
            domid_t domid;
            uint32_t space;
            uint32_t idx;
            uint32_t gpfn;
        } u;
        struct xen_add_to_physmap h;

        if ( copy_from_guest(&u, arg, 1) )
            return -EFAULT;

        h.domid = u.domid;
        h.space = u.space;
        h.idx = u.idx;
        h.gpfn = u.gpfn;

        this_cpu(guest_handles_in_xen_space) = 1;
        rc = do_memory_op(cmd, guest_handle_from_ptr(&h, void));
        this_cpu(guest_handles_in_xen_space) = 0;

        break;
    }

    default:
        gdprintk(XENLOG_WARNING, "memory_op %d.\n", cmd);
        rc = -ENOSYS;
        break;
    }

    return rc;
}

static hvm_hypercall_t *hvm_hypercall64_table[NR_hypercalls] = {
    HYPERCALL(memory_op),
    [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
    HYPERCALL(xen_version),
    HYPERCALL(grant_table_op),
    HYPERCALL(event_channel_op),
    HYPERCALL(sched_op),
    HYPERCALL(hvm_op)
};

static hvm_hypercall_t *hvm_hypercall32_table[NR_hypercalls] = {
    [ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)do_memory_op_compat32,
    [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
    HYPERCALL(xen_version),
    HYPERCALL(grant_table_op),
    HYPERCALL(event_channel_op),
    HYPERCALL(sched_op),
    HYPERCALL(hvm_op)
};

#endif /* defined(__x86_64__) */
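
/*
 * For reference, each HYPERCALL(x) entry above expands to a designated
 * initialiser such as
 *
 *     [ __HYPERVISOR_xen_version ] = (hvm_hypercall_t *) do_xen_version
 *
 * so the tables map hypercall numbers onto do_*() handlers, with the explicit
 * designated entries pointing selected ops at the local wrappers
 * (hvm_grant_table_op() and, for 32-bit guests on x86_64,
 * do_memory_op_compat32()).
 */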
int hvm_do_hypercall(struct cpu_user_regs *regs)
{
    int flush, mode = hvm_guest_x86_mode(current);
    uint32_t eax = regs->eax;

    switch ( mode )
    {
#ifdef __x86_64__
    case 8:
#endif
    case 4:
    case 2:
        hvm_store_cpu_guest_regs(current, regs, NULL);
        if ( unlikely(ring_3(regs)) )
        {
    default:
            regs->eax = -EPERM;
            return HVM_HCALL_completed;
        }
    case 0:
        break;
    }

    if ( (eax >= NR_hypercalls) || !hvm_hypercall32_table[eax] )
    {
        regs->eax = -ENOSYS;
        return HVM_HCALL_completed;
    }

    /*
     * NB. In future flush only on decrease_reservation.
     * For now we also need to flush when pages are added, as qemu-dm is not
     * yet capable of faulting pages into an existing valid mapcache bucket.
     */
    flush = ((eax == __HYPERVISOR_memory_op) ||
             (eax == __HYPERVISOR_grant_table_op)); /* needed ? */
    this_cpu(hc_preempted) = 0;

#ifdef __x86_64__
    if ( mode == 8 )
    {
        HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u(%lx, %lx, %lx, %lx, %lx)", eax,
                    regs->rdi, regs->rsi, regs->rdx, regs->r10, regs->r8);

        regs->rax = hvm_hypercall64_table[eax](regs->rdi,
                                               regs->rsi,
                                               regs->rdx,
                                               regs->r10,
                                               regs->r8);
    }
    else
#endif
    {
        HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u(%x, %x, %x, %x, %x)", eax,
                    (uint32_t)regs->ebx, (uint32_t)regs->ecx,
                    (uint32_t)regs->edx, (uint32_t)regs->esi,
                    (uint32_t)regs->edi);

        regs->eax = hvm_hypercall32_table[eax]((uint32_t)regs->ebx,
                                               (uint32_t)regs->ecx,
                                               (uint32_t)regs->edx,
                                               (uint32_t)regs->esi,
                                               (uint32_t)regs->edi);
    }

    HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u -> %lx",
                eax, (unsigned long)regs->eax);

    return (this_cpu(hc_preempted) ? HVM_HCALL_preempted :
            flush ? HVM_HCALL_invalidate : HVM_HCALL_completed);
}

static void hvm_latch_shinfo_size(struct domain *d)
{
    /*
     * Called from operations which are among the very first executed by
     * PV drivers on initialisation or after save/restore. These are sensible
     * points at which to sample the execution mode of the guest and latch
     * 32- or 64-bit format for shared state.
     */
    if ( current->domain == d )
        d->arch.has_32bit_shinfo = (hvm_guest_x86_mode(current) != 8);
}

/* Initialise a hypercall transfer page for a VMX domain using
   paravirtualised drivers. */
void hvm_hypercall_page_initialise(struct domain *d,
                                   void *hypercall_page)
{
    hvm_latch_shinfo_size(d);
    hvm_funcs.init_hypercall_page(d, hypercall_page);
}
/*
 * only called in HVM domain BSP context
 * when booting, vcpuid is always equal to apic_id
 */
int hvm_bringup_ap(int vcpuid, int trampoline_vector)
{
    struct vcpu *v;
    struct domain *d = current->domain;
    struct vcpu_guest_context *ctxt;
    int rc = 0;

    BUG_ON(!is_hvm_domain(d));

    if ( (v = d->vcpu[vcpuid]) == NULL )
        return -ENOENT;

    if ( (ctxt = xmalloc(struct vcpu_guest_context)) == NULL )
    {
        gdprintk(XENLOG_ERR,
                 "Failed to allocate memory in hvm_bringup_ap.\n");
        return -ENOMEM;
    }

    hvm_init_ap_context(ctxt, vcpuid, trampoline_vector);

    /* Sync AP's TSC with BSP's. */
    v->arch.hvm_vcpu.cache_tsc_offset =
        v->domain->vcpu[0]->arch.hvm_vcpu.cache_tsc_offset;
    hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset);

    LOCK_BIGLOCK(d);
    rc = -EEXIST;
    if ( !v->is_initialised )
        rc = boot_vcpu(d, vcpuid, ctxt);
    UNLOCK_BIGLOCK(d);

    if ( rc != 0 )
    {
        gdprintk(XENLOG_ERR,
                 "AP %d bringup failed in boot_vcpu %x.\n", vcpuid, rc);
        goto out;
    }

    if ( test_and_clear_bit(_VPF_down, &v->pause_flags) )
        vcpu_wake(v);
    gdprintk(XENLOG_INFO, "AP %d bringup succeeded.\n", vcpuid);

 out:
    xfree(ctxt);
    return rc;
}
static int hvmop_set_pci_intx_level(
    XEN_GUEST_HANDLE(xen_hvm_set_pci_intx_level_t) uop)
{
    struct xen_hvm_set_pci_intx_level op;
    struct domain *d;
    int rc;

    if ( copy_from_guest(&op, uop, 1) )
        return -EFAULT;

    if ( !IS_PRIV(current->domain) )
        return -EPERM;

    if ( (op.domain > 0) || (op.bus > 0) || (op.device > 31) || (op.intx > 3) )
        return -EINVAL;

    d = rcu_lock_domain_by_id(op.domid);
    if ( d == NULL )
        return -ESRCH;

    rc = -EINVAL;
    if ( !is_hvm_domain(d) )
        goto out;

    rc = 0;
    switch ( op.level )
    {
    case 0:
        hvm_pci_intx_deassert(d, op.device, op.intx);
        break;
    case 1:
        hvm_pci_intx_assert(d, op.device, op.intx);
        break;
    default:
        rc = -EINVAL;
        break;
    }

 out:
    rcu_unlock_domain(d);
    return rc;
}
static int hvmop_set_isa_irq_level(
    XEN_GUEST_HANDLE(xen_hvm_set_isa_irq_level_t) uop)
{
    struct xen_hvm_set_isa_irq_level op;
    struct domain *d;
    int rc;

    if ( copy_from_guest(&op, uop, 1) )
        return -EFAULT;

    if ( !IS_PRIV(current->domain) )
        return -EPERM;

    if ( op.isa_irq > 15 )
        return -EINVAL;

    d = rcu_lock_domain_by_id(op.domid);
    if ( d == NULL )
        return -ESRCH;

    rc = -EINVAL;
    if ( !is_hvm_domain(d) )
        goto out;

    rc = 0;
    switch ( op.level )
    {
    case 0:
        hvm_isa_irq_deassert(d, op.isa_irq);
        break;
    case 1:
        hvm_isa_irq_assert(d, op.isa_irq);
        break;
    default:
        rc = -EINVAL;
        break;
    }

 out:
    rcu_unlock_domain(d);
    return rc;
}
static int hvmop_set_pci_link_route(
    XEN_GUEST_HANDLE(xen_hvm_set_pci_link_route_t) uop)
{
    struct xen_hvm_set_pci_link_route op;
    struct domain *d;
    int rc;

    if ( copy_from_guest(&op, uop, 1) )
        return -EFAULT;

    if ( !IS_PRIV(current->domain) )
        return -EPERM;

    if ( (op.link > 3) || (op.isa_irq > 15) )
        return -EINVAL;

    d = rcu_lock_domain_by_id(op.domid);
    if ( d == NULL )
        return -ESRCH;

    rc = -EINVAL;
    if ( !is_hvm_domain(d) )
        goto out;

    rc = 0;
    hvm_set_pci_link_route(d, op.link, op.isa_irq);

 out:
    rcu_unlock_domain(d);
    return rc;
}

static int hvmop_flush_tlb_all(void)
{
    flush_tlb_mask(current->domain->domain_dirty_cpumask);
    return 0;
}
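
/*
 * hvmop_flush_tlb_all() backs the HVMOP_flush_tlbs operation added by this
 * changeset: do_hvm_op() below accepts it only with a null argument handle,
 * and the implementation flushes the TLBs of all physical CPUs in the
 * domain's dirty cpumask.
 */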
long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE(void) arg)
{
    long rc = 0;

    switch ( op )
    {
    case HVMOP_set_param:
    case HVMOP_get_param:
    {
        struct xen_hvm_param a;
        struct hvm_ioreq_page *iorp;
        struct domain *d;
        struct vcpu *v;

        if ( copy_from_guest(&a, arg, 1) )
            return -EFAULT;

        if ( a.index >= HVM_NR_PARAMS )
            return -EINVAL;

        if ( a.domid == DOMID_SELF )
            d = rcu_lock_current_domain();
        else if ( IS_PRIV(current->domain) )
            d = rcu_lock_domain_by_id(a.domid);
        else
            return -EPERM;

        if ( d == NULL )
            return -ESRCH;

        rc = -EINVAL;
        if ( !is_hvm_domain(d) )
            goto param_fail;

        if ( op == HVMOP_set_param )
        {
            switch ( a.index )
            {
            case HVM_PARAM_IOREQ_PFN:
                iorp = &d->arch.hvm_domain.ioreq;
                rc = hvm_set_ioreq_page(d, iorp, a.value);
                spin_lock(&iorp->lock);
                if ( (rc == 0) && (iorp->va != NULL) )
                    /* Initialise evtchn port info if VCPUs already created. */
                    for_each_vcpu ( d, v )
                        get_ioreq(v)->vp_eport = v->arch.hvm_vcpu.xen_port;
                spin_unlock(&iorp->lock);
                break;
            case HVM_PARAM_BUFIOREQ_PFN:
                iorp = &d->arch.hvm_domain.buf_ioreq;
                rc = hvm_set_ioreq_page(d, iorp, a.value);
                break;
            case HVM_PARAM_CALLBACK_IRQ:
                hvm_set_callback_via(d, a.value);
                hvm_latch_shinfo_size(d);
                break;
            }
            d->arch.hvm_domain.params[a.index] = a.value;
            rc = 0;
        }
        else
        {
            a.value = d->arch.hvm_domain.params[a.index];
            rc = copy_to_guest(arg, &a, 1) ? -EFAULT : 0;
        }

        HVM_DBG_LOG(DBG_LEVEL_HCALL, "%s param %u = %"PRIx64,
                    op == HVMOP_set_param ? "set" : "get",
                    a.index, a.value);

    param_fail:
        rcu_unlock_domain(d);
        break;
    }

    case HVMOP_set_pci_intx_level:
        rc = hvmop_set_pci_intx_level(
            guest_handle_cast(arg, xen_hvm_set_pci_intx_level_t));
        break;

    case HVMOP_set_isa_irq_level:
        rc = hvmop_set_isa_irq_level(
            guest_handle_cast(arg, xen_hvm_set_isa_irq_level_t));
        break;

    case HVMOP_set_pci_link_route:
        rc = hvmop_set_pci_link_route(
            guest_handle_cast(arg, xen_hvm_set_pci_link_route_t));
        break;

    case HVMOP_flush_tlbs:
        rc = guest_handle_is_null(arg) ? hvmop_flush_tlb_all() : -ENOSYS;
        break;

    default:
    {
        gdprintk(XENLOG_WARNING, "Bad HVM op %ld.\n", op);
        rc = -ENOSYS;
        break;
    }
    }

    return rc;
}
/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */