ia64/xen-unstable

view xen/arch/x86/hvm/hvm.c @ 16383:ef4b60c99735

x86, hvm: Small code cleanups.
Based on patch from Xin Li.
Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Nov 16 16:22:00 2007 +0000 (2007-11-16)
parents db9f62d8f7f4
children 0f9b5ab59579
line source
1 /*
2 * hvm.c: Common hardware virtual machine abstractions.
3 *
4 * Copyright (c) 2004, Intel Corporation.
5 * Copyright (c) 2005, International Business Machines Corporation.
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms and conditions of the GNU General Public License,
9 * version 2, as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 *
16 * You should have received a copy of the GNU General Public License along with
17 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
18 * Place - Suite 330, Boston, MA 02111-1307 USA.
19 */
21 #include <xen/config.h>
22 #include <xen/init.h>
23 #include <xen/lib.h>
24 #include <xen/trace.h>
25 #include <xen/sched.h>
26 #include <xen/irq.h>
27 #include <xen/softirq.h>
28 #include <xen/domain.h>
29 #include <xen/domain_page.h>
30 #include <xen/hypercall.h>
31 #include <xen/guest_access.h>
32 #include <xen/event.h>
33 #include <asm/current.h>
34 #include <asm/e820.h>
35 #include <asm/io.h>
36 #include <asm/paging.h>
37 #include <asm/regs.h>
38 #include <asm/cpufeature.h>
39 #include <asm/processor.h>
40 #include <asm/types.h>
41 #include <asm/msr.h>
42 #include <asm/mc146818rtc.h>
43 #include <asm/spinlock.h>
44 #include <asm/hvm/hvm.h>
45 #include <asm/hvm/vpt.h>
46 #include <asm/hvm/support.h>
47 #include <asm/hvm/cacheattr.h>
48 #include <public/sched.h>
49 #include <public/hvm/ioreq.h>
50 #include <public/version.h>
51 #include <public/memory.h>
53 /*
54 * Xen command-line option to allow/disallow hardware-assisted paging.
55 * Since the phys-to-machine table of AMD NPT is in host format, 32-bit Xen
56 * can only support guests using NPT with up to a 4GB memory map. Therefore
57 * we disallow HAP by default on PAE Xen (by default we want to support an
58 * 8GB pseudophysical memory map for HVM guests on a PAE host).
59 */
60 static int opt_hap_permitted = (CONFIG_PAGING_LEVELS != 3);
61 boolean_param("hap", opt_hap_permitted);
63 int hvm_enabled __read_mostly;
65 unsigned int opt_hvm_debug_level __read_mostly;
66 integer_param("hvm_debug", opt_hvm_debug_level);
68 struct hvm_function_table hvm_funcs __read_mostly;
70 /* I/O permission bitmap is globally shared by all HVM guests. */
71 char __attribute__ ((__section__ (".bss.page_aligned")))
72 hvm_io_bitmap[3*PAGE_SIZE];
74 void hvm_enable(struct hvm_function_table *fns)
75 {
76 BUG_ON(hvm_enabled);
77 printk("HVM: %s enabled\n", fns->name);
79 /*
80 * Allow direct access to the PC debug port (it is often used for I/O
81 * delays, but the vmexits simply slow things down).
82 */
83 memset(hvm_io_bitmap, ~0, sizeof(hvm_io_bitmap));
84 clear_bit(0x80, hvm_io_bitmap);
86 hvm_funcs = *fns;
87 hvm_enabled = 1;
89 if ( hvm_funcs.hap_supported )
90 {
91 if ( !opt_hap_permitted )
92 hvm_funcs.hap_supported = 0;
93 printk("HVM: Hardware Assisted Paging detected %s.\n",
94 hvm_funcs.hap_supported ? "and enabled" : "but disabled");
95 }
96 }
98 void hvm_set_guest_tsc(struct vcpu *v, u64 guest_tsc)
99 {
100 u64 host_tsc;
102 rdtscll(host_tsc);
104 v->arch.hvm_vcpu.cache_tsc_offset = guest_tsc - host_tsc;
105 hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset);
106 }
108 u64 hvm_get_guest_tsc(struct vcpu *v)
109 {
110 u64 host_tsc;
112 rdtscll(host_tsc);
113 return host_tsc + v->arch.hvm_vcpu.cache_tsc_offset;
114 }
/* Invoke the RTC, HPET and periodic-timer migration helpers for @v. */
void hvm_migrate_timers(struct vcpu *v)
{
    rtc_migrate_timers(v);
    hpet_migrate_timers(v);
    pt_migrate(v);
}
123 void hvm_do_resume(struct vcpu *v)
124 {
125 ioreq_t *p;
127 if ( !v->fpu_dirtied )
128 hvm_funcs.stts(v);
130 pt_restore_timer(v);
132 /* NB. Optimised for common case (p->state == STATE_IOREQ_NONE). */
133 p = &get_ioreq(v)->vp_ioreq;
134 while ( p->state != STATE_IOREQ_NONE )
135 {
136 switch ( p->state )
137 {
138 case STATE_IORESP_READY: /* IORESP_READY -> NONE */
139 hvm_io_assist();
140 break;
141 case STATE_IOREQ_READY: /* IOREQ_{READY,INPROCESS} -> IORESP_READY */
142 case STATE_IOREQ_INPROCESS:
143 wait_on_xen_event_channel(v->arch.hvm_vcpu.xen_port,
144 (p->state != STATE_IOREQ_READY) &&
145 (p->state != STATE_IOREQ_INPROCESS));
146 break;
147 default:
148 gdprintk(XENLOG_ERR, "Weird HVM iorequest state %d.\n", p->state);
149 domain_crash_synchronous();
150 }
151 }
152 }
154 static void hvm_init_ioreq_page(
155 struct domain *d, struct hvm_ioreq_page *iorp)
156 {
157 memset(iorp, 0, sizeof(*iorp));
158 spin_lock_init(&iorp->lock);
159 domain_pause(d);
160 }
162 static void hvm_destroy_ioreq_page(
163 struct domain *d, struct hvm_ioreq_page *iorp)
164 {
165 spin_lock(&iorp->lock);
167 ASSERT(d->is_dying);
169 if ( iorp->va != NULL )
170 {
171 unmap_domain_page_global(iorp->va);
172 put_page_and_type(iorp->page);
173 iorp->va = NULL;
174 }
176 spin_unlock(&iorp->lock);
177 }
179 static int hvm_set_ioreq_page(
180 struct domain *d, struct hvm_ioreq_page *iorp, unsigned long gmfn)
181 {
182 struct page_info *page;
183 p2m_type_t p2mt;
184 unsigned long mfn;
185 void *va;
187 mfn = mfn_x(gfn_to_mfn(d, gmfn, &p2mt));
188 if ( !p2m_is_ram(p2mt) )
189 return -EINVAL;
190 ASSERT(mfn_valid(mfn));
192 page = mfn_to_page(mfn);
193 if ( !get_page_and_type(page, d, PGT_writable_page) )
194 return -EINVAL;
196 va = map_domain_page_global(mfn);
197 if ( va == NULL )
198 {
199 put_page_and_type(page);
200 return -ENOMEM;
201 }
203 spin_lock(&iorp->lock);
205 if ( (iorp->va != NULL) || d->is_dying )
206 {
207 spin_unlock(&iorp->lock);
208 unmap_domain_page_global(va);
209 put_page_and_type(mfn_to_page(mfn));
210 return -EINVAL;
211 }
213 iorp->va = va;
214 iorp->page = page;
216 spin_unlock(&iorp->lock);
218 domain_unpause(d);
220 return 0;
221 }
223 int hvm_domain_initialise(struct domain *d)
224 {
225 int rc;
227 if ( !hvm_enabled )
228 {
229 gdprintk(XENLOG_WARNING, "Attempt to create a HVM guest "
230 "on a non-VT/AMDV platform.\n");
231 return -EINVAL;
232 }
234 spin_lock_init(&d->arch.hvm_domain.pbuf_lock);
235 spin_lock_init(&d->arch.hvm_domain.irq_lock);
236 spin_lock_init(&d->arch.hvm_domain.uc_lock);
238 hvm_init_cacheattr_region_list(d);
240 rc = paging_enable(d, PG_refcounts|PG_translate|PG_external);
241 if ( rc != 0 )
242 goto fail1;
244 vpic_init(d);
246 rc = vioapic_init(d);
247 if ( rc != 0 )
248 goto fail1;
250 stdvga_init(d);
252 hvm_init_ioreq_page(d, &d->arch.hvm_domain.ioreq);
253 hvm_init_ioreq_page(d, &d->arch.hvm_domain.buf_ioreq);
255 rc = hvm_funcs.domain_initialise(d);
256 if ( rc != 0 )
257 goto fail2;
259 return 0;
261 fail2:
262 vioapic_deinit(d);
263 fail1:
264 hvm_destroy_cacheattr_region_list(d);
265 return rc;
266 }
268 void hvm_domain_relinquish_resources(struct domain *d)
269 {
270 hvm_destroy_ioreq_page(d, &d->arch.hvm_domain.ioreq);
271 hvm_destroy_ioreq_page(d, &d->arch.hvm_domain.buf_ioreq);
273 pit_deinit(d);
274 rtc_deinit(d);
275 pmtimer_deinit(d);
276 hpet_deinit(d);
277 stdvga_deinit(d);
278 }
280 void hvm_domain_destroy(struct domain *d)
281 {
282 hvm_funcs.domain_destroy(d);
283 vioapic_deinit(d);
284 hvm_destroy_cacheattr_region_list(d);
285 }
287 static int hvm_save_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
288 {
289 struct vcpu *v;
290 struct hvm_hw_cpu ctxt;
291 struct vcpu_guest_context *vc;
293 for_each_vcpu(d, v)
294 {
295 /* We don't need to save state for a vcpu that is down; the restore
296 * code will leave it down if there is nothing saved. */
297 if ( test_bit(_VPF_down, &v->pause_flags) )
298 continue;
300 /* Architecture-specific vmcs/vmcb bits */
301 hvm_funcs.save_cpu_ctxt(v, &ctxt);
303 /* Other vcpu register state */
304 vc = &v->arch.guest_context;
305 if ( v->fpu_initialised )
306 memcpy(ctxt.fpu_regs, &vc->fpu_ctxt, sizeof(ctxt.fpu_regs));
307 else
308 memset(ctxt.fpu_regs, 0, sizeof(ctxt.fpu_regs));
309 ctxt.rax = vc->user_regs.eax;
310 ctxt.rbx = vc->user_regs.ebx;
311 ctxt.rcx = vc->user_regs.ecx;
312 ctxt.rdx = vc->user_regs.edx;
313 ctxt.rbp = vc->user_regs.ebp;
314 ctxt.rsi = vc->user_regs.esi;
315 ctxt.rdi = vc->user_regs.edi;
316 ctxt.rsp = vc->user_regs.esp;
317 ctxt.rip = vc->user_regs.eip;
318 ctxt.rflags = vc->user_regs.eflags;
319 #ifdef __x86_64__
320 ctxt.r8 = vc->user_regs.r8;
321 ctxt.r9 = vc->user_regs.r9;
322 ctxt.r10 = vc->user_regs.r10;
323 ctxt.r11 = vc->user_regs.r11;
324 ctxt.r12 = vc->user_regs.r12;
325 ctxt.r13 = vc->user_regs.r13;
326 ctxt.r14 = vc->user_regs.r14;
327 ctxt.r15 = vc->user_regs.r15;
328 #endif
329 ctxt.dr0 = vc->debugreg[0];
330 ctxt.dr1 = vc->debugreg[1];
331 ctxt.dr2 = vc->debugreg[2];
332 ctxt.dr3 = vc->debugreg[3];
333 ctxt.dr6 = vc->debugreg[6];
334 ctxt.dr7 = vc->debugreg[7];
336 if ( hvm_save_entry(CPU, v->vcpu_id, h, &ctxt) != 0 )
337 return 1;
338 }
339 return 0;
340 }
342 static int hvm_load_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
343 {
344 int vcpuid, rc;
345 struct vcpu *v;
346 struct hvm_hw_cpu ctxt;
347 struct vcpu_guest_context *vc;
349 /* Which vcpu is this? */
350 vcpuid = hvm_load_instance(h);
351 if ( vcpuid > MAX_VIRT_CPUS || (v = d->vcpu[vcpuid]) == NULL )
352 {
353 gdprintk(XENLOG_ERR, "HVM restore: domain has no vcpu %u\n", vcpuid);
354 return -EINVAL;
355 }
356 vc = &v->arch.guest_context;
358 /* Need to init this vcpu before loading its contents */
359 LOCK_BIGLOCK(d);
360 if ( !v->is_initialised )
361 if ( (rc = boot_vcpu(d, vcpuid, vc)) != 0 )
362 return rc;
363 UNLOCK_BIGLOCK(d);
365 if ( hvm_load_entry(CPU, h, &ctxt) != 0 )
366 return -EINVAL;
368 /* Sanity check some control registers. */
369 if ( (ctxt.cr0 & HVM_CR0_GUEST_RESERVED_BITS) ||
370 !(ctxt.cr0 & X86_CR0_ET) ||
371 ((ctxt.cr0 & (X86_CR0_PE|X86_CR0_PG)) == X86_CR0_PG) )
372 {
373 gdprintk(XENLOG_ERR, "HVM restore: bad CR0 0x%"PRIx64"\n",
374 ctxt.msr_efer);
375 return -EINVAL;
376 }
378 if ( ctxt.cr4 & HVM_CR4_GUEST_RESERVED_BITS )
379 {
380 gdprintk(XENLOG_ERR, "HVM restore: bad CR4 0x%"PRIx64"\n",
381 ctxt.msr_efer);
382 return -EINVAL;
383 }
385 if ( (ctxt.msr_efer & ~(EFER_FFXSE | EFER_LME | EFER_LMA |
386 EFER_NX | EFER_SCE)) ||
387 ((sizeof(long) != 8) && (ctxt.msr_efer & EFER_LME)) ||
388 (!cpu_has_nx && (ctxt.msr_efer & EFER_NX)) ||
389 (!cpu_has_syscall && (ctxt.msr_efer & EFER_SCE)) ||
390 (!cpu_has_ffxsr && (ctxt.msr_efer & EFER_FFXSE)) ||
391 ((ctxt.msr_efer & (EFER_LME|EFER_LMA)) == EFER_LMA) )
392 {
393 gdprintk(XENLOG_ERR, "HVM restore: bad EFER 0x%"PRIx64"\n",
394 ctxt.msr_efer);
395 return -EINVAL;
396 }
398 /* Architecture-specific vmcs/vmcb bits */
399 if ( hvm_funcs.load_cpu_ctxt(v, &ctxt) < 0 )
400 return -EINVAL;
402 /* Other vcpu register state */
403 memcpy(&vc->fpu_ctxt, ctxt.fpu_regs, sizeof(ctxt.fpu_regs));
404 vc->user_regs.eax = ctxt.rax;
405 vc->user_regs.ebx = ctxt.rbx;
406 vc->user_regs.ecx = ctxt.rcx;
407 vc->user_regs.edx = ctxt.rdx;
408 vc->user_regs.ebp = ctxt.rbp;
409 vc->user_regs.esi = ctxt.rsi;
410 vc->user_regs.edi = ctxt.rdi;
411 vc->user_regs.esp = ctxt.rsp;
412 vc->user_regs.eip = ctxt.rip;
413 vc->user_regs.eflags = ctxt.rflags | 2;
414 #ifdef __x86_64__
415 vc->user_regs.r8 = ctxt.r8;
416 vc->user_regs.r9 = ctxt.r9;
417 vc->user_regs.r10 = ctxt.r10;
418 vc->user_regs.r11 = ctxt.r11;
419 vc->user_regs.r12 = ctxt.r12;
420 vc->user_regs.r13 = ctxt.r13;
421 vc->user_regs.r14 = ctxt.r14;
422 vc->user_regs.r15 = ctxt.r15;
423 #endif
424 vc->debugreg[0] = ctxt.dr0;
425 vc->debugreg[1] = ctxt.dr1;
426 vc->debugreg[2] = ctxt.dr2;
427 vc->debugreg[3] = ctxt.dr3;
428 vc->debugreg[6] = ctxt.dr6;
429 vc->debugreg[7] = ctxt.dr7;
431 vc->flags = VGCF_online;
432 v->fpu_initialised = 1;
434 /* Auxiliary processors should be woken immediately. */
435 if ( test_and_clear_bit(_VPF_down, &v->pause_flags) )
436 vcpu_wake(v);
438 return 0;
439 }
441 HVM_REGISTER_SAVE_RESTORE(CPU, hvm_save_cpu_ctxt, hvm_load_cpu_ctxt,
442 1, HVMSR_PER_VCPU);
444 extern int reset_vmsr(struct mtrr_state *m, u64 *p);
446 int hvm_vcpu_initialise(struct vcpu *v)
447 {
448 int rc;
450 if ( (rc = vlapic_init(v)) != 0 )
451 goto fail1;
453 if ( (rc = hvm_funcs.vcpu_initialise(v)) != 0 )
454 goto fail2;
456 /* Create ioreq event channel. */
457 rc = alloc_unbound_xen_event_channel(v, 0);
458 if ( rc < 0 )
459 goto fail3;
461 /* Register ioreq event channel. */
462 v->arch.hvm_vcpu.xen_port = rc;
463 spin_lock(&v->domain->arch.hvm_domain.ioreq.lock);
464 if ( v->domain->arch.hvm_domain.ioreq.va != NULL )
465 get_ioreq(v)->vp_eport = v->arch.hvm_vcpu.xen_port;
466 spin_unlock(&v->domain->arch.hvm_domain.ioreq.lock);
468 spin_lock_init(&v->arch.hvm_vcpu.tm_lock);
469 INIT_LIST_HEAD(&v->arch.hvm_vcpu.tm_list);
471 rc = reset_vmsr(&v->arch.hvm_vcpu.mtrr, &v->arch.hvm_vcpu.pat_cr);
472 if ( rc != 0 )
473 goto fail3;
475 v->arch.guest_context.user_regs.eflags = 2;
477 if ( v->vcpu_id == 0 )
478 {
479 /* NB. All these really belong in hvm_domain_initialise(). */
480 pit_init(v, cpu_khz);
481 rtc_init(v, RTC_PORT(0));
482 pmtimer_init(v);
483 hpet_init(v);
485 /* Init guest TSC to start from zero. */
486 hvm_set_guest_time(v, 0);
488 /* Can start up without SIPI-SIPI or setvcpucontext domctl. */
489 v->is_initialised = 1;
490 clear_bit(_VPF_down, &v->pause_flags);
491 }
493 return 0;
495 fail3:
496 hvm_funcs.vcpu_destroy(v);
497 fail2:
498 vlapic_destroy(v);
499 fail1:
500 return rc;
501 }
503 void hvm_vcpu_destroy(struct vcpu *v)
504 {
505 vlapic_destroy(v);
506 hvm_funcs.vcpu_destroy(v);
508 /* Event channel is already freed by evtchn_destroy(). */
509 /*free_xen_event_channel(v, v->arch.hvm_vcpu.xen_port);*/
510 }
513 void hvm_vcpu_reset(struct vcpu *v)
514 {
515 vcpu_pause(v);
517 vlapic_reset(vcpu_vlapic(v));
519 hvm_funcs.vcpu_initialise(v);
521 set_bit(_VPF_down, &v->pause_flags);
522 clear_bit(_VPF_blocked, &v->pause_flags);
523 v->fpu_initialised = 0;
524 v->fpu_dirtied = 0;
525 v->is_initialised = 0;
527 vcpu_unpause(v);
528 }
530 static void hvm_vcpu_down(void)
531 {
532 struct vcpu *v = current;
533 struct domain *d = v->domain;
534 int online_count = 0;
536 gdprintk(XENLOG_INFO, "DOM%d/VCPU%d: going offline.\n",
537 d->domain_id, v->vcpu_id);
539 /* Doesn't halt us immediately, but we'll never return to guest context. */
540 set_bit(_VPF_down, &v->pause_flags);
541 vcpu_sleep_nosync(v);
543 /* Any other VCPUs online? ... */
544 LOCK_BIGLOCK(d);
545 for_each_vcpu ( d, v )
546 if ( !test_bit(_VPF_down, &v->pause_flags) )
547 online_count++;
548 UNLOCK_BIGLOCK(d);
550 /* ... Shut down the domain if not. */
551 if ( online_count == 0 )
552 {
553 gdprintk(XENLOG_INFO, "DOM%d: all CPUs offline -- powering off.\n",
554 d->domain_id);
555 domain_shutdown(d, SHUTDOWN_poweroff);
556 }
557 }
559 void hvm_send_assist_req(struct vcpu *v)
560 {
561 ioreq_t *p;
563 if ( unlikely(!vcpu_start_shutdown_deferral(v)) )
564 return; /* implicitly bins the i/o operation */
566 p = &get_ioreq(v)->vp_ioreq;
567 if ( unlikely(p->state != STATE_IOREQ_NONE) )
568 {
569 /* This indicates a bug in the device model. Crash the domain. */
570 gdprintk(XENLOG_ERR, "Device model set bad IO state %d.\n", p->state);
571 domain_crash_synchronous();
572 }
574 prepare_wait_on_xen_event_channel(v->arch.hvm_vcpu.xen_port);
576 /*
577 * Following happens /after/ blocking and setting up ioreq contents.
578 * prepare_wait_on_xen_event_channel() is an implicit barrier.
579 */
580 p->state = STATE_IOREQ_READY;
581 notify_via_xen_event_channel(v->arch.hvm_vcpu.xen_port);
582 }
584 void hvm_hlt(unsigned long rflags)
585 {
586 /*
587 * If we halt with interrupts disabled, that's a pretty sure sign that we
588 * want to shut down. In a real processor, NMIs are the only way to break
589 * out of this.
590 */
591 if ( unlikely(!(rflags & X86_EFLAGS_IF)) )
592 return hvm_vcpu_down();
594 do_sched_op_compat(SCHEDOP_block, 0);
595 }
597 void hvm_triple_fault(void)
598 {
599 struct vcpu *v = current;
600 gdprintk(XENLOG_INFO, "Triple fault on VCPU%d - "
601 "invoking HVM system reset.\n", v->vcpu_id);
602 domain_shutdown(v->domain, SHUTDOWN_reboot);
603 }
605 int hvm_set_efer(uint64_t value)
606 {
607 struct vcpu *v = current;
609 value &= ~EFER_LMA;
611 if ( (value & ~(EFER_FFXSE | EFER_LME | EFER_NX | EFER_SCE)) ||
612 ((sizeof(long) != 8) && (value & EFER_LME)) ||
613 (!cpu_has_nx && (value & EFER_NX)) ||
614 (!cpu_has_syscall && (value & EFER_SCE)) ||
615 (!cpu_has_ffxsr && (value & EFER_FFXSE)) )
616 {
617 gdprintk(XENLOG_WARNING, "Trying to set reserved bit in "
618 "EFER: %"PRIx64"\n", value);
619 hvm_inject_exception(TRAP_gp_fault, 0, 0);
620 return 0;
621 }
623 if ( ((value ^ v->arch.hvm_vcpu.guest_efer) & EFER_LME) &&
624 hvm_paging_enabled(v) )
625 {
626 gdprintk(XENLOG_WARNING,
627 "Trying to change EFER.LME with paging enabled\n");
628 hvm_inject_exception(TRAP_gp_fault, 0, 0);
629 return 0;
630 }
632 value |= v->arch.hvm_vcpu.guest_efer & EFER_LMA;
633 v->arch.hvm_vcpu.guest_efer = value;
634 hvm_update_guest_efer(v);
636 return 1;
637 }
639 extern void shadow_blow_tables_per_domain(struct domain *d);
640 extern bool_t mtrr_pat_not_equal(struct vcpu *vd, struct vcpu *vs);
642 /* Exit UC mode only if all VCPUs agree on MTRR/PAT and are not in no_fill. */
643 static bool_t domain_exit_uc_mode(struct vcpu *v)
644 {
645 struct domain *d = v->domain;
646 struct vcpu *vs;
648 for_each_vcpu ( d, vs )
649 {
650 if ( (vs == v) || !vs->is_initialised )
651 continue;
652 if ( (vs->arch.hvm_vcpu.cache_mode == NO_FILL_CACHE_MODE) ||
653 mtrr_pat_not_equal(vs, v) )
654 return 0;
655 }
657 return 1;
658 }
/* Callback passed to on_each_cpu(): executes wbinvd(). @info is unused. */
static void local_flush_cache(void *info)
{
    wbinvd();
}
665 int hvm_set_cr0(unsigned long value)
666 {
667 struct vcpu *v = current;
668 p2m_type_t p2mt;
669 unsigned long gfn, mfn, old_value = v->arch.hvm_vcpu.guest_cr[0];
671 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx", value);
673 if ( (u32)value != value )
674 {
675 HVM_DBG_LOG(DBG_LEVEL_1,
676 "Guest attempts to set upper 32 bits in CR0: %lx",
677 value);
678 hvm_inject_exception(TRAP_gp_fault, 0, 0);
679 return 0;
680 }
682 value &= ~HVM_CR0_GUEST_RESERVED_BITS;
684 /* ET is reserved and should be always be 1. */
685 value |= X86_CR0_ET;
687 if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PG )
688 {
689 hvm_inject_exception(TRAP_gp_fault, 0, 0);
690 return 0;
691 }
693 if ( (value & X86_CR0_PG) && !(old_value & X86_CR0_PG) )
694 {
695 if ( v->arch.hvm_vcpu.guest_efer & EFER_LME )
696 {
697 if ( !(v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PAE) )
698 {
699 HVM_DBG_LOG(DBG_LEVEL_1, "Enable paging before PAE enable");
700 hvm_inject_exception(TRAP_gp_fault, 0, 0);
701 return 0;
702 }
703 HVM_DBG_LOG(DBG_LEVEL_1, "Enabling long mode");
704 v->arch.hvm_vcpu.guest_efer |= EFER_LMA;
705 hvm_update_guest_efer(v);
706 }
708 if ( !paging_mode_hap(v->domain) )
709 {
710 /* The guest CR3 must be pointing to the guest physical. */
711 gfn = v->arch.hvm_vcpu.guest_cr[3]>>PAGE_SHIFT;
712 mfn = mfn_x(gfn_to_mfn_current(gfn, &p2mt));
713 if ( !p2m_is_ram(p2mt) || !mfn_valid(mfn) ||
714 !get_page(mfn_to_page(mfn), v->domain))
715 {
716 gdprintk(XENLOG_ERR, "Invalid CR3 value = %lx (mfn=%lx)\n",
717 v->arch.hvm_vcpu.guest_cr[3], mfn);
718 domain_crash(v->domain);
719 return 0;
720 }
722 /* Now arch.guest_table points to machine physical. */
723 v->arch.guest_table = pagetable_from_pfn(mfn);
725 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
726 v->arch.hvm_vcpu.guest_cr[3], mfn);
727 }
728 }
729 else if ( !(value & X86_CR0_PG) && (old_value & X86_CR0_PG) )
730 {
731 /* When CR0.PG is cleared, LMA is cleared immediately. */
732 if ( hvm_long_mode_enabled(v) )
733 {
734 v->arch.hvm_vcpu.guest_efer &= ~EFER_LMA;
735 hvm_update_guest_efer(v);
736 }
738 if ( !paging_mode_hap(v->domain) )
739 {
740 put_page(pagetable_get_page(v->arch.guest_table));
741 v->arch.guest_table = pagetable_null();
742 }
743 }
745 if ( !list_empty(&domain_hvm_iommu(v->domain)->pdev_list) )
746 {
747 if ( (value & X86_CR0_CD) && !(value & X86_CR0_NW) )
748 {
749 /* Entering no fill cache mode. */
750 spin_lock(&v->domain->arch.hvm_domain.uc_lock);
751 v->arch.hvm_vcpu.cache_mode = NO_FILL_CACHE_MODE;
753 if ( !v->domain->arch.hvm_domain.is_in_uc_mode )
754 {
755 /* Flush physical caches. */
756 on_each_cpu(local_flush_cache, NULL, 1, 1);
757 /* Shadow pagetables must recognise UC mode. */
758 v->domain->arch.hvm_domain.is_in_uc_mode = 1;
759 shadow_blow_tables_per_domain(v->domain);
760 }
761 spin_unlock(&v->domain->arch.hvm_domain.uc_lock);
762 }
763 else if ( !(value & (X86_CR0_CD | X86_CR0_NW)) &&
764 (v->arch.hvm_vcpu.cache_mode == NO_FILL_CACHE_MODE) )
765 {
766 /* Exit from no fill cache mode. */
767 spin_lock(&v->domain->arch.hvm_domain.uc_lock);
768 v->arch.hvm_vcpu.cache_mode = NORMAL_CACHE_MODE;
770 if ( domain_exit_uc_mode(v) )
771 {
772 /* Shadow pagetables must recognise normal caching mode. */
773 v->domain->arch.hvm_domain.is_in_uc_mode = 0;
774 shadow_blow_tables_per_domain(v->domain);
775 }
776 spin_unlock(&v->domain->arch.hvm_domain.uc_lock);
777 }
778 }
780 v->arch.hvm_vcpu.guest_cr[0] = value;
781 hvm_update_guest_cr(v, 0);
783 if ( (value ^ old_value) & X86_CR0_PG )
784 paging_update_paging_modes(v);
786 return 1;
787 }
789 int hvm_set_cr3(unsigned long value)
790 {
791 unsigned long mfn;
792 p2m_type_t p2mt;
793 struct vcpu *v = current;
795 if ( hvm_paging_enabled(v) && !paging_mode_hap(v->domain) &&
796 (value != v->arch.hvm_vcpu.guest_cr[3]) )
797 {
798 /* Shadow-mode CR3 change. Check PDBR and update refcounts. */
799 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
800 mfn = mfn_x(gfn_to_mfn_current(value >> PAGE_SHIFT, &p2mt));
801 if ( !p2m_is_ram(p2mt) || !mfn_valid(mfn) ||
802 !get_page(mfn_to_page(mfn), v->domain) )
803 goto bad_cr3;
805 put_page(pagetable_get_page(v->arch.guest_table));
806 v->arch.guest_table = pagetable_from_pfn(mfn);
808 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
809 }
811 v->arch.hvm_vcpu.guest_cr[3] = value;
812 paging_update_cr3(v);
813 return 1;
815 bad_cr3:
816 gdprintk(XENLOG_ERR, "Invalid CR3\n");
817 domain_crash(v->domain);
818 return 0;
819 }
821 int hvm_set_cr4(unsigned long value)
822 {
823 struct vcpu *v = current;
824 unsigned long old_cr;
826 if ( value & HVM_CR4_GUEST_RESERVED_BITS )
827 {
828 HVM_DBG_LOG(DBG_LEVEL_1,
829 "Guest attempts to set reserved bit in CR4: %lx",
830 value);
831 goto gpf;
832 }
834 if ( !(value & X86_CR4_PAE) && hvm_long_mode_enabled(v) )
835 {
836 HVM_DBG_LOG(DBG_LEVEL_1, "Guest cleared CR4.PAE while "
837 "EFER.LMA is set");
838 goto gpf;
839 }
841 old_cr = v->arch.hvm_vcpu.guest_cr[4];
842 v->arch.hvm_vcpu.guest_cr[4] = value;
843 hvm_update_guest_cr(v, 4);
845 /* Modifying CR4.{PSE,PAE,PGE} invalidates all TLB entries, inc. Global. */
846 if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
847 paging_update_paging_modes(v);
849 return 1;
851 gpf:
852 hvm_inject_exception(TRAP_gp_fault, 0, 0);
853 return 0;
854 }
856 int hvm_virtual_to_linear_addr(
857 enum x86_segment seg,
858 struct segment_register *reg,
859 unsigned long offset,
860 unsigned int bytes,
861 enum hvm_access_type access_type,
862 unsigned int addr_size,
863 unsigned long *linear_addr)
864 {
865 unsigned long addr = offset;
866 uint32_t last_byte;
868 if ( addr_size != 64 )
869 {
870 /*
871 * COMPATIBILITY MODE: Apply segment checks and add base.
872 */
874 switch ( access_type )
875 {
876 case hvm_access_read:
877 if ( (reg->attr.fields.type & 0xa) == 0x8 )
878 goto gpf; /* execute-only code segment */
879 break;
880 case hvm_access_write:
881 if ( (reg->attr.fields.type & 0xa) != 0x2 )
882 goto gpf; /* not a writable data segment */
883 break;
884 default:
885 break;
886 }
888 last_byte = offset + bytes - 1;
890 /* Is this a grows-down data segment? Special limit check if so. */
891 if ( (reg->attr.fields.type & 0xc) == 0x4 )
892 {
893 /* Is upper limit 0xFFFF or 0xFFFFFFFF? */
894 if ( !reg->attr.fields.db )
895 last_byte = (uint16_t)last_byte;
897 /* Check first byte and last byte against respective bounds. */
898 if ( (offset <= reg->limit) || (last_byte < offset) )
899 goto gpf;
900 }
901 else if ( (last_byte > reg->limit) || (last_byte < offset) )
902 goto gpf; /* last byte is beyond limit or wraps 0xFFFFFFFF */
904 /*
905 * Hardware truncates to 32 bits in compatibility mode.
906 * It does not truncate to 16 bits in 16-bit address-size mode.
907 */
908 addr = (uint32_t)(addr + reg->base);
909 }
910 else
911 {
912 /*
913 * LONG MODE: FS and GS add segment base. Addresses must be canonical.
914 */
916 if ( (seg == x86_seg_fs) || (seg == x86_seg_gs) )
917 addr += reg->base;
919 if ( !is_canonical_address(addr) )
920 goto gpf;
921 }
923 *linear_addr = addr;
924 return 1;
926 gpf:
927 return 0;
928 }
930 static void *hvm_map(unsigned long va, int size)
931 {
932 unsigned long gfn, mfn;
933 p2m_type_t p2mt;
934 uint32_t pfec;
936 if ( ((va & ~PAGE_MASK) + size) > PAGE_SIZE )
937 {
938 hvm_inject_exception(TRAP_page_fault, PFEC_write_access,
939 (va + PAGE_SIZE - 1) & PAGE_MASK);
940 return NULL;
941 }
943 /* We're mapping on behalf of the segment-load logic, which might
944 * write the accessed flags in the descriptors (in 32-bit mode), but
945 * we still treat it as a kernel-mode read (i.e. no access checks). */
946 pfec = PFEC_page_present;
947 gfn = paging_gva_to_gfn(current, va, &pfec);
948 mfn = mfn_x(gfn_to_mfn_current(gfn, &p2mt));
949 if ( !p2m_is_ram(p2mt) )
950 {
951 hvm_inject_exception(TRAP_page_fault, pfec, va);
952 return NULL;
953 }
955 ASSERT(mfn_valid(mfn));
957 paging_mark_dirty(current->domain, mfn);
959 return (char *)map_domain_page(mfn) + (va & ~PAGE_MASK);
960 }
962 static void hvm_unmap(void *p)
963 {
964 if ( p )
965 unmap_domain_page(p);
966 }
968 static int hvm_load_segment_selector(
969 struct vcpu *v, enum x86_segment seg, uint16_t sel)
970 {
971 struct segment_register desctab, cs, segr;
972 struct desc_struct *pdesc, desc;
973 u8 dpl, rpl, cpl;
974 int fault_type = TRAP_invalid_tss;
976 /* NULL selector? */
977 if ( (sel & 0xfffc) == 0 )
978 {
979 if ( (seg == x86_seg_cs) || (seg == x86_seg_ss) )
980 goto fail;
981 memset(&segr, 0, sizeof(segr));
982 hvm_set_segment_register(v, seg, &segr);
983 return 0;
984 }
986 /* LDT descriptor must be in the GDT. */
987 if ( (seg == x86_seg_ldtr) && (sel & 4) )
988 goto fail;
990 hvm_get_segment_register(v, x86_seg_cs, &cs);
991 hvm_get_segment_register(
992 v, (sel & 4) ? x86_seg_ldtr : x86_seg_gdtr, &desctab);
994 /* Check against descriptor table limit. */
995 if ( ((sel & 0xfff8) + 7) > desctab.limit )
996 goto fail;
998 pdesc = hvm_map(desctab.base + (sel & 0xfff8), 8);
999 if ( pdesc == NULL )
1000 goto hvm_map_fail;
1002 do {
1003 desc = *pdesc;
1005 /* Segment present in memory? */
1006 if ( !(desc.b & (1u<<15)) )
1008 fault_type = TRAP_no_segment;
1009 goto unmap_and_fail;
1012 /* LDT descriptor is a system segment. All others are code/data. */
1013 if ( (desc.b & (1u<<12)) == ((seg == x86_seg_ldtr) << 12) )
1014 goto unmap_and_fail;
1016 dpl = (desc.b >> 13) & 3;
1017 rpl = sel & 3;
1018 cpl = cs.sel & 3;
1020 switch ( seg )
1022 case x86_seg_cs:
1023 /* Code segment? */
1024 if ( !(desc.b & (1u<<11)) )
1025 goto unmap_and_fail;
1026 /* Non-conforming segment: check DPL against RPL. */
1027 if ( ((desc.b & (6u<<9)) != 6) && (dpl != rpl) )
1028 goto unmap_and_fail;
1029 break;
1030 case x86_seg_ss:
1031 /* Writable data segment? */
1032 if ( (desc.b & (5u<<9)) != (1u<<9) )
1033 goto unmap_and_fail;
1034 if ( (dpl != cpl) || (dpl != rpl) )
1035 goto unmap_and_fail;
1036 break;
1037 case x86_seg_ldtr:
1038 /* LDT system segment? */
1039 if ( (desc.b & (15u<<8)) != (2u<<8) )
1040 goto unmap_and_fail;
1041 goto skip_accessed_flag;
1042 default:
1043 /* Readable code or data segment? */
1044 if ( (desc.b & (5u<<9)) == (4u<<9) )
1045 goto unmap_and_fail;
1046 /* Non-conforming segment: check DPL against RPL and CPL. */
1047 if ( ((desc.b & (6u<<9)) != 6) && ((dpl < cpl) || (dpl < rpl)) )
1048 goto unmap_and_fail;
1049 break;
1051 } while ( !(desc.b & 0x100) && /* Ensure Accessed flag is set */
1052 (cmpxchg(&pdesc->b, desc.b, desc.b | 0x100) != desc.b) );
1054 /* Force the Accessed flag in our local copy. */
1055 desc.b |= 0x100;
1057 skip_accessed_flag:
1058 hvm_unmap(pdesc);
1060 segr.base = (((desc.b << 0) & 0xff000000u) |
1061 ((desc.b << 16) & 0x00ff0000u) |
1062 ((desc.a >> 16) & 0x0000ffffu));
1063 segr.attr.bytes = (((desc.b >> 8) & 0x00ffu) |
1064 ((desc.b >> 12) & 0x0f00u));
1065 segr.limit = (desc.b & 0x000f0000u) | (desc.a & 0x0000ffffu);
1066 if ( segr.attr.fields.g )
1067 segr.limit = (segr.limit << 12) | 0xfffu;
1068 segr.sel = sel;
1069 hvm_set_segment_register(v, seg, &segr);
1071 return 0;
1073 unmap_and_fail:
1074 hvm_unmap(pdesc);
1075 fail:
1076 hvm_inject_exception(fault_type, sel & 0xfffc, 0);
1077 hvm_map_fail:
1078 return 1;
1081 void hvm_task_switch(
1082 uint16_t tss_sel, enum hvm_task_switch_reason taskswitch_reason,
1083 int32_t errcode)
1085 struct vcpu *v = current;
1086 struct cpu_user_regs *regs = guest_cpu_user_regs();
1087 struct segment_register gdt, tr, prev_tr, segr;
1088 struct desc_struct *optss_desc = NULL, *nptss_desc = NULL, tss_desc;
1089 unsigned long eflags;
1090 int exn_raised;
1091 struct {
1092 u16 back_link,__blh;
1093 u32 esp0;
1094 u16 ss0, _0;
1095 u32 esp1;
1096 u16 ss1, _1;
1097 u32 esp2;
1098 u16 ss2, _2;
1099 u32 cr3, eip, eflags, eax, ecx, edx, ebx, esp, ebp, esi, edi;
1100 u16 es, _3, cs, _4, ss, _5, ds, _6, fs, _7, gs, _8, ldt, _9;
1101 u16 trace, iomap;
1102 } *ptss, tss;
1104 hvm_get_segment_register(v, x86_seg_gdtr, &gdt);
1105 hvm_get_segment_register(v, x86_seg_tr, &prev_tr);
1107 if ( ((tss_sel & 0xfff8) + 7) > gdt.limit )
1109 hvm_inject_exception((taskswitch_reason == TSW_iret) ?
1110 TRAP_invalid_tss : TRAP_gp_fault,
1111 tss_sel & 0xfff8, 0);
1112 goto out;
1115 optss_desc = hvm_map(gdt.base + (prev_tr.sel & 0xfff8), 8);
1116 if ( optss_desc == NULL )
1117 goto out;
1119 nptss_desc = hvm_map(gdt.base + (tss_sel & 0xfff8), 8);
1120 if ( nptss_desc == NULL )
1121 goto out;
1123 tss_desc = *nptss_desc;
1124 tr.sel = tss_sel;
1125 tr.base = (((tss_desc.b << 0) & 0xff000000u) |
1126 ((tss_desc.b << 16) & 0x00ff0000u) |
1127 ((tss_desc.a >> 16) & 0x0000ffffu));
1128 tr.attr.bytes = (((tss_desc.b >> 8) & 0x00ffu) |
1129 ((tss_desc.b >> 12) & 0x0f00u));
1130 tr.limit = (tss_desc.b & 0x000f0000u) | (tss_desc.a & 0x0000ffffu);
1131 if ( tr.attr.fields.g )
1132 tr.limit = (tr.limit << 12) | 0xfffu;
1134 if ( !tr.attr.fields.p )
1136 hvm_inject_exception(TRAP_no_segment, tss_sel & 0xfff8, 0);
1137 goto out;
1140 if ( tr.attr.fields.type != ((taskswitch_reason == TSW_iret) ? 0xb : 0x9) )
1142 hvm_inject_exception(
1143 (taskswitch_reason == TSW_iret) ? TRAP_invalid_tss : TRAP_gp_fault,
1144 tss_sel & 0xfff8, 0);
1145 goto out;
1148 if ( !tr.attr.fields.g && (tr.limit < (sizeof(tss)-1)) )
1150 hvm_inject_exception(TRAP_invalid_tss, tss_sel & 0xfff8, 0);
1151 goto out;
1154 ptss = hvm_map(prev_tr.base, sizeof(tss));
1155 if ( ptss == NULL )
1156 goto out;
1158 eflags = regs->eflags;
1159 if ( taskswitch_reason == TSW_iret )
1160 eflags &= ~X86_EFLAGS_NT;
1162 ptss->cr3 = v->arch.hvm_vcpu.guest_cr[3];
1163 ptss->eip = regs->eip;
1164 ptss->eflags = eflags;
1165 ptss->eax = regs->eax;
1166 ptss->ecx = regs->ecx;
1167 ptss->edx = regs->edx;
1168 ptss->ebx = regs->ebx;
1169 ptss->esp = regs->esp;
1170 ptss->ebp = regs->ebp;
1171 ptss->esi = regs->esi;
1172 ptss->edi = regs->edi;
1174 hvm_get_segment_register(v, x86_seg_es, &segr);
1175 ptss->es = segr.sel;
1176 hvm_get_segment_register(v, x86_seg_cs, &segr);
1177 ptss->cs = segr.sel;
1178 hvm_get_segment_register(v, x86_seg_ss, &segr);
1179 ptss->ss = segr.sel;
1180 hvm_get_segment_register(v, x86_seg_ds, &segr);
1181 ptss->ds = segr.sel;
1182 hvm_get_segment_register(v, x86_seg_fs, &segr);
1183 ptss->fs = segr.sel;
1184 hvm_get_segment_register(v, x86_seg_gs, &segr);
1185 ptss->gs = segr.sel;
1186 hvm_get_segment_register(v, x86_seg_ldtr, &segr);
1187 ptss->ldt = segr.sel;
1189 hvm_unmap(ptss);
1191 ptss = hvm_map(tr.base, sizeof(tss));
1192 if ( ptss == NULL )
1193 goto out;
1195 if ( !hvm_set_cr3(ptss->cr3) )
1197 hvm_unmap(ptss);
1198 goto out;
1201 regs->eip = ptss->eip;
1202 regs->eflags = ptss->eflags | 2;
1203 regs->eax = ptss->eax;
1204 regs->ecx = ptss->ecx;
1205 regs->edx = ptss->edx;
1206 regs->ebx = ptss->ebx;
1207 regs->esp = ptss->esp;
1208 regs->ebp = ptss->ebp;
1209 regs->esi = ptss->esi;
1210 regs->edi = ptss->edi;
1212 if ( (taskswitch_reason == TSW_call_or_int) )
1214 regs->eflags |= X86_EFLAGS_NT;
1215 ptss->back_link = prev_tr.sel;
1218 exn_raised = 0;
1219 if ( hvm_load_segment_selector(v, x86_seg_es, ptss->es) ||
1220 hvm_load_segment_selector(v, x86_seg_cs, ptss->cs) ||
1221 hvm_load_segment_selector(v, x86_seg_ss, ptss->ss) ||
1222 hvm_load_segment_selector(v, x86_seg_ds, ptss->ds) ||
1223 hvm_load_segment_selector(v, x86_seg_fs, ptss->fs) ||
1224 hvm_load_segment_selector(v, x86_seg_gs, ptss->gs) ||
1225 hvm_load_segment_selector(v, x86_seg_ldtr, ptss->ldt) )
1226 exn_raised = 1;
1228 if ( (ptss->trace & 1) && !exn_raised )
1229 hvm_inject_exception(TRAP_debug, tss_sel & 0xfff8, 0);
1231 hvm_unmap(ptss);
1233 tr.attr.fields.type = 0xb; /* busy 32-bit tss */
1234 hvm_set_segment_register(v, x86_seg_tr, &tr);
1236 v->arch.hvm_vcpu.guest_cr[0] |= X86_CR0_TS;
1237 hvm_update_guest_cr(v, 0);
1239 if ( (taskswitch_reason == TSW_iret) ||
1240 (taskswitch_reason == TSW_jmp) )
1241 clear_bit(41, optss_desc); /* clear B flag of old task */
1243 if ( taskswitch_reason != TSW_iret )
1244 set_bit(41, nptss_desc); /* set B flag of new task */
1246 if ( errcode >= 0 )
1248 struct segment_register reg;
1249 unsigned long linear_addr;
1250 regs->esp -= 4;
1251 hvm_get_segment_register(current, x86_seg_ss, &reg);
1252 /* Todo: do not ignore access faults here. */
1253 if ( hvm_virtual_to_linear_addr(x86_seg_ss, &reg, regs->esp,
1254 4, hvm_access_write, 32,
1255 &linear_addr) )
1256 hvm_copy_to_guest_virt(linear_addr, &errcode, 4);
1259 out:
1260 hvm_unmap(optss_desc);
1261 hvm_unmap(nptss_desc);
1264 /*
1265 * __hvm_copy():
1266 * @buf = hypervisor buffer
1267 * @addr = guest address to copy to/from
1268 * @size = number of bytes to copy
1269 * @dir = copy *to* guest (TRUE) or *from* guest (FALSE)?
1270 * @virt = addr is *virtual* (TRUE) or *guest physical* (FALSE)?
1271 * @fetch = copy is an instruction fetch?
1272 * Returns number of bytes failed to copy (0 == complete success).
1273 */
1274 static int __hvm_copy(void *buf, paddr_t addr, int size, int dir,
1275 int virt, int fetch)
1277 unsigned long gfn, mfn;
1278 p2m_type_t p2mt;
1279 char *p;
1280 int count, todo;
1281 uint32_t pfec = PFEC_page_present;
1283 if ( dir )
1284 pfec |= PFEC_write_access;
1285 if ( ring_3(guest_cpu_user_regs()) )
1286 pfec |= PFEC_user_mode;
1287 if ( fetch )
1288 pfec |= PFEC_insn_fetch;
1290 todo = size;
1291 while ( todo > 0 )
1293 count = min_t(int, PAGE_SIZE - (addr & ~PAGE_MASK), todo);
1295 if ( virt )
1296 gfn = paging_gva_to_gfn(current, addr, &pfec);
1297 else
1298 gfn = addr >> PAGE_SHIFT;
1300 mfn = mfn_x(gfn_to_mfn_current(gfn, &p2mt));
1302 if ( !p2m_is_ram(p2mt) )
1303 return todo;
1304 ASSERT(mfn_valid(mfn));
1306 p = (char *)map_domain_page(mfn) + (addr & ~PAGE_MASK);
1308 if ( dir )
1310 memcpy(p, buf, count); /* dir == TRUE: *to* guest */
1311 paging_mark_dirty(current->domain, mfn);
1313 else
1314 memcpy(buf, p, count); /* dir == FALSE: *from guest */
1316 unmap_domain_page(p);
1318 addr += count;
1319 buf += count;
1320 todo -= count;
1323 return 0;
1326 int hvm_copy_to_guest_phys(paddr_t paddr, void *buf, int size)
1328 return __hvm_copy(buf, paddr, size, 1, 0, 0);
1331 int hvm_copy_from_guest_phys(void *buf, paddr_t paddr, int size)
1333 return __hvm_copy(buf, paddr, size, 0, 0, 0);
/*
 * Copy @size bytes from hypervisor buffer @buf to guest-virtual address
 * @vaddr.  Returns the number of bytes left uncopied (0 on full success).
 */
int hvm_copy_to_guest_virt(unsigned long vaddr, void *buf, int size)
{
    /* dir=1 (to guest), virt=1 (guest-virtual), fetch=0. */
    int uncopied = __hvm_copy(buf, vaddr, size, 1, 1, 0);

    return uncopied;
}
/*
 * Copy @size bytes from guest-virtual address @vaddr into hypervisor
 * buffer @buf.  Returns the number of bytes left uncopied (0 on full
 * success).
 */
int hvm_copy_from_guest_virt(void *buf, unsigned long vaddr, int size)
{
    /* dir=0 (from guest), virt=1 (guest-virtual), fetch=0. */
    int uncopied = __hvm_copy(buf, vaddr, size, 0, 1, 0);

    return uncopied;
}
1346 int hvm_fetch_from_guest_virt(void *buf, unsigned long vaddr, int size)
1348 return __hvm_copy(buf, vaddr, size, 0, 1, hvm_nx_enabled(current));
1352 /* HVM specific printbuf. Mostly used for hvmloader chit-chat. */
1353 void hvm_print_line(struct vcpu *v, const char c)
1355 struct hvm_domain *hd = &v->domain->arch.hvm_domain;
1357 spin_lock(&hd->pbuf_lock);
1358 hd->pbuf[hd->pbuf_idx++] = c;
1359 if ( (hd->pbuf_idx == (sizeof(hd->pbuf) - 2)) || (c == '\n') )
1361 if ( c != '\n' )
1362 hd->pbuf[hd->pbuf_idx++] = '\n';
1363 hd->pbuf[hd->pbuf_idx] = '\0';
1364 printk(XENLOG_G_DEBUG "HVM%u: %s", v->domain->domain_id, hd->pbuf);
1365 hd->pbuf_idx = 0;
1367 spin_unlock(&hd->pbuf_lock);
1370 void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
1371 unsigned int *ecx, unsigned int *edx)
1373 struct vcpu *v = current;
1375 if ( cpuid_hypervisor_leaves(input, eax, ebx, ecx, edx) )
1376 return;
1378 cpuid(input, eax, ebx, ecx, edx);
1380 switch ( input )
1382 case 0x00000001:
1383 __clear_bit(X86_FEATURE_MWAIT & 31, ecx);
1385 if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
1386 __clear_bit(X86_FEATURE_APIC & 31, edx);
1388 #if CONFIG_PAGING_LEVELS >= 3
1389 if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
1390 #endif
1391 __clear_bit(X86_FEATURE_PAE & 31, edx);
1392 __clear_bit(X86_FEATURE_PSE36 & 31, edx);
1393 break;
1395 case 0x80000001:
1396 #if CONFIG_PAGING_LEVELS >= 3
1397 if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
1398 #endif
1399 __clear_bit(X86_FEATURE_NX & 31, edx);
1400 #ifdef __i386__
1401 /* Mask feature for Intel ia32e or AMD long mode. */
1402 __clear_bit(X86_FEATURE_LAHF_LM & 31, ecx);
1403 __clear_bit(X86_FEATURE_LM & 31, edx);
1404 __clear_bit(X86_FEATURE_SYSCALL & 31, edx);
1405 #endif
1406 break;
1410 static long hvm_grant_table_op(
1411 unsigned int cmd, XEN_GUEST_HANDLE(void) uop, unsigned int count)
1413 if ( cmd != GNTTABOP_query_size )
1414 return -ENOSYS; /* all other commands need auditing */
1415 return do_grant_table_op(cmd, uop, count);
/* Common signature through which every HVM hypercall handler is invoked. */
typedef unsigned long hvm_hypercall_t(
    unsigned long a1, unsigned long a2, unsigned long a3,
    unsigned long a4, unsigned long a5);

/*
 * Designated initializer for a hypercall table: slot __HYPERVISOR_<x> is
 * filled with the handler do_<x>, cast to the common signature.
 */
#define HYPERCALL(x) \
    [ __HYPERVISOR_ ## x ] = (hvm_hypercall_t *) do_ ## x
1424 #if defined(__i386__)
1426 static hvm_hypercall_t *hvm_hypercall32_table[NR_hypercalls] = {
1427 HYPERCALL(memory_op),
1428 [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
1429 HYPERCALL(xen_version),
1430 HYPERCALL(grant_table_op),
1431 HYPERCALL(event_channel_op),
1432 HYPERCALL(sched_op),
1433 HYPERCALL(hvm_op)
1434 };
1436 #else /* defined(__x86_64__) */
1438 static long do_memory_op_compat32(int cmd, XEN_GUEST_HANDLE(void) arg)
1440 extern long do_add_to_physmap(struct xen_add_to_physmap *xatp);
1441 long rc;
1443 switch ( cmd )
1445 case XENMEM_add_to_physmap:
1447 struct {
1448 domid_t domid;
1449 uint32_t space;
1450 uint32_t idx;
1451 uint32_t gpfn;
1452 } u;
1453 struct xen_add_to_physmap h;
1455 if ( copy_from_guest(&u, arg, 1) )
1456 return -EFAULT;
1458 h.domid = u.domid;
1459 h.space = u.space;
1460 h.idx = u.idx;
1461 h.gpfn = u.gpfn;
1463 this_cpu(guest_handles_in_xen_space) = 1;
1464 rc = do_memory_op(cmd, guest_handle_from_ptr(&h, void));
1465 this_cpu(guest_handles_in_xen_space) = 0;
1467 break;
1470 default:
1471 gdprintk(XENLOG_WARNING, "memory_op %d.\n", cmd);
1472 rc = -ENOSYS;
1473 break;
1476 return rc;
1479 static hvm_hypercall_t *hvm_hypercall64_table[NR_hypercalls] = {
1480 HYPERCALL(memory_op),
1481 [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
1482 HYPERCALL(xen_version),
1483 HYPERCALL(grant_table_op),
1484 HYPERCALL(event_channel_op),
1485 HYPERCALL(sched_op),
1486 HYPERCALL(hvm_op)
1487 };
1489 static hvm_hypercall_t *hvm_hypercall32_table[NR_hypercalls] = {
1490 [ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)do_memory_op_compat32,
1491 [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
1492 HYPERCALL(xen_version),
1493 HYPERCALL(grant_table_op),
1494 HYPERCALL(event_channel_op),
1495 HYPERCALL(sched_op),
1496 HYPERCALL(hvm_op)
1497 };
1499 #endif /* defined(__x86_64__) */
1501 int hvm_do_hypercall(struct cpu_user_regs *regs)
1503 int flush, mode = hvm_guest_x86_mode(current);
1504 uint32_t eax = regs->eax;
1506 switch ( mode )
1508 #ifdef __x86_64__
1509 case 8:
1510 #endif
1511 case 4:
1512 case 2:
1513 if ( unlikely(ring_3(regs)) )
1515 default:
1516 regs->eax = -EPERM;
1517 return HVM_HCALL_completed;
1519 case 0:
1520 break;
1523 if ( (eax >= NR_hypercalls) || !hvm_hypercall32_table[eax] )
1525 regs->eax = -ENOSYS;
1526 return HVM_HCALL_completed;
1529 /*
1530 * NB. In future flush only on decrease_reservation.
1531 * For now we also need to flush when pages are added, as qemu-dm is not
1532 * yet capable of faulting pages into an existing valid mapcache bucket.
1533 */
1534 flush = ((eax == __HYPERVISOR_memory_op) ||
1535 (eax == __HYPERVISOR_grant_table_op)); /* needed ? */
1536 this_cpu(hc_preempted) = 0;
1538 #ifdef __x86_64__
1539 if ( mode == 8 )
1541 HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u(%lx, %lx, %lx, %lx, %lx)", eax,
1542 regs->rdi, regs->rsi, regs->rdx, regs->r10, regs->r8);
1544 regs->rax = hvm_hypercall64_table[eax](regs->rdi,
1545 regs->rsi,
1546 regs->rdx,
1547 regs->r10,
1548 regs->r8);
1550 else
1551 #endif
1553 HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u(%x, %x, %x, %x, %x)", eax,
1554 (uint32_t)regs->ebx, (uint32_t)regs->ecx,
1555 (uint32_t)regs->edx, (uint32_t)regs->esi,
1556 (uint32_t)regs->edi);
1558 regs->eax = hvm_hypercall32_table[eax]((uint32_t)regs->ebx,
1559 (uint32_t)regs->ecx,
1560 (uint32_t)regs->edx,
1561 (uint32_t)regs->esi,
1562 (uint32_t)regs->edi);
1565 HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u -> %lx",
1566 eax, (unsigned long)regs->eax);
1568 return (this_cpu(hc_preempted) ? HVM_HCALL_preempted :
1569 flush ? HVM_HCALL_invalidate : HVM_HCALL_completed);
1572 static void hvm_latch_shinfo_size(struct domain *d)
1574 /*
1575 * Called from operations which are among the very first executed by
1576 * PV drivers on initialisation or after save/restore. These are sensible
1577 * points at which to sample the execution mode of the guest and latch
1578 * 32- or 64-bit format for shared state.
1579 */
1580 if ( current->domain == d )
1581 d->arch.has_32bit_shinfo = (hvm_guest_x86_mode(current) != 8);
1584 /* Initialise a hypercall transfer page for a VMX domain using
1585 paravirtualised drivers. */
1586 void hvm_hypercall_page_initialise(struct domain *d,
1587 void *hypercall_page)
1589 hvm_latch_shinfo_size(d);
1590 hvm_funcs.init_hypercall_page(d, hypercall_page);
1594 /*
1595 * only called in HVM domain BSP context
1596 * when booting, vcpuid is always equal to apic_id
1597 */
1598 int hvm_bringup_ap(int vcpuid, int trampoline_vector)
1600 struct vcpu *v;
1601 struct domain *d = current->domain;
1602 struct vcpu_guest_context *ctxt;
1603 int rc = 0;
1605 BUG_ON(!is_hvm_domain(d));
1607 if ( (v = d->vcpu[vcpuid]) == NULL )
1608 return -ENOENT;
1610 if ( (ctxt = xmalloc(struct vcpu_guest_context)) == NULL )
1612 gdprintk(XENLOG_ERR,
1613 "Failed to allocate memory in hvm_bringup_ap.\n");
1614 return -ENOMEM;
1617 hvm_init_ap_context(ctxt, vcpuid, trampoline_vector);
1619 /* Sync AP's TSC with BSP's. */
1620 v->arch.hvm_vcpu.cache_tsc_offset =
1621 v->domain->vcpu[0]->arch.hvm_vcpu.cache_tsc_offset;
1622 hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset);
1624 LOCK_BIGLOCK(d);
1625 rc = -EEXIST;
1626 if ( !v->is_initialised )
1627 rc = boot_vcpu(d, vcpuid, ctxt);
1628 UNLOCK_BIGLOCK(d);
1630 if ( rc != 0 )
1632 gdprintk(XENLOG_ERR,
1633 "AP %d bringup failed in boot_vcpu %x.\n", vcpuid, rc);
1634 goto out;
1637 if ( test_and_clear_bit(_VPF_down, &v->pause_flags) )
1638 vcpu_wake(v);
1639 gdprintk(XENLOG_INFO, "AP %d bringup suceeded.\n", vcpuid);
1641 out:
1642 xfree(ctxt);
1643 return rc;
1646 static int hvmop_set_pci_intx_level(
1647 XEN_GUEST_HANDLE(xen_hvm_set_pci_intx_level_t) uop)
1649 struct xen_hvm_set_pci_intx_level op;
1650 struct domain *d;
1651 int rc;
1653 if ( copy_from_guest(&op, uop, 1) )
1654 return -EFAULT;
1656 if ( !IS_PRIV(current->domain) )
1657 return -EPERM;
1659 if ( (op.domain > 0) || (op.bus > 0) || (op.device > 31) || (op.intx > 3) )
1660 return -EINVAL;
1662 d = rcu_lock_domain_by_id(op.domid);
1663 if ( d == NULL )
1664 return -ESRCH;
1666 rc = -EINVAL;
1667 if ( !is_hvm_domain(d) )
1668 goto out;
1670 rc = xsm_hvm_set_pci_intx_level(d);
1671 if ( rc )
1672 goto out;
1674 rc = 0;
1675 switch ( op.level )
1677 case 0:
1678 hvm_pci_intx_deassert(d, op.device, op.intx);
1679 break;
1680 case 1:
1681 hvm_pci_intx_assert(d, op.device, op.intx);
1682 break;
1683 default:
1684 rc = -EINVAL;
1685 break;
1688 out:
1689 rcu_unlock_domain(d);
1690 return rc;
1693 static int hvmop_set_isa_irq_level(
1694 XEN_GUEST_HANDLE(xen_hvm_set_isa_irq_level_t) uop)
1696 struct xen_hvm_set_isa_irq_level op;
1697 struct domain *d;
1698 int rc;
1700 if ( copy_from_guest(&op, uop, 1) )
1701 return -EFAULT;
1703 if ( !IS_PRIV(current->domain) )
1704 return -EPERM;
1706 if ( op.isa_irq > 15 )
1707 return -EINVAL;
1709 d = rcu_lock_domain_by_id(op.domid);
1710 if ( d == NULL )
1711 return -ESRCH;
1713 rc = -EINVAL;
1714 if ( !is_hvm_domain(d) )
1715 goto out;
1717 rc = xsm_hvm_set_isa_irq_level(d);
1718 if ( rc )
1719 goto out;
1721 rc = 0;
1722 switch ( op.level )
1724 case 0:
1725 hvm_isa_irq_deassert(d, op.isa_irq);
1726 break;
1727 case 1:
1728 hvm_isa_irq_assert(d, op.isa_irq);
1729 break;
1730 default:
1731 rc = -EINVAL;
1732 break;
1735 out:
1736 rcu_unlock_domain(d);
1737 return rc;
1740 static int hvmop_set_pci_link_route(
1741 XEN_GUEST_HANDLE(xen_hvm_set_pci_link_route_t) uop)
1743 struct xen_hvm_set_pci_link_route op;
1744 struct domain *d;
1745 int rc;
1747 if ( copy_from_guest(&op, uop, 1) )
1748 return -EFAULT;
1750 if ( !IS_PRIV(current->domain) )
1751 return -EPERM;
1753 if ( (op.link > 3) || (op.isa_irq > 15) )
1754 return -EINVAL;
1756 d = rcu_lock_domain_by_id(op.domid);
1757 if ( d == NULL )
1758 return -ESRCH;
1760 rc = -EINVAL;
1761 if ( !is_hvm_domain(d) )
1762 goto out;
1764 rc = xsm_hvm_set_pci_link_route(d);
1765 if ( rc )
1766 goto out;
1768 rc = 0;
1769 hvm_set_pci_link_route(d, op.link, op.isa_irq);
1771 out:
1772 rcu_unlock_domain(d);
1773 return rc;
1776 static int hvmop_flush_tlb_all(void)
1778 struct domain *d = current->domain;
1779 struct vcpu *v;
1781 /* Avoid deadlock if more than one vcpu tries this at the same time. */
1782 if ( !spin_trylock(&d->hypercall_deadlock_mutex) )
1783 return -EAGAIN;
1785 /* Pause all other vcpus. */
1786 for_each_vcpu ( d, v )
1787 if ( v != current )
1788 vcpu_pause_nosync(v);
1790 /* Now that all VCPUs are signalled to deschedule, we wait... */
1791 for_each_vcpu ( d, v )
1792 if ( v != current )
1793 while ( !vcpu_runnable(v) && v->is_running )
1794 cpu_relax();
1796 /* All other vcpus are paused, safe to unlock now. */
1797 spin_unlock(&d->hypercall_deadlock_mutex);
1799 /* Flush paging-mode soft state (e.g., va->gfn cache; PAE PDPE cache). */
1800 for_each_vcpu ( d, v )
1801 paging_update_cr3(v);
1803 /* Flush all dirty TLBs. */
1804 flush_tlb_mask(d->domain_dirty_cpumask);
1806 /* Done. */
1807 for_each_vcpu ( d, v )
1808 if ( v != current )
1809 vcpu_unpause(v);
1811 return 0;
1814 long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE(void) arg)
1817 long rc = 0;
1819 switch ( op )
1821 case HVMOP_set_param:
1822 case HVMOP_get_param:
1824 struct xen_hvm_param a;
1825 struct hvm_ioreq_page *iorp;
1826 struct domain *d;
1827 struct vcpu *v;
1829 if ( copy_from_guest(&a, arg, 1) )
1830 return -EFAULT;
1832 if ( a.index >= HVM_NR_PARAMS )
1833 return -EINVAL;
1835 if ( a.domid == DOMID_SELF )
1836 d = rcu_lock_current_domain();
1837 else if ( IS_PRIV(current->domain) )
1838 d = rcu_lock_domain_by_id(a.domid);
1839 else
1840 return -EPERM;
1842 if ( d == NULL )
1843 return -ESRCH;
1845 rc = -EINVAL;
1846 if ( !is_hvm_domain(d) )
1847 goto param_fail;
1849 rc = xsm_hvm_param(d, op);
1850 if ( rc )
1851 goto param_fail;
1853 if ( op == HVMOP_set_param )
1855 switch ( a.index )
1857 case HVM_PARAM_IOREQ_PFN:
1858 iorp = &d->arch.hvm_domain.ioreq;
1859 rc = hvm_set_ioreq_page(d, iorp, a.value);
1860 spin_lock(&iorp->lock);
1861 if ( (rc == 0) && (iorp->va != NULL) )
1862 /* Initialise evtchn port info if VCPUs already created. */
1863 for_each_vcpu ( d, v )
1864 get_ioreq(v)->vp_eport = v->arch.hvm_vcpu.xen_port;
1865 spin_unlock(&iorp->lock);
1866 break;
1867 case HVM_PARAM_BUFIOREQ_PFN:
1868 iorp = &d->arch.hvm_domain.buf_ioreq;
1869 rc = hvm_set_ioreq_page(d, iorp, a.value);
1870 break;
1871 case HVM_PARAM_CALLBACK_IRQ:
1872 hvm_set_callback_via(d, a.value);
1873 hvm_latch_shinfo_size(d);
1874 break;
1875 case HVM_PARAM_TIMER_MODE:
1876 rc = -EINVAL;
1877 if ( (a.value != HVMPTM_delay_for_missed_ticks) &&
1878 (a.value != HVMPTM_no_delay_for_missed_ticks) &&
1879 (a.value != HVMPTM_no_missed_tick_accounting) )
1880 goto param_fail;
1881 break;
1883 d->arch.hvm_domain.params[a.index] = a.value;
1884 rc = 0;
1886 else
1888 a.value = d->arch.hvm_domain.params[a.index];
1889 rc = copy_to_guest(arg, &a, 1) ? -EFAULT : 0;
1892 HVM_DBG_LOG(DBG_LEVEL_HCALL, "%s param %u = %"PRIx64,
1893 op == HVMOP_set_param ? "set" : "get",
1894 a.index, a.value);
1896 param_fail:
1897 rcu_unlock_domain(d);
1898 break;
1901 case HVMOP_set_pci_intx_level:
1902 rc = hvmop_set_pci_intx_level(
1903 guest_handle_cast(arg, xen_hvm_set_pci_intx_level_t));
1904 break;
1906 case HVMOP_set_isa_irq_level:
1907 rc = hvmop_set_isa_irq_level(
1908 guest_handle_cast(arg, xen_hvm_set_isa_irq_level_t));
1909 break;
1911 case HVMOP_set_pci_link_route:
1912 rc = hvmop_set_pci_link_route(
1913 guest_handle_cast(arg, xen_hvm_set_pci_link_route_t));
1914 break;
1916 case HVMOP_flush_tlbs:
1917 rc = guest_handle_is_null(arg) ? hvmop_flush_tlb_all() : -ENOSYS;
1918 break;
1920 default:
1922 gdprintk(XENLOG_WARNING, "Bad HVM op %ld.\n", op);
1923 rc = -ENOSYS;
1924 break;
1928 if ( rc == -EAGAIN )
1929 rc = hypercall_create_continuation(
1930 __HYPERVISOR_hvm_op, "lh", op, arg);
1932 return rc;
1935 /*
1936 * Local variables:
1937 * mode: C
1938 * c-set-style: "BSD"
1939 * c-basic-offset: 4
1940 * tab-width: 4
1941 * indent-tabs-mode: nil
1942 * End:
1943 */