ia64/xen-unstable

view xen/arch/x86/hvm/hvm.c @ 16620:966a6d3b7408

SVM: Treat the vlapic's TPR as the master copy and sync the vTPR to it
before every VM entry. This fixes HVM save/restore/migrate, as the vTPR
value was previously only synced on guest TPR writes.

Signed-off-by: Tim Deegan <Tim.Deegan@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Dec 14 11:50:24 2007 +0000 (2007-12-14)
parents cb0ce96c02de
children 1e3e30670ce4
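
For illustration only: the change described above lands in the SVM entry path, not in the hvm.c listing below. A minimal sketch of "sync the vTPR from the vlapic before every VM entry" follows; the svm_sync_vlapic_tpr() name, the v->arch.hvm_svm.vmcb pointer and the vmcb->vintr.fields.tpr field are assumptions made for the sketch, not quotes from the patch. Only vlapic_get_reg()/APIC_TASKPRI are taken from the file below.

    /* Sketch: on every VM entry, refresh the VMCB's virtual TPR from the
     * vlapic's task-priority register, so the vTPR cannot go stale across
     * save/restore/migrate.  Hook and field names are assumed. */
    static void svm_sync_vlapic_tpr(struct vcpu *v)
    {
        struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;  /* assumed layout */
        uint32_t tpr = vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI);

        /* V_TPR holds bits 7:4 of the task priority. */
        vmcb->vintr.fields.tpr = (tpr & 0xF0) >> 4;
    }
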
line source
1 /*
2 * hvm.c: Common hardware virtual machine abstractions.
3 *
4 * Copyright (c) 2004, Intel Corporation.
5 * Copyright (c) 2005, International Business Machines Corporation.
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms and conditions of the GNU General Public License,
9 * version 2, as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 *
16 * You should have received a copy of the GNU General Public License along with
17 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
18 * Place - Suite 330, Boston, MA 02111-1307 USA.
19 */
21 #include <xen/config.h>
22 #include <xen/init.h>
23 #include <xen/lib.h>
24 #include <xen/trace.h>
25 #include <xen/sched.h>
26 #include <xen/irq.h>
27 #include <xen/softirq.h>
28 #include <xen/domain.h>
29 #include <xen/domain_page.h>
30 #include <xen/hypercall.h>
31 #include <xen/guest_access.h>
32 #include <xen/event.h>
33 #include <asm/current.h>
34 #include <asm/e820.h>
35 #include <asm/io.h>
36 #include <asm/paging.h>
37 #include <asm/regs.h>
38 #include <asm/cpufeature.h>
39 #include <asm/processor.h>
40 #include <asm/types.h>
41 #include <asm/msr.h>
42 #include <asm/mc146818rtc.h>
43 #include <asm/spinlock.h>
44 #include <asm/hvm/hvm.h>
45 #include <asm/hvm/vpt.h>
46 #include <asm/hvm/support.h>
47 #include <asm/hvm/cacheattr.h>
48 #include <public/sched.h>
49 #include <public/hvm/ioreq.h>
50 #include <public/version.h>
51 #include <public/memory.h>
53 /*
54 * Xen command-line option to allow/disallow hardware-assisted paging.
55 * Since the phys-to-machine table of AMD NPT is in host format, 32-bit Xen
56 * can only support guests using NPT with up to a 4GB memory map. Therefore
57 * we disallow HAP by default on PAE Xen (by default we want to support an
58 * 8GB pseudophysical memory map for HVM guests on a PAE host).
59 */
60 static int opt_hap_permitted = (CONFIG_PAGING_LEVELS != 3);
61 boolean_param("hap", opt_hap_permitted);
63 int hvm_enabled __read_mostly;
65 unsigned int opt_hvm_debug_level __read_mostly;
66 integer_param("hvm_debug", opt_hvm_debug_level);
68 struct hvm_function_table hvm_funcs __read_mostly;
70 /* I/O permission bitmap is globally shared by all HVM guests. */
71 char __attribute__ ((__section__ (".bss.page_aligned")))
72 hvm_io_bitmap[3*PAGE_SIZE];
74 void hvm_enable(struct hvm_function_table *fns)
75 {
76 BUG_ON(hvm_enabled);
77 printk("HVM: %s enabled\n", fns->name);
79 /*
80 * Allow direct access to the PC debug port (it is often used for I/O
81 * delays, but the vmexits simply slow things down).
82 */
83 memset(hvm_io_bitmap, ~0, sizeof(hvm_io_bitmap));
84 __clear_bit(0x80, hvm_io_bitmap);
86 hvm_funcs = *fns;
87 hvm_enabled = 1;
89 if ( hvm_funcs.hap_supported )
90 {
91 if ( !opt_hap_permitted )
92 hvm_funcs.hap_supported = 0;
93 printk("HVM: Hardware Assisted Paging detected %s.\n",
94 hvm_funcs.hap_supported ? "and enabled" : "but disabled");
95 }
96 }
98 void hvm_set_guest_tsc(struct vcpu *v, u64 guest_tsc)
99 {
100 u64 host_tsc;
102 rdtscll(host_tsc);
104 v->arch.hvm_vcpu.cache_tsc_offset = guest_tsc - host_tsc;
105 hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset);
106 }
108 u64 hvm_get_guest_tsc(struct vcpu *v)
109 {
110 u64 host_tsc;
112 rdtscll(host_tsc);
113 return host_tsc + v->arch.hvm_vcpu.cache_tsc_offset;
114 }
116 void hvm_migrate_timers(struct vcpu *v)
117 {
118 rtc_migrate_timers(v);
119 hpet_migrate_timers(v);
120 pt_migrate(v);
121 }
123 void hvm_do_resume(struct vcpu *v)
124 {
125 ioreq_t *p;
127 if ( !v->fpu_dirtied )
128 hvm_funcs.stts(v);
130 pt_restore_timer(v);
132 /* NB. Optimised for common case (p->state == STATE_IOREQ_NONE). */
133 p = &get_ioreq(v)->vp_ioreq;
134 while ( p->state != STATE_IOREQ_NONE )
135 {
136 switch ( p->state )
137 {
138 case STATE_IORESP_READY: /* IORESP_READY -> NONE */
139 hvm_io_assist();
140 break;
141 case STATE_IOREQ_READY: /* IOREQ_{READY,INPROCESS} -> IORESP_READY */
142 case STATE_IOREQ_INPROCESS:
143 wait_on_xen_event_channel(v->arch.hvm_vcpu.xen_port,
144 (p->state != STATE_IOREQ_READY) &&
145 (p->state != STATE_IOREQ_INPROCESS));
146 break;
147 default:
148 gdprintk(XENLOG_ERR, "Weird HVM iorequest state %d.\n", p->state);
149 domain_crash_synchronous();
150 }
151 }
152 }
154 static void hvm_init_ioreq_page(
155 struct domain *d, struct hvm_ioreq_page *iorp)
156 {
157 memset(iorp, 0, sizeof(*iorp));
158 spin_lock_init(&iorp->lock);
159 domain_pause(d);
160 }
162 static void hvm_destroy_ioreq_page(
163 struct domain *d, struct hvm_ioreq_page *iorp)
164 {
165 spin_lock(&iorp->lock);
167 ASSERT(d->is_dying);
169 if ( iorp->va != NULL )
170 {
171 unmap_domain_page_global(iorp->va);
172 put_page_and_type(iorp->page);
173 iorp->va = NULL;
174 }
176 spin_unlock(&iorp->lock);
177 }
179 static int hvm_set_ioreq_page(
180 struct domain *d, struct hvm_ioreq_page *iorp, unsigned long gmfn)
181 {
182 struct page_info *page;
183 p2m_type_t p2mt;
184 unsigned long mfn;
185 void *va;
187 mfn = mfn_x(gfn_to_mfn(d, gmfn, &p2mt));
188 if ( !p2m_is_ram(p2mt) )
189 return -EINVAL;
190 ASSERT(mfn_valid(mfn));
192 page = mfn_to_page(mfn);
193 if ( !get_page_and_type(page, d, PGT_writable_page) )
194 return -EINVAL;
196 va = map_domain_page_global(mfn);
197 if ( va == NULL )
198 {
199 put_page_and_type(page);
200 return -ENOMEM;
201 }
203 spin_lock(&iorp->lock);
205 if ( (iorp->va != NULL) || d->is_dying )
206 {
207 spin_unlock(&iorp->lock);
208 unmap_domain_page_global(va);
209 put_page_and_type(mfn_to_page(mfn));
210 return -EINVAL;
211 }
213 iorp->va = va;
214 iorp->page = page;
216 spin_unlock(&iorp->lock);
218 domain_unpause(d);
220 return 0;
221 }
223 int hvm_domain_initialise(struct domain *d)
224 {
225 int rc;
227 if ( !hvm_enabled )
228 {
229 gdprintk(XENLOG_WARNING, "Attempt to create a HVM guest "
230 "on a non-VT/AMDV platform.\n");
231 return -EINVAL;
232 }
234 spin_lock_init(&d->arch.hvm_domain.pbuf_lock);
235 spin_lock_init(&d->arch.hvm_domain.irq_lock);
236 spin_lock_init(&d->arch.hvm_domain.uc_lock);
238 hvm_init_cacheattr_region_list(d);
240 rc = paging_enable(d, PG_refcounts|PG_translate|PG_external);
241 if ( rc != 0 )
242 goto fail1;
244 vpic_init(d);
246 rc = vioapic_init(d);
247 if ( rc != 0 )
248 goto fail1;
250 stdvga_init(d);
252 hvm_init_ioreq_page(d, &d->arch.hvm_domain.ioreq);
253 hvm_init_ioreq_page(d, &d->arch.hvm_domain.buf_ioreq);
255 rc = hvm_funcs.domain_initialise(d);
256 if ( rc != 0 )
257 goto fail2;
259 return 0;
261 fail2:
262 vioapic_deinit(d);
263 fail1:
264 hvm_destroy_cacheattr_region_list(d);
265 return rc;
266 }
268 void hvm_domain_relinquish_resources(struct domain *d)
269 {
270 hvm_destroy_ioreq_page(d, &d->arch.hvm_domain.ioreq);
271 hvm_destroy_ioreq_page(d, &d->arch.hvm_domain.buf_ioreq);
273 pit_deinit(d);
274 rtc_deinit(d);
275 pmtimer_deinit(d);
276 hpet_deinit(d);
277 stdvga_deinit(d);
278 }
280 void hvm_domain_destroy(struct domain *d)
281 {
282 hvm_funcs.domain_destroy(d);
283 vioapic_deinit(d);
284 hvm_destroy_cacheattr_region_list(d);
285 }
287 static int hvm_save_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
288 {
289 struct vcpu *v;
290 struct hvm_hw_cpu ctxt;
291 struct vcpu_guest_context *vc;
293 for_each_vcpu(d, v)
294 {
295 /* We don't need to save state for a vcpu that is down; the restore
296 * code will leave it down if there is nothing saved. */
297 if ( test_bit(_VPF_down, &v->pause_flags) )
298 continue;
300 /* Architecture-specific vmcs/vmcb bits */
301 hvm_funcs.save_cpu_ctxt(v, &ctxt);
303 /* Other vcpu register state */
304 vc = &v->arch.guest_context;
305 if ( v->fpu_initialised )
306 memcpy(ctxt.fpu_regs, &vc->fpu_ctxt, sizeof(ctxt.fpu_regs));
307 else
308 memset(ctxt.fpu_regs, 0, sizeof(ctxt.fpu_regs));
309 ctxt.rax = vc->user_regs.eax;
310 ctxt.rbx = vc->user_regs.ebx;
311 ctxt.rcx = vc->user_regs.ecx;
312 ctxt.rdx = vc->user_regs.edx;
313 ctxt.rbp = vc->user_regs.ebp;
314 ctxt.rsi = vc->user_regs.esi;
315 ctxt.rdi = vc->user_regs.edi;
316 ctxt.rsp = vc->user_regs.esp;
317 ctxt.rip = vc->user_regs.eip;
318 ctxt.rflags = vc->user_regs.eflags;
319 #ifdef __x86_64__
320 ctxt.r8 = vc->user_regs.r8;
321 ctxt.r9 = vc->user_regs.r9;
322 ctxt.r10 = vc->user_regs.r10;
323 ctxt.r11 = vc->user_regs.r11;
324 ctxt.r12 = vc->user_regs.r12;
325 ctxt.r13 = vc->user_regs.r13;
326 ctxt.r14 = vc->user_regs.r14;
327 ctxt.r15 = vc->user_regs.r15;
328 #endif
329 ctxt.dr0 = vc->debugreg[0];
330 ctxt.dr1 = vc->debugreg[1];
331 ctxt.dr2 = vc->debugreg[2];
332 ctxt.dr3 = vc->debugreg[3];
333 ctxt.dr6 = vc->debugreg[6];
334 ctxt.dr7 = vc->debugreg[7];
336 if ( hvm_save_entry(CPU, v->vcpu_id, h, &ctxt) != 0 )
337 return 1;
338 }
339 return 0;
340 }
342 static int hvm_load_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
343 {
344 int vcpuid, rc;
345 struct vcpu *v;
346 struct hvm_hw_cpu ctxt;
347 struct vcpu_guest_context *vc;
349 /* Which vcpu is this? */
350 vcpuid = hvm_load_instance(h);
351 if ( vcpuid > MAX_VIRT_CPUS || (v = d->vcpu[vcpuid]) == NULL )
352 {
353 gdprintk(XENLOG_ERR, "HVM restore: domain has no vcpu %u\n", vcpuid);
354 return -EINVAL;
355 }
356 vc = &v->arch.guest_context;
358 /* Need to init this vcpu before loading its contents */
359 LOCK_BIGLOCK(d);
360 if ( !v->is_initialised )
361 if ( (rc = boot_vcpu(d, vcpuid, vc)) != 0 )
362 return rc;
363 UNLOCK_BIGLOCK(d);
365 if ( hvm_load_entry(CPU, h, &ctxt) != 0 )
366 return -EINVAL;
368 /* Sanity check some control registers. */
369 if ( (ctxt.cr0 & HVM_CR0_GUEST_RESERVED_BITS) ||
370 !(ctxt.cr0 & X86_CR0_ET) ||
371 ((ctxt.cr0 & (X86_CR0_PE|X86_CR0_PG)) == X86_CR0_PG) )
372 {
373 gdprintk(XENLOG_ERR, "HVM restore: bad CR0 0x%"PRIx64"\n",
374 ctxt.cr0);
375 return -EINVAL;
376 }
378 if ( ctxt.cr4 & HVM_CR4_GUEST_RESERVED_BITS )
379 {
380 gdprintk(XENLOG_ERR, "HVM restore: bad CR4 0x%"PRIx64"\n",
381 ctxt.cr4);
382 return -EINVAL;
383 }
385 if ( (ctxt.msr_efer & ~(EFER_FFXSE | EFER_LME | EFER_LMA |
386 EFER_NX | EFER_SCE)) ||
387 ((sizeof(long) != 8) && (ctxt.msr_efer & EFER_LME)) ||
388 (!cpu_has_nx && (ctxt.msr_efer & EFER_NX)) ||
389 (!cpu_has_syscall && (ctxt.msr_efer & EFER_SCE)) ||
390 (!cpu_has_ffxsr && (ctxt.msr_efer & EFER_FFXSE)) ||
391 ((ctxt.msr_efer & (EFER_LME|EFER_LMA)) == EFER_LMA) )
392 {
393 gdprintk(XENLOG_ERR, "HVM restore: bad EFER 0x%"PRIx64"\n",
394 ctxt.msr_efer);
395 return -EINVAL;
396 }
398 /* Architecture-specific vmcs/vmcb bits */
399 if ( hvm_funcs.load_cpu_ctxt(v, &ctxt) < 0 )
400 return -EINVAL;
402 /* Other vcpu register state */
403 memcpy(&vc->fpu_ctxt, ctxt.fpu_regs, sizeof(ctxt.fpu_regs));
404 vc->user_regs.eax = ctxt.rax;
405 vc->user_regs.ebx = ctxt.rbx;
406 vc->user_regs.ecx = ctxt.rcx;
407 vc->user_regs.edx = ctxt.rdx;
408 vc->user_regs.ebp = ctxt.rbp;
409 vc->user_regs.esi = ctxt.rsi;
410 vc->user_regs.edi = ctxt.rdi;
411 vc->user_regs.esp = ctxt.rsp;
412 vc->user_regs.eip = ctxt.rip;
413 vc->user_regs.eflags = ctxt.rflags | 2;
414 #ifdef __x86_64__
415 vc->user_regs.r8 = ctxt.r8;
416 vc->user_regs.r9 = ctxt.r9;
417 vc->user_regs.r10 = ctxt.r10;
418 vc->user_regs.r11 = ctxt.r11;
419 vc->user_regs.r12 = ctxt.r12;
420 vc->user_regs.r13 = ctxt.r13;
421 vc->user_regs.r14 = ctxt.r14;
422 vc->user_regs.r15 = ctxt.r15;
423 #endif
424 vc->debugreg[0] = ctxt.dr0;
425 vc->debugreg[1] = ctxt.dr1;
426 vc->debugreg[2] = ctxt.dr2;
427 vc->debugreg[3] = ctxt.dr3;
428 vc->debugreg[6] = ctxt.dr6;
429 vc->debugreg[7] = ctxt.dr7;
431 vc->flags = VGCF_online;
432 v->fpu_initialised = 1;
434 /* Auxiliary processors should be woken immediately. */
435 if ( test_and_clear_bit(_VPF_down, &v->pause_flags) )
436 vcpu_wake(v);
438 return 0;
439 }
441 HVM_REGISTER_SAVE_RESTORE(CPU, hvm_save_cpu_ctxt, hvm_load_cpu_ctxt,
442 1, HVMSR_PER_VCPU);
444 extern int reset_vmsr(struct mtrr_state *m, u64 *p);
446 int hvm_vcpu_initialise(struct vcpu *v)
447 {
448 int rc;
450 if ( (rc = vlapic_init(v)) != 0 )
451 goto fail1;
453 if ( (rc = hvm_funcs.vcpu_initialise(v)) != 0 )
454 goto fail2;
456 /* Create ioreq event channel. */
457 rc = alloc_unbound_xen_event_channel(v, 0);
458 if ( rc < 0 )
459 goto fail3;
461 /* Register ioreq event channel. */
462 v->arch.hvm_vcpu.xen_port = rc;
463 spin_lock(&v->domain->arch.hvm_domain.ioreq.lock);
464 if ( v->domain->arch.hvm_domain.ioreq.va != NULL )
465 get_ioreq(v)->vp_eport = v->arch.hvm_vcpu.xen_port;
466 spin_unlock(&v->domain->arch.hvm_domain.ioreq.lock);
468 spin_lock_init(&v->arch.hvm_vcpu.tm_lock);
469 INIT_LIST_HEAD(&v->arch.hvm_vcpu.tm_list);
471 rc = reset_vmsr(&v->arch.hvm_vcpu.mtrr, &v->arch.hvm_vcpu.pat_cr);
472 if ( rc != 0 )
473 goto fail3;
475 v->arch.guest_context.user_regs.eflags = 2;
477 if ( v->vcpu_id == 0 )
478 {
479 /* NB. All these really belong in hvm_domain_initialise(). */
480 pit_init(v, cpu_khz);
481 rtc_init(v, RTC_PORT(0));
482 pmtimer_init(v);
483 hpet_init(v);
485 /* Init guest TSC to start from zero. */
486 hvm_set_guest_time(v, 0);
488 /* Can start up without SIPI-SIPI or setvcpucontext domctl. */
489 v->is_initialised = 1;
490 clear_bit(_VPF_down, &v->pause_flags);
491 }
493 return 0;
495 fail3:
496 hvm_funcs.vcpu_destroy(v);
497 fail2:
498 vlapic_destroy(v);
499 fail1:
500 return rc;
501 }
503 void hvm_vcpu_destroy(struct vcpu *v)
504 {
505 vlapic_destroy(v);
506 hvm_funcs.vcpu_destroy(v);
508 /* Event channel is already freed by evtchn_destroy(). */
509 /*free_xen_event_channel(v, v->arch.hvm_vcpu.xen_port);*/
510 }
513 void hvm_vcpu_reset(struct vcpu *v)
514 {
515 vcpu_pause(v);
517 vlapic_reset(vcpu_vlapic(v));
519 hvm_funcs.vcpu_initialise(v);
521 set_bit(_VPF_down, &v->pause_flags);
522 clear_bit(_VPF_blocked, &v->pause_flags);
523 v->fpu_initialised = 0;
524 v->fpu_dirtied = 0;
525 v->is_initialised = 0;
527 vcpu_unpause(v);
528 }
530 static void hvm_vcpu_down(void)
531 {
532 struct vcpu *v = current;
533 struct domain *d = v->domain;
534 int online_count = 0;
536 gdprintk(XENLOG_INFO, "VCPU%d: going offline.\n", v->vcpu_id);
538 /* Doesn't halt us immediately, but we'll never return to guest context. */
539 set_bit(_VPF_down, &v->pause_flags);
540 vcpu_sleep_nosync(v);
542 /* Any other VCPUs online? ... */
543 LOCK_BIGLOCK(d);
544 for_each_vcpu ( d, v )
545 if ( !test_bit(_VPF_down, &v->pause_flags) )
546 online_count++;
547 UNLOCK_BIGLOCK(d);
549 /* ... Shut down the domain if not. */
550 if ( online_count == 0 )
551 {
552 gdprintk(XENLOG_INFO, "all CPUs offline -- powering off.\n");
553 domain_shutdown(d, SHUTDOWN_poweroff);
554 }
555 }
557 void hvm_send_assist_req(struct vcpu *v)
558 {
559 ioreq_t *p;
561 if ( unlikely(!vcpu_start_shutdown_deferral(v)) )
562 return; /* implicitly bins the i/o operation */
564 p = &get_ioreq(v)->vp_ioreq;
565 if ( unlikely(p->state != STATE_IOREQ_NONE) )
566 {
567 /* This indicates a bug in the device model. Crash the domain. */
568 gdprintk(XENLOG_ERR, "Device model set bad IO state %d.\n", p->state);
569 domain_crash_synchronous();
570 }
572 prepare_wait_on_xen_event_channel(v->arch.hvm_vcpu.xen_port);
574 /*
575 * Following happens /after/ blocking and setting up ioreq contents.
576 * prepare_wait_on_xen_event_channel() is an implicit barrier.
577 */
578 p->state = STATE_IOREQ_READY;
579 notify_via_xen_event_channel(v->arch.hvm_vcpu.xen_port);
580 }
582 void hvm_hlt(unsigned long rflags)
583 {
584 /*
585 * If we halt with interrupts disabled, that's a pretty sure sign that we
586 * want to shut down. In a real processor, NMIs are the only way to break
587 * out of this.
588 */
589 if ( unlikely(!(rflags & X86_EFLAGS_IF)) )
590 return hvm_vcpu_down();
592 do_sched_op_compat(SCHEDOP_block, 0);
593 }
595 void hvm_triple_fault(void)
596 {
597 struct vcpu *v = current;
598 gdprintk(XENLOG_INFO, "Triple fault on VCPU%d - "
599 "invoking HVM system reset.\n", v->vcpu_id);
600 domain_shutdown(v->domain, SHUTDOWN_reboot);
601 }
603 int hvm_set_efer(uint64_t value)
604 {
605 struct vcpu *v = current;
607 value &= ~EFER_LMA;
609 if ( (value & ~(EFER_FFXSE | EFER_LME | EFER_NX | EFER_SCE)) ||
610 ((sizeof(long) != 8) && (value & EFER_LME)) ||
611 (!cpu_has_nx && (value & EFER_NX)) ||
612 (!cpu_has_syscall && (value & EFER_SCE)) ||
613 (!cpu_has_ffxsr && (value & EFER_FFXSE)) )
614 {
615 gdprintk(XENLOG_WARNING, "Trying to set reserved bit in "
616 "EFER: %"PRIx64"\n", value);
617 hvm_inject_exception(TRAP_gp_fault, 0, 0);
618 return 0;
619 }
621 if ( ((value ^ v->arch.hvm_vcpu.guest_efer) & EFER_LME) &&
622 hvm_paging_enabled(v) )
623 {
624 gdprintk(XENLOG_WARNING,
625 "Trying to change EFER.LME with paging enabled\n");
626 hvm_inject_exception(TRAP_gp_fault, 0, 0);
627 return 0;
628 }
630 value |= v->arch.hvm_vcpu.guest_efer & EFER_LMA;
631 v->arch.hvm_vcpu.guest_efer = value;
632 hvm_update_guest_efer(v);
634 return 1;
635 }
637 extern void shadow_blow_tables_per_domain(struct domain *d);
638 extern bool_t mtrr_pat_not_equal(struct vcpu *vd, struct vcpu *vs);
640 /* Exit UC mode only if all VCPUs agree on MTRR/PAT and are not in no_fill. */
641 static bool_t domain_exit_uc_mode(struct vcpu *v)
642 {
643 struct domain *d = v->domain;
644 struct vcpu *vs;
646 for_each_vcpu ( d, vs )
647 {
648 if ( (vs == v) || !vs->is_initialised )
649 continue;
650 if ( (vs->arch.hvm_vcpu.cache_mode == NO_FILL_CACHE_MODE) ||
651 mtrr_pat_not_equal(vs, v) )
652 return 0;
653 }
655 return 1;
656 }
658 static void local_flush_cache(void *info)
659 {
660 wbinvd();
661 }
663 int hvm_set_cr0(unsigned long value)
664 {
665 struct vcpu *v = current;
666 p2m_type_t p2mt;
667 unsigned long gfn, mfn, old_value = v->arch.hvm_vcpu.guest_cr[0];
669 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx", value);
671 if ( (u32)value != value )
672 {
673 HVM_DBG_LOG(DBG_LEVEL_1,
674 "Guest attempts to set upper 32 bits in CR0: %lx",
675 value);
676 hvm_inject_exception(TRAP_gp_fault, 0, 0);
677 return 0;
678 }
680 value &= ~HVM_CR0_GUEST_RESERVED_BITS;
682 /* ET is reserved and should always be 1. */
683 value |= X86_CR0_ET;
685 if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PG )
686 {
687 hvm_inject_exception(TRAP_gp_fault, 0, 0);
688 return 0;
689 }
691 if ( (value & X86_CR0_PG) && !(old_value & X86_CR0_PG) )
692 {
693 if ( v->arch.hvm_vcpu.guest_efer & EFER_LME )
694 {
695 if ( !(v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PAE) )
696 {
697 HVM_DBG_LOG(DBG_LEVEL_1, "Enable paging before PAE enable");
698 hvm_inject_exception(TRAP_gp_fault, 0, 0);
699 return 0;
700 }
701 HVM_DBG_LOG(DBG_LEVEL_1, "Enabling long mode");
702 v->arch.hvm_vcpu.guest_efer |= EFER_LMA;
703 hvm_update_guest_efer(v);
704 }
706 if ( !paging_mode_hap(v->domain) )
707 {
708 /* The guest CR3 must be pointing to the guest physical. */
709 gfn = v->arch.hvm_vcpu.guest_cr[3]>>PAGE_SHIFT;
710 mfn = mfn_x(gfn_to_mfn_current(gfn, &p2mt));
711 if ( !p2m_is_ram(p2mt) || !mfn_valid(mfn) ||
712 !get_page(mfn_to_page(mfn), v->domain))
713 {
714 gdprintk(XENLOG_ERR, "Invalid CR3 value = %lx (mfn=%lx)\n",
715 v->arch.hvm_vcpu.guest_cr[3], mfn);
716 domain_crash(v->domain);
717 return 0;
718 }
720 /* Now arch.guest_table points to machine physical. */
721 v->arch.guest_table = pagetable_from_pfn(mfn);
723 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
724 v->arch.hvm_vcpu.guest_cr[3], mfn);
725 }
726 }
727 else if ( !(value & X86_CR0_PG) && (old_value & X86_CR0_PG) )
728 {
729 /* When CR0.PG is cleared, LMA is cleared immediately. */
730 if ( hvm_long_mode_enabled(v) )
731 {
732 v->arch.hvm_vcpu.guest_efer &= ~EFER_LMA;
733 hvm_update_guest_efer(v);
734 }
736 if ( !paging_mode_hap(v->domain) )
737 {
738 put_page(pagetable_get_page(v->arch.guest_table));
739 v->arch.guest_table = pagetable_null();
740 }
741 }
743 if ( !list_empty(&domain_hvm_iommu(v->domain)->pdev_list) )
744 {
745 if ( (value & X86_CR0_CD) && !(value & X86_CR0_NW) )
746 {
747 /* Entering no fill cache mode. */
748 spin_lock(&v->domain->arch.hvm_domain.uc_lock);
749 v->arch.hvm_vcpu.cache_mode = NO_FILL_CACHE_MODE;
751 if ( !v->domain->arch.hvm_domain.is_in_uc_mode )
752 {
753 /* Flush physical caches. */
754 on_each_cpu(local_flush_cache, NULL, 1, 1);
755 /* Shadow pagetables must recognise UC mode. */
756 v->domain->arch.hvm_domain.is_in_uc_mode = 1;
757 shadow_blow_tables_per_domain(v->domain);
758 }
759 spin_unlock(&v->domain->arch.hvm_domain.uc_lock);
760 }
761 else if ( !(value & (X86_CR0_CD | X86_CR0_NW)) &&
762 (v->arch.hvm_vcpu.cache_mode == NO_FILL_CACHE_MODE) )
763 {
764 /* Exit from no fill cache mode. */
765 spin_lock(&v->domain->arch.hvm_domain.uc_lock);
766 v->arch.hvm_vcpu.cache_mode = NORMAL_CACHE_MODE;
768 if ( domain_exit_uc_mode(v) )
769 {
770 /* Shadow pagetables must recognise normal caching mode. */
771 v->domain->arch.hvm_domain.is_in_uc_mode = 0;
772 shadow_blow_tables_per_domain(v->domain);
773 }
774 spin_unlock(&v->domain->arch.hvm_domain.uc_lock);
775 }
776 }
778 v->arch.hvm_vcpu.guest_cr[0] = value;
779 hvm_update_guest_cr(v, 0);
781 if ( (value ^ old_value) & X86_CR0_PG )
782 paging_update_paging_modes(v);
784 return 1;
785 }
787 int hvm_set_cr3(unsigned long value)
788 {
789 unsigned long mfn;
790 p2m_type_t p2mt;
791 struct vcpu *v = current;
793 if ( hvm_paging_enabled(v) && !paging_mode_hap(v->domain) &&
794 (value != v->arch.hvm_vcpu.guest_cr[3]) )
795 {
796 /* Shadow-mode CR3 change. Check PDBR and update refcounts. */
797 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
798 mfn = mfn_x(gfn_to_mfn_current(value >> PAGE_SHIFT, &p2mt));
799 if ( !p2m_is_ram(p2mt) || !mfn_valid(mfn) ||
800 !get_page(mfn_to_page(mfn), v->domain) )
801 goto bad_cr3;
803 put_page(pagetable_get_page(v->arch.guest_table));
804 v->arch.guest_table = pagetable_from_pfn(mfn);
806 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
807 }
809 v->arch.hvm_vcpu.guest_cr[3] = value;
810 paging_update_cr3(v);
811 return 1;
813 bad_cr3:
814 gdprintk(XENLOG_ERR, "Invalid CR3\n");
815 domain_crash(v->domain);
816 return 0;
817 }
819 int hvm_set_cr4(unsigned long value)
820 {
821 struct vcpu *v = current;
822 unsigned long old_cr;
824 if ( value & HVM_CR4_GUEST_RESERVED_BITS )
825 {
826 HVM_DBG_LOG(DBG_LEVEL_1,
827 "Guest attempts to set reserved bit in CR4: %lx",
828 value);
829 goto gpf;
830 }
832 if ( !(value & X86_CR4_PAE) && hvm_long_mode_enabled(v) )
833 {
834 HVM_DBG_LOG(DBG_LEVEL_1, "Guest cleared CR4.PAE while "
835 "EFER.LMA is set");
836 goto gpf;
837 }
839 old_cr = v->arch.hvm_vcpu.guest_cr[4];
840 v->arch.hvm_vcpu.guest_cr[4] = value;
841 hvm_update_guest_cr(v, 4);
843 /* Modifying CR4.{PSE,PAE,PGE} invalidates all TLB entries, inc. Global. */
844 if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
845 paging_update_paging_modes(v);
847 return 1;
849 gpf:
850 hvm_inject_exception(TRAP_gp_fault, 0, 0);
851 return 0;
852 }
854 int hvm_virtual_to_linear_addr(
855 enum x86_segment seg,
856 struct segment_register *reg,
857 unsigned long offset,
858 unsigned int bytes,
859 enum hvm_access_type access_type,
860 unsigned int addr_size,
861 unsigned long *linear_addr)
862 {
863 unsigned long addr = offset;
864 uint32_t last_byte;
866 if ( addr_size != 64 )
867 {
868 /*
869 * COMPATIBILITY MODE: Apply segment checks and add base.
870 */
872 switch ( access_type )
873 {
874 case hvm_access_read:
875 if ( (reg->attr.fields.type & 0xa) == 0x8 )
876 goto gpf; /* execute-only code segment */
877 break;
878 case hvm_access_write:
879 if ( (reg->attr.fields.type & 0xa) != 0x2 )
880 goto gpf; /* not a writable data segment */
881 break;
882 default:
883 break;
884 }
886 last_byte = offset + bytes - 1;
888 /* Is this a grows-down data segment? Special limit check if so. */
889 if ( (reg->attr.fields.type & 0xc) == 0x4 )
890 {
891 /* Is upper limit 0xFFFF or 0xFFFFFFFF? */
892 if ( !reg->attr.fields.db )
893 last_byte = (uint16_t)last_byte;
895 /* Check first byte and last byte against respective bounds. */
896 if ( (offset <= reg->limit) || (last_byte < offset) )
897 goto gpf;
898 }
899 else if ( (last_byte > reg->limit) || (last_byte < offset) )
900 goto gpf; /* last byte is beyond limit or wraps 0xFFFFFFFF */
902 /*
903 * Hardware truncates to 32 bits in compatibility mode.
904 * It does not truncate to 16 bits in 16-bit address-size mode.
905 */
906 addr = (uint32_t)(addr + reg->base);
907 }
908 else
909 {
910 /*
911 * LONG MODE: FS and GS add segment base. Addresses must be canonical.
912 */
914 if ( (seg == x86_seg_fs) || (seg == x86_seg_gs) )
915 addr += reg->base;
917 if ( !is_canonical_address(addr) )
918 goto gpf;
919 }
921 *linear_addr = addr;
922 return 1;
924 gpf:
925 return 0;
926 }
928 static void *hvm_map(unsigned long va, int size)
929 {
930 unsigned long gfn, mfn;
931 p2m_type_t p2mt;
932 uint32_t pfec;
934 if ( ((va & ~PAGE_MASK) + size) > PAGE_SIZE )
935 {
936 hvm_inject_exception(TRAP_page_fault, PFEC_write_access,
937 (va + PAGE_SIZE - 1) & PAGE_MASK);
938 return NULL;
939 }
941 /* We're mapping on behalf of the segment-load logic, which might
942 * write the accessed flags in the descriptors (in 32-bit mode), but
943 * we still treat it as a kernel-mode read (i.e. no access checks). */
944 pfec = PFEC_page_present;
945 gfn = paging_gva_to_gfn(current, va, &pfec);
946 mfn = mfn_x(gfn_to_mfn_current(gfn, &p2mt));
947 if ( !p2m_is_ram(p2mt) )
948 {
949 hvm_inject_exception(TRAP_page_fault, pfec, va);
950 return NULL;
951 }
953 ASSERT(mfn_valid(mfn));
955 paging_mark_dirty(current->domain, mfn);
957 return (char *)map_domain_page(mfn) + (va & ~PAGE_MASK);
958 }
960 static void hvm_unmap(void *p)
961 {
962 if ( p )
963 unmap_domain_page(p);
964 }
966 static int hvm_load_segment_selector(
967 struct vcpu *v, enum x86_segment seg, uint16_t sel)
968 {
969 struct segment_register desctab, cs, segr;
970 struct desc_struct *pdesc, desc;
971 u8 dpl, rpl, cpl;
972 int fault_type = TRAP_invalid_tss;
974 /* NULL selector? */
975 if ( (sel & 0xfffc) == 0 )
976 {
977 if ( (seg == x86_seg_cs) || (seg == x86_seg_ss) )
978 goto fail;
979 memset(&segr, 0, sizeof(segr));
980 hvm_set_segment_register(v, seg, &segr);
981 return 0;
982 }
984 /* LDT descriptor must be in the GDT. */
985 if ( (seg == x86_seg_ldtr) && (sel & 4) )
986 goto fail;
988 hvm_get_segment_register(v, x86_seg_cs, &cs);
989 hvm_get_segment_register(
990 v, (sel & 4) ? x86_seg_ldtr : x86_seg_gdtr, &desctab);
992 /* Check against descriptor table limit. */
993 if ( ((sel & 0xfff8) + 7) > desctab.limit )
994 goto fail;
996 pdesc = hvm_map(desctab.base + (sel & 0xfff8), 8);
997 if ( pdesc == NULL )
998 goto hvm_map_fail;
1000 do {
1001 desc = *pdesc;
1003 /* Segment present in memory? */
1004 if ( !(desc.b & (1u<<15)) )
1005 {
1006 fault_type = TRAP_no_segment;
1007 goto unmap_and_fail;
1008 }
1010 /* LDT descriptor is a system segment. All others are code/data. */
1011 if ( (desc.b & (1u<<12)) == ((seg == x86_seg_ldtr) << 12) )
1012 goto unmap_and_fail;
1014 dpl = (desc.b >> 13) & 3;
1015 rpl = sel & 3;
1016 cpl = cs.sel & 3;
1018 switch ( seg )
1019 {
1020 case x86_seg_cs:
1021 /* Code segment? */
1022 if ( !(desc.b & (1u<<11)) )
1023 goto unmap_and_fail;
1024 /* Non-conforming segment: check DPL against RPL. */
1025 if ( ((desc.b & (6u<<9)) != 6) && (dpl != rpl) )
1026 goto unmap_and_fail;
1027 break;
1028 case x86_seg_ss:
1029 /* Writable data segment? */
1030 if ( (desc.b & (5u<<9)) != (1u<<9) )
1031 goto unmap_and_fail;
1032 if ( (dpl != cpl) || (dpl != rpl) )
1033 goto unmap_and_fail;
1034 break;
1035 case x86_seg_ldtr:
1036 /* LDT system segment? */
1037 if ( (desc.b & (15u<<8)) != (2u<<8) )
1038 goto unmap_and_fail;
1039 goto skip_accessed_flag;
1040 default:
1041 /* Readable code or data segment? */
1042 if ( (desc.b & (5u<<9)) == (4u<<9) )
1043 goto unmap_and_fail;
1044 /* Non-conforming segment: check DPL against RPL and CPL. */
1045 if ( ((desc.b & (6u<<9)) != 6) && ((dpl < cpl) || (dpl < rpl)) )
1046 goto unmap_and_fail;
1047 break;
1048 }
1049 } while ( !(desc.b & 0x100) && /* Ensure Accessed flag is set */
1050 (cmpxchg(&pdesc->b, desc.b, desc.b | 0x100) != desc.b) );
1052 /* Force the Accessed flag in our local copy. */
1053 desc.b |= 0x100;
1055 skip_accessed_flag:
1056 hvm_unmap(pdesc);
1058 segr.base = (((desc.b << 0) & 0xff000000u) |
1059 ((desc.b << 16) & 0x00ff0000u) |
1060 ((desc.a >> 16) & 0x0000ffffu));
1061 segr.attr.bytes = (((desc.b >> 8) & 0x00ffu) |
1062 ((desc.b >> 12) & 0x0f00u));
1063 segr.limit = (desc.b & 0x000f0000u) | (desc.a & 0x0000ffffu);
1064 if ( segr.attr.fields.g )
1065 segr.limit = (segr.limit << 12) | 0xfffu;
1066 segr.sel = sel;
1067 hvm_set_segment_register(v, seg, &segr);
1069 return 0;
1071 unmap_and_fail:
1072 hvm_unmap(pdesc);
1073 fail:
1074 hvm_inject_exception(fault_type, sel & 0xfffc, 0);
1075 hvm_map_fail:
1076 return 1;
1077 }
1079 void hvm_task_switch(
1080 uint16_t tss_sel, enum hvm_task_switch_reason taskswitch_reason,
1081 int32_t errcode)
1082 {
1083 struct vcpu *v = current;
1084 struct cpu_user_regs *regs = guest_cpu_user_regs();
1085 struct segment_register gdt, tr, prev_tr, segr;
1086 struct desc_struct *optss_desc = NULL, *nptss_desc = NULL, tss_desc;
1087 unsigned long eflags;
1088 int exn_raised;
1089 struct {
1090 u16 back_link,__blh;
1091 u32 esp0;
1092 u16 ss0, _0;
1093 u32 esp1;
1094 u16 ss1, _1;
1095 u32 esp2;
1096 u16 ss2, _2;
1097 u32 cr3, eip, eflags, eax, ecx, edx, ebx, esp, ebp, esi, edi;
1098 u16 es, _3, cs, _4, ss, _5, ds, _6, fs, _7, gs, _8, ldt, _9;
1099 u16 trace, iomap;
1100 } *ptss, tss;
1102 hvm_get_segment_register(v, x86_seg_gdtr, &gdt);
1103 hvm_get_segment_register(v, x86_seg_tr, &prev_tr);
1105 if ( ((tss_sel & 0xfff8) + 7) > gdt.limit )
1106 {
1107 hvm_inject_exception((taskswitch_reason == TSW_iret) ?
1108 TRAP_invalid_tss : TRAP_gp_fault,
1109 tss_sel & 0xfff8, 0);
1110 goto out;
1111 }
1113 optss_desc = hvm_map(gdt.base + (prev_tr.sel & 0xfff8), 8);
1114 if ( optss_desc == NULL )
1115 goto out;
1117 nptss_desc = hvm_map(gdt.base + (tss_sel & 0xfff8), 8);
1118 if ( nptss_desc == NULL )
1119 goto out;
1121 tss_desc = *nptss_desc;
1122 tr.sel = tss_sel;
1123 tr.base = (((tss_desc.b << 0) & 0xff000000u) |
1124 ((tss_desc.b << 16) & 0x00ff0000u) |
1125 ((tss_desc.a >> 16) & 0x0000ffffu));
1126 tr.attr.bytes = (((tss_desc.b >> 8) & 0x00ffu) |
1127 ((tss_desc.b >> 12) & 0x0f00u));
1128 tr.limit = (tss_desc.b & 0x000f0000u) | (tss_desc.a & 0x0000ffffu);
1129 if ( tr.attr.fields.g )
1130 tr.limit = (tr.limit << 12) | 0xfffu;
1132 if ( !tr.attr.fields.p )
1133 {
1134 hvm_inject_exception(TRAP_no_segment, tss_sel & 0xfff8, 0);
1135 goto out;
1136 }
1138 if ( tr.attr.fields.type != ((taskswitch_reason == TSW_iret) ? 0xb : 0x9) )
1139 {
1140 hvm_inject_exception(
1141 (taskswitch_reason == TSW_iret) ? TRAP_invalid_tss : TRAP_gp_fault,
1142 tss_sel & 0xfff8, 0);
1143 goto out;
1144 }
1146 if ( !tr.attr.fields.g && (tr.limit < (sizeof(tss)-1)) )
1147 {
1148 hvm_inject_exception(TRAP_invalid_tss, tss_sel & 0xfff8, 0);
1149 goto out;
1150 }
1152 ptss = hvm_map(prev_tr.base, sizeof(tss));
1153 if ( ptss == NULL )
1154 goto out;
1156 eflags = regs->eflags;
1157 if ( taskswitch_reason == TSW_iret )
1158 eflags &= ~X86_EFLAGS_NT;
1160 ptss->cr3 = v->arch.hvm_vcpu.guest_cr[3];
1161 ptss->eip = regs->eip;
1162 ptss->eflags = eflags;
1163 ptss->eax = regs->eax;
1164 ptss->ecx = regs->ecx;
1165 ptss->edx = regs->edx;
1166 ptss->ebx = regs->ebx;
1167 ptss->esp = regs->esp;
1168 ptss->ebp = regs->ebp;
1169 ptss->esi = regs->esi;
1170 ptss->edi = regs->edi;
1172 hvm_get_segment_register(v, x86_seg_es, &segr);
1173 ptss->es = segr.sel;
1174 hvm_get_segment_register(v, x86_seg_cs, &segr);
1175 ptss->cs = segr.sel;
1176 hvm_get_segment_register(v, x86_seg_ss, &segr);
1177 ptss->ss = segr.sel;
1178 hvm_get_segment_register(v, x86_seg_ds, &segr);
1179 ptss->ds = segr.sel;
1180 hvm_get_segment_register(v, x86_seg_fs, &segr);
1181 ptss->fs = segr.sel;
1182 hvm_get_segment_register(v, x86_seg_gs, &segr);
1183 ptss->gs = segr.sel;
1184 hvm_get_segment_register(v, x86_seg_ldtr, &segr);
1185 ptss->ldt = segr.sel;
1187 hvm_unmap(ptss);
1189 ptss = hvm_map(tr.base, sizeof(tss));
1190 if ( ptss == NULL )
1191 goto out;
1193 if ( !hvm_set_cr3(ptss->cr3) )
1194 {
1195 hvm_unmap(ptss);
1196 goto out;
1197 }
1199 regs->eip = ptss->eip;
1200 regs->eflags = ptss->eflags | 2;
1201 regs->eax = ptss->eax;
1202 regs->ecx = ptss->ecx;
1203 regs->edx = ptss->edx;
1204 regs->ebx = ptss->ebx;
1205 regs->esp = ptss->esp;
1206 regs->ebp = ptss->ebp;
1207 regs->esi = ptss->esi;
1208 regs->edi = ptss->edi;
1210 if ( (taskswitch_reason == TSW_call_or_int) )
1211 {
1212 regs->eflags |= X86_EFLAGS_NT;
1213 ptss->back_link = prev_tr.sel;
1214 }
1216 exn_raised = 0;
1217 if ( hvm_load_segment_selector(v, x86_seg_es, ptss->es) ||
1218 hvm_load_segment_selector(v, x86_seg_cs, ptss->cs) ||
1219 hvm_load_segment_selector(v, x86_seg_ss, ptss->ss) ||
1220 hvm_load_segment_selector(v, x86_seg_ds, ptss->ds) ||
1221 hvm_load_segment_selector(v, x86_seg_fs, ptss->fs) ||
1222 hvm_load_segment_selector(v, x86_seg_gs, ptss->gs) ||
1223 hvm_load_segment_selector(v, x86_seg_ldtr, ptss->ldt) )
1224 exn_raised = 1;
1226 if ( (ptss->trace & 1) && !exn_raised )
1227 hvm_inject_exception(TRAP_debug, tss_sel & 0xfff8, 0);
1229 hvm_unmap(ptss);
1231 tr.attr.fields.type = 0xb; /* busy 32-bit tss */
1232 hvm_set_segment_register(v, x86_seg_tr, &tr);
1234 v->arch.hvm_vcpu.guest_cr[0] |= X86_CR0_TS;
1235 hvm_update_guest_cr(v, 0);
1237 if ( (taskswitch_reason == TSW_iret) ||
1238 (taskswitch_reason == TSW_jmp) )
1239 clear_bit(41, optss_desc); /* clear B flag of old task */
1241 if ( taskswitch_reason != TSW_iret )
1242 set_bit(41, nptss_desc); /* set B flag of new task */
1244 if ( errcode >= 0 )
1245 {
1246 struct segment_register reg;
1247 unsigned long linear_addr;
1248 regs->esp -= 4;
1249 hvm_get_segment_register(current, x86_seg_ss, &reg);
1250 /* Todo: do not ignore access faults here. */
1251 if ( hvm_virtual_to_linear_addr(x86_seg_ss, &reg, regs->esp,
1252 4, hvm_access_write, 32,
1253 &linear_addr) )
1254 hvm_copy_to_guest_virt(linear_addr, &errcode, 4);
1255 }
1257 out:
1258 hvm_unmap(optss_desc);
1259 hvm_unmap(nptss_desc);
1260 }
1262 /*
1263 * __hvm_copy():
1264 * @buf = hypervisor buffer
1265 * @addr = guest address to copy to/from
1266 * @size = number of bytes to copy
1267 * @dir = copy *to* guest (TRUE) or *from* guest (FALSE)?
1268 * @virt = addr is *virtual* (TRUE) or *guest physical* (FALSE)?
1269 * @fetch = copy is an instruction fetch?
1270 * Returns number of bytes failed to copy (0 == complete success).
1271 */
1272 static int __hvm_copy(void *buf, paddr_t addr, int size, int dir,
1273 int virt, int fetch)
1274 {
1275 unsigned long gfn, mfn;
1276 p2m_type_t p2mt;
1277 char *p;
1278 int count, todo;
1279 uint32_t pfec = PFEC_page_present;
1281 if ( dir )
1282 pfec |= PFEC_write_access;
1283 if ( ring_3(guest_cpu_user_regs()) )
1284 pfec |= PFEC_user_mode;
1285 if ( fetch )
1286 pfec |= PFEC_insn_fetch;
1288 todo = size;
1289 while ( todo > 0 )
1290 {
1291 count = min_t(int, PAGE_SIZE - (addr & ~PAGE_MASK), todo);
1293 if ( virt )
1294 gfn = paging_gva_to_gfn(current, addr, &pfec);
1295 else
1296 gfn = addr >> PAGE_SHIFT;
1298 mfn = mfn_x(gfn_to_mfn_current(gfn, &p2mt));
1300 if ( !p2m_is_ram(p2mt) )
1301 return todo;
1302 ASSERT(mfn_valid(mfn));
1304 p = (char *)map_domain_page(mfn) + (addr & ~PAGE_MASK);
1306 if ( dir )
1307 {
1308 memcpy(p, buf, count); /* dir == TRUE: *to* guest */
1309 paging_mark_dirty(current->domain, mfn);
1310 }
1311 else
1312 memcpy(buf, p, count); /* dir == FALSE: *from* guest */
1314 unmap_domain_page(p);
1316 addr += count;
1317 buf += count;
1318 todo -= count;
1319 }
1321 return 0;
1322 }
1324 int hvm_copy_to_guest_phys(paddr_t paddr, void *buf, int size)
1325 {
1326 return __hvm_copy(buf, paddr, size, 1, 0, 0);
1327 }
1329 int hvm_copy_from_guest_phys(void *buf, paddr_t paddr, int size)
1330 {
1331 return __hvm_copy(buf, paddr, size, 0, 0, 0);
1332 }
1334 int hvm_copy_to_guest_virt(unsigned long vaddr, void *buf, int size)
1335 {
1336 return __hvm_copy(buf, vaddr, size, 1, 1, 0);
1337 }
1339 int hvm_copy_from_guest_virt(void *buf, unsigned long vaddr, int size)
1340 {
1341 return __hvm_copy(buf, vaddr, size, 0, 1, 0);
1342 }
1344 int hvm_fetch_from_guest_virt(void *buf, unsigned long vaddr, int size)
1345 {
1346 return __hvm_copy(buf, vaddr, size, 0, 1, hvm_nx_enabled(current));
1347 }
1350 /* HVM specific printbuf. Mostly used for hvmloader chit-chat. */
1351 void hvm_print_line(struct vcpu *v, const char c)
1352 {
1353 struct hvm_domain *hd = &v->domain->arch.hvm_domain;
1355 spin_lock(&hd->pbuf_lock);
1356 hd->pbuf[hd->pbuf_idx++] = c;
1357 if ( (hd->pbuf_idx == (sizeof(hd->pbuf) - 2)) || (c == '\n') )
1358 {
1359 if ( c != '\n' )
1360 hd->pbuf[hd->pbuf_idx++] = '\n';
1361 hd->pbuf[hd->pbuf_idx] = '\0';
1362 printk(XENLOG_G_DEBUG "HVM%u: %s", v->domain->domain_id, hd->pbuf);
1363 hd->pbuf_idx = 0;
1364 }
1365 spin_unlock(&hd->pbuf_lock);
1366 }
1368 void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
1369 unsigned int *ecx, unsigned int *edx)
1370 {
1371 struct vcpu *v = current;
1373 if ( cpuid_hypervisor_leaves(input, eax, ebx, ecx, edx) )
1374 return;
1376 cpuid(input, eax, ebx, ecx, edx);
1378 switch ( input )
1379 {
1380 case 0x00000001:
1381 __clear_bit(X86_FEATURE_MWAIT & 31, ecx);
1383 if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
1384 __clear_bit(X86_FEATURE_APIC & 31, edx);
1386 #if CONFIG_PAGING_LEVELS >= 3
1387 if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
1388 #endif
1389 __clear_bit(X86_FEATURE_PAE & 31, edx);
1390 __clear_bit(X86_FEATURE_PSE36 & 31, edx);
1391 break;
1393 case 0x80000001:
1394 #if CONFIG_PAGING_LEVELS >= 3
1395 if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
1396 #endif
1397 __clear_bit(X86_FEATURE_NX & 31, edx);
1398 #ifdef __i386__
1399 /* Mask feature for Intel ia32e or AMD long mode. */
1400 __clear_bit(X86_FEATURE_LAHF_LM & 31, ecx);
1401 __clear_bit(X86_FEATURE_LM & 31, edx);
1402 __clear_bit(X86_FEATURE_SYSCALL & 31, edx);
1403 #endif
1404 break;
1405 }
1406 }
1408 enum hvm_intblk hvm_interrupt_blocked(struct vcpu *v, struct hvm_intack intack)
1409 {
1410 enum hvm_intblk r;
1411 ASSERT(v == current);
1413 r = hvm_funcs.interrupt_blocked(v, intack);
1414 if ( r != hvm_intblk_none )
1415 return r;
1417 if ( intack.source == hvm_intsrc_lapic )
1418 {
1419 uint32_t tpr = vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI) & 0xF0;
1420 if ( (tpr >> 4) >= (intack.vector >> 4) )
1421 return hvm_intblk_tpr;
1422 }
1424 return r;
1425 }
1427 static long hvm_grant_table_op(
1428 unsigned int cmd, XEN_GUEST_HANDLE(void) uop, unsigned int count)
1429 {
1430 if ( cmd != GNTTABOP_query_size )
1431 return -ENOSYS; /* all other commands need auditing */
1432 return do_grant_table_op(cmd, uop, count);
1433 }
1435 typedef unsigned long hvm_hypercall_t(
1436 unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
1438 #define HYPERCALL(x) \
1439 [ __HYPERVISOR_ ## x ] = (hvm_hypercall_t *) do_ ## x
1441 #if defined(__i386__)
1443 static hvm_hypercall_t *hvm_hypercall32_table[NR_hypercalls] = {
1444 HYPERCALL(memory_op),
1445 [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
1446 HYPERCALL(xen_version),
1447 HYPERCALL(event_channel_op),
1448 HYPERCALL(sched_op),
1449 HYPERCALL(hvm_op)
1450 };
1452 #else /* defined(__x86_64__) */
1454 static long do_memory_op_compat32(int cmd, XEN_GUEST_HANDLE(void) arg)
1455 {
1456 extern long do_add_to_physmap(struct xen_add_to_physmap *xatp);
1457 long rc;
1459 switch ( cmd )
1460 {
1461 case XENMEM_add_to_physmap:
1462 {
1463 struct {
1464 domid_t domid;
1465 uint32_t space;
1466 uint32_t idx;
1467 uint32_t gpfn;
1468 } u;
1469 struct xen_add_to_physmap h;
1471 if ( copy_from_guest(&u, arg, 1) )
1472 return -EFAULT;
1474 h.domid = u.domid;
1475 h.space = u.space;
1476 h.idx = u.idx;
1477 h.gpfn = u.gpfn;
1479 this_cpu(guest_handles_in_xen_space) = 1;
1480 rc = do_memory_op(cmd, guest_handle_from_ptr(&h, void));
1481 this_cpu(guest_handles_in_xen_space) = 0;
1483 break;
1484 }
1486 default:
1487 gdprintk(XENLOG_WARNING, "memory_op %d.\n", cmd);
1488 rc = -ENOSYS;
1489 break;
1490 }
1492 return rc;
1493 }
1495 static hvm_hypercall_t *hvm_hypercall64_table[NR_hypercalls] = {
1496 HYPERCALL(memory_op),
1497 [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
1498 HYPERCALL(xen_version),
1499 HYPERCALL(event_channel_op),
1500 HYPERCALL(sched_op),
1501 HYPERCALL(hvm_op)
1502 };
1504 static hvm_hypercall_t *hvm_hypercall32_table[NR_hypercalls] = {
1505 [ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)do_memory_op_compat32,
1506 [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
1507 HYPERCALL(xen_version),
1508 HYPERCALL(event_channel_op),
1509 HYPERCALL(sched_op),
1510 HYPERCALL(hvm_op)
1511 };
1513 #endif /* defined(__x86_64__) */
1515 int hvm_do_hypercall(struct cpu_user_regs *regs)
1516 {
1517 int flush, mode = hvm_guest_x86_mode(current);
1518 uint32_t eax = regs->eax;
1520 switch ( mode )
1521 {
1522 #ifdef __x86_64__
1523 case 8:
1524 #endif
1525 case 4:
1526 case 2:
1527 if ( unlikely(ring_3(regs)) )
1528 {
1529 default:
1530 regs->eax = -EPERM;
1531 return HVM_HCALL_completed;
1532 }
1533 case 0:
1534 break;
1535 }
1537 if ( (eax >= NR_hypercalls) || !hvm_hypercall32_table[eax] )
1538 {
1539 regs->eax = -ENOSYS;
1540 return HVM_HCALL_completed;
1541 }
1543 /*
1544 * NB. In future flush only on decrease_reservation.
1545 * For now we also need to flush when pages are added, as qemu-dm is not
1546 * yet capable of faulting pages into an existing valid mapcache bucket.
1547 */
1548 flush = ((eax == __HYPERVISOR_memory_op) ||
1549 (eax == __HYPERVISOR_grant_table_op)); /* needed ? */
1550 this_cpu(hc_preempted) = 0;
1552 #ifdef __x86_64__
1553 if ( mode == 8 )
1554 {
1555 HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u(%lx, %lx, %lx, %lx, %lx)", eax,
1556 regs->rdi, regs->rsi, regs->rdx, regs->r10, regs->r8);
1558 regs->rax = hvm_hypercall64_table[eax](regs->rdi,
1559 regs->rsi,
1560 regs->rdx,
1561 regs->r10,
1562 regs->r8);
1563 }
1564 else
1565 #endif
1566 {
1567 HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u(%x, %x, %x, %x, %x)", eax,
1568 (uint32_t)regs->ebx, (uint32_t)regs->ecx,
1569 (uint32_t)regs->edx, (uint32_t)regs->esi,
1570 (uint32_t)regs->edi);
1572 regs->eax = hvm_hypercall32_table[eax]((uint32_t)regs->ebx,
1573 (uint32_t)regs->ecx,
1574 (uint32_t)regs->edx,
1575 (uint32_t)regs->esi,
1576 (uint32_t)regs->edi);
1577 }
1579 HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u -> %lx",
1580 eax, (unsigned long)regs->eax);
1582 return (this_cpu(hc_preempted) ? HVM_HCALL_preempted :
1583 flush ? HVM_HCALL_invalidate : HVM_HCALL_completed);
1584 }
1586 static void hvm_latch_shinfo_size(struct domain *d)
1587 {
1588 /*
1589 * Called from operations which are among the very first executed by
1590 * PV drivers on initialisation or after save/restore. These are sensible
1591 * points at which to sample the execution mode of the guest and latch
1592 * 32- or 64-bit format for shared state.
1593 */
1594 if ( current->domain == d )
1595 d->arch.has_32bit_shinfo = (hvm_guest_x86_mode(current) != 8);
1596 }
1598 /* Initialise a hypercall transfer page for a VMX domain using
1599 paravirtualised drivers. */
1600 void hvm_hypercall_page_initialise(struct domain *d,
1601 void *hypercall_page)
1602 {
1603 hvm_latch_shinfo_size(d);
1604 hvm_funcs.init_hypercall_page(d, hypercall_page);
1605 }
1607 int hvm_bringup_ap(int vcpuid, int trampoline_vector)
1608 {
1609 struct domain *d = current->domain;
1610 struct vcpu *v;
1611 struct vcpu_guest_context *ctxt;
1612 struct segment_register reg;
1614 ASSERT(is_hvm_domain(d));
1616 if ( (v = d->vcpu[vcpuid]) == NULL )
1617 return -ENOENT;
1619 v->fpu_initialised = 0;
1620 v->arch.flags |= TF_kernel_mode;
1621 v->is_initialised = 1;
1623 ctxt = &v->arch.guest_context;
1624 memset(ctxt, 0, sizeof(*ctxt));
1625 ctxt->flags = VGCF_online;
1626 ctxt->user_regs.eflags = 2;
1628 #ifdef VMXASSIST
1629 if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
1630 {
1631 ctxt->user_regs.eip = VMXASSIST_BASE;
1632 ctxt->user_regs.edx = vcpuid;
1633 ctxt->user_regs.ebx = trampoline_vector;
1634 goto done;
1635 }
1636 #endif
1638 v->arch.hvm_vcpu.guest_cr[0] = X86_CR0_ET;
1639 hvm_update_guest_cr(v, 0);
1641 v->arch.hvm_vcpu.guest_cr[2] = 0;
1642 hvm_update_guest_cr(v, 2);
1644 v->arch.hvm_vcpu.guest_cr[3] = 0;
1645 hvm_update_guest_cr(v, 3);
1647 v->arch.hvm_vcpu.guest_cr[4] = 0;
1648 hvm_update_guest_cr(v, 4);
1650 v->arch.hvm_vcpu.guest_efer = 0;
1651 hvm_update_guest_efer(v);
1653 reg.sel = trampoline_vector << 8;
1654 reg.base = (uint32_t)reg.sel << 4;
1655 reg.limit = 0xffff;
1656 reg.attr.bytes = 0x89b;
1657 hvm_set_segment_register(v, x86_seg_cs, &reg);
1659 reg.sel = reg.base = 0;
1660 reg.limit = 0xffff;
1661 reg.attr.bytes = 0x893;
1662 hvm_set_segment_register(v, x86_seg_ds, &reg);
1663 hvm_set_segment_register(v, x86_seg_es, &reg);
1664 hvm_set_segment_register(v, x86_seg_fs, &reg);
1665 hvm_set_segment_register(v, x86_seg_gs, &reg);
1666 hvm_set_segment_register(v, x86_seg_ss, &reg);
1668 reg.attr.bytes = 0x82; /* LDT */
1669 hvm_set_segment_register(v, x86_seg_ldtr, &reg);
1671 reg.attr.bytes = 0x8b; /* 32-bit TSS (busy) */
1672 hvm_set_segment_register(v, x86_seg_tr, &reg);
1674 reg.attr.bytes = 0;
1675 hvm_set_segment_register(v, x86_seg_gdtr, &reg);
1676 hvm_set_segment_register(v, x86_seg_idtr, &reg);
1678 #ifdef VMXASSIST
1679 done:
1680 #endif
1681 /* Sync AP's TSC with BSP's. */
1682 v->arch.hvm_vcpu.cache_tsc_offset =
1683 v->domain->vcpu[0]->arch.hvm_vcpu.cache_tsc_offset;
1684 hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset);
1686 if ( test_and_clear_bit(_VPF_down, &v->pause_flags) )
1687 vcpu_wake(v);
1689 gdprintk(XENLOG_INFO, "AP %d bringup succeeded.\n", vcpuid);
1690 return 0;
1691 }
1693 static int hvmop_set_pci_intx_level(
1694 XEN_GUEST_HANDLE(xen_hvm_set_pci_intx_level_t) uop)
1695 {
1696 struct xen_hvm_set_pci_intx_level op;
1697 struct domain *d;
1698 int rc;
1700 if ( copy_from_guest(&op, uop, 1) )
1701 return -EFAULT;
1703 if ( !IS_PRIV(current->domain) )
1704 return -EPERM;
1706 if ( (op.domain > 0) || (op.bus > 0) || (op.device > 31) || (op.intx > 3) )
1707 return -EINVAL;
1709 d = rcu_lock_domain_by_id(op.domid);
1710 if ( d == NULL )
1711 return -ESRCH;
1713 rc = -EINVAL;
1714 if ( !is_hvm_domain(d) )
1715 goto out;
1717 rc = xsm_hvm_set_pci_intx_level(d);
1718 if ( rc )
1719 goto out;
1721 rc = 0;
1722 switch ( op.level )
1723 {
1724 case 0:
1725 hvm_pci_intx_deassert(d, op.device, op.intx);
1726 break;
1727 case 1:
1728 hvm_pci_intx_assert(d, op.device, op.intx);
1729 break;
1730 default:
1731 rc = -EINVAL;
1732 break;
1733 }
1735 out:
1736 rcu_unlock_domain(d);
1737 return rc;
1738 }
1740 static int hvmop_set_isa_irq_level(
1741 XEN_GUEST_HANDLE(xen_hvm_set_isa_irq_level_t) uop)
1742 {
1743 struct xen_hvm_set_isa_irq_level op;
1744 struct domain *d;
1745 int rc;
1747 if ( copy_from_guest(&op, uop, 1) )
1748 return -EFAULT;
1750 if ( !IS_PRIV(current->domain) )
1751 return -EPERM;
1753 if ( op.isa_irq > 15 )
1754 return -EINVAL;
1756 d = rcu_lock_domain_by_id(op.domid);
1757 if ( d == NULL )
1758 return -ESRCH;
1760 rc = -EINVAL;
1761 if ( !is_hvm_domain(d) )
1762 goto out;
1764 rc = xsm_hvm_set_isa_irq_level(d);
1765 if ( rc )
1766 goto out;
1768 rc = 0;
1769 switch ( op.level )
1770 {
1771 case 0:
1772 hvm_isa_irq_deassert(d, op.isa_irq);
1773 break;
1774 case 1:
1775 hvm_isa_irq_assert(d, op.isa_irq);
1776 break;
1777 default:
1778 rc = -EINVAL;
1779 break;
1780 }
1782 out:
1783 rcu_unlock_domain(d);
1784 return rc;
1785 }
1787 static int hvmop_set_pci_link_route(
1788 XEN_GUEST_HANDLE(xen_hvm_set_pci_link_route_t) uop)
1789 {
1790 struct xen_hvm_set_pci_link_route op;
1791 struct domain *d;
1792 int rc;
1794 if ( copy_from_guest(&op, uop, 1) )
1795 return -EFAULT;
1797 if ( !IS_PRIV(current->domain) )
1798 return -EPERM;
1800 if ( (op.link > 3) || (op.isa_irq > 15) )
1801 return -EINVAL;
1803 d = rcu_lock_domain_by_id(op.domid);
1804 if ( d == NULL )
1805 return -ESRCH;
1807 rc = -EINVAL;
1808 if ( !is_hvm_domain(d) )
1809 goto out;
1811 rc = xsm_hvm_set_pci_link_route(d);
1812 if ( rc )
1813 goto out;
1815 rc = 0;
1816 hvm_set_pci_link_route(d, op.link, op.isa_irq);
1818 out:
1819 rcu_unlock_domain(d);
1820 return rc;
1821 }
1823 static int hvmop_flush_tlb_all(void)
1824 {
1825 struct domain *d = current->domain;
1826 struct vcpu *v;
1828 /* Avoid deadlock if more than one vcpu tries this at the same time. */
1829 if ( !spin_trylock(&d->hypercall_deadlock_mutex) )
1830 return -EAGAIN;
1832 /* Pause all other vcpus. */
1833 for_each_vcpu ( d, v )
1834 if ( v != current )
1835 vcpu_pause_nosync(v);
1837 /* Now that all VCPUs are signalled to deschedule, we wait... */
1838 for_each_vcpu ( d, v )
1839 if ( v != current )
1840 while ( !vcpu_runnable(v) && v->is_running )
1841 cpu_relax();
1843 /* All other vcpus are paused, safe to unlock now. */
1844 spin_unlock(&d->hypercall_deadlock_mutex);
1846 /* Flush paging-mode soft state (e.g., va->gfn cache; PAE PDPE cache). */
1847 for_each_vcpu ( d, v )
1848 paging_update_cr3(v);
1850 /* Flush all dirty TLBs. */
1851 flush_tlb_mask(d->domain_dirty_cpumask);
1853 /* Done. */
1854 for_each_vcpu ( d, v )
1855 if ( v != current )
1856 vcpu_unpause(v);
1858 return 0;
1859 }
1861 long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE(void) arg)
1862 {
1864 long rc = 0;
1866 switch ( op )
1867 {
1868 case HVMOP_set_param:
1869 case HVMOP_get_param:
1870 {
1871 struct xen_hvm_param a;
1872 struct hvm_ioreq_page *iorp;
1873 struct domain *d;
1874 struct vcpu *v;
1876 if ( copy_from_guest(&a, arg, 1) )
1877 return -EFAULT;
1879 if ( a.index >= HVM_NR_PARAMS )
1880 return -EINVAL;
1882 if ( a.domid == DOMID_SELF )
1883 d = rcu_lock_current_domain();
1884 else if ( IS_PRIV(current->domain) )
1885 d = rcu_lock_domain_by_id(a.domid);
1886 else
1887 return -EPERM;
1889 if ( d == NULL )
1890 return -ESRCH;
1892 rc = -EINVAL;
1893 if ( !is_hvm_domain(d) )
1894 goto param_fail;
1896 rc = xsm_hvm_param(d, op);
1897 if ( rc )
1898 goto param_fail;
1900 if ( op == HVMOP_set_param )
1901 {
1902 switch ( a.index )
1903 {
1904 case HVM_PARAM_IOREQ_PFN:
1905 iorp = &d->arch.hvm_domain.ioreq;
1906 rc = hvm_set_ioreq_page(d, iorp, a.value);
1907 spin_lock(&iorp->lock);
1908 if ( (rc == 0) && (iorp->va != NULL) )
1909 /* Initialise evtchn port info if VCPUs already created. */
1910 for_each_vcpu ( d, v )
1911 get_ioreq(v)->vp_eport = v->arch.hvm_vcpu.xen_port;
1912 spin_unlock(&iorp->lock);
1913 break;
1914 case HVM_PARAM_BUFIOREQ_PFN:
1915 iorp = &d->arch.hvm_domain.buf_ioreq;
1916 rc = hvm_set_ioreq_page(d, iorp, a.value);
1917 break;
1918 case HVM_PARAM_CALLBACK_IRQ:
1919 hvm_set_callback_via(d, a.value);
1920 hvm_latch_shinfo_size(d);
1921 break;
1922 case HVM_PARAM_TIMER_MODE:
1923 rc = -EINVAL;
1924 if ( a.value > HVMPTM_one_missed_tick_pending )
1925 goto param_fail;
1926 break;
1927 }
1928 d->arch.hvm_domain.params[a.index] = a.value;
1929 rc = 0;
1930 }
1931 else
1932 {
1933 a.value = d->arch.hvm_domain.params[a.index];
1934 rc = copy_to_guest(arg, &a, 1) ? -EFAULT : 0;
1935 }
1937 HVM_DBG_LOG(DBG_LEVEL_HCALL, "%s param %u = %"PRIx64,
1938 op == HVMOP_set_param ? "set" : "get",
1939 a.index, a.value);
1941 param_fail:
1942 rcu_unlock_domain(d);
1943 break;
1944 }
1946 case HVMOP_set_pci_intx_level:
1947 rc = hvmop_set_pci_intx_level(
1948 guest_handle_cast(arg, xen_hvm_set_pci_intx_level_t));
1949 break;
1951 case HVMOP_set_isa_irq_level:
1952 rc = hvmop_set_isa_irq_level(
1953 guest_handle_cast(arg, xen_hvm_set_isa_irq_level_t));
1954 break;
1956 case HVMOP_set_pci_link_route:
1957 rc = hvmop_set_pci_link_route(
1958 guest_handle_cast(arg, xen_hvm_set_pci_link_route_t));
1959 break;
1961 case HVMOP_flush_tlbs:
1962 rc = guest_handle_is_null(arg) ? hvmop_flush_tlb_all() : -ENOSYS;
1963 break;
1965 default:
1966 {
1967 gdprintk(XENLOG_WARNING, "Bad HVM op %ld.\n", op);
1968 rc = -ENOSYS;
1969 break;
1970 }
1971 }
1973 if ( rc == -EAGAIN )
1974 rc = hypercall_create_continuation(
1975 __HYPERVISOR_hvm_op, "lh", op, arg);
1977 return rc;
1978 }
1980 /*
1981 * Local variables:
1982 * mode: C
1983 * c-set-style: "BSD"
1984 * c-basic-offset: 4
1985 * tab-width: 4
1986 * indent-tabs-mode: nil
1987 * End:
1988 */
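
A usage note on the copy helpers above: __hvm_copy() and its wrappers return the number of bytes that could not be copied, with 0 meaning complete success. The hypothetical caller below only illustrates that convention; copy_example() and its payload are assumptions, not part of hvm.c.

    /* Hypothetical caller: a non-zero return from hvm_copy_to_guest_virt()
     * is a count of uncopied bytes, not an errno value. */
    static int copy_example(unsigned long guest_va)
    {
        uint32_t val = 0x12345678;   /* illustrative payload */

        if ( hvm_copy_to_guest_virt(guest_va, &val, sizeof(val)) != 0 )
            return -1;  /* some bytes were not copied */

        return 0;
    }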