ia64/xen-unstable: xen/arch/x86/hvm/hvm.c @ 17571:b6aa55ca599e

shadow: track video RAM dirty bits

This adds a new HVM op that enables tracking the dirty bits of a range of
video RAM. The idea is to optimize just for the most common case
(a single guest mapping, with occasional temporary extra mappings),
which keeps the overhead on the shadow code as low as possible.

Signed-off-by: Samuel Thibault <samuel.thibault@eu.citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri May 02 15:08:27 2008 +0100 (2008-05-02)
parents 18727843db60
children e6f20d5ed5fe
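
Usage note (illustrative, not part of the file below): a device model can poll
the dirty state of the emulated framebuffer by issuing the new
HVMOP_track_dirty_vram operation over the guest's video RAM range. The sketch
relies only on the field names visible in the do_hvm_op() handler later in
this file (domid, first_pfn, nr, dirty_bitmap); poll_vram_dirty() and
issue_hvm_op() are hypothetical helpers standing in for the toolstack's
hypercall path, and the exact guest-handle setup for dirty_bitmap depends on
the public ABI.

/*
 * Minimal sketch, assuming a toolstack-side hypercall wrapper issue_hvm_op()
 * and the Xen public headers (xen/hvm/hvm_op.h) plus <string.h>.
 */
static int poll_vram_dirty(domid_t domid, uint64_t first_pfn,
                           uint64_t nr_pfns, uint8_t *bitmap)
{
    struct xen_hvm_track_dirty_vram op;

    memset(&op, 0, sizeof(op));
    op.domid     = domid;      /* guest whose video RAM is tracked */
    op.first_pfn = first_pfn;  /* start of the framebuffer range, in PFNs */
    op.nr        = nr_pfns;    /* number of pages to track */

    /* Xen writes one bit per page; bitmap must hold at least nr_pfns bits. */
    set_xen_guest_handle(op.dirty_bitmap, bitmap);

    return issue_hvm_op(HVMOP_track_dirty_vram, &op);
}
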
line source
1 /*
2 * hvm.c: Common hardware virtual machine abstractions.
3 *
4 * Copyright (c) 2004, Intel Corporation.
5 * Copyright (c) 2005, International Business Machines Corporation.
6 * Copyright (c) 2008, Citrix Systems, Inc.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along with
18 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
19 * Place - Suite 330, Boston, MA 02111-1307 USA.
20 */
22 #include <xen/config.h>
23 #include <xen/init.h>
24 #include <xen/lib.h>
25 #include <xen/trace.h>
26 #include <xen/sched.h>
27 #include <xen/irq.h>
28 #include <xen/softirq.h>
29 #include <xen/domain.h>
30 #include <xen/domain_page.h>
31 #include <xen/hypercall.h>
32 #include <xen/guest_access.h>
33 #include <xen/event.h>
34 #include <asm/current.h>
35 #include <asm/e820.h>
36 #include <asm/io.h>
37 #include <asm/paging.h>
38 #include <asm/regs.h>
39 #include <asm/cpufeature.h>
40 #include <asm/processor.h>
41 #include <asm/types.h>
42 #include <asm/msr.h>
43 #include <asm/mc146818rtc.h>
44 #include <asm/spinlock.h>
45 #include <asm/hvm/hvm.h>
46 #include <asm/hvm/vpt.h>
47 #include <asm/hvm/support.h>
48 #include <asm/hvm/cacheattr.h>
49 #include <public/sched.h>
50 #include <public/hvm/ioreq.h>
51 #include <public/version.h>
52 #include <public/memory.h>
54 int hvm_enabled __read_mostly;
56 unsigned int opt_hvm_debug_level __read_mostly;
57 integer_param("hvm_debug", opt_hvm_debug_level);
59 struct hvm_function_table hvm_funcs __read_mostly;
61 /* I/O permission bitmap is globally shared by all HVM guests. */
62 unsigned long __attribute__ ((__section__ (".bss.page_aligned")))
63 hvm_io_bitmap[3*PAGE_SIZE/BYTES_PER_LONG];
65 void hvm_enable(struct hvm_function_table *fns)
66 {
67 BUG_ON(hvm_enabled);
68 printk("HVM: %s enabled\n", fns->name);
70 /*
71 * Allow direct access to the PC debug port (it is often used for I/O
72 * delays, but the vmexits simply slow things down).
73 */
74 memset(hvm_io_bitmap, ~0, sizeof(hvm_io_bitmap));
75 __clear_bit(0x80, hvm_io_bitmap);
77 hvm_funcs = *fns;
78 hvm_enabled = 1;
80 if ( hvm_funcs.hap_supported )
81 printk("HVM: Hardware Assisted Paging detected.\n");
82 }
84 /*
85 * Need to re-inject a given event? We avoid re-injecting software exceptions
86 * and interrupts because the faulting/trapping instruction can simply be
87 * re-executed (neither VMX nor SVM update RIP when they VMEXIT during
88 * INT3/INTO/INTn).
89 */
90 int hvm_event_needs_reinjection(uint8_t type, uint8_t vector)
91 {
92 switch ( type )
93 {
94 case X86_EVENTTYPE_EXT_INTR:
95 case X86_EVENTTYPE_NMI:
96 return 1;
97 case X86_EVENTTYPE_HW_EXCEPTION:
98 /*
99 * SVM uses type 3 ("HW Exception") for #OF and #BP. We explicitly
100 * check for these vectors, as they are really SW Exceptions. SVM has
101 * not updated RIP to point after the trapping instruction (INT3/INTO).
102 */
103 return (vector != 3) && (vector != 4);
104 default:
105 /* Software exceptions/interrupts can be re-executed (e.g., INT n). */
106 break;
107 }
108 return 0;
109 }
111 /*
112 * Combine two hardware exceptions: @vec2 was raised during delivery of @vec1.
113 * This means we can assume that @vec2 is contributory or a page fault.
114 */
115 uint8_t hvm_combine_hw_exceptions(uint8_t vec1, uint8_t vec2)
116 {
117 /* Exception during double-fault delivery always causes a triple fault. */
118 if ( vec1 == TRAP_double_fault )
119 {
120 hvm_triple_fault();
121 return TRAP_double_fault; /* dummy return */
122 }
124 /* Exception during page-fault delivery always causes a double fault. */
125 if ( vec1 == TRAP_page_fault )
126 return TRAP_double_fault;
128 /* Discard the first exception if it's benign or if we now have a #PF. */
129 if ( !((1u << vec1) & 0x7c01u) || (vec2 == TRAP_page_fault) )
130 return vec2;
132 /* Cannot combine the exceptions: double fault. */
133 return TRAP_double_fault;
134 }
136 void hvm_set_guest_tsc(struct vcpu *v, u64 guest_tsc)
137 {
138 u64 host_tsc;
140 rdtscll(host_tsc);
142 v->arch.hvm_vcpu.cache_tsc_offset = guest_tsc - host_tsc;
143 hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset);
144 }
146 u64 hvm_get_guest_tsc(struct vcpu *v)
147 {
148 u64 host_tsc;
150 rdtscll(host_tsc);
151 return host_tsc + v->arch.hvm_vcpu.cache_tsc_offset;
152 }
154 void hvm_migrate_timers(struct vcpu *v)
155 {
156 rtc_migrate_timers(v);
157 hpet_migrate_timers(v);
158 pt_migrate(v);
159 }
161 void hvm_do_resume(struct vcpu *v)
162 {
163 ioreq_t *p;
165 pt_restore_timer(v);
167 /* NB. Optimised for common case (p->state == STATE_IOREQ_NONE). */
168 p = &get_ioreq(v)->vp_ioreq;
169 while ( p->state != STATE_IOREQ_NONE )
170 {
171 switch ( p->state )
172 {
173 case STATE_IORESP_READY: /* IORESP_READY -> NONE */
174 hvm_io_assist();
175 break;
176 case STATE_IOREQ_READY: /* IOREQ_{READY,INPROCESS} -> IORESP_READY */
177 case STATE_IOREQ_INPROCESS:
178 wait_on_xen_event_channel(v->arch.hvm_vcpu.xen_port,
179 (p->state != STATE_IOREQ_READY) &&
180 (p->state != STATE_IOREQ_INPROCESS));
181 break;
182 default:
183 gdprintk(XENLOG_ERR, "Weird HVM iorequest state %d.\n", p->state);
184 domain_crash(v->domain);
185 return; /* bail */
186 }
187 }
188 }
190 static void hvm_init_ioreq_page(
191 struct domain *d, struct hvm_ioreq_page *iorp)
192 {
193 memset(iorp, 0, sizeof(*iorp));
194 spin_lock_init(&iorp->lock);
195 domain_pause(d);
196 }
198 static void hvm_destroy_ioreq_page(
199 struct domain *d, struct hvm_ioreq_page *iorp)
200 {
201 spin_lock(&iorp->lock);
203 ASSERT(d->is_dying);
205 if ( iorp->va != NULL )
206 {
207 unmap_domain_page_global(iorp->va);
208 put_page_and_type(iorp->page);
209 iorp->va = NULL;
210 }
212 spin_unlock(&iorp->lock);
213 }
215 static int hvm_set_ioreq_page(
216 struct domain *d, struct hvm_ioreq_page *iorp, unsigned long gmfn)
217 {
218 struct page_info *page;
219 p2m_type_t p2mt;
220 unsigned long mfn;
221 void *va;
223 mfn = mfn_x(gfn_to_mfn(d, gmfn, &p2mt));
224 if ( !p2m_is_ram(p2mt) )
225 return -EINVAL;
226 ASSERT(mfn_valid(mfn));
228 page = mfn_to_page(mfn);
229 if ( !get_page_and_type(page, d, PGT_writable_page) )
230 return -EINVAL;
232 va = map_domain_page_global(mfn);
233 if ( va == NULL )
234 {
235 put_page_and_type(page);
236 return -ENOMEM;
237 }
239 spin_lock(&iorp->lock);
241 if ( (iorp->va != NULL) || d->is_dying )
242 {
243 spin_unlock(&iorp->lock);
244 unmap_domain_page_global(va);
245 put_page_and_type(mfn_to_page(mfn));
246 return -EINVAL;
247 }
249 iorp->va = va;
250 iorp->page = page;
252 spin_unlock(&iorp->lock);
254 domain_unpause(d);
256 return 0;
257 }
259 static int hvm_print_line(
260 int dir, uint32_t port, uint32_t bytes, uint32_t *val)
261 {
262 struct vcpu *curr = current;
263 struct hvm_domain *hd = &curr->domain->arch.hvm_domain;
264 char c = *val;
266 BUG_ON(bytes != 1);
268 spin_lock(&hd->pbuf_lock);
269 hd->pbuf[hd->pbuf_idx++] = c;
270 if ( (hd->pbuf_idx == (sizeof(hd->pbuf) - 2)) || (c == '\n') )
271 {
272 if ( c != '\n' )
273 hd->pbuf[hd->pbuf_idx++] = '\n';
274 hd->pbuf[hd->pbuf_idx] = '\0';
275 printk(XENLOG_G_DEBUG "HVM%u: %s", curr->domain->domain_id, hd->pbuf);
276 hd->pbuf_idx = 0;
277 }
278 spin_unlock(&hd->pbuf_lock);
280 return X86EMUL_OKAY;
281 }
283 int hvm_domain_initialise(struct domain *d)
284 {
285 int rc;
287 if ( !hvm_enabled )
288 {
289 gdprintk(XENLOG_WARNING, "Attempt to create an HVM guest "
290 "on a non-VT/AMDV platform.\n");
291 return -EINVAL;
292 }
294 spin_lock_init(&d->arch.hvm_domain.pbuf_lock);
295 spin_lock_init(&d->arch.hvm_domain.irq_lock);
296 spin_lock_init(&d->arch.hvm_domain.uc_lock);
298 d->arch.hvm_domain.params[HVM_PARAM_HPET_ENABLED] = 1;
300 hvm_init_cacheattr_region_list(d);
302 rc = paging_enable(d, PG_refcounts|PG_translate|PG_external);
303 if ( rc != 0 )
304 goto fail1;
306 vpic_init(d);
308 rc = vioapic_init(d);
309 if ( rc != 0 )
310 goto fail1;
312 stdvga_init(d);
314 hvm_init_ioreq_page(d, &d->arch.hvm_domain.ioreq);
315 hvm_init_ioreq_page(d, &d->arch.hvm_domain.buf_ioreq);
317 register_portio_handler(d, 0xe9, 1, hvm_print_line);
319 rc = hvm_funcs.domain_initialise(d);
320 if ( rc != 0 )
321 goto fail2;
323 return 0;
325 fail2:
326 vioapic_deinit(d);
327 fail1:
328 hvm_destroy_cacheattr_region_list(d);
329 return rc;
330 }
332 void hvm_domain_relinquish_resources(struct domain *d)
333 {
334 hvm_destroy_ioreq_page(d, &d->arch.hvm_domain.ioreq);
335 hvm_destroy_ioreq_page(d, &d->arch.hvm_domain.buf_ioreq);
337 pit_deinit(d);
338 rtc_deinit(d);
339 pmtimer_deinit(d);
340 hpet_deinit(d);
341 stdvga_deinit(d);
342 }
344 void hvm_domain_destroy(struct domain *d)
345 {
346 hvm_funcs.domain_destroy(d);
347 vioapic_deinit(d);
348 hvm_destroy_cacheattr_region_list(d);
349 }
351 static int hvm_save_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
352 {
353 struct vcpu *v;
354 struct hvm_hw_cpu ctxt;
355 struct segment_register seg;
356 struct vcpu_guest_context *vc;
358 for_each_vcpu ( d, v )
359 {
360 /* We don't need to save state for a vcpu that is down; the restore
361 * code will leave it down if there is nothing saved. */
362 if ( test_bit(_VPF_down, &v->pause_flags) )
363 continue;
365 /* Architecture-specific vmcs/vmcb bits */
366 hvm_funcs.save_cpu_ctxt(v, &ctxt);
368 hvm_get_segment_register(v, x86_seg_idtr, &seg);
369 ctxt.idtr_limit = seg.limit;
370 ctxt.idtr_base = seg.base;
372 hvm_get_segment_register(v, x86_seg_gdtr, &seg);
373 ctxt.gdtr_limit = seg.limit;
374 ctxt.gdtr_base = seg.base;
376 hvm_get_segment_register(v, x86_seg_cs, &seg);
377 ctxt.cs_sel = seg.sel;
378 ctxt.cs_limit = seg.limit;
379 ctxt.cs_base = seg.base;
380 ctxt.cs_arbytes = seg.attr.bytes;
382 hvm_get_segment_register(v, x86_seg_ds, &seg);
383 ctxt.ds_sel = seg.sel;
384 ctxt.ds_limit = seg.limit;
385 ctxt.ds_base = seg.base;
386 ctxt.ds_arbytes = seg.attr.bytes;
388 hvm_get_segment_register(v, x86_seg_es, &seg);
389 ctxt.es_sel = seg.sel;
390 ctxt.es_limit = seg.limit;
391 ctxt.es_base = seg.base;
392 ctxt.es_arbytes = seg.attr.bytes;
394 hvm_get_segment_register(v, x86_seg_ss, &seg);
395 ctxt.ss_sel = seg.sel;
396 ctxt.ss_limit = seg.limit;
397 ctxt.ss_base = seg.base;
398 ctxt.ss_arbytes = seg.attr.bytes;
400 hvm_get_segment_register(v, x86_seg_fs, &seg);
401 ctxt.fs_sel = seg.sel;
402 ctxt.fs_limit = seg.limit;
403 ctxt.fs_base = seg.base;
404 ctxt.fs_arbytes = seg.attr.bytes;
406 hvm_get_segment_register(v, x86_seg_gs, &seg);
407 ctxt.gs_sel = seg.sel;
408 ctxt.gs_limit = seg.limit;
409 ctxt.gs_base = seg.base;
410 ctxt.gs_arbytes = seg.attr.bytes;
412 hvm_get_segment_register(v, x86_seg_tr, &seg);
413 ctxt.tr_sel = seg.sel;
414 ctxt.tr_limit = seg.limit;
415 ctxt.tr_base = seg.base;
416 ctxt.tr_arbytes = seg.attr.bytes;
418 hvm_get_segment_register(v, x86_seg_ldtr, &seg);
419 ctxt.ldtr_sel = seg.sel;
420 ctxt.ldtr_limit = seg.limit;
421 ctxt.ldtr_base = seg.base;
422 ctxt.ldtr_arbytes = seg.attr.bytes;
424 vc = &v->arch.guest_context;
426 if ( v->fpu_initialised )
427 memcpy(ctxt.fpu_regs, &vc->fpu_ctxt, sizeof(ctxt.fpu_regs));
428 else
429 memset(ctxt.fpu_regs, 0, sizeof(ctxt.fpu_regs));
431 ctxt.rax = vc->user_regs.eax;
432 ctxt.rbx = vc->user_regs.ebx;
433 ctxt.rcx = vc->user_regs.ecx;
434 ctxt.rdx = vc->user_regs.edx;
435 ctxt.rbp = vc->user_regs.ebp;
436 ctxt.rsi = vc->user_regs.esi;
437 ctxt.rdi = vc->user_regs.edi;
438 ctxt.rsp = vc->user_regs.esp;
439 ctxt.rip = vc->user_regs.eip;
440 ctxt.rflags = vc->user_regs.eflags;
441 #ifdef __x86_64__
442 ctxt.r8 = vc->user_regs.r8;
443 ctxt.r9 = vc->user_regs.r9;
444 ctxt.r10 = vc->user_regs.r10;
445 ctxt.r11 = vc->user_regs.r11;
446 ctxt.r12 = vc->user_regs.r12;
447 ctxt.r13 = vc->user_regs.r13;
448 ctxt.r14 = vc->user_regs.r14;
449 ctxt.r15 = vc->user_regs.r15;
450 #endif
451 ctxt.dr0 = vc->debugreg[0];
452 ctxt.dr1 = vc->debugreg[1];
453 ctxt.dr2 = vc->debugreg[2];
454 ctxt.dr3 = vc->debugreg[3];
455 ctxt.dr6 = vc->debugreg[6];
456 ctxt.dr7 = vc->debugreg[7];
458 if ( hvm_save_entry(CPU, v->vcpu_id, h, &ctxt) != 0 )
459 return 1;
460 }
461 return 0;
462 }
464 static int hvm_load_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
465 {
466 int vcpuid, rc;
467 struct vcpu *v;
468 struct hvm_hw_cpu ctxt;
469 struct segment_register seg;
470 struct vcpu_guest_context *vc;
472 /* Which vcpu is this? */
473 vcpuid = hvm_load_instance(h);
474 if ( vcpuid > MAX_VIRT_CPUS || (v = d->vcpu[vcpuid]) == NULL )
475 {
476 gdprintk(XENLOG_ERR, "HVM restore: domain has no vcpu %u\n", vcpuid);
477 return -EINVAL;
478 }
479 vc = &v->arch.guest_context;
481 /* Need to init this vcpu before loading its contents */
482 domain_lock(d);
483 if ( !v->is_initialised )
484 if ( (rc = boot_vcpu(d, vcpuid, vc)) != 0 )
485 return rc;
486 domain_unlock(d);
488 if ( hvm_load_entry(CPU, h, &ctxt) != 0 )
489 return -EINVAL;
491 /* Sanity check some control registers. */
492 if ( (ctxt.cr0 & HVM_CR0_GUEST_RESERVED_BITS) ||
493 !(ctxt.cr0 & X86_CR0_ET) ||
494 ((ctxt.cr0 & (X86_CR0_PE|X86_CR0_PG)) == X86_CR0_PG) )
495 {
496 gdprintk(XENLOG_ERR, "HVM restore: bad CR0 0x%"PRIx64"\n",
497 ctxt.cr0);
498 return -EINVAL;
499 }
501 if ( ctxt.cr4 & HVM_CR4_GUEST_RESERVED_BITS )
502 {
503 gdprintk(XENLOG_ERR, "HVM restore: bad CR4 0x%"PRIx64"\n",
504 ctxt.cr4);
505 return -EINVAL;
506 }
508 if ( (ctxt.msr_efer & ~(EFER_FFXSE | EFER_LME | EFER_LMA |
509 EFER_NX | EFER_SCE)) ||
510 ((sizeof(long) != 8) && (ctxt.msr_efer & EFER_LME)) ||
511 (!cpu_has_nx && (ctxt.msr_efer & EFER_NX)) ||
512 (!cpu_has_syscall && (ctxt.msr_efer & EFER_SCE)) ||
513 (!cpu_has_ffxsr && (ctxt.msr_efer & EFER_FFXSE)) ||
514 ((ctxt.msr_efer & (EFER_LME|EFER_LMA)) == EFER_LMA) )
515 {
516 gdprintk(XENLOG_ERR, "HVM restore: bad EFER 0x%"PRIx64"\n",
517 ctxt.msr_efer);
518 return -EINVAL;
519 }
521 /* Architecture-specific vmcs/vmcb bits */
522 if ( hvm_funcs.load_cpu_ctxt(v, &ctxt) < 0 )
523 return -EINVAL;
525 seg.limit = ctxt.idtr_limit;
526 seg.base = ctxt.idtr_base;
527 hvm_set_segment_register(v, x86_seg_idtr, &seg);
529 seg.limit = ctxt.gdtr_limit;
530 seg.base = ctxt.gdtr_base;
531 hvm_set_segment_register(v, x86_seg_gdtr, &seg);
533 seg.sel = ctxt.cs_sel;
534 seg.limit = ctxt.cs_limit;
535 seg.base = ctxt.cs_base;
536 seg.attr.bytes = ctxt.cs_arbytes;
537 hvm_set_segment_register(v, x86_seg_cs, &seg);
539 seg.sel = ctxt.ds_sel;
540 seg.limit = ctxt.ds_limit;
541 seg.base = ctxt.ds_base;
542 seg.attr.bytes = ctxt.ds_arbytes;
543 hvm_set_segment_register(v, x86_seg_ds, &seg);
545 seg.sel = ctxt.es_sel;
546 seg.limit = ctxt.es_limit;
547 seg.base = ctxt.es_base;
548 seg.attr.bytes = ctxt.es_arbytes;
549 hvm_set_segment_register(v, x86_seg_es, &seg);
551 seg.sel = ctxt.ss_sel;
552 seg.limit = ctxt.ss_limit;
553 seg.base = ctxt.ss_base;
554 seg.attr.bytes = ctxt.ss_arbytes;
555 hvm_set_segment_register(v, x86_seg_ss, &seg);
557 seg.sel = ctxt.fs_sel;
558 seg.limit = ctxt.fs_limit;
559 seg.base = ctxt.fs_base;
560 seg.attr.bytes = ctxt.fs_arbytes;
561 hvm_set_segment_register(v, x86_seg_fs, &seg);
563 seg.sel = ctxt.gs_sel;
564 seg.limit = ctxt.gs_limit;
565 seg.base = ctxt.gs_base;
566 seg.attr.bytes = ctxt.gs_arbytes;
567 hvm_set_segment_register(v, x86_seg_gs, &seg);
569 seg.sel = ctxt.tr_sel;
570 seg.limit = ctxt.tr_limit;
571 seg.base = ctxt.tr_base;
572 seg.attr.bytes = ctxt.tr_arbytes;
573 hvm_set_segment_register(v, x86_seg_tr, &seg);
575 seg.sel = ctxt.ldtr_sel;
576 seg.limit = ctxt.ldtr_limit;
577 seg.base = ctxt.ldtr_base;
578 seg.attr.bytes = ctxt.ldtr_arbytes;
579 hvm_set_segment_register(v, x86_seg_ldtr, &seg);
581 memcpy(&vc->fpu_ctxt, ctxt.fpu_regs, sizeof(ctxt.fpu_regs));
583 vc->user_regs.eax = ctxt.rax;
584 vc->user_regs.ebx = ctxt.rbx;
585 vc->user_regs.ecx = ctxt.rcx;
586 vc->user_regs.edx = ctxt.rdx;
587 vc->user_regs.ebp = ctxt.rbp;
588 vc->user_regs.esi = ctxt.rsi;
589 vc->user_regs.edi = ctxt.rdi;
590 vc->user_regs.esp = ctxt.rsp;
591 vc->user_regs.eip = ctxt.rip;
592 vc->user_regs.eflags = ctxt.rflags | 2;
593 #ifdef __x86_64__
594 vc->user_regs.r8 = ctxt.r8;
595 vc->user_regs.r9 = ctxt.r9;
596 vc->user_regs.r10 = ctxt.r10;
597 vc->user_regs.r11 = ctxt.r11;
598 vc->user_regs.r12 = ctxt.r12;
599 vc->user_regs.r13 = ctxt.r13;
600 vc->user_regs.r14 = ctxt.r14;
601 vc->user_regs.r15 = ctxt.r15;
602 #endif
603 vc->debugreg[0] = ctxt.dr0;
604 vc->debugreg[1] = ctxt.dr1;
605 vc->debugreg[2] = ctxt.dr2;
606 vc->debugreg[3] = ctxt.dr3;
607 vc->debugreg[6] = ctxt.dr6;
608 vc->debugreg[7] = ctxt.dr7;
610 vc->flags = VGCF_online;
611 v->fpu_initialised = 1;
613 /* Auxiliary processors should be woken immediately. */
614 if ( test_and_clear_bit(_VPF_down, &v->pause_flags) )
615 vcpu_wake(v);
617 return 0;
618 }
620 HVM_REGISTER_SAVE_RESTORE(CPU, hvm_save_cpu_ctxt, hvm_load_cpu_ctxt,
621 1, HVMSR_PER_VCPU);
623 int hvm_vcpu_initialise(struct vcpu *v)
624 {
625 int rc;
627 if ( (rc = vlapic_init(v)) != 0 )
628 goto fail1;
630 if ( (rc = hvm_funcs.vcpu_initialise(v)) != 0 )
631 goto fail2;
633 /* Create ioreq event channel. */
634 rc = alloc_unbound_xen_event_channel(v, 0);
635 if ( rc < 0 )
636 goto fail3;
638 /* Register ioreq event channel. */
639 v->arch.hvm_vcpu.xen_port = rc;
640 spin_lock(&v->domain->arch.hvm_domain.ioreq.lock);
641 if ( v->domain->arch.hvm_domain.ioreq.va != NULL )
642 get_ioreq(v)->vp_eport = v->arch.hvm_vcpu.xen_port;
643 spin_unlock(&v->domain->arch.hvm_domain.ioreq.lock);
645 spin_lock_init(&v->arch.hvm_vcpu.tm_lock);
646 INIT_LIST_HEAD(&v->arch.hvm_vcpu.tm_list);
648 rc = hvm_vcpu_cacheattr_init(v);
649 if ( rc != 0 )
650 goto fail3;
652 v->arch.guest_context.user_regs.eflags = 2;
654 if ( v->vcpu_id == 0 )
655 {
656 /* NB. All these really belong in hvm_domain_initialise(). */
657 pit_init(v, cpu_khz);
658 rtc_init(v, RTC_PORT(0));
659 pmtimer_init(v);
660 hpet_init(v);
662 /* Init guest TSC to start from zero. */
663 hvm_set_guest_time(v, 0);
665 /* Can start up without SIPI-SIPI or setvcpucontext domctl. */
666 v->is_initialised = 1;
667 clear_bit(_VPF_down, &v->pause_flags);
668 }
670 return 0;
672 fail3:
673 hvm_funcs.vcpu_destroy(v);
674 fail2:
675 vlapic_destroy(v);
676 fail1:
677 return rc;
678 }
680 void hvm_vcpu_destroy(struct vcpu *v)
681 {
682 hvm_vcpu_cacheattr_destroy(v);
683 vlapic_destroy(v);
684 hvm_funcs.vcpu_destroy(v);
686 /* Event channel is already freed by evtchn_destroy(). */
687 /*free_xen_event_channel(v, v->arch.hvm_vcpu.xen_port);*/
688 }
690 void hvm_vcpu_down(struct vcpu *v)
691 {
692 struct domain *d = v->domain;
693 int online_count = 0;
695 /* Doesn't halt us immediately, but we'll never return to guest context. */
696 set_bit(_VPF_down, &v->pause_flags);
697 vcpu_sleep_nosync(v);
699 /* Any other VCPUs online? ... */
700 domain_lock(d);
701 for_each_vcpu ( d, v )
702 if ( !test_bit(_VPF_down, &v->pause_flags) )
703 online_count++;
704 domain_unlock(d);
706 /* ... Shut down the domain if not. */
707 if ( online_count == 0 )
708 {
709 gdprintk(XENLOG_INFO, "All CPUs offline -- powering off.\n");
710 domain_shutdown(d, SHUTDOWN_poweroff);
711 }
712 }
714 void hvm_send_assist_req(struct vcpu *v)
715 {
716 ioreq_t *p;
718 if ( unlikely(!vcpu_start_shutdown_deferral(v)) )
719 return; /* implicitly bins the i/o operation */
721 p = &get_ioreq(v)->vp_ioreq;
722 if ( unlikely(p->state != STATE_IOREQ_NONE) )
723 {
724 /* This indicates a bug in the device model. Crash the domain. */
725 gdprintk(XENLOG_ERR, "Device model set bad IO state %d.\n", p->state);
726 domain_crash(v->domain);
727 return;
728 }
730 prepare_wait_on_xen_event_channel(v->arch.hvm_vcpu.xen_port);
732 /*
733 * Following happens /after/ blocking and setting up ioreq contents.
734 * prepare_wait_on_xen_event_channel() is an implicit barrier.
735 */
736 p->state = STATE_IOREQ_READY;
737 notify_via_xen_event_channel(v->arch.hvm_vcpu.xen_port);
738 }
740 void hvm_hlt(unsigned long rflags)
741 {
742 /*
743 * If we halt with interrupts disabled, that's a pretty sure sign that we
744 * want to shut down. In a real processor, NMIs are the only way to break
745 * out of this.
746 */
747 if ( unlikely(!(rflags & X86_EFLAGS_IF)) )
748 return hvm_vcpu_down(current);
750 do_sched_op_compat(SCHEDOP_block, 0);
751 }
753 void hvm_triple_fault(void)
754 {
755 struct vcpu *v = current;
756 gdprintk(XENLOG_INFO, "Triple fault on VCPU%d - "
757 "invoking HVM system reset.\n", v->vcpu_id);
758 domain_shutdown(v->domain, SHUTDOWN_reboot);
759 }
761 int hvm_set_efer(uint64_t value)
762 {
763 struct vcpu *v = current;
765 value &= ~EFER_LMA;
767 if ( (value & ~(EFER_FFXSE | EFER_LME | EFER_NX | EFER_SCE)) ||
768 ((sizeof(long) != 8) && (value & EFER_LME)) ||
769 (!cpu_has_nx && (value & EFER_NX)) ||
770 (!cpu_has_syscall && (value & EFER_SCE)) ||
771 (!cpu_has_ffxsr && (value & EFER_FFXSE)) )
772 {
773 gdprintk(XENLOG_WARNING, "Trying to set reserved bit in "
774 "EFER: %"PRIx64"\n", value);
775 hvm_inject_exception(TRAP_gp_fault, 0, 0);
776 return X86EMUL_EXCEPTION;
777 }
779 if ( ((value ^ v->arch.hvm_vcpu.guest_efer) & EFER_LME) &&
780 hvm_paging_enabled(v) )
781 {
782 gdprintk(XENLOG_WARNING,
783 "Trying to change EFER.LME with paging enabled\n");
784 hvm_inject_exception(TRAP_gp_fault, 0, 0);
785 return X86EMUL_EXCEPTION;
786 }
788 value |= v->arch.hvm_vcpu.guest_efer & EFER_LMA;
789 v->arch.hvm_vcpu.guest_efer = value;
790 hvm_update_guest_efer(v);
792 return X86EMUL_OKAY;
793 }
795 extern void shadow_blow_tables_per_domain(struct domain *d);
796 extern bool_t mtrr_pat_not_equal(struct vcpu *vd, struct vcpu *vs);
798 /* Exit UC mode only if all VCPUs agree on MTRR/PAT and are not in no_fill. */
799 static bool_t domain_exit_uc_mode(struct vcpu *v)
800 {
801 struct domain *d = v->domain;
802 struct vcpu *vs;
804 for_each_vcpu ( d, vs )
805 {
806 if ( (vs == v) || !vs->is_initialised )
807 continue;
808 if ( (vs->arch.hvm_vcpu.cache_mode == NO_FILL_CACHE_MODE) ||
809 mtrr_pat_not_equal(vs, v) )
810 return 0;
811 }
813 return 1;
814 }
816 static void local_flush_cache(void *info)
817 {
818 wbinvd();
819 }
821 int hvm_set_cr0(unsigned long value)
822 {
823 struct vcpu *v = current;
824 p2m_type_t p2mt;
825 unsigned long gfn, mfn, old_value = v->arch.hvm_vcpu.guest_cr[0];
827 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx", value);
829 if ( (u32)value != value )
830 {
831 HVM_DBG_LOG(DBG_LEVEL_1,
832 "Guest attempts to set upper 32 bits in CR0: %lx",
833 value);
834 goto gpf;
835 }
837 value &= ~HVM_CR0_GUEST_RESERVED_BITS;
839 /* ET is reserved and should always be 1. */
840 value |= X86_CR0_ET;
842 if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PG )
843 goto gpf;
845 if ( (value & X86_CR0_PG) && !(old_value & X86_CR0_PG) )
846 {
847 if ( v->arch.hvm_vcpu.guest_efer & EFER_LME )
848 {
849 if ( !(v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PAE) )
850 {
851 HVM_DBG_LOG(DBG_LEVEL_1, "Enable paging before PAE enable");
852 goto gpf;
853 }
854 HVM_DBG_LOG(DBG_LEVEL_1, "Enabling long mode");
855 v->arch.hvm_vcpu.guest_efer |= EFER_LMA;
856 hvm_update_guest_efer(v);
857 }
859 if ( !paging_mode_hap(v->domain) )
860 {
861 /* The guest CR3 must be pointing to the guest physical. */
862 gfn = v->arch.hvm_vcpu.guest_cr[3]>>PAGE_SHIFT;
863 mfn = mfn_x(gfn_to_mfn_current(gfn, &p2mt));
864 if ( !p2m_is_ram(p2mt) || !mfn_valid(mfn) ||
865 !get_page(mfn_to_page(mfn), v->domain))
866 {
867 gdprintk(XENLOG_ERR, "Invalid CR3 value = %lx (mfn=%lx)\n",
868 v->arch.hvm_vcpu.guest_cr[3], mfn);
869 domain_crash(v->domain);
870 return X86EMUL_UNHANDLEABLE;
871 }
873 /* Now arch.guest_table points to machine physical. */
874 v->arch.guest_table = pagetable_from_pfn(mfn);
876 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
877 v->arch.hvm_vcpu.guest_cr[3], mfn);
878 }
879 }
880 else if ( !(value & X86_CR0_PG) && (old_value & X86_CR0_PG) )
881 {
882 /* When CR0.PG is cleared, LMA is cleared immediately. */
883 if ( hvm_long_mode_enabled(v) )
884 {
885 v->arch.hvm_vcpu.guest_efer &= ~EFER_LMA;
886 hvm_update_guest_efer(v);
887 }
889 if ( !paging_mode_hap(v->domain) )
890 {
891 put_page(pagetable_get_page(v->arch.guest_table));
892 v->arch.guest_table = pagetable_null();
893 }
894 }
896 if ( !list_empty(&domain_hvm_iommu(v->domain)->pdev_list) )
897 {
898 if ( (value & X86_CR0_CD) && !(value & X86_CR0_NW) )
899 {
900 /* Entering no fill cache mode. */
901 spin_lock(&v->domain->arch.hvm_domain.uc_lock);
902 v->arch.hvm_vcpu.cache_mode = NO_FILL_CACHE_MODE;
904 if ( !v->domain->arch.hvm_domain.is_in_uc_mode )
905 {
906 /* Flush physical caches. */
907 on_each_cpu(local_flush_cache, NULL, 1, 1);
908 /* Shadow pagetables must recognise UC mode. */
909 v->domain->arch.hvm_domain.is_in_uc_mode = 1;
910 shadow_blow_tables_per_domain(v->domain);
911 }
912 spin_unlock(&v->domain->arch.hvm_domain.uc_lock);
913 }
914 else if ( !(value & (X86_CR0_CD | X86_CR0_NW)) &&
915 (v->arch.hvm_vcpu.cache_mode == NO_FILL_CACHE_MODE) )
916 {
917 /* Exit from no fill cache mode. */
918 spin_lock(&v->domain->arch.hvm_domain.uc_lock);
919 v->arch.hvm_vcpu.cache_mode = NORMAL_CACHE_MODE;
921 if ( domain_exit_uc_mode(v) )
922 {
923 /* Shadow pagetables must recognise normal caching mode. */
924 v->domain->arch.hvm_domain.is_in_uc_mode = 0;
925 shadow_blow_tables_per_domain(v->domain);
926 }
927 spin_unlock(&v->domain->arch.hvm_domain.uc_lock);
928 }
929 }
931 v->arch.hvm_vcpu.guest_cr[0] = value;
932 hvm_update_guest_cr(v, 0);
934 if ( (value ^ old_value) & X86_CR0_PG )
935 paging_update_paging_modes(v);
937 return X86EMUL_OKAY;
939 gpf:
940 hvm_inject_exception(TRAP_gp_fault, 0, 0);
941 return X86EMUL_EXCEPTION;
942 }
944 int hvm_set_cr3(unsigned long value)
945 {
946 unsigned long mfn;
947 p2m_type_t p2mt;
948 struct vcpu *v = current;
950 if ( hvm_paging_enabled(v) && !paging_mode_hap(v->domain) &&
951 (value != v->arch.hvm_vcpu.guest_cr[3]) )
952 {
953 /* Shadow-mode CR3 change. Check PDBR and update refcounts. */
954 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
955 mfn = mfn_x(gfn_to_mfn_current(value >> PAGE_SHIFT, &p2mt));
956 if ( !p2m_is_ram(p2mt) || !mfn_valid(mfn) ||
957 !get_page(mfn_to_page(mfn), v->domain) )
958 goto bad_cr3;
960 put_page(pagetable_get_page(v->arch.guest_table));
961 v->arch.guest_table = pagetable_from_pfn(mfn);
963 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
964 }
966 v->arch.hvm_vcpu.guest_cr[3] = value;
967 paging_update_cr3(v);
968 return X86EMUL_OKAY;
970 bad_cr3:
971 gdprintk(XENLOG_ERR, "Invalid CR3\n");
972 domain_crash(v->domain);
973 return X86EMUL_UNHANDLEABLE;
974 }
976 int hvm_set_cr4(unsigned long value)
977 {
978 struct vcpu *v = current;
979 unsigned long old_cr;
981 if ( value & HVM_CR4_GUEST_RESERVED_BITS )
982 {
983 HVM_DBG_LOG(DBG_LEVEL_1,
984 "Guest attempts to set reserved bit in CR4: %lx",
985 value);
986 goto gpf;
987 }
989 if ( !(value & X86_CR4_PAE) && hvm_long_mode_enabled(v) )
990 {
991 HVM_DBG_LOG(DBG_LEVEL_1, "Guest cleared CR4.PAE while "
992 "EFER.LMA is set");
993 goto gpf;
994 }
996 old_cr = v->arch.hvm_vcpu.guest_cr[4];
997 v->arch.hvm_vcpu.guest_cr[4] = value;
998 hvm_update_guest_cr(v, 4);
1000 /* Modifying CR4.{PSE,PAE,PGE} invalidates all TLB entries, inc. Global. */
1001 if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
1002 paging_update_paging_modes(v);
1004 return X86EMUL_OKAY;
1006 gpf:
1007 hvm_inject_exception(TRAP_gp_fault, 0, 0);
1008 return X86EMUL_EXCEPTION;
1011 int hvm_virtual_to_linear_addr(
1012 enum x86_segment seg,
1013 struct segment_register *reg,
1014 unsigned long offset,
1015 unsigned int bytes,
1016 enum hvm_access_type access_type,
1017 unsigned int addr_size,
1018 unsigned long *linear_addr)
1020 unsigned long addr = offset;
1021 uint32_t last_byte;
1023 if ( !(current->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) )
1025 /*
1026 * REAL MODE: Don't bother with segment access checks.
1027 * Certain of them are not done in native real mode anyway.
1028 */
1029 addr = (uint32_t)(addr + reg->base);
1031 else if ( addr_size != 64 )
1033 /*
1034 * COMPATIBILITY MODE: Apply segment checks and add base.
1035 */
1037 switch ( access_type )
1039 case hvm_access_read:
1040 if ( (reg->attr.fields.type & 0xa) == 0x8 )
1041 goto gpf; /* execute-only code segment */
1042 break;
1043 case hvm_access_write:
1044 if ( (reg->attr.fields.type & 0xa) != 0x2 )
1045 goto gpf; /* not a writable data segment */
1046 break;
1047 default:
1048 break;
1051 last_byte = offset + bytes - 1;
1053 /* Is this a grows-down data segment? Special limit check if so. */
1054 if ( (reg->attr.fields.type & 0xc) == 0x4 )
1056 /* Is upper limit 0xFFFF or 0xFFFFFFFF? */
1057 if ( !reg->attr.fields.db )
1058 last_byte = (uint16_t)last_byte;
1060 /* Check first byte and last byte against respective bounds. */
1061 if ( (offset <= reg->limit) || (last_byte < offset) )
1062 goto gpf;
1064 else if ( (last_byte > reg->limit) || (last_byte < offset) )
1065 goto gpf; /* last byte is beyond limit or wraps 0xFFFFFFFF */
1067 /*
1068 * Hardware truncates to 32 bits in compatibility mode.
1069 * It does not truncate to 16 bits in 16-bit address-size mode.
1070 */
1071 addr = (uint32_t)(addr + reg->base);
1073 else
1075 /*
1076 * LONG MODE: FS and GS add segment base. Addresses must be canonical.
1077 */
1079 if ( (seg == x86_seg_fs) || (seg == x86_seg_gs) )
1080 addr += reg->base;
1082 if ( !is_canonical_address(addr) )
1083 goto gpf;
1086 *linear_addr = addr;
1087 return 1;
1089 gpf:
1090 return 0;
1093 static void *hvm_map(unsigned long va, int size)
1095 unsigned long gfn, mfn;
1096 p2m_type_t p2mt;
1097 uint32_t pfec;
1099 if ( ((va & ~PAGE_MASK) + size) > PAGE_SIZE )
1101 hvm_inject_exception(TRAP_page_fault, PFEC_write_access,
1102 (va + PAGE_SIZE - 1) & PAGE_MASK);
1103 return NULL;
1106 /* We're mapping on behalf of the segment-load logic, which might
1107 * write the accessed flags in the descriptors (in 32-bit mode), but
1108 * we still treat it as a kernel-mode read (i.e. no access checks). */
1109 pfec = PFEC_page_present;
1110 gfn = paging_gva_to_gfn(current, va, &pfec);
1111 mfn = mfn_x(gfn_to_mfn_current(gfn, &p2mt));
1112 if ( !p2m_is_ram(p2mt) )
1114 hvm_inject_exception(TRAP_page_fault, pfec, va);
1115 return NULL;
1118 ASSERT(mfn_valid(mfn));
1120 paging_mark_dirty(current->domain, mfn);
1122 return (char *)map_domain_page(mfn) + (va & ~PAGE_MASK);
1125 static void hvm_unmap(void *p)
1127 if ( p )
1128 unmap_domain_page(p);
1131 static int hvm_load_segment_selector(
1132 struct vcpu *v, enum x86_segment seg, uint16_t sel)
1134 struct segment_register desctab, cs, segr;
1135 struct desc_struct *pdesc, desc;
1136 u8 dpl, rpl, cpl;
1137 int fault_type = TRAP_invalid_tss;
1139 /* NULL selector? */
1140 if ( (sel & 0xfffc) == 0 )
1142 if ( (seg == x86_seg_cs) || (seg == x86_seg_ss) )
1143 goto fail;
1144 memset(&segr, 0, sizeof(segr));
1145 hvm_set_segment_register(v, seg, &segr);
1146 return 0;
1149 /* LDT descriptor must be in the GDT. */
1150 if ( (seg == x86_seg_ldtr) && (sel & 4) )
1151 goto fail;
1153 hvm_get_segment_register(v, x86_seg_cs, &cs);
1154 hvm_get_segment_register(
1155 v, (sel & 4) ? x86_seg_ldtr : x86_seg_gdtr, &desctab);
1157 /* Check against descriptor table limit. */
1158 if ( ((sel & 0xfff8) + 7) > desctab.limit )
1159 goto fail;
1161 pdesc = hvm_map(desctab.base + (sel & 0xfff8), 8);
1162 if ( pdesc == NULL )
1163 goto hvm_map_fail;
1165 do {
1166 desc = *pdesc;
1168 /* Segment present in memory? */
1169 if ( !(desc.b & (1u<<15)) )
1171 fault_type = TRAP_no_segment;
1172 goto unmap_and_fail;
1175 /* LDT descriptor is a system segment. All others are code/data. */
1176 if ( (desc.b & (1u<<12)) == ((seg == x86_seg_ldtr) << 12) )
1177 goto unmap_and_fail;
1179 dpl = (desc.b >> 13) & 3;
1180 rpl = sel & 3;
1181 cpl = cs.sel & 3;
1183 switch ( seg )
1185 case x86_seg_cs:
1186 /* Code segment? */
1187 if ( !(desc.b & (1u<<11)) )
1188 goto unmap_and_fail;
1189 /* Non-conforming segment: check DPL against RPL. */
1190 if ( ((desc.b & (6u<<9)) != 6) && (dpl != rpl) )
1191 goto unmap_and_fail;
1192 break;
1193 case x86_seg_ss:
1194 /* Writable data segment? */
1195 if ( (desc.b & (5u<<9)) != (1u<<9) )
1196 goto unmap_and_fail;
1197 if ( (dpl != cpl) || (dpl != rpl) )
1198 goto unmap_and_fail;
1199 break;
1200 case x86_seg_ldtr:
1201 /* LDT system segment? */
1202 if ( (desc.b & (15u<<8)) != (2u<<8) )
1203 goto unmap_and_fail;
1204 goto skip_accessed_flag;
1205 default:
1206 /* Readable code or data segment? */
1207 if ( (desc.b & (5u<<9)) == (4u<<9) )
1208 goto unmap_and_fail;
1209 /* Non-conforming segment: check DPL against RPL and CPL. */
1210 if ( ((desc.b & (6u<<9)) != 6) && ((dpl < cpl) || (dpl < rpl)) )
1211 goto unmap_and_fail;
1212 break;
1214 } while ( !(desc.b & 0x100) && /* Ensure Accessed flag is set */
1215 (cmpxchg(&pdesc->b, desc.b, desc.b | 0x100) != desc.b) );
1217 /* Force the Accessed flag in our local copy. */
1218 desc.b |= 0x100;
1220 skip_accessed_flag:
1221 hvm_unmap(pdesc);
1223 segr.base = (((desc.b << 0) & 0xff000000u) |
1224 ((desc.b << 16) & 0x00ff0000u) |
1225 ((desc.a >> 16) & 0x0000ffffu));
1226 segr.attr.bytes = (((desc.b >> 8) & 0x00ffu) |
1227 ((desc.b >> 12) & 0x0f00u));
1228 segr.limit = (desc.b & 0x000f0000u) | (desc.a & 0x0000ffffu);
1229 if ( segr.attr.fields.g )
1230 segr.limit = (segr.limit << 12) | 0xfffu;
1231 segr.sel = sel;
1232 hvm_set_segment_register(v, seg, &segr);
1234 return 0;
1236 unmap_and_fail:
1237 hvm_unmap(pdesc);
1238 fail:
1239 hvm_inject_exception(fault_type, sel & 0xfffc, 0);
1240 hvm_map_fail:
1241 return 1;
1244 void hvm_task_switch(
1245 uint16_t tss_sel, enum hvm_task_switch_reason taskswitch_reason,
1246 int32_t errcode)
1248 struct vcpu *v = current;
1249 struct cpu_user_regs *regs = guest_cpu_user_regs();
1250 struct segment_register gdt, tr, prev_tr, segr;
1251 struct desc_struct *optss_desc = NULL, *nptss_desc = NULL, tss_desc;
1252 unsigned long eflags;
1253 int exn_raised;
1254 struct {
1255 u16 back_link,__blh;
1256 u32 esp0;
1257 u16 ss0, _0;
1258 u32 esp1;
1259 u16 ss1, _1;
1260 u32 esp2;
1261 u16 ss2, _2;
1262 u32 cr3, eip, eflags, eax, ecx, edx, ebx, esp, ebp, esi, edi;
1263 u16 es, _3, cs, _4, ss, _5, ds, _6, fs, _7, gs, _8, ldt, _9;
1264 u16 trace, iomap;
1265 } *ptss, tss;
1267 hvm_get_segment_register(v, x86_seg_gdtr, &gdt);
1268 hvm_get_segment_register(v, x86_seg_tr, &prev_tr);
1270 if ( ((tss_sel & 0xfff8) + 7) > gdt.limit )
1272 hvm_inject_exception((taskswitch_reason == TSW_iret) ?
1273 TRAP_invalid_tss : TRAP_gp_fault,
1274 tss_sel & 0xfff8, 0);
1275 goto out;
1278 optss_desc = hvm_map(gdt.base + (prev_tr.sel & 0xfff8), 8);
1279 if ( optss_desc == NULL )
1280 goto out;
1282 nptss_desc = hvm_map(gdt.base + (tss_sel & 0xfff8), 8);
1283 if ( nptss_desc == NULL )
1284 goto out;
1286 tss_desc = *nptss_desc;
1287 tr.sel = tss_sel;
1288 tr.base = (((tss_desc.b << 0) & 0xff000000u) |
1289 ((tss_desc.b << 16) & 0x00ff0000u) |
1290 ((tss_desc.a >> 16) & 0x0000ffffu));
1291 tr.attr.bytes = (((tss_desc.b >> 8) & 0x00ffu) |
1292 ((tss_desc.b >> 12) & 0x0f00u));
1293 tr.limit = (tss_desc.b & 0x000f0000u) | (tss_desc.a & 0x0000ffffu);
1294 if ( tr.attr.fields.g )
1295 tr.limit = (tr.limit << 12) | 0xfffu;
1297 if ( !tr.attr.fields.p )
1299 hvm_inject_exception(TRAP_no_segment, tss_sel & 0xfff8, 0);
1300 goto out;
1303 if ( tr.attr.fields.type != ((taskswitch_reason == TSW_iret) ? 0xb : 0x9) )
1305 hvm_inject_exception(
1306 (taskswitch_reason == TSW_iret) ? TRAP_invalid_tss : TRAP_gp_fault,
1307 tss_sel & 0xfff8, 0);
1308 goto out;
1311 if ( tr.limit < (sizeof(tss)-1) )
1313 hvm_inject_exception(TRAP_invalid_tss, tss_sel & 0xfff8, 0);
1314 goto out;
1317 ptss = hvm_map(prev_tr.base, sizeof(tss));
1318 if ( ptss == NULL )
1319 goto out;
1321 eflags = regs->eflags;
1322 if ( taskswitch_reason == TSW_iret )
1323 eflags &= ~X86_EFLAGS_NT;
1325 ptss->cr3 = v->arch.hvm_vcpu.guest_cr[3];
1326 ptss->eip = regs->eip;
1327 ptss->eflags = eflags;
1328 ptss->eax = regs->eax;
1329 ptss->ecx = regs->ecx;
1330 ptss->edx = regs->edx;
1331 ptss->ebx = regs->ebx;
1332 ptss->esp = regs->esp;
1333 ptss->ebp = regs->ebp;
1334 ptss->esi = regs->esi;
1335 ptss->edi = regs->edi;
1337 hvm_get_segment_register(v, x86_seg_es, &segr);
1338 ptss->es = segr.sel;
1339 hvm_get_segment_register(v, x86_seg_cs, &segr);
1340 ptss->cs = segr.sel;
1341 hvm_get_segment_register(v, x86_seg_ss, &segr);
1342 ptss->ss = segr.sel;
1343 hvm_get_segment_register(v, x86_seg_ds, &segr);
1344 ptss->ds = segr.sel;
1345 hvm_get_segment_register(v, x86_seg_fs, &segr);
1346 ptss->fs = segr.sel;
1347 hvm_get_segment_register(v, x86_seg_gs, &segr);
1348 ptss->gs = segr.sel;
1349 hvm_get_segment_register(v, x86_seg_ldtr, &segr);
1350 ptss->ldt = segr.sel;
1352 hvm_unmap(ptss);
1354 ptss = hvm_map(tr.base, sizeof(tss));
1355 if ( ptss == NULL )
1356 goto out;
1358 if ( hvm_set_cr3(ptss->cr3) )
1360 hvm_unmap(ptss);
1361 goto out;
1364 regs->eip = ptss->eip;
1365 regs->eflags = ptss->eflags | 2;
1366 regs->eax = ptss->eax;
1367 regs->ecx = ptss->ecx;
1368 regs->edx = ptss->edx;
1369 regs->ebx = ptss->ebx;
1370 regs->esp = ptss->esp;
1371 regs->ebp = ptss->ebp;
1372 regs->esi = ptss->esi;
1373 regs->edi = ptss->edi;
1375 if ( (taskswitch_reason == TSW_call_or_int) )
1377 regs->eflags |= X86_EFLAGS_NT;
1378 ptss->back_link = prev_tr.sel;
1381 exn_raised = 0;
1382 if ( hvm_load_segment_selector(v, x86_seg_es, ptss->es) ||
1383 hvm_load_segment_selector(v, x86_seg_cs, ptss->cs) ||
1384 hvm_load_segment_selector(v, x86_seg_ss, ptss->ss) ||
1385 hvm_load_segment_selector(v, x86_seg_ds, ptss->ds) ||
1386 hvm_load_segment_selector(v, x86_seg_fs, ptss->fs) ||
1387 hvm_load_segment_selector(v, x86_seg_gs, ptss->gs) ||
1388 hvm_load_segment_selector(v, x86_seg_ldtr, ptss->ldt) )
1389 exn_raised = 1;
1391 if ( (ptss->trace & 1) && !exn_raised )
1392 hvm_inject_exception(TRAP_debug, tss_sel & 0xfff8, 0);
1394 hvm_unmap(ptss);
1396 tr.attr.fields.type = 0xb; /* busy 32-bit tss */
1397 hvm_set_segment_register(v, x86_seg_tr, &tr);
1399 v->arch.hvm_vcpu.guest_cr[0] |= X86_CR0_TS;
1400 hvm_update_guest_cr(v, 0);
1402 if ( (taskswitch_reason == TSW_iret) ||
1403 (taskswitch_reason == TSW_jmp) )
1404 clear_bit(41, optss_desc); /* clear B flag of old task */
1406 if ( taskswitch_reason != TSW_iret )
1407 set_bit(41, nptss_desc); /* set B flag of new task */
1409 if ( errcode >= 0 )
1411 struct segment_register reg;
1412 unsigned long linear_addr;
1413 regs->esp -= 4;
1414 hvm_get_segment_register(current, x86_seg_ss, &reg);
1415 /* Todo: do not ignore access faults here. */
1416 if ( hvm_virtual_to_linear_addr(x86_seg_ss, &reg, regs->esp,
1417 4, hvm_access_write, 32,
1418 &linear_addr) )
1419 hvm_copy_to_guest_virt_nofault(linear_addr, &errcode, 4, 0);
1422 out:
1423 hvm_unmap(optss_desc);
1424 hvm_unmap(nptss_desc);
1427 #define HVMCOPY_from_guest (0u<<0)
1428 #define HVMCOPY_to_guest (1u<<0)
1429 #define HVMCOPY_no_fault (0u<<1)
1430 #define HVMCOPY_fault (1u<<1)
1431 #define HVMCOPY_phys (0u<<2)
1432 #define HVMCOPY_virt (1u<<2)
1433 static enum hvm_copy_result __hvm_copy(
1434 void *buf, paddr_t addr, int size, unsigned int flags, uint32_t pfec)
1436 struct vcpu *curr = current;
1437 unsigned long gfn, mfn;
1438 p2m_type_t p2mt;
1439 char *p;
1440 int count, todo = size;
1442 while ( todo > 0 )
1444 count = min_t(int, PAGE_SIZE - (addr & ~PAGE_MASK), todo);
1446 if ( flags & HVMCOPY_virt )
1448 gfn = paging_gva_to_gfn(curr, addr, &pfec);
1449 if ( gfn == INVALID_GFN )
1451 if ( flags & HVMCOPY_fault )
1452 hvm_inject_exception(TRAP_page_fault, pfec, addr);
1453 return HVMCOPY_bad_gva_to_gfn;
1456 else
1458 gfn = addr >> PAGE_SHIFT;
1461 mfn = mfn_x(gfn_to_mfn_current(gfn, &p2mt));
1463 if ( !p2m_is_ram(p2mt) )
1464 return HVMCOPY_bad_gfn_to_mfn;
1465 ASSERT(mfn_valid(mfn));
1467 p = (char *)map_domain_page(mfn) + (addr & ~PAGE_MASK);
1469 if ( flags & HVMCOPY_to_guest )
1471 memcpy(p, buf, count);
1472 paging_mark_dirty(curr->domain, mfn);
1474 else
1476 memcpy(buf, p, count);
1479 unmap_domain_page(p);
1481 addr += count;
1482 buf += count;
1483 todo -= count;
1486 return HVMCOPY_okay;
1489 enum hvm_copy_result hvm_copy_to_guest_phys(
1490 paddr_t paddr, void *buf, int size)
1492 return __hvm_copy(buf, paddr, size,
1493 HVMCOPY_to_guest | HVMCOPY_fault | HVMCOPY_phys,
1494 0);
1497 enum hvm_copy_result hvm_copy_from_guest_phys(
1498 void *buf, paddr_t paddr, int size)
1500 return __hvm_copy(buf, paddr, size,
1501 HVMCOPY_from_guest | HVMCOPY_fault | HVMCOPY_phys,
1502 0);
1505 enum hvm_copy_result hvm_copy_to_guest_virt(
1506 unsigned long vaddr, void *buf, int size, uint32_t pfec)
1508 return __hvm_copy(buf, vaddr, size,
1509 HVMCOPY_to_guest | HVMCOPY_fault | HVMCOPY_virt,
1510 PFEC_page_present | PFEC_write_access | pfec);
1513 enum hvm_copy_result hvm_copy_from_guest_virt(
1514 void *buf, unsigned long vaddr, int size, uint32_t pfec)
1516 return __hvm_copy(buf, vaddr, size,
1517 HVMCOPY_from_guest | HVMCOPY_fault | HVMCOPY_virt,
1518 PFEC_page_present | pfec);
1521 enum hvm_copy_result hvm_fetch_from_guest_virt(
1522 void *buf, unsigned long vaddr, int size, uint32_t pfec)
1524 if ( hvm_nx_enabled(current) )
1525 pfec |= PFEC_insn_fetch;
1526 return __hvm_copy(buf, vaddr, size,
1527 HVMCOPY_from_guest | HVMCOPY_fault | HVMCOPY_virt,
1528 PFEC_page_present | pfec);
1531 enum hvm_copy_result hvm_copy_to_guest_virt_nofault(
1532 unsigned long vaddr, void *buf, int size, uint32_t pfec)
1534 return __hvm_copy(buf, vaddr, size,
1535 HVMCOPY_to_guest | HVMCOPY_no_fault | HVMCOPY_virt,
1536 PFEC_page_present | PFEC_write_access | pfec);
1539 enum hvm_copy_result hvm_copy_from_guest_virt_nofault(
1540 void *buf, unsigned long vaddr, int size, uint32_t pfec)
1542 return __hvm_copy(buf, vaddr, size,
1543 HVMCOPY_from_guest | HVMCOPY_no_fault | HVMCOPY_virt,
1544 PFEC_page_present | pfec);
1547 enum hvm_copy_result hvm_fetch_from_guest_virt_nofault(
1548 void *buf, unsigned long vaddr, int size, uint32_t pfec)
1550 if ( hvm_nx_enabled(current) )
1551 pfec |= PFEC_insn_fetch;
1552 return __hvm_copy(buf, vaddr, size,
1553 HVMCOPY_from_guest | HVMCOPY_no_fault | HVMCOPY_virt,
1554 PFEC_page_present | pfec);
1557 DEFINE_PER_CPU(int, guest_handles_in_xen_space);
1559 unsigned long copy_to_user_hvm(void *to, const void *from, unsigned len)
1561 int rc;
1563 if ( this_cpu(guest_handles_in_xen_space) )
1565 memcpy(to, from, len);
1566 return 0;
1569 rc = hvm_copy_to_guest_virt_nofault((unsigned long)to, (void *)from,
1570 len, 0);
1571 return rc ? len : 0; /* fake a copy_to_user() return code */
1574 unsigned long copy_from_user_hvm(void *to, const void *from, unsigned len)
1576 int rc;
1578 if ( this_cpu(guest_handles_in_xen_space) )
1580 memcpy(to, from, len);
1581 return 0;
1584 rc = hvm_copy_from_guest_virt_nofault(to, (unsigned long)from, len, 0);
1585 return rc ? len : 0; /* fake a copy_from_user() return code */
1588 #define bitmaskof(idx) (1U << ((idx) & 31))
1589 void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
1590 unsigned int *ecx, unsigned int *edx)
1592 struct vcpu *v = current;
1594 if ( cpuid_hypervisor_leaves(input, eax, ebx, ecx, edx) )
1595 return;
1597 domain_cpuid(v->domain, input, *ecx, eax, ebx, ecx, edx);
1599 if ( input == 0x00000001 )
1601 /* Fix up VLAPIC details. */
1602 *ebx &= 0x00FFFFFFu;
1603 *ebx |= (v->vcpu_id * 2) << 24;
1604 if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
1605 __clear_bit(X86_FEATURE_APIC & 31, ebx);
1609 int hvm_msr_read_intercept(struct cpu_user_regs *regs)
1611 uint32_t ecx = regs->ecx;
1612 uint64_t msr_content = 0;
1613 struct vcpu *v = current;
1614 uint64_t *var_range_base, *fixed_range_base;
1615 int index, mtrr;
1616 uint32_t cpuid[4];
1618 var_range_base = (uint64_t *)v->arch.hvm_vcpu.mtrr.var_ranges;
1619 fixed_range_base = (uint64_t *)v->arch.hvm_vcpu.mtrr.fixed_ranges;
1621 hvm_cpuid(1, &cpuid[0], &cpuid[1], &cpuid[2], &cpuid[3]);
1622 mtrr = !!(cpuid[3] & bitmaskof(X86_FEATURE_MTRR));
1624 switch ( ecx )
1626 case MSR_IA32_TSC:
1627 msr_content = hvm_get_guest_time(v);
1628 break;
1630 case MSR_IA32_APICBASE:
1631 msr_content = vcpu_vlapic(v)->hw.apic_base_msr;
1632 break;
1634 case MSR_IA32_MCG_CAP:
1635 case MSR_IA32_MCG_STATUS:
1636 case MSR_IA32_MC0_STATUS:
1637 case MSR_IA32_MC1_STATUS:
1638 case MSR_IA32_MC2_STATUS:
1639 case MSR_IA32_MC3_STATUS:
1640 case MSR_IA32_MC4_STATUS:
1641 case MSR_IA32_MC5_STATUS:
1642 /* No point in letting the guest see real MCEs */
1643 msr_content = 0;
1644 break;
1646 case MSR_IA32_CR_PAT:
1647 msr_content = v->arch.hvm_vcpu.pat_cr;
1648 break;
1650 case MSR_MTRRcap:
1651 if ( !mtrr )
1652 goto gp_fault;
1653 msr_content = v->arch.hvm_vcpu.mtrr.mtrr_cap;
1654 break;
1655 case MSR_MTRRdefType:
1656 if ( !mtrr )
1657 goto gp_fault;
1658 msr_content = v->arch.hvm_vcpu.mtrr.def_type
1659 | (v->arch.hvm_vcpu.mtrr.enabled << 10);
1660 break;
1661 case MSR_MTRRfix64K_00000:
1662 if ( !mtrr )
1663 goto gp_fault;
1664 msr_content = fixed_range_base[0];
1665 break;
1666 case MSR_MTRRfix16K_80000:
1667 case MSR_MTRRfix16K_A0000:
1668 if ( !mtrr )
1669 goto gp_fault;
1670 index = regs->ecx - MSR_MTRRfix16K_80000;
1671 msr_content = fixed_range_base[index + 1];
1672 break;
1673 case MSR_MTRRfix4K_C0000...MSR_MTRRfix4K_F8000:
1674 if ( !mtrr )
1675 goto gp_fault;
1676 index = regs->ecx - MSR_MTRRfix4K_C0000;
1677 msr_content = fixed_range_base[index + 3];
1678 break;
1679 case MSR_IA32_MTRR_PHYSBASE0...MSR_IA32_MTRR_PHYSMASK7:
1680 if ( !mtrr )
1681 goto gp_fault;
1682 index = regs->ecx - MSR_IA32_MTRR_PHYSBASE0;
1683 msr_content = var_range_base[index];
1684 break;
1686 default:
1687 return hvm_funcs.msr_read_intercept(regs);
1690 regs->eax = (uint32_t)msr_content;
1691 regs->edx = (uint32_t)(msr_content >> 32);
1692 return X86EMUL_OKAY;
1694 gp_fault:
1695 hvm_inject_exception(TRAP_gp_fault, 0, 0);
1696 return X86EMUL_EXCEPTION;
1699 int hvm_msr_write_intercept(struct cpu_user_regs *regs)
1701 extern bool_t mtrr_var_range_msr_set(
1702 struct mtrr_state *v, u32 msr, u64 msr_content);
1703 extern bool_t mtrr_fix_range_msr_set(
1704 struct mtrr_state *v, int row, u64 msr_content);
1705 extern bool_t mtrr_def_type_msr_set(struct mtrr_state *v, u64 msr_content);
1706 extern bool_t pat_msr_set(u64 *pat, u64 msr);
1708 uint32_t ecx = regs->ecx;
1709 uint64_t msr_content = (uint32_t)regs->eax | ((uint64_t)regs->edx << 32);
1710 struct vcpu *v = current;
1711 int index, mtrr;
1712 uint32_t cpuid[4];
1714 hvm_cpuid(1, &cpuid[0], &cpuid[1], &cpuid[2], &cpuid[3]);
1715 mtrr = !!(cpuid[3] & bitmaskof(X86_FEATURE_MTRR));
1717 switch ( ecx )
1719 case MSR_IA32_TSC:
1720 hvm_set_guest_time(v, msr_content);
1721 pt_reset(v);
1722 break;
1724 case MSR_IA32_APICBASE:
1725 vlapic_msr_set(vcpu_vlapic(v), msr_content);
1726 break;
1728 case MSR_IA32_CR_PAT:
1729 if ( !pat_msr_set(&v->arch.hvm_vcpu.pat_cr, msr_content) )
1730 goto gp_fault;
1731 break;
1733 case MSR_MTRRcap:
1734 if ( !mtrr )
1735 goto gp_fault;
1736 goto gp_fault;
1737 case MSR_MTRRdefType:
1738 if ( !mtrr )
1739 goto gp_fault;
1740 if ( !mtrr_def_type_msr_set(&v->arch.hvm_vcpu.mtrr, msr_content) )
1741 goto gp_fault;
1742 break;
1743 case MSR_MTRRfix64K_00000:
1744 if ( !mtrr )
1745 goto gp_fault;
1746 if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr, 0, msr_content) )
1747 goto gp_fault;
1748 break;
1749 case MSR_MTRRfix16K_80000:
1750 case MSR_MTRRfix16K_A0000:
1751 if ( !mtrr )
1752 goto gp_fault;
1753 index = regs->ecx - MSR_MTRRfix16K_80000 + 1;
1754 if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr,
1755 index, msr_content) )
1756 goto gp_fault;
1757 break;
1758 case MSR_MTRRfix4K_C0000...MSR_MTRRfix4K_F8000:
1759 if ( !mtrr )
1760 goto gp_fault;
1761 index = regs->ecx - MSR_MTRRfix4K_C0000 + 3;
1762 if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr,
1763 index, msr_content) )
1764 goto gp_fault;
1765 break;
1766 case MSR_IA32_MTRR_PHYSBASE0...MSR_IA32_MTRR_PHYSMASK7:
1767 if ( !mtrr )
1768 goto gp_fault;
1769 if ( !mtrr_var_range_msr_set(&v->arch.hvm_vcpu.mtrr,
1770 regs->ecx, msr_content) )
1771 goto gp_fault;
1772 break;
1774 default:
1775 return hvm_funcs.msr_write_intercept(regs);
1778 return X86EMUL_OKAY;
1780 gp_fault:
1781 hvm_inject_exception(TRAP_gp_fault, 0, 0);
1782 return X86EMUL_EXCEPTION;
1785 enum hvm_intblk hvm_interrupt_blocked(struct vcpu *v, struct hvm_intack intack)
1787 unsigned long intr_shadow;
1789 ASSERT(v == current);
1791 if ( (intack.source != hvm_intsrc_nmi) &&
1792 !(guest_cpu_user_regs()->eflags & X86_EFLAGS_IF) )
1793 return hvm_intblk_rflags_ie;
1795 intr_shadow = hvm_funcs.get_interrupt_shadow(v);
1797 if ( intr_shadow & (HVM_INTR_SHADOW_STI|HVM_INTR_SHADOW_MOV_SS) )
1798 return hvm_intblk_shadow;
1800 if ( intack.source == hvm_intsrc_nmi )
1801 return ((intr_shadow & HVM_INTR_SHADOW_NMI) ?
1802 hvm_intblk_nmi_iret : hvm_intblk_none);
1804 if ( intack.source == hvm_intsrc_lapic )
1806 uint32_t tpr = vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI) & 0xF0;
1807 if ( (tpr >> 4) >= (intack.vector >> 4) )
1808 return hvm_intblk_tpr;
1811 return hvm_intblk_none;
1814 static long hvm_grant_table_op(
1815 unsigned int cmd, XEN_GUEST_HANDLE(void) uop, unsigned int count)
1817 if ( (cmd != GNTTABOP_query_size) && (cmd != GNTTABOP_setup_table) )
1818 return -ENOSYS; /* all other commands need auditing */
1819 return do_grant_table_op(cmd, uop, count);
1822 static long hvm_memory_op(int cmd, XEN_GUEST_HANDLE(void) arg)
1824 long rc = do_memory_op(cmd, arg);
1825 if ( (cmd & MEMOP_CMD_MASK) == XENMEM_decrease_reservation )
1826 current->domain->arch.hvm_domain.qemu_mapcache_invalidate = 1;
1827 return rc;
1830 typedef unsigned long hvm_hypercall_t(
1831 unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
1833 #define HYPERCALL(x) \
1834 [ __HYPERVISOR_ ## x ] = (hvm_hypercall_t *) do_ ## x
1836 #if defined(__i386__)
1838 static hvm_hypercall_t *hvm_hypercall32_table[NR_hypercalls] = {
1839 [ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)hvm_memory_op,
1840 [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
1841 HYPERCALL(xen_version),
1842 HYPERCALL(event_channel_op),
1843 HYPERCALL(sched_op),
1844 HYPERCALL(hvm_op)
1845 };
1847 #else /* defined(__x86_64__) */
1849 static long hvm_memory_op_compat32(int cmd, XEN_GUEST_HANDLE(void) arg)
1851 extern long do_add_to_physmap(struct xen_add_to_physmap *xatp);
1852 long rc;
1854 switch ( cmd )
1856 case XENMEM_add_to_physmap:
1858 struct {
1859 domid_t domid;
1860 uint32_t space;
1861 uint32_t idx;
1862 uint32_t gpfn;
1863 } u;
1864 struct xen_add_to_physmap h;
1866 if ( copy_from_guest(&u, arg, 1) )
1867 return -EFAULT;
1869 h.domid = u.domid;
1870 h.space = u.space;
1871 h.idx = u.idx;
1872 h.gpfn = u.gpfn;
1874 this_cpu(guest_handles_in_xen_space) = 1;
1875 rc = hvm_memory_op(cmd, guest_handle_from_ptr(&h, void));
1876 this_cpu(guest_handles_in_xen_space) = 0;
1878 break;
1881 default:
1882 gdprintk(XENLOG_WARNING, "memory_op %d.\n", cmd);
1883 rc = -ENOSYS;
1884 break;
1887 return rc;
1890 static hvm_hypercall_t *hvm_hypercall64_table[NR_hypercalls] = {
1891 [ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)hvm_memory_op,
1892 [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
1893 HYPERCALL(xen_version),
1894 HYPERCALL(event_channel_op),
1895 HYPERCALL(sched_op),
1896 HYPERCALL(hvm_op)
1897 };
1899 static hvm_hypercall_t *hvm_hypercall32_table[NR_hypercalls] = {
1900 [ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)hvm_memory_op_compat32,
1901 [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
1902 HYPERCALL(xen_version),
1903 HYPERCALL(event_channel_op),
1904 HYPERCALL(sched_op),
1905 HYPERCALL(hvm_op)
1906 };
1908 #endif /* defined(__x86_64__) */
1910 int hvm_do_hypercall(struct cpu_user_regs *regs)
1912 struct vcpu *curr = current;
1913 struct segment_register sreg;
1914 int mode = hvm_guest_x86_mode(curr);
1915 uint32_t eax = regs->eax;
1917 switch ( mode )
1919 #ifdef __x86_64__
1920 case 8:
1921 #endif
1922 case 4:
1923 case 2:
1924 hvm_get_segment_register(curr, x86_seg_ss, &sreg);
1925 if ( unlikely(sreg.attr.fields.dpl == 3) )
1927 default:
1928 regs->eax = -EPERM;
1929 return HVM_HCALL_completed;
1931 case 0:
1932 break;
1935 if ( (eax >= NR_hypercalls) || !hvm_hypercall32_table[eax] )
1937 regs->eax = -ENOSYS;
1938 return HVM_HCALL_completed;
1941 this_cpu(hc_preempted) = 0;
1943 #ifdef __x86_64__
1944 if ( mode == 8 )
1946 HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u(%lx, %lx, %lx, %lx, %lx)", eax,
1947 regs->rdi, regs->rsi, regs->rdx, regs->r10, regs->r8);
1949 regs->rax = hvm_hypercall64_table[eax](regs->rdi,
1950 regs->rsi,
1951 regs->rdx,
1952 regs->r10,
1953 regs->r8);
1955 else
1956 #endif
1958 HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u(%x, %x, %x, %x, %x)", eax,
1959 (uint32_t)regs->ebx, (uint32_t)regs->ecx,
1960 (uint32_t)regs->edx, (uint32_t)regs->esi,
1961 (uint32_t)regs->edi);
1963 regs->eax = hvm_hypercall32_table[eax]((uint32_t)regs->ebx,
1964 (uint32_t)regs->ecx,
1965 (uint32_t)regs->edx,
1966 (uint32_t)regs->esi,
1967 (uint32_t)regs->edi);
1970 HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u -> %lx",
1971 eax, (unsigned long)regs->eax);
1973 if ( this_cpu(hc_preempted) )
1974 return HVM_HCALL_preempted;
1976 if ( unlikely(curr->domain->arch.hvm_domain.qemu_mapcache_invalidate) &&
1977 test_and_clear_bool(curr->domain->arch.hvm_domain.
1978 qemu_mapcache_invalidate) )
1979 return HVM_HCALL_invalidate;
1981 return HVM_HCALL_completed;
1984 static void hvm_latch_shinfo_size(struct domain *d)
1986 /*
1987 * Called from operations which are among the very first executed by
1988 * PV drivers on initialisation or after save/restore. These are sensible
1989 * points at which to sample the execution mode of the guest and latch
1990 * 32- or 64-bit format for shared state.
1991 */
1992 if ( current->domain == d )
1993 d->arch.has_32bit_shinfo = (hvm_guest_x86_mode(current) != 8);
1996 /* Initialise a hypercall transfer page for a VMX domain using
1997 paravirtualised drivers. */
1998 void hvm_hypercall_page_initialise(struct domain *d,
1999 void *hypercall_page)
2001 hvm_latch_shinfo_size(d);
2002 hvm_funcs.init_hypercall_page(d, hypercall_page);
2005 static int hvmop_set_pci_intx_level(
2006 XEN_GUEST_HANDLE(xen_hvm_set_pci_intx_level_t) uop)
2008 struct xen_hvm_set_pci_intx_level op;
2009 struct domain *d;
2010 int rc;
2012 if ( copy_from_guest(&op, uop, 1) )
2013 return -EFAULT;
2015 if ( (op.domain > 0) || (op.bus > 0) || (op.device > 31) || (op.intx > 3) )
2016 return -EINVAL;
2018 d = rcu_lock_domain_by_id(op.domid);
2019 if ( d == NULL )
2020 return -ESRCH;
2022 rc = -EPERM;
2023 if ( !IS_PRIV_FOR(current->domain, d) )
2024 goto out;
2026 rc = -EINVAL;
2027 if ( !is_hvm_domain(d) )
2028 goto out;
2030 rc = xsm_hvm_set_pci_intx_level(d);
2031 if ( rc )
2032 goto out;
2034 rc = 0;
2035 switch ( op.level )
2037 case 0:
2038 hvm_pci_intx_deassert(d, op.device, op.intx);
2039 break;
2040 case 1:
2041 hvm_pci_intx_assert(d, op.device, op.intx);
2042 break;
2043 default:
2044 rc = -EINVAL;
2045 break;
2048 out:
2049 rcu_unlock_domain(d);
2050 return rc;
2053 static int hvmop_set_isa_irq_level(
2054 XEN_GUEST_HANDLE(xen_hvm_set_isa_irq_level_t) uop)
2056 struct xen_hvm_set_isa_irq_level op;
2057 struct domain *d;
2058 int rc;
2060 if ( copy_from_guest(&op, uop, 1) )
2061 return -EFAULT;
2063 if ( op.isa_irq > 15 )
2064 return -EINVAL;
2066 d = rcu_lock_domain_by_id(op.domid);
2067 if ( d == NULL )
2068 return -ESRCH;
2070 rc = -EPERM;
2071 if ( !IS_PRIV_FOR(current->domain, d) )
2072 goto out;
2074 rc = -EINVAL;
2075 if ( !is_hvm_domain(d) )
2076 goto out;
2078 rc = xsm_hvm_set_isa_irq_level(d);
2079 if ( rc )
2080 goto out;
2082 rc = 0;
2083 switch ( op.level )
2085 case 0:
2086 hvm_isa_irq_deassert(d, op.isa_irq);
2087 break;
2088 case 1:
2089 hvm_isa_irq_assert(d, op.isa_irq);
2090 break;
2091 default:
2092 rc = -EINVAL;
2093 break;
2096 out:
2097 rcu_unlock_domain(d);
2098 return rc;
2101 static int hvmop_set_pci_link_route(
2102 XEN_GUEST_HANDLE(xen_hvm_set_pci_link_route_t) uop)
2104 struct xen_hvm_set_pci_link_route op;
2105 struct domain *d;
2106 int rc;
2108 if ( copy_from_guest(&op, uop, 1) )
2109 return -EFAULT;
2111 if ( (op.link > 3) || (op.isa_irq > 15) )
2112 return -EINVAL;
2114 d = rcu_lock_domain_by_id(op.domid);
2115 if ( d == NULL )
2116 return -ESRCH;
2118 rc = -EPERM;
2119 if ( !IS_PRIV_FOR(current->domain, d) )
2120 goto out;
2122 rc = -EINVAL;
2123 if ( !is_hvm_domain(d) )
2124 goto out;
2126 rc = xsm_hvm_set_pci_link_route(d);
2127 if ( rc )
2128 goto out;
2130 rc = 0;
2131 hvm_set_pci_link_route(d, op.link, op.isa_irq);
2133 out:
2134 rcu_unlock_domain(d);
2135 return rc;
2138 static int hvmop_flush_tlb_all(void)
2140 struct domain *d = current->domain;
2141 struct vcpu *v;
2143 /* Avoid deadlock if more than one vcpu tries this at the same time. */
2144 if ( !spin_trylock(&d->hypercall_deadlock_mutex) )
2145 return -EAGAIN;
2147 /* Pause all other vcpus. */
2148 for_each_vcpu ( d, v )
2149 if ( v != current )
2150 vcpu_pause_nosync(v);
2152 /* Now that all VCPUs are signalled to deschedule, we wait... */
2153 for_each_vcpu ( d, v )
2154 if ( v != current )
2155 while ( !vcpu_runnable(v) && v->is_running )
2156 cpu_relax();
2158 /* All other vcpus are paused, safe to unlock now. */
2159 spin_unlock(&d->hypercall_deadlock_mutex);
2161 /* Flush paging-mode soft state (e.g., va->gfn cache; PAE PDPE cache). */
2162 for_each_vcpu ( d, v )
2163 paging_update_cr3(v);
2165 /* Flush all dirty TLBs. */
2166 flush_tlb_mask(d->domain_dirty_cpumask);
2168 /* Done. */
2169 for_each_vcpu ( d, v )
2170 if ( v != current )
2171 vcpu_unpause(v);
2173 return 0;
2176 long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE(void) arg)
2179 long rc = 0;
2181 switch ( op )
2183 case HVMOP_set_param:
2184 case HVMOP_get_param:
2186 struct xen_hvm_param a;
2187 struct hvm_ioreq_page *iorp;
2188 struct domain *d;
2189 struct vcpu *v;
2191 if ( copy_from_guest(&a, arg, 1) )
2192 return -EFAULT;
2194 if ( a.index >= HVM_NR_PARAMS )
2195 return -EINVAL;
2197 if ( a.domid == DOMID_SELF )
2199 d = rcu_lock_current_domain();
2201 else
2203 if ( (d = rcu_lock_domain_by_id(a.domid)) == NULL )
2204 return -ESRCH;
2205 if ( !IS_PRIV_FOR(current->domain, d) )
2207 rc = -EPERM;
2208 goto param_fail;
2213 rc = -EINVAL;
2214 if ( !is_hvm_domain(d) )
2215 goto param_fail;
2217 rc = xsm_hvm_param(d, op);
2218 if ( rc )
2219 goto param_fail;
2221 if ( op == HVMOP_set_param )
2223 rc = 0;
2225 switch ( a.index )
2227 case HVM_PARAM_IOREQ_PFN:
2228 iorp = &d->arch.hvm_domain.ioreq;
2229 if ( (rc = hvm_set_ioreq_page(d, iorp, a.value)) != 0 )
2230 break;
2231 spin_lock(&iorp->lock);
2232 if ( iorp->va != NULL )
2233 /* Initialise evtchn port info if VCPUs already created. */
2234 for_each_vcpu ( d, v )
2235 get_ioreq(v)->vp_eport = v->arch.hvm_vcpu.xen_port;
2236 spin_unlock(&iorp->lock);
2237 break;
2238 case HVM_PARAM_BUFIOREQ_PFN:
2239 iorp = &d->arch.hvm_domain.buf_ioreq;
2240 rc = hvm_set_ioreq_page(d, iorp, a.value);
2241 break;
2242 case HVM_PARAM_CALLBACK_IRQ:
2243 hvm_set_callback_via(d, a.value);
2244 hvm_latch_shinfo_size(d);
2245 break;
2246 case HVM_PARAM_TIMER_MODE:
2247 if ( a.value > HVMPTM_one_missed_tick_pending )
2248 rc = -EINVAL;
2249 break;
2250 case HVM_PARAM_IDENT_PT:
2251 rc = -EPERM;
2252 if ( !IS_PRIV(current->domain) )
2253 break;
2255 rc = -EINVAL;
2256 if ( d->arch.hvm_domain.params[a.index] != 0 )
2257 break;
2259 rc = 0;
2260 if ( !paging_mode_hap(d) )
2261 break;
2263 domain_pause(d);
2265 /*
2266 * Update GUEST_CR3 in each VMCS to point at identity map.
2267 * All foreign updates to guest state must synchronise on
2268 * the domctl_lock.
2269 */
2270 spin_lock(&domctl_lock);
2271 d->arch.hvm_domain.params[a.index] = a.value;
2272 for_each_vcpu ( d, v )
2273 paging_update_cr3(v);
2274 spin_unlock(&domctl_lock);
2276 domain_unpause(d);
2277 break;
2278 case HVM_PARAM_DM_DOMAIN:
2279 /* Privileged domains only, as we must domain_pause(d). */
2280 rc = -EPERM;
2281 if ( !IS_PRIV_FOR(current->domain, d) )
2282 break;
2284 if ( a.value == DOMID_SELF )
2285 a.value = current->domain->domain_id;
2287 rc = 0;
2288 domain_pause(d); /* safe to change per-vcpu xen_port */
2289 iorp = &d->arch.hvm_domain.ioreq;
2290 for_each_vcpu ( d, v )
2292 int old_port, new_port;
2293 new_port = alloc_unbound_xen_event_channel(v, a.value);
2294 if ( new_port < 0 )
2296 rc = new_port;
2297 break;
2299 /* xchg() ensures that only we free_xen_event_channel() */
2300 old_port = xchg(&v->arch.hvm_vcpu.xen_port, new_port);
2301 free_xen_event_channel(v, old_port);
2302 spin_lock(&iorp->lock);
2303 if ( iorp->va != NULL )
2304 get_ioreq(v)->vp_eport = v->arch.hvm_vcpu.xen_port;
2305 spin_unlock(&iorp->lock);
2307 domain_unpause(d);
2308 break;
2311 if ( rc == 0 )
2312 d->arch.hvm_domain.params[a.index] = a.value;
2314 else
2316 a.value = d->arch.hvm_domain.params[a.index];
2317 rc = copy_to_guest(arg, &a, 1) ? -EFAULT : 0;
2320 HVM_DBG_LOG(DBG_LEVEL_HCALL, "%s param %u = %"PRIx64,
2321 op == HVMOP_set_param ? "set" : "get",
2322 a.index, a.value);
2324 param_fail:
2325 rcu_unlock_domain(d);
2326 break;
2329 case HVMOP_set_pci_intx_level:
2330 rc = hvmop_set_pci_intx_level(
2331 guest_handle_cast(arg, xen_hvm_set_pci_intx_level_t));
2332 break;
2334 case HVMOP_set_isa_irq_level:
2335 rc = hvmop_set_isa_irq_level(
2336 guest_handle_cast(arg, xen_hvm_set_isa_irq_level_t));
2337 break;
2339 case HVMOP_set_pci_link_route:
2340 rc = hvmop_set_pci_link_route(
2341 guest_handle_cast(arg, xen_hvm_set_pci_link_route_t));
2342 break;
2344 case HVMOP_flush_tlbs:
2345 rc = guest_handle_is_null(arg) ? hvmop_flush_tlb_all() : -ENOSYS;
2346 break;
2348 case HVMOP_track_dirty_vram:
2350 struct xen_hvm_track_dirty_vram a;
2351 struct domain *d;
2353 if ( copy_from_guest(&a, arg, 1) )
2354 return -EFAULT;
2356 if ( a.domid == DOMID_SELF )
2358 d = rcu_lock_current_domain();
2360 else
2362 if ( (d = rcu_lock_domain_by_id(a.domid)) == NULL )
2363 return -ESRCH;
2364 if ( !IS_PRIV_FOR(current->domain, d) )
2366 rc = -EPERM;
2367 goto param_fail2;
2371 rc = -EINVAL;
2372 if ( !is_hvm_domain(d) )
2373 goto param_fail2;
2375 rc = xsm_hvm_param(d, op);
2376 if ( rc )
2377 goto param_fail2;
2379 rc = -ESRCH;
2380 if ( d->is_dying )
2381 goto param_fail2;
2383 rc = -EINVAL;
2384 if ( !shadow_mode_enabled(d))
2385 goto param_fail2;
2386 if ( d->vcpu[0] == NULL )
2387 goto param_fail2;
2389 rc = shadow_track_dirty_vram(d, a.first_pfn, a.nr, a.dirty_bitmap);
2391 param_fail2:
2392 rcu_unlock_domain(d);
2393 break;
2396 default:
2398 gdprintk(XENLOG_WARNING, "Bad HVM op %ld.\n", op);
2399 rc = -ENOSYS;
2400 break;
2404 if ( rc == -EAGAIN )
2405 rc = hypercall_create_continuation(
2406 __HYPERVISOR_hvm_op, "lh", op, arg);
2408 return rc;
2411 /*
2412 * Local variables:
2413 * mode: C
2414 * c-set-style: "BSD"
2415 * c-basic-offset: 4
2416 * tab-width: 4
2417 * indent-tabs-mode: nil
2418 * End:
2419 */