ia64/xen-unstable

view xen/arch/x86/hvm/hvm.c @ 19825:81edfffb3aff

Scale the guest's TSC when the target machine's TSC frequency differs from
what the guest requires.

Each guest rdtsc instruction is trapped and emulated for now; this may be
optimized later.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed Jun 24 11:05:22 2009 +0100 (2009-06-24)
parents 2f9e1348aa98
children
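
A minimal illustrative sketch of the scaling arithmetic this changeset introduces,
mirroring hvm_gtsc_need_scale()/hvm_h2g_scale_tsc() in the listing below. The
muldiv64() stand-in here uses 128-bit arithmetic purely for illustration and is not
Xen's implementation; the real code also decides whether to scale at MHz rather
than kHz granularity.

    #include <stdint.h>

    /* Illustrative stand-in for Xen's muldiv64(): returns a * b / c without
     * intermediate overflow (Xen's real helper is implemented differently). */
    static uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
    {
        return (uint64_t)(((unsigned __int128)a * b) / c);
    }

    /* Rescale a host TSC reading to the guest's advertised TSC frequency,
     * as hvm_h2g_scale_tsc() does when the two frequencies differ. */
    static uint64_t scale_host_tsc(uint64_t host_tsc,
                                   uint32_t gtsc_khz, uint32_t htsc_khz)
    {
        if ( gtsc_khz == 0 || gtsc_khz == htsc_khz )
            return host_tsc;                 /* no scaling required */
        return muldiv64(host_tsc, gtsc_khz, htsc_khz);
    }
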
line source
1 /*
2 * hvm.c: Common hardware virtual machine abstractions.
3 *
4 * Copyright (c) 2004, Intel Corporation.
5 * Copyright (c) 2005, International Business Machines Corporation.
6 * Copyright (c) 2008, Citrix Systems, Inc.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along with
18 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
19 * Place - Suite 330, Boston, MA 02111-1307 USA.
20 */
22 #include <xen/config.h>
23 #include <xen/ctype.h>
24 #include <xen/init.h>
25 #include <xen/lib.h>
26 #include <xen/trace.h>
27 #include <xen/sched.h>
28 #include <xen/irq.h>
29 #include <xen/softirq.h>
30 #include <xen/domain.h>
31 #include <xen/domain_page.h>
32 #include <xen/hypercall.h>
33 #include <xen/guest_access.h>
34 #include <xen/event.h>
35 #include <xen/paging.h>
36 #include <asm/shadow.h>
37 #include <asm/hap.h>
38 #include <asm/current.h>
39 #include <asm/e820.h>
40 #include <asm/io.h>
41 #include <asm/regs.h>
42 #include <asm/cpufeature.h>
43 #include <asm/processor.h>
44 #include <asm/types.h>
45 #include <asm/msr.h>
46 #include <asm/mc146818rtc.h>
47 #include <asm/spinlock.h>
48 #include <asm/hvm/hvm.h>
49 #include <asm/hvm/vpt.h>
50 #include <asm/hvm/support.h>
51 #include <asm/hvm/cacheattr.h>
52 #include <asm/hvm/trace.h>
53 #include <public/sched.h>
54 #include <public/hvm/ioreq.h>
55 #include <public/version.h>
56 #include <public/memory.h>
58 int hvm_enabled __read_mostly;
60 unsigned int opt_hvm_debug_level __read_mostly;
61 integer_param("hvm_debug", opt_hvm_debug_level);
63 int opt_softtsc;
64 boolean_param("softtsc", opt_softtsc);
66 struct hvm_function_table hvm_funcs __read_mostly;
68 /* I/O permission bitmap is globally shared by all HVM guests. */
69 unsigned long __attribute__ ((__section__ (".bss.page_aligned")))
70 hvm_io_bitmap[3*PAGE_SIZE/BYTES_PER_LONG];
72 void hvm_enable(struct hvm_function_table *fns)
73 {
74 extern int hvm_port80_allowed;
76 BUG_ON(hvm_enabled);
77 printk("HVM: %s enabled\n", fns->name);
79 /*
80 * Allow direct access to the PC debug ports 0x80 and 0xed (they are
81 * often used for I/O delays, but the vmexits simply slow things down).
82 */
83 memset(hvm_io_bitmap, ~0, sizeof(hvm_io_bitmap));
84 if ( hvm_port80_allowed )
85 __clear_bit(0x80, hvm_io_bitmap);
86 __clear_bit(0xed, hvm_io_bitmap);
88 hvm_funcs = *fns;
89 hvm_enabled = 1;
91 if ( hvm_funcs.hap_supported )
92 printk("HVM: Hardware Assisted Paging detected.\n");
93 }
95 /*
96 * Need to re-inject a given event? We avoid re-injecting software exceptions
97 * and interrupts because the faulting/trapping instruction can simply be
98 * re-executed (neither VMX nor SVM update RIP when they VMEXIT during
99 * INT3/INTO/INTn).
100 */
101 int hvm_event_needs_reinjection(uint8_t type, uint8_t vector)
102 {
103 switch ( type )
104 {
105 case X86_EVENTTYPE_EXT_INTR:
106 case X86_EVENTTYPE_NMI:
107 return 1;
108 case X86_EVENTTYPE_HW_EXCEPTION:
109 /*
110 * SVM uses type 3 ("HW Exception") for #OF and #BP. We explicitly
111 * check for these vectors, as they are really SW Exceptions. SVM has
112 * not updated RIP to point after the trapping instruction (INT3/INTO).
113 */
114 return (vector != 3) && (vector != 4);
115 default:
116 /* Software exceptions/interrupts can be re-executed (e.g., INT n). */
117 break;
118 }
119 return 0;
120 }
122 /*
123 * Combine two hardware exceptions: @vec2 was raised during delivery of @vec1.
124 * This means we can assume that @vec2 is contributory or a page fault.
125 */
126 uint8_t hvm_combine_hw_exceptions(uint8_t vec1, uint8_t vec2)
127 {
128 /* Exception during double-fault delivery always causes a triple fault. */
129 if ( vec1 == TRAP_double_fault )
130 {
131 hvm_triple_fault();
132 return TRAP_double_fault; /* dummy return */
133 }
135 /* Exception during page-fault delivery always causes a double fault. */
136 if ( vec1 == TRAP_page_fault )
137 return TRAP_double_fault;
139 /* Discard the first exception if it's benign or if we now have a #PF. */
140 if ( !((1u << vec1) & 0x7c01u) || (vec2 == TRAP_page_fault) )
141 return vec2;
143 /* Cannot combine the exceptions: double fault. */
144 return TRAP_double_fault;
145 }
147 void hvm_enable_rdtsc_exiting(struct domain *d)
148 {
149 struct vcpu *v;
151 if ( opt_softtsc || !hvm_funcs.enable_rdtsc_exiting )
152 return;
154 for_each_vcpu ( d, v )
155 hvm_funcs.enable_rdtsc_exiting(v);
156 }
158 int hvm_gtsc_need_scale(struct domain *d)
159 {
160 uint32_t gtsc_mhz, htsc_mhz;
162 gtsc_mhz = d->arch.hvm_domain.gtsc_khz / 1000;
163 htsc_mhz = opt_softtsc ? 1000 : ((uint32_t)cpu_khz / 1000);
165 d->arch.hvm_domain.tsc_scaled = (gtsc_mhz && (gtsc_mhz != htsc_mhz));
166 return d->arch.hvm_domain.tsc_scaled;
167 }
169 static u64 hvm_h2g_scale_tsc(struct vcpu *v, u64 host_tsc)
170 {
171 uint32_t gtsc_khz, htsc_khz;
173 if ( !v->domain->arch.hvm_domain.tsc_scaled )
174 return host_tsc;
176 htsc_khz = opt_softtsc ? 1000000 : cpu_khz;
177 gtsc_khz = v->domain->arch.hvm_domain.gtsc_khz;
178 return muldiv64(host_tsc, gtsc_khz, htsc_khz);
179 }
181 void hvm_set_guest_tsc(struct vcpu *v, u64 guest_tsc)
182 {
183 uint64_t host_tsc, scaled_htsc;
185 if ( opt_softtsc )
186 host_tsc = hvm_get_guest_time(v);
187 else
188 rdtscll(host_tsc);
190 scaled_htsc = hvm_h2g_scale_tsc(v, host_tsc);
192 v->arch.hvm_vcpu.cache_tsc_offset = guest_tsc - scaled_htsc;
193 hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset);
194 }
196 u64 hvm_get_guest_tsc(struct vcpu *v)
197 {
198 uint64_t host_tsc, scaled_htsc;
200 if ( opt_softtsc )
201 host_tsc = hvm_get_guest_time(v);
202 else
203 rdtscll(host_tsc);
205 scaled_htsc = hvm_h2g_scale_tsc(v, host_tsc);
207 return scaled_htsc + v->arch.hvm_vcpu.cache_tsc_offset;
208 }
210 void hvm_migrate_timers(struct vcpu *v)
211 {
212 rtc_migrate_timers(v);
213 pt_migrate(v);
214 }
216 void hvm_do_resume(struct vcpu *v)
217 {
218 ioreq_t *p;
220 pt_restore_timer(v);
222 /* NB. Optimised for common case (p->state == STATE_IOREQ_NONE). */
223 p = &get_ioreq(v)->vp_ioreq;
224 while ( p->state != STATE_IOREQ_NONE )
225 {
226 switch ( p->state )
227 {
228 case STATE_IORESP_READY: /* IORESP_READY -> NONE */
229 hvm_io_assist();
230 break;
231 case STATE_IOREQ_READY: /* IOREQ_{READY,INPROCESS} -> IORESP_READY */
232 case STATE_IOREQ_INPROCESS:
233 wait_on_xen_event_channel(v->arch.hvm_vcpu.xen_port,
234 (p->state != STATE_IOREQ_READY) &&
235 (p->state != STATE_IOREQ_INPROCESS));
236 break;
237 default:
238 gdprintk(XENLOG_ERR, "Weird HVM iorequest state %d.\n", p->state);
239 domain_crash(v->domain);
240 return; /* bail */
241 }
242 }
243 }
245 static void hvm_init_ioreq_page(
246 struct domain *d, struct hvm_ioreq_page *iorp)
247 {
248 memset(iorp, 0, sizeof(*iorp));
249 spin_lock_init(&iorp->lock);
250 domain_pause(d);
251 }
253 static void hvm_destroy_ioreq_page(
254 struct domain *d, struct hvm_ioreq_page *iorp)
255 {
256 spin_lock(&iorp->lock);
258 ASSERT(d->is_dying);
260 if ( iorp->va != NULL )
261 {
262 unmap_domain_page_global(iorp->va);
263 put_page_and_type(iorp->page);
264 iorp->va = NULL;
265 }
267 spin_unlock(&iorp->lock);
268 }
270 static int hvm_set_ioreq_page(
271 struct domain *d, struct hvm_ioreq_page *iorp, unsigned long gmfn)
272 {
273 struct page_info *page;
274 p2m_type_t p2mt;
275 unsigned long mfn;
276 void *va;
278 mfn = mfn_x(gfn_to_mfn(d, gmfn, &p2mt));
279 if ( !p2m_is_ram(p2mt) )
280 return -EINVAL;
281 ASSERT(mfn_valid(mfn));
283 page = mfn_to_page(mfn);
284 if ( !get_page_and_type(page, d, PGT_writable_page) )
285 return -EINVAL;
287 va = map_domain_page_global(mfn);
288 if ( va == NULL )
289 {
290 put_page_and_type(page);
291 return -ENOMEM;
292 }
294 spin_lock(&iorp->lock);
296 if ( (iorp->va != NULL) || d->is_dying )
297 {
298 spin_unlock(&iorp->lock);
299 unmap_domain_page_global(va);
300 put_page_and_type(mfn_to_page(mfn));
301 return -EINVAL;
302 }
304 iorp->va = va;
305 iorp->page = page;
307 spin_unlock(&iorp->lock);
309 domain_unpause(d);
311 return 0;
312 }
314 static int hvm_print_line(
315 int dir, uint32_t port, uint32_t bytes, uint32_t *val)
316 {
317 struct vcpu *curr = current;
318 struct hvm_domain *hd = &curr->domain->arch.hvm_domain;
319 char c = *val;
321 BUG_ON(bytes != 1);
323 /* Accept only printable characters, newline, and horizontal tab. */
324 if ( !isprint(c) && (c != '\n') && (c != '\t') )
325 return X86EMUL_OKAY;
327 spin_lock(&hd->pbuf_lock);
328 hd->pbuf[hd->pbuf_idx++] = c;
329 if ( (hd->pbuf_idx == (sizeof(hd->pbuf) - 2)) || (c == '\n') )
330 {
331 if ( c != '\n' )
332 hd->pbuf[hd->pbuf_idx++] = '\n';
333 hd->pbuf[hd->pbuf_idx] = '\0';
334 printk(XENLOG_G_DEBUG "HVM%u: %s", curr->domain->domain_id, hd->pbuf);
335 hd->pbuf_idx = 0;
336 }
337 spin_unlock(&hd->pbuf_lock);
339 return X86EMUL_OKAY;
340 }
342 int hvm_domain_initialise(struct domain *d)
343 {
344 int rc;
346 if ( !hvm_enabled )
347 {
348 gdprintk(XENLOG_WARNING, "Attempt to create a HVM guest "
349 "on a non-VT/AMDV platform.\n");
350 return -EINVAL;
351 }
353 spin_lock_init(&d->arch.hvm_domain.pbuf_lock);
354 spin_lock_init(&d->arch.hvm_domain.irq_lock);
355 spin_lock_init(&d->arch.hvm_domain.uc_lock);
357 INIT_LIST_HEAD(&d->arch.hvm_domain.msixtbl_list);
358 spin_lock_init(&d->arch.hvm_domain.msixtbl_list_lock);
360 hvm_init_guest_time(d);
362 d->arch.hvm_domain.params[HVM_PARAM_HPET_ENABLED] = 1;
364 hvm_init_cacheattr_region_list(d);
366 rc = paging_enable(d, PG_refcounts|PG_translate|PG_external);
367 if ( rc != 0 )
368 goto fail1;
370 vpic_init(d);
372 rc = vioapic_init(d);
373 if ( rc != 0 )
374 goto fail1;
376 stdvga_init(d);
378 rtc_init(d);
380 hvm_init_ioreq_page(d, &d->arch.hvm_domain.ioreq);
381 hvm_init_ioreq_page(d, &d->arch.hvm_domain.buf_ioreq);
383 register_portio_handler(d, 0xe9, 1, hvm_print_line);
385 rc = hvm_funcs.domain_initialise(d);
386 if ( rc != 0 )
387 goto fail2;
389 return 0;
391 fail2:
392 rtc_deinit(d);
393 stdvga_deinit(d);
394 vioapic_deinit(d);
395 fail1:
396 hvm_destroy_cacheattr_region_list(d);
397 return rc;
398 }
400 extern void msixtbl_pt_cleanup(struct domain *d);
402 void hvm_domain_relinquish_resources(struct domain *d)
403 {
404 hvm_destroy_ioreq_page(d, &d->arch.hvm_domain.ioreq);
405 hvm_destroy_ioreq_page(d, &d->arch.hvm_domain.buf_ioreq);
407 msixtbl_pt_cleanup(d);
409 /* Stop all asynchronous timer actions. */
410 rtc_deinit(d);
411 if ( d->vcpu != NULL && d->vcpu[0] != NULL )
412 {
413 pit_deinit(d);
414 pmtimer_deinit(d);
415 hpet_deinit(d);
416 }
417 }
419 void hvm_domain_destroy(struct domain *d)
420 {
421 hvm_funcs.domain_destroy(d);
422 rtc_deinit(d);
423 stdvga_deinit(d);
424 vioapic_deinit(d);
425 hvm_destroy_cacheattr_region_list(d);
426 }
428 static int hvm_save_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
429 {
430 struct vcpu *v;
431 struct hvm_hw_cpu ctxt;
432 struct segment_register seg;
433 struct vcpu_guest_context *vc;
435 for_each_vcpu ( d, v )
436 {
437 /* We don't need to save state for a vcpu that is down; the restore
438 * code will leave it down if there is nothing saved. */
439 if ( test_bit(_VPF_down, &v->pause_flags) )
440 continue;
442 /* Architecture-specific vmcs/vmcb bits */
443 hvm_funcs.save_cpu_ctxt(v, &ctxt);
445 hvm_get_segment_register(v, x86_seg_idtr, &seg);
446 ctxt.idtr_limit = seg.limit;
447 ctxt.idtr_base = seg.base;
449 hvm_get_segment_register(v, x86_seg_gdtr, &seg);
450 ctxt.gdtr_limit = seg.limit;
451 ctxt.gdtr_base = seg.base;
453 hvm_get_segment_register(v, x86_seg_cs, &seg);
454 ctxt.cs_sel = seg.sel;
455 ctxt.cs_limit = seg.limit;
456 ctxt.cs_base = seg.base;
457 ctxt.cs_arbytes = seg.attr.bytes;
459 hvm_get_segment_register(v, x86_seg_ds, &seg);
460 ctxt.ds_sel = seg.sel;
461 ctxt.ds_limit = seg.limit;
462 ctxt.ds_base = seg.base;
463 ctxt.ds_arbytes = seg.attr.bytes;
465 hvm_get_segment_register(v, x86_seg_es, &seg);
466 ctxt.es_sel = seg.sel;
467 ctxt.es_limit = seg.limit;
468 ctxt.es_base = seg.base;
469 ctxt.es_arbytes = seg.attr.bytes;
471 hvm_get_segment_register(v, x86_seg_ss, &seg);
472 ctxt.ss_sel = seg.sel;
473 ctxt.ss_limit = seg.limit;
474 ctxt.ss_base = seg.base;
475 ctxt.ss_arbytes = seg.attr.bytes;
477 hvm_get_segment_register(v, x86_seg_fs, &seg);
478 ctxt.fs_sel = seg.sel;
479 ctxt.fs_limit = seg.limit;
480 ctxt.fs_base = seg.base;
481 ctxt.fs_arbytes = seg.attr.bytes;
483 hvm_get_segment_register(v, x86_seg_gs, &seg);
484 ctxt.gs_sel = seg.sel;
485 ctxt.gs_limit = seg.limit;
486 ctxt.gs_base = seg.base;
487 ctxt.gs_arbytes = seg.attr.bytes;
489 hvm_get_segment_register(v, x86_seg_tr, &seg);
490 ctxt.tr_sel = seg.sel;
491 ctxt.tr_limit = seg.limit;
492 ctxt.tr_base = seg.base;
493 ctxt.tr_arbytes = seg.attr.bytes;
495 hvm_get_segment_register(v, x86_seg_ldtr, &seg);
496 ctxt.ldtr_sel = seg.sel;
497 ctxt.ldtr_limit = seg.limit;
498 ctxt.ldtr_base = seg.base;
499 ctxt.ldtr_arbytes = seg.attr.bytes;
501 vc = &v->arch.guest_context;
503 if ( v->fpu_initialised )
504 memcpy(ctxt.fpu_regs, &vc->fpu_ctxt, sizeof(ctxt.fpu_regs));
505 else
506 memset(ctxt.fpu_regs, 0, sizeof(ctxt.fpu_regs));
508 ctxt.rax = vc->user_regs.eax;
509 ctxt.rbx = vc->user_regs.ebx;
510 ctxt.rcx = vc->user_regs.ecx;
511 ctxt.rdx = vc->user_regs.edx;
512 ctxt.rbp = vc->user_regs.ebp;
513 ctxt.rsi = vc->user_regs.esi;
514 ctxt.rdi = vc->user_regs.edi;
515 ctxt.rsp = vc->user_regs.esp;
516 ctxt.rip = vc->user_regs.eip;
517 ctxt.rflags = vc->user_regs.eflags;
518 #ifdef __x86_64__
519 ctxt.r8 = vc->user_regs.r8;
520 ctxt.r9 = vc->user_regs.r9;
521 ctxt.r10 = vc->user_regs.r10;
522 ctxt.r11 = vc->user_regs.r11;
523 ctxt.r12 = vc->user_regs.r12;
524 ctxt.r13 = vc->user_regs.r13;
525 ctxt.r14 = vc->user_regs.r14;
526 ctxt.r15 = vc->user_regs.r15;
527 #endif
528 ctxt.dr0 = vc->debugreg[0];
529 ctxt.dr1 = vc->debugreg[1];
530 ctxt.dr2 = vc->debugreg[2];
531 ctxt.dr3 = vc->debugreg[3];
532 ctxt.dr6 = vc->debugreg[6];
533 ctxt.dr7 = vc->debugreg[7];
535 if ( hvm_save_entry(CPU, v->vcpu_id, h, &ctxt) != 0 )
536 return 1;
537 }
538 return 0;
539 }
541 static int hvm_load_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
542 {
543 int vcpuid, rc;
544 struct vcpu *v;
545 struct hvm_hw_cpu ctxt;
546 struct segment_register seg;
547 struct vcpu_guest_context *vc;
549 /* Which vcpu is this? */
550 vcpuid = hvm_load_instance(h);
551 if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL )
552 {
553 gdprintk(XENLOG_ERR, "HVM restore: domain has no vcpu %u\n", vcpuid);
554 return -EINVAL;
555 }
556 vc = &v->arch.guest_context;
558 /* Need to init this vcpu before loading its contents */
559 rc = 0;
560 domain_lock(d);
561 if ( !v->is_initialised )
562 rc = boot_vcpu(d, vcpuid, vc);
563 domain_unlock(d);
564 if ( rc != 0 )
565 return rc;
567 if ( hvm_load_entry(CPU, h, &ctxt) != 0 )
568 return -EINVAL;
570 /* Sanity check some control registers. */
571 if ( (ctxt.cr0 & HVM_CR0_GUEST_RESERVED_BITS) ||
572 !(ctxt.cr0 & X86_CR0_ET) ||
573 ((ctxt.cr0 & (X86_CR0_PE|X86_CR0_PG)) == X86_CR0_PG) )
574 {
575 gdprintk(XENLOG_ERR, "HVM restore: bad CR0 0x%"PRIx64"\n",
576 ctxt.cr0);
577 return -EINVAL;
578 }
580 if ( ctxt.cr4 & HVM_CR4_GUEST_RESERVED_BITS )
581 {
582 gdprintk(XENLOG_ERR, "HVM restore: bad CR4 0x%"PRIx64"\n",
583 ctxt.cr4);
584 return -EINVAL;
585 }
587 if ( (ctxt.msr_efer & ~(EFER_FFXSE | EFER_LME | EFER_LMA |
588 EFER_NX | EFER_SCE)) ||
589 ((sizeof(long) != 8) && (ctxt.msr_efer & EFER_LME)) ||
590 (!cpu_has_nx && (ctxt.msr_efer & EFER_NX)) ||
591 (!cpu_has_syscall && (ctxt.msr_efer & EFER_SCE)) ||
592 (!cpu_has_ffxsr && (ctxt.msr_efer & EFER_FFXSE)) ||
593 ((ctxt.msr_efer & (EFER_LME|EFER_LMA)) == EFER_LMA) )
594 {
595 gdprintk(XENLOG_ERR, "HVM restore: bad EFER 0x%"PRIx64"\n",
596 ctxt.msr_efer);
597 return -EINVAL;
598 }
600 /* Older Xen versions used to save the segment arbytes directly
601 * from the VMCS on Intel hosts. Detect this and rearrange them
602 * into the struct segment_register format. */
603 #define UNFOLD_ARBYTES(_r) \
604 if ( (_r & 0xf000) && !(_r & 0x0f00) ) \
605 _r = ((_r & 0xff) | ((_r >> 4) & 0xf00))
606 UNFOLD_ARBYTES(ctxt.cs_arbytes);
607 UNFOLD_ARBYTES(ctxt.ds_arbytes);
608 UNFOLD_ARBYTES(ctxt.es_arbytes);
609 UNFOLD_ARBYTES(ctxt.fs_arbytes);
610 UNFOLD_ARBYTES(ctxt.gs_arbytes);
611 UNFOLD_ARBYTES(ctxt.ss_arbytes);
612 UNFOLD_ARBYTES(ctxt.tr_arbytes);
613 UNFOLD_ARBYTES(ctxt.ldtr_arbytes);
614 #undef UNFOLD_ARBYTES
616 /* Architecture-specific vmcs/vmcb bits */
617 if ( hvm_funcs.load_cpu_ctxt(v, &ctxt) < 0 )
618 return -EINVAL;
620 seg.limit = ctxt.idtr_limit;
621 seg.base = ctxt.idtr_base;
622 hvm_set_segment_register(v, x86_seg_idtr, &seg);
624 seg.limit = ctxt.gdtr_limit;
625 seg.base = ctxt.gdtr_base;
626 hvm_set_segment_register(v, x86_seg_gdtr, &seg);
628 seg.sel = ctxt.cs_sel;
629 seg.limit = ctxt.cs_limit;
630 seg.base = ctxt.cs_base;
631 seg.attr.bytes = ctxt.cs_arbytes;
632 hvm_set_segment_register(v, x86_seg_cs, &seg);
634 seg.sel = ctxt.ds_sel;
635 seg.limit = ctxt.ds_limit;
636 seg.base = ctxt.ds_base;
637 seg.attr.bytes = ctxt.ds_arbytes;
638 hvm_set_segment_register(v, x86_seg_ds, &seg);
640 seg.sel = ctxt.es_sel;
641 seg.limit = ctxt.es_limit;
642 seg.base = ctxt.es_base;
643 seg.attr.bytes = ctxt.es_arbytes;
644 hvm_set_segment_register(v, x86_seg_es, &seg);
646 seg.sel = ctxt.ss_sel;
647 seg.limit = ctxt.ss_limit;
648 seg.base = ctxt.ss_base;
649 seg.attr.bytes = ctxt.ss_arbytes;
650 hvm_set_segment_register(v, x86_seg_ss, &seg);
652 seg.sel = ctxt.fs_sel;
653 seg.limit = ctxt.fs_limit;
654 seg.base = ctxt.fs_base;
655 seg.attr.bytes = ctxt.fs_arbytes;
656 hvm_set_segment_register(v, x86_seg_fs, &seg);
658 seg.sel = ctxt.gs_sel;
659 seg.limit = ctxt.gs_limit;
660 seg.base = ctxt.gs_base;
661 seg.attr.bytes = ctxt.gs_arbytes;
662 hvm_set_segment_register(v, x86_seg_gs, &seg);
664 seg.sel = ctxt.tr_sel;
665 seg.limit = ctxt.tr_limit;
666 seg.base = ctxt.tr_base;
667 seg.attr.bytes = ctxt.tr_arbytes;
668 hvm_set_segment_register(v, x86_seg_tr, &seg);
670 seg.sel = ctxt.ldtr_sel;
671 seg.limit = ctxt.ldtr_limit;
672 seg.base = ctxt.ldtr_base;
673 seg.attr.bytes = ctxt.ldtr_arbytes;
674 hvm_set_segment_register(v, x86_seg_ldtr, &seg);
676 memcpy(&vc->fpu_ctxt, ctxt.fpu_regs, sizeof(ctxt.fpu_regs));
678 vc->user_regs.eax = ctxt.rax;
679 vc->user_regs.ebx = ctxt.rbx;
680 vc->user_regs.ecx = ctxt.rcx;
681 vc->user_regs.edx = ctxt.rdx;
682 vc->user_regs.ebp = ctxt.rbp;
683 vc->user_regs.esi = ctxt.rsi;
684 vc->user_regs.edi = ctxt.rdi;
685 vc->user_regs.esp = ctxt.rsp;
686 vc->user_regs.eip = ctxt.rip;
687 vc->user_regs.eflags = ctxt.rflags | 2;
688 #ifdef __x86_64__
689 vc->user_regs.r8 = ctxt.r8;
690 vc->user_regs.r9 = ctxt.r9;
691 vc->user_regs.r10 = ctxt.r10;
692 vc->user_regs.r11 = ctxt.r11;
693 vc->user_regs.r12 = ctxt.r12;
694 vc->user_regs.r13 = ctxt.r13;
695 vc->user_regs.r14 = ctxt.r14;
696 vc->user_regs.r15 = ctxt.r15;
697 #endif
698 vc->debugreg[0] = ctxt.dr0;
699 vc->debugreg[1] = ctxt.dr1;
700 vc->debugreg[2] = ctxt.dr2;
701 vc->debugreg[3] = ctxt.dr3;
702 vc->debugreg[6] = ctxt.dr6;
703 vc->debugreg[7] = ctxt.dr7;
705 vc->flags = VGCF_online;
706 v->fpu_initialised = 1;
708 /* Auxiliary processors should be woken immediately. */
709 v->is_initialised = 1;
710 clear_bit(_VPF_down, &v->pause_flags);
711 vcpu_wake(v);
713 return 0;
714 }
716 HVM_REGISTER_SAVE_RESTORE(CPU, hvm_save_cpu_ctxt, hvm_load_cpu_ctxt,
717 1, HVMSR_PER_VCPU);
719 int hvm_vcpu_initialise(struct vcpu *v)
720 {
721 int rc;
723 if ( (rc = vlapic_init(v)) != 0 )
724 goto fail1;
726 if ( (rc = hvm_funcs.vcpu_initialise(v)) != 0 )
727 goto fail2;
729 /* Create ioreq event channel. */
730 rc = alloc_unbound_xen_event_channel(v, 0);
731 if ( rc < 0 )
732 goto fail3;
734 /* Register ioreq event channel. */
735 v->arch.hvm_vcpu.xen_port = rc;
736 spin_lock(&v->domain->arch.hvm_domain.ioreq.lock);
737 if ( v->domain->arch.hvm_domain.ioreq.va != NULL )
738 get_ioreq(v)->vp_eport = v->arch.hvm_vcpu.xen_port;
739 spin_unlock(&v->domain->arch.hvm_domain.ioreq.lock);
741 spin_lock_init(&v->arch.hvm_vcpu.tm_lock);
742 INIT_LIST_HEAD(&v->arch.hvm_vcpu.tm_list);
744 rc = hvm_vcpu_cacheattr_init(v);
745 if ( rc != 0 )
746 goto fail3;
748 tasklet_init(&v->arch.hvm_vcpu.assert_evtchn_irq_tasklet,
749 (void(*)(unsigned long))hvm_assert_evtchn_irq,
750 (unsigned long)v);
752 v->arch.guest_context.user_regs.eflags = 2;
754 if ( v->vcpu_id == 0 )
755 {
756 /* NB. All these really belong in hvm_domain_initialise(). */
757 pit_init(v, cpu_khz);
758 pmtimer_init(v);
759 hpet_init(v);
761 /* Init guest TSC to start from zero. */
762 hvm_set_guest_tsc(v, 0);
764 /* Can start up without SIPI-SIPI or setvcpucontext domctl. */
765 v->is_initialised = 1;
766 clear_bit(_VPF_down, &v->pause_flags);
767 }
769 return 0;
771 fail3:
772 hvm_funcs.vcpu_destroy(v);
773 fail2:
774 vlapic_destroy(v);
775 fail1:
776 return rc;
777 }
779 void hvm_vcpu_destroy(struct vcpu *v)
780 {
781 tasklet_kill(&v->arch.hvm_vcpu.assert_evtchn_irq_tasklet);
782 hvm_vcpu_cacheattr_destroy(v);
783 vlapic_destroy(v);
784 hvm_funcs.vcpu_destroy(v);
786 /* Event channel is already freed by evtchn_destroy(). */
787 /*free_xen_event_channel(v, v->arch.hvm_vcpu.xen_port);*/
788 }
790 void hvm_vcpu_down(struct vcpu *v)
791 {
792 struct domain *d = v->domain;
793 int online_count = 0;
795 /* Doesn't halt us immediately, but we'll never return to guest context. */
796 set_bit(_VPF_down, &v->pause_flags);
797 vcpu_sleep_nosync(v);
799 /* Any other VCPUs online? ... */
800 domain_lock(d);
801 for_each_vcpu ( d, v )
802 if ( !test_bit(_VPF_down, &v->pause_flags) )
803 online_count++;
804 domain_unlock(d);
806 /* ... Shut down the domain if not. */
807 if ( online_count == 0 )
808 {
809 gdprintk(XENLOG_INFO, "All CPUs offline -- powering off.\n");
810 domain_shutdown(d, SHUTDOWN_poweroff);
811 }
812 }
814 void hvm_send_assist_req(struct vcpu *v)
815 {
816 ioreq_t *p;
818 if ( unlikely(!vcpu_start_shutdown_deferral(v)) )
819 return; /* implicitly bins the i/o operation */
821 p = &get_ioreq(v)->vp_ioreq;
822 if ( unlikely(p->state != STATE_IOREQ_NONE) )
823 {
824 /* This indicates a bug in the device model. Crash the domain. */
825 gdprintk(XENLOG_ERR, "Device model set bad IO state %d.\n", p->state);
826 domain_crash(v->domain);
827 return;
828 }
830 prepare_wait_on_xen_event_channel(v->arch.hvm_vcpu.xen_port);
832 /*
833 * Following happens /after/ blocking and setting up ioreq contents.
834 * prepare_wait_on_xen_event_channel() is an implicit barrier.
835 */
836 p->state = STATE_IOREQ_READY;
837 notify_via_xen_event_channel(v->arch.hvm_vcpu.xen_port);
838 }
840 void hvm_hlt(unsigned long rflags)
841 {
842 struct vcpu *curr = current;
844 if ( hvm_event_pending(curr) )
845 return;
847 /*
848 * If we halt with interrupts disabled, that's a pretty sure sign that we
849 * want to shut down. In a real processor, NMIs are the only way to break
850 * out of this.
851 */
852 if ( unlikely(!(rflags & X86_EFLAGS_IF)) )
853 return hvm_vcpu_down(curr);
855 do_sched_op_compat(SCHEDOP_block, 0);
857 HVMTRACE_1D(HLT, /* pending = */ vcpu_runnable(curr));
858 }
860 void hvm_triple_fault(void)
861 {
862 struct vcpu *v = current;
863 gdprintk(XENLOG_INFO, "Triple fault on VCPU%d - "
864 "invoking HVM system reset.\n", v->vcpu_id);
865 domain_shutdown(v->domain, SHUTDOWN_reboot);
866 }
868 int hvm_set_efer(uint64_t value)
869 {
870 struct vcpu *v = current;
872 value &= ~EFER_LMA;
874 if ( (value & ~(EFER_FFXSE | EFER_LME | EFER_NX | EFER_SCE)) ||
875 ((sizeof(long) != 8) && (value & EFER_LME)) ||
876 (!cpu_has_nx && (value & EFER_NX)) ||
877 (!cpu_has_syscall && (value & EFER_SCE)) ||
878 (!cpu_has_ffxsr && (value & EFER_FFXSE)) )
879 {
880 gdprintk(XENLOG_WARNING, "Trying to set reserved bit in "
881 "EFER: %"PRIx64"\n", value);
882 hvm_inject_exception(TRAP_gp_fault, 0, 0);
883 return X86EMUL_EXCEPTION;
884 }
886 if ( ((value ^ v->arch.hvm_vcpu.guest_efer) & EFER_LME) &&
887 hvm_paging_enabled(v) )
888 {
889 gdprintk(XENLOG_WARNING,
890 "Trying to change EFER.LME with paging enabled\n");
891 hvm_inject_exception(TRAP_gp_fault, 0, 0);
892 return X86EMUL_EXCEPTION;
893 }
895 value |= v->arch.hvm_vcpu.guest_efer & EFER_LMA;
896 v->arch.hvm_vcpu.guest_efer = value;
897 hvm_update_guest_efer(v);
899 return X86EMUL_OKAY;
900 }
902 extern void shadow_blow_tables_per_domain(struct domain *d);
903 extern bool_t mtrr_pat_not_equal(struct vcpu *vd, struct vcpu *vs);
905 /* Exit UC mode only if all VCPUs agree on MTRR/PAT and are not in no_fill. */
906 static bool_t domain_exit_uc_mode(struct vcpu *v)
907 {
908 struct domain *d = v->domain;
909 struct vcpu *vs;
911 for_each_vcpu ( d, vs )
912 {
913 if ( (vs == v) || !vs->is_initialised )
914 continue;
915 if ( (vs->arch.hvm_vcpu.cache_mode == NO_FILL_CACHE_MODE) ||
916 mtrr_pat_not_equal(vs, v) )
917 return 0;
918 }
920 return 1;
921 }
923 static void local_flush_cache(void *info)
924 {
925 wbinvd();
926 }
928 static void hvm_set_uc_mode(struct vcpu *v, bool_t is_in_uc_mode)
929 {
930 v->domain->arch.hvm_domain.is_in_uc_mode = is_in_uc_mode;
931 shadow_blow_tables_per_domain(v->domain);
932 if ( hvm_funcs.set_uc_mode )
933 return hvm_funcs.set_uc_mode(v);
934 }
936 int hvm_set_cr0(unsigned long value)
937 {
938 struct vcpu *v = current;
939 p2m_type_t p2mt;
940 unsigned long gfn, mfn, old_value = v->arch.hvm_vcpu.guest_cr[0];
942 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx", value);
944 if ( (u32)value != value )
945 {
946 HVM_DBG_LOG(DBG_LEVEL_1,
947 "Guest attempts to set upper 32 bits in CR0: %lx",
948 value);
949 goto gpf;
950 }
952 value &= ~HVM_CR0_GUEST_RESERVED_BITS;
955 /* ET is reserved and should always be 1. */
955 value |= X86_CR0_ET;
957 if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PG )
958 goto gpf;
960 if ( (value & X86_CR0_PG) && !(old_value & X86_CR0_PG) )
961 {
962 if ( v->arch.hvm_vcpu.guest_efer & EFER_LME )
963 {
964 if ( !(v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PAE) )
965 {
966 HVM_DBG_LOG(DBG_LEVEL_1, "Enable paging before PAE enable");
967 goto gpf;
968 }
969 HVM_DBG_LOG(DBG_LEVEL_1, "Enabling long mode");
970 v->arch.hvm_vcpu.guest_efer |= EFER_LMA;
971 hvm_update_guest_efer(v);
972 }
974 if ( !paging_mode_hap(v->domain) )
975 {
976 /* The guest CR3 must be pointing to the guest physical. */
977 gfn = v->arch.hvm_vcpu.guest_cr[3]>>PAGE_SHIFT;
978 mfn = mfn_x(gfn_to_mfn_current(gfn, &p2mt));
979 if ( !p2m_is_ram(p2mt) || !mfn_valid(mfn) ||
980 !get_page(mfn_to_page(mfn), v->domain))
981 {
982 gdprintk(XENLOG_ERR, "Invalid CR3 value = %lx (mfn=%lx)\n",
983 v->arch.hvm_vcpu.guest_cr[3], mfn);
984 domain_crash(v->domain);
985 return X86EMUL_UNHANDLEABLE;
986 }
988 /* Now arch.guest_table points to machine physical. */
989 v->arch.guest_table = pagetable_from_pfn(mfn);
991 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
992 v->arch.hvm_vcpu.guest_cr[3], mfn);
993 }
994 }
995 else if ( !(value & X86_CR0_PG) && (old_value & X86_CR0_PG) )
996 {
997 /* When CR0.PG is cleared, LMA is cleared immediately. */
998 if ( hvm_long_mode_enabled(v) )
999 {
1000 v->arch.hvm_vcpu.guest_efer &= ~EFER_LMA;
1001 hvm_update_guest_efer(v);
1004 if ( !paging_mode_hap(v->domain) )
1006 put_page(pagetable_get_page(v->arch.guest_table));
1007 v->arch.guest_table = pagetable_null();
1011 if ( has_arch_pdevs(v->domain) )
1013 if ( (value & X86_CR0_CD) && !(value & X86_CR0_NW) )
1015 /* Entering no fill cache mode. */
1016 spin_lock(&v->domain->arch.hvm_domain.uc_lock);
1017 v->arch.hvm_vcpu.cache_mode = NO_FILL_CACHE_MODE;
1019 if ( !v->domain->arch.hvm_domain.is_in_uc_mode )
1021 /* Flush physical caches. */
1022 on_each_cpu(local_flush_cache, NULL, 1);
1023 hvm_set_uc_mode(v, 1);
1025 spin_unlock(&v->domain->arch.hvm_domain.uc_lock);
1027 else if ( !(value & (X86_CR0_CD | X86_CR0_NW)) &&
1028 (v->arch.hvm_vcpu.cache_mode == NO_FILL_CACHE_MODE) )
1030 /* Exit from no fill cache mode. */
1031 spin_lock(&v->domain->arch.hvm_domain.uc_lock);
1032 v->arch.hvm_vcpu.cache_mode = NORMAL_CACHE_MODE;
1034 if ( domain_exit_uc_mode(v) )
1035 hvm_set_uc_mode(v, 0);
1037 spin_unlock(&v->domain->arch.hvm_domain.uc_lock);
1041 v->arch.hvm_vcpu.guest_cr[0] = value;
1042 hvm_update_guest_cr(v, 0);
1044 if ( (value ^ old_value) & X86_CR0_PG )
1045 paging_update_paging_modes(v);
1047 return X86EMUL_OKAY;
1049 gpf:
1050 hvm_inject_exception(TRAP_gp_fault, 0, 0);
1051 return X86EMUL_EXCEPTION;
1054 int hvm_set_cr3(unsigned long value)
1056 unsigned long mfn;
1057 p2m_type_t p2mt;
1058 struct vcpu *v = current;
1060 if ( hvm_paging_enabled(v) && !paging_mode_hap(v->domain) &&
1061 (value != v->arch.hvm_vcpu.guest_cr[3]) )
1063 /* Shadow-mode CR3 change. Check PDBR and update refcounts. */
1064 HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
1065 mfn = mfn_x(gfn_to_mfn_current(value >> PAGE_SHIFT, &p2mt));
1066 if ( !p2m_is_ram(p2mt) || !mfn_valid(mfn) ||
1067 !get_page(mfn_to_page(mfn), v->domain) )
1068 goto bad_cr3;
1070 put_page(pagetable_get_page(v->arch.guest_table));
1071 v->arch.guest_table = pagetable_from_pfn(mfn);
1073 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
1076 v->arch.hvm_vcpu.guest_cr[3] = value;
1077 paging_update_cr3(v);
1078 return X86EMUL_OKAY;
1080 bad_cr3:
1081 gdprintk(XENLOG_ERR, "Invalid CR3\n");
1082 domain_crash(v->domain);
1083 return X86EMUL_UNHANDLEABLE;
1086 int hvm_set_cr4(unsigned long value)
1088 struct vcpu *v = current;
1089 unsigned long old_cr;
1091 if ( value & HVM_CR4_GUEST_RESERVED_BITS )
1093 HVM_DBG_LOG(DBG_LEVEL_1,
1094 "Guest attempts to set reserved bit in CR4: %lx",
1095 value);
1096 goto gpf;
1099 if ( !(value & X86_CR4_PAE) && hvm_long_mode_enabled(v) )
1101 HVM_DBG_LOG(DBG_LEVEL_1, "Guest cleared CR4.PAE while "
1102 "EFER.LMA is set");
1103 goto gpf;
1106 old_cr = v->arch.hvm_vcpu.guest_cr[4];
1107 v->arch.hvm_vcpu.guest_cr[4] = value;
1108 hvm_update_guest_cr(v, 4);
1110 /* Modifying CR4.{PSE,PAE,PGE} invalidates all TLB entries, inc. Global. */
1111 if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
1112 paging_update_paging_modes(v);
1114 return X86EMUL_OKAY;
1116 gpf:
1117 hvm_inject_exception(TRAP_gp_fault, 0, 0);
1118 return X86EMUL_EXCEPTION;
1121 int hvm_virtual_to_linear_addr(
1122 enum x86_segment seg,
1123 struct segment_register *reg,
1124 unsigned long offset,
1125 unsigned int bytes,
1126 enum hvm_access_type access_type,
1127 unsigned int addr_size,
1128 unsigned long *linear_addr)
1130 unsigned long addr = offset;
1131 uint32_t last_byte;
1133 if ( !(current->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) )
1135 /*
1136 * REAL MODE: Don't bother with segment access checks.
1137 * Certain of them are not done in native real mode anyway.
1138 */
1139 addr = (uint32_t)(addr + reg->base);
1141 else if ( addr_size != 64 )
1143 /*
1144 * COMPATIBILITY MODE: Apply segment checks and add base.
1145 */
1147 switch ( access_type )
1149 case hvm_access_read:
1150 if ( (reg->attr.fields.type & 0xa) == 0x8 )
1151 goto gpf; /* execute-only code segment */
1152 break;
1153 case hvm_access_write:
1154 if ( (reg->attr.fields.type & 0xa) != 0x2 )
1155 goto gpf; /* not a writable data segment */
1156 break;
1157 default:
1158 break;
1161 last_byte = offset + bytes - 1;
1163 /* Is this a grows-down data segment? Special limit check if so. */
1164 if ( (reg->attr.fields.type & 0xc) == 0x4 )
1166 /* Is upper limit 0xFFFF or 0xFFFFFFFF? */
1167 if ( !reg->attr.fields.db )
1168 last_byte = (uint16_t)last_byte;
1170 /* Check first byte and last byte against respective bounds. */
1171 if ( (offset <= reg->limit) || (last_byte < offset) )
1172 goto gpf;
1174 else if ( (last_byte > reg->limit) || (last_byte < offset) )
1175 goto gpf; /* last byte is beyond limit or wraps 0xFFFFFFFF */
1177 /*
1178 * Hardware truncates to 32 bits in compatibility mode.
1179 * It does not truncate to 16 bits in 16-bit address-size mode.
1180 */
1181 addr = (uint32_t)(addr + reg->base);
1183 else
1185 /*
1186 * LONG MODE: FS and GS add segment base. Addresses must be canonical.
1187 */
1189 if ( (seg == x86_seg_fs) || (seg == x86_seg_gs) )
1190 addr += reg->base;
1192 if ( !is_canonical_address(addr) )
1193 goto gpf;
1196 *linear_addr = addr;
1197 return 1;
1199 gpf:
1200 return 0;
1203 static void *hvm_map_entry(unsigned long va)
1205 unsigned long gfn, mfn;
1206 p2m_type_t p2mt;
1207 uint32_t pfec;
1209 if ( ((va & ~PAGE_MASK) + 8) > PAGE_SIZE )
1211 gdprintk(XENLOG_ERR, "Descriptor table entry "
1212 "straddles page boundary\n");
1213 domain_crash(current->domain);
1214 return NULL;
1217 /* We're mapping on behalf of the segment-load logic, which might
1218 * write the accessed flags in the descriptors (in 32-bit mode), but
1219 * we still treat it as a kernel-mode read (i.e. no access checks). */
1220 pfec = PFEC_page_present;
1221 gfn = paging_gva_to_gfn(current, va, &pfec);
1222 mfn = mfn_x(gfn_to_mfn_current(gfn, &p2mt));
1223 if ( !p2m_is_ram(p2mt) )
1225 gdprintk(XENLOG_ERR, "Failed to look up descriptor table entry\n");
1226 domain_crash(current->domain);
1227 return NULL;
1230 ASSERT(mfn_valid(mfn));
1232 paging_mark_dirty(current->domain, mfn);
1234 return (char *)map_domain_page(mfn) + (va & ~PAGE_MASK);
1237 static void hvm_unmap_entry(void *p)
1239 if ( p )
1240 unmap_domain_page(p);
1243 static int hvm_load_segment_selector(
1244 enum x86_segment seg, uint16_t sel)
1246 struct segment_register desctab, cs, segr;
1247 struct desc_struct *pdesc, desc;
1248 u8 dpl, rpl, cpl;
1249 int fault_type = TRAP_invalid_tss;
1250 struct cpu_user_regs *regs = guest_cpu_user_regs();
1251 struct vcpu *v = current;
1253 if ( regs->eflags & EF_VM )
1255 segr.sel = sel;
1256 segr.base = (uint32_t)sel << 4;
1257 segr.limit = 0xffffu;
1258 segr.attr.bytes = 0xf3;
1259 hvm_set_segment_register(v, seg, &segr);
1260 return 0;
1263 /* NULL selector? */
1264 if ( (sel & 0xfffc) == 0 )
1266 if ( (seg == x86_seg_cs) || (seg == x86_seg_ss) )
1267 goto fail;
1268 memset(&segr, 0, sizeof(segr));
1269 hvm_set_segment_register(v, seg, &segr);
1270 return 0;
1273 /* LDT descriptor must be in the GDT. */
1274 if ( (seg == x86_seg_ldtr) && (sel & 4) )
1275 goto fail;
1277 hvm_get_segment_register(v, x86_seg_cs, &cs);
1278 hvm_get_segment_register(
1279 v, (sel & 4) ? x86_seg_ldtr : x86_seg_gdtr, &desctab);
1281 /* Check against descriptor table limit. */
1282 if ( ((sel & 0xfff8) + 7) > desctab.limit )
1283 goto fail;
1285 pdesc = hvm_map_entry(desctab.base + (sel & 0xfff8));
1286 if ( pdesc == NULL )
1287 goto hvm_map_fail;
1289 do {
1290 desc = *pdesc;
1292 /* Segment present in memory? */
1293 if ( !(desc.b & (1u<<15)) )
1295 fault_type = TRAP_no_segment;
1296 goto unmap_and_fail;
1299 /* LDT descriptor is a system segment. All others are code/data. */
1300 if ( (desc.b & (1u<<12)) == ((seg == x86_seg_ldtr) << 12) )
1301 goto unmap_and_fail;
1303 dpl = (desc.b >> 13) & 3;
1304 rpl = sel & 3;
1305 cpl = cs.sel & 3;
1307 switch ( seg )
1309 case x86_seg_cs:
1310 /* Code segment? */
1311 if ( !(desc.b & (1u<<11)) )
1312 goto unmap_and_fail;
1313 /* Non-conforming segment: check DPL against RPL. */
1314 if ( ((desc.b & (6u<<9)) != 6) && (dpl != rpl) )
1315 goto unmap_and_fail;
1316 break;
1317 case x86_seg_ss:
1318 /* Writable data segment? */
1319 if ( (desc.b & (5u<<9)) != (1u<<9) )
1320 goto unmap_and_fail;
1321 if ( (dpl != cpl) || (dpl != rpl) )
1322 goto unmap_and_fail;
1323 break;
1324 case x86_seg_ldtr:
1325 /* LDT system segment? */
1326 if ( (desc.b & (15u<<8)) != (2u<<8) )
1327 goto unmap_and_fail;
1328 goto skip_accessed_flag;
1329 default:
1330 /* Readable code or data segment? */
1331 if ( (desc.b & (5u<<9)) == (4u<<9) )
1332 goto unmap_and_fail;
1333 /* Non-conforming segment: check DPL against RPL and CPL. */
1334 if ( ((desc.b & (6u<<9)) != 6) && ((dpl < cpl) || (dpl < rpl)) )
1335 goto unmap_and_fail;
1336 break;
1338 } while ( !(desc.b & 0x100) && /* Ensure Accessed flag is set */
1339 (cmpxchg(&pdesc->b, desc.b, desc.b | 0x100) != desc.b) );
1341 /* Force the Accessed flag in our local copy. */
1342 desc.b |= 0x100;
1344 skip_accessed_flag:
1345 hvm_unmap_entry(pdesc);
1347 segr.base = (((desc.b << 0) & 0xff000000u) |
1348 ((desc.b << 16) & 0x00ff0000u) |
1349 ((desc.a >> 16) & 0x0000ffffu));
1350 segr.attr.bytes = (((desc.b >> 8) & 0x00ffu) |
1351 ((desc.b >> 12) & 0x0f00u));
1352 segr.limit = (desc.b & 0x000f0000u) | (desc.a & 0x0000ffffu);
1353 if ( segr.attr.fields.g )
1354 segr.limit = (segr.limit << 12) | 0xfffu;
1355 segr.sel = sel;
1356 hvm_set_segment_register(v, seg, &segr);
1358 return 0;
1360 unmap_and_fail:
1361 hvm_unmap_entry(pdesc);
1362 fail:
1363 hvm_inject_exception(fault_type, sel & 0xfffc, 0);
1364 hvm_map_fail:
1365 return 1;
1368 void hvm_task_switch(
1369 uint16_t tss_sel, enum hvm_task_switch_reason taskswitch_reason,
1370 int32_t errcode)
1372 struct vcpu *v = current;
1373 struct cpu_user_regs *regs = guest_cpu_user_regs();
1374 struct segment_register gdt, tr, prev_tr, segr;
1375 struct desc_struct *optss_desc = NULL, *nptss_desc = NULL, tss_desc;
1376 unsigned long eflags;
1377 int exn_raised, rc;
1378 struct {
1379 u16 back_link,__blh;
1380 u32 esp0;
1381 u16 ss0, _0;
1382 u32 esp1;
1383 u16 ss1, _1;
1384 u32 esp2;
1385 u16 ss2, _2;
1386 u32 cr3, eip, eflags, eax, ecx, edx, ebx, esp, ebp, esi, edi;
1387 u16 es, _3, cs, _4, ss, _5, ds, _6, fs, _7, gs, _8, ldt, _9;
1388 u16 trace, iomap;
1389 } tss = { 0 };
1391 hvm_get_segment_register(v, x86_seg_gdtr, &gdt);
1392 hvm_get_segment_register(v, x86_seg_tr, &prev_tr);
1394 if ( ((tss_sel & 0xfff8) + 7) > gdt.limit )
1396 hvm_inject_exception((taskswitch_reason == TSW_iret) ?
1397 TRAP_invalid_tss : TRAP_gp_fault,
1398 tss_sel & 0xfff8, 0);
1399 goto out;
1402 optss_desc = hvm_map_entry(gdt.base + (prev_tr.sel & 0xfff8));
1403 if ( optss_desc == NULL )
1404 goto out;
1406 nptss_desc = hvm_map_entry(gdt.base + (tss_sel & 0xfff8));
1407 if ( nptss_desc == NULL )
1408 goto out;
1410 tss_desc = *nptss_desc;
1411 tr.sel = tss_sel;
1412 tr.base = (((tss_desc.b << 0) & 0xff000000u) |
1413 ((tss_desc.b << 16) & 0x00ff0000u) |
1414 ((tss_desc.a >> 16) & 0x0000ffffu));
1415 tr.attr.bytes = (((tss_desc.b >> 8) & 0x00ffu) |
1416 ((tss_desc.b >> 12) & 0x0f00u));
1417 tr.limit = (tss_desc.b & 0x000f0000u) | (tss_desc.a & 0x0000ffffu);
1418 if ( tr.attr.fields.g )
1419 tr.limit = (tr.limit << 12) | 0xfffu;
1421 if ( !tr.attr.fields.p )
1423 hvm_inject_exception(TRAP_no_segment, tss_sel & 0xfff8, 0);
1424 goto out;
1427 if ( tr.attr.fields.type != ((taskswitch_reason == TSW_iret) ? 0xb : 0x9) )
1429 hvm_inject_exception(
1430 (taskswitch_reason == TSW_iret) ? TRAP_invalid_tss : TRAP_gp_fault,
1431 tss_sel & 0xfff8, 0);
1432 goto out;
1435 if ( tr.limit < (sizeof(tss)-1) )
1437 hvm_inject_exception(TRAP_invalid_tss, tss_sel & 0xfff8, 0);
1438 goto out;
1441 rc = hvm_copy_from_guest_virt(
1442 &tss, prev_tr.base, sizeof(tss), PFEC_page_present);
1443 if ( rc == HVMCOPY_bad_gva_to_gfn )
1444 goto out;
1446 eflags = regs->eflags;
1447 if ( taskswitch_reason == TSW_iret )
1448 eflags &= ~X86_EFLAGS_NT;
1450 tss.cr3 = v->arch.hvm_vcpu.guest_cr[3];
1451 tss.eip = regs->eip;
1452 tss.eflags = eflags;
1453 tss.eax = regs->eax;
1454 tss.ecx = regs->ecx;
1455 tss.edx = regs->edx;
1456 tss.ebx = regs->ebx;
1457 tss.esp = regs->esp;
1458 tss.ebp = regs->ebp;
1459 tss.esi = regs->esi;
1460 tss.edi = regs->edi;
1462 hvm_get_segment_register(v, x86_seg_es, &segr);
1463 tss.es = segr.sel;
1464 hvm_get_segment_register(v, x86_seg_cs, &segr);
1465 tss.cs = segr.sel;
1466 hvm_get_segment_register(v, x86_seg_ss, &segr);
1467 tss.ss = segr.sel;
1468 hvm_get_segment_register(v, x86_seg_ds, &segr);
1469 tss.ds = segr.sel;
1470 hvm_get_segment_register(v, x86_seg_fs, &segr);
1471 tss.fs = segr.sel;
1472 hvm_get_segment_register(v, x86_seg_gs, &segr);
1473 tss.gs = segr.sel;
1474 hvm_get_segment_register(v, x86_seg_ldtr, &segr);
1475 tss.ldt = segr.sel;
1477 rc = hvm_copy_to_guest_virt(
1478 prev_tr.base, &tss, sizeof(tss), PFEC_page_present);
1479 if ( rc == HVMCOPY_bad_gva_to_gfn )
1480 goto out;
1482 rc = hvm_copy_from_guest_virt(
1483 &tss, tr.base, sizeof(tss), PFEC_page_present);
1484 if ( rc == HVMCOPY_bad_gva_to_gfn )
1485 goto out;
1487 if ( hvm_set_cr3(tss.cr3) )
1488 goto out;
1490 regs->eip = tss.eip;
1491 regs->eflags = tss.eflags | 2;
1492 regs->eax = tss.eax;
1493 regs->ecx = tss.ecx;
1494 regs->edx = tss.edx;
1495 regs->ebx = tss.ebx;
1496 regs->esp = tss.esp;
1497 regs->ebp = tss.ebp;
1498 regs->esi = tss.esi;
1499 regs->edi = tss.edi;
1501 if ( (taskswitch_reason == TSW_call_or_int) )
1503 regs->eflags |= X86_EFLAGS_NT;
1504 tss.back_link = prev_tr.sel;
1507 exn_raised = 0;
1508 if ( hvm_load_segment_selector(x86_seg_ldtr, tss.ldt) ||
1509 hvm_load_segment_selector(x86_seg_es, tss.es) ||
1510 hvm_load_segment_selector(x86_seg_cs, tss.cs) ||
1511 hvm_load_segment_selector(x86_seg_ss, tss.ss) ||
1512 hvm_load_segment_selector(x86_seg_ds, tss.ds) ||
1513 hvm_load_segment_selector(x86_seg_fs, tss.fs) ||
1514 hvm_load_segment_selector(x86_seg_gs, tss.gs) )
1515 exn_raised = 1;
1517 rc = hvm_copy_to_guest_virt(
1518 tr.base, &tss, sizeof(tss), PFEC_page_present);
1519 if ( rc == HVMCOPY_bad_gva_to_gfn )
1520 exn_raised = 1;
1522 if ( (tss.trace & 1) && !exn_raised )
1523 hvm_inject_exception(TRAP_debug, tss_sel & 0xfff8, 0);
1525 tr.attr.fields.type = 0xb; /* busy 32-bit tss */
1526 hvm_set_segment_register(v, x86_seg_tr, &tr);
1528 v->arch.hvm_vcpu.guest_cr[0] |= X86_CR0_TS;
1529 hvm_update_guest_cr(v, 0);
1531 if ( (taskswitch_reason == TSW_iret) ||
1532 (taskswitch_reason == TSW_jmp) )
1533 clear_bit(41, optss_desc); /* clear B flag of old task */
1535 if ( taskswitch_reason != TSW_iret )
1536 set_bit(41, nptss_desc); /* set B flag of new task */
1538 if ( errcode >= 0 )
1540 struct segment_register reg;
1541 unsigned long linear_addr;
1542 regs->esp -= 4;
1543 hvm_get_segment_register(current, x86_seg_ss, &reg);
1544 /* Todo: do not ignore access faults here. */
1545 if ( hvm_virtual_to_linear_addr(x86_seg_ss, &reg, regs->esp,
1546 4, hvm_access_write, 32,
1547 &linear_addr) )
1548 hvm_copy_to_guest_virt_nofault(linear_addr, &errcode, 4, 0);
1551 out:
1552 hvm_unmap_entry(optss_desc);
1553 hvm_unmap_entry(nptss_desc);
1556 #define HVMCOPY_from_guest (0u<<0)
1557 #define HVMCOPY_to_guest (1u<<0)
1558 #define HVMCOPY_no_fault (0u<<1)
1559 #define HVMCOPY_fault (1u<<1)
1560 #define HVMCOPY_phys (0u<<2)
1561 #define HVMCOPY_virt (1u<<2)
1562 static enum hvm_copy_result __hvm_copy(
1563 void *buf, paddr_t addr, int size, unsigned int flags, uint32_t pfec)
1565 struct vcpu *curr = current;
1566 unsigned long gfn, mfn;
1567 p2m_type_t p2mt;
1568 char *p;
1569 int count, todo = size;
1571 while ( todo > 0 )
1573 count = min_t(int, PAGE_SIZE - (addr & ~PAGE_MASK), todo);
1575 if ( flags & HVMCOPY_virt )
1577 gfn = paging_gva_to_gfn(curr, addr, &pfec);
1578 if ( gfn == INVALID_GFN )
1580 if ( flags & HVMCOPY_fault )
1581 hvm_inject_exception(TRAP_page_fault, pfec, addr);
1582 return HVMCOPY_bad_gva_to_gfn;
1585 else
1587 gfn = addr >> PAGE_SHIFT;
1590 mfn = mfn_x(gfn_to_mfn_current(gfn, &p2mt));
1592 if ( !p2m_is_ram(p2mt) )
1593 return HVMCOPY_bad_gfn_to_mfn;
1594 ASSERT(mfn_valid(mfn));
1596 p = (char *)map_domain_page(mfn) + (addr & ~PAGE_MASK);
1598 if ( flags & HVMCOPY_to_guest )
1600 if ( p2mt == p2m_ram_ro )
1602 static unsigned long lastpage;
1603 if ( xchg(&lastpage, gfn) != gfn )
1604 gdprintk(XENLOG_DEBUG, "guest attempted write to read-only"
1605 " memory page. gfn=%#lx, mfn=%#lx\n",
1606 gfn, mfn);
1608 else
1610 memcpy(p, buf, count);
1611 paging_mark_dirty(curr->domain, mfn);
1614 else
1616 memcpy(buf, p, count);
1619 unmap_domain_page(p);
1621 addr += count;
1622 buf += count;
1623 todo -= count;
1626 return HVMCOPY_okay;
1629 enum hvm_copy_result hvm_copy_to_guest_phys(
1630 paddr_t paddr, void *buf, int size)
1632 return __hvm_copy(buf, paddr, size,
1633 HVMCOPY_to_guest | HVMCOPY_fault | HVMCOPY_phys,
1634 0);
1637 enum hvm_copy_result hvm_copy_from_guest_phys(
1638 void *buf, paddr_t paddr, int size)
1640 return __hvm_copy(buf, paddr, size,
1641 HVMCOPY_from_guest | HVMCOPY_fault | HVMCOPY_phys,
1642 0);
1645 enum hvm_copy_result hvm_copy_to_guest_virt(
1646 unsigned long vaddr, void *buf, int size, uint32_t pfec)
1648 return __hvm_copy(buf, vaddr, size,
1649 HVMCOPY_to_guest | HVMCOPY_fault | HVMCOPY_virt,
1650 PFEC_page_present | PFEC_write_access | pfec);
1653 enum hvm_copy_result hvm_copy_from_guest_virt(
1654 void *buf, unsigned long vaddr, int size, uint32_t pfec)
1656 return __hvm_copy(buf, vaddr, size,
1657 HVMCOPY_from_guest | HVMCOPY_fault | HVMCOPY_virt,
1658 PFEC_page_present | pfec);
1661 enum hvm_copy_result hvm_fetch_from_guest_virt(
1662 void *buf, unsigned long vaddr, int size, uint32_t pfec)
1664 if ( hvm_nx_enabled(current) )
1665 pfec |= PFEC_insn_fetch;
1666 return __hvm_copy(buf, vaddr, size,
1667 HVMCOPY_from_guest | HVMCOPY_fault | HVMCOPY_virt,
1668 PFEC_page_present | pfec);
1671 enum hvm_copy_result hvm_copy_to_guest_virt_nofault(
1672 unsigned long vaddr, void *buf, int size, uint32_t pfec)
1674 return __hvm_copy(buf, vaddr, size,
1675 HVMCOPY_to_guest | HVMCOPY_no_fault | HVMCOPY_virt,
1676 PFEC_page_present | PFEC_write_access | pfec);
1679 enum hvm_copy_result hvm_copy_from_guest_virt_nofault(
1680 void *buf, unsigned long vaddr, int size, uint32_t pfec)
1682 return __hvm_copy(buf, vaddr, size,
1683 HVMCOPY_from_guest | HVMCOPY_no_fault | HVMCOPY_virt,
1684 PFEC_page_present | pfec);
1687 enum hvm_copy_result hvm_fetch_from_guest_virt_nofault(
1688 void *buf, unsigned long vaddr, int size, uint32_t pfec)
1690 if ( hvm_nx_enabled(current) )
1691 pfec |= PFEC_insn_fetch;
1692 return __hvm_copy(buf, vaddr, size,
1693 HVMCOPY_from_guest | HVMCOPY_no_fault | HVMCOPY_virt,
1694 PFEC_page_present | pfec);
1697 #ifdef __x86_64__
1698 DEFINE_PER_CPU(bool_t, hvm_64bit_hcall);
1699 #endif
1701 unsigned long copy_to_user_hvm(void *to, const void *from, unsigned int len)
1703 int rc;
1705 #ifdef __x86_64__
1706 if ( !this_cpu(hvm_64bit_hcall) && is_compat_arg_xlat_range(to, len) )
1708 memcpy(to, from, len);
1709 return 0;
1711 #endif
1713 rc = hvm_copy_to_guest_virt_nofault((unsigned long)to, (void *)from,
1714 len, 0);
1715 return rc ? len : 0; /* fake a copy_to_user() return code */
1718 unsigned long copy_from_user_hvm(void *to, const void *from, unsigned len)
1720 int rc;
1722 #ifdef __x86_64__
1723 if ( !this_cpu(hvm_64bit_hcall) && is_compat_arg_xlat_range(from, len) )
1725 memcpy(to, from, len);
1726 return 0;
1728 #endif
1730 rc = hvm_copy_from_guest_virt_nofault(to, (unsigned long)from, len, 0);
1731 return rc ? len : 0; /* fake a copy_from_user() return code */
1734 #define bitmaskof(idx) (1U << ((idx) & 31))
1735 void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
1736 unsigned int *ecx, unsigned int *edx)
1738 struct vcpu *v = current;
1740 if ( cpuid_viridian_leaves(input, eax, ebx, ecx, edx) )
1741 return;
1743 if ( cpuid_hypervisor_leaves(input, eax, ebx, ecx, edx) )
1744 return;
1746 domain_cpuid(v->domain, input, *ecx, eax, ebx, ecx, edx);
1748 switch ( input )
1750 case 0x1:
1751 /* Fix up VLAPIC details. */
1752 *ebx &= 0x00FFFFFFu;
1753 *ebx |= (v->vcpu_id * 2) << 24;
1754 if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
1755 __clear_bit(X86_FEATURE_APIC & 31, edx);
1756 break;
1757 case 0xb:
1758 /* Fix the x2APIC identifier. */
1759 *edx = v->vcpu_id * 2;
1760 break;
1764 void hvm_rdtsc_intercept(struct cpu_user_regs *regs)
1766 uint64_t tsc;
1767 struct vcpu *v = current;
1769 tsc = hvm_get_guest_tsc(v);
1770 regs->eax = (uint32_t)tsc;
1771 regs->edx = (uint32_t)(tsc >> 32);
1774 int hvm_msr_read_intercept(struct cpu_user_regs *regs)
1776 uint32_t ecx = regs->ecx;
1777 uint64_t msr_content = 0;
1778 struct vcpu *v = current;
1779 uint64_t *var_range_base, *fixed_range_base;
1780 int index, mtrr;
1781 uint32_t cpuid[4];
1783 var_range_base = (uint64_t *)v->arch.hvm_vcpu.mtrr.var_ranges;
1784 fixed_range_base = (uint64_t *)v->arch.hvm_vcpu.mtrr.fixed_ranges;
1786 hvm_cpuid(1, &cpuid[0], &cpuid[1], &cpuid[2], &cpuid[3]);
1787 mtrr = !!(cpuid[3] & bitmaskof(X86_FEATURE_MTRR));
1789 switch ( ecx )
1791 case MSR_IA32_TSC:
1792 msr_content = hvm_get_guest_tsc(v);
1793 break;
1795 case MSR_IA32_APICBASE:
1796 msr_content = vcpu_vlapic(v)->hw.apic_base_msr;
1797 break;
1799 case MSR_IA32_MCG_CAP:
1800 case MSR_IA32_MCG_STATUS:
1801 case MSR_IA32_MC0_STATUS:
1802 case MSR_IA32_MC1_STATUS:
1803 case MSR_IA32_MC2_STATUS:
1804 case MSR_IA32_MC3_STATUS:
1805 case MSR_IA32_MC4_STATUS:
1806 case MSR_IA32_MC5_STATUS:
1807 /* No point in letting the guest see real MCEs */
1808 msr_content = 0;
1809 break;
1811 case MSR_IA32_CR_PAT:
1812 msr_content = v->arch.hvm_vcpu.pat_cr;
1813 break;
1815 case MSR_MTRRcap:
1816 if ( !mtrr )
1817 goto gp_fault;
1818 msr_content = v->arch.hvm_vcpu.mtrr.mtrr_cap;
1819 break;
1820 case MSR_MTRRdefType:
1821 if ( !mtrr )
1822 goto gp_fault;
1823 msr_content = v->arch.hvm_vcpu.mtrr.def_type
1824 | (v->arch.hvm_vcpu.mtrr.enabled << 10);
1825 break;
1826 case MSR_MTRRfix64K_00000:
1827 if ( !mtrr )
1828 goto gp_fault;
1829 msr_content = fixed_range_base[0];
1830 break;
1831 case MSR_MTRRfix16K_80000:
1832 case MSR_MTRRfix16K_A0000:
1833 if ( !mtrr )
1834 goto gp_fault;
1835 index = regs->ecx - MSR_MTRRfix16K_80000;
1836 msr_content = fixed_range_base[index + 1];
1837 break;
1838 case MSR_MTRRfix4K_C0000...MSR_MTRRfix4K_F8000:
1839 if ( !mtrr )
1840 goto gp_fault;
1841 index = regs->ecx - MSR_MTRRfix4K_C0000;
1842 msr_content = fixed_range_base[index + 3];
1843 break;
1844 case MSR_IA32_MTRR_PHYSBASE0...MSR_IA32_MTRR_PHYSMASK7:
1845 if ( !mtrr )
1846 goto gp_fault;
1847 index = regs->ecx - MSR_IA32_MTRR_PHYSBASE0;
1848 msr_content = var_range_base[index];
1849 break;
1851 case MSR_K8_ENABLE_C1E:
1852 /* There's no point in letting the guest see C-States.
1853 * Further, this AMD-only register may be accessed if this HVM guest
1854 * has been migrated to an Intel host. This fixes a guest crash
1855 * in this case.
1856 */
1857 msr_content = 0;
1858 break;
1860 default:
1861 return hvm_funcs.msr_read_intercept(regs);
1864 regs->eax = (uint32_t)msr_content;
1865 regs->edx = (uint32_t)(msr_content >> 32);
1866 return X86EMUL_OKAY;
1868 gp_fault:
1869 hvm_inject_exception(TRAP_gp_fault, 0, 0);
1870 return X86EMUL_EXCEPTION;
1873 int hvm_msr_write_intercept(struct cpu_user_regs *regs)
1875 extern bool_t mtrr_var_range_msr_set(
1876 struct mtrr_state *v, u32 msr, u64 msr_content);
1877 extern bool_t mtrr_fix_range_msr_set(
1878 struct mtrr_state *v, int row, u64 msr_content);
1879 extern bool_t mtrr_def_type_msr_set(struct mtrr_state *v, u64 msr_content);
1880 extern bool_t pat_msr_set(u64 *pat, u64 msr);
1882 uint32_t ecx = regs->ecx;
1883 uint64_t msr_content = (uint32_t)regs->eax | ((uint64_t)regs->edx << 32);
1884 struct vcpu *v = current;
1885 int index, mtrr;
1886 uint32_t cpuid[4];
1888 hvm_cpuid(1, &cpuid[0], &cpuid[1], &cpuid[2], &cpuid[3]);
1889 mtrr = !!(cpuid[3] & bitmaskof(X86_FEATURE_MTRR));
1891 switch ( ecx )
1893 case MSR_IA32_TSC:
1894 hvm_set_guest_tsc(v, msr_content);
1895 pt_reset(v);
1896 break;
1898 case MSR_IA32_APICBASE:
1899 vlapic_msr_set(vcpu_vlapic(v), msr_content);
1900 break;
1902 case MSR_IA32_CR_PAT:
1903 if ( !pat_msr_set(&v->arch.hvm_vcpu.pat_cr, msr_content) )
1904 goto gp_fault;
1905 break;
1907 case MSR_MTRRcap:
1908 if ( !mtrr )
1909 goto gp_fault;
1910 goto gp_fault;
1911 case MSR_MTRRdefType:
1912 if ( !mtrr )
1913 goto gp_fault;
1914 if ( !mtrr_def_type_msr_set(&v->arch.hvm_vcpu.mtrr, msr_content) )
1915 goto gp_fault;
1916 break;
1917 case MSR_MTRRfix64K_00000:
1918 if ( !mtrr )
1919 goto gp_fault;
1920 if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr, 0, msr_content) )
1921 goto gp_fault;
1922 break;
1923 case MSR_MTRRfix16K_80000:
1924 case MSR_MTRRfix16K_A0000:
1925 if ( !mtrr )
1926 goto gp_fault;
1927 index = regs->ecx - MSR_MTRRfix16K_80000 + 1;
1928 if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr,
1929 index, msr_content) )
1930 goto gp_fault;
1931 break;
1932 case MSR_MTRRfix4K_C0000...MSR_MTRRfix4K_F8000:
1933 if ( !mtrr )
1934 goto gp_fault;
1935 index = regs->ecx - MSR_MTRRfix4K_C0000 + 3;
1936 if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr,
1937 index, msr_content) )
1938 goto gp_fault;
1939 break;
1940 case MSR_IA32_MTRR_PHYSBASE0...MSR_IA32_MTRR_PHYSMASK7:
1941 if ( !mtrr )
1942 goto gp_fault;
1943 if ( !mtrr_var_range_msr_set(&v->arch.hvm_vcpu.mtrr,
1944 regs->ecx, msr_content) )
1945 goto gp_fault;
1946 break;
1948 default:
1949 return hvm_funcs.msr_write_intercept(regs);
1952 return X86EMUL_OKAY;
1954 gp_fault:
1955 hvm_inject_exception(TRAP_gp_fault, 0, 0);
1956 return X86EMUL_EXCEPTION;
1959 enum hvm_intblk hvm_interrupt_blocked(struct vcpu *v, struct hvm_intack intack)
1961 unsigned long intr_shadow;
1963 ASSERT(v == current);
1965 if ( (intack.source != hvm_intsrc_nmi) &&
1966 !(guest_cpu_user_regs()->eflags & X86_EFLAGS_IF) )
1967 return hvm_intblk_rflags_ie;
1969 intr_shadow = hvm_funcs.get_interrupt_shadow(v);
1971 if ( intr_shadow & (HVM_INTR_SHADOW_STI|HVM_INTR_SHADOW_MOV_SS) )
1972 return hvm_intblk_shadow;
1974 if ( intack.source == hvm_intsrc_nmi )
1975 return ((intr_shadow & HVM_INTR_SHADOW_NMI) ?
1976 hvm_intblk_nmi_iret : hvm_intblk_none);
1978 if ( intack.source == hvm_intsrc_lapic )
1980 uint32_t tpr = vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI) & 0xF0;
1981 if ( (tpr >> 4) >= (intack.vector >> 4) )
1982 return hvm_intblk_tpr;
1985 return hvm_intblk_none;
1988 static long hvm_grant_table_op(
1989 unsigned int cmd, XEN_GUEST_HANDLE(void) uop, unsigned int count)
1991 if ( (cmd != GNTTABOP_query_size) && (cmd != GNTTABOP_setup_table) )
1992 return -ENOSYS; /* all other commands need auditing */
1993 return do_grant_table_op(cmd, uop, count);
1996 static long hvm_memory_op(int cmd, XEN_GUEST_HANDLE(void) arg)
1998 long rc = do_memory_op(cmd, arg);
1999 if ( (cmd & MEMOP_CMD_MASK) == XENMEM_decrease_reservation )
2000 current->domain->arch.hvm_domain.qemu_mapcache_invalidate = 1;
2001 return rc;
2004 static long hvm_vcpu_op(
2005 int cmd, int vcpuid, XEN_GUEST_HANDLE(void) arg)
2007 long rc;
2009 switch ( cmd )
2011 case VCPUOP_register_runstate_memory_area:
2012 case VCPUOP_get_runstate_info:
2013 rc = do_vcpu_op(cmd, vcpuid, arg);
2014 break;
2015 default:
2016 rc = -ENOSYS;
2017 break;
2020 return rc;
2023 typedef unsigned long hvm_hypercall_t(
2024 unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
2026 #define HYPERCALL(x) \
2027 [ __HYPERVISOR_ ## x ] = (hvm_hypercall_t *) do_ ## x
2029 #if defined(__i386__)
2031 static hvm_hypercall_t *hvm_hypercall32_table[NR_hypercalls] = {
2032 [ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)hvm_memory_op,
2033 [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
2034 [ __HYPERVISOR_vcpu_op ] = (hvm_hypercall_t *)hvm_vcpu_op,
2035 HYPERCALL(xen_version),
2036 HYPERCALL(event_channel_op),
2037 HYPERCALL(sched_op),
2038 HYPERCALL(hvm_op)
2039 };
2041 #else /* defined(__x86_64__) */
2043 static long hvm_memory_op_compat32(int cmd, XEN_GUEST_HANDLE(void) arg)
2045 long rc = compat_memory_op(cmd, arg);
2046 if ( (cmd & MEMOP_CMD_MASK) == XENMEM_decrease_reservation )
2047 current->domain->arch.hvm_domain.qemu_mapcache_invalidate = 1;
2048 return rc;
2051 static long hvm_vcpu_op_compat32(
2052 int cmd, int vcpuid, XEN_GUEST_HANDLE(void) arg)
2054 long rc;
2056 switch ( cmd )
2058 case VCPUOP_register_runstate_memory_area:
2059 case VCPUOP_get_runstate_info:
2060 rc = compat_vcpu_op(cmd, vcpuid, arg);
2061 break;
2062 default:
2063 rc = -ENOSYS;
2064 break;
2067 return rc;
2070 static hvm_hypercall_t *hvm_hypercall64_table[NR_hypercalls] = {
2071 [ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)hvm_memory_op,
2072 [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
2073 [ __HYPERVISOR_vcpu_op ] = (hvm_hypercall_t *)hvm_vcpu_op,
2074 HYPERCALL(xen_version),
2075 HYPERCALL(event_channel_op),
2076 HYPERCALL(sched_op),
2077 HYPERCALL(hvm_op)
2078 };
2080 static hvm_hypercall_t *hvm_hypercall32_table[NR_hypercalls] = {
2081 [ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)hvm_memory_op_compat32,
2082 [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
2083 [ __HYPERVISOR_vcpu_op ] = (hvm_hypercall_t *)hvm_vcpu_op_compat32,
2084 HYPERCALL(xen_version),
2085 HYPERCALL(event_channel_op),
2086 HYPERCALL(sched_op),
2087 HYPERCALL(hvm_op)
2088 };
2090 #endif /* defined(__x86_64__) */
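/*
 * hvm_do_hypercall() decodes a hypercall trapped from the guest: it rejects
 * callers running at DPL 3, hands 0x80000000-based calls from
 * Viridian-enlightened domains to viridian_hypercall(), then dispatches
 * through the 64- or 32-bit table according to the guest's execution mode.
 * The return value distinguishes completed, preempted and
 * mapcache-invalidating hypercalls.
 */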
2092 int hvm_do_hypercall(struct cpu_user_regs *regs)
2094 struct vcpu *curr = current;
2095 struct segment_register sreg;
2096 int mode = hvm_guest_x86_mode(curr);
2097 uint32_t eax = regs->eax;
2099 switch ( mode )
2101 #ifdef __x86_64__
2102 case 8:
2103 #endif
2104 case 4:
2105 case 2:
2106 hvm_get_segment_register(curr, x86_seg_ss, &sreg);
2107 if ( unlikely(sreg.attr.fields.dpl == 3) )
2109 default:
2110 regs->eax = -EPERM;
2111 return HVM_HCALL_completed;
2113 case 0:
2114 break;
2117 if ( (eax & 0x80000000) && is_viridian_domain(curr->domain) )
2118 return viridian_hypercall(regs);
2120 if ( (eax >= NR_hypercalls) || !hvm_hypercall32_table[eax] )
2122 regs->eax = -ENOSYS;
2123 return HVM_HCALL_completed;
2126 this_cpu(hc_preempted) = 0;
2128 #ifdef __x86_64__
2129 if ( mode == 8 )
2131 HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u(%lx, %lx, %lx, %lx, %lx)", eax,
2132 regs->rdi, regs->rsi, regs->rdx, regs->r10, regs->r8);
2134 this_cpu(hvm_64bit_hcall) = 1;
2135 regs->rax = hvm_hypercall64_table[eax](regs->rdi,
2136 regs->rsi,
2137 regs->rdx,
2138 regs->r10,
2139 regs->r8);
2140 this_cpu(hvm_64bit_hcall) = 0;
2142 else
2143 #endif
2145 HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u(%x, %x, %x, %x, %x)", eax,
2146 (uint32_t)regs->ebx, (uint32_t)regs->ecx,
2147 (uint32_t)regs->edx, (uint32_t)regs->esi,
2148 (uint32_t)regs->edi);
2150 regs->eax = hvm_hypercall32_table[eax]((uint32_t)regs->ebx,
2151 (uint32_t)regs->ecx,
2152 (uint32_t)regs->edx,
2153 (uint32_t)regs->esi,
2154 (uint32_t)regs->edi);
2157 HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u -> %lx",
2158 eax, (unsigned long)regs->eax);
2160 if ( this_cpu(hc_preempted) )
2161 return HVM_HCALL_preempted;
2163 if ( unlikely(curr->domain->arch.hvm_domain.qemu_mapcache_invalidate) &&
2164 test_and_clear_bool(curr->domain->arch.hvm_domain.
2165 qemu_mapcache_invalidate) )
2166 return HVM_HCALL_invalidate;
2168 return HVM_HCALL_completed;

2171 static void hvm_latch_shinfo_size(struct domain *d)
2173 /*
2174 * Called from operations which are among the very first executed by
2175 * PV drivers on initialisation or after save/restore. These are sensible
2176 * points at which to sample the execution mode of the guest and latch
2177 * 32- or 64-bit format for shared state.
2178 */
2179 if ( current->domain == d )
2180 d->arch.has_32bit_shinfo = (hvm_guest_x86_mode(current) != 8);
2183 /* Initialise a hypercall transfer page for an HVM domain using
2184 paravirtualised drivers. */
2185 void hvm_hypercall_page_initialise(struct domain *d,
2186 void *hypercall_page)
2188 hvm_latch_shinfo_size(d);
2189 hvm_funcs.init_hypercall_page(d, hypercall_page);
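/*
 * Editor's sketch (not part of hvm.c): once the hypercall page has been
 * initialised, a guest invokes hypercall N by calling offset N*32 within
 * that page, passing arguments in the registers that hvm_do_hypercall()
 * above unpacks. A 32-bit guest might, for instance, do roughly:
 *
 *     mov  $XENVER_version, %ebx        # arg1: command
 *     xor  %ecx, %ecx                   # arg2: unused for XENVER_version
 *     call hypercall_page + __HYPERVISOR_xen_version * 32
 *                                       # result returned in %eax
 *
 * This is illustrative only; the exact calling sequence is provided by the
 * guest OS's hypercall bindings.
 */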
2192 static int hvmop_set_pci_intx_level(
2193 XEN_GUEST_HANDLE(xen_hvm_set_pci_intx_level_t) uop)
2195 struct xen_hvm_set_pci_intx_level op;
2196 struct domain *d;
2197 int rc;
2199 if ( copy_from_guest(&op, uop, 1) )
2200 return -EFAULT;
2202 if ( (op.domain > 0) || (op.bus > 0) || (op.device > 31) || (op.intx > 3) )
2203 return -EINVAL;
2205 d = rcu_lock_domain_by_id(op.domid);
2206 if ( d == NULL )
2207 return -ESRCH;
2209 rc = -EPERM;
2210 if ( !IS_PRIV_FOR(current->domain, d) )
2211 goto out;
2213 rc = -EINVAL;
2214 if ( !is_hvm_domain(d) )
2215 goto out;
2217 rc = xsm_hvm_set_pci_intx_level(d);
2218 if ( rc )
2219 goto out;
2221 rc = 0;
2222 switch ( op.level )
2224 case 0:
2225 hvm_pci_intx_deassert(d, op.device, op.intx);
2226 break;
2227 case 1:
2228 hvm_pci_intx_assert(d, op.device, op.intx);
2229 break;
2230 default:
2231 rc = -EINVAL;
2232 break;
2235 out:
2236 rcu_unlock_domain(d);
2237 return rc;
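/*
 * hvm_vcpu_reset_state() places a vcpu in the architectural post-reset
 * state: real mode with flat 64KiB segments, CS:IP as supplied by the
 * caller, CR0 reduced to its reset value (ET set, paging off), CR2/CR3/CR4
 * and EFER cleared, and the TSC offset copied from the BSP so secondary
 * processors start with a consistent view of time.
 */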
2240 void hvm_vcpu_reset_state(struct vcpu *v, uint16_t cs, uint16_t ip)
2242 struct domain *d = v->domain;
2243 struct vcpu_guest_context *ctxt;
2244 struct segment_register reg;
2246 BUG_ON(vcpu_runnable(v));
2248 domain_lock(d);
2250 if ( v->is_initialised )
2251 goto out;
2253 if ( !paging_mode_hap(d) )
2255 if ( v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PG )
2256 put_page(pagetable_get_page(v->arch.guest_table));
2257 v->arch.guest_table = pagetable_null();
2260 ctxt = &v->arch.guest_context;
2261 memset(ctxt, 0, sizeof(*ctxt));
2262 ctxt->flags = VGCF_online;
2263 ctxt->user_regs.eflags = 2;
2264 ctxt->user_regs.edx = 0x00000f00;
2265 ctxt->user_regs.eip = ip;
2267 v->arch.hvm_vcpu.guest_cr[0] = X86_CR0_ET;
2268 hvm_update_guest_cr(v, 0);
2270 v->arch.hvm_vcpu.guest_cr[2] = 0;
2271 hvm_update_guest_cr(v, 2);
2273 v->arch.hvm_vcpu.guest_cr[3] = 0;
2274 hvm_update_guest_cr(v, 3);
2276 v->arch.hvm_vcpu.guest_cr[4] = 0;
2277 hvm_update_guest_cr(v, 4);
2279 v->arch.hvm_vcpu.guest_efer = 0;
2280 hvm_update_guest_efer(v);
2282 reg.sel = cs;
2283 reg.base = (uint32_t)reg.sel << 4;
2284 reg.limit = 0xffff;
2285 reg.attr.bytes = 0x09b;
2286 hvm_set_segment_register(v, x86_seg_cs, &reg);
2288 reg.sel = reg.base = 0;
2289 reg.limit = 0xffff;
2290 reg.attr.bytes = 0x093;
2291 hvm_set_segment_register(v, x86_seg_ds, &reg);
2292 hvm_set_segment_register(v, x86_seg_es, &reg);
2293 hvm_set_segment_register(v, x86_seg_fs, &reg);
2294 hvm_set_segment_register(v, x86_seg_gs, &reg);
2295 hvm_set_segment_register(v, x86_seg_ss, &reg);
2297 reg.attr.bytes = 0x82; /* LDT */
2298 hvm_set_segment_register(v, x86_seg_ldtr, &reg);
2300 reg.attr.bytes = 0x8b; /* 32-bit TSS (busy) */
2301 hvm_set_segment_register(v, x86_seg_tr, &reg);
2303 reg.attr.bytes = 0;
2304 hvm_set_segment_register(v, x86_seg_gdtr, &reg);
2305 hvm_set_segment_register(v, x86_seg_idtr, &reg);
2307 /* Sync AP's TSC with BSP's. */
2308 v->arch.hvm_vcpu.cache_tsc_offset =
2309 v->domain->vcpu[0]->arch.hvm_vcpu.cache_tsc_offset;
2310 hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset);
2312 paging_update_paging_modes(v);
2314 v->arch.flags |= TF_kernel_mode;
2315 v->is_initialised = 1;
2316 clear_bit(_VPF_down, &v->pause_flags);
2318 out:
2319 domain_unlock(d);
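/*
 * ACPI S3 handling: hvm_s3_suspend() pauses the domain, resets every vcpu
 * and its local APIC along with the emulated platform devices (PIC, IOAPIC,
 * PIT, RTC, PM timer, HPET), and parks vcpu0 at the reset vector
 * 0xf000:0xfff0; hvm_s3_resume() simply unpauses a previously suspended
 * domain.
 */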
2322 static void hvm_s3_suspend(struct domain *d)
2324 struct vcpu *v;
2326 domain_pause(d);
2327 domain_lock(d);
2329 if ( d->is_dying || (d->vcpu == NULL) || (d->vcpu[0] == NULL) ||
2330 test_and_set_bool(d->arch.hvm_domain.is_s3_suspended) )
2332 domain_unlock(d);
2333 domain_unpause(d);
2334 return;
2337 for_each_vcpu ( d, v )
2339 vlapic_reset(vcpu_vlapic(v));
2340 vcpu_reset(v);
2343 vpic_reset(d);
2344 vioapic_reset(d);
2345 pit_reset(d);
2346 rtc_reset(d);
2347 pmtimer_reset(d);
2348 hpet_reset(d);
2350 hvm_vcpu_reset_state(d->vcpu[0], 0xf000, 0xfff0);
2352 domain_unlock(d);
2355 static void hvm_s3_resume(struct domain *d)
2357 if ( test_and_clear_bool(d->arch.hvm_domain.is_s3_suspended) )
2358 domain_unpause(d);
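/*
 * The next two HVMOP handlers mirror hvmop_set_pci_intx_level() above:
 * after the same privilege and XSM checks they assert/deassert an ISA IRQ
 * line, or reroute one of the four PCI interrupt links to an ISA IRQ.
 */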
2361 static int hvmop_set_isa_irq_level(
2362 XEN_GUEST_HANDLE(xen_hvm_set_isa_irq_level_t) uop)
2364 struct xen_hvm_set_isa_irq_level op;
2365 struct domain *d;
2366 int rc;
2368 if ( copy_from_guest(&op, uop, 1) )
2369 return -EFAULT;
2371 if ( op.isa_irq > 15 )
2372 return -EINVAL;
2374 d = rcu_lock_domain_by_id(op.domid);
2375 if ( d == NULL )
2376 return -ESRCH;
2378 rc = -EPERM;
2379 if ( !IS_PRIV_FOR(current->domain, d) )
2380 goto out;
2382 rc = -EINVAL;
2383 if ( !is_hvm_domain(d) )
2384 goto out;
2386 rc = xsm_hvm_set_isa_irq_level(d);
2387 if ( rc )
2388 goto out;
2390 rc = 0;
2391 switch ( op.level )
2393 case 0:
2394 hvm_isa_irq_deassert(d, op.isa_irq);
2395 break;
2396 case 1:
2397 hvm_isa_irq_assert(d, op.isa_irq);
2398 break;
2399 default:
2400 rc = -EINVAL;
2401 break;
2404 out:
2405 rcu_unlock_domain(d);
2406 return rc;
2409 static int hvmop_set_pci_link_route(
2410 XEN_GUEST_HANDLE(xen_hvm_set_pci_link_route_t) uop)
2412 struct xen_hvm_set_pci_link_route op;
2413 struct domain *d;
2414 int rc;
2416 if ( copy_from_guest(&op, uop, 1) )
2417 return -EFAULT;
2419 if ( (op.link > 3) || (op.isa_irq > 15) )
2420 return -EINVAL;
2422 d = rcu_lock_domain_by_id(op.domid);
2423 if ( d == NULL )
2424 return -ESRCH;
2426 rc = -EPERM;
2427 if ( !IS_PRIV_FOR(current->domain, d) )
2428 goto out;
2430 rc = -EINVAL;
2431 if ( !is_hvm_domain(d) )
2432 goto out;
2434 rc = xsm_hvm_set_pci_link_route(d);
2435 if ( rc )
2436 goto out;
2438 rc = 0;
2439 hvm_set_pci_link_route(d, op.link, op.isa_irq);
2441 out:
2442 rcu_unlock_domain(d);
2443 return rc;
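/*
 * hvmop_flush_tlb_all() pauses every other vcpu before flushing, and takes
 * hypercall_deadlock_mutex with a trylock so that a second vcpu issuing the
 * flush concurrently gets -EAGAIN (retried via the continuation at the
 * bottom of do_hvm_op()) rather than both vcpus deadlocking while waiting
 * for each other to deschedule.
 */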
2446 static int hvmop_flush_tlb_all(void)
2448 struct domain *d = current->domain;
2449 struct vcpu *v;
2451 if ( !is_hvm_domain(d) )
2452 return -EINVAL;
2454 /* Avoid deadlock if more than one vcpu tries this at the same time. */
2455 if ( !spin_trylock(&d->hypercall_deadlock_mutex) )
2456 return -EAGAIN;
2458 /* Pause all other vcpus. */
2459 for_each_vcpu ( d, v )
2460 if ( v != current )
2461 vcpu_pause_nosync(v);
2463 /* Now that all VCPUs are signalled to deschedule, we wait... */
2464 for_each_vcpu ( d, v )
2465 if ( v != current )
2466 while ( !vcpu_runnable(v) && v->is_running )
2467 cpu_relax();
2469 /* All other vcpus are paused, safe to unlock now. */
2470 spin_unlock(&d->hypercall_deadlock_mutex);
2472 /* Flush paging-mode soft state (e.g., va->gfn cache; PAE PDPE cache). */
2473 for_each_vcpu ( d, v )
2474 paging_update_cr3(v);
2476 /* Flush all dirty TLBs. */
2477 flush_tlb_mask(&d->domain_dirty_cpumask);
2479 /* Done. */
2480 for_each_vcpu ( d, v )
2481 if ( v != current )
2482 vcpu_unpause(v);
2484 return 0;
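/*
 * do_hvm_op() is the top-level HVMOP_* dispatcher. Editor's sketch (not
 * part of hvm.c) of how a caller hands it an HVM parameter, based on the
 * xen_hvm_param fields used below (domid/index/value):
 *
 *     struct xen_hvm_param a = {
 *         .domid = DOMID_SELF,
 *         .index = HVM_PARAM_CALLBACK_IRQ,
 *         .value = callback_via,
 *     };
 *     rc = HYPERVISOR_hvm_op(HVMOP_set_param, &a);
 *
 * HYPERVISOR_hvm_op and callback_via are illustrative names here; the real
 * entry point depends on the guest OS's hypercall bindings.
 */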
2487 long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE(void) arg)
2490 struct domain *curr_d = current->domain;
2491 long rc = 0;
2493 switch ( op )
2495 case HVMOP_set_param:
2496 case HVMOP_get_param:
2498 struct xen_hvm_param a;
2499 struct hvm_ioreq_page *iorp;
2500 struct domain *d;
2501 struct vcpu *v;
2503 if ( copy_from_guest(&a, arg, 1) )
2504 return -EFAULT;
2506 if ( a.index >= HVM_NR_PARAMS )
2507 return -EINVAL;
2509 rc = rcu_lock_target_domain_by_id(a.domid, &d);
2510 if ( rc != 0 )
2511 return rc;
2513 rc = -EINVAL;
2514 if ( !is_hvm_domain(d) )
2515 goto param_fail;
2517 rc = xsm_hvm_param(d, op);
2518 if ( rc )
2519 goto param_fail;
2521 if ( op == HVMOP_set_param )
2523 rc = 0;
2525 switch ( a.index )
2527 case HVM_PARAM_IOREQ_PFN:
2528 iorp = &d->arch.hvm_domain.ioreq;
2529 if ( (rc = hvm_set_ioreq_page(d, iorp, a.value)) != 0 )
2530 break;
2531 spin_lock(&iorp->lock);
2532 if ( iorp->va != NULL )
2533 /* Initialise evtchn port info if VCPUs already created. */
2534 for_each_vcpu ( d, v )
2535 get_ioreq(v)->vp_eport = v->arch.hvm_vcpu.xen_port;
2536 spin_unlock(&iorp->lock);
2537 break;
2538 case HVM_PARAM_BUFIOREQ_PFN:
2539 iorp = &d->arch.hvm_domain.buf_ioreq;
2540 rc = hvm_set_ioreq_page(d, iorp, a.value);
2541 break;
2542 case HVM_PARAM_CALLBACK_IRQ:
2543 hvm_set_callback_via(d, a.value);
2544 hvm_latch_shinfo_size(d);
2545 break;
2546 case HVM_PARAM_TIMER_MODE:
2547 if ( a.value > HVMPTM_one_missed_tick_pending )
2548 rc = -EINVAL;
2549 break;
2550 case HVM_PARAM_VIRIDIAN:
2551 if ( a.value > 1 )
2552 rc = -EINVAL;
2553 break;
2554 case HVM_PARAM_IDENT_PT:
2555 /* Not reflexive, as we must domain_pause(). */
2556 rc = -EPERM;
2557 if ( curr_d == d )
2558 break;
2560 rc = -EINVAL;
2561 if ( d->arch.hvm_domain.params[a.index] != 0 )
2562 break;
2564 rc = 0;
2565 if ( !paging_mode_hap(d) )
2566 break;
2568 /*
2569 * Update GUEST_CR3 in each VMCS to point at identity map.
2570 * All foreign updates to guest state must synchronise on
2571 * the domctl_lock.
2572 */
2573 rc = -EAGAIN;
2574 if ( !domctl_lock_acquire() )
2575 break;
2577 rc = 0;
2578 domain_pause(d);
2579 d->arch.hvm_domain.params[a.index] = a.value;
2580 for_each_vcpu ( d, v )
2581 paging_update_cr3(v);
2582 domain_unpause(d);
2584 domctl_lock_release();
2585 break;
2586 case HVM_PARAM_DM_DOMAIN:
2587 /* Not reflexive, as we must domain_pause(). */
2588 rc = -EPERM;
2589 if ( curr_d == d )
2590 break;
2592 if ( a.value == DOMID_SELF )
2593 a.value = curr_d->domain_id;
2595 rc = 0;
2596 domain_pause(d); /* safe to change per-vcpu xen_port */
2597 iorp = &d->arch.hvm_domain.ioreq;
2598 for_each_vcpu ( d, v )
2600 int old_port, new_port;
2601 new_port = alloc_unbound_xen_event_channel(v, a.value);
2602 if ( new_port < 0 )
2604 rc = new_port;
2605 break;
2607 /* xchg() ensures that only we call free_xen_event_channel() on the old port. */
2608 old_port = xchg(&v->arch.hvm_vcpu.xen_port, new_port);
2609 free_xen_event_channel(v, old_port);
2610 spin_lock(&iorp->lock);
2611 if ( iorp->va != NULL )
2612 get_ioreq(v)->vp_eport = v->arch.hvm_vcpu.xen_port;
2613 spin_unlock(&iorp->lock);
2615 domain_unpause(d);
2616 break;
2617 case HVM_PARAM_ACPI_S_STATE:
2618 /* Not reflexive, as we must domain_pause(). */
2619 rc = -EPERM;
2620 if ( curr_d == d )
2621 break;
2623 rc = 0;
2624 if ( a.value == 3 )
2625 hvm_s3_suspend(d);
2626 else if ( a.value == 0 )
2627 hvm_s3_resume(d);
2628 else
2629 rc = -EINVAL;
2631 break;
2634 if ( rc == 0 )
2635 d->arch.hvm_domain.params[a.index] = a.value;
2637 else
2639 switch ( a.index )
2641 case HVM_PARAM_ACPI_S_STATE:
2642 a.value = d->arch.hvm_domain.is_s3_suspended ? 3 : 0;
2643 break;
2644 default:
2645 a.value = d->arch.hvm_domain.params[a.index];
2646 break;
2648 rc = copy_to_guest(arg, &a, 1) ? -EFAULT : 0;
2651 HVM_DBG_LOG(DBG_LEVEL_HCALL, "%s param %u = %"PRIx64,
2652 op == HVMOP_set_param ? "set" : "get",
2653 a.index, a.value);
2655 param_fail:
2656 rcu_unlock_domain(d);
2657 break;
2660 case HVMOP_set_pci_intx_level:
2661 rc = hvmop_set_pci_intx_level(
2662 guest_handle_cast(arg, xen_hvm_set_pci_intx_level_t));
2663 break;
2665 case HVMOP_set_isa_irq_level:
2666 rc = hvmop_set_isa_irq_level(
2667 guest_handle_cast(arg, xen_hvm_set_isa_irq_level_t));
2668 break;
2670 case HVMOP_set_pci_link_route:
2671 rc = hvmop_set_pci_link_route(
2672 guest_handle_cast(arg, xen_hvm_set_pci_link_route_t));
2673 break;
2675 case HVMOP_flush_tlbs:
2676 rc = guest_handle_is_null(arg) ? hvmop_flush_tlb_all() : -ENOSYS;
2677 break;
2679 case HVMOP_track_dirty_vram:
2681 struct xen_hvm_track_dirty_vram a;
2682 struct domain *d;
2684 if ( copy_from_guest(&a, arg, 1) )
2685 return -EFAULT;
2687 rc = rcu_lock_target_domain_by_id(a.domid, &d);
2688 if ( rc != 0 )
2689 return rc;
2691 rc = -EINVAL;
2692 if ( !is_hvm_domain(d) )
2693 goto param_fail2;
2695 rc = xsm_hvm_param(d, op);
2696 if ( rc )
2697 goto param_fail2;
2699 rc = -ESRCH;
2700 if ( d->is_dying )
2701 goto param_fail2;
2703 rc = -EINVAL;
2704 if ( d->vcpu == NULL || d->vcpu[0] == NULL )
2705 goto param_fail2;
2707 if ( shadow_mode_enabled(d) )
2708 rc = shadow_track_dirty_vram(d, a.first_pfn, a.nr, a.dirty_bitmap);
2709 else
2710 rc = hap_track_dirty_vram(d, a.first_pfn, a.nr, a.dirty_bitmap);
2712 param_fail2:
2713 rcu_unlock_domain(d);
2714 break;
2717 case HVMOP_modified_memory:
2719 struct xen_hvm_modified_memory a;
2720 struct domain *d;
2721 unsigned long pfn;
2723 if ( copy_from_guest(&a, arg, 1) )
2724 return -EFAULT;
2726 rc = rcu_lock_target_domain_by_id(a.domid, &d);
2727 if ( rc != 0 )
2728 return rc;
2730 rc = -EINVAL;
2731 if ( !is_hvm_domain(d) )
2732 goto param_fail3;
2734 rc = xsm_hvm_param(d, op);
2735 if ( rc )
2736 goto param_fail3;
2738 rc = -EINVAL;
2739 if ( (a.first_pfn > domain_get_maximum_gpfn(d)) ||
2740 ((a.first_pfn + a.nr - 1) < a.first_pfn) ||
2741 ((a.first_pfn + a.nr - 1) > domain_get_maximum_gpfn(d)) )
2742 goto param_fail3;
2744 rc = 0;
2745 if ( !paging_mode_log_dirty(d) )
2746 goto param_fail3;
2748 for ( pfn = a.first_pfn; pfn < a.first_pfn + a.nr; pfn++ )
2750 p2m_type_t t;
2751 mfn_t mfn = gfn_to_mfn(d, pfn, &t);
2752 if ( mfn_x(mfn) != INVALID_MFN )
2754 paging_mark_dirty(d, mfn_x(mfn));
2755 /* These are most probably not page tables any more, so make only */
2756 /* a quick, best-effort attempt to remove their shadows. */
2757 sh_remove_shadows(d->vcpu[0], mfn, 1, 0);
2761 param_fail3:
2762 rcu_unlock_domain(d);
2763 break;
2766 case HVMOP_set_mem_type:
2768 struct xen_hvm_set_mem_type a;
2769 struct domain *d;
2770 unsigned long pfn;
2772 /* Interface types to internal p2m types */
2773 p2m_type_t memtype[] = {
2774 p2m_ram_rw, /* HVMMEM_ram_rw */
2775 p2m_ram_ro, /* HVMMEM_ram_ro */
2776 p2m_mmio_dm /* HVMMEM_mmio_dm */
2777 };
2779 if ( copy_from_guest(&a, arg, 1) )
2780 return -EFAULT;
2782 rc = rcu_lock_target_domain_by_id(a.domid, &d);
2783 if ( rc != 0 )
2784 return rc;
2786 rc = -EINVAL;
2787 if ( !is_hvm_domain(d) )
2788 goto param_fail4;
2790 rc = -EINVAL;
2791 if ( (a.first_pfn > domain_get_maximum_gpfn(d)) ||
2792 ((a.first_pfn + a.nr - 1) < a.first_pfn) ||
2793 ((a.first_pfn + a.nr - 1) > domain_get_maximum_gpfn(d)) )
2794 goto param_fail4;
2796 if ( a.hvmmem_type >= ARRAY_SIZE(memtype) )
2797 goto param_fail4;
2799 rc = 0;
2801 for ( pfn = a.first_pfn; pfn < a.first_pfn + a.nr; pfn++ )
2803 p2m_type_t t;
2804 mfn_t mfn;
2805 mfn = gfn_to_mfn(d, pfn, &t);
2806 p2m_change_type(d, pfn, t, memtype[a.hvmmem_type]);
2809 param_fail4:
2810 rcu_unlock_domain(d);
2811 break;
2814 default:
2816 gdprintk(XENLOG_WARNING, "Bad HVM op %ld.\n", op);
2817 rc = -ENOSYS;
2818 break;
2822 if ( rc == -EAGAIN )
2823 rc = hypercall_create_continuation(
2824 __HYPERVISOR_hvm_op, "lh", op, arg);
2826 return rc;
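/*
 * hvm_debug_op() below toggles single-stepping of a vcpu for the debugger
 * domctl. It relies on the CPU's monitor trap flag, returning -ENOSYS when
 * that capability is absent, and pauses/unpauses the vcpu so the new
 * single_step setting is latched the next time the vcpu is scheduled.
 */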
2829 int hvm_debug_op(struct vcpu *v, int32_t op)
2831 int rc;
2833 switch ( op )
2835 case XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON:
2836 case XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_OFF:
2837 rc = -ENOSYS;
2838 if ( !cpu_has_monitor_trap_flag )
2839 break;
2840 rc = 0;
2841 vcpu_pause(v);
2842 v->arch.hvm_vcpu.single_step =
2843 (op == XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON);
2844 vcpu_unpause(v); /* guest will latch new state */
2845 break;
2846 default:
2847 rc = -ENOSYS;
2848 break;
2851 return rc;
2855 /*
2856 * Local variables:
2857 * mode: C
2858 * c-set-style: "BSD"
2859 * c-basic-offset: 4
2860 * tab-width: 4
2861 * indent-tabs-mode: nil
2862 * End:
2863 */