xen/arch/x86/traps.c @ 19697:42fe00c6f8b4

Enable pci mmcfg and ATS for x86_64

This patch enables PCI MMCONFIG in xen and turns on hooks for ATS.

Signed-off-by: Allen Kay <allen.m.kay@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Tue Jun 02 11:49:34 2009 +0100 (2009-06-02)
parents 7a73e3aeb224
children 67a0ffade665
line source
1 /******************************************************************************
2 * arch/x86/traps.c
3 *
4 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * Copyright (C) 1991, 1992 Linus Torvalds
23 *
24 * Pentium III FXSR, SSE support
25 * Gareth Hughes <gareth@valinux.com>, May 2000
26 */
28 #include <xen/config.h>
29 #include <xen/init.h>
30 #include <xen/sched.h>
31 #include <xen/lib.h>
32 #include <xen/errno.h>
33 #include <xen/mm.h>
34 #include <xen/console.h>
35 #include <xen/shutdown.h>
36 #include <asm/regs.h>
37 #include <xen/delay.h>
38 #include <xen/event.h>
39 #include <xen/spinlock.h>
40 #include <xen/irq.h>
41 #include <xen/perfc.h>
42 #include <xen/softirq.h>
43 #include <xen/domain_page.h>
44 #include <xen/symbols.h>
45 #include <xen/iocap.h>
46 #include <xen/nmi.h>
47 #include <xen/version.h>
48 #include <xen/kexec.h>
49 #include <xen/trace.h>
50 #include <xen/paging.h>
51 #include <asm/system.h>
52 #include <asm/io.h>
53 #include <asm/atomic.h>
54 #include <asm/desc.h>
55 #include <asm/debugreg.h>
56 #include <asm/smp.h>
57 #include <asm/flushtlb.h>
58 #include <asm/uaccess.h>
59 #include <asm/i387.h>
60 #include <asm/debugger.h>
61 #include <asm/msr.h>
62 #include <asm/shared.h>
63 #include <asm/x86_emulate.h>
64 #include <asm/traps.h>
65 #include <asm/hvm/vpt.h>
66 #include <public/arch-x86/cpuid.h>
68 /*
69 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
70 * fatal: Xen prints diagnostic message and then hangs.
71 * dom0: The NMI is virtualised to DOM0.
72 * ignore: The NMI error is cleared and ignored.
73 */
74 #ifdef NDEBUG
75 char opt_nmi[10] = "dom0";
76 #else
77 char opt_nmi[10] = "fatal";
78 #endif
79 string_param("nmi", opt_nmi);
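/*
 * Usage note (illustrative, not part of traps.c): the behaviour above is
 * selected from the hypervisor command line, e.g. booting Xen with
 * "nmi=ignore", "nmi=dom0" or "nmi=fatal" overrides the default chosen here
 * by the NDEBUG test.
 */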
81 DEFINE_PER_CPU(u32, ler_msr);
83 /* Master table, used by CPU0. */
84 idt_entry_t idt_table[IDT_ENTRIES];
86 /* Pointer to the IDT of every CPU. */
87 idt_entry_t *idt_tables[NR_CPUS] __read_mostly;
89 #define DECLARE_TRAP_HANDLER(_name) \
90 asmlinkage void _name(void); \
91 asmlinkage void do_ ## _name(struct cpu_user_regs *regs)
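/*
 * For reference (illustrative annotation, not in the original file):
 * DECLARE_TRAP_HANDLER(divide_error), for example, expands to the two
 * prototypes
 *
 *     asmlinkage void divide_error(void);
 *     asmlinkage void do_divide_error(struct cpu_user_regs *regs);
 *
 * i.e. the assembly entry stub and the C handler defined later in this file.
 */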
93 DECLARE_TRAP_HANDLER(divide_error);
94 DECLARE_TRAP_HANDLER(debug);
95 DECLARE_TRAP_HANDLER(nmi);
96 DECLARE_TRAP_HANDLER(int3);
97 DECLARE_TRAP_HANDLER(overflow);
98 DECLARE_TRAP_HANDLER(bounds);
99 DECLARE_TRAP_HANDLER(invalid_op);
100 DECLARE_TRAP_HANDLER(device_not_available);
101 DECLARE_TRAP_HANDLER(coprocessor_segment_overrun);
102 DECLARE_TRAP_HANDLER(invalid_TSS);
103 DECLARE_TRAP_HANDLER(segment_not_present);
104 DECLARE_TRAP_HANDLER(stack_segment);
105 DECLARE_TRAP_HANDLER(general_protection);
106 DECLARE_TRAP_HANDLER(page_fault);
107 DECLARE_TRAP_HANDLER(coprocessor_error);
108 DECLARE_TRAP_HANDLER(simd_coprocessor_error);
109 DECLARE_TRAP_HANDLER(machine_check);
110 DECLARE_TRAP_HANDLER(alignment_check);
111 DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
113 long do_set_debugreg(int reg, unsigned long value);
114 unsigned long do_get_debugreg(int reg);
115 void (*ioemul_handle_quirk)(
116 u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs);
118 static int debug_stack_lines = 20;
119 integer_param("debug_stack_lines", debug_stack_lines);
121 static int opt_ler;
122 boolean_param("ler", opt_ler);
124 #ifdef CONFIG_X86_32
125 #define stack_words_per_line 8
126 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
127 #else
128 #define stack_words_per_line 4
129 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
130 #endif
132 static void show_guest_stack(struct cpu_user_regs *regs)
133 {
134 int i;
135 struct vcpu *curr = current;
136 unsigned long *stack, addr;
138 if ( is_hvm_vcpu(curr) )
139 return;
141 if ( is_pv_32on64_vcpu(curr) )
142 {
143 compat_show_guest_stack(regs, debug_stack_lines);
144 return;
145 }
147 if ( vm86_mode(regs) )
148 {
149 stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff));
150 printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ",
151 regs->ss, (uint16_t)(regs->esp & 0xffff));
152 }
153 else
154 {
155 stack = (unsigned long *)regs->esp;
156 printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
157 }
159 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
160 {
161 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
162 break;
163 if ( get_user(addr, stack) )
164 {
165 if ( i != 0 )
166 printk("\n ");
167 printk("Fault while accessing guest memory.");
168 i = 1;
169 break;
170 }
171 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
172 printk("\n ");
173 printk(" %p", _p(addr));
174 stack++;
175 }
176 if ( i == 0 )
177 printk("Stack empty.");
178 printk("\n");
179 }
181 #if !defined(CONFIG_FRAME_POINTER)
183 static void show_trace(struct cpu_user_regs *regs)
184 {
185 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
187 printk("Xen call trace:\n ");
189 printk("[<%p>]", _p(regs->eip));
190 print_symbol(" %s\n ", regs->eip);
192 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
193 {
194 addr = *stack++;
195 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
196 {
197 printk("[<%p>]", _p(addr));
198 print_symbol(" %s\n ", addr);
199 }
200 }
202 printk("\n");
203 }
205 #else
207 static void show_trace(struct cpu_user_regs *regs)
208 {
209 unsigned long *frame, next, addr, low, high;
211 printk("Xen call trace:\n ");
213 printk("[<%p>]", _p(regs->eip));
214 print_symbol(" %s\n ", regs->eip);
216 /* Bounds for range of valid frame pointer. */
217 low = (unsigned long)(ESP_BEFORE_EXCEPTION(regs) - 2);
218 high = (low & ~(STACK_SIZE - 1)) +
219 (STACK_SIZE - sizeof(struct cpu_info) - 2*sizeof(unsigned long));
221 /* The initial frame pointer. */
222 next = regs->ebp;
224 for ( ; ; )
225 {
226 /* Valid frame pointer? */
227 if ( (next < low) || (next >= high) )
228 {
229 /*
230 * Exception stack frames have a different layout, denoted by an
231 * inverted frame pointer.
232 */
233 next = ~next;
234 if ( (next < low) || (next >= high) )
235 break;
236 frame = (unsigned long *)next;
237 next = frame[0];
238 addr = frame[(offsetof(struct cpu_user_regs, eip) -
239 offsetof(struct cpu_user_regs, ebp))
240 / BYTES_PER_LONG];
241 }
242 else
243 {
244 /* Ordinary stack frame. */
245 frame = (unsigned long *)next;
246 next = frame[0];
247 addr = frame[1];
248 }
250 printk("[<%p>]", _p(addr));
251 print_symbol(" %s\n ", addr);
253 low = (unsigned long)&frame[2];
254 }
256 printk("\n");
257 }
259 #endif
261 void show_stack(struct cpu_user_regs *regs)
262 {
263 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
264 int i;
266 if ( guest_mode(regs) )
267 return show_guest_stack(regs);
269 printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
271 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
272 {
273 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
274 break;
275 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
276 printk("\n ");
277 addr = *stack++;
278 printk(" %p", _p(addr));
279 }
280 if ( i == 0 )
281 printk("Stack empty.");
282 printk("\n");
284 show_trace(regs);
285 }
287 void show_stack_overflow(unsigned int cpu, unsigned long esp)
288 {
289 #ifdef MEMORY_GUARD
290 unsigned long esp_top, esp_bottom;
291 unsigned long *stack, addr;
293 esp_bottom = (esp | (STACK_SIZE - 1)) + 1;
294 esp_top = esp_bottom - PRIMARY_STACK_SIZE;
296 printk("Valid stack range: %p-%p, sp=%p, tss.esp0=%p\n",
297 (void *)esp_top, (void *)esp_bottom, (void *)esp,
298 (void *)init_tss[cpu].esp0);
300 /* Trigger overflow trace if %esp is within 512 bytes of the guard page. */
301 if ( ((unsigned long)(esp - esp_top) > 512) &&
302 ((unsigned long)(esp_top - esp) > 512) )
303 {
304 printk("No stack overflow detected. Skipping stack trace.\n");
305 return;
306 }
308 if ( esp < esp_top )
309 esp = esp_top;
311 printk("Xen stack overflow (dumping trace %p-%p):\n ",
312 (void *)esp, (void *)esp_bottom);
314 stack = (unsigned long *)esp;
315 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
316 {
317 addr = *stack++;
318 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
319 {
320 printk("%p: [<%p>]", stack, _p(addr));
321 print_symbol(" %s\n ", addr);
322 }
323 }
325 printk("\n");
326 #endif
327 }
329 void show_execution_state(struct cpu_user_regs *regs)
330 {
331 show_registers(regs);
332 show_stack(regs);
333 }
335 void vcpu_show_execution_state(struct vcpu *v)
336 {
337 printk("*** Dumping Dom%d vcpu#%d state: ***\n",
338 v->domain->domain_id, v->vcpu_id);
340 if ( v == current )
341 {
342 show_execution_state(guest_cpu_user_regs());
343 return;
344 }
346 vcpu_pause(v); /* acceptably dangerous */
348 vcpu_show_registers(v);
349 /* Todo: map arbitrary vcpu's top guest stack page here. */
350 if ( (v->domain == current->domain) &&
351 guest_kernel_mode(v, &v->arch.guest_context.user_regs) )
352 show_guest_stack(&v->arch.guest_context.user_regs);
354 vcpu_unpause(v);
355 }
357 char *trapstr(int trapnr)
358 {
359 static char *strings[] = {
360 "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
361 "invalid opcode", "device not available", "double fault",
362 "coprocessor segment", "invalid tss", "segment not found",
363 "stack error", "general protection fault", "page fault",
364 "spurious interrupt", "coprocessor error", "alignment check",
365 "machine check", "simd error"
366 };
368 if ( (trapnr < 0) || (trapnr >= ARRAY_SIZE(strings)) )
369 return "???";
371 return strings[trapnr];
372 }
374 /*
375 * This is called for faults at very unexpected times (e.g., when interrupts
376 * are disabled). In such situations we can't do much that is safe. We try to
377 * print out some tracing and then we just spin.
378 */
379 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs)
380 {
381 static DEFINE_PER_CPU(char, depth);
383 /*
384 * In some cases, we can end up in a vicious cycle of fatal_trap()s
385 * within fatal_trap()s. We give the problem a couple of iterations to
386 * bottom out, and then we just panic.
387 */
388 if ( ++this_cpu(depth) < 3 )
389 {
390 watchdog_disable();
391 console_start_sync();
393 show_execution_state(regs);
395 if ( trapnr == TRAP_page_fault )
396 {
397 unsigned long cr2 = read_cr2();
398 printk("Faulting linear address: %p\n", _p(cr2));
399 show_page_walk(cr2);
400 }
401 }
403 panic("FATAL TRAP: vector = %d (%s)\n"
404 "[error_code=%04x] %s\n",
405 trapnr, trapstr(trapnr), regs->error_code,
406 (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
407 }
409 static void do_guest_trap(
410 int trapnr, const struct cpu_user_regs *regs, int use_error_code)
411 {
412 struct vcpu *v = current;
413 struct trap_bounce *tb;
414 const struct trap_info *ti;
416 trace_pv_trap(trapnr, regs->eip, use_error_code, regs->error_code);
418 tb = &v->arch.trap_bounce;
419 ti = &v->arch.guest_context.trap_ctxt[trapnr];
421 tb->flags = TBF_EXCEPTION;
422 tb->cs = ti->cs;
423 tb->eip = ti->address;
425 if ( use_error_code )
426 {
427 tb->flags |= TBF_EXCEPTION_ERRCODE;
428 tb->error_code = regs->error_code;
429 }
431 if ( TI_GET_IF(ti) )
432 tb->flags |= TBF_INTERRUPT;
434 if ( unlikely(null_trap_bounce(v, tb)) )
435 gdprintk(XENLOG_WARNING, "Unhandled %s fault/trap [#%d] "
436 "on VCPU %d [ec=%04x]\n",
437 trapstr(trapnr), trapnr, v->vcpu_id, regs->error_code);
438 }
440 static void instruction_done(
441 struct cpu_user_regs *regs, unsigned long eip, unsigned int bpmatch)
442 {
443 regs->eip = eip;
444 regs->eflags &= ~X86_EFLAGS_RF;
445 if ( bpmatch || (regs->eflags & X86_EFLAGS_TF) )
446 {
447 current->arch.guest_context.debugreg[6] |= bpmatch | 0xffff0ff0;
448 if ( regs->eflags & X86_EFLAGS_TF )
449 current->arch.guest_context.debugreg[6] |= 0x4000;
450 do_guest_trap(TRAP_debug, regs, 0);
451 }
452 }
454 static unsigned int check_guest_io_breakpoint(struct vcpu *v,
455 unsigned int port, unsigned int len)
456 {
457 unsigned int width, i, match = 0;
458 unsigned long start;
460 if ( !(v->arch.guest_context.debugreg[5]) ||
461 !(v->arch.guest_context.ctrlreg[4] & X86_CR4_DE) )
462 return 0;
464 for ( i = 0; i < 4; i++ )
465 {
466 if ( !(v->arch.guest_context.debugreg[5] &
467 (3 << (i * DR_ENABLE_SIZE))) )
468 continue;
470 start = v->arch.guest_context.debugreg[i];
471 width = 0;
473 switch ( (v->arch.guest_context.debugreg[7] >>
474 (DR_CONTROL_SHIFT + i * DR_CONTROL_SIZE)) & 0xc )
475 {
476 case DR_LEN_1: width = 1; break;
477 case DR_LEN_2: width = 2; break;
478 case DR_LEN_4: width = 4; break;
479 case DR_LEN_8: width = 8; break;
480 }
482 if ( (start < (port + len)) && ((start + width) > port) )
483 match |= 1 << i;
484 }
486 return match;
487 }
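/*
 * Worked example (illustrative annotation): if the guest has enabled DR0 as
 * an I/O breakpoint at 0x60 with length DR_LEN_1 (width 1), then an access
 * to port 0x60 of length 1 satisfies (0x60 < 0x60 + 1) && (0x60 + 1 > 0x60),
 * so bit 0 is set in the returned mask; instruction_done() later merges that
 * mask into the guest's virtual %dr6.
 */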
489 /*
490 * Called from asm to set up the MCE trapbounce info.
491 * Returns 0 if no callback is set up, else 1.
492 */
493 asmlinkage int set_guest_machinecheck_trapbounce(void)
494 {
495 struct vcpu *v = current;
496 struct trap_bounce *tb = &v->arch.trap_bounce;
498 do_guest_trap(TRAP_machine_check, guest_cpu_user_regs(), 0);
499 tb->flags &= ~TBF_EXCEPTION; /* not needed for MCE delivery path */
500 return !null_trap_bounce(v, tb);
501 }
503 /*
504 * Called from asm to set up the NMI trapbounce info.
505 * Returns 0 if no callback is set up, else 1.
506 */
507 asmlinkage int set_guest_nmi_trapbounce(void)
508 {
509 struct vcpu *v = current;
510 struct trap_bounce *tb = &v->arch.trap_bounce;
511 do_guest_trap(TRAP_nmi, guest_cpu_user_regs(), 0);
512 tb->flags &= ~TBF_EXCEPTION; /* not needed for NMI delivery path */
513 return !null_trap_bounce(v, tb);
514 }
516 static inline void do_trap(
517 int trapnr, struct cpu_user_regs *regs, int use_error_code)
518 {
519 struct vcpu *curr = current;
520 unsigned long fixup;
522 DEBUGGER_trap_entry(trapnr, regs);
524 if ( guest_mode(regs) )
525 {
526 do_guest_trap(trapnr, regs, use_error_code);
527 return;
528 }
530 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
531 {
532 dprintk(XENLOG_ERR, "Trap %d: %p -> %p\n",
533 trapnr, _p(regs->eip), _p(fixup));
534 regs->eip = fixup;
535 return;
536 }
538 if ( ((trapnr == TRAP_copro_error) || (trapnr == TRAP_simd_error)) &&
539 is_hvm_vcpu(curr) && curr->arch.hvm_vcpu.fpu_exception_callback )
540 {
541 curr->arch.hvm_vcpu.fpu_exception_callback(
542 curr->arch.hvm_vcpu.fpu_exception_callback_arg, regs);
543 return;
544 }
546 DEBUGGER_trap_fatal(trapnr, regs);
548 show_execution_state(regs);
549 panic("FATAL TRAP: vector = %d (%s)\n"
550 "[error_code=%04x]\n",
551 trapnr, trapstr(trapnr), regs->error_code);
552 }
554 #define DO_ERROR_NOCODE(trapnr, name) \
555 asmlinkage void do_##name(struct cpu_user_regs *regs) \
556 { \
557 do_trap(trapnr, regs, 0); \
558 }
560 #define DO_ERROR(trapnr, name) \
561 asmlinkage void do_##name(struct cpu_user_regs *regs) \
562 { \
563 do_trap(trapnr, regs, 1); \
564 }
566 DO_ERROR_NOCODE(TRAP_divide_error, divide_error)
567 DO_ERROR_NOCODE(TRAP_overflow, overflow)
568 DO_ERROR_NOCODE(TRAP_bounds, bounds)
569 DO_ERROR_NOCODE(TRAP_copro_seg, coprocessor_segment_overrun)
570 DO_ERROR( TRAP_invalid_tss, invalid_TSS)
571 DO_ERROR( TRAP_no_segment, segment_not_present)
572 DO_ERROR( TRAP_stack_error, stack_segment)
573 DO_ERROR_NOCODE(TRAP_copro_error, coprocessor_error)
574 DO_ERROR( TRAP_alignment_check, alignment_check)
575 DO_ERROR_NOCODE(TRAP_simd_error, simd_coprocessor_error)
577 int rdmsr_hypervisor_regs(
578 uint32_t idx, uint32_t *eax, uint32_t *edx)
579 {
580 struct domain *d = current->domain;
581 /* Optionally shift out of the way of Viridian architectural MSRs. */
582 uint32_t base = is_viridian_domain(d) ? 0x40000200 : 0x40000000;
584 idx -= base;
585 if ( idx > 0 )
586 return 0;
588 switch ( idx )
589 {
590 case 0:
591 {
592 *eax = *edx = 0;
593 break;
594 }
595 default:
596 BUG();
597 }
599 return 1;
600 }
602 int wrmsr_hypervisor_regs(
603 uint32_t idx, uint32_t eax, uint32_t edx)
604 {
605 struct domain *d = current->domain;
606 /* Optionally shift out of the way of Viridian architectural MSRs. */
607 uint32_t base = is_viridian_domain(d) ? 0x40000200 : 0x40000000;
609 idx -= base;
610 if ( idx > 0 )
611 return 0;
613 switch ( idx )
614 {
615 case 0:
616 {
617 void *hypercall_page;
618 unsigned long mfn;
619 unsigned long gmfn = ((unsigned long)edx << 20) | (eax >> 12);
620 unsigned int idx = eax & 0xfff;
622 if ( idx > 0 )
623 {
624 gdprintk(XENLOG_WARNING,
625 "Out of range index %u to MSR %08x\n",
626 idx, 0x40000000);
627 return 0;
628 }
630 mfn = gmfn_to_mfn(d, gmfn);
632 if ( !mfn_valid(mfn) ||
633 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
634 {
635 gdprintk(XENLOG_WARNING,
636 "Bad GMFN %lx (MFN %lx) to MSR %08x\n",
637 gmfn, mfn, base + idx);
638 return 0;
639 }
641 hypercall_page = map_domain_page(mfn);
642 hypercall_page_initialise(d, hypercall_page);
643 unmap_domain_page(hypercall_page);
645 put_page_and_type(mfn_to_page(mfn));
646 break;
647 }
649 default:
650 BUG();
651 }
653 return 1;
654 }
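/*
 * Guest-side sketch (illustrative; "wrmsrl" and "hypercall_page_gpa" are
 * assumed names, not taken from this file): to have Xen fill in its
 * hypercall page, a guest writes the guest-physical address of a page it
 * owns to the base MSR (0x40000000, or 0x40000200 for Viridian domains),
 * with the low 12 bits selecting the page index (only index 0 is accepted
 * above):
 *
 *     uint64_t gpa = hypercall_page_gpa;     // page-aligned guest address
 *     wrmsrl(0x40000000, gpa | 0);           // edx:eax = gpa | page index
 *
 * wrmsr_hypervisor_regs() then resolves the GMFN, maps the page and fills
 * it via hypercall_page_initialise().
 */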
656 int cpuid_hypervisor_leaves(
657 uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
658 {
659 struct domain *d = current->domain;
660 /* Optionally shift out of the way of Viridian architectural leaves. */
661 uint32_t base = is_viridian_domain(d) ? 0x40000100 : 0x40000000;
663 idx -= base;
664 if ( idx > 2 )
665 return 0;
667 switch ( idx )
668 {
669 case 0:
670 *eax = base + 2; /* Largest leaf */
671 *ebx = XEN_CPUID_SIGNATURE_EBX;
672 *ecx = XEN_CPUID_SIGNATURE_ECX;
673 *edx = XEN_CPUID_SIGNATURE_EDX;
674 break;
676 case 1:
677 *eax = (xen_major_version() << 16) | xen_minor_version();
678 *ebx = 0; /* Reserved */
679 *ecx = 0; /* Reserved */
680 *edx = 0; /* Reserved */
681 break;
683 case 2:
684 *eax = 1; /* Number of hypercall-transfer pages */
685 *ebx = 0x40000000; /* MSR base address */
686 if ( is_viridian_domain(d) )
687 *ebx = 0x40000200;
688 *ecx = 0; /* Features 1 */
689 *edx = 0; /* Features 2 */
690 if ( !is_hvm_vcpu(current) )
691 *ecx |= XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD;
692 break;
694 default:
695 BUG();
696 }
698 return 1;
699 }
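/*
 * Guest-side sketch (illustrative; the cpuid() helper is assumed, not taken
 * from this file): a guest probes these leaves by issuing CPUID at the base
 * (0x40000000, or 0x40000100 for Viridian domains) and comparing the
 * signature registers, then reads base+1 for the Xen version and base+2 for
 * the hypercall-page MSR handled by wrmsr_hypervisor_regs() above:
 *
 *     uint32_t eax, ebx, ecx, edx;
 *     cpuid(0x40000000, &eax, &ebx, &ecx, &edx);
 *     if ( (ebx == XEN_CPUID_SIGNATURE_EBX) &&
 *          (ecx == XEN_CPUID_SIGNATURE_ECX) &&
 *          (edx == XEN_CPUID_SIGNATURE_EDX) )
 *     {
 *         cpuid(0x40000001, &eax, &ebx, &ecx, &edx);
 *         // eax = (major << 16) | minor, per case 1 above
 *     }
 */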
701 static void pv_cpuid(struct cpu_user_regs *regs)
702 {
703 uint32_t a, b, c, d;
705 a = regs->eax;
706 b = regs->ebx;
707 c = regs->ecx;
708 d = regs->edx;
710 if ( current->domain->domain_id != 0 )
711 {
712 if ( !cpuid_hypervisor_leaves(a, &a, &b, &c, &d) )
713 domain_cpuid(current->domain, a, c, &a, &b, &c, &d);
714 goto out;
715 }
717 asm (
718 "cpuid"
719 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
720 : "0" (a), "1" (b), "2" (c), "3" (d) );
722 if ( (regs->eax & 0x7fffffff) == 1 )
723 {
724 /* Modify Feature Information. */
725 __clear_bit(X86_FEATURE_VME, &d);
726 if ( !cpu_has_apic )
727 __clear_bit(X86_FEATURE_APIC, &d);
728 if ( !opt_allow_hugepage )
729 __clear_bit(X86_FEATURE_PSE, &d);
730 __clear_bit(X86_FEATURE_PGE, &d);
731 __clear_bit(X86_FEATURE_PSE36, &d);
732 }
733 switch ( (uint32_t)regs->eax )
734 {
735 case 1:
736 /* Modify Feature Information. */
737 if ( !cpu_has_sep )
738 __clear_bit(X86_FEATURE_SEP, &d);
739 #ifdef __i386__
740 if ( !supervisor_mode_kernel )
741 __clear_bit(X86_FEATURE_SEP, &d);
742 #endif
743 __clear_bit(X86_FEATURE_DS, &d);
744 __clear_bit(X86_FEATURE_ACC, &d);
745 __clear_bit(X86_FEATURE_PBE, &d);
747 __clear_bit(X86_FEATURE_DTES64 % 32, &c);
748 __clear_bit(X86_FEATURE_MWAIT % 32, &c);
749 __clear_bit(X86_FEATURE_DSCPL % 32, &c);
750 __clear_bit(X86_FEATURE_VMXE % 32, &c);
751 __clear_bit(X86_FEATURE_SMXE % 32, &c);
752 __clear_bit(X86_FEATURE_TM2 % 32, &c);
753 if ( is_pv_32bit_vcpu(current) )
754 __clear_bit(X86_FEATURE_CX16 % 32, &c);
755 __clear_bit(X86_FEATURE_XTPR % 32, &c);
756 __clear_bit(X86_FEATURE_PDCM % 32, &c);
757 __clear_bit(X86_FEATURE_DCA % 32, &c);
758 __clear_bit(X86_FEATURE_XSAVE % 32, &c);
759 if ( !cpu_has_apic )
760 __clear_bit(X86_FEATURE_X2APIC % 32, &c);
761 __set_bit(X86_FEATURE_HYPERVISOR % 32, &c);
762 break;
763 case 0x80000001:
764 /* Modify Feature Information. */
765 if ( is_pv_32bit_vcpu(current) )
766 {
767 __clear_bit(X86_FEATURE_LM % 32, &d);
768 __clear_bit(X86_FEATURE_LAHF_LM % 32, &c);
769 }
770 #ifndef __i386__
771 if ( is_pv_32on64_vcpu(current) &&
772 boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
773 #endif
774 __clear_bit(X86_FEATURE_SYSCALL % 32, &d);
775 __clear_bit(X86_FEATURE_PAGE1GB % 32, &d);
776 __clear_bit(X86_FEATURE_RDTSCP % 32, &d);
778 __clear_bit(X86_FEATURE_SVME % 32, &c);
779 if ( !cpu_has_apic )
780 __clear_bit(X86_FEATURE_EXTAPICSPACE % 32, &c);
781 __clear_bit(X86_FEATURE_OSVW % 32, &c);
782 __clear_bit(X86_FEATURE_IBS % 32, &c);
783 __clear_bit(X86_FEATURE_SKINIT % 32, &c);
784 __clear_bit(X86_FEATURE_WDT % 32, &c);
785 break;
786 case 5: /* MONITOR/MWAIT */
787 case 0xa: /* Architectural Performance Monitor Features */
788 case 0x8000000a: /* SVM revision and features */
789 case 0x8000001b: /* Instruction Based Sampling */
790 a = b = c = d = 0;
791 break;
792 default:
793 (void)cpuid_hypervisor_leaves(regs->eax, &a, &b, &c, &d);
794 break;
795 }
797 out:
798 regs->eax = a;
799 regs->ebx = b;
800 regs->ecx = c;
801 regs->edx = d;
802 }
804 static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
805 {
806 char sig[5], instr[2];
807 unsigned long eip, rc;
809 eip = regs->eip;
811 /* Check for forced emulation signature: ud2 ; .ascii "xen". */
812 if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
813 {
814 propagate_page_fault(eip + sizeof(sig) - rc, 0);
815 return EXCRET_fault_fixed;
816 }
817 if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) )
818 return 0;
819 eip += sizeof(sig);
821 /* We only emulate CPUID. */
822 if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
823 {
824 propagate_page_fault(eip + sizeof(instr) - rc, 0);
825 return EXCRET_fault_fixed;
826 }
827 if ( memcmp(instr, "\xf\xa2", sizeof(instr)) )
828 return 0;
829 eip += sizeof(instr);
831 pv_cpuid(regs);
833 instruction_done(regs, eip, 0);
835 trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->eip);
837 return EXCRET_fault_fixed;
838 }
840 asmlinkage void do_invalid_op(struct cpu_user_regs *regs)
841 {
842 struct bug_frame bug;
843 struct bug_frame_str bug_str;
844 const char *filename, *predicate, *eip = (char *)regs->eip;
845 unsigned long fixup;
846 int id, lineno;
848 DEBUGGER_trap_entry(TRAP_invalid_op, regs);
850 if ( likely(guest_mode(regs)) )
851 {
852 if ( !emulate_forced_invalid_op(regs) )
853 do_guest_trap(TRAP_invalid_op, regs, 0);
854 return;
855 }
857 if ( !is_kernel(eip) ||
858 __copy_from_user(&bug, eip, sizeof(bug)) ||
859 memcmp(bug.ud2, "\xf\xb", sizeof(bug.ud2)) ||
860 (bug.ret != 0xc2) )
861 goto die;
862 eip += sizeof(bug);
864 id = bug.id & 3;
866 if ( id == BUGFRAME_dump )
867 {
868 show_execution_state(regs);
869 regs->eip = (unsigned long)eip;
870 return;
871 }
873 /* WARN, BUG or ASSERT: decode the filename pointer and line number. */
874 if ( !is_kernel(eip) ||
875 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
876 (bug_str.mov != 0xbc) )
877 goto die;
878 filename = bug_str(bug_str, eip);
879 eip += sizeof(bug_str);
881 if ( !is_kernel(filename) )
882 filename = "<unknown>";
883 lineno = bug.id >> 2;
885 if ( id == BUGFRAME_warn )
886 {
887 printk("Xen WARN at %.50s:%d\n", filename, lineno);
888 show_execution_state(regs);
889 regs->eip = (unsigned long)eip;
890 return;
891 }
893 if ( id == BUGFRAME_bug )
894 {
895 printk("Xen BUG at %.50s:%d\n", filename, lineno);
896 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
897 show_execution_state(regs);
898 panic("Xen BUG at %.50s:%d\n", filename, lineno);
899 }
901 /* ASSERT: decode the predicate string pointer. */
902 ASSERT(id == BUGFRAME_assert);
903 if ( !is_kernel(eip) ||
904 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
905 (bug_str.mov != 0xbc) )
906 goto die;
907 predicate = bug_str(bug_str, eip);
908 eip += sizeof(bug_str);
910 if ( !is_kernel(predicate) )
911 predicate = "<unknown>";
912 printk("Assertion '%s' failed at %.50s:%d\n",
913 predicate, filename, lineno);
914 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
915 show_execution_state(regs);
916 panic("Assertion '%s' failed at %.50s:%d\n",
917 predicate, filename, lineno);
919 die:
920 if ( (fixup = search_exception_table(regs->eip)) != 0 )
921 {
922 regs->eip = fixup;
923 return;
924 }
925 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
926 show_execution_state(regs);
927 panic("FATAL TRAP: vector = %d (invalid opcode)\n", TRAP_invalid_op);
928 }
930 asmlinkage void do_int3(struct cpu_user_regs *regs)
931 {
932 DEBUGGER_trap_entry(TRAP_int3, regs);
934 if ( !guest_mode(regs) )
935 {
936 debugger_trap_fatal(TRAP_int3, regs);
937 return;
938 }
940 do_guest_trap(TRAP_int3, regs, 0);
941 }
943 asmlinkage void do_machine_check(struct cpu_user_regs *regs)
944 {
945 machine_check_vector(regs, regs->error_code);
946 }
948 static void reserved_bit_page_fault(
949 unsigned long addr, struct cpu_user_regs *regs)
950 {
951 printk("d%d:v%d: reserved bit in page table (ec=%04X)\n",
952 current->domain->domain_id, current->vcpu_id, regs->error_code);
953 show_page_walk(addr);
954 show_execution_state(regs);
955 }
957 void propagate_page_fault(unsigned long addr, u16 error_code)
958 {
959 struct trap_info *ti;
960 struct vcpu *v = current;
961 struct trap_bounce *tb = &v->arch.trap_bounce;
963 v->arch.guest_context.ctrlreg[2] = addr;
964 arch_set_cr2(v, addr);
966 /* Re-set error_code.user flag appropriately for the guest. */
967 error_code &= ~PFEC_user_mode;
968 if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
969 error_code |= PFEC_user_mode;
971 trace_pv_page_fault(addr, error_code);
973 ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
974 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
975 tb->error_code = error_code;
976 tb->cs = ti->cs;
977 tb->eip = ti->address;
978 if ( TI_GET_IF(ti) )
979 tb->flags |= TBF_INTERRUPT;
980 if ( unlikely(null_trap_bounce(v, tb)) )
981 {
982 printk("d%d:v%d: unhandled page fault (ec=%04X)\n",
983 v->domain->domain_id, v->vcpu_id, error_code);
984 show_page_walk(addr);
985 }
987 if ( unlikely(error_code & PFEC_reserved_bit) )
988 reserved_bit_page_fault(addr, guest_cpu_user_regs());
989 }
991 static int handle_gdt_ldt_mapping_fault(
992 unsigned long offset, struct cpu_user_regs *regs)
993 {
994 struct vcpu *curr = current;
995 /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
996 unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
997 unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
999 /* Should never fault in another vcpu's area. */
1000 BUG_ON(vcpu_area != curr->vcpu_id);
1002 /* Byte offset within the gdt/ldt sub-area. */
1003 offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
1005 if ( likely(is_ldt_area) )
1006 {
1007 /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
1008 if ( likely(map_ldt_shadow_page(offset >> PAGE_SHIFT)) )
1009 {
1010 if ( guest_mode(regs) )
1011 trace_trap_two_addr(TRC_PV_GDT_LDT_MAPPING_FAULT,
1012 regs->eip, offset);
1013 }
1014 else
1015 {
1016 /* In hypervisor mode? Leave it to the #PF handler to fix up. */
1017 if ( !guest_mode(regs) )
1018 return 0;
1019 /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */
1020 propagate_page_fault(
1021 curr->arch.guest_context.ldt_base + offset,
1022 regs->error_code);
1023 }
1024 }
1025 else
1026 {
1027 /* GDT fault: handle the fault as #GP(selector). */
1028 regs->error_code = (u16)offset & ~7;
1029 (void)do_general_protection(regs);
1030 }
1032 return EXCRET_fault_fixed;
1033 }
1035 #ifdef HYPERVISOR_VIRT_END
1036 #define IN_HYPERVISOR_RANGE(va) \
1037 (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
1038 #else
1039 #define IN_HYPERVISOR_RANGE(va) \
1040 (((va) >= HYPERVISOR_VIRT_START))
1041 #endif
1043 static int __spurious_page_fault(
1044 unsigned long addr, unsigned int error_code)
1045 {
1046 unsigned long mfn, cr3 = read_cr3();
1047 #if CONFIG_PAGING_LEVELS >= 4
1048 l4_pgentry_t l4e, *l4t;
1049 #endif
1050 #if CONFIG_PAGING_LEVELS >= 3
1051 l3_pgentry_t l3e, *l3t;
1052 #endif
1053 l2_pgentry_t l2e, *l2t;
1054 l1_pgentry_t l1e, *l1t;
1055 unsigned int required_flags, disallowed_flags;
1057 /*
1058 * We do not take spurious page faults in IRQ handlers as we do not
1059 * modify page tables in IRQ context. We therefore bail here because
1060 * map_domain_page() is not IRQ-safe.
1061 */
1062 if ( in_irq() )
1063 return 0;
1065 /* Reserved bit violations are never spurious faults. */
1066 if ( error_code & PFEC_reserved_bit )
1067 return 0;
1069 required_flags = _PAGE_PRESENT;
1070 if ( error_code & PFEC_write_access )
1071 required_flags |= _PAGE_RW;
1072 if ( error_code & PFEC_user_mode )
1073 required_flags |= _PAGE_USER;
1075 disallowed_flags = 0;
1076 if ( error_code & PFEC_insn_fetch )
1077 disallowed_flags |= _PAGE_NX;
1079 mfn = cr3 >> PAGE_SHIFT;
1081 #if CONFIG_PAGING_LEVELS >= 4
1082 l4t = map_domain_page(mfn);
1083 l4e = l4e_read_atomic(&l4t[l4_table_offset(addr)]);
1084 mfn = l4e_get_pfn(l4e);
1085 unmap_domain_page(l4t);
1086 if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
1087 (l4e_get_flags(l4e) & disallowed_flags) )
1088 return 0;
1089 #endif
1091 #if CONFIG_PAGING_LEVELS >= 3
1092 l3t = map_domain_page(mfn);
1093 #if CONFIG_PAGING_LEVELS == 3
1094 l3t += (cr3 & 0xFE0UL) >> 3;
1095 #endif
1096 l3e = l3e_read_atomic(&l3t[l3_table_offset(addr)]);
1097 mfn = l3e_get_pfn(l3e);
1098 unmap_domain_page(l3t);
1099 #if CONFIG_PAGING_LEVELS == 3
1100 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
1101 return 0;
1102 #else
1103 if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
1104 (l3e_get_flags(l3e) & disallowed_flags) )
1105 return 0;
1106 #endif
1107 #endif
1109 l2t = map_domain_page(mfn);
1110 l2e = l2e_read_atomic(&l2t[l2_table_offset(addr)]);
1111 mfn = l2e_get_pfn(l2e);
1112 unmap_domain_page(l2t);
1113 if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
1114 (l2e_get_flags(l2e) & disallowed_flags) )
1115 return 0;
1116 if ( l2e_get_flags(l2e) & _PAGE_PSE )
1117 {
1118 l1e = l1e_empty(); /* define before use in debug tracing */
1119 goto spurious;
1120 }
1122 l1t = map_domain_page(mfn);
1123 l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
1124 mfn = l1e_get_pfn(l1e);
1125 unmap_domain_page(l1t);
1126 if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
1127 (l1e_get_flags(l1e) & disallowed_flags) )
1128 return 0;
1130 spurious:
1131 dprintk(XENLOG_WARNING, "Spurious fault in domain %u:%u "
1132 "at addr %lx, e/c %04x\n",
1133 current->domain->domain_id, current->vcpu_id,
1134 addr, error_code);
1135 #if CONFIG_PAGING_LEVELS >= 4
1136 dprintk(XENLOG_WARNING, " l4e = %"PRIpte"\n", l4e_get_intpte(l4e));
1137 #endif
1138 #if CONFIG_PAGING_LEVELS >= 3
1139 dprintk(XENLOG_WARNING, " l3e = %"PRIpte"\n", l3e_get_intpte(l3e));
1140 #endif
1141 dprintk(XENLOG_WARNING, " l2e = %"PRIpte"\n", l2e_get_intpte(l2e));
1142 dprintk(XENLOG_WARNING, " l1e = %"PRIpte"\n", l1e_get_intpte(l1e));
1143 return 1;
1144 }
1146 static int spurious_page_fault(
1147 unsigned long addr, unsigned int error_code)
1148 {
1149 unsigned long flags;
1150 int is_spurious;
1152 /*
1153 * Disabling interrupts prevents TLB flushing, and hence prevents
1154 * page tables from becoming invalid under our feet during the walk.
1155 */
1156 local_irq_save(flags);
1157 is_spurious = __spurious_page_fault(addr, error_code);
1158 local_irq_restore(flags);
1160 return is_spurious;
1161 }
1163 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
1164 {
1165 struct vcpu *v = current;
1166 struct domain *d = v->domain;
1168 /* No fixups in interrupt context or when interrupts are disabled. */
1169 if ( in_irq() || !(regs->eflags & X86_EFLAGS_IF) )
1170 return 0;
1172 /* Faults from external-mode guests are handled by shadow/hap */
1173 if ( paging_mode_external(d) && guest_mode(regs) )
1174 {
1175 int ret = paging_fault(addr, regs);
1176 if ( ret == EXCRET_fault_fixed )
1177 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
1178 return ret;
1179 }
1181 if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
1182 {
1183 if ( !(regs->error_code & PFEC_reserved_bit) &&
1184 (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
1185 return handle_gdt_ldt_mapping_fault(
1186 addr - GDT_LDT_VIRT_START, regs);
1187 return 0;
1188 }
1190 if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
1191 guest_kernel_mode(v, regs) &&
1192 /* Do not check if access-protection fault since the page may
1193 legitimately be not present in shadow page tables */
1194 ((regs->error_code & (PFEC_write_access|PFEC_reserved_bit)) ==
1195 PFEC_write_access) &&
1196 ptwr_do_page_fault(v, addr, regs) )
1197 return EXCRET_fault_fixed;
1199 /* For non-external shadowed guests, we fix up both their own
1200 * pagefaults and Xen's, since they share the pagetables. */
1201 if ( paging_mode_enabled(d) && !paging_mode_external(d) )
1202 {
1203 int ret = paging_fault(addr, regs);
1204 if ( ret == EXCRET_fault_fixed )
1205 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
1206 return ret;
1207 }
1209 return 0;
1210 }
1212 /*
1213 * #PF error code:
1214 * Bit 0: Protection violation (=1) ; Page not present (=0)
1215 * Bit 1: Write access
1216 * Bit 2: User mode (=1) ; Supervisor mode (=0)
1217 * Bit 3: Reserved bit violation
1218 * Bit 4: Instruction fetch
1219 */
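/*
 * Example (illustrative annotation): error code 0x0003 (bits 0 and 1) means
 * a supervisor-mode write hit a present mapping that disallows the access;
 * 0x0010 (bit 4) means an instruction fetch from a not-present page.
 */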
1220 asmlinkage void do_page_fault(struct cpu_user_regs *regs)
1221 {
1222 unsigned long addr, fixup;
1223 unsigned int error_code;
1225 addr = read_cr2();
1227 /* fixup_page_fault() might change regs->error_code, so cache it here. */
1228 error_code = regs->error_code;
1230 DEBUGGER_trap_entry(TRAP_page_fault, regs);
1232 perfc_incr(page_faults);
1234 if ( unlikely(fixup_page_fault(addr, regs) != 0) )
1235 return;
1237 if ( unlikely(!guest_mode(regs)) )
1238 {
1239 if ( spurious_page_fault(addr, error_code) )
1240 return;
1242 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
1243 {
1244 perfc_incr(copy_user_faults);
1245 if ( unlikely(regs->error_code & PFEC_reserved_bit) )
1246 reserved_bit_page_fault(addr, regs);
1247 regs->eip = fixup;
1248 return;
1249 }
1251 DEBUGGER_trap_fatal(TRAP_page_fault, regs);
1253 show_execution_state(regs);
1254 show_page_walk(addr);
1255 panic("FATAL PAGE FAULT\n"
1256 "[error_code=%04x]\n"
1257 "Faulting linear address: %p\n",
1258 error_code, _p(addr));
1259 }
1261 if ( unlikely(current->domain->arch.suppress_spurious_page_faults
1262 && spurious_page_fault(addr, error_code)) )
1263 return;
1265 propagate_page_fault(addr, regs->error_code);
1266 }
1268 /*
1269 * Early #PF handler to print CR2, error code, and stack.
1271 * We also deal with spurious faults here, even though they should never happen
1272 * during early boot (an issue was seen once, but was most likely a hardware
1273 * problem).
1274 */
1275 asmlinkage void do_early_page_fault(struct cpu_user_regs *regs)
1277 static int stuck;
1278 static unsigned long prev_eip, prev_cr2;
1279 unsigned long cr2 = read_cr2();
1281 BUG_ON(smp_processor_id() != 0);
1283 if ( (regs->eip != prev_eip) || (cr2 != prev_cr2) )
1285 prev_eip = regs->eip;
1286 prev_cr2 = cr2;
1287 stuck = 0;
1288 return;
1291 if ( stuck++ == 1000 )
1293 unsigned long *stk = (unsigned long *)regs;
1294 printk("Early fatal page fault at %04x:%p (cr2=%p, ec=%04x)\n",
1295 regs->cs, _p(regs->eip), _p(cr2), regs->error_code);
1296 printk("Stack dump: ");
1297 while ( ((long)stk & ((PAGE_SIZE - 1) & ~(BYTES_PER_LONG - 1))) != 0 )
1298 printk("%p ", _p(*stk++));
1299 for ( ; ; ) ;
1303 long do_fpu_taskswitch(int set)
1305 struct vcpu *v = current;
1307 if ( set )
1309 v->arch.guest_context.ctrlreg[0] |= X86_CR0_TS;
1310 stts();
1312 else
1314 v->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
1315 if ( v->fpu_dirtied )
1316 clts();
1319 return 0;
1322 static int read_descriptor(unsigned int sel,
1323 const struct vcpu *v,
1324 const struct cpu_user_regs * regs,
1325 unsigned long *base,
1326 unsigned long *limit,
1327 unsigned int *ar,
1328 unsigned int vm86attr)
1330 struct desc_struct desc;
1332 if ( !vm86_mode(regs) )
1334 if ( sel < 4)
1335 desc.b = desc.a = 0;
1336 else if ( __get_user(desc,
1337 (const struct desc_struct *)(!(sel & 4)
1338 ? GDT_VIRT_START(v)
1339 : LDT_VIRT_START(v))
1340 + (sel >> 3)) )
1341 return 0;
1342 if ( !(vm86attr & _SEGMENT_CODE) )
1343 desc.b &= ~_SEGMENT_L;
1345 else
1347 desc.a = (sel << 20) | 0xffff;
1348 desc.b = vm86attr | (sel >> 12);
1351 *ar = desc.b & 0x00f0ff00;
1352 if ( !(desc.b & _SEGMENT_L) )
1354 *base = ((desc.a >> 16) + ((desc.b & 0xff) << 16) +
1355 (desc.b & 0xff000000));
1356 *limit = (desc.a & 0xffff) | (desc.b & 0x000f0000);
1357 if ( desc.b & _SEGMENT_G )
1358 *limit = ((*limit + 1) << 12) - 1;
1359 #ifndef NDEBUG
1360 if ( !vm86_mode(regs) && (sel > 3) )
1362 unsigned int a, l;
1363 unsigned char valid;
1365 asm volatile (
1366 "larl %2,%0 ; setz %1"
1367 : "=r" (a), "=qm" (valid) : "rm" (sel));
1368 BUG_ON(valid && ((a & 0x00f0ff00) != *ar));
1369 asm volatile (
1370 "lsll %2,%0 ; setz %1"
1371 : "=r" (l), "=qm" (valid) : "rm" (sel));
1372 BUG_ON(valid && (l != *limit));
1374 #endif
1376 else
1378 *base = 0UL;
1379 *limit = ~0UL;
1382 return 1;
1385 #ifdef __x86_64__
1386 static int read_gate_descriptor(unsigned int gate_sel,
1387 const struct vcpu *v,
1388 unsigned int *sel,
1389 unsigned long *off,
1390 unsigned int *ar)
1392 struct desc_struct desc;
1393 const struct desc_struct *pdesc;
1396 pdesc = (const struct desc_struct *)
1397 (!(gate_sel & 4) ? GDT_VIRT_START(v) : LDT_VIRT_START(v))
1398 + (gate_sel >> 3);
1399 if ( (gate_sel < 4) ||
1400 ((gate_sel >= FIRST_RESERVED_GDT_BYTE) && !(gate_sel & 4)) ||
1401 __get_user(desc, pdesc) )
1402 return 0;
1404 *sel = (desc.a >> 16) & 0x0000fffc;
1405 *off = (desc.a & 0x0000ffff) | (desc.b & 0xffff0000);
1406 *ar = desc.b & 0x0000ffff;
1408 /*
1409 * check_descriptor() clears the DPL field and stores the
1410 * guest requested DPL in the selector's RPL field.
1411 */
1412 if ( *ar & _SEGMENT_DPL )
1413 return 0;
1414 *ar |= (desc.a >> (16 - 13)) & _SEGMENT_DPL;
1416 if ( !is_pv_32bit_vcpu(v) )
1418 if ( (*ar & 0x1f00) != 0x0c00 ||
1419 (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) ||
1420 __get_user(desc, pdesc + 1) ||
1421 (desc.b & 0x1f00) )
1422 return 0;
1424 *off |= (unsigned long)desc.a << 32;
1425 return 1;
1428 switch ( *ar & 0x1f00 )
1430 case 0x0400:
1431 *off &= 0xffff;
1432 break;
1433 case 0x0c00:
1434 break;
1435 default:
1436 return 0;
1439 return 1;
1441 #endif
1443 /* Has the guest requested sufficient permission for this I/O access? */
1444 static int guest_io_okay(
1445 unsigned int port, unsigned int bytes,
1446 struct vcpu *v, struct cpu_user_regs *regs)
1448 #if defined(__x86_64__)
1449 /* If in user mode, switch to kernel mode just to read I/O bitmap. */
1450 int user_mode = !(v->arch.flags & TF_kernel_mode);
1451 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
1452 #elif defined(__i386__)
1453 #define TOGGLE_MODE() ((void)0)
1454 #endif
1456 if ( !vm86_mode(regs) &&
1457 (v->arch.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
1458 return 1;
1460 if ( v->arch.iobmp_limit > (port + bytes) )
1462 union { uint8_t bytes[2]; uint16_t mask; } x;
1464 /*
1465 * Grab permission bytes from guest space. Inaccessible bytes are
1466 * read as 0xff (no access allowed).
1467 */
1468 TOGGLE_MODE();
1469 switch ( __copy_from_guest_offset(x.bytes, v->arch.iobmp,
1470 port>>3, 2) )
1472 default: x.bytes[0] = ~0;
1473 case 1: x.bytes[1] = ~0;
1474 case 0: break;
1476 TOGGLE_MODE();
1478 if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 )
1479 return 1;
1482 return 0;
1485 /* Has the administrator granted sufficient permission for this I/O access? */
1486 static int admin_io_okay(
1487 unsigned int port, unsigned int bytes,
1488 struct vcpu *v, struct cpu_user_regs *regs)
1490 /*
1491 * Port 0xcf8 (CONFIG_ADDRESS) is only visible for DWORD accesses.
1492 * We never permit direct access to that register.
1493 */
1494 if ( (port == 0xcf8) && (bytes == 4) )
1495 return 0;
1497 return ioports_access_permitted(v->domain, port, port + bytes - 1);
1500 static uint32_t guest_io_read(
1501 unsigned int port, unsigned int bytes,
1502 struct vcpu *v, struct cpu_user_regs *regs)
1504 extern uint32_t pci_conf_read(
1505 uint32_t cf8, uint8_t offset, uint8_t bytes);
1507 uint32_t data = 0;
1508 unsigned int shift = 0;
1510 if ( admin_io_okay(port, bytes, v, regs) )
1512 switch ( bytes )
1514 case 1: return inb(port);
1515 case 2: return inw(port);
1516 case 4: return inl(port);
1520 while ( bytes != 0 )
1522 unsigned int size = 1;
1523 uint32_t sub_data = 0xff;
1525 if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
1527 sub_data = pv_pit_handler(port, 0, 0);
1529 else if ( (port == 0xcf8) && (bytes == 4) )
1531 size = 4;
1532 sub_data = v->domain->arch.pci_cf8;
1534 else if ( ((port & 0xfffc) == 0xcfc) && IS_PRIV(v->domain) )
1536 size = min(bytes, 4 - (port & 3));
1537 if ( size == 3 )
1538 size = 2;
1539 sub_data = pci_conf_read(v->domain->arch.pci_cf8, port & 3, size);
1542 if ( size == 4 )
1543 return sub_data;
1545 data |= (sub_data & ((1u << (size * 8)) - 1)) << shift;
1546 shift += size * 8;
1547 port += size;
1548 bytes -= size;
1551 return data;
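/*
 * Background sketch (illustrative annotation): ports 0xcf8/0xcfc implement
 * PCI configuration mechanism #1. The guest writes an address dword to
 * 0xcf8 (cached in v->domain->arch.pci_cf8 by guest_io_write() below) and
 * then accesses the data through 0xcfc-0xcff; the address is encoded as
 *
 *     uint32_t cf8 = 0x80000000 | (bus << 16) | (dev << 11) |
 *                    (fn << 8) | (reg & 0xfc);
 *
 * pci_conf_read() replays such an access for a privileged guest using the
 * cached 0xcf8 value plus the 0xcfc-relative offset and size computed above.
 */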
1554 extern void (*pv_rtc_handler)(unsigned int port, uint8_t value);
1556 static void guest_io_write(
1557 unsigned int port, unsigned int bytes, uint32_t data,
1558 struct vcpu *v, struct cpu_user_regs *regs)
1560 extern void pci_conf_write(
1561 uint32_t cf8, uint8_t offset, uint8_t bytes, uint32_t data);
1563 if ( admin_io_okay(port, bytes, v, regs) )
1565 switch ( bytes ) {
1566 case 1:
1567 if ( ((port == 0x70) || (port == 0x71)) && pv_rtc_handler )
1568 pv_rtc_handler(port, (uint8_t)data);
1569 outb((uint8_t)data, port);
1570 if ( pv_post_outb_hook )
1571 pv_post_outb_hook(port, (uint8_t)data);
1572 break;
1573 case 2:
1574 outw((uint16_t)data, port);
1575 break;
1576 case 4:
1577 outl(data, port);
1578 break;
1580 return;
1583 while ( bytes != 0 )
1585 unsigned int size = 1;
1587 if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
1589 pv_pit_handler(port, (uint8_t)data, 1);
1591 else if ( (port == 0xcf8) && (bytes == 4) )
1593 size = 4;
1594 v->domain->arch.pci_cf8 = data;
1596 else if ( ((port & 0xfffc) == 0xcfc) && IS_PRIV(v->domain) )
1598 size = min(bytes, 4 - (port & 3));
1599 if ( size == 3 )
1600 size = 2;
1601 pci_conf_write(v->domain->arch.pci_cf8, port & 3, size, data);
1604 if ( size == 4 )
1605 return;
1607 port += size;
1608 bytes -= size;
1609 data >>= size * 8;
1613 /* I/O emulation support. Helper routines for, and type of, the stack stub.*/
1614 void host_to_guest_gpr_switch(struct cpu_user_regs *)
1615 __attribute__((__regparm__(1)));
1616 unsigned long guest_to_host_gpr_switch(unsigned long)
1617 __attribute__((__regparm__(1)));
1619 void (*pv_post_outb_hook)(unsigned int port, u8 value);
1621 /* Instruction fetch with error handling. */
1622 #define insn_fetch(type, base, eip, limit) \
1623 ({ unsigned long _rc, _ptr = (base) + (eip); \
1624 type _x; \
1625 if ( ad_default < 8 ) \
1626 _ptr = (unsigned int)_ptr; \
1627 if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) ) \
1628 goto fail; \
1629 if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 ) \
1630 { \
1631 propagate_page_fault(_ptr + sizeof(_x) - _rc, 0); \
1632 goto skip; \
1633 } \
1634 (eip) += sizeof(_x); _x; })
1636 #if defined(CONFIG_X86_32)
1637 # define read_sreg(regs, sr) ((regs)->sr)
1638 #elif defined(CONFIG_X86_64)
1639 # define read_sreg(regs, sr) read_segment_register(sr)
1640 #endif
1642 static int is_cpufreq_controller(struct domain *d)
1644 return ((cpufreq_controller == FREQCTL_dom0_kernel) &&
1645 (d->domain_id == 0));
1648 static int emulate_privileged_op(struct cpu_user_regs *regs)
1650 struct vcpu *v = current;
1651 unsigned long *reg, eip = regs->eip, res;
1652 u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
1653 enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
1654 unsigned int port, i, data_sel, ar, data, rc, bpmatch = 0;
1655 unsigned int op_bytes, op_default, ad_bytes, ad_default;
1656 #define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
1657 ? regs->reg \
1658 : ad_bytes == 4 \
1659 ? (u32)regs->reg \
1660 : (u16)regs->reg)
1661 #define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
1662 ? regs->reg = (val) \
1663 : ad_bytes == 4 \
1664 ? (*(u32 *)&regs->reg = (val)) \
1665 : (*(u16 *)&regs->reg = (val)))
1666 unsigned long code_base, code_limit;
1667 char io_emul_stub[32];
1668 void (*io_emul)(struct cpu_user_regs *) __attribute__((__regparm__(1)));
1669 u32 l, h, eax, edx;
1671 if ( !read_descriptor(regs->cs, v, regs,
1672 &code_base, &code_limit, &ar,
1673 _SEGMENT_CODE|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1674 goto fail;
1675 op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
1676 ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
1677 if ( !(ar & _SEGMENT_S) ||
1678 !(ar & _SEGMENT_P) ||
1679 !(ar & _SEGMENT_CODE) )
1680 goto fail;
1682 /* emulating only opcodes not allowing SS to be default */
1683 data_sel = read_sreg(regs, ds);
1685 /* Legacy prefixes. */
1686 for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
1688 switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
1690 case 0x66: /* operand-size override */
1691 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
1692 continue;
1693 case 0x67: /* address-size override */
1694 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
1695 continue;
1696 case 0x2e: /* CS override */
1697 data_sel = regs->cs;
1698 continue;
1699 case 0x3e: /* DS override */
1700 data_sel = read_sreg(regs, ds);
1701 continue;
1702 case 0x26: /* ES override */
1703 data_sel = read_sreg(regs, es);
1704 continue;
1705 case 0x64: /* FS override */
1706 data_sel = read_sreg(regs, fs);
1707 lm_ovr = lm_seg_fs;
1708 continue;
1709 case 0x65: /* GS override */
1710 data_sel = read_sreg(regs, gs);
1711 lm_ovr = lm_seg_gs;
1712 continue;
1713 case 0x36: /* SS override */
1714 data_sel = regs->ss;
1715 continue;
1716 case 0xf0: /* LOCK */
1717 lock = 1;
1718 continue;
1719 case 0xf2: /* REPNE/REPNZ */
1720 case 0xf3: /* REP/REPE/REPZ */
1721 rep_prefix = 1;
1722 continue;
1723 default:
1724 if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
1726 rex = opcode;
1727 continue;
1729 break;
1731 break;
1734 /* REX prefix. */
1735 if ( rex & 8 ) /* REX.W */
1736 op_bytes = 4; /* emulate only opcodes not supporting 64-bit operands */
1737 modrm_reg = (rex & 4) << 1; /* REX.R */
1738 /* REX.X does not need to be decoded. */
1739 modrm_rm = (rex & 1) << 3; /* REX.B */
1741 if ( opcode == 0x0f )
1742 goto twobyte_opcode;
1744 if ( lock )
1745 goto fail;
1747 /* Input/Output String instructions. */
1748 if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
1750 unsigned long data_base, data_limit;
1752 if ( rep_prefix && (rd_ad(ecx) == 0) )
1753 goto done;
1755 if ( !(opcode & 2) )
1757 data_sel = read_sreg(regs, es);
1758 lm_ovr = lm_seg_none;
1761 if ( !(ar & _SEGMENT_L) )
1763 if ( !read_descriptor(data_sel, v, regs,
1764 &data_base, &data_limit, &ar,
1765 _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|
1766 _SEGMENT_P) )
1767 goto fail;
1768 if ( !(ar & _SEGMENT_S) ||
1769 !(ar & _SEGMENT_P) ||
1770 (opcode & 2 ?
1771 (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
1772 (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
1773 goto fail;
1775 #ifdef CONFIG_X86_64
1776 else
1778 if ( lm_ovr == lm_seg_none || data_sel < 4 )
1780 switch ( lm_ovr )
1782 case lm_seg_none:
1783 data_base = 0UL;
1784 break;
1785 case lm_seg_fs:
1786 data_base = v->arch.guest_context.fs_base;
1787 break;
1788 case lm_seg_gs:
1789 if ( guest_kernel_mode(v, regs) )
1790 data_base = v->arch.guest_context.gs_base_kernel;
1791 else
1792 data_base = v->arch.guest_context.gs_base_user;
1793 break;
1796 else
1797 read_descriptor(data_sel, v, regs,
1798 &data_base, &data_limit, &ar,
1799 0);
1800 data_limit = ~0UL;
1801 ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
1803 #endif
1805 port = (u16)regs->edx;
1807 continue_io_string:
1808 switch ( opcode )
1810 case 0x6c: /* INSB */
1811 op_bytes = 1;
1812 case 0x6d: /* INSW/INSL */
1813 if ( (data_limit < (op_bytes - 1)) ||
1814 (rd_ad(edi) > (data_limit - (op_bytes - 1))) ||
1815 !guest_io_okay(port, op_bytes, v, regs) )
1816 goto fail;
1817 data = guest_io_read(port, op_bytes, v, regs);
1818 if ( (rc = copy_to_user((void *)data_base + rd_ad(edi),
1819 &data, op_bytes)) != 0 )
1821 propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc,
1822 PFEC_write_access);
1823 return EXCRET_fault_fixed;
1825 wr_ad(edi, regs->edi + (int)((regs->eflags & EF_DF)
1826 ? -op_bytes : op_bytes));
1827 break;
1829 case 0x6e: /* OUTSB */
1830 op_bytes = 1;
1831 case 0x6f: /* OUTSW/OUTSL */
1832 if ( (data_limit < (op_bytes - 1)) ||
1833 (rd_ad(esi) > (data_limit - (op_bytes - 1))) ||
1834 !guest_io_okay(port, op_bytes, v, regs) )
1835 goto fail;
1836 if ( (rc = copy_from_user(&data, (void *)data_base + rd_ad(esi),
1837 op_bytes)) != 0 )
1839 propagate_page_fault(data_base + rd_ad(esi)
1840 + op_bytes - rc, 0);
1841 return EXCRET_fault_fixed;
1843 guest_io_write(port, op_bytes, data, v, regs);
1844 wr_ad(esi, regs->esi + (int)((regs->eflags & EF_DF)
1845 ? -op_bytes : op_bytes));
1846 break;
1849 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1851 if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
1853 if ( !bpmatch && !hypercall_preempt_check() )
1854 goto continue_io_string;
1855 eip = regs->eip;
1858 goto done;
1861 /*
1862 * Very likely to be an I/O instruction (IN/OUT).
1863 * Build an on-stack stub to execute the instruction with full guest
1864 * GPR context. This is needed for some systems which (ab)use IN/OUT
1865 * to communicate with BIOS code in system-management mode.
1866 */
1867 #ifdef __x86_64__
1868 /* movq $host_to_guest_gpr_switch,%rcx */
1869 io_emul_stub[0] = 0x48;
1870 io_emul_stub[1] = 0xb9;
1871 *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
1872 /* callq *%rcx */
1873 io_emul_stub[10] = 0xff;
1874 io_emul_stub[11] = 0xd1;
1875 #else
1876 /* call host_to_guest_gpr_switch */
1877 io_emul_stub[0] = 0xe8;
1878 *(s32 *)&io_emul_stub[1] =
1879 (char *)host_to_guest_gpr_switch - &io_emul_stub[5];
1880 /* 7 x nop */
1881 memset(&io_emul_stub[5], 0x90, 7);
1882 #endif
1883 /* data16 or nop */
1884 io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66;
1885 /* <io-access opcode> */
1886 io_emul_stub[13] = opcode;
1887 /* imm8 or nop */
1888 io_emul_stub[14] = 0x90;
1889 /* ret (jumps to guest_to_host_gpr_switch) */
1890 io_emul_stub[15] = 0xc3;
1892 /* Handy function-typed pointer to the stub. */
1893 io_emul = (void *)io_emul_stub;
1895 if ( ioemul_handle_quirk )
1896 ioemul_handle_quirk(opcode, &io_emul_stub[12], regs);
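/*
 * For reference (illustrative annotation): on x86_64 the finished stub is
 * equivalent to
 *
 *     movq  $host_to_guest_gpr_switch, %rcx
 *     callq *%rcx                  // switch to the guest's GPR context
 *     data16 or nop                // operand-size override when op_bytes==2
 *     in/out ...                   // the emulated opcode (+ imm8 or nop)
 *     ret                          // returns via guest_to_host_gpr_switch
 *
 * matching the byte-by-byte construction above.
 */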
1898 /* I/O Port and Interrupt Flag instructions. */
1899 switch ( opcode )
1901 case 0xe4: /* IN imm8,%al */
1902 op_bytes = 1;
1903 case 0xe5: /* IN imm8,%eax */
1904 port = insn_fetch(u8, code_base, eip, code_limit);
1905 io_emul_stub[14] = port; /* imm8 */
1906 exec_in:
1907 if ( !guest_io_okay(port, op_bytes, v, regs) )
1908 goto fail;
1909 if ( admin_io_okay(port, op_bytes, v, regs) )
1911 io_emul(regs);
1913 else
1915 if ( op_bytes == 4 )
1916 regs->eax = 0;
1917 else
1918 regs->eax &= ~((1u << (op_bytes * 8)) - 1);
1919 regs->eax |= guest_io_read(port, op_bytes, v, regs);
1921 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1922 goto done;
1924 case 0xec: /* IN %dx,%al */
1925 op_bytes = 1;
1926 case 0xed: /* IN %dx,%eax */
1927 port = (u16)regs->edx;
1928 goto exec_in;
1930 case 0xe6: /* OUT %al,imm8 */
1931 op_bytes = 1;
1932 case 0xe7: /* OUT %eax,imm8 */
1933 port = insn_fetch(u8, code_base, eip, code_limit);
1934 io_emul_stub[14] = port; /* imm8 */
1935 exec_out:
1936 if ( !guest_io_okay(port, op_bytes, v, regs) )
1937 goto fail;
1938 if ( admin_io_okay(port, op_bytes, v, regs) )
1940 if ( (op_bytes == 1) &&
1941 ((port == 0x71) || (port == 0x70)) &&
1942 pv_rtc_handler )
1943 pv_rtc_handler(port, regs->eax);
1944 io_emul(regs);
1945 if ( (op_bytes == 1) && pv_post_outb_hook )
1946 pv_post_outb_hook(port, regs->eax);
1948 else
1950 guest_io_write(port, op_bytes, regs->eax, v, regs);
1952 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1953 goto done;
1955 case 0xee: /* OUT %al,%dx */
1956 op_bytes = 1;
1957 case 0xef: /* OUT %eax,%dx */
1958 port = (u16)regs->edx;
1959 goto exec_out;
1961 case 0xfa: /* CLI */
1962 case 0xfb: /* STI */
1963 if ( v->arch.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
1964 goto fail;
1965 /*
1966 * This is just too dangerous to allow, in my opinion. Consider if the
1967 * caller then tries to reenable interrupts using POPF: we can't trap
1968 * that and we'll end up with hard-to-debug lockups. Fast & loose will
1969 * do for us. :-)
1970 */
1971 /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
1972 goto done;
1975 /* No decode of this single-byte opcode. */
1976 goto fail;
1978 twobyte_opcode:
1979 /* Two-byte opcodes only emulated from guest kernel. */
1980 if ( !guest_kernel_mode(v, regs) )
1981 goto fail;
1983 /* Privileged (ring 0) instructions. */
1984 opcode = insn_fetch(u8, code_base, eip, code_limit);
1985 if ( lock && (opcode & ~3) != 0x20 )
1986 goto fail;
1987 switch ( opcode )
1989 case 0x06: /* CLTS */
1990 (void)do_fpu_taskswitch(0);
1991 break;
1993 case 0x09: /* WBINVD */
1994 /* Ignore the instruction if unprivileged. */
1995 if ( !cache_flush_permitted(v->domain) )
1996 /* Non-physdev domain attempted WBINVD; ignore for now since
1997 newer linux uses this in some start-of-day timing loops */
1999 else
2000 wbinvd();
2001 break;
2003 case 0x20: /* MOV CR?,<reg> */
2004 opcode = insn_fetch(u8, code_base, eip, code_limit);
2005 if ( opcode < 0xc0 )
2006 goto fail;
2007 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2008 modrm_rm |= (opcode >> 0) & 7;
2009 reg = decode_register(modrm_rm, regs, 0);
2010 switch ( modrm_reg )
2012 case 0: /* Read CR0 */
2013 *reg = (read_cr0() & ~X86_CR0_TS) |
2014 v->arch.guest_context.ctrlreg[0];
2015 break;
2017 case 2: /* Read CR2 */
2018 *reg = v->arch.guest_context.ctrlreg[2];
2019 break;
2021 case 3: /* Read CR3 */
2022 if ( !is_pv_32on64_vcpu(v) )
2023 *reg = xen_pfn_to_cr3(mfn_to_gmfn(
2024 v->domain, pagetable_get_pfn(v->arch.guest_table)));
2025 #ifdef CONFIG_COMPAT
2026 else
2027 *reg = compat_pfn_to_cr3(mfn_to_gmfn(
2028 v->domain, l4e_get_pfn(*(l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)))));
2029 #endif
2030 break;
2032 case 4: /* Read CR4 */
2033 /*
2034 * Guests can read CR4 to see what features Xen has enabled. We
2035 * therefore lie about PGE as it is unavailable to guests.
2036 * Also disallow PSE if hugepages are not enabled.
2037 */
2038 *reg = read_cr4() & ~X86_CR4_PGE;
2039 if ( !opt_allow_hugepage )
2040 *reg &= ~X86_CR4_PSE;
2041 break;
2043 default:
2044 goto fail;
2046 break;
2048 case 0x21: /* MOV DR?,<reg> */
2049 opcode = insn_fetch(u8, code_base, eip, code_limit);
2050 if ( opcode < 0xc0 )
2051 goto fail;
2052 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2053 modrm_rm |= (opcode >> 0) & 7;
2054 reg = decode_register(modrm_rm, regs, 0);
2055 if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
2056 goto fail;
2057 *reg = res;
2058 break;
2060 case 0x22: /* MOV <reg>,CR? */
2061 opcode = insn_fetch(u8, code_base, eip, code_limit);
2062 if ( opcode < 0xc0 )
2063 goto fail;
2064 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2065 modrm_rm |= (opcode >> 0) & 7;
2066 reg = decode_register(modrm_rm, regs, 0);
2067 switch ( modrm_reg )
2069 case 0: /* Write CR0 */
2070 if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
2072 gdprintk(XENLOG_WARNING,
2073 "Attempt to change unmodifiable CR0 flags.\n");
2074 goto fail;
2076 (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
2077 break;
2079 case 2: /* Write CR2 */
2080 v->arch.guest_context.ctrlreg[2] = *reg;
2081 arch_set_cr2(v, *reg);
2082 break;
2084 case 3: /* Write CR3 */
2085 domain_lock(v->domain);
2086 if ( !is_pv_32on64_vcpu(v) )
2087 rc = new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg)));
2088 #ifdef CONFIG_COMPAT
2089 else
2090 rc = new_guest_cr3(gmfn_to_mfn(v->domain, compat_cr3_to_pfn(*reg)));
2091 #endif
2092 domain_unlock(v->domain);
2093 if ( rc == 0 ) /* new_guest_cr3() returns 0 on failure */
2094 goto fail;
2095 break;
2097 case 4: /* Write CR4 */
2098 v->arch.guest_context.ctrlreg[4] = pv_guest_cr4_fixup(*reg);
2099 write_cr4(pv_guest_cr4_to_real_cr4(
2100 v->arch.guest_context.ctrlreg[4]));
2101 break;
2103 default:
2104 goto fail;
2106 break;
2108 case 0x23: /* MOV <reg>,DR? */
2109 opcode = insn_fetch(u8, code_base, eip, code_limit);
2110 if ( opcode < 0xc0 )
2111 goto fail;
2112 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2113 modrm_rm |= (opcode >> 0) & 7;
2114 reg = decode_register(modrm_rm, regs, 0);
2115 if ( do_set_debugreg(modrm_reg, *reg) != 0 )
2116 goto fail;
2117 break;
2119 case 0x30: /* WRMSR */
2120 eax = regs->eax;
2121 edx = regs->edx;
2122 res = ((u64)edx << 32) | eax;
2123 switch ( (u32)regs->ecx )
2125 #ifdef CONFIG_X86_64
2126 case MSR_FS_BASE:
2127 if ( is_pv_32on64_vcpu(v) )
2128 goto fail;
2129 if ( wrmsr_safe(MSR_FS_BASE, eax, edx) )
2130 goto fail;
2131 v->arch.guest_context.fs_base = res;
2132 break;
2133 case MSR_GS_BASE:
2134 if ( is_pv_32on64_vcpu(v) )
2135 goto fail;
2136 if ( wrmsr_safe(MSR_GS_BASE, eax, edx) )
2137 goto fail;
2138 v->arch.guest_context.gs_base_kernel = res;
2139 break;
2140 case MSR_SHADOW_GS_BASE:
2141 if ( is_pv_32on64_vcpu(v) )
2142 goto fail;
2143 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, eax, edx) )
2144 goto fail;
2145 v->arch.guest_context.gs_base_user = res;
2146 break;
2147 #endif
2148 case MSR_K7_FID_VID_STATUS:
2149 case MSR_K7_FID_VID_CTL:
2150 case MSR_K8_PSTATE_LIMIT:
2151 case MSR_K8_PSTATE_CTRL:
2152 case MSR_K8_PSTATE_STATUS:
2153 case MSR_K8_PSTATE0:
2154 case MSR_K8_PSTATE1:
2155 case MSR_K8_PSTATE2:
2156 case MSR_K8_PSTATE3:
2157 case MSR_K8_PSTATE4:
2158 case MSR_K8_PSTATE5:
2159 case MSR_K8_PSTATE6:
2160 case MSR_K8_PSTATE7:
2161 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
2162 goto fail;
2163 if ( !is_cpufreq_controller(v->domain) )
2164 break;
2165 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
2166 goto fail;
2167 break;
2168 case MSR_AMD64_NB_CFG:
2169 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
2170 boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x11 )
2171 goto fail;
2172 if ( !IS_PRIV(v->domain) )
2173 break;
2174 if ( (rdmsr_safe(MSR_AMD64_NB_CFG, l, h) != 0) ||
2175 (eax != l) ||
2176 ((edx ^ h) & ~(1 << (AMD64_NB_CFG_CF8_EXT_ENABLE_BIT - 32))) )
2177 goto invalid;
2178 if ( wrmsr_safe(MSR_AMD64_NB_CFG, eax, edx) != 0 )
2179 goto fail;
2180 break;
2181 case MSR_FAM10H_MMIO_CONF_BASE:
2182 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
2183 boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x11 )
2184 goto fail;
2185 if ( !IS_PRIV(v->domain) )
2186 break;
2187 if ( (rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, l, h) != 0) ||
2188 (((((u64)h << 32) | l) ^ res) &
2189 ~( FAM10H_MMIO_CONF_ENABLE |
2190 (FAM10H_MMIO_CONF_BUSRANGE_MASK <<
2191 FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
2192 ((u64)FAM10H_MMIO_CONF_BASE_MASK <<
2193 FAM10H_MMIO_CONF_BASE_SHIFT))) )
2194 goto invalid;
2195 if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, eax, edx) != 0 )
2196 goto fail;
2197 break;
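/*
 * Descriptive note (added): for the two AMD Fam10h/11h MSRs above, writes
 * from non-privileged domains are silently ignored, while a privileged
 * (dom0) write must agree with the current hardware value in every field
 * except the ones Xen deliberately lets dom0 manage: the CF8 extended
 * config enable bit for NB_CFG, and the enable, bus-range and base fields
 * for MMIO_CONF_BASE. Any other difference is logged via the 'invalid'
 * path and the write is dropped.
 */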
2198 case MSR_IA32_MPERF:
2199 case MSR_IA32_APERF:
2200 case MSR_IA32_PERF_CTL:
2201 if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
2202 goto fail;
2203 if ( !is_cpufreq_controller(v->domain) )
2204 break;
2205 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
2206 goto fail;
2207 break;
2208 case MSR_IA32_THERM_CONTROL:
2209 if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
2210 goto fail;
2211 if ( (v->domain->domain_id != 0) || !v->domain->is_pinned )
2212 break;
2213 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
2214 goto fail;
2215 break;
2216 default:
2217 if ( wrmsr_hypervisor_regs(regs->ecx, eax, edx) )
2218 break;
2219 if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
2221 int rc = intel_mce_wrmsr(regs->ecx, res);
2222 if ( rc < 0 )
2223 goto fail;
2224 if ( rc )
2225 break;
2228 if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
2229 (eax != l) || (edx != h) )
2230 invalid:
2231 gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from "
2232 "%08x:%08x to %08x:%08x.\n",
2233 _p(regs->ecx), h, l, edx, eax);
2234 break;
2236 break;
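/*
 * Illustrative sketch (not from the original source): WRMSR takes the MSR
 * index in ECX and the 64-bit value split across EDX:EAX, which is why the
 * handler above reassembles it as
 *
 *     val = ((u64)regs->edx << 32) | (u32)regs->eax;
 *
 * Writes to MSRs Xen does not explicitly virtualise are accepted only if
 * they would not change the current value (the rdmsr_safe() comparison in
 * the default case); otherwise they are logged and never reach hardware.
 */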
2238 case 0x31: /* RDTSC */
2239 rdtsc(regs->eax, regs->edx);
2240 break;
2242 case 0x32: /* RDMSR */
2243 switch ( (u32)regs->ecx )
2245 #ifdef CONFIG_X86_64
2246 case MSR_FS_BASE:
2247 if ( is_pv_32on64_vcpu(v) )
2248 goto fail;
2249 regs->eax = v->arch.guest_context.fs_base & 0xFFFFFFFFUL;
2250 regs->edx = v->arch.guest_context.fs_base >> 32;
2251 break;
2252 case MSR_GS_BASE:
2253 if ( is_pv_32on64_vcpu(v) )
2254 goto fail;
2255 regs->eax = v->arch.guest_context.gs_base_kernel & 0xFFFFFFFFUL;
2256 regs->edx = v->arch.guest_context.gs_base_kernel >> 32;
2257 break;
2258 case MSR_SHADOW_GS_BASE:
2259 if ( is_pv_32on64_vcpu(v) )
2260 goto fail;
2261 regs->eax = v->arch.guest_context.gs_base_user & 0xFFFFFFFFUL;
2262 regs->edx = v->arch.guest_context.gs_base_user >> 32;
2263 break;
2264 #endif
2265 case MSR_K7_FID_VID_CTL:
2266 case MSR_K7_FID_VID_STATUS:
2267 case MSR_K8_PSTATE_LIMIT:
2268 case MSR_K8_PSTATE_CTRL:
2269 case MSR_K8_PSTATE_STATUS:
2270 case MSR_K8_PSTATE0:
2271 case MSR_K8_PSTATE1:
2272 case MSR_K8_PSTATE2:
2273 case MSR_K8_PSTATE3:
2274 case MSR_K8_PSTATE4:
2275 case MSR_K8_PSTATE5:
2276 case MSR_K8_PSTATE6:
2277 case MSR_K8_PSTATE7:
2278 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
2279 goto fail;
2280 if ( !is_cpufreq_controller(v->domain) )
2282 regs->eax = regs->edx = 0;
2283 break;
2285 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) != 0 )
2286 goto fail;
2287 break;
2288 case MSR_IA32_MISC_ENABLE:
2289 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
2290 goto fail;
2291 regs->eax &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL |
2292 MSR_IA32_MISC_ENABLE_MONITOR_ENABLE);
2293 regs->eax |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
2294 MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
2295 MSR_IA32_MISC_ENABLE_XTPR_DISABLE;
2296 break;
2297 case MSR_EFER:
2298 case MSR_AMD_PATCHLEVEL:
2299 default:
2300 if ( rdmsr_hypervisor_regs(regs->ecx, &l, &h) )
2302 rdmsr_writeback:
2303 regs->eax = l;
2304 regs->edx = h;
2305 break;
2308 if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
2310 int rc = intel_mce_rdmsr(regs->ecx, &l, &h);
2312 if ( rc < 0 )
2313 goto fail;
2314 if ( rc )
2315 goto rdmsr_writeback;
2318 /* Everyone can read the MSR space. */
2319 /* gdprintk(XENLOG_WARNING,"Domain attempted RDMSR %p.\n",
2320 _p(regs->ecx));*/
2321 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
2322 goto fail;
2323 break;
2325 break;
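/*
 * Illustrative sketch (not from the original source): RDMSR returns the
 * 64-bit MSR value split across EDX:EAX, which is the convention the
 * rdmsr_writeback path above follows:
 *
 *     regs->eax = (u32)val;          // low 32 bits
 *     regs->edx = (u32)(val >> 32);  // high 32 bits
 */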
2327 default:
2328 goto fail;
2331 #undef wr_ad
2332 #undef rd_ad
2334 done:
2335 instruction_done(regs, eip, bpmatch);
2336 skip:
2337 return EXCRET_fault_fixed;
2339 fail:
2340 return 0;
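/*
 * Descriptive note (added): check_stack_limit() verifies that pushing
 * 'decr' bytes at 'esp' stays inside the segment described by access
 * rights 'ar' and 'limit'. For a normal (expand-up) data segment the
 * accessed bytes must satisfy (esp - 1) <= limit; for an expand-down
 * segment (_SEGMENT_EC set) the lowest accessed byte must lie above the
 * limit, i.e. (esp - decr) > limit. The first clause guards against
 * unsigned wraparound in the esp - decr computation.
 */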
2343 static inline int check_stack_limit(unsigned int ar, unsigned int limit,
2344 unsigned int esp, unsigned int decr)
2346 return (((esp - decr) < (esp - 1)) &&
2347 (!(ar & _SEGMENT_EC) ? (esp - 1) <= limit : (esp - decr) > limit));
2350 static void emulate_gate_op(struct cpu_user_regs *regs)
2352 #ifdef __x86_64__
2353 struct vcpu *v = current;
2354 unsigned int sel, ar, dpl, nparm, opnd_sel;
2355 unsigned int op_default, op_bytes, ad_default, ad_bytes;
2356 unsigned long off, eip, opnd_off, base, limit;
2357 int jump;
2359 /* Check whether this fault is due to the use of a call gate. */
2360 if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) ||
2361 (((ar >> 13) & 3) < (regs->cs & 3)) ||
2362 ((ar & _SEGMENT_TYPE) != 0xc00) )
2364 do_guest_trap(TRAP_gp_fault, regs, 1);
2365 return;
2367 if ( !(ar & _SEGMENT_P) )
2369 do_guest_trap(TRAP_no_segment, regs, 1);
2370 return;
2372 dpl = (ar >> 13) & 3;
2373 nparm = ar & 0x1f;
2375 /*
2376 * Decode instruction (and perhaps operand) to determine RPL,
2377 * whether this is a jump or a call, and the call return offset.
2378 */
2379 if ( !read_descriptor(regs->cs, v, regs, &base, &limit, &ar, 0) ||
2380 !(ar & _SEGMENT_S) ||
2381 !(ar & _SEGMENT_P) ||
2382 !(ar & _SEGMENT_CODE) )
2384 do_guest_trap(TRAP_gp_fault, regs, 1);
2385 return;
2388 op_bytes = op_default = ar & _SEGMENT_DB ? 4 : 2;
2389 ad_default = ad_bytes = op_default;
2390 opnd_sel = opnd_off = 0;
2391 jump = -1;
2392 for ( eip = regs->eip; eip - regs->_eip < 10; )
2394 switch ( insn_fetch(u8, base, eip, limit) )
2396 case 0x66: /* operand-size override */
2397 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
2398 continue;
2399 case 0x67: /* address-size override */
2400 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
2401 continue;
2402 case 0x2e: /* CS override */
2403 opnd_sel = regs->cs;
2404 ASSERT(opnd_sel);
2405 continue;
2406 case 0x3e: /* DS override */
2407 opnd_sel = read_sreg(regs, ds);
2408 if ( !opnd_sel )
2409 opnd_sel = dpl;
2410 continue;
2411 case 0x26: /* ES override */
2412 opnd_sel = read_sreg(regs, es);
2413 if ( !opnd_sel )
2414 opnd_sel = dpl;
2415 continue;
2416 case 0x64: /* FS override */
2417 opnd_sel = read_sreg(regs, fs);
2418 if ( !opnd_sel )
2419 opnd_sel = dpl;
2420 continue;
2421 case 0x65: /* GS override */
2422 opnd_sel = read_sreg(regs, gs);
2423 if ( !opnd_sel )
2424 opnd_sel = dpl;
2425 continue;
2426 case 0x36: /* SS override */
2427 opnd_sel = regs->ss;
2428 if ( !opnd_sel )
2429 opnd_sel = dpl;
2430 continue;
2431 case 0xea:
2432 ++jump;
2433 /* FALLTHROUGH */
2434 case 0x9a:
2435 ++jump;
2436 opnd_sel = regs->cs;
2437 opnd_off = eip;
2438 ad_bytes = ad_default;
2439 eip += op_bytes + 2;
2440 break;
2441 case 0xff:
2443 unsigned int modrm;
2445 switch ( (modrm = insn_fetch(u8, base, eip, limit)) & 0xf8 )
2447 case 0x28: case 0x68: case 0xa8:
2448 ++jump;
2449 /* FALLTHROUGH */
2450 case 0x18: case 0x58: case 0x98:
2451 ++jump;
2452 if ( ad_bytes != 2 )
2454 if ( (modrm & 7) == 4 )
2456 unsigned int sib;
2457 sib = insn_fetch(u8, base, eip, limit);
2459 modrm = (modrm & ~7) | (sib & 7);
2460 if ( (sib >>= 3) != 4 )
2461 opnd_off = *(unsigned long *)
2462 decode_register(sib & 7, regs, 0);
2463 opnd_off <<= sib >> 3;
2465 if ( (modrm & 7) != 5 || (modrm & 0xc0) )
2466 opnd_off += *(unsigned long *)
2467 decode_register(modrm & 7, regs, 0);
2468 else
2469 modrm |= 0x87;
2470 if ( !opnd_sel )
2472 switch ( modrm & 7 )
2474 default:
2475 opnd_sel = read_sreg(regs, ds);
2476 break;
2477 case 4: case 5:
2478 opnd_sel = regs->ss;
2479 break;
2483 else
2485 switch ( modrm & 7 )
2487 case 0: case 1: case 7:
2488 opnd_off = regs->ebx;
2489 break;
2490 case 6:
2491 if ( !(modrm & 0xc0) )
2492 modrm |= 0x80;
2493 else
2494 case 2: case 3:
2496 opnd_off = regs->ebp;
2497 if ( !opnd_sel )
2498 opnd_sel = regs->ss;
2500 break;
2502 if ( !opnd_sel )
2503 opnd_sel = read_sreg(regs, ds);
2504 switch ( modrm & 7 )
2506 case 0: case 2: case 4:
2507 opnd_off += regs->esi;
2508 break;
2509 case 1: case 3: case 5:
2510 opnd_off += regs->edi;
2511 break;
2514 switch ( modrm & 0xc0 )
2516 case 0x40:
2517 opnd_off += insn_fetch(s8, base, eip, limit);
2518 break;
2519 case 0x80:
2520 opnd_off += insn_fetch(s32, base, eip, limit);
2521 break;
2523 if ( ad_bytes == 4 )
2524 opnd_off = (unsigned int)opnd_off;
2525 else if ( ad_bytes == 2 )
2526 opnd_off = (unsigned short)opnd_off;
2527 break;
2530 break;
2532 break;
2535 if ( jump < 0 )
2537 fail:
2538 do_guest_trap(TRAP_gp_fault, regs, 1);
2539 skip:
2540 return;
2543 if ( (opnd_sel != regs->cs &&
2544 !read_descriptor(opnd_sel, v, regs, &base, &limit, &ar, 0)) ||
2545 !(ar & _SEGMENT_S) ||
2546 !(ar & _SEGMENT_P) ||
2547 ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
2549 do_guest_trap(TRAP_gp_fault, regs, 1);
2550 return;
2553 opnd_off += op_bytes;
2554 #define ad_default ad_bytes
2555 opnd_sel = insn_fetch(u16, base, opnd_off, limit);
2556 #undef ad_default
2557 ASSERT((opnd_sel & ~3) == regs->error_code);
2558 if ( dpl < (opnd_sel & 3) )
2560 do_guest_trap(TRAP_gp_fault, regs, 1);
2561 return;
2564 if ( !read_descriptor(sel, v, regs, &base, &limit, &ar, 0) ||
2565 !(ar & _SEGMENT_S) ||
2566 !(ar & _SEGMENT_CODE) ||
2567 (!jump || (ar & _SEGMENT_EC) ?
2568 ((ar >> 13) & 3) > (regs->cs & 3) :
2569 ((ar >> 13) & 3) != (regs->cs & 3)) )
2571 regs->error_code = sel;
2572 do_guest_trap(TRAP_gp_fault, regs, 1);
2573 return;
2575 if ( !(ar & _SEGMENT_P) )
2577 regs->error_code = sel;
2578 do_guest_trap(TRAP_no_segment, regs, 1);
2579 return;
2581 if ( off > limit )
2583 regs->error_code = 0;
2584 do_guest_trap(TRAP_gp_fault, regs, 1);
2585 return;
2588 if ( !jump )
2590 unsigned int ss, esp, *stkp;
2591 int rc;
2592 #define push(item) do \
2593 { \
2594 --stkp; \
2595 esp -= 4; \
2596 rc = __put_user(item, stkp); \
2597 if ( rc ) \
2598 { \
2599 propagate_page_fault((unsigned long)(stkp + 1) - rc, \
2600 PFEC_write_access); \
2601 return; \
2602 } \
2603 } while ( 0 )
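/*
 * Descriptive note (added): the push() macro above emulates a 4-byte push
 * onto the guest stack being constructed: it decrements the virtual stack
 * pointer, writes the item with __put_user(), and if the write faults it
 * injects a page fault into the guest via propagate_page_fault() and
 * abandons the emulation.
 */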
2605 if ( ((ar >> 13) & 3) < (regs->cs & 3) )
2607 sel |= (ar >> 13) & 3;
2608 /* Inner stack known only for kernel ring. */
2609 if ( (sel & 3) != GUEST_KERNEL_RPL(v->domain) )
2611 do_guest_trap(TRAP_gp_fault, regs, 1);
2612 return;
2614 esp = v->arch.guest_context.kernel_sp;
2615 ss = v->arch.guest_context.kernel_ss;
2616 if ( (ss & 3) != (sel & 3) ||
2617 !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2618 ((ar >> 13) & 3) != (sel & 3) ||
2619 !(ar & _SEGMENT_S) ||
2620 (ar & _SEGMENT_CODE) ||
2621 !(ar & _SEGMENT_WR) )
2623 regs->error_code = ss & ~3;
2624 do_guest_trap(TRAP_invalid_tss, regs, 1);
2625 return;
2627 if ( !(ar & _SEGMENT_P) ||
2628 !check_stack_limit(ar, limit, esp, (4 + nparm) * 4) )
2630 regs->error_code = ss & ~3;
2631 do_guest_trap(TRAP_stack_error, regs, 1);
2632 return;
2634 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2635 if ( !compat_access_ok(stkp - 4 - nparm, (4 + nparm) * 4) )
2637 do_guest_trap(TRAP_gp_fault, regs, 1);
2638 return;
2640 push(regs->ss);
2641 push(regs->esp);
2642 if ( nparm )
2644 const unsigned int *ustkp;
2646 if ( !read_descriptor(regs->ss, v, regs, &base, &limit, &ar, 0) ||
2647 ((ar >> 13) & 3) != (regs->cs & 3) ||
2648 !(ar & _SEGMENT_S) ||
2649 (ar & _SEGMENT_CODE) ||
2650 !(ar & _SEGMENT_WR) ||
2651 !check_stack_limit(ar, limit, esp + nparm * 4, nparm * 4) )
2652 return do_guest_trap(TRAP_gp_fault, regs, 1);
2653 ustkp = (unsigned int *)(unsigned long)((unsigned int)base + regs->_esp + nparm * 4);
2654 if ( !compat_access_ok(ustkp - nparm, nparm * 4) )
2656 do_guest_trap(TRAP_gp_fault, regs, 1);
2657 return;
2659 do
2661 unsigned int parm;
2663 --ustkp;
2664 rc = __get_user(parm, ustkp);
2665 if ( rc )
2667 propagate_page_fault((unsigned long)(ustkp + 1) - rc, 0);
2668 return;
2670 push(parm);
2671 } while ( --nparm );
2674 else
2676 sel |= (regs->cs & 3);
2677 esp = regs->esp;
2678 ss = regs->ss;
2679 if ( !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2680 ((ar >> 13) & 3) != (sel & 3) )
2682 do_guest_trap(TRAP_gp_fault, regs, 1);
2683 return;
2685 if ( !check_stack_limit(ar, limit, esp, 2 * 4) )
2687 regs->error_code = 0;
2688 do_guest_trap(TRAP_stack_error, regs, 1);
2689 return;
2691 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2692 if ( !compat_access_ok(stkp - 2, 2 * 4) )
2694 do_guest_trap(TRAP_gp_fault, regs, 1);
2695 return;
2698 push(regs->cs);
2699 push(eip);
2700 #undef push
2701 regs->esp = esp;
2702 regs->ss = ss;
2704 else
2705 sel |= (regs->cs & 3);
2707 regs->cs = sel;
2708 instruction_done(regs, off, 0);
2709 #endif
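/*
 * Illustrative sketch (not from the original source): the fields that
 * emulate_gate_op() extracts from the call-gate descriptor via
 * read_gate_descriptor() map onto the architectural layout roughly as
 *
 *     sel   - target code segment selector
 *     off   - target entry offset within that segment
 *     dpl   - (ar >> 13) & 3, the gate's descriptor privilege level
 *     nparm - ar & 0x1f, parameter count copied for a far CALL
 *
 * For a far CALL through the gate to a more-privileged ring, the emulation
 * switches to the guest kernel stack and pushes the outer SS:ESP, the
 * copied parameters, and the return CS:EIP, mirroring what hardware would
 * do for a 32-bit call gate.
 */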
2712 asmlinkage void do_general_protection(struct cpu_user_regs *regs)
2714 struct vcpu *v = current;
2715 unsigned long fixup;
2717 DEBUGGER_trap_entry(TRAP_gp_fault, regs);
2719 if ( regs->error_code & 1 )
2720 goto hardware_gp;
2722 if ( !guest_mode(regs) )
2723 goto gp_in_kernel;
2725 /*
2726 * Cunning trick to allow arbitrary "INT n" handling.
2728 * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
2729 * instruction from trapping to the appropriate vector, when that might not
2730 * be expected by Xen or the guest OS. For example, that entry might be for
2731 * a fault handler (unlike traps, faults don't increment EIP), or might
2732 * expect an error code on the stack (which a software trap never
2733 * provides), or might be a hardware interrupt handler that doesn't like
2734 * being called spuriously.
2736 * Instead, a GPF occurs with the faulting IDT vector in the error code.
2737 * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
2738 * clear to indicate that it's a software fault, not hardware.
2740 * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
2741 * okay because they can only be triggered by an explicit DPL-checked
2742 * instruction. The DPL specified by the guest OS for these vectors is NOT
2743 * CHECKED!!
2744 */
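/*
 * Worked example (added): a guest executing "int $0x80" with all IDT DPLs
 * forced to 0 takes a #GP whose error code is (0x80 << 3) | 2 = 0x402,
 * i.e. bit 1 set (IDT entry), bit 0 clear (software), with the vector
 * recoverable as error_code >> 3 -- exactly what the code below decodes.
 */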
2745 if ( (regs->error_code & 3) == 2 )
2747 /* This fault must be due to an <INT n> instruction. */
2748 const struct trap_info *ti;
2749 unsigned char vector = regs->error_code >> 3;
2750 ti = &v->arch.guest_context.trap_ctxt[vector];
2751 if ( permit_softint(TI_GET_DPL(ti), v, regs) )
2753 regs->eip += 2;
2754 do_guest_trap(vector, regs, 0);
2755 return;
2758 else if ( is_pv_32on64_vcpu(v) && regs->error_code )
2760 emulate_gate_op(regs);
2761 return;
2764 /* Emulate some simple privileged and I/O instructions. */
2765 if ( (regs->error_code == 0) &&
2766 emulate_privileged_op(regs) )
2768 trace_trap_one_addr(TRC_PV_EMULATE_PRIVOP, regs->eip);
2769 return;
2772 #if defined(__i386__)
2773 if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
2774 (regs->error_code == 0) &&
2775 gpf_emulate_4gb(regs) )
2777 TRACE_1D(TRC_PV_EMULATE_4GB, regs->eip);
2778 return;
2780 #endif
2782 /* Pass on GPF as is. */
2783 do_guest_trap(TRAP_gp_fault, regs, 1);
2784 return;
2786 gp_in_kernel:
2788 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
2790 dprintk(XENLOG_INFO, "GPF (%04x): %p -> %p\n",
2791 regs->error_code, _p(regs->eip), _p(fixup));
2792 regs->eip = fixup;
2793 return;
2796 DEBUGGER_trap_fatal(TRAP_gp_fault, regs);
2798 hardware_gp:
2799 show_execution_state(regs);
2800 panic("GENERAL PROTECTION FAULT\n[error_code=%04x]\n", regs->error_code);
2803 static DEFINE_PER_CPU(struct softirq_trap, softirq_trap);
2805 static void nmi_mce_softirq(void)
2807 int cpu = smp_processor_id();
2808 struct softirq_trap *st = &per_cpu(softirq_trap, cpu);
2809 cpumask_t affinity;
2811 BUG_ON(st == NULL);
2812 BUG_ON(st->vcpu == NULL);
2814 /* Set the tmp value unconditionally, so that
2815 * the check in the iret hypercall works. */
2816 st->vcpu->cpu_affinity_tmp = st->vcpu->cpu_affinity;
2818 if ((cpu != st->processor)
2819 || (st->processor != st->vcpu->processor))
2821 /* We are on a different physical CPU.
2822 * Make sure to wake up the vcpu on the
2823 * specified processor.
2824 */
2825 cpus_clear(affinity);
2826 cpu_set(st->processor, affinity);
2827 vcpu_set_affinity(st->vcpu, &affinity);
2829 /* Affinity is restored in the iret hypercall. */
2832 /* Only used to defer wakeup of domain/vcpu to
2833 * a safe (non-NMI/MCE) context.
2834 */
2835 vcpu_kick(st->vcpu);
2838 static void nmi_dom0_report(unsigned int reason_idx)
2840 struct domain *d = dom0;
2842 if ( (d == NULL) || (d->vcpu[0] == NULL) )
2843 return;
2845 set_bit(reason_idx, nmi_reason(d));
2847 send_guest_trap(d, 0, TRAP_nmi);
2850 asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
2852 switch ( opt_nmi[0] )
2854 case 'd': /* 'dom0' */
2855 nmi_dom0_report(_XEN_NMIREASON_parity_error);
2856 case 'i': /* 'ignore' */
2857 break;
2858 default: /* 'fatal' */
2859 console_force_unlock();
2860 printk("\n\nNMI - MEMORY ERROR\n");
2861 fatal_trap(TRAP_nmi, regs);
2864 outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable parity check */
2865 mdelay(1);
2866 outb((inb(0x61) & 0x0b) | 0x00, 0x61); /* enable parity check */
2869 asmlinkage void io_check_error(struct cpu_user_regs *regs)
2871 switch ( opt_nmi[0] )
2873 case 'd': /* 'dom0' */
2874 nmi_dom0_report(_XEN_NMIREASON_io_error);
2875 case 'i': /* 'ignore' */
2876 break;
2877 default: /* 'fatal' */
2878 console_force_unlock();
2879 printk("\n\nNMI - I/O ERROR\n");
2880 fatal_trap(TRAP_nmi, regs);
2883 outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
2884 mdelay(1);
2885 outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
2888 static void unknown_nmi_error(unsigned char reason)
2890 switch ( opt_nmi[0] )
2892 case 'd': /* 'dom0' */
2893 nmi_dom0_report(_XEN_NMIREASON_unknown);
2894 case 'i': /* 'ignore' */
2895 break;
2896 default: /* 'fatal' */
2897 printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
2898 printk("Dazed and confused, but trying to continue\n");
2899 printk("Do you have a strange power saving mode enabled?\n");
2900 kexec_crash();
2904 static int dummy_nmi_callback(struct cpu_user_regs *regs, int cpu)
2906 return 0;
2909 static nmi_callback_t nmi_callback = dummy_nmi_callback;
2911 asmlinkage void do_nmi(struct cpu_user_regs *regs)
2913 unsigned int cpu = smp_processor_id();
2914 unsigned char reason;
2916 ++nmi_count(cpu);
2918 if ( nmi_callback(regs, cpu) )
2919 return;
2921 if ( nmi_watchdog )
2922 nmi_watchdog_tick(regs);
2924 /* Only the BSP gets external NMIs from the system. */
2925 if ( cpu == 0 )
2927 reason = inb(0x61);
2928 if ( reason & 0x80 )
2929 mem_parity_error(regs);
2930 else if ( reason & 0x40 )
2931 io_check_error(regs);
2932 else if ( !nmi_watchdog )
2933 unknown_nmi_error((unsigned char)(reason&0xff));
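/*
 * Descriptive note (added): port 0x61 is the legacy NMI status/control
 * port. On read, bit 7 reports a memory parity/ECC error and bit 6 an
 * I/O channel check, which is how the reason byte is decoded above; any
 * other reason is treated as an unknown NMI unless the NMI watchdog is
 * active and assumed to have generated it.
 */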
2937 void set_nmi_callback(nmi_callback_t callback)
2939 nmi_callback = callback;
2942 void unset_nmi_callback(void)
2944 nmi_callback = dummy_nmi_callback;
2947 asmlinkage void do_device_not_available(struct cpu_user_regs *regs)
2949 struct vcpu *curr = current;
2951 BUG_ON(!guest_mode(regs));
2953 setup_fpu(curr);
2955 if ( curr->arch.guest_context.ctrlreg[0] & X86_CR0_TS )
2957 do_guest_trap(TRAP_no_device, regs, 0);
2958 curr->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
2960 else
2961 TRACE_0D(TRC_PV_MATH_STATE_RESTORE);
2963 return;
2966 asmlinkage void do_debug(struct cpu_user_regs *regs)
2968 struct vcpu *v = current;
2970 DEBUGGER_trap_entry(TRAP_debug, regs);
2972 if ( !guest_mode(regs) )
2974 if ( regs->eflags & EF_TF )
2976 #ifdef __x86_64__
2977 void sysenter_entry(void);
2978 void sysenter_eflags_saved(void);
2979 /* In SYSENTER entry path we can't zap TF until EFLAGS is saved. */
2980 if ( (regs->rip >= (unsigned long)sysenter_entry) &&
2981 (regs->rip < (unsigned long)sysenter_eflags_saved) )
2982 goto out;
2983 WARN_ON(regs->rip != (unsigned long)sysenter_eflags_saved);
2984 #else
2985 WARN_ON(1);
2986 #endif
2987 regs->eflags &= ~EF_TF;
2989 else
2991 /*
2992 * We ignore watchpoints when they trigger within Xen. This may
2993 * happen when a buffer is passed to us which previously had a
2994 * watchpoint set on it. No need to bump EIP; the only faulting
2995 * trap is an instruction breakpoint, which can't happen to us.
2996 */
2997 WARN_ON(!search_exception_table(regs->eip));
2999 goto out;
3002 /* Save debug status register where guest OS can peek at it */
3003 v->arch.guest_context.debugreg[6] = read_debugreg(6);
3005 ler_enable();
3006 do_guest_trap(TRAP_debug, regs, 0);
3007 return;
3009 out:
3010 ler_enable();
3011 return;
3014 asmlinkage void do_spurious_interrupt_bug(struct cpu_user_regs *regs)
3018 static void __set_intr_gate(unsigned int n, uint32_t dpl, void *addr)
3020 int i;
3021 /* Keep secondary tables in sync with IRQ updates. */
3022 for ( i = 1; i < NR_CPUS; i++ )
3023 if ( idt_tables[i] != NULL )
3024 _set_gate(&idt_tables[i][n], 14, dpl, addr);
3025 _set_gate(&idt_table[n], 14, dpl, addr);
3028 static void set_swint_gate(unsigned int n, void *addr)
3030 __set_intr_gate(n, 3, addr);
3033 void set_intr_gate(unsigned int n, void *addr)
3035 __set_intr_gate(n, 0, addr);
3038 void load_TR(void)
3040 struct tss_struct *tss = &init_tss[smp_processor_id()];
3041 struct desc_ptr old_gdt, tss_gdt = {
3042 .base = (long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY),
3043 .limit = LAST_RESERVED_GDT_BYTE
3044 };
3046 _set_tssldt_desc(
3047 this_cpu(gdt_table) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
3048 (unsigned long)tss,
3049 offsetof(struct tss_struct, __cacheline_filler) - 1,
3050 9);
3051 #ifdef CONFIG_COMPAT
3052 _set_tssldt_desc(
3053 this_cpu(compat_gdt_table) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
3054 (unsigned long)tss,
3055 offsetof(struct tss_struct, __cacheline_filler) - 1,
3056 11);
3057 #endif
3059 /* Switch to non-compat GDT (which has B bit clear) to execute LTR. */
3060 asm volatile (
3061 "sgdt %0; lgdt %2; ltr %w1; lgdt %0"
3062 : "=m" (old_gdt) : "rm" (TSS_ENTRY << 3), "m" (tss_gdt) : "memory" );
3065 void __devinit percpu_traps_init(void)
3067 subarch_percpu_traps_init();
3069 if ( !opt_ler )
3070 return;
3072 switch ( boot_cpu_data.x86_vendor )
3074 case X86_VENDOR_INTEL:
3075 switch ( boot_cpu_data.x86 )
3077 case 6:
3078 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
3079 break;
3080 case 15:
3081 this_cpu(ler_msr) = MSR_P4_LER_FROM_LIP;
3082 break;
3084 break;
3085 case X86_VENDOR_AMD:
3086 switch ( boot_cpu_data.x86 )
3088 case 6:
3089 case 15:
3090 case 16:
3091 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
3092 break;
3094 break;
3097 ler_enable();
3100 void __init trap_init(void)
3102 /*
3103 * Note that interrupt gates are always used, rather than trap gates. We
3104 * must have interrupts disabled until DS/ES/FS/GS are saved because the
3105 * first activation must have the "bad" value(s) for these registers and
3106 * we may lose them if another activation is installed before they are
3107 * saved. The page-fault handler also needs interrupts disabled until %cr2
3108 * has been read and saved on the stack.
3109 */
3110 set_intr_gate(TRAP_divide_error,&divide_error);
3111 set_intr_gate(TRAP_debug,&debug);
3112 set_intr_gate(TRAP_nmi,&nmi);
3113 set_swint_gate(TRAP_int3,&int3); /* usable from all privileges */
3114 set_swint_gate(TRAP_overflow,&overflow); /* usable from all privileges */
3115 set_intr_gate(TRAP_bounds,&bounds);
3116 set_intr_gate(TRAP_invalid_op,&invalid_op);
3117 set_intr_gate(TRAP_no_device,&device_not_available);
3118 set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
3119 set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
3120 set_intr_gate(TRAP_no_segment,&segment_not_present);
3121 set_intr_gate(TRAP_stack_error,&stack_segment);
3122 set_intr_gate(TRAP_gp_fault,&general_protection);
3123 set_intr_gate(TRAP_page_fault,&page_fault);
3124 set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
3125 set_intr_gate(TRAP_copro_error,&coprocessor_error);
3126 set_intr_gate(TRAP_alignment_check,&alignment_check);
3127 set_intr_gate(TRAP_machine_check,&machine_check);
3128 set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
3130 /* CPU0 uses the master IDT. */
3131 idt_tables[0] = idt_table;
3133 percpu_traps_init();
3135 cpu_init();
3137 open_softirq(NMI_MCE_SOFTIRQ, nmi_mce_softirq);
3140 long register_guest_nmi_callback(unsigned long address)
3142 struct vcpu *v = current;
3143 struct domain *d = v->domain;
3144 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
3146 t->vector = TRAP_nmi;
3147 t->flags = 0;
3148 t->cs = (is_pv_32on64_domain(d) ?
3149 FLAT_COMPAT_KERNEL_CS : FLAT_KERNEL_CS);
3150 t->address = address;
3151 TI_SET_IF(t, 1);
3153 /*
3154 * If no handler was registered we can 'lose the NMI edge'. Re-assert it
3155 * now.
3156 */
3157 if ( (v->vcpu_id == 0) && (arch_get_nmi_reason(d) != 0) )
3158 v->nmi_pending = 1;
3160 return 0;
3163 long unregister_guest_nmi_callback(void)
3165 struct vcpu *v = current;
3166 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
3168 memset(t, 0, sizeof(*t));
3170 return 0;
3173 int guest_has_trap_callback(struct domain *d, uint16_t vcpuid, unsigned int trap_nr)
3175 struct vcpu *v;
3176 struct trap_info *t;
3178 BUG_ON(d == NULL);
3179 BUG_ON(vcpuid >= MAX_VIRT_CPUS);
3182 /* Sanity check - XXX should be more fine-grained. */
3182 BUG_ON(trap_nr > TRAP_syscall);
3184 v = d->vcpu[vcpuid];
3185 t = &v->arch.guest_context.trap_ctxt[trap_nr];
3187 return (t->address != 0);
3191 int send_guest_trap(struct domain *d, uint16_t vcpuid, unsigned int trap_nr)
3193 struct vcpu *v;
3194 struct softirq_trap *st;
3196 BUG_ON(d == NULL);
3197 BUG_ON(vcpuid >= MAX_VIRT_CPUS);
3198 v = d->vcpu[vcpuid];
3200 switch (trap_nr) {
3201 case TRAP_nmi:
3202 if ( !test_and_set_bool(v->nmi_pending) ) {
3203 st = &per_cpu(softirq_trap, smp_processor_id());
3204 st->domain = dom0;
3205 st->vcpu = dom0->vcpu[0];
3206 st->processor = st->vcpu->processor;
3208 /* not safe to wake up a vcpu here */
3209 raise_softirq(NMI_MCE_SOFTIRQ);
3210 return 0;
3212 break;
3214 case TRAP_machine_check:
3216 /* We are called by the machine check (exception or polling) handlers
3217 * on the physical CPU that reported a machine check error. */
3219 if ( !test_and_set_bool(v->mce_pending) ) {
3220 st = &per_cpu(softirq_trap, smp_processor_id());
3221 st->domain = d;
3222 st->vcpu = v;
3223 st->processor = v->processor;
3225 /* not safe to wake up a vcpu here */
3226 raise_softirq(NMI_MCE_SOFTIRQ);
3227 return 0;
3229 break;
3232 /* delivery failed */
3233 return -EIO;
3237 long do_set_trap_table(XEN_GUEST_HANDLE(const_trap_info_t) traps)
3239 struct trap_info cur;
3240 struct vcpu *curr = current;
3241 struct trap_info *dst = curr->arch.guest_context.trap_ctxt;
3242 long rc = 0;
3244 /* If no table is presented then clear the entire virtual IDT. */
3245 if ( guest_handle_is_null(traps) )
3247 memset(dst, 0, 256 * sizeof(*dst));
3248 init_int80_direct_trap(curr);
3249 return 0;
3252 for ( ; ; )
3254 if ( hypercall_preempt_check() )
3256 rc = hypercall_create_continuation(
3257 __HYPERVISOR_set_trap_table, "h", traps);
3258 break;
3261 if ( copy_from_guest(&cur, traps, 1) )
3263 rc = -EFAULT;
3264 break;
3267 if ( cur.address == 0 )
3268 break;
3270 fixup_guest_code_selector(curr->domain, cur.cs);
3272 memcpy(&dst[cur.vector], &cur, sizeof(cur));
3274 if ( cur.vector == 0x80 )
3275 init_int80_direct_trap(curr);
3277 guest_handle_add_offset(traps, 1);
3280 return rc;
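/*
 * Illustrative sketch (not from the original source): a PV guest populates
 * its virtual IDT through this hypercall using the public struct trap_info
 * layout (vector, flags, cs, address); a zero address terminates the
 * table, matching the loop above. For example:
 *
 *     static struct trap_info traps[] = {
 *         { 14, 0, FLAT_KERNEL_CS, (unsigned long)page_fault_entry },
 *         {  0, 0, 0, 0 }
 *     };
 *     HYPERVISOR_set_trap_table(traps);
 *
 * (page_fault_entry is a hypothetical guest-side handler symbol.)
 */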
3283 long set_debugreg(struct vcpu *v, int reg, unsigned long value)
3285 int i;
3286 struct vcpu *curr = current;
3288 switch ( reg )
3290 case 0:
3291 if ( !access_ok(value, sizeof(long)) )
3292 return -EPERM;
3293 if ( v == curr )
3294 write_debugreg(0, value);
3295 break;
3296 case 1:
3297 if ( !access_ok(value, sizeof(long)) )
3298 return -EPERM;
3299 if ( v == curr )
3300 write_debugreg(1, value);
3301 break;
3302 case 2:
3303 if ( !access_ok(value, sizeof(long)) )
3304 return -EPERM;
3305 if ( v == curr )
3306 write_debugreg(2, value);
3307 break;
3308 case 3:
3309 if ( !access_ok(value, sizeof(long)) )
3310 return -EPERM;
3311 if ( v == curr )
3312 write_debugreg(3, value);
3313 break;
3314 case 6:
3315 /*
3316 * DR6: Bits 4-11,16-31 reserved (set to 1).
3317 * Bit 12 reserved (set to 0).
3318 */
3319 value &= 0xffffefff; /* reserved bits => 0 */
3320 value |= 0xffff0ff0; /* reserved bits => 1 */
3321 if ( v == curr )
3322 write_debugreg(6, value);
3323 break;
3324 case 7:
3325 /*
3326 * DR7: Bit 10 reserved (set to 1).
3327 * Bits 11-12,14-15 reserved (set to 0).
3328 */
3329 value &= ~DR_CONTROL_RESERVED_ZERO; /* reserved bits => 0 */
3330 value |= DR_CONTROL_RESERVED_ONE; /* reserved bits => 1 */
3331 /*
3332 * Privileged bits:
3333 * GD (bit 13): must be 0.
3334 */
3335 if ( value & DR_GENERAL_DETECT )
3336 return -EPERM;
3337 /* DR7.{G,L}E = 0 => debugging disabled for this domain. */
3338 if ( value & DR7_ACTIVE_MASK )
3340 unsigned int io_enable = 0;
3342 for ( i = DR_CONTROL_SHIFT; i < 32; i += DR_CONTROL_SIZE )
3344 if ( ((value >> i) & 3) == DR_IO )
3346 if ( !(v->arch.guest_context.ctrlreg[4] & X86_CR4_DE) )
3347 return -EPERM;
3348 io_enable |= value & (3 << ((i - 16) >> 1));
3350 #ifdef __i386__
3351 if ( ((boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) ||
3352 !boot_cpu_has(X86_FEATURE_LM)) &&
3353 (((value >> i) & 0xc) == DR_LEN_8) )
3354 return -EPERM;
3355 #endif
3358 /* Guest DR5 is a handy stash for I/O intercept information. */
3359 v->arch.guest_context.debugreg[5] = io_enable;
3360 value &= ~io_enable;
3362 /*
3363 * If DR7 was previously clear then we need to load all other
3364 * debug registers at this point as they were not restored during
3365 * context switch.
3366 */
3367 if ( (v == curr) &&
3368 !(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
3370 write_debugreg(0, v->arch.guest_context.debugreg[0]);
3371 write_debugreg(1, v->arch.guest_context.debugreg[1]);
3372 write_debugreg(2, v->arch.guest_context.debugreg[2]);
3373 write_debugreg(3, v->arch.guest_context.debugreg[3]);
3374 write_debugreg(6, v->arch.guest_context.debugreg[6]);
3377 if ( v == curr )
3378 write_debugreg(7, value);
3379 break;
3380 default:
3381 return -EINVAL;
3384 v->arch.guest_context.debugreg[reg] = value;
3385 return 0;
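/*
 * Descriptive note (added): in the DR7 handling above, breakpoint slots
 * programmed as I/O breakpoints (R/W bits == DR_IO) are permitted only
 * when the guest has CR4.DE set; their enable bits are stripped from the
 * value written to the real DR7 and stashed in the guest's debugreg[5],
 * which do_get_debugreg() folds back in when DR7 is read.
 */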
3388 long do_set_debugreg(int reg, unsigned long value)
3390 return set_debugreg(current, reg, value);
3393 unsigned long do_get_debugreg(int reg)
3395 struct vcpu *curr = current;
3397 switch ( reg )
3399 case 0 ... 3:
3400 case 6:
3401 return curr->arch.guest_context.debugreg[reg];
3402 case 7:
3403 return (curr->arch.guest_context.debugreg[7] |
3404 curr->arch.guest_context.debugreg[5]);
3405 case 4 ... 5:
3406 return ((curr->arch.guest_context.ctrlreg[4] & X86_CR4_DE) ?
3407 curr->arch.guest_context.debugreg[reg + 2] : 0);
3410 return -EINVAL;
3413 /*
3414 * Local variables:
3415 * mode: C
3416 * c-set-style: "BSD"
3417 * c-basic-offset: 4
3418 * tab-width: 4
3419 * indent-tabs-mode: nil
3420 * End:
3421 */