xen/arch/x86/traps.c @ 18432:1e98ea5c8604 (ia64/xen-unstable)

x86: Fix guest_handle_okay/guest_handle_subrange_okay

The guest handle checks should use paging_* predicates, not shadow_*.
Also tidy up a few places where p2m definitions were being imported
via asm/guest_access.h -> asm/shadow.h -> asm/p2m.h
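
A minimal sketch of the resulting check (illustrative only, not the verbatim
patch), assuming the asm-x86/guest_access.h definitions of that era:

    #define guest_handle_okay(hnd, nr)                       \
        (paging_mode_external(current->domain) ||            \
         array_access_ok((hnd).p, (nr), sizeof(*(hnd).p)))

i.e. the former shadow_mode_external() test becomes paging_mode_external().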

Signed-off-by: Tim Deegan <Tim.Deegan@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed Sep 03 14:16:35 2008 +0100
parents b6eea72ea9dc
children a5cc38391afb
1 /******************************************************************************
2 * arch/x86/traps.c
3 *
4 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * Copyright (C) 1991, 1992 Linus Torvalds
23 *
24 * Pentium III FXSR, SSE support
25 * Gareth Hughes <gareth@valinux.com>, May 2000
26 */
28 #include <xen/config.h>
29 #include <xen/init.h>
30 #include <xen/sched.h>
31 #include <xen/lib.h>
32 #include <xen/errno.h>
33 #include <xen/mm.h>
34 #include <xen/console.h>
35 #include <xen/shutdown.h>
36 #include <asm/regs.h>
37 #include <xen/delay.h>
38 #include <xen/event.h>
39 #include <xen/spinlock.h>
40 #include <xen/irq.h>
41 #include <xen/perfc.h>
42 #include <xen/softirq.h>
43 #include <xen/domain_page.h>
44 #include <xen/symbols.h>
45 #include <xen/iocap.h>
46 #include <xen/nmi.h>
47 #include <xen/version.h>
48 #include <xen/kexec.h>
49 #include <xen/trace.h>
50 #include <xen/paging.h>
51 #include <asm/system.h>
52 #include <asm/io.h>
53 #include <asm/atomic.h>
54 #include <asm/desc.h>
55 #include <asm/debugreg.h>
56 #include <asm/smp.h>
57 #include <asm/flushtlb.h>
58 #include <asm/uaccess.h>
59 #include <asm/i387.h>
60 #include <asm/debugger.h>
61 #include <asm/msr.h>
62 #include <asm/shared.h>
63 #include <asm/x86_emulate.h>
64 #include <asm/traps.h>
65 #include <asm/hvm/vpt.h>
66 #include <public/arch-x86/cpuid.h>
68 /*
69 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
70 * fatal: Xen prints diagnostic message and then hangs.
71 * dom0: The NMI is virtualised to DOM0.
72 * ignore: The NMI error is cleared and ignored.
73 */
74 #ifdef NDEBUG
75 char opt_nmi[10] = "dom0";
76 #else
77 char opt_nmi[10] = "fatal";
78 #endif
79 string_param("nmi", opt_nmi);
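/* string_param() registers "nmi" as a boot-time option, so booting Xen with
   e.g. "nmi=ignore" overrides the default above ("dom0" in release builds,
   "fatal" in debug builds). */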
81 DEFINE_PER_CPU(u32, ler_msr);
83 /* Master table, used by CPU0. */
84 idt_entry_t idt_table[IDT_ENTRIES];
86 /* Pointer to the IDT of every CPU. */
87 idt_entry_t *idt_tables[NR_CPUS] __read_mostly;
89 #define DECLARE_TRAP_HANDLER(_name) \
90 asmlinkage void _name(void); \
91 asmlinkage void do_ ## _name(struct cpu_user_regs *regs)
93 DECLARE_TRAP_HANDLER(divide_error);
94 DECLARE_TRAP_HANDLER(debug);
95 DECLARE_TRAP_HANDLER(nmi);
96 DECLARE_TRAP_HANDLER(int3);
97 DECLARE_TRAP_HANDLER(overflow);
98 DECLARE_TRAP_HANDLER(bounds);
99 DECLARE_TRAP_HANDLER(invalid_op);
100 DECLARE_TRAP_HANDLER(device_not_available);
101 DECLARE_TRAP_HANDLER(coprocessor_segment_overrun);
102 DECLARE_TRAP_HANDLER(invalid_TSS);
103 DECLARE_TRAP_HANDLER(segment_not_present);
104 DECLARE_TRAP_HANDLER(stack_segment);
105 DECLARE_TRAP_HANDLER(general_protection);
106 DECLARE_TRAP_HANDLER(page_fault);
107 DECLARE_TRAP_HANDLER(coprocessor_error);
108 DECLARE_TRAP_HANDLER(simd_coprocessor_error);
109 DECLARE_TRAP_HANDLER(machine_check);
110 DECLARE_TRAP_HANDLER(alignment_check);
111 DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
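/* Each DECLARE_TRAP_HANDLER(name) above expands to a pair of prototypes,
   e.g. DECLARE_TRAP_HANDLER(page_fault); becomes:
       asmlinkage void page_fault(void);                           (asm entry stub)
       asmlinkage void do_page_fault(struct cpu_user_regs *regs);  (C handler)  */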
113 long do_set_debugreg(int reg, unsigned long value);
114 unsigned long do_get_debugreg(int reg);
115 void (*ioemul_handle_quirk)(
116 u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs);
118 static int debug_stack_lines = 20;
119 integer_param("debug_stack_lines", debug_stack_lines);
121 static int opt_ler;
122 boolean_param("ler", opt_ler);
124 #ifdef CONFIG_X86_32
125 #define stack_words_per_line 8
126 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
127 #else
128 #define stack_words_per_line 4
129 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
130 #endif
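/* ESP_BEFORE_EXCEPTION gives the stack pointer at the time of the fault:
   on x86-64 the CPU always pushes SS:RSP, so regs->rsp is used directly;
   on x86-32 a same-privilege trap pushes no SS/ESP, so the interrupted
   stack pointer is simply the address of the regs->esp slot itself. */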
132 static void show_guest_stack(struct cpu_user_regs *regs)
133 {
134 int i;
135 struct vcpu *curr = current;
136 unsigned long *stack, addr;
138 if ( is_hvm_vcpu(curr) )
139 return;
141 if ( is_pv_32on64_vcpu(curr) )
142 {
143 compat_show_guest_stack(regs, debug_stack_lines);
144 return;
145 }
147 if ( vm86_mode(regs) )
148 {
149 stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff));
150 printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ",
151 regs->ss, (uint16_t)(regs->esp & 0xffff));
152 }
153 else
154 {
155 stack = (unsigned long *)regs->esp;
156 printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
157 }
159 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
160 {
161 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
162 break;
163 if ( get_user(addr, stack) )
164 {
165 if ( i != 0 )
166 printk("\n ");
167 printk("Fault while accessing guest memory.");
168 i = 1;
169 break;
170 }
171 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
172 printk("\n ");
173 printk(" %p", _p(addr));
174 stack++;
175 }
176 if ( i == 0 )
177 printk("Stack empty.");
178 printk("\n");
179 }
181 #if !defined(CONFIG_FRAME_POINTER)
183 static void show_trace(struct cpu_user_regs *regs)
184 {
185 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
187 printk("Xen call trace:\n ");
189 printk("[<%p>]", _p(regs->eip));
190 print_symbol(" %s\n ", regs->eip);
192 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
193 {
194 addr = *stack++;
195 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
196 {
197 printk("[<%p>]", _p(addr));
198 print_symbol(" %s\n ", addr);
199 }
200 }
202 printk("\n");
203 }
205 #else
207 static void show_trace(struct cpu_user_regs *regs)
208 {
209 unsigned long *frame, next, addr, low, high;
211 printk("Xen call trace:\n ");
213 printk("[<%p>]", _p(regs->eip));
214 print_symbol(" %s\n ", regs->eip);
216 /* Bounds for range of valid frame pointer. */
217 low = (unsigned long)(ESP_BEFORE_EXCEPTION(regs) - 2);
218 high = (low & ~(STACK_SIZE - 1)) +
219 (STACK_SIZE - sizeof(struct cpu_info) - 2*sizeof(unsigned long));
221 /* The initial frame pointer. */
222 next = regs->ebp;
224 for ( ; ; )
225 {
226 /* Valid frame pointer? */
227 if ( (next < low) || (next >= high) )
228 {
229 /*
230 * Exception stack frames have a different layout, denoted by an
231 * inverted frame pointer.
232 */
233 next = ~next;
234 if ( (next < low) || (next >= high) )
235 break;
236 frame = (unsigned long *)next;
237 next = frame[0];
238 addr = frame[(offsetof(struct cpu_user_regs, eip) -
239 offsetof(struct cpu_user_regs, ebp))
240 / BYTES_PER_LONG];
241 }
242 else
243 {
244 /* Ordinary stack frame. */
245 frame = (unsigned long *)next;
246 next = frame[0];
247 addr = frame[1];
248 }
250 printk("[<%p>]", _p(addr));
251 print_symbol(" %s\n ", addr);
253 low = (unsigned long)&frame[2];
254 }
256 printk("\n");
257 }
259 #endif
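/* Frame layout assumed by the walk above: for an ordinary frame, frame[0]
   holds the caller's saved EBP (the next frame) and frame[1] the return
   address.  Exception frames are marked by storing the bitwise NOT of the
   frame pointer, in which case the return address is instead read from the
   saved cpu_user_regs at the EIP offset. */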
261 void show_stack(struct cpu_user_regs *regs)
262 {
263 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
264 int i;
266 if ( guest_mode(regs) )
267 return show_guest_stack(regs);
269 printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
271 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
272 {
273 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
274 break;
275 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
276 printk("\n ");
277 addr = *stack++;
278 printk(" %p", _p(addr));
279 }
280 if ( i == 0 )
281 printk("Stack empty.");
282 printk("\n");
284 show_trace(regs);
285 }
287 void show_stack_overflow(unsigned int cpu, unsigned long esp)
288 {
289 #ifdef MEMORY_GUARD
290 unsigned long esp_top, esp_bottom;
291 unsigned long *stack, addr;
293 esp_bottom = (esp | (STACK_SIZE - 1)) + 1;
294 esp_top = esp_bottom - PRIMARY_STACK_SIZE;
296 printk("Valid stack range: %p-%p, sp=%p, tss.esp0=%p\n",
297 (void *)esp_top, (void *)esp_bottom, (void *)esp,
298 (void *)init_tss[cpu].esp0);
300 /* Trigger overflow trace if %esp is within 512 bytes of the guard page. */
301 if ( ((unsigned long)(esp - esp_top) > 512) &&
302 ((unsigned long)(esp_top - esp) > 512) )
303 {
304 printk("No stack overflow detected. Skipping stack trace.\n");
305 return;
306 }
308 if ( esp < esp_top )
309 esp = esp_top;
311 printk("Xen stack overflow (dumping trace %p-%p):\n ",
312 (void *)esp, (void *)esp_bottom);
314 stack = (unsigned long *)esp;
315 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
316 {
317 addr = *stack++;
318 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
319 {
320 printk("%p: [<%p>]", stack, _p(addr));
321 print_symbol(" %s\n ", addr);
322 }
323 }
325 printk("\n");
326 #endif
327 }
329 void show_execution_state(struct cpu_user_regs *regs)
330 {
331 show_registers(regs);
332 show_stack(regs);
333 }
335 void vcpu_show_execution_state(struct vcpu *v)
336 {
337 printk("*** Dumping Dom%d vcpu#%d state: ***\n",
338 v->domain->domain_id, v->vcpu_id);
340 if ( v == current )
341 {
342 show_execution_state(guest_cpu_user_regs());
343 return;
344 }
346 vcpu_pause(v); /* acceptably dangerous */
348 vcpu_show_registers(v);
349 /* Todo: map arbitrary vcpu's top guest stack page here. */
350 if ( (v->domain == current->domain) &&
351 guest_kernel_mode(v, &v->arch.guest_context.user_regs) )
352 show_guest_stack(&v->arch.guest_context.user_regs);
354 vcpu_unpause(v);
355 }
357 char *trapstr(int trapnr)
358 {
359 static char *strings[] = {
360 "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
361 "invalid opcode", "device not available", "double fault",
362 "coprocessor segment", "invalid tss", "segment not found",
363 "stack error", "general protection fault", "page fault",
364 "spurious interrupt", "coprocessor error", "alignment check",
365 "machine check", "simd error"
366 };
368 if ( (trapnr < 0) || (trapnr >= ARRAY_SIZE(strings)) )
369 return "???";
371 return strings[trapnr];
372 }
374 /*
375 * This is called for faults at very unexpected times (e.g., when interrupts
376 * are disabled). In such situations we can't do much that is safe. We try to
377 * print out some tracing and then we just spin.
378 */
379 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs)
380 {
381 static DEFINE_PER_CPU(char, depth);
383 /*
384 * In some cases, we can end up in a vicious cycle of fatal_trap()s
385 * within fatal_trap()s. We give the problem a couple of iterations to
386 * bottom out, and then we just panic.
387 */
388 if ( ++this_cpu(depth) < 3 )
389 {
390 watchdog_disable();
391 console_start_sync();
393 show_execution_state(regs);
395 if ( trapnr == TRAP_page_fault )
396 {
397 unsigned long cr2 = read_cr2();
398 printk("Faulting linear address: %p\n", _p(cr2));
399 show_page_walk(cr2);
400 }
401 }
403 panic("FATAL TRAP: vector = %d (%s)\n"
404 "[error_code=%04x] %s\n",
405 trapnr, trapstr(trapnr), regs->error_code,
406 (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
407 }
409 static void do_guest_trap(
410 int trapnr, const struct cpu_user_regs *regs, int use_error_code)
411 {
412 struct vcpu *v = current;
413 struct trap_bounce *tb;
414 const struct trap_info *ti;
416 trace_pv_trap(trapnr, regs->eip, use_error_code, regs->error_code);
418 tb = &v->arch.trap_bounce;
419 ti = &v->arch.guest_context.trap_ctxt[trapnr];
421 tb->flags = TBF_EXCEPTION;
422 tb->cs = ti->cs;
423 tb->eip = ti->address;
425 if ( use_error_code )
426 {
427 tb->flags |= TBF_EXCEPTION_ERRCODE;
428 tb->error_code = regs->error_code;
429 }
431 if ( TI_GET_IF(ti) )
432 tb->flags |= TBF_INTERRUPT;
434 if ( unlikely(null_trap_bounce(v, tb)) )
435 gdprintk(XENLOG_WARNING, "Unhandled %s fault/trap [#%d] "
436 "on VCPU %d [ec=%04x]\n",
437 trapstr(trapnr), trapnr, v->vcpu_id, regs->error_code);
438 }
440 static void instruction_done(
441 struct cpu_user_regs *regs, unsigned long eip, unsigned int bpmatch)
442 {
443 regs->eip = eip;
444 regs->eflags &= ~X86_EFLAGS_RF;
445 if ( bpmatch || (regs->eflags & X86_EFLAGS_TF) )
446 {
447 current->arch.guest_context.debugreg[6] |= bpmatch | 0xffff0ff0;
448 if ( regs->eflags & X86_EFLAGS_TF )
449 current->arch.guest_context.debugreg[6] |= 0x4000;
450 do_guest_trap(TRAP_debug, regs, 0);
451 }
452 }
454 static unsigned int check_guest_io_breakpoint(struct vcpu *v,
455 unsigned int port, unsigned int len)
456 {
457 unsigned int width, i, match = 0;
458 unsigned long start;
460 if ( !(v->arch.guest_context.debugreg[5]) ||
461 !(v->arch.guest_context.ctrlreg[4] & X86_CR4_DE) )
462 return 0;
464 for ( i = 0; i < 4; i++ )
465 {
466 if ( !(v->arch.guest_context.debugreg[5] &
467 (3 << (i * DR_ENABLE_SIZE))) )
468 continue;
470 start = v->arch.guest_context.debugreg[i];
471 width = 0;
473 switch ( (v->arch.guest_context.debugreg[7] >>
474 (DR_CONTROL_SHIFT + i * DR_CONTROL_SIZE)) & 0xc )
475 {
476 case DR_LEN_1: width = 1; break;
477 case DR_LEN_2: width = 2; break;
478 case DR_LEN_4: width = 4; break;
479 case DR_LEN_8: width = 8; break;
480 }
482 if ( (start < (port + len)) && ((start + width) > port) )
483 match |= 1 << i;
484 }
486 return match;
487 }
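/* Worked example: if the guest programs DR0 = 0x60 as an I/O breakpoint with
   LEN = 4 (covering ports 0x60-0x63), a one-byte access to port 0x61
   satisfies (start < port+len) && (start+width > port), so bit 0 is set in
   the returned mask; instruction_done() later merges that mask into the
   virtual DR6.  Matching also requires CR4.DE and the enable bits cached in
   debugreg[5]. */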
489 /*
490 * Called from asm to set up the MCE trapbounce info.
491 * Returns 0 if no callback is set up, else 1.
492 */
493 asmlinkage int set_guest_machinecheck_trapbounce(void)
494 {
495 struct vcpu *v = current;
496 struct trap_bounce *tb = &v->arch.trap_bounce;
498 do_guest_trap(TRAP_machine_check, guest_cpu_user_regs(), 0);
499 tb->flags &= ~TBF_EXCEPTION; /* not needed for MCE delivery path */
500 return !null_trap_bounce(v, tb);
501 }
503 /*
504 * Called from asm to set up the NMI trapbounce info.
505 * Returns 0 if no callback is set up, else 1.
506 */
507 asmlinkage int set_guest_nmi_trapbounce(void)
508 {
509 struct vcpu *v = current;
510 struct trap_bounce *tb = &v->arch.trap_bounce;
511 do_guest_trap(TRAP_nmi, guest_cpu_user_regs(), 0);
512 tb->flags &= ~TBF_EXCEPTION; /* not needed for NMI delivery path */
513 return !null_trap_bounce(v, tb);
514 }
516 static inline void do_trap(
517 int trapnr, struct cpu_user_regs *regs, int use_error_code)
518 {
519 struct vcpu *curr = current;
520 unsigned long fixup;
522 DEBUGGER_trap_entry(trapnr, regs);
524 if ( guest_mode(regs) )
525 {
526 do_guest_trap(trapnr, regs, use_error_code);
527 return;
528 }
530 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
531 {
532 dprintk(XENLOG_ERR, "Trap %d: %p -> %p\n",
533 trapnr, _p(regs->eip), _p(fixup));
534 regs->eip = fixup;
535 return;
536 }
538 if ( ((trapnr == TRAP_copro_error) || (trapnr == TRAP_simd_error)) &&
539 is_hvm_vcpu(curr) && curr->arch.hvm_vcpu.fpu_exception_callback )
540 {
541 curr->arch.hvm_vcpu.fpu_exception_callback(
542 curr->arch.hvm_vcpu.fpu_exception_callback_arg, regs);
543 return;
544 }
546 DEBUGGER_trap_fatal(trapnr, regs);
548 show_execution_state(regs);
549 panic("FATAL TRAP: vector = %d (%s)\n"
550 "[error_code=%04x]\n",
551 trapnr, trapstr(trapnr), regs->error_code);
552 }
554 #define DO_ERROR_NOCODE(trapnr, name) \
555 asmlinkage void do_##name(struct cpu_user_regs *regs) \
556 { \
557 do_trap(trapnr, regs, 0); \
558 }
560 #define DO_ERROR(trapnr, name) \
561 asmlinkage void do_##name(struct cpu_user_regs *regs) \
562 { \
563 do_trap(trapnr, regs, 1); \
564 }
566 DO_ERROR_NOCODE(TRAP_divide_error, divide_error)
567 DO_ERROR_NOCODE(TRAP_overflow, overflow)
568 DO_ERROR_NOCODE(TRAP_bounds, bounds)
569 DO_ERROR_NOCODE(TRAP_copro_seg, coprocessor_segment_overrun)
570 DO_ERROR( TRAP_invalid_tss, invalid_TSS)
571 DO_ERROR( TRAP_no_segment, segment_not_present)
572 DO_ERROR( TRAP_stack_error, stack_segment)
573 DO_ERROR_NOCODE(TRAP_copro_error, coprocessor_error)
574 DO_ERROR( TRAP_alignment_check, alignment_check)
575 DO_ERROR_NOCODE(TRAP_simd_error, simd_coprocessor_error)
577 int rdmsr_hypervisor_regs(
578 uint32_t idx, uint32_t *eax, uint32_t *edx)
579 {
580 idx -= 0x40000000;
581 if ( idx > 0 )
582 return 0;
584 switch ( idx )
585 {
586 case 0:
587 {
588 *eax = *edx = 0;
589 break;
590 }
591 default:
592 BUG();
593 }
595 return 1;
596 }
598 int wrmsr_hypervisor_regs(
599 uint32_t idx, uint32_t eax, uint32_t edx)
600 {
601 struct domain *d = current->domain;
603 idx -= 0x40000000;
604 if ( idx > 0 )
605 return 0;
607 switch ( idx )
608 {
609 case 0:
610 {
611 void *hypercall_page;
612 unsigned long mfn;
613 unsigned long gmfn = ((unsigned long)edx << 20) | (eax >> 12);
614 unsigned int idx = eax & 0xfff;
616 if ( idx > 0 )
617 {
618 gdprintk(XENLOG_WARNING,
619 "Out of range index %u to MSR %08x\n",
620 idx, 0x40000000);
621 return 0;
622 }
624 mfn = gmfn_to_mfn(d, gmfn);
626 if ( !mfn_valid(mfn) ||
627 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
628 {
629 gdprintk(XENLOG_WARNING,
630 "Bad GMFN %lx (MFN %lx) to MSR %08x\n",
631 gmfn, mfn, 0x40000000);
632 return 0;
633 }
635 hypercall_page = map_domain_page(mfn);
636 hypercall_page_initialise(d, hypercall_page);
637 unmap_domain_page(hypercall_page);
639 put_page_and_type(mfn_to_page(mfn));
640 break;
641 }
643 default:
644 BUG();
645 }
647 return 1;
648 }
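/* Example: to have Xen initialise its hypercall page in guest frame 0x1234,
   a PV guest executes WRMSR with ECX = 0x40000000 and EDX:EAX = 0x1234 << 12,
   i.e. the gmfn in bits 63:12 and the (single) page index 0 in bits 11:0. */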
650 int cpuid_hypervisor_leaves(
651 uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
652 {
653 idx -= 0x40000000;
654 if ( idx > 2 )
655 return 0;
657 switch ( idx )
658 {
659 case 0:
660 *eax = 0x40000002; /* Largest leaf */
661 *ebx = XEN_CPUID_SIGNATURE_EBX;
662 *ecx = XEN_CPUID_SIGNATURE_ECX;
663 *edx = XEN_CPUID_SIGNATURE_EDX;
664 break;
666 case 1:
667 *eax = (xen_major_version() << 16) | xen_minor_version();
668 *ebx = 0; /* Reserved */
669 *ecx = 0; /* Reserved */
670 *edx = 0; /* Reserved */
671 break;
673 case 2:
674 *eax = 1; /* Number of hypercall-transfer pages */
675 *ebx = 0x40000000; /* MSR base address */
676 *ecx = 0; /* Features 1 */
677 *edx = 0; /* Features 2 */
678 if ( !is_hvm_vcpu(current) )
679 *ecx |= XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD;
680 break;
682 default:
683 BUG();
684 }
686 return 1;
687 }
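/* Example: CPUID with EAX = 0x40000000 returns EAX = 0x40000002 (highest
   hypervisor leaf) and the signature constants, which spell "XenVMMXenVMM"
   across EBX:ECX:EDX; leaf 0x40000002 then reports one hypercall page and
   MSR base 0x40000000. */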
689 static void pv_cpuid(struct cpu_user_regs *regs)
690 {
691 uint32_t a, b, c, d;
693 a = regs->eax;
694 b = regs->ebx;
695 c = regs->ecx;
696 d = regs->edx;
698 if ( current->domain->domain_id != 0 )
699 {
700 if ( !cpuid_hypervisor_leaves(a, &a, &b, &c, &d) )
701 domain_cpuid(current->domain, a, b, &a, &b, &c, &d);
702 goto out;
703 }
705 asm (
706 "cpuid"
707 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
708 : "0" (a), "1" (b), "2" (c), "3" (d) );
710 if ( (regs->eax & 0x7fffffff) == 1 )
711 {
712 /* Modify Feature Information. */
713 __clear_bit(X86_FEATURE_VME, &d);
714 __clear_bit(X86_FEATURE_PSE, &d);
715 __clear_bit(X86_FEATURE_PGE, &d);
716 __clear_bit(X86_FEATURE_MCE, &d);
717 __clear_bit(X86_FEATURE_MCA, &d);
718 __clear_bit(X86_FEATURE_PSE36, &d);
719 }
720 switch ( (uint32_t)regs->eax )
721 {
722 case 1:
723 /* Modify Feature Information. */
724 if ( !cpu_has_sep )
725 __clear_bit(X86_FEATURE_SEP, &d);
726 #ifdef __i386__
727 if ( !supervisor_mode_kernel )
728 __clear_bit(X86_FEATURE_SEP, &d);
729 #endif
730 __clear_bit(X86_FEATURE_DS, &d);
731 __clear_bit(X86_FEATURE_ACC, &d);
732 __clear_bit(X86_FEATURE_PBE, &d);
734 __clear_bit(X86_FEATURE_DTES64 % 32, &c);
735 __clear_bit(X86_FEATURE_MWAIT % 32, &c);
736 __clear_bit(X86_FEATURE_DSCPL % 32, &c);
737 __clear_bit(X86_FEATURE_VMXE % 32, &c);
738 __clear_bit(X86_FEATURE_SMXE % 32, &c);
739 __clear_bit(X86_FEATURE_TM2 % 32, &c);
740 if ( is_pv_32bit_vcpu(current) )
741 __clear_bit(X86_FEATURE_CX16 % 32, &c);
742 __clear_bit(X86_FEATURE_XTPR % 32, &c);
743 __clear_bit(X86_FEATURE_PDCM % 32, &c);
744 __clear_bit(X86_FEATURE_DCA % 32, &c);
745 break;
746 case 0x80000001:
747 /* Modify Feature Information. */
748 if ( is_pv_32bit_vcpu(current) )
749 {
750 __clear_bit(X86_FEATURE_LM % 32, &d);
751 __clear_bit(X86_FEATURE_LAHF_LM % 32, &c);
752 }
753 #ifndef __i386__
754 if ( is_pv_32on64_vcpu(current) &&
755 boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
756 #endif
757 __clear_bit(X86_FEATURE_SYSCALL % 32, &d);
758 __clear_bit(X86_FEATURE_PAGE1GB % 32, &d);
759 __clear_bit(X86_FEATURE_RDTSCP % 32, &d);
761 __clear_bit(X86_FEATURE_SVME % 32, &c);
762 __clear_bit(X86_FEATURE_OSVW % 32, &c);
763 __clear_bit(X86_FEATURE_IBS % 32, &c);
764 __clear_bit(X86_FEATURE_SKINIT % 32, &c);
765 __clear_bit(X86_FEATURE_WDT % 32, &c);
766 break;
767 case 5: /* MONITOR/MWAIT */
768 case 0xa: /* Architectural Performance Monitor Features */
769 case 0x8000000a: /* SVM revision and features */
770 case 0x8000001b: /* Instruction Based Sampling */
771 a = b = c = d = 0;
772 break;
773 default:
774 (void)cpuid_hypervisor_leaves(regs->eax, &a, &b, &c, &d);
775 break;
776 }
778 out:
779 regs->eax = a;
780 regs->ebx = b;
781 regs->ecx = c;
782 regs->edx = d;
783 }
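/* In short: domains other than dom0 are answered from the hypervisor leaves
   above or from the policy installed via domain_cpuid(), while dom0 sees the
   host CPUID with features a PV guest cannot use (PSE, PGE, MCE/MCA,
   VMX/SVM, MONITOR/MWAIT, etc.) masked out. */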
785 static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
786 {
787 char sig[5], instr[2];
788 unsigned long eip, rc;
790 eip = regs->eip;
792 /* Check for forced emulation signature: ud2 ; .ascii "xen". */
793 if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
794 {
795 propagate_page_fault(eip + sizeof(sig) - rc, 0);
796 return EXCRET_fault_fixed;
797 }
798 if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) )
799 return 0;
800 eip += sizeof(sig);
802 /* We only emulate CPUID. */
803 if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
804 {
805 propagate_page_fault(eip + sizeof(instr) - rc, 0);
806 return EXCRET_fault_fixed;
807 }
808 if ( memcmp(instr, "\xf\xa2", sizeof(instr)) )
809 return 0;
810 eip += sizeof(instr);
812 pv_cpuid(regs);
814 instruction_done(regs, eip, 0);
816 trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->eip);
818 return EXCRET_fault_fixed;
819 }
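/* The forced-emulation sequence checked above is the 7-byte string
   0f 0b 78 65 6e 0f a2 ("ud2; .ascii 'xen'; cpuid"); PV guests use it to
   request an emulated CPUID, and EIP is advanced past the whole sequence
   once pv_cpuid() has filled in the registers. */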
821 asmlinkage void do_invalid_op(struct cpu_user_regs *regs)
822 {
823 struct bug_frame bug;
824 struct bug_frame_str bug_str;
825 char *filename, *predicate, *eip = (char *)regs->eip;
826 unsigned long fixup;
827 int id, lineno;
829 DEBUGGER_trap_entry(TRAP_invalid_op, regs);
831 if ( likely(guest_mode(regs)) )
832 {
833 if ( !emulate_forced_invalid_op(regs) )
834 do_guest_trap(TRAP_invalid_op, regs, 0);
835 return;
836 }
838 if ( !is_kernel(eip) ||
839 __copy_from_user(&bug, eip, sizeof(bug)) ||
840 memcmp(bug.ud2, "\xf\xb", sizeof(bug.ud2)) ||
841 (bug.ret != 0xc2) )
842 goto die;
843 eip += sizeof(bug);
845 id = bug.id & 3;
847 if ( id == BUGFRAME_dump )
848 {
849 show_execution_state(regs);
850 regs->eip = (unsigned long)eip;
851 return;
852 }
854 /* WARN, BUG or ASSERT: decode the filename pointer and line number. */
855 if ( !is_kernel(eip) ||
856 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
857 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
858 goto die;
859 eip += sizeof(bug_str);
861 filename = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
862 lineno = bug.id >> 2;
864 if ( id == BUGFRAME_warn )
865 {
866 printk("Xen WARN at %.50s:%d\n", filename, lineno);
867 show_execution_state(regs);
868 regs->eip = (unsigned long)eip;
869 return;
870 }
872 if ( id == BUGFRAME_bug )
873 {
874 printk("Xen BUG at %.50s:%d\n", filename, lineno);
875 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
876 show_execution_state(regs);
877 panic("Xen BUG at %.50s:%d\n", filename, lineno);
878 }
880 /* ASSERT: decode the predicate string pointer. */
881 ASSERT(id == BUGFRAME_assert);
882 if ( !is_kernel(eip) ||
883 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
884 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
885 goto die;
886 eip += sizeof(bug_str);
888 predicate = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
889 printk("Assertion '%s' failed at %.50s:%d\n",
890 predicate, filename, lineno);
891 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
892 show_execution_state(regs);
893 panic("Assertion '%s' failed at %.50s:%d\n",
894 predicate, filename, lineno);
896 die:
897 if ( (fixup = search_exception_table(regs->eip)) != 0 )
898 {
899 regs->eip = fixup;
900 return;
901 }
902 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
903 show_execution_state(regs);
904 panic("FATAL TRAP: vector = %d (invalid opcode)\n", TRAP_invalid_op);
905 }
907 asmlinkage void do_int3(struct cpu_user_regs *regs)
908 {
909 DEBUGGER_trap_entry(TRAP_int3, regs);
911 if ( !guest_mode(regs) )
912 {
913 debugger_trap_fatal(TRAP_int3, regs);
914 return;
915 }
917 do_guest_trap(TRAP_int3, regs, 0);
918 }
920 asmlinkage void do_machine_check(struct cpu_user_regs *regs)
921 {
922 machine_check_vector(regs, regs->error_code);
923 }
925 static void reserved_bit_page_fault(
926 unsigned long addr, struct cpu_user_regs *regs)
927 {
928 printk("d%d:v%d: reserved bit in page table (ec=%04X)\n",
929 current->domain->domain_id, current->vcpu_id, regs->error_code);
930 show_page_walk(addr);
931 show_execution_state(regs);
932 }
934 void propagate_page_fault(unsigned long addr, u16 error_code)
935 {
936 struct trap_info *ti;
937 struct vcpu *v = current;
938 struct trap_bounce *tb = &v->arch.trap_bounce;
940 v->arch.guest_context.ctrlreg[2] = addr;
941 arch_set_cr2(v, addr);
943 /* Re-set error_code.user flag appropriately for the guest. */
944 error_code &= ~PFEC_user_mode;
945 if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
946 error_code |= PFEC_user_mode;
948 trace_pv_page_fault(addr, error_code);
950 ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
951 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
952 tb->error_code = error_code;
953 tb->cs = ti->cs;
954 tb->eip = ti->address;
955 if ( TI_GET_IF(ti) )
956 tb->flags |= TBF_INTERRUPT;
957 if ( unlikely(null_trap_bounce(v, tb)) )
958 {
959 printk("d%d:v%d: unhandled page fault (ec=%04X)\n",
960 v->domain->domain_id, v->vcpu_id, error_code);
961 show_page_walk(addr);
962 }
964 if ( unlikely(error_code & PFEC_reserved_bit) )
965 reserved_bit_page_fault(addr, guest_cpu_user_regs());
966 }
968 static int handle_gdt_ldt_mapping_fault(
969 unsigned long offset, struct cpu_user_regs *regs)
970 {
971 struct vcpu *curr = current;
972 /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
973 unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
974 unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
976 /* Should never fault in another vcpu's area. */
977 BUG_ON(vcpu_area != curr->vcpu_id);
979 /* Byte offset within the gdt/ldt sub-area. */
980 offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
982 if ( likely(is_ldt_area) )
983 {
984 /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
985 if ( likely(map_ldt_shadow_page(offset >> PAGE_SHIFT)) )
986 {
987 if ( guest_mode(regs) )
988 trace_trap_two_addr(TRC_PV_GDT_LDT_MAPPING_FAULT,
989 regs->eip, offset);
990 }
991 else
992 {
993 /* In hypervisor mode? Leave it to the #PF handler to fix up. */
994 if ( !guest_mode(regs) )
995 return 0;
996 /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */
997 propagate_page_fault(
998 curr->arch.guest_context.ldt_base + offset,
999 regs->error_code);
1000 }
1001 }
1002 else
1003 {
1004 /* GDT fault: handle the fault as #GP(selector). */
1005 regs->error_code = (u16)offset & ~7;
1006 (void)do_general_protection(regs);
1007 }
1009 return EXCRET_fault_fixed;
1010 }
1012 #ifdef HYPERVISOR_VIRT_END
1013 #define IN_HYPERVISOR_RANGE(va) \
1014 (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
1015 #else
1016 #define IN_HYPERVISOR_RANGE(va) \
1017 (((va) >= HYPERVISOR_VIRT_START))
1018 #endif
1020 static int __spurious_page_fault(
1021 unsigned long addr, struct cpu_user_regs *regs)
1022 {
1023 unsigned long mfn, cr3 = read_cr3();
1024 #if CONFIG_PAGING_LEVELS >= 4
1025 l4_pgentry_t l4e, *l4t;
1026 #endif
1027 #if CONFIG_PAGING_LEVELS >= 3
1028 l3_pgentry_t l3e, *l3t;
1029 #endif
1030 l2_pgentry_t l2e, *l2t;
1031 l1_pgentry_t l1e, *l1t;
1032 unsigned int required_flags, disallowed_flags;
1034 /*
1035 * We do not take spurious page faults in IRQ handlers as we do not
1036 * modify page tables in IRQ context. We therefore bail here because
1037 * map_domain_page() is not IRQ-safe.
1038 */
1039 if ( in_irq() )
1040 return 0;
1042 /* Reserved bit violations are never spurious faults. */
1043 if ( regs->error_code & PFEC_reserved_bit )
1044 return 0;
1046 required_flags = _PAGE_PRESENT;
1047 if ( regs->error_code & PFEC_write_access )
1048 required_flags |= _PAGE_RW;
1049 if ( regs->error_code & PFEC_user_mode )
1050 required_flags |= _PAGE_USER;
1052 disallowed_flags = 0;
1053 if ( regs->error_code & PFEC_insn_fetch )
1054 disallowed_flags |= _PAGE_NX;
1056 mfn = cr3 >> PAGE_SHIFT;
1058 #if CONFIG_PAGING_LEVELS >= 4
1059 l4t = map_domain_page(mfn);
1060 l4e = l4e_read_atomic(&l4t[l4_table_offset(addr)]);
1061 mfn = l4e_get_pfn(l4e);
1062 unmap_domain_page(l4t);
1063 if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
1064 (l4e_get_flags(l4e) & disallowed_flags) )
1065 return 0;
1066 #endif
1068 #if CONFIG_PAGING_LEVELS >= 3
1069 l3t = map_domain_page(mfn);
1070 #if CONFIG_PAGING_LEVELS == 3
1071 l3t += (cr3 & 0xFE0UL) >> 3;
1072 #endif
1073 l3e = l3e_read_atomic(&l3t[l3_table_offset(addr)]);
1074 mfn = l3e_get_pfn(l3e);
1075 unmap_domain_page(l3t);
1076 #if CONFIG_PAGING_LEVELS == 3
1077 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
1078 return 0;
1079 #else
1080 if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
1081 (l3e_get_flags(l3e) & disallowed_flags) )
1082 return 0;
1083 #endif
1084 #endif
1086 l2t = map_domain_page(mfn);
1087 l2e = l2e_read_atomic(&l2t[l2_table_offset(addr)]);
1088 mfn = l2e_get_pfn(l2e);
1089 unmap_domain_page(l2t);
1090 if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
1091 (l2e_get_flags(l2e) & disallowed_flags) )
1092 return 0;
1093 if ( l2e_get_flags(l2e) & _PAGE_PSE )
1094 {
1095 l1e = l1e_empty(); /* define before use in debug tracing */
1096 goto spurious;
1097 }
1099 l1t = map_domain_page(mfn);
1100 l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
1101 mfn = l1e_get_pfn(l1e);
1102 unmap_domain_page(l1t);
1103 if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
1104 (l1e_get_flags(l1e) & disallowed_flags) )
1105 return 0;
1107 spurious:
1108 dprintk(XENLOG_WARNING, "Spurious fault in domain %u:%u "
1109 "at addr %lx, e/c %04x\n",
1110 current->domain->domain_id, current->vcpu_id,
1111 addr, regs->error_code);
1112 #if CONFIG_PAGING_LEVELS >= 4
1113 dprintk(XENLOG_WARNING, " l4e = %"PRIpte"\n", l4e_get_intpte(l4e));
1114 #endif
1115 #if CONFIG_PAGING_LEVELS >= 3
1116 dprintk(XENLOG_WARNING, " l3e = %"PRIpte"\n", l3e_get_intpte(l3e));
1117 #endif
1118 dprintk(XENLOG_WARNING, " l2e = %"PRIpte"\n", l2e_get_intpte(l2e));
1119 dprintk(XENLOG_WARNING, " l1e = %"PRIpte"\n", l1e_get_intpte(l1e));
1120 #ifndef NDEBUG
1121 show_registers(regs);
1122 #endif
1123 return 1;
1124 }
1126 static int spurious_page_fault(
1127 unsigned long addr, struct cpu_user_regs *regs)
1128 {
1129 unsigned long flags;
1130 int is_spurious;
1132 /*
1133 * Disabling interrupts prevents TLB flushing, and hence prevents
1134 * page tables from becoming invalid under our feet during the walk.
1135 */
1136 local_irq_save(flags);
1137 is_spurious = __spurious_page_fault(addr, regs);
1138 local_irq_restore(flags);
1140 return is_spurious;
1141 }
1143 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
1144 {
1145 struct vcpu *v = current;
1146 struct domain *d = v->domain;
1148 /* No fixups in interrupt context or when interrupts are disabled. */
1149 if ( in_irq() || !(regs->eflags & X86_EFLAGS_IF) )
1150 return 0;
1152 if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
1153 {
1154 if ( paging_mode_external(d) && guest_mode(regs) )
1155 {
1156 int ret = paging_fault(addr, regs);
1157 if ( ret == EXCRET_fault_fixed )
1158 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
1159 return ret;
1160 }
1161 if ( !(regs->error_code & PFEC_reserved_bit) &&
1162 (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
1163 return handle_gdt_ldt_mapping_fault(
1164 addr - GDT_LDT_VIRT_START, regs);
1165 return 0;
1166 }
1168 if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
1169 guest_kernel_mode(v, regs) &&
1170 /* Do not check for an access-protection fault, since the page may
1171 legitimately not be present in the shadow page tables. */
1172 ((regs->error_code & (PFEC_write_access|PFEC_reserved_bit)) ==
1173 PFEC_write_access) &&
1174 ptwr_do_page_fault(v, addr, regs) )
1175 return EXCRET_fault_fixed;
1177 if ( paging_mode_enabled(d) )
1178 {
1179 int ret = paging_fault(addr, regs);
1180 if ( ret == EXCRET_fault_fixed )
1181 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
1182 return ret;
1183 }
1185 return 0;
1186 }
1188 /*
1189 * #PF error code:
1190 * Bit 0: Protection violation (=1) ; Page not present (=0)
1191 * Bit 1: Write access
1192 * Bit 2: User mode (=1) ; Supervisor mode (=0)
1193 * Bit 3: Reserved bit violation
1194 * Bit 4: Instruction fetch
1195 */
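/* Example: error code 0x0006 = not-present page, write access, user mode
   (a typical guest user-space write to an unmapped address), while 0x0003
   = protection violation on a supervisor-mode write, e.g. a kernel write
   to a read-only page. */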
1196 asmlinkage void do_page_fault(struct cpu_user_regs *regs)
1197 {
1198 unsigned long addr, fixup;
1200 addr = read_cr2();
1202 DEBUGGER_trap_entry(TRAP_page_fault, regs);
1204 perfc_incr(page_faults);
1206 if ( unlikely(fixup_page_fault(addr, regs) != 0) )
1207 return;
1209 if ( unlikely(!guest_mode(regs)) )
1210 {
1211 if ( spurious_page_fault(addr, regs) )
1212 return;
1214 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
1215 {
1216 perfc_incr(copy_user_faults);
1217 if ( unlikely(regs->error_code & PFEC_reserved_bit) )
1218 reserved_bit_page_fault(addr, regs);
1219 regs->eip = fixup;
1220 return;
1221 }
1223 DEBUGGER_trap_fatal(TRAP_page_fault, regs);
1225 show_execution_state(regs);
1226 show_page_walk(addr);
1227 panic("FATAL PAGE FAULT\n"
1228 "[error_code=%04x]\n"
1229 "Faulting linear address: %p\n",
1230 regs->error_code, _p(addr));
1231 }
1233 propagate_page_fault(addr, regs->error_code);
1234 }
1236 /*
1237 * Early #PF handler to print CR2, error code, and stack.
1239 * We also deal with spurious faults here, even though they should never happen
1240 * during early boot (an issue was seen once, but was most likely a hardware
1241 * problem).
1242 */
1243 asmlinkage void do_early_page_fault(struct cpu_user_regs *regs)
1244 {
1245 static int stuck;
1246 static unsigned long prev_eip, prev_cr2;
1247 unsigned long cr2 = read_cr2();
1249 BUG_ON(smp_processor_id() != 0);
1251 if ( (regs->eip != prev_eip) || (cr2 != prev_cr2) )
1252 {
1253 prev_eip = regs->eip;
1254 prev_cr2 = cr2;
1255 stuck = 0;
1256 return;
1257 }
1259 if ( stuck++ == 1000 )
1260 {
1261 unsigned long *stk = (unsigned long *)regs;
1262 printk("Early fatal page fault at %04x:%p (cr2=%p, ec=%04x)\n",
1263 regs->cs, _p(regs->eip), _p(cr2), regs->error_code);
1264 printk("Stack dump: ");
1265 while ( ((long)stk & ((PAGE_SIZE - 1) & ~(BYTES_PER_LONG - 1))) != 0 )
1266 printk("%p ", _p(*stk++));
1267 for ( ; ; ) ;
1268 }
1269 }
1271 long do_fpu_taskswitch(int set)
1272 {
1273 struct vcpu *v = current;
1275 if ( set )
1276 {
1277 v->arch.guest_context.ctrlreg[0] |= X86_CR0_TS;
1278 stts();
1279 }
1280 else
1281 {
1282 v->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
1283 if ( v->fpu_dirtied )
1284 clts();
1285 }
1287 return 0;
1288 }
1290 static int read_descriptor(unsigned int sel,
1291 const struct vcpu *v,
1292 const struct cpu_user_regs * regs,
1293 unsigned long *base,
1294 unsigned long *limit,
1295 unsigned int *ar,
1296 unsigned int vm86attr)
1298 struct desc_struct desc;
1300 if ( !vm86_mode(regs) )
1302 if ( sel < 4)
1303 desc.b = desc.a = 0;
1304 else if ( __get_user(desc,
1305 (const struct desc_struct *)(!(sel & 4)
1306 ? GDT_VIRT_START(v)
1307 : LDT_VIRT_START(v))
1308 + (sel >> 3)) )
1309 return 0;
1310 if ( !(vm86attr & _SEGMENT_CODE) )
1311 desc.b &= ~_SEGMENT_L;
1313 else
1315 desc.a = (sel << 20) | 0xffff;
1316 desc.b = vm86attr | (sel >> 12);
1319 *ar = desc.b & 0x00f0ff00;
1320 if ( !(desc.b & _SEGMENT_L) )
1322 *base = ((desc.a >> 16) + ((desc.b & 0xff) << 16) +
1323 (desc.b & 0xff000000));
1324 *limit = (desc.a & 0xffff) | (desc.b & 0x000f0000);
1325 if ( desc.b & _SEGMENT_G )
1326 *limit = ((*limit + 1) << 12) - 1;
1327 #ifndef NDEBUG
1328 if ( !vm86_mode(regs) && (sel > 3) )
1330 unsigned int a, l;
1331 unsigned char valid;
1333 asm volatile (
1334 "larl %2,%0 ; setz %1"
1335 : "=r" (a), "=rm" (valid) : "rm" (sel));
1336 BUG_ON(valid && ((a & 0x00f0ff00) != *ar));
1337 asm volatile (
1338 "lsll %2,%0 ; setz %1"
1339 : "=r" (l), "=rm" (valid) : "rm" (sel));
1340 BUG_ON(valid && (l != *limit));
1342 #endif
1344 else
1346 *base = 0UL;
1347 *limit = ~0UL;
1350 return 1;
1353 #ifdef __x86_64__
1354 static int read_gate_descriptor(unsigned int gate_sel,
1355 const struct vcpu *v,
1356 unsigned int *sel,
1357 unsigned long *off,
1358 unsigned int *ar)
1360 struct desc_struct desc;
1361 const struct desc_struct *pdesc;
1364 pdesc = (const struct desc_struct *)
1365 (!(gate_sel & 4) ? GDT_VIRT_START(v) : LDT_VIRT_START(v))
1366 + (gate_sel >> 3);
1367 if ( (gate_sel < 4) ||
1368 ((gate_sel >= FIRST_RESERVED_GDT_BYTE) && !(gate_sel & 4)) ||
1369 __get_user(desc, pdesc) )
1370 return 0;
1372 *sel = (desc.a >> 16) & 0x0000fffc;
1373 *off = (desc.a & 0x0000ffff) | (desc.b & 0xffff0000);
1374 *ar = desc.b & 0x0000ffff;
1376 /*
1377 * check_descriptor() clears the DPL field and stores the
1378 * guest requested DPL in the selector's RPL field.
1379 */
1380 if ( *ar & _SEGMENT_DPL )
1381 return 0;
1382 *ar |= (desc.a >> (16 - 13)) & _SEGMENT_DPL;
1384 if ( !is_pv_32bit_vcpu(v) )
1386 if ( (*ar & 0x1f00) != 0x0c00 ||
1387 (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) ||
1388 __get_user(desc, pdesc + 1) ||
1389 (desc.b & 0x1f00) )
1390 return 0;
1392 *off |= (unsigned long)desc.a << 32;
1393 return 1;
1396 switch ( *ar & 0x1f00 )
1398 case 0x0400:
1399 *off &= 0xffff;
1400 break;
1401 case 0x0c00:
1402 break;
1403 default:
1404 return 0;
1407 return 1;
1409 #endif
1411 /* Has the guest requested sufficient permission for this I/O access? */
1412 static int guest_io_okay(
1413 unsigned int port, unsigned int bytes,
1414 struct vcpu *v, struct cpu_user_regs *regs)
1416 #if defined(__x86_64__)
1417 /* If in user mode, switch to kernel mode just to read I/O bitmap. */
1418 int user_mode = !(v->arch.flags & TF_kernel_mode);
1419 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
1420 #elif defined(__i386__)
1421 #define TOGGLE_MODE() ((void)0)
1422 #endif
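/* When the IOPL check below does not grant the access, the guest's I/O
   bitmap is consulted just like the hardware TSS bitmap: the two bytes
   starting at offset port>>3 are copied from guest memory and the access is
   allowed only if all of bits port .. port+bytes-1 are clear.  E.g. a
   two-byte access to port 0x71 tests bits 1-2 of bitmap byte 0x0e. */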
1424 if ( !vm86_mode(regs) &&
1425 (v->arch.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
1426 return 1;
1428 if ( v->arch.iobmp_limit > (port + bytes) )
1430 union { uint8_t bytes[2]; uint16_t mask; } x;
1432 /*
1433 * Grab permission bytes from guest space. Inaccessible bytes are
1434 * read as 0xff (no access allowed).
1435 */
1436 TOGGLE_MODE();
1437 switch ( __copy_from_guest_offset(x.bytes, v->arch.iobmp,
1438 port>>3, 2) )
1440 default: x.bytes[0] = ~0;
1441 case 1: x.bytes[1] = ~0;
1442 case 0: break;
1444 TOGGLE_MODE();
1446 if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 )
1447 return 1;
1450 return 0;
1453 /* Has the administrator granted sufficient permission for this I/O access? */
1454 static int admin_io_okay(
1455 unsigned int port, unsigned int bytes,
1456 struct vcpu *v, struct cpu_user_regs *regs)
1458 /*
1459 * Port 0xcf8 (CONFIG_ADDRESS) is only visible for DWORD accesses.
1460 * We never permit direct access to that register.
1461 */
1462 if ( (port == 0xcf8) && (bytes == 4) )
1463 return 0;
1465 return ioports_access_permitted(v->domain, port, port + bytes - 1);
1468 static uint32_t guest_io_read(
1469 unsigned int port, unsigned int bytes,
1470 struct vcpu *v, struct cpu_user_regs *regs)
1472 extern uint32_t pci_conf_read(
1473 uint32_t cf8, uint8_t offset, uint8_t bytes);
1475 uint32_t data = 0;
1476 unsigned int shift = 0;
1478 if ( admin_io_okay(port, bytes, v, regs) )
1480 switch ( bytes )
1482 case 1: return inb(port);
1483 case 2: return inw(port);
1484 case 4: return inl(port);
1488 while ( bytes != 0 )
1490 unsigned int size = 1;
1491 uint32_t sub_data = 0xff;
1493 if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
1495 sub_data = pv_pit_handler(port, 0, 0);
1497 else if ( (port == 0xcf8) && (bytes == 4) )
1499 size = 4;
1500 sub_data = v->domain->arch.pci_cf8;
1502 else if ( ((port & 0xfffc) == 0xcfc) && IS_PRIV(v->domain) )
1504 size = min(bytes, 4 - (port & 3));
1505 if ( size == 3 )
1506 size = 2;
1507 sub_data = pci_conf_read(v->domain->arch.pci_cf8, port & 3, size);
1510 if ( size == 4 )
1511 return sub_data;
1513 data |= (sub_data & ((1u << (size * 8)) - 1)) << shift;
1514 shift += size * 8;
1515 port += size;
1516 bytes -= size;
1519 return data;
1522 static void guest_io_write(
1523 unsigned int port, unsigned int bytes, uint32_t data,
1524 struct vcpu *v, struct cpu_user_regs *regs)
1526 extern void pci_conf_write(
1527 uint32_t cf8, uint8_t offset, uint8_t bytes, uint32_t data);
1529 if ( admin_io_okay(port, bytes, v, regs) )
1531 switch ( bytes ) {
1532 case 1:
1533 outb((uint8_t)data, port);
1534 if ( pv_post_outb_hook )
1535 pv_post_outb_hook(port, (uint8_t)data);
1536 break;
1537 case 2:
1538 outw((uint16_t)data, port);
1539 break;
1540 case 4:
1541 outl(data, port);
1542 break;
1544 return;
1547 while ( bytes != 0 )
1549 unsigned int size = 1;
1551 if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
1553 pv_pit_handler(port, (uint8_t)data, 1);
1555 else if ( (port == 0xcf8) && (bytes == 4) )
1557 size = 4;
1558 v->domain->arch.pci_cf8 = data;
1560 else if ( ((port & 0xfffc) == 0xcfc) && IS_PRIV(v->domain) )
1562 size = min(bytes, 4 - (port & 3));
1563 if ( size == 3 )
1564 size = 2;
1565 pci_conf_write(v->domain->arch.pci_cf8, port & 3, size, data);
1568 if ( size == 4 )
1569 return;
1571 port += size;
1572 bytes -= size;
1573 data >>= size * 8;
1577 /* I/O emulation support. Helper routines for, and type of, the stack stub.*/
1578 void host_to_guest_gpr_switch(struct cpu_user_regs *)
1579 __attribute__((__regparm__(1)));
1580 unsigned long guest_to_host_gpr_switch(unsigned long)
1581 __attribute__((__regparm__(1)));
1583 void (*pv_post_outb_hook)(unsigned int port, u8 value);
1585 /* Instruction fetch with error handling. */
1586 #define insn_fetch(type, base, eip, limit) \
1587 ({ unsigned long _rc, _ptr = (base) + (eip); \
1588 type _x; \
1589 if ( ad_default < 8 ) \
1590 _ptr = (unsigned int)_ptr; \
1591 if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) ) \
1592 goto fail; \
1593 if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 ) \
1594 { \
1595 propagate_page_fault(_ptr + sizeof(_x) - _rc, 0); \
1596 goto skip; \
1597 } \
1598 (eip) += sizeof(_x); _x; })
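/* insn_fetch() pulls the next item of the given type from the guest code
   segment at base+eip: the address is truncated to 32 bits when the default
   address size is not 64-bit, checked against the segment limit (branching
   to the local 'fail' label, i.e. emulation is abandoned), and if the copy
   faults the #PF is bounced to the guest via propagate_page_fault() and the
   code bails out through the 'skip' label. */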
1600 #if defined(CONFIG_X86_32)
1601 # define read_sreg(regs, sr) ((regs)->sr)
1602 #elif defined(CONFIG_X86_64)
1603 # define read_sreg(regs, sr) read_segment_register(sr)
1604 #endif
1606 static int emulate_privileged_op(struct cpu_user_regs *regs)
1608 struct vcpu *v = current;
1609 unsigned long *reg, eip = regs->eip, res;
1610 u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
1611 enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
1612 unsigned int port, i, data_sel, ar, data, rc, bpmatch = 0;
1613 unsigned int op_bytes, op_default, ad_bytes, ad_default;
1614 #define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
1615 ? regs->reg \
1616 : ad_bytes == 4 \
1617 ? (u32)regs->reg \
1618 : (u16)regs->reg)
1619 #define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
1620 ? regs->reg = (val) \
1621 : ad_bytes == 4 \
1622 ? (*(u32 *)&regs->reg = (val)) \
1623 : (*(u16 *)&regs->reg = (val)))
1624 unsigned long code_base, code_limit;
1625 char io_emul_stub[32];
1626 void (*io_emul)(struct cpu_user_regs *) __attribute__((__regparm__(1)));
1627 u32 l, h, eax, edx;
1629 if ( !read_descriptor(regs->cs, v, regs,
1630 &code_base, &code_limit, &ar,
1631 _SEGMENT_CODE|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1632 goto fail;
1633 op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
1634 ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
1635 if ( !(ar & _SEGMENT_S) ||
1636 !(ar & _SEGMENT_P) ||
1637 !(ar & _SEGMENT_CODE) )
1638 goto fail;
1640 /* emulating only opcodes not allowing SS to be default */
1641 data_sel = read_sreg(regs, ds);
1643 /* Legacy prefixes. */
1644 for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
1646 switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
1648 case 0x66: /* operand-size override */
1649 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
1650 continue;
1651 case 0x67: /* address-size override */
1652 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
1653 continue;
1654 case 0x2e: /* CS override */
1655 data_sel = regs->cs;
1656 continue;
1657 case 0x3e: /* DS override */
1658 data_sel = read_sreg(regs, ds);
1659 continue;
1660 case 0x26: /* ES override */
1661 data_sel = read_sreg(regs, es);
1662 continue;
1663 case 0x64: /* FS override */
1664 data_sel = read_sreg(regs, fs);
1665 lm_ovr = lm_seg_fs;
1666 continue;
1667 case 0x65: /* GS override */
1668 data_sel = read_sreg(regs, gs);
1669 lm_ovr = lm_seg_gs;
1670 continue;
1671 case 0x36: /* SS override */
1672 data_sel = regs->ss;
1673 continue;
1674 case 0xf0: /* LOCK */
1675 lock = 1;
1676 continue;
1677 case 0xf2: /* REPNE/REPNZ */
1678 case 0xf3: /* REP/REPE/REPZ */
1679 rep_prefix = 1;
1680 continue;
1681 default:
1682 if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
1684 rex = opcode;
1685 continue;
1687 break;
1689 break;
1692 /* REX prefix. */
1693 if ( rex & 8 ) /* REX.W */
1694 op_bytes = 4; /* emulate only opcodes not supporting 64-bit operands */
1695 modrm_reg = (rex & 4) << 1; /* REX.R */
1696 /* REX.X does not need to be decoded. */
1697 modrm_rm = (rex & 1) << 3; /* REX.B */
1699 if ( opcode == 0x0f )
1700 goto twobyte_opcode;
1702 if ( lock )
1703 goto fail;
1705 /* Input/Output String instructions. */
1706 if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
1708 unsigned long data_base, data_limit;
1710 if ( rep_prefix && (rd_ad(ecx) == 0) )
1711 goto done;
1713 if ( !(opcode & 2) )
1715 data_sel = read_sreg(regs, es);
1716 lm_ovr = lm_seg_none;
1719 if ( !(ar & _SEGMENT_L) )
1721 if ( !read_descriptor(data_sel, v, regs,
1722 &data_base, &data_limit, &ar,
1723 _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|
1724 _SEGMENT_P) )
1725 goto fail;
1726 if ( !(ar & _SEGMENT_S) ||
1727 !(ar & _SEGMENT_P) ||
1728 (opcode & 2 ?
1729 (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
1730 (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
1731 goto fail;
1733 #ifdef CONFIG_X86_64
1734 else
1736 if ( lm_ovr == lm_seg_none || data_sel < 4 )
1738 switch ( lm_ovr )
1740 case lm_seg_none:
1741 data_base = 0UL;
1742 break;
1743 case lm_seg_fs:
1744 data_base = v->arch.guest_context.fs_base;
1745 break;
1746 case lm_seg_gs:
1747 if ( guest_kernel_mode(v, regs) )
1748 data_base = v->arch.guest_context.gs_base_kernel;
1749 else
1750 data_base = v->arch.guest_context.gs_base_user;
1751 break;
1754 else
1755 read_descriptor(data_sel, v, regs,
1756 &data_base, &data_limit, &ar,
1757 0);
1758 data_limit = ~0UL;
1759 ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
1761 #endif
1763 port = (u16)regs->edx;
1765 continue_io_string:
1766 switch ( opcode )
1768 case 0x6c: /* INSB */
1769 op_bytes = 1;
1770 case 0x6d: /* INSW/INSL */
1771 if ( (data_limit < (op_bytes - 1)) ||
1772 (rd_ad(edi) > (data_limit - (op_bytes - 1))) ||
1773 !guest_io_okay(port, op_bytes, v, regs) )
1774 goto fail;
1775 data = guest_io_read(port, op_bytes, v, regs);
1776 if ( (rc = copy_to_user((void *)data_base + rd_ad(edi),
1777 &data, op_bytes)) != 0 )
1779 propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc,
1780 PFEC_write_access);
1781 return EXCRET_fault_fixed;
1783 wr_ad(edi, regs->edi + (int)((regs->eflags & EF_DF)
1784 ? -op_bytes : op_bytes));
1785 break;
1787 case 0x6e: /* OUTSB */
1788 op_bytes = 1;
1789 case 0x6f: /* OUTSW/OUTSL */
1790 if ( (data_limit < (op_bytes - 1)) ||
1791 (rd_ad(esi) > (data_limit - (op_bytes - 1))) ||
1792 !guest_io_okay(port, op_bytes, v, regs) )
1793 goto fail;
1794 if ( (rc = copy_from_user(&data, (void *)data_base + rd_ad(esi),
1795 op_bytes)) != 0 )
1797 propagate_page_fault(data_base + rd_ad(esi)
1798 + op_bytes - rc, 0);
1799 return EXCRET_fault_fixed;
1801 guest_io_write(port, op_bytes, data, v, regs);
1802 wr_ad(esi, regs->esi + (int)((regs->eflags & EF_DF)
1803 ? -op_bytes : op_bytes));
1804 break;
1807 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1809 if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
1811 if ( !bpmatch && !hypercall_preempt_check() )
1812 goto continue_io_string;
1813 eip = regs->eip;
1816 goto done;
1819 /*
1820 * Very likely to be an I/O instruction (IN/OUT).
1821 * Build an on-stack stub to execute the instruction with full guest
1822 * GPR context. This is needed for some systems which (ab)use IN/OUT
1823 * to communicate with BIOS code in system-management mode.
1824 */
1825 #ifdef __x86_64__
1826 /* movq $host_to_guest_gpr_switch,%rcx */
1827 io_emul_stub[0] = 0x48;
1828 io_emul_stub[1] = 0xb9;
1829 *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
1830 /* callq *%rcx */
1831 io_emul_stub[10] = 0xff;
1832 io_emul_stub[11] = 0xd1;
1833 #else
1834 /* call host_to_guest_gpr_switch */
1835 io_emul_stub[0] = 0xe8;
1836 *(s32 *)&io_emul_stub[1] =
1837 (char *)host_to_guest_gpr_switch - &io_emul_stub[5];
1838 /* 7 x nop */
1839 memset(&io_emul_stub[5], 0x90, 7);
1840 #endif
1841 /* data16 or nop */
1842 io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66;
1843 /* <io-access opcode> */
1844 io_emul_stub[13] = opcode;
1845 /* imm8 or nop */
1846 io_emul_stub[14] = 0x90;
1847 /* ret (jumps to guest_to_host_gpr_switch) */
1848 io_emul_stub[15] = 0xc3;
1850 /* Handy function-typed pointer to the stub. */
1851 io_emul = (void *)io_emul_stub;
1853 if ( ioemul_handle_quirk )
1854 ioemul_handle_quirk(opcode, &io_emul_stub[12], regs);
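/* Example stub: for "in $0x71,%al" (opcode 0xe4, op_bytes == 1) the tail of
   io_emul_stub[] is 90 e4 71 c3, i.e. "nop; in $0x71,%al; ret", and the
   leading bytes load host_to_guest_gpr_switch into %rcx and call it on
   x86-64 (a direct call on x86-32), so the real instruction executes with
   the guest's GPR values before returning via guest_to_host_gpr_switch. */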
1856 /* I/O Port and Interrupt Flag instructions. */
1857 switch ( opcode )
1859 case 0xe4: /* IN imm8,%al */
1860 op_bytes = 1;
1861 case 0xe5: /* IN imm8,%eax */
1862 port = insn_fetch(u8, code_base, eip, code_limit);
1863 io_emul_stub[14] = port; /* imm8 */
1864 exec_in:
1865 if ( !guest_io_okay(port, op_bytes, v, regs) )
1866 goto fail;
1867 if ( admin_io_okay(port, op_bytes, v, regs) )
1869 io_emul(regs);
1871 else
1873 if ( op_bytes == 4 )
1874 regs->eax = 0;
1875 else
1876 regs->eax &= ~((1u << (op_bytes * 8)) - 1);
1877 regs->eax |= guest_io_read(port, op_bytes, v, regs);
1879 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1880 goto done;
1882 case 0xec: /* IN %dx,%al */
1883 op_bytes = 1;
1884 case 0xed: /* IN %dx,%eax */
1885 port = (u16)regs->edx;
1886 goto exec_in;
1888 case 0xe6: /* OUT %al,imm8 */
1889 op_bytes = 1;
1890 case 0xe7: /* OUT %eax,imm8 */
1891 port = insn_fetch(u8, code_base, eip, code_limit);
1892 io_emul_stub[14] = port; /* imm8 */
1893 exec_out:
1894 if ( !guest_io_okay(port, op_bytes, v, regs) )
1895 goto fail;
1896 if ( admin_io_okay(port, op_bytes, v, regs) )
1898 io_emul(regs);
1899 if ( (op_bytes == 1) && pv_post_outb_hook )
1900 pv_post_outb_hook(port, regs->eax);
1902 else
1904 guest_io_write(port, op_bytes, regs->eax, v, regs);
1906 bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
1907 goto done;
1909 case 0xee: /* OUT %al,%dx */
1910 op_bytes = 1;
1911 case 0xef: /* OUT %eax,%dx */
1912 port = (u16)regs->edx;
1913 goto exec_out;
1915 case 0xfa: /* CLI */
1916 case 0xfb: /* STI */
1917 if ( v->arch.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
1918 goto fail;
1919 /*
1920 * This is just too dangerous to allow, in my opinion. Consider if the
1921 * caller then tries to reenable interrupts using POPF: we can't trap
1922 * that and we'll end up with hard-to-debug lockups. Fast & loose will
1923 * do for us. :-)
1924 */
1925 /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
1926 goto done;
1929 /* No decode of this single-byte opcode. */
1930 goto fail;
1932 twobyte_opcode:
1933 /* Two-byte opcodes only emulated from guest kernel. */
1934 if ( !guest_kernel_mode(v, regs) )
1935 goto fail;
1937 /* Privileged (ring 0) instructions. */
1938 opcode = insn_fetch(u8, code_base, eip, code_limit);
1939 if ( lock && (opcode & ~3) != 0x20 )
1940 goto fail;
1941 switch ( opcode )
1943 case 0x06: /* CLTS */
1944 (void)do_fpu_taskswitch(0);
1945 break;
1947 case 0x09: /* WBINVD */
1948 /* Ignore the instruction if unprivileged. */
1949 if ( !cache_flush_permitted(v->domain) )
1950 /* Non-physdev domain attempted WBINVD; ignore for now since
1951 newer linux uses this in some start-of-day timing loops */
1952 ;
1953 else
1954 wbinvd();
1955 break;
1957 case 0x20: /* MOV CR?,<reg> */
1958 opcode = insn_fetch(u8, code_base, eip, code_limit);
1959 if ( opcode < 0xc0 )
1960 goto fail;
1961 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1962 modrm_rm |= (opcode >> 0) & 7;
1963 reg = decode_register(modrm_rm, regs, 0);
1964 switch ( modrm_reg )
1966 case 0: /* Read CR0 */
1967 *reg = (read_cr0() & ~X86_CR0_TS) |
1968 v->arch.guest_context.ctrlreg[0];
1969 break;
1971 case 2: /* Read CR2 */
1972 *reg = v->arch.guest_context.ctrlreg[2];
1973 break;
1975 case 3: /* Read CR3 */
1976 if ( !is_pv_32on64_vcpu(v) )
1977 *reg = xen_pfn_to_cr3(mfn_to_gmfn(
1978 v->domain, pagetable_get_pfn(v->arch.guest_table)));
1979 #ifdef CONFIG_COMPAT
1980 else
1981 *reg = compat_pfn_to_cr3(mfn_to_gmfn(
1982 v->domain, l4e_get_pfn(*(l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)))));
1983 #endif
1984 break;
1986 case 4: /* Read CR4 */
1987 /*
1988 * Guests can read CR4 to see what features Xen has enabled. We
1989 * therefore lie about PGE & PSE as they are unavailable to guests.
1990 */
1991 *reg = read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE);
1992 break;
1994 default:
1995 goto fail;
1997 break;
1999 case 0x21: /* MOV DR?,<reg> */
2000 opcode = insn_fetch(u8, code_base, eip, code_limit);
2001 if ( opcode < 0xc0 )
2002 goto fail;
2003 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2004 modrm_rm |= (opcode >> 0) & 7;
2005 reg = decode_register(modrm_rm, regs, 0);
2006 if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
2007 goto fail;
2008 *reg = res;
2009 break;
2011 case 0x22: /* MOV <reg>,CR? */
2012 opcode = insn_fetch(u8, code_base, eip, code_limit);
2013 if ( opcode < 0xc0 )
2014 goto fail;
2015 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2016 modrm_rm |= (opcode >> 0) & 7;
2017 reg = decode_register(modrm_rm, regs, 0);
2018 switch ( modrm_reg )
2020 case 0: /* Write CR0 */
2021 if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
2023 gdprintk(XENLOG_WARNING,
2024 "Attempt to change unmodifiable CR0 flags.\n");
2025 goto fail;
2027 (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
2028 break;
2030 case 2: /* Write CR2 */
2031 v->arch.guest_context.ctrlreg[2] = *reg;
2032 arch_set_cr2(v, *reg);
2033 break;
2035 case 3: /* Write CR3 */
2036 domain_lock(v->domain);
2037 if ( !is_pv_32on64_vcpu(v) )
2038 rc = new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg)));
2039 #ifdef CONFIG_COMPAT
2040 else
2041 rc = new_guest_cr3(gmfn_to_mfn(v->domain, compat_cr3_to_pfn(*reg)));
2042 #endif
2043 domain_unlock(v->domain);
2044 if ( rc == 0 ) /* not okay */
2045 goto fail;
2046 break;
2048 case 4: /* Write CR4 */
2049 v->arch.guest_context.ctrlreg[4] = pv_guest_cr4_fixup(*reg);
2050 write_cr4(pv_guest_cr4_to_real_cr4(
2051 v->arch.guest_context.ctrlreg[4]));
2052 break;
2054 default:
2055 goto fail;
2057 break;
2059 case 0x23: /* MOV <reg>,DR? */
2060 opcode = insn_fetch(u8, code_base, eip, code_limit);
2061 if ( opcode < 0xc0 )
2062 goto fail;
2063 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
2064 modrm_rm |= (opcode >> 0) & 7;
2065 reg = decode_register(modrm_rm, regs, 0);
2066 if ( do_set_debugreg(modrm_reg, *reg) != 0 )
2067 goto fail;
2068 break;
2070 case 0x30: /* WRMSR */
2071 eax = regs->eax;
2072 edx = regs->edx;
2073 res = ((u64)edx << 32) | eax;
2074 switch ( (u32)regs->ecx )
2076 #ifdef CONFIG_X86_64
2077 case MSR_FS_BASE:
2078 if ( is_pv_32on64_vcpu(v) )
2079 goto fail;
2080 if ( wrmsr_safe(MSR_FS_BASE, eax, edx) )
2081 goto fail;
2082 v->arch.guest_context.fs_base = res;
2083 break;
2084 case MSR_GS_BASE:
2085 if ( is_pv_32on64_vcpu(v) )
2086 goto fail;
2087 if ( wrmsr_safe(MSR_GS_BASE, eax, edx) )
2088 goto fail;
2089 v->arch.guest_context.gs_base_kernel = res;
2090 break;
2091 case MSR_SHADOW_GS_BASE:
2092 if ( is_pv_32on64_vcpu(v) )
2093 goto fail;
2094 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, eax, edx) )
2095 goto fail;
2096 v->arch.guest_context.gs_base_user = res;
2097 break;
2098 #endif
2099 case MSR_K7_FID_VID_STATUS:
2100 case MSR_K7_FID_VID_CTL:
2101 case MSR_K8_PSTATE_LIMIT:
2102 case MSR_K8_PSTATE_CTRL:
2103 case MSR_K8_PSTATE_STATUS:
2104 case MSR_K8_PSTATE0:
2105 case MSR_K8_PSTATE1:
2106 case MSR_K8_PSTATE2:
2107 case MSR_K8_PSTATE3:
2108 case MSR_K8_PSTATE4:
2109 case MSR_K8_PSTATE5:
2110 case MSR_K8_PSTATE6:
2111 case MSR_K8_PSTATE7:
2112 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
2113 goto fail;
2114 if ( cpufreq_controller != FREQCTL_dom0_kernel )
2115 break;
2116 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
2117 goto fail;
2118 break;
2119 case MSR_AMD64_NB_CFG:
2120 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
2121 boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x11 )
2122 goto fail;
2123 if ( !IS_PRIV(v->domain) )
2124 break;
2125 if ( (rdmsr_safe(MSR_AMD64_NB_CFG, l, h) != 0) ||
2126 (eax != l) ||
2127 ((edx ^ h) & ~(1 << (AMD64_NB_CFG_CF8_EXT_ENABLE_BIT - 32))) )
2128 goto invalid;
2129 if ( wrmsr_safe(MSR_AMD64_NB_CFG, eax, edx) != 0 )
2130 goto fail;
2131 break;
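/*
 * Editor's note, not part of the original file: the NB_CFG handling above
 * lets a privileged domain toggle only the CF8 extended-configuration
 * enable bit.  Writes from unprivileged domains are silently dropped, and
 * a write that differs from the MSR's current contents in any other bit
 * is logged through the "invalid" path below and discarded.
 */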
2132 case MSR_FAM10H_MMIO_CONF_BASE:
2133 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
2134 boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x11 )
2135 goto fail;
2136 if ( !IS_PRIV(v->domain) )
2137 break;
2138 if ( (rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, l, h) != 0) ||
2139 (((((u64)h << 32) | l) ^ res) &
2140 ~((1 << FAM10H_MMIO_CONF_ENABLE_BIT) |
2141 (FAM10H_MMIO_CONF_BUSRANGE_MASK <<
2142 FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
2143 ((u64)FAM10H_MMIO_CONF_BASE_MASK <<
2144 FAM10H_MMIO_CONF_BASE_SHIFT))) )
2145 goto invalid;
2146 if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, eax, edx) != 0 )
2147 goto fail;
2148 break;
2149 case MSR_IA32_PERF_CTL:
2150 if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
2151 goto fail;
2152 if ( cpufreq_controller != FREQCTL_dom0_kernel )
2153 break;
2154 if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
2155 goto fail;
2156 break;
2157 default:
2158 if ( wrmsr_hypervisor_regs(regs->ecx, eax, edx) )
2159 break;
2160 if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
2161 (eax != l) || (edx != h) )
2162 invalid:
2163 gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from "
2164 "%08x:%08x to %08x:%08x.\n",
2165 _p(regs->ecx), h, l, edx, eax);
2166 break;
2168 break;
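/*
 * Editor's note, not part of the original file: the default WRMSR path
 * above never injects a fault.  Xen-defined MSRs are handled by
 * wrmsr_hypervisor_regs(); any other write is simply discarded, with a
 * warning printed only if the guest actually tried to change the MSR's
 * current contents.
 */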
2170 case 0x31: /* RDTSC */
2171 rdtsc(regs->eax, regs->edx);
2172 break;
2174 case 0x32: /* RDMSR */
2175 switch ( (u32)regs->ecx )
2177 #ifdef CONFIG_X86_64
2178 case MSR_FS_BASE:
2179 if ( is_pv_32on64_vcpu(v) )
2180 goto fail;
2181 regs->eax = v->arch.guest_context.fs_base & 0xFFFFFFFFUL;
2182 regs->edx = v->arch.guest_context.fs_base >> 32;
2183 break;
2184 case MSR_GS_BASE:
2185 if ( is_pv_32on64_vcpu(v) )
2186 goto fail;
2187 regs->eax = v->arch.guest_context.gs_base_kernel & 0xFFFFFFFFUL;
2188 regs->edx = v->arch.guest_context.gs_base_kernel >> 32;
2189 break;
2190 case MSR_SHADOW_GS_BASE:
2191 if ( is_pv_32on64_vcpu(v) )
2192 goto fail;
2193 regs->eax = v->arch.guest_context.gs_base_user & 0xFFFFFFFFUL;
2194 regs->edx = v->arch.guest_context.gs_base_user >> 32;
2195 break;
2196 #endif
2197 case MSR_K7_FID_VID_CTL:
2198 case MSR_K7_FID_VID_STATUS:
2199 case MSR_K8_PSTATE_LIMIT:
2200 case MSR_K8_PSTATE_CTRL:
2201 case MSR_K8_PSTATE_STATUS:
2202 case MSR_K8_PSTATE0:
2203 case MSR_K8_PSTATE1:
2204 case MSR_K8_PSTATE2:
2205 case MSR_K8_PSTATE3:
2206 case MSR_K8_PSTATE4:
2207 case MSR_K8_PSTATE5:
2208 case MSR_K8_PSTATE6:
2209 case MSR_K8_PSTATE7:
2210 if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
2211 goto fail;
2212 if ( cpufreq_controller != FREQCTL_dom0_kernel )
2213 {
2214 regs->eax = regs->edx = 0;
2215 break;
2216 }
2217 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) != 0 )
2218 goto fail;
2219 break;
2220 case MSR_EFER:
2221 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
2222 goto fail;
2223 break;
2224 case MSR_IA32_MISC_ENABLE:
2225 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
2226 goto fail;
2227 regs->eax &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL |
2228 MSR_IA32_MISC_ENABLE_MONITOR_ENABLE);
2229 regs->eax |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
2230 MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
2231 MSR_IA32_MISC_ENABLE_XTPR_DISABLE;
2232 break;
2233 default:
2234 if ( rdmsr_hypervisor_regs(regs->ecx, &l, &h) )
2235 {
2236 regs->eax = l;
2237 regs->edx = h;
2238 break;
2239 }
2240 /* Everyone can read the MSR space. */
2241 /* gdprintk(XENLOG_WARNING,"Domain attempted RDMSR %p.\n",
2242 _p(regs->ecx));*/
2243 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
2244 goto fail;
2245 break;
2247 break;
2249 default:
2250 goto fail;
2253 #undef wr_ad
2254 #undef rd_ad
2256 done:
2257 instruction_done(regs, eip, bpmatch);
2258 skip:
2259 return EXCRET_fault_fixed;
2261 fail:
2262 return 0;
2265 static inline int check_stack_limit(unsigned int ar, unsigned int limit,
2266 unsigned int esp, unsigned int decr)
2268 return (((esp - decr) < (esp - 1)) &&
2269 (!(ar & _SEGMENT_EC) ? (esp - 1) <= limit : (esp - decr) > limit));
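/*
 * Editor's note -- illustrative sketch, not part of the original file.
 * check_stack_limit() above decides whether "decr" bytes may be pushed at
 * "esp" inside the segment described by ar/limit: the first clause rejects
 * pushes that would wrap below zero, and the second keeps the access within
 * the limit, which for an expand-down (_SEGMENT_EC) segment means staying
 * strictly above it.  An equivalent standalone restatement:
 */
static inline int example_stack_push_ok(int expand_down, unsigned int limit,
                                        unsigned int esp, unsigned int decr)
{
    if ( !((esp - decr) < (esp - 1)) )          /* push would wrap around zero */
        return 0;
    return expand_down ? ((esp - decr) > limit) /* valid offsets lie above limit */
                       : ((esp - 1) <= limit);  /* valid offsets lie at/below limit */
}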
2272 static void emulate_gate_op(struct cpu_user_regs *regs)
2274 #ifdef __x86_64__
2275 struct vcpu *v = current;
2276 unsigned int sel, ar, dpl, nparm, opnd_sel;
2277 unsigned int op_default, op_bytes, ad_default, ad_bytes;
2278 unsigned long off, eip, opnd_off, base, limit;
2279 int jump;
2281 /* Check whether this fault is due to the use of a call gate. */
2282 if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) ||
2283 (((ar >> 13) & 3) < (regs->cs & 3)) ||
2284 ((ar & _SEGMENT_TYPE) != 0xc00) )
2286 do_guest_trap(TRAP_gp_fault, regs, 1);
2287 return;
2289 if ( !(ar & _SEGMENT_P) )
2291 do_guest_trap(TRAP_no_segment, regs, 1);
2292 return;
2294 dpl = (ar >> 13) & 3;
2295 nparm = ar & 0x1f;
2297 /*
2298 * Decode instruction (and perhaps operand) to determine RPL,
2299 * whether this is a jump or a call, and the call return offset.
2300 */
2301 if ( !read_descriptor(regs->cs, v, regs, &base, &limit, &ar, 0) ||
2302 !(ar & _SEGMENT_S) ||
2303 !(ar & _SEGMENT_P) ||
2304 !(ar & _SEGMENT_CODE) )
2306 do_guest_trap(TRAP_gp_fault, regs, 1);
2307 return;
2310 op_bytes = op_default = ar & _SEGMENT_DB ? 4 : 2;
2311 ad_default = ad_bytes = op_default;
2312 opnd_sel = opnd_off = 0;
2313 jump = -1;
2314 for ( eip = regs->eip; eip - regs->_eip < 10; )
2316 switch ( insn_fetch(u8, base, eip, limit) )
2318 case 0x66: /* operand-size override */
2319 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
2320 continue;
2321 case 0x67: /* address-size override */
2322 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
2323 continue;
2324 case 0x2e: /* CS override */
2325 opnd_sel = regs->cs;
2326 ASSERT(opnd_sel);
2327 continue;
2328 case 0x3e: /* DS override */
2329 opnd_sel = read_sreg(regs, ds);
2330 if ( !opnd_sel )
2331 opnd_sel = dpl;
2332 continue;
2333 case 0x26: /* ES override */
2334 opnd_sel = read_sreg(regs, es);
2335 if ( !opnd_sel )
2336 opnd_sel = dpl;
2337 continue;
2338 case 0x64: /* FS override */
2339 opnd_sel = read_sreg(regs, fs);
2340 if ( !opnd_sel )
2341 opnd_sel = dpl;
2342 continue;
2343 case 0x65: /* GS override */
2344 opnd_sel = read_sreg(regs, gs);
2345 if ( !opnd_sel )
2346 opnd_sel = dpl;
2347 continue;
2348 case 0x36: /* SS override */
2349 opnd_sel = regs->ss;
2350 if ( !opnd_sel )
2351 opnd_sel = dpl;
2352 continue;
2353 case 0xea:
2354 ++jump;
2355 /* FALLTHROUGH */
2356 case 0x9a:
2357 ++jump;
2358 opnd_sel = regs->cs;
2359 opnd_off = eip;
2360 ad_bytes = ad_default;
2361 eip += op_bytes + 2;
2362 break;
2363 case 0xff:
2365 unsigned int modrm;
2367 switch ( (modrm = insn_fetch(u8, base, eip, limit)) & 0xf8 )
2369 case 0x28: case 0x68: case 0xa8:
2370 ++jump;
2371 /* FALLTHROUGH */
2372 case 0x18: case 0x58: case 0x98:
2373 ++jump;
2374 if ( ad_bytes != 2 )
2376 if ( (modrm & 7) == 4 )
2378 unsigned int sib;
2379 sib = insn_fetch(u8, base, eip, limit);
2381 modrm = (modrm & ~7) | (sib & 7);
2382 if ( (sib >>= 3) != 4 )
2383 opnd_off = *(unsigned long *)
2384 decode_register(sib & 7, regs, 0);
2385 opnd_off <<= sib >> 3;
2387 if ( (modrm & 7) != 5 || (modrm & 0xc0) )
2388 opnd_off += *(unsigned long *)
2389 decode_register(modrm & 7, regs, 0);
2390 else
2391 modrm |= 0x87;
2392 if ( !opnd_sel )
2394 switch ( modrm & 7 )
2396 default:
2397 opnd_sel = read_sreg(regs, ds);
2398 break;
2399 case 4: case 5:
2400 opnd_sel = regs->ss;
2401 break;
2405 else
2407 switch ( modrm & 7 )
2409 case 0: case 1: case 7:
2410 opnd_off = regs->ebx;
2411 break;
2412 case 6:
2413 if ( !(modrm & 0xc0) )
2414 modrm |= 0x80;
2415 else
2416 case 2: case 3:
2418 opnd_off = regs->ebp;
2419 if ( !opnd_sel )
2420 opnd_sel = regs->ss;
2422 break;
2424 if ( !opnd_sel )
2425 opnd_sel = read_sreg(regs, ds);
2426 switch ( modrm & 7 )
2428 case 0: case 2: case 4:
2429 opnd_off += regs->esi;
2430 break;
2431 case 1: case 3: case 5:
2432 opnd_off += regs->edi;
2433 break;
2436 switch ( modrm & 0xc0 )
2438 case 0x40:
2439 opnd_off += insn_fetch(s8, base, eip, limit);
2440 break;
2441 case 0x80:
2442 opnd_off += insn_fetch(s32, base, eip, limit);
2443 break;
2445 if ( ad_bytes == 4 )
2446 opnd_off = (unsigned int)opnd_off;
2447 else if ( ad_bytes == 2 )
2448 opnd_off = (unsigned short)opnd_off;
2449 break;
2452 break;
2454 break;
2457 if ( jump < 0 )
2459 fail:
2460 do_guest_trap(TRAP_gp_fault, regs, 1);
2461 skip:
2462 return;
2465 if ( (opnd_sel != regs->cs &&
2466 !read_descriptor(opnd_sel, v, regs, &base, &limit, &ar, 0)) ||
2467 !(ar & _SEGMENT_S) ||
2468 !(ar & _SEGMENT_P) ||
2469 ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
2471 do_guest_trap(TRAP_gp_fault, regs, 1);
2472 return;
2475 opnd_off += op_bytes;
2476 #define ad_default ad_bytes
2477 opnd_sel = insn_fetch(u16, base, opnd_off, limit);
2478 #undef ad_default
2479 ASSERT((opnd_sel & ~3) == regs->error_code);
2480 if ( dpl < (opnd_sel & 3) )
2482 do_guest_trap(TRAP_gp_fault, regs, 1);
2483 return;
2486 if ( !read_descriptor(sel, v, regs, &base, &limit, &ar, 0) ||
2487 !(ar & _SEGMENT_S) ||
2488 !(ar & _SEGMENT_CODE) ||
2489 (!jump || (ar & _SEGMENT_EC) ?
2490 ((ar >> 13) & 3) > (regs->cs & 3) :
2491 ((ar >> 13) & 3) != (regs->cs & 3)) )
2493 regs->error_code = sel;
2494 do_guest_trap(TRAP_gp_fault, regs, 1);
2495 return;
2497 if ( !(ar & _SEGMENT_P) )
2499 regs->error_code = sel;
2500 do_guest_trap(TRAP_no_segment, regs, 1);
2501 return;
2503 if ( off > limit )
2505 regs->error_code = 0;
2506 do_guest_trap(TRAP_gp_fault, regs, 1);
2507 return;
2510 if ( !jump )
2512 unsigned int ss, esp, *stkp;
2513 int rc;
2514 #define push(item) do \
2515 { \
2516 --stkp; \
2517 esp -= 4; \
2518 rc = __put_user(item, stkp); \
2519 if ( rc ) \
2520 { \
2521 propagate_page_fault((unsigned long)(stkp + 1) - rc, \
2522 PFEC_write_access); \
2523 return; \
2524 } \
2525 } while ( 0 )
2527 if ( ((ar >> 13) & 3) < (regs->cs & 3) )
2529 sel |= (ar >> 13) & 3;
2530 /* Inner stack known only for kernel ring. */
2531 if ( (sel & 3) != GUEST_KERNEL_RPL(v->domain) )
2533 do_guest_trap(TRAP_gp_fault, regs, 1);
2534 return;
2536 esp = v->arch.guest_context.kernel_sp;
2537 ss = v->arch.guest_context.kernel_ss;
2538 if ( (ss & 3) != (sel & 3) ||
2539 !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2540 ((ar >> 13) & 3) != (sel & 3) ||
2541 !(ar & _SEGMENT_S) ||
2542 (ar & _SEGMENT_CODE) ||
2543 !(ar & _SEGMENT_WR) )
2545 regs->error_code = ss & ~3;
2546 do_guest_trap(TRAP_invalid_tss, regs, 1);
2547 return;
2549 if ( !(ar & _SEGMENT_P) ||
2550 !check_stack_limit(ar, limit, esp, (4 + nparm) * 4) )
2552 regs->error_code = ss & ~3;
2553 do_guest_trap(TRAP_stack_error, regs, 1);
2554 return;
2556 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2557 if ( !compat_access_ok(stkp - 4 - nparm, (4 + nparm) * 4) )
2559 do_guest_trap(TRAP_gp_fault, regs, 1);
2560 return;
2562 push(regs->ss);
2563 push(regs->esp);
2564 if ( nparm )
2566 const unsigned int *ustkp;
2568 if ( !read_descriptor(regs->ss, v, regs, &base, &limit, &ar, 0) ||
2569 ((ar >> 13) & 3) != (regs->cs & 3) ||
2570 !(ar & _SEGMENT_S) ||
2571 (ar & _SEGMENT_CODE) ||
2572 !(ar & _SEGMENT_WR) ||
2573 !check_stack_limit(ar, limit, esp + nparm * 4, nparm * 4) )
2574 return do_guest_trap(TRAP_gp_fault, regs, 1);
2575 ustkp = (unsigned int *)(unsigned long)((unsigned int)base + regs->_esp + nparm * 4);
2576 if ( !compat_access_ok(ustkp - nparm, nparm * 4) )
2578 do_guest_trap(TRAP_gp_fault, regs, 1);
2579 return;
2581 do
2583 unsigned int parm;
2585 --ustkp;
2586 rc = __get_user(parm, ustkp);
2587 if ( rc )
2589 propagate_page_fault((unsigned long)(ustkp + 1) - rc, 0);
2590 return;
2592 push(parm);
2593 } while ( --nparm );
2596 else
2598 sel |= (regs->cs & 3);
2599 esp = regs->esp;
2600 ss = regs->ss;
2601 if ( !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
2602 ((ar >> 13) & 3) != (sel & 3) )
2604 do_guest_trap(TRAP_gp_fault, regs, 1);
2605 return;
2607 if ( !check_stack_limit(ar, limit, esp, 2 * 4) )
2609 regs->error_code = 0;
2610 do_guest_trap(TRAP_stack_error, regs, 1);
2611 return;
2613 stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
2614 if ( !compat_access_ok(stkp - 2, 2 * 4) )
2616 do_guest_trap(TRAP_gp_fault, regs, 1);
2617 return;
2620 push(regs->cs);
2621 push(eip);
2622 #undef push
2623 regs->esp = esp;
2624 regs->ss = ss;
2626 else
2627 sel |= (regs->cs & 3);
2629 regs->cs = sel;
2630 instruction_done(regs, off, 0);
2631 #endif
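/*
 * Editor's note -- illustrative sketch, not part of the original file.
 * How emulate_gate_op() above interprets the access-rights word returned
 * by read_gate_descriptor(): the descriptor type sits in bits 8-11 (0xc
 * being a 32-bit call gate), the DPL in bits 13-14, and the gate's
 * parameter count is left in the low five bits.  The raw masks below
 * restate those checks without relying on the _SEGMENT_* macros.
 */
static inline int example_is_32bit_call_gate(unsigned int ar)
{
    return (ar & 0x0f00) == 0x0c00;
}

static inline unsigned int example_gate_dpl(unsigned int ar)
{
    return (ar >> 13) & 3;
}

static inline unsigned int example_gate_param_count(unsigned int ar)
{
    return ar & 0x1f;
}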
2634 asmlinkage void do_general_protection(struct cpu_user_regs *regs)
2636 struct vcpu *v = current;
2637 unsigned long fixup;
2639 DEBUGGER_trap_entry(TRAP_gp_fault, regs);
2641 if ( regs->error_code & 1 )
2642 goto hardware_gp;
2644 if ( !guest_mode(regs) )
2645 goto gp_in_kernel;
2647 /*
2648 * Cunning trick to allow arbitrary "INT n" handling.
2650 * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
2651 * instruction from trapping to the appropriate vector, when that might not
2652 * be expected by Xen or the guest OS. For example, that entry might be for
2653 * a fault handler (unlike traps, faults don't increment EIP), or might
2654 * expect an error code on the stack (which a software trap never
2655 * provides), or might be a hardware interrupt handler that doesn't like
2656 * being called spuriously.
2658 * Instead, a GPF occurs with the faulting IDT vector in the error code.
2659 * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
2660 * clear to indicate that it's a software fault, not hardware.
2662 * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
2663 * okay because they can only be triggered by an explicit DPL-checked
2664 * instruction. The DPL specified by the guest OS for these vectors is NOT
2665 * CHECKED!!
2666 */
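/*
 * Editor's note, not part of the original file: as a worked example of the
 * encoding described above, a guest executing "int $0x80" (bytes cd 80,
 * hence the two-byte EIP adjustment below) against a DPL-0 IDT entry takes
 * a #GP with error code (0x80 << 3) | 2 = 0x402: bit 1 set marks an IDT
 * descriptor, bit 0 clear marks a software-generated event, and the
 * faulting vector is recovered as error_code >> 3.
 */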
2667 if ( (regs->error_code & 3) == 2 )
2669 /* This fault must be due to <INT n> instruction. */
2670 const struct trap_info *ti;
2671 unsigned char vector = regs->error_code >> 3;
2672 ti = &v->arch.guest_context.trap_ctxt[vector];
2673 if ( permit_softint(TI_GET_DPL(ti), v, regs) )
2675 regs->eip += 2;
2676 do_guest_trap(vector, regs, 0);
2677 return;
2680 else if ( is_pv_32on64_vcpu(v) && regs->error_code )
2682 emulate_gate_op(regs);
2683 return;
2686 /* Emulate some simple privileged and I/O instructions. */
2687 if ( (regs->error_code == 0) &&
2688 emulate_privileged_op(regs) )
2690 trace_trap_one_addr(TRC_PV_EMULATE_PRIVOP, regs->eip);
2691 return;
2694 #if defined(__i386__)
2695 if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
2696 (regs->error_code == 0) &&
2697 gpf_emulate_4gb(regs) )
2699 TRACE_1D(TRC_PV_EMULATE_4GB, regs->eip);
2700 return;
2702 #endif
2704 /* Pass on GPF as is. */
2705 do_guest_trap(TRAP_gp_fault, regs, 1);
2706 return;
2708 gp_in_kernel:
2710 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
2712 dprintk(XENLOG_INFO, "GPF (%04x): %p -> %p\n",
2713 regs->error_code, _p(regs->eip), _p(fixup));
2714 regs->eip = fixup;
2715 return;
2718 DEBUGGER_trap_fatal(TRAP_gp_fault, regs);
2720 hardware_gp:
2721 show_execution_state(regs);
2722 panic("GENERAL PROTECTION FAULT\n[error_code=%04x]\n", regs->error_code);
2725 static DEFINE_PER_CPU(struct softirq_trap, softirq_trap);
2727 static void nmi_mce_softirq(void)
2729 int cpu = smp_processor_id();
2730 struct softirq_trap *st = &per_cpu(softirq_trap, cpu);
2731 cpumask_t affinity;
2733 BUG_ON(st == NULL);
2734 BUG_ON(st->vcpu == NULL);
2736 /* Set the tmp value unconditionally, so that
2737 * the check in the iret hypercall works. */
2738 st->vcpu->cpu_affinity_tmp = st->vcpu->cpu_affinity;
2740 if ((cpu != st->processor)
2741 || (st->processor != st->vcpu->processor))
2743 /* We are on a different physical cpu.
2744 * Make sure to wakeup the vcpu on the
2745 * specified processor.
2746 */
2747 cpus_clear(affinity);
2748 cpu_set(st->processor, affinity);
2749 vcpu_set_affinity(st->vcpu, &affinity);
2751 /* Affinity is restored in the iret hypercall. */
2754 /* Only used to defer wakeup of domain/vcpu to
2755 * a safe (non-NMI/MCE) context.
2756 */
2757 vcpu_kick(st->vcpu);
2760 static void nmi_dom0_report(unsigned int reason_idx)
2762 struct domain *d = dom0;
2764 if ( (d == NULL) || (d->vcpu[0] == NULL) )
2765 return;
2767 set_bit(reason_idx, nmi_reason(d));
2769 send_guest_trap(d, 0, TRAP_nmi);
2772 asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
2774 switch ( opt_nmi[0] )
2776 case 'd': /* 'dom0' */
2777 nmi_dom0_report(_XEN_NMIREASON_parity_error);
2778 case 'i': /* 'ignore' */
2779 break;
2780 default: /* 'fatal' */
2781 console_force_unlock();
2782 printk("\n\nNMI - MEMORY ERROR\n");
2783 fatal_trap(TRAP_nmi, regs);
2786 outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable parity check */
2787 mdelay(1);
2788 outb((inb(0x61) & 0x0b) | 0x00, 0x61); /* enable parity check */
2791 asmlinkage void io_check_error(struct cpu_user_regs *regs)
2793 switch ( opt_nmi[0] )
2795 case 'd': /* 'dom0' */
2796 nmi_dom0_report(_XEN_NMIREASON_io_error);
2797 case 'i': /* 'ignore' */
2798 break;
2799 default: /* 'fatal' */
2800 console_force_unlock();
2801 printk("\n\nNMI - I/O ERROR\n");
2802 fatal_trap(TRAP_nmi, regs);
2805 outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
2806 mdelay(1);
2807 outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
2810 static void unknown_nmi_error(unsigned char reason)
2812 switch ( opt_nmi[0] )
2814 case 'd': /* 'dom0' */
2815 nmi_dom0_report(_XEN_NMIREASON_unknown);
2816 case 'i': /* 'ignore' */
2817 break;
2818 default: /* 'fatal' */
2819 printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
2820 printk("Dazed and confused, but trying to continue\n");
2821 printk("Do you have a strange power saving mode enabled?\n");
2822 kexec_crash();
2826 static int dummy_nmi_callback(struct cpu_user_regs *regs, int cpu)
2828 return 0;
2831 static nmi_callback_t nmi_callback = dummy_nmi_callback;
2833 asmlinkage void do_nmi(struct cpu_user_regs *regs)
2835 unsigned int cpu = smp_processor_id();
2836 unsigned char reason;
2838 ++nmi_count(cpu);
2840 if ( nmi_callback(regs, cpu) )
2841 return;
2843 if ( nmi_watchdog )
2844 nmi_watchdog_tick(regs);
2846 /* Only the BSP gets external NMIs from the system. */
2847 if ( cpu == 0 )
2849 reason = inb(0x61);
2850 if ( reason & 0x80 )
2851 mem_parity_error(regs);
2852 else if ( reason & 0x40 )
2853 io_check_error(regs);
2854 else if ( !nmi_watchdog )
2855 unknown_nmi_error((unsigned char)(reason&0xff));
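/*
 * Editor's note -- illustrative sketch, not part of the original file.
 * The dispatch above follows the traditional PC meaning of the NMI status
 * bits read from port 0x61; a standalone restatement of that mapping:
 */
static inline const char *example_nmi_reason_name(unsigned char reason)
{
    if ( reason & 0x80 )
        return "memory/system error (parity or SERR)";
    if ( reason & 0x40 )
        return "I/O check (IOCHK) error";
    return "unknown";
}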
2859 void set_nmi_callback(nmi_callback_t callback)
2861 nmi_callback = callback;
2864 void unset_nmi_callback(void)
2866 nmi_callback = dummy_nmi_callback;
2869 asmlinkage void do_device_not_available(struct cpu_user_regs *regs)
2871 struct vcpu *curr = current;
2873 BUG_ON(!guest_mode(regs));
2875 setup_fpu(curr);
2877 if ( curr->arch.guest_context.ctrlreg[0] & X86_CR0_TS )
2879 do_guest_trap(TRAP_no_device, regs, 0);
2880 curr->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
2882 else
2883 TRACE_0D(TRC_PV_MATH_STATE_RESTORE);
2885 return;
2888 asmlinkage void do_debug(struct cpu_user_regs *regs)
2890 struct vcpu *v = current;
2892 DEBUGGER_trap_entry(TRAP_debug, regs);
2894 if ( !guest_mode(regs) )
2896 if ( regs->eflags & EF_TF )
2898 #ifdef __x86_64__
2899 void sysenter_entry(void);
2900 void sysenter_eflags_saved(void);
2901 /* In SYSENTER entry path we can't zap TF until EFLAGS is saved. */
2902 if ( (regs->rip >= (unsigned long)sysenter_entry) &&
2903 (regs->rip < (unsigned long)sysenter_eflags_saved) )
2904 goto out;
2905 WARN_ON(regs->rip != (unsigned long)sysenter_eflags_saved);
2906 #else
2907 WARN_ON(1);
2908 #endif
2909 regs->eflags &= ~EF_TF;
2911 else
2913 /*
2914 * We ignore watchpoints when they trigger within Xen. This may
2915 * happen when a buffer is passed to us which previously had a
2916 * watchpoint set on it. No need to bump EIP; the only faulting
2917 * trap is an instruction breakpoint, which can't happen to us.
2918 */
2919 WARN_ON(!search_exception_table(regs->eip));
2921 goto out;
2924 /* Save debug status register where guest OS can peek at it */
2925 v->arch.guest_context.debugreg[6] = read_debugreg(6);
2927 ler_enable();
2928 do_guest_trap(TRAP_debug, regs, 0);
2929 return;
2931 out:
2932 ler_enable();
2933 return;
2936 asmlinkage void do_spurious_interrupt_bug(struct cpu_user_regs *regs)
2940 static void __set_intr_gate(unsigned int n, uint32_t dpl, void *addr)
2942 int i;
2943 /* Keep secondary tables in sync with IRQ updates. */
2944 for ( i = 1; i < NR_CPUS; i++ )
2945 if ( idt_tables[i] != NULL )
2946 _set_gate(&idt_tables[i][n], 14, dpl, addr);
2947 _set_gate(&idt_table[n], 14, dpl, addr);
2950 static void set_swint_gate(unsigned int n, void *addr)
2952 __set_intr_gate(n, 3, addr);
2955 void set_intr_gate(unsigned int n, void *addr)
2957 __set_intr_gate(n, 0, addr);
2960 void set_tss_desc(unsigned int n, void *addr)
2962 _set_tssldt_desc(
2963 gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
2964 (unsigned long)addr,
2965 offsetof(struct tss_struct, __cacheline_filler) - 1,
2966 9);
2967 #ifdef CONFIG_COMPAT
2968 _set_tssldt_desc(
2969 compat_gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
2970 (unsigned long)addr,
2971 offsetof(struct tss_struct, __cacheline_filler) - 1,
2972 11);
2973 #endif
2976 void __devinit percpu_traps_init(void)
2978 subarch_percpu_traps_init();
2980 if ( !opt_ler )
2981 return;
2983 switch ( boot_cpu_data.x86_vendor )
2985 case X86_VENDOR_INTEL:
2986 switch ( boot_cpu_data.x86 )
2988 case 6:
2989 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
2990 break;
2991 case 15:
2992 this_cpu(ler_msr) = MSR_P4_LER_FROM_LIP;
2993 break;
2995 break;
2996 case X86_VENDOR_AMD:
2997 switch ( boot_cpu_data.x86 )
2999 case 6:
3000 case 15:
3001 case 16:
3002 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
3003 break;
3005 break;
3008 ler_enable();
3011 void __init trap_init(void)
3013 /*
3014 * Note that interrupt gates are always used, rather than trap gates. We
3015 * must have interrupts disabled until DS/ES/FS/GS are saved because the
3016 * first activation must have the "bad" value(s) for these registers and
3017 * we may lose them if another activation is installed before they are
3018 * saved. The page-fault handler also needs interrupts disabled until %cr2
3019 * has been read and saved on the stack.
3020 */
3021 set_intr_gate(TRAP_divide_error,&divide_error);
3022 set_intr_gate(TRAP_debug,&debug);
3023 set_intr_gate(TRAP_nmi,&nmi);
3024 set_swint_gate(TRAP_int3,&int3); /* usable from all privileges */
3025 set_swint_gate(TRAP_overflow,&overflow); /* usable from all privileges */
3026 set_intr_gate(TRAP_bounds,&bounds);
3027 set_intr_gate(TRAP_invalid_op,&invalid_op);
3028 set_intr_gate(TRAP_no_device,&device_not_available);
3029 set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
3030 set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
3031 set_intr_gate(TRAP_no_segment,&segment_not_present);
3032 set_intr_gate(TRAP_stack_error,&stack_segment);
3033 set_intr_gate(TRAP_gp_fault,&general_protection);
3034 set_intr_gate(TRAP_page_fault,&page_fault);
3035 set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
3036 set_intr_gate(TRAP_copro_error,&coprocessor_error);
3037 set_intr_gate(TRAP_alignment_check,&alignment_check);
3038 set_intr_gate(TRAP_machine_check,&machine_check);
3039 set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
3041 /* CPU0 uses the master IDT. */
3042 idt_tables[0] = idt_table;
3044 percpu_traps_init();
3046 cpu_init();
3048 open_softirq(NMI_MCE_SOFTIRQ, nmi_mce_softirq);
3051 long register_guest_nmi_callback(unsigned long address)
3053 struct vcpu *v = current;
3054 struct domain *d = v->domain;
3055 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
3057 t->vector = TRAP_nmi;
3058 t->flags = 0;
3059 t->cs = !IS_COMPAT(d) ? FLAT_KERNEL_CS : FLAT_COMPAT_KERNEL_CS;
3060 t->address = address;
3061 TI_SET_IF(t, 1);
3063 /*
3064 * If no handler was registered we can 'lose the NMI edge'. Re-assert it
3065 * now.
3066 */
3067 if ( (v->vcpu_id == 0) && (arch_get_nmi_reason(d) != 0) )
3068 v->nmi_pending = 1;
3070 return 0;
3073 long unregister_guest_nmi_callback(void)
3075 struct vcpu *v = current;
3076 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
3078 memset(t, 0, sizeof(*t));
3080 return 0;
3083 int guest_has_trap_callback(struct domain *d, uint16_t vcpuid, unsigned int trap_nr)
3085 struct vcpu *v;
3086 struct trap_info *t;
3088 BUG_ON(d == NULL);
3089 BUG_ON(vcpuid >= MAX_VIRT_CPUS);
3091 /* Sanity check - XXX should be more fine grained. */
3092 BUG_ON(trap_nr > TRAP_syscall);
3094 v = d->vcpu[vcpuid];
3095 t = &v->arch.guest_context.trap_ctxt[trap_nr];
3097 return (t->address != 0);
3101 int send_guest_trap(struct domain *d, uint16_t vcpuid, unsigned int trap_nr)
3103 struct vcpu *v;
3104 struct softirq_trap *st;
3106 BUG_ON(d == NULL);
3107 BUG_ON(vcpuid >= MAX_VIRT_CPUS);
3108 v = d->vcpu[vcpuid];
3110 switch (trap_nr) {
3111 case TRAP_nmi:
3112 if ( !test_and_set_bool(v->nmi_pending) ) {
3113 st = &per_cpu(softirq_trap, smp_processor_id());
3114 st->domain = dom0;
3115 st->vcpu = dom0->vcpu[0];
3116 st->processor = st->vcpu->processor;
3118 /* not safe to wake up a vcpu here */
3119 raise_softirq(NMI_MCE_SOFTIRQ);
3120 return 0;
3122 break;
3124 case TRAP_machine_check:
3126 /* We are called by the machine check (exception or polling) handlers
3127 * on the physical CPU that reported a machine check error. */
3129 if ( !test_and_set_bool(v->mce_pending) ) {
3130 st = &per_cpu(softirq_trap, smp_processor_id());
3131 st->domain = d;
3132 st->vcpu = v;
3133 st->processor = v->processor;
3135 /* not safe to wake up a vcpu here */
3136 raise_softirq(NMI_MCE_SOFTIRQ);
3137 return 0;
3139 break;
3142 /* delivery failed */
3143 return -EIO;
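/*
 * Editor's note, not part of the original file: in the TRAP_nmi branch the
 * softirq_trap slot is pointed at dom0's vcpu0 irrespective of the d and
 * vcpuid arguments, whereas the machine-check branch targets the requested
 * vcpu; -EIO is returned when the trap number is unsupported or the event
 * was already pending on the target vcpu.
 */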
3147 long do_set_trap_table(XEN_GUEST_HANDLE(const_trap_info_t) traps)
3149 struct trap_info cur;
3150 struct vcpu *curr = current;
3151 struct trap_info *dst = curr->arch.guest_context.trap_ctxt;
3152 long rc = 0;
3154 /* If no table is presented then clear the entire virtual IDT. */
3155 if ( guest_handle_is_null(traps) )
3157 memset(dst, 0, 256 * sizeof(*dst));
3158 init_int80_direct_trap(curr);
3159 return 0;
3162 for ( ; ; )
3164 if ( hypercall_preempt_check() )
3166 rc = hypercall_create_continuation(
3167 __HYPERVISOR_set_trap_table, "h", traps);
3168 break;
3171 if ( copy_from_guest(&cur, traps, 1) )
3173 rc = -EFAULT;
3174 break;
3177 if ( cur.address == 0 )
3178 break;
3180 fixup_guest_code_selector(curr->domain, cur.cs);
3182 memcpy(&dst[cur.vector], &cur, sizeof(cur));
3184 if ( cur.vector == 0x80 )
3185 init_int80_direct_trap(curr);
3187 guest_handle_add_offset(traps, 1);
3190 return rc;
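/*
 * Editor's note -- illustrative sketch, not part of the original file.
 * The loop above consumes an array of trap_info entries terminated by one
 * with a zero address, preempting via a hypercall continuation when
 * needed.  The structure below mirrors the public trap_info layout
 * (vector, flags, cs, address); the selector and handler address are
 * hypothetical placeholders, not values taken from any real guest.
 */
struct example_trap_info {
    unsigned char  vector;   /* exception/interrupt vector */
    unsigned char  flags;    /* low bits: DPL permitted to use "int $n" */
    unsigned short cs;       /* code selector of the handler */
    unsigned long  address;  /* handler entry point */
};

static const struct example_trap_info example_virtual_idt[] = {
    { 0x80, 3, 0x10 /* hypothetical CS */, 0xdeadbeefUL /* hypothetical */ },
    { 0, 0, 0, 0 }           /* address == 0 terminates the table */
};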
3193 long set_debugreg(struct vcpu *v, int reg, unsigned long value)
3195 int i;
3196 struct vcpu *curr = current;
3198 switch ( reg )
3200 case 0:
3201 if ( !access_ok(value, sizeof(long)) )
3202 return -EPERM;
3203 if ( v == curr )
3204 write_debugreg(0, value);
3205 break;
3206 case 1:
3207 if ( !access_ok(value, sizeof(long)) )
3208 return -EPERM;
3209 if ( v == curr )
3210 write_debugreg(1, value);
3211 break;
3212 case 2:
3213 if ( !access_ok(value, sizeof(long)) )
3214 return -EPERM;
3215 if ( v == curr )
3216 write_debugreg(2, value);
3217 break;
3218 case 3:
3219 if ( !access_ok(value, sizeof(long)) )
3220 return -EPERM;
3221 if ( v == curr )
3222 write_debugreg(3, value);
3223 break;
3224 case 6:
3225 /*
3226 * DR6: Bits 4-11,16-31 reserved (set to 1).
3227 * Bit 12 reserved (set to 0).
3228 */
3229 value &= 0xffffefff; /* reserved bits => 0 */
3230 value |= 0xffff0ff0; /* reserved bits => 1 */
3231 if ( v == curr )
3232 write_debugreg(6, value);
3233 break;
3234 case 7:
3235 /*
3236 * DR7: Bit 10 reserved (set to 1).
3237 * Bits 11-12,14-15 reserved (set to 0).
3238 */
3239 value &= ~DR_CONTROL_RESERVED_ZERO; /* reserved bits => 0 */
3240 value |= DR_CONTROL_RESERVED_ONE; /* reserved bits => 1 */
3241 /*
3242 * Privileged bits:
3243 * GD (bit 13): must be 0.
3244 */
3245 if ( value & DR_GENERAL_DETECT )
3246 return -EPERM;
3247 /* DR7.{G,L}E = 0 => debugging disabled for this domain. */
3248 if ( value & DR7_ACTIVE_MASK )
3250 unsigned int io_enable = 0;
3252 for ( i = DR_CONTROL_SHIFT; i < 32; i += DR_CONTROL_SIZE )
3254 if ( ((value >> i) & 3) == DR_IO )
3256 if ( !(v->arch.guest_context.ctrlreg[4] & X86_CR4_DE) )
3257 return -EPERM;
3258 io_enable |= value & (3 << ((i - 16) >> 1));
3260 #ifdef __i386__
3261 if ( ((boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) ||
3262 !boot_cpu_has(X86_FEATURE_LM)) &&
3263 (((value >> i) & 0xc) == DR_LEN_8) )
3264 return -EPERM;
3265 #endif
3268 /* Guest DR5 is a handy stash for I/O intercept information. */
3269 v->arch.guest_context.debugreg[5] = io_enable;
3270 value &= ~io_enable;
3272 /*
3273 * If DR7 was previously clear then we need to load all other
3274 * debug registers at this point as they were not restored during
3275 * context switch.
3276 */
3277 if ( (v == curr) &&
3278 !(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
3280 write_debugreg(0, v->arch.guest_context.debugreg[0]);
3281 write_debugreg(1, v->arch.guest_context.debugreg[1]);
3282 write_debugreg(2, v->arch.guest_context.debugreg[2]);
3283 write_debugreg(3, v->arch.guest_context.debugreg[3]);
3284 write_debugreg(6, v->arch.guest_context.debugreg[6]);
3287 if ( v == curr )
3288 write_debugreg(7, value);
3289 break;
3290 default:
3291 return -EINVAL;
3294 v->arch.guest_context.debugreg[reg] = value;
3295 return 0;
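/*
 * Editor's note -- illustrative sketch, not part of the original file.
 * The DR6/DR7 cases above canonicalise the reserved bits exactly as their
 * comments describe; restated as standalone helpers (raw masks, not the
 * DR_CONTROL_* macros):
 */
static inline unsigned long example_canonical_dr6(unsigned long v)
{
    v &= 0xffffefffUL;   /* bit 12 reads as zero */
    v |= 0xffff0ff0UL;   /* bits 4-11 and 16-31 read as one */
    return v;            /* e.g. example_canonical_dr6(0) == 0xffff0ff0 */
}

static inline unsigned long example_canonical_dr7(unsigned long v)
{
    v &= ~0xd800UL;      /* bits 11-12 and 14-15 forced to zero */
    v |= 0x0400UL;       /* bit 10 forced to one */
    return v;
}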
3298 long do_set_debugreg(int reg, unsigned long value)
3300 return set_debugreg(current, reg, value);
3303 unsigned long do_get_debugreg(int reg)
3305 struct vcpu *curr = current;
3307 switch ( reg )
3309 case 0 ... 3:
3310 case 6:
3311 return curr->arch.guest_context.debugreg[reg];
3312 case 7:
3313 return (curr->arch.guest_context.debugreg[7] |
3314 curr->arch.guest_context.debugreg[5]);
3315 case 4 ... 5:
3316 return ((curr->arch.guest_context.ctrlreg[4] & X86_CR4_DE) ?
3317 curr->arch.guest_context.debugreg[reg + 2] : 0);
3320 return -EINVAL;
3323 /*
3324 * Local variables:
3325 * mode: C
3326 * c-set-style: "BSD"
3327 * c-basic-offset: 4
3328 * tab-width: 4
3329 * indent-tabs-mode: nil
3330 * End:
3331 */