
view xen/arch/x86/traps.c @ 10445:8d75d4e0af1e

[XEN] Improve double-fault tracing -- print backtrace
on stack overflow.
Signed-off-by: Keir Fraser <keir@xensource.com>
author kfraser@dhcp93.uk.xensource.com
date Mon Jun 19 11:21:40 2006 +0100 (2006-06-19)
parents e23961a8ce7e
children f3561b1ee7a3
1 /******************************************************************************
2 * arch/x86/traps.c
3 *
4 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * Copyright (C) 1991, 1992 Linus Torvalds
23 *
24 * Pentium III FXSR, SSE support
25 * Gareth Hughes <gareth@valinux.com>, May 2000
26 */
28 #include <xen/config.h>
29 #include <xen/init.h>
30 #include <xen/sched.h>
31 #include <xen/lib.h>
32 #include <xen/errno.h>
33 #include <xen/mm.h>
34 #include <xen/console.h>
35 #include <xen/reboot.h>
36 #include <asm/regs.h>
37 #include <xen/delay.h>
38 #include <xen/event.h>
39 #include <xen/spinlock.h>
40 #include <xen/irq.h>
41 #include <xen/perfc.h>
42 #include <xen/softirq.h>
43 #include <xen/domain_page.h>
44 #include <xen/symbols.h>
45 #include <xen/iocap.h>
46 #include <xen/nmi.h>
47 #include <asm/shadow.h>
48 #include <asm/system.h>
49 #include <asm/io.h>
50 #include <asm/atomic.h>
51 #include <asm/desc.h>
52 #include <asm/debugreg.h>
53 #include <asm/smp.h>
54 #include <asm/flushtlb.h>
55 #include <asm/uaccess.h>
56 #include <asm/i387.h>
57 #include <asm/debugger.h>
58 #include <asm/msr.h>
59 #include <asm/x86_emulate.h>
61 /*
62 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
63 * fatal: Xen prints diagnostic message and then hangs.
64 * dom0: The NMI is virtualised to DOM0.
65 * ignore: The NMI error is cleared and ignored.
66 */
67 #ifdef NDEBUG
68 char opt_nmi[10] = "dom0";
69 #else
70 char opt_nmi[10] = "fatal";
71 #endif
72 string_param("nmi", opt_nmi);
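The chosen behaviour is selected from the hypervisor command line at boot via the "nmi=" parameter registered above. A hypothetical GRUB entry choosing the 'ignore' behaviour (paths and module line illustrative only, not taken from this changeset):

kernel /boot/xen.gz nmi=ignore console=vga
module /boot/vmlinuz-xen root=/dev/sda1 ro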
74 /* Master table, used by all CPUs on x86/64, and by CPU0 on x86/32.*/
75 idt_entry_t idt_table[IDT_ENTRIES];
77 #define DECLARE_TRAP_HANDLER(_name) \
78 asmlinkage void _name(void); \
79 asmlinkage int do_ ## _name(struct cpu_user_regs *regs)
81 asmlinkage void nmi(void);
82 DECLARE_TRAP_HANDLER(divide_error);
83 DECLARE_TRAP_HANDLER(debug);
84 DECLARE_TRAP_HANDLER(int3);
85 DECLARE_TRAP_HANDLER(overflow);
86 DECLARE_TRAP_HANDLER(bounds);
87 DECLARE_TRAP_HANDLER(invalid_op);
88 DECLARE_TRAP_HANDLER(device_not_available);
89 DECLARE_TRAP_HANDLER(coprocessor_segment_overrun);
90 DECLARE_TRAP_HANDLER(invalid_TSS);
91 DECLARE_TRAP_HANDLER(segment_not_present);
92 DECLARE_TRAP_HANDLER(stack_segment);
93 DECLARE_TRAP_HANDLER(general_protection);
94 DECLARE_TRAP_HANDLER(page_fault);
95 DECLARE_TRAP_HANDLER(coprocessor_error);
96 DECLARE_TRAP_HANDLER(simd_coprocessor_error);
97 DECLARE_TRAP_HANDLER(alignment_check);
98 DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
99 DECLARE_TRAP_HANDLER(machine_check);
101 long do_set_debugreg(int reg, unsigned long value);
102 unsigned long do_get_debugreg(int reg);
104 static int debug_stack_lines = 20;
105 integer_param("debug_stack_lines", debug_stack_lines);
107 #ifdef CONFIG_X86_32
108 #define stack_words_per_line 8
109 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
110 #else
111 #define stack_words_per_line 4
112 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
113 #endif
115 int is_kernel_text(unsigned long addr)
116 {
117 extern char _stext, _etext;
118 if (addr >= (unsigned long) &_stext &&
119 addr <= (unsigned long) &_etext)
120 return 1;
121 return 0;
123 }
125 unsigned long kernel_text_end(void)
126 {
127 extern char _etext;
128 return (unsigned long) &_etext;
129 }
131 static void show_guest_stack(struct cpu_user_regs *regs)
132 {
133 int i;
134 unsigned long *stack, addr;
136 if ( hvm_guest(current) )
137 return;
139 if ( vm86_mode(regs) )
140 {
141 stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff));
142 printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ",
143 regs->ss, (uint16_t)(regs->esp & 0xffff));
144 }
145 else
146 {
147 stack = (unsigned long *)regs->esp;
148 printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
149 }
151 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
152 {
153 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
154 break;
155 if ( get_user(addr, stack) )
156 {
157 if ( i != 0 )
158 printk("\n ");
159 printk("Fault while accessing guest memory.");
160 i = 1;
161 break;
162 }
163 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
164 printk("\n ");
165 printk(" %p", _p(addr));
166 stack++;
167 }
168 if ( i == 0 )
169 printk("Stack empty.");
170 printk("\n");
171 }
173 #ifdef NDEBUG
175 static void show_trace(struct cpu_user_regs *regs)
176 {
177 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
179 printk("Xen call trace:\n ");
181 printk("[<%p>]", _p(regs->eip));
182 print_symbol(" %s\n ", regs->eip);
184 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
185 {
186 addr = *stack++;
187 if ( is_kernel_text(addr) )
188 {
189 printk("[<%p>]", _p(addr));
190 print_symbol(" %s\n ", addr);
191 }
192 }
194 printk("\n");
195 }
197 #else
199 static void show_trace(struct cpu_user_regs *regs)
200 {
201 unsigned long *frame, next, addr, low, high;
203 printk("Xen call trace:\n ");
205 printk("[<%p>]", _p(regs->eip));
206 print_symbol(" %s\n ", regs->eip);
208 /* Bounds for range of valid frame pointer. */
209 low = (unsigned long)(ESP_BEFORE_EXCEPTION(regs) - 2);
210 high = (low & ~(STACK_SIZE - 1)) +
211 (STACK_SIZE - sizeof(struct cpu_info) - 2*sizeof(unsigned long));
213 /* The initial frame pointer. */
214 next = regs->ebp;
216 for ( ; ; )
217 {
218 /* Valid frame pointer? */
219 if ( (next < low) || (next >= high) )
220 {
221 /*
222 * Exception stack frames have a different layout, denoted by an
223 * inverted frame pointer.
224 */
225 next = ~next;
226 if ( (next < low) || (next >= high) )
227 break;
228 frame = (unsigned long *)next;
229 next = frame[0];
230 addr = frame[(offsetof(struct cpu_user_regs, eip) -
231 offsetof(struct cpu_user_regs, ebp))
232 / BYTES_PER_LONG];
233 }
234 else
235 {
236 /* Ordinary stack frame. */
237 frame = (unsigned long *)next;
238 next = frame[0];
239 addr = frame[1];
240 }
242 printk("[<%p>]", _p(addr));
243 print_symbol(" %s\n ", addr);
245 low = (unsigned long)&frame[2];
246 }
248 printk("\n");
249 }
251 #endif
253 void show_stack(struct cpu_user_regs *regs)
254 {
255 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
256 int i;
258 if ( guest_mode(regs) )
259 return show_guest_stack(regs);
261 printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
263 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
264 {
265 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
266 break;
267 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
268 printk("\n ");
269 addr = *stack++;
270 printk(" %p", _p(addr));
271 }
272 if ( i == 0 )
273 printk("Stack empty.");
274 printk("\n");
276 show_trace(regs);
277 }
279 void show_stack_overflow(unsigned long esp)
280 {
281 #ifdef MEMORY_GUARD
282 unsigned long esp_top = get_stack_bottom() & PAGE_MASK;
283 unsigned long *stack, addr;
285 /* Trigger overflow trace if %esp is within 100 bytes of the guard page. */
286 if ( ((esp - esp_top) > 100) && ((esp_top - esp) > 100) )
287 return;
289 if ( esp < esp_top )
290 esp = esp_top;
292 printk("Xen stack overflow:\n ");
294 stack = (unsigned long *)esp;
295 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
296 {
297 addr = *stack++;
298 if ( is_kernel_text(addr) )
299 {
300 printk("%p: [<%p>]", stack, _p(addr));
301 print_symbol(" %s\n ", addr);
302 }
303 }
305 printk("\n");
306 #endif
307 }
309 /*
310 * This is called for faults at very unexpected times (e.g., when interrupts
311 * are disabled). In such situations we can't do much that is safe. We try to
312 * print out some tracing and then we just spin.
313 */
314 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs)
315 {
316 int cpu = smp_processor_id();
317 unsigned long cr2;
318 static char *trapstr[] = {
319 "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
320 "invalid opcode", "device not available", "double fault",
321 "coprocessor segment", "invalid tss", "segment not found",
322 "stack error", "general protection fault", "page fault",
323 "spurious interrupt", "coprocessor error", "alignment check",
324 "machine check", "simd error"
325 };
327 watchdog_disable();
328 console_start_sync();
330 show_registers(regs);
332 if ( trapnr == TRAP_page_fault )
333 {
334 __asm__ __volatile__ ("mov %%cr2,%0" : "=r" (cr2) : );
335 printk("Faulting linear address: %p\n", _p(cr2));
336 show_page_walk(cr2);
337 }
339 printk("************************************\n");
340 printk("CPU%d FATAL TRAP %d (%s), ERROR_CODE %04x%s.\n",
341 cpu, trapnr, trapstr[trapnr], regs->error_code,
342 (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
343 printk("System shutting down -- need manual reset.\n");
344 printk("************************************\n");
346 (void)debugger_trap_fatal(trapnr, regs);
348 /* Lock up the console to prevent spurious output from other CPUs. */
349 console_force_lock();
351 /* Wait for manual reset. */
352 machine_halt();
353 }
355 static inline int do_trap(int trapnr, char *str,
356 struct cpu_user_regs *regs,
357 int use_error_code)
358 {
359 struct vcpu *v = current;
360 struct trap_bounce *tb = &v->arch.trap_bounce;
361 struct trap_info *ti;
362 unsigned long fixup;
364 DEBUGGER_trap_entry(trapnr, regs);
366 if ( !guest_mode(regs) )
367 goto xen_fault;
369 ti = &current->arch.guest_context.trap_ctxt[trapnr];
370 tb->flags = TBF_EXCEPTION;
371 tb->cs = ti->cs;
372 tb->eip = ti->address;
373 if ( use_error_code )
374 {
375 tb->flags |= TBF_EXCEPTION_ERRCODE;
376 tb->error_code = regs->error_code;
377 }
378 if ( TI_GET_IF(ti) )
379 tb->flags |= TBF_INTERRUPT;
380 return 0;
382 xen_fault:
384 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
385 {
386 DPRINTK("Trap %d: %p -> %p\n", trapnr, _p(regs->eip), _p(fixup));
387 regs->eip = fixup;
388 return 0;
389 }
391 DEBUGGER_trap_fatal(trapnr, regs);
393 show_registers(regs);
394 panic("CPU%d FATAL TRAP: vector = %d (%s)\n"
395 "[error_code=%04x]\n",
396 smp_processor_id(), trapnr, str, regs->error_code);
397 return 0;
398 }
400 #define DO_ERROR_NOCODE(trapnr, str, name) \
401 asmlinkage int do_##name(struct cpu_user_regs *regs) \
402 { \
403 return do_trap(trapnr, str, regs, 0); \
404 }
406 #define DO_ERROR(trapnr, str, name) \
407 asmlinkage int do_##name(struct cpu_user_regs *regs) \
408 { \
409 return do_trap(trapnr, str, regs, 1); \
410 }
412 DO_ERROR_NOCODE( 0, "divide error", divide_error)
413 DO_ERROR_NOCODE( 4, "overflow", overflow)
414 DO_ERROR_NOCODE( 5, "bounds", bounds)
415 DO_ERROR_NOCODE( 9, "coprocessor segment overrun", coprocessor_segment_overrun)
416 DO_ERROR(10, "invalid TSS", invalid_TSS)
417 DO_ERROR(11, "segment not present", segment_not_present)
418 DO_ERROR(12, "stack segment", stack_segment)
419 DO_ERROR_NOCODE(16, "fpu error", coprocessor_error)
420 DO_ERROR(17, "alignment check", alignment_check)
421 DO_ERROR_NOCODE(19, "simd error", simd_coprocessor_error)
423 static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
424 {
425 char signature[5], instr[2];
426 unsigned long a, b, c, d, eip;
428 a = regs->eax;
429 b = regs->ebx;
430 c = regs->ecx;
431 d = regs->edx;
432 eip = regs->eip;
434 /* Check for forced emulation signature: ud2 ; .ascii "xen". */
435 if ( copy_from_user(signature, (char *)eip, sizeof(signature)) ||
436 memcmp(signature, "\xf\xbxen", sizeof(signature)) )
437 return 0;
438 eip += sizeof(signature);
440 /* We only emulate CPUID. */
441 if ( copy_from_user(instr, (char *)eip, sizeof(instr)) ||
442 memcmp(instr, "\xf\xa2", sizeof(instr)) )
443 return 0;
444 eip += sizeof(instr);
446 __asm__ (
447 "cpuid"
448 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
449 : "0" (a), "1" (b), "2" (c), "3" (d) );
451 if ( regs->eax == 1 )
452 {
453 /* Modify Feature Information. */
454 clear_bit(X86_FEATURE_VME, &d);
455 clear_bit(X86_FEATURE_DE, &d);
456 clear_bit(X86_FEATURE_PSE, &d);
457 clear_bit(X86_FEATURE_PGE, &d);
458 if ( !supervisor_mode_kernel )
459 clear_bit(X86_FEATURE_SEP, &d);
460 if ( !IS_PRIV(current->domain) )
461 clear_bit(X86_FEATURE_MTRR, &d);
462 }
464 regs->eax = a;
465 regs->ebx = b;
466 regs->ecx = c;
467 regs->edx = d;
468 regs->eip = eip;
470 return EXCRET_fault_fixed;
471 }
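For reference, the signature tested above ("\xf\xb" is the two-byte UD2 opcode, followed by the ASCII string "xen" and then a real CPUID) is emitted by a paravirtualised guest when it wants Xen to filter the reported feature flags. A guest-side sketch of the idiom follows; the helper name is hypothetical and not part of this file:

/* Hypothetical guest-side helper: a CPUID routed through the forced
 * invalid-opcode path above, so that Xen can mask feature bits. */
static inline void xen_cpuid_sketch(unsigned int *eax, unsigned int *ebx,
                                    unsigned int *ecx, unsigned int *edx)
{
    asm volatile ( "ud2a ; .ascii \"xen\" ; cpuid"
                   : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
                   : "0" (*eax), "2" (*ecx) );
}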
473 asmlinkage int do_invalid_op(struct cpu_user_regs *regs)
474 {
475 struct vcpu *v = current;
476 struct trap_bounce *tb = &v->arch.trap_bounce;
477 struct trap_info *ti;
478 int rc;
480 DEBUGGER_trap_entry(TRAP_invalid_op, regs);
482 if ( unlikely(!guest_mode(regs)) )
483 {
484 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
485 show_registers(regs);
486 panic("CPU%d FATAL TRAP: vector = %d (invalid opcode)\n",
487 smp_processor_id(), TRAP_invalid_op);
488 }
490 if ( (rc = emulate_forced_invalid_op(regs)) != 0 )
491 return rc;
493 ti = &current->arch.guest_context.trap_ctxt[TRAP_invalid_op];
494 tb->flags = TBF_EXCEPTION;
495 tb->cs = ti->cs;
496 tb->eip = ti->address;
497 if ( TI_GET_IF(ti) )
498 tb->flags |= TBF_INTERRUPT;
500 return 0;
501 }
503 asmlinkage int do_int3(struct cpu_user_regs *regs)
504 {
505 struct vcpu *v = current;
506 struct trap_bounce *tb = &v->arch.trap_bounce;
507 struct trap_info *ti;
509 DEBUGGER_trap_entry(TRAP_int3, regs);
511 if ( !guest_mode(regs) )
512 {
513 DEBUGGER_trap_fatal(TRAP_int3, regs);
514 show_registers(regs);
515 panic("CPU%d FATAL TRAP: vector = 3 (Int3)\n", smp_processor_id());
516 }
518 ti = &current->arch.guest_context.trap_ctxt[TRAP_int3];
519 tb->flags = TBF_EXCEPTION;
520 tb->cs = ti->cs;
521 tb->eip = ti->address;
522 if ( TI_GET_IF(ti) )
523 tb->flags |= TBF_INTERRUPT;
525 return 0;
526 }
528 asmlinkage int do_machine_check(struct cpu_user_regs *regs)
529 {
530 fatal_trap(TRAP_machine_check, regs);
531 return 0;
532 }
534 void propagate_page_fault(unsigned long addr, u16 error_code)
535 {
536 struct trap_info *ti;
537 struct vcpu *v = current;
538 struct trap_bounce *tb = &v->arch.trap_bounce;
540 v->arch.guest_context.ctrlreg[2] = addr;
541 v->vcpu_info->arch.cr2 = addr;
543 /* Re-set error_code.user flag appropriately for the guest. */
544 error_code &= ~PGERR_user_mode;
545 if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
546 error_code |= PGERR_user_mode;
548 ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
549 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
550 tb->error_code = error_code;
551 tb->cs = ti->cs;
552 tb->eip = ti->address;
553 if ( TI_GET_IF(ti) )
554 tb->flags |= TBF_INTERRUPT;
555 }
557 static int handle_gdt_ldt_mapping_fault(
558 unsigned long offset, struct cpu_user_regs *regs)
559 {
560 extern int map_ldt_shadow_page(unsigned int);
562 struct vcpu *v = current;
563 struct domain *d = v->domain;
564 int ret;
566 /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
567 unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
568 unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
570 /* Should never fault in another vcpu's area. */
571 BUG_ON(vcpu_area != current->vcpu_id);
573 /* Byte offset within the gdt/ldt sub-area. */
574 offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
576 if ( likely(is_ldt_area) )
577 {
578 /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
579 LOCK_BIGLOCK(d);
580 cleanup_writable_pagetable(d);
581 ret = map_ldt_shadow_page(offset >> PAGE_SHIFT);
582 UNLOCK_BIGLOCK(d);
584 if ( unlikely(ret == 0) )
585 {
586 /* In hypervisor mode? Leave it to the #PF handler to fix up. */
587 if ( !guest_mode(regs) )
588 return 0;
589 /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */
590 propagate_page_fault(
591 v->arch.guest_context.ldt_base + offset, regs->error_code);
592 }
593 }
594 else
595 {
596 /* GDT fault: handle the fault as #GP(selector). */
597 regs->error_code = (u16)offset & ~7;
598 (void)do_general_protection(regs);
599 }
601 return EXCRET_fault_fixed;
602 }
604 #ifdef HYPERVISOR_VIRT_END
605 #define IN_HYPERVISOR_RANGE(va) \
606 (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
607 #else
608 #define IN_HYPERVISOR_RANGE(va) \
609 (((va) >= HYPERVISOR_VIRT_START))
610 #endif
612 static int __spurious_page_fault(
613 unsigned long addr, struct cpu_user_regs *regs)
614 {
615 unsigned long mfn = read_cr3() >> PAGE_SHIFT;
616 #if CONFIG_PAGING_LEVELS >= 4
617 l4_pgentry_t l4e, *l4t;
618 #endif
619 #if CONFIG_PAGING_LEVELS >= 3
620 l3_pgentry_t l3e, *l3t;
621 #endif
622 l2_pgentry_t l2e, *l2t;
623 l1_pgentry_t l1e, *l1t;
624 unsigned int required_flags, disallowed_flags;
626 /* Reserved bit violations are never spurious faults. */
627 if ( regs->error_code & PGERR_reserved_bit )
628 return 0;
630 required_flags = _PAGE_PRESENT;
631 if ( regs->error_code & PGERR_write_access )
632 required_flags |= _PAGE_RW;
633 if ( regs->error_code & PGERR_user_mode )
634 required_flags |= _PAGE_USER;
636 disallowed_flags = 0;
637 if ( regs->error_code & PGERR_instr_fetch )
638 disallowed_flags |= _PAGE_NX;
640 #if CONFIG_PAGING_LEVELS >= 4
641 l4t = map_domain_page(mfn);
642 l4e = l4t[l4_table_offset(addr)];
643 mfn = l4e_get_pfn(l4e);
644 unmap_domain_page(l4t);
645 if ( !(l4e_get_flags(l4e) & required_flags) ||
646 (l4e_get_flags(l4e) & disallowed_flags) )
647 return 0;
648 #endif
650 #if CONFIG_PAGING_LEVELS >= 3
651 l3t = map_domain_page(mfn);
652 l3e = l3t[l3_table_offset(addr)];
653 mfn = l3e_get_pfn(l3e);
654 unmap_domain_page(l3t);
655 #ifdef CONFIG_X86_PAE
656 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
657 return 0;
658 #else
659 if ( !(l3e_get_flags(l3e) & required_flags) ||
660 (l3e_get_flags(l3e) & disallowed_flags) )
661 return 0;
662 #endif
663 #endif
665 l2t = map_domain_page(mfn);
666 l2e = l2t[l2_table_offset(addr)];
667 mfn = l2e_get_pfn(l2e);
668 unmap_domain_page(l2t);
669 if ( !(l2e_get_flags(l2e) & required_flags) ||
670 (l2e_get_flags(l2e) & disallowed_flags) )
671 return 0;
672 if ( l2e_get_flags(l2e) & _PAGE_PSE )
673 return 1;
675 l1t = map_domain_page(mfn);
676 l1e = l1t[l1_table_offset(addr)];
677 mfn = l1e_get_pfn(l1e);
678 unmap_domain_page(l1t);
679 if ( !(l1e_get_flags(l1e) & required_flags) ||
680 (l1e_get_flags(l1e) & disallowed_flags) )
681 return 0;
682 return 1;
683 }
685 static int spurious_page_fault(
686 unsigned long addr, struct cpu_user_regs *regs)
687 {
688 struct domain *d = current->domain;
689 int is_spurious;
691 LOCK_BIGLOCK(d);
692 cleanup_writable_pagetable(d);
693 is_spurious = __spurious_page_fault(addr, regs);
694 UNLOCK_BIGLOCK(d);
696 return is_spurious;
697 }
699 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
700 {
701 struct vcpu *v = current;
702 struct domain *d = v->domain;
704 if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
705 {
706 if ( shadow_mode_external(d) && guest_mode(regs) )
707 return shadow_fault(addr, regs);
708 if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
709 return handle_gdt_ldt_mapping_fault(
710 addr - GDT_LDT_VIRT_START, regs);
711 /*
712 * Do not propagate spurious faults in the hypervisor area to the
713 * guest. It cannot fix them up.
714 */
715 return (spurious_page_fault(addr, regs) ? EXCRET_not_a_fault : 0);
716 }
718 if ( unlikely(shadow_mode_enabled(d)) )
719 return shadow_fault(addr, regs);
721 if ( likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
722 {
723 LOCK_BIGLOCK(d);
724 if ( unlikely(d->arch.ptwr[PTWR_PT_ACTIVE].l1va) &&
725 unlikely(l2_linear_offset(addr) ==
726 d->arch.ptwr[PTWR_PT_ACTIVE].l2_idx) )
727 {
728 ptwr_flush(d, PTWR_PT_ACTIVE);
729 UNLOCK_BIGLOCK(d);
730 return EXCRET_fault_fixed;
731 }
733 /*
734 * Note it is *not* safe to check PGERR_page_present here. It can be
735 * clear, due to unhooked page table, when we would otherwise expect
736 * it to be set. We have an aversion to trusting that flag in Xen, and
737 * guests ought to be leery too.
738 */
739 if ( guest_kernel_mode(v, regs) &&
740 (regs->error_code & PGERR_write_access) &&
741 ptwr_do_page_fault(d, addr, regs) )
742 {
743 UNLOCK_BIGLOCK(d);
744 return EXCRET_fault_fixed;
745 }
746 UNLOCK_BIGLOCK(d);
747 }
749 return 0;
750 }
752 /*
753 * #PF error code:
754 * Bit 0: Protection violation (=1) ; Page not present (=0)
755 * Bit 1: Write access
756 * Bit 2: User mode (=1) ; Supervisor mode (=0)
757 * Bit 3: Reserved bit violation
758 * Bit 4: Instruction fetch
759 */
760 asmlinkage int do_page_fault(struct cpu_user_regs *regs)
761 {
762 unsigned long addr, fixup;
763 int rc;
765 ASSERT(!in_irq());
767 __asm__ __volatile__ ("mov %%cr2,%0" : "=r" (addr) : );
769 DEBUGGER_trap_entry(TRAP_page_fault, regs);
771 perfc_incrc(page_faults);
773 if ( unlikely((rc = fixup_page_fault(addr, regs)) != 0) )
774 return rc;
776 if ( unlikely(!guest_mode(regs)) )
777 {
778 if ( spurious_page_fault(addr, regs) )
779 {
780 DPRINTK("Spurious fault in domain %u:%u at addr %lx\n",
781 current->domain->domain_id, current->vcpu_id, addr);
782 return EXCRET_not_a_fault;
783 }
785 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
786 {
787 perfc_incrc(copy_user_faults);
788 regs->eip = fixup;
789 return 0;
790 }
792 DEBUGGER_trap_fatal(TRAP_page_fault, regs);
794 show_registers(regs);
795 show_page_walk(addr);
796 panic("CPU%d FATAL PAGE FAULT\n"
797 "[error_code=%04x]\n"
798 "Faulting linear address: %p\n",
799 smp_processor_id(), regs->error_code, _p(addr));
800 }
802 propagate_page_fault(addr, regs->error_code);
803 return 0;
804 }
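As a worked illustration of the error-code bits documented above: error_code 0x03 (bits 0 and 1 set) is a supervisor-mode write that hit a present mapping, i.e. a protection violation such as a write to a read-only page; error_code 0x14 (bits 2 and 4 set) is a user-mode instruction fetch from a not-present page.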
806 long do_fpu_taskswitch(int set)
807 {
808 struct vcpu *v = current;
810 if ( set )
811 {
812 v->arch.guest_context.ctrlreg[0] |= X86_CR0_TS;
813 stts();
814 }
815 else
816 {
817 v->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
818 if ( test_bit(_VCPUF_fpu_dirtied, &v->vcpu_flags) )
819 clts();
820 }
822 return 0;
823 }
825 /* Has the guest requested sufficient permission for this I/O access? */
826 static inline int guest_io_okay(
827 unsigned int port, unsigned int bytes,
828 struct vcpu *v, struct cpu_user_regs *regs)
829 {
830 u16 x;
831 #if defined(__x86_64__)
832 /* If in user mode, switch to kernel mode just to read I/O bitmap. */
833 int user_mode = !(v->arch.flags & TF_kernel_mode);
834 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
835 #elif defined(__i386__)
836 #define TOGGLE_MODE() ((void)0)
837 #endif
839 if ( !vm86_mode(regs) &&
840 (v->arch.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
841 return 1;
843 if ( v->arch.iobmp_limit > (port + bytes) )
844 {
845 TOGGLE_MODE();
846 __get_user(x, (u16 *)(v->arch.iobmp+(port>>3)));
847 TOGGLE_MODE();
848 if ( (x & (((1<<bytes)-1) << (port&7))) == 0 )
849 return 1;
850 }
852 return 0;
853 }
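Worked example of the bitmap test above (arithmetic only): for a 2-byte access to port 0x3F8, the code reads the 16-bit word at byte offset 0x3F8 >> 3 = 0x7F of the guest's I/O bitmap and tests the mask ((1 << 2) - 1) << (0x3F8 & 7) = 0x3; the access is permitted only if both bits are clear.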
855 /* Has the administrator granted sufficient permission for this I/O access? */
856 static inline int admin_io_okay(
857 unsigned int port, unsigned int bytes,
858 struct vcpu *v, struct cpu_user_regs *regs)
859 {
860 return ioports_access_permitted(v->domain, port, port + bytes - 1);
861 }
863 /* Check admin limits. Silently fail the access if it is disallowed. */
864 #define inb_user(_p, _d, _r) (admin_io_okay(_p, 1, _d, _r) ? inb(_p) : ~0)
865 #define inw_user(_p, _d, _r) (admin_io_okay(_p, 2, _d, _r) ? inw(_p) : ~0)
866 #define inl_user(_p, _d, _r) (admin_io_okay(_p, 4, _d, _r) ? inl(_p) : ~0)
867 #define outb_user(_v, _p, _d, _r) \
868 (admin_io_okay(_p, 1, _d, _r) ? outb(_v, _p) : ((void)0))
869 #define outw_user(_v, _p, _d, _r) \
870 (admin_io_okay(_p, 2, _d, _r) ? outw(_v, _p) : ((void)0))
871 #define outl_user(_v, _p, _d, _r) \
872 (admin_io_okay(_p, 4, _d, _r) ? outl(_v, _p) : ((void)0))
874 /* Propagate a fault back to the guest kernel. */
875 #define PAGE_FAULT(_faultaddr, _errcode) \
876 ({ propagate_page_fault(_faultaddr, _errcode); \
877 return EXCRET_fault_fixed; \
878 })
880 /* Instruction fetch with error handling. */
881 #define insn_fetch(_type, _size, _ptr) \
882 ({ unsigned long _x; \
883 if ( get_user(_x, (_type *)eip) ) \
884 PAGE_FAULT(eip, 0); /* read fault */ \
885 eip += _size; (_type)_x; })
887 static int emulate_privileged_op(struct cpu_user_regs *regs)
888 {
889 struct vcpu *v = current;
890 unsigned long *reg, eip = regs->eip, res;
891 u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0;
892 unsigned int port, i, op_bytes = 4, data;
893 u32 l, h;
895 /* Legacy prefixes. */
896 for ( i = 0; i < 8; i++ )
897 {
898 switch ( opcode = insn_fetch(u8, 1, eip) )
899 {
900 case 0x66: /* operand-size override */
901 op_bytes ^= 6; /* switch between 2/4 bytes */
902 break;
903 case 0x67: /* address-size override */
904 case 0x2e: /* CS override */
905 case 0x3e: /* DS override */
906 case 0x26: /* ES override */
907 case 0x64: /* FS override */
908 case 0x65: /* GS override */
909 case 0x36: /* SS override */
910 case 0xf0: /* LOCK */
911 case 0xf2: /* REPNE/REPNZ */
912 break;
913 case 0xf3: /* REP/REPE/REPZ */
914 rep_prefix = 1;
915 break;
916 default:
917 goto done_prefixes;
918 }
919 }
920 done_prefixes:
922 #ifdef __x86_64__
923 /* REX prefix. */
924 if ( (opcode & 0xf0) == 0x40 )
925 {
926 modrm_reg = (opcode & 4) << 1; /* REX.R */
927 modrm_rm = (opcode & 1) << 3; /* REX.B */
929 /* REX.W and REX.X do not need to be decoded. */
930 opcode = insn_fetch(u8, 1, eip);
931 }
932 #endif
934 /* Input/Output String instructions. */
935 if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
936 {
937 if ( rep_prefix && (regs->ecx == 0) )
938 goto done;
940 continue_io_string:
941 switch ( opcode )
942 {
943 case 0x6c: /* INSB */
944 op_bytes = 1;
945 case 0x6d: /* INSW/INSL */
946 if ( !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
947 goto fail;
948 switch ( op_bytes )
949 {
950 case 1:
951 data = (u8)inb_user((u16)regs->edx, v, regs);
952 if ( put_user((u8)data, (u8 *)regs->edi) )
953 PAGE_FAULT(regs->edi, PGERR_write_access);
954 break;
955 case 2:
956 data = (u16)inw_user((u16)regs->edx, v, regs);
957 if ( put_user((u16)data, (u16 *)regs->edi) )
958 PAGE_FAULT(regs->edi, PGERR_write_access);
959 break;
960 case 4:
961 data = (u32)inl_user((u16)regs->edx, v, regs);
962 if ( put_user((u32)data, (u32 *)regs->edi) )
963 PAGE_FAULT(regs->edi, PGERR_write_access);
964 break;
965 }
966 regs->edi += (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes);
967 break;
969 case 0x6e: /* OUTSB */
970 op_bytes = 1;
971 case 0x6f: /* OUTSW/OUTSL */
972 if ( !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
973 goto fail;
974 switch ( op_bytes )
975 {
976 case 1:
977 if ( get_user(data, (u8 *)regs->esi) )
978 PAGE_FAULT(regs->esi, 0); /* read fault */
979 outb_user((u8)data, (u16)regs->edx, v, regs);
980 break;
981 case 2:
982 if ( get_user(data, (u16 *)regs->esi) )
983 PAGE_FAULT(regs->esi, 0); /* read fault */
984 outw_user((u16)data, (u16)regs->edx, v, regs);
985 break;
986 case 4:
987 if ( get_user(data, (u32 *)regs->esi) )
988 PAGE_FAULT(regs->esi, 0); /* read fault */
989 outl_user((u32)data, (u16)regs->edx, v, regs);
990 break;
991 }
992 regs->esi += (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes);
993 break;
994 }
996 if ( rep_prefix && (--regs->ecx != 0) )
997 {
998 if ( !hypercall_preempt_check() )
999 goto continue_io_string;
1000 eip = regs->eip;
1001 }
1003 goto done;
1004 }
1006 /* I/O Port and Interrupt Flag instructions. */
1007 switch ( opcode )
1009 case 0xe4: /* IN imm8,%al */
1010 op_bytes = 1;
1011 case 0xe5: /* IN imm8,%eax */
1012 port = insn_fetch(u8, 1, eip);
1013 exec_in:
1014 if ( !guest_io_okay(port, op_bytes, v, regs) )
1015 goto fail;
1016 switch ( op_bytes )
1018 case 1:
1019 regs->eax &= ~0xffUL;
1020 regs->eax |= (u8)inb_user(port, v, regs);
1021 break;
1022 case 2:
1023 regs->eax &= ~0xffffUL;
1024 regs->eax |= (u16)inw_user(port, v, regs);
1025 break;
1026 case 4:
1027 regs->eax = (u32)inl_user(port, v, regs);
1028 break;
1030 goto done;
1032 case 0xec: /* IN %dx,%al */
1033 op_bytes = 1;
1034 case 0xed: /* IN %dx,%eax */
1035 port = (u16)regs->edx;
1036 goto exec_in;
1038 case 0xe6: /* OUT %al,imm8 */
1039 op_bytes = 1;
1040 case 0xe7: /* OUT %eax,imm8 */
1041 port = insn_fetch(u8, 1, eip);
1042 exec_out:
1043 if ( !guest_io_okay(port, op_bytes, v, regs) )
1044 goto fail;
1045 switch ( op_bytes )
1047 case 1:
1048 outb_user((u8)regs->eax, port, v, regs);
1049 break;
1050 case 2:
1051 outw_user((u16)regs->eax, port, v, regs);
1052 break;
1053 case 4:
1054 outl_user((u32)regs->eax, port, v, regs);
1055 break;
1057 goto done;
1059 case 0xee: /* OUT %al,%dx */
1060 op_bytes = 1;
1061 case 0xef: /* OUT %eax,%dx */
1062 port = (u16)regs->edx;
1063 goto exec_out;
1065 case 0xfa: /* CLI */
1066 case 0xfb: /* STI */
1067 if ( v->arch.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
1068 goto fail;
1069 /*
1070 * This is just too dangerous to allow, in my opinion. Consider if the
1071 * caller then tries to reenable interrupts using POPF: we can't trap
1072 * that and we'll end up with hard-to-debug lockups. Fast & loose will
1073 * do for us. :-)
1074 */
1075 /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
1076 goto done;
1078 case 0x0f: /* Two-byte opcode */
1079 break;
1081 default:
1082 goto fail;
1085 /* Remaining instructions only emulated from guest kernel. */
1086 if ( !guest_kernel_mode(v, regs) )
1087 goto fail;
1089 /* Privileged (ring 0) instructions. */
1090 opcode = insn_fetch(u8, 1, eip);
1091 switch ( opcode )
1093 case 0x06: /* CLTS */
1094 (void)do_fpu_taskswitch(0);
1095 break;
1097 case 0x09: /* WBINVD */
1098 /* Ignore the instruction if unprivileged. */
1099 if ( !cache_flush_permitted(v->domain) )
1100 /* Non-physdev domain attempted WBINVD; ignore for now since
1101 newer linux uses this in some start-of-day timing loops */
1102 ;
1103 else
1104 wbinvd();
1105 break;
1107 case 0x20: /* MOV CR?,<reg> */
1108 opcode = insn_fetch(u8, 1, eip);
1109 modrm_reg |= (opcode >> 3) & 7;
1110 modrm_rm |= (opcode >> 0) & 7;
1111 reg = decode_register(modrm_rm, regs, 0);
1112 switch ( modrm_reg )
1114 case 0: /* Read CR0 */
1115 *reg = (read_cr0() & ~X86_CR0_TS) |
1116 v->arch.guest_context.ctrlreg[0];
1117 break;
1119 case 2: /* Read CR2 */
1120 *reg = v->arch.guest_context.ctrlreg[2];
1121 break;
1123 case 3: /* Read CR3 */
1124 *reg = xen_pfn_to_cr3(mfn_to_gmfn(
1125 v->domain, pagetable_get_pfn(v->arch.guest_table)));
1126 break;
1128 case 4: /* Read CR4 */
1129 /*
1130 * Guests can read CR4 to see what features Xen has enabled. We
1131 * therefore lie about PGE & PSE as they are unavailable to guests.
1132 */
1133 *reg = read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE);
1134 break;
1136 default:
1137 goto fail;
1139 break;
1141 case 0x21: /* MOV DR?,<reg> */
1142 opcode = insn_fetch(u8, 1, eip);
1143 modrm_reg |= (opcode >> 3) & 7;
1144 modrm_rm |= (opcode >> 0) & 7;
1145 reg = decode_register(modrm_rm, regs, 0);
1146 if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
1147 goto fail;
1148 *reg = res;
1149 break;
1151 case 0x22: /* MOV <reg>,CR? */
1152 opcode = insn_fetch(u8, 1, eip);
1153 modrm_reg |= (opcode >> 3) & 7;
1154 modrm_rm |= (opcode >> 0) & 7;
1155 reg = decode_register(modrm_rm, regs, 0);
1156 switch ( modrm_reg )
1158 case 0: /* Write CR0 */
1159 if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
1160 {
1161 DPRINTK("Attempt to change unmodifiable CR0 flags.\n");
1162 goto fail;
1163 }
1164 (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
1165 break;
1167 case 2: /* Write CR2 */
1168 v->arch.guest_context.ctrlreg[2] = *reg;
1169 v->vcpu_info->arch.cr2 = *reg;
1170 break;
1172 case 3: /* Write CR3 */
1173 LOCK_BIGLOCK(v->domain);
1174 cleanup_writable_pagetable(v->domain);
1175 (void)new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg)));
1176 UNLOCK_BIGLOCK(v->domain);
1177 break;
1179 case 4:
1180 if ( *reg != (read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE)) )
1181 {
1182 DPRINTK("Attempt to change CR4 flags.\n");
1183 goto fail;
1184 }
1185 break;
1187 default:
1188 goto fail;
1190 break;
1192 case 0x23: /* MOV <reg>,DR? */
1193 opcode = insn_fetch(u8, 1, eip);
1194 modrm_reg |= (opcode >> 3) & 7;
1195 modrm_rm |= (opcode >> 0) & 7;
1196 reg = decode_register(modrm_rm, regs, 0);
1197 if ( do_set_debugreg(modrm_reg, *reg) != 0 )
1198 goto fail;
1199 break;
1201 case 0x30: /* WRMSR */
1202 switch ( regs->ecx )
1204 #ifdef CONFIG_X86_64
1205 case MSR_FS_BASE:
1206 if ( wrmsr_safe(MSR_FS_BASE, regs->eax, regs->edx) )
1207 goto fail;
1208 v->arch.guest_context.fs_base =
1209 ((u64)regs->edx << 32) | regs->eax;
1210 break;
1211 case MSR_GS_BASE:
1212 if ( wrmsr_safe(MSR_GS_BASE, regs->eax, regs->edx) )
1213 goto fail;
1214 v->arch.guest_context.gs_base_kernel =
1215 ((u64)regs->edx << 32) | regs->eax;
1216 break;
1217 case MSR_SHADOW_GS_BASE:
1218 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, regs->eax, regs->edx) )
1219 goto fail;
1220 v->arch.guest_context.gs_base_user =
1221 ((u64)regs->edx << 32) | regs->eax;
1222 break;
1223 #endif
1224 default:
1225 if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
1226 (regs->eax != l) || (regs->edx != h) )
1227 DPRINTK("Domain attempted WRMSR %p from "
1228 "%08x:%08x to %08lx:%08lx.\n",
1229 _p(regs->ecx), h, l, (long)regs->edx, (long)regs->eax);
1230 break;
1232 break;
1234 case 0x32: /* RDMSR */
1235 switch ( regs->ecx )
1237 #ifdef CONFIG_X86_64
1238 case MSR_FS_BASE:
1239 regs->eax = v->arch.guest_context.fs_base & 0xFFFFFFFFUL;
1240 regs->edx = v->arch.guest_context.fs_base >> 32;
1241 break;
1242 case MSR_GS_BASE:
1243 regs->eax = v->arch.guest_context.gs_base_kernel & 0xFFFFFFFFUL;
1244 regs->edx = v->arch.guest_context.gs_base_kernel >> 32;
1245 break;
1246 case MSR_SHADOW_GS_BASE:
1247 regs->eax = v->arch.guest_context.gs_base_user & 0xFFFFFFFFUL;
1248 regs->edx = v->arch.guest_context.gs_base_user >> 32;
1249 break;
1250 #endif
1251 case MSR_EFER:
1252 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1253 goto fail;
1254 break;
1255 default:
1256 /* Everyone can read the MSR space. */
1257 /*DPRINTK("Domain attempted RDMSR %p.\n", _p(regs->ecx));*/
1258 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1259 goto fail;
1260 break;
1262 break;
1264 default:
1265 goto fail;
1268 done:
1269 regs->eip = eip;
1270 return EXCRET_fault_fixed;
1272 fail:
1273 return 0;
1276 asmlinkage int do_general_protection(struct cpu_user_regs *regs)
1278 struct vcpu *v = current;
1279 struct trap_bounce *tb = &v->arch.trap_bounce;
1280 struct trap_info *ti;
1281 unsigned long fixup;
1283 DEBUGGER_trap_entry(TRAP_gp_fault, regs);
1285 if ( regs->error_code & 1 )
1286 goto hardware_gp;
1288 if ( !guest_mode(regs) )
1289 goto gp_in_kernel;
1291 /*
1292 * Cunning trick to allow arbitrary "INT n" handling.
1294 * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
1295 * instruction from trapping to the appropriate vector, when that might not
1296 * be expected by Xen or the guest OS. For example, that entry might be for
1297 * a fault handler (unlike traps, faults don't increment EIP), or might
1298 * expect an error code on the stack (which a software trap never
1299 * provides), or might be a hardware interrupt handler that doesn't like
1300 * being called spuriously.
1302 * Instead, a GPF occurs with the faulting IDT vector in the error code.
1303 * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
1304 * clear to indicate that it's a software fault, not hardware.
1306 * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
1307 * okay because they can only be triggered by an explicit DPL-checked
1308 * instruction. The DPL specified by the guest OS for these vectors is NOT
1309 * CHECKED!!
1310 */
1311 if ( (regs->error_code & 3) == 2 )
1312 {
1313 /* This fault must be due to <INT n> instruction. */
1314 ti = &current->arch.guest_context.trap_ctxt[regs->error_code>>3];
1315 if ( permit_softint(TI_GET_DPL(ti), v, regs) )
1316 {
1317 tb->flags = TBF_EXCEPTION;
1318 regs->eip += 2;
1319 goto finish_propagation;
1320 }
1321 }
1323 /* Emulate some simple privileged and I/O instructions. */
1324 if ( (regs->error_code == 0) &&
1325 emulate_privileged_op(regs) )
1326 return 0;
1328 #if defined(__i386__)
1329 if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
1330 (regs->error_code == 0) &&
1331 gpf_emulate_4gb(regs) )
1332 return 0;
1333 #endif
1335 /* Pass on GPF as is. */
1336 ti = &current->arch.guest_context.trap_ctxt[TRAP_gp_fault];
1337 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
1338 tb->error_code = regs->error_code;
1339 finish_propagation:
1340 tb->cs = ti->cs;
1341 tb->eip = ti->address;
1342 if ( TI_GET_IF(ti) )
1343 tb->flags |= TBF_INTERRUPT;
1344 return 0;
1346 gp_in_kernel:
1348 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
1349 {
1350 DPRINTK("GPF (%04x): %p -> %p\n",
1351 regs->error_code, _p(regs->eip), _p(fixup));
1352 regs->eip = fixup;
1353 return 0;
1354 }
1356 DEBUGGER_trap_fatal(TRAP_gp_fault, regs);
1358 hardware_gp:
1359 show_registers(regs);
1360 panic("CPU%d GENERAL PROTECTION FAULT\n[error_code=%04x]\n",
1361 smp_processor_id(), regs->error_code);
1362 return 0;
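A worked example of the "cunning trick" described above: with every IDT descriptor's DPL forced to 0, a guest executing INT 0x30 takes a general protection fault with error code (0x30 << 3) | 2 = 0x182; bit 1 identifies the IDT as the source and a clear bit 0 marks it as a software fault, so the handler recovers the vector as error_code >> 3 = 0x30, checks the DPL the guest assigned to that vector, and bounces the trap with EIP advanced past the two-byte INT instruction.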
1365 static void nmi_softirq(void)
1367 /* Only used to defer wakeup of dom0,vcpu0 to a safe (non-NMI) context. */
1368 vcpu_kick(dom0->vcpu[0]);
1371 static void nmi_dom0_report(unsigned int reason_idx)
1373 struct domain *d;
1375 if ( (d = dom0) == NULL )
1376 return;
1378 set_bit(reason_idx, &d->shared_info->arch.nmi_reason);
1380 if ( test_and_set_bit(_VCPUF_nmi_pending, &d->vcpu[0]->vcpu_flags) )
1381 raise_softirq(NMI_SOFTIRQ); /* not safe to wake up a vcpu here */
1384 asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
1386 switch ( opt_nmi[0] )
1388 case 'd': /* 'dom0' */
1389 nmi_dom0_report(_XEN_NMIREASON_parity_error);
1390 case 'i': /* 'ignore' */
1391 break;
1392 default: /* 'fatal' */
1393 console_force_unlock();
1394 printk("\n\nNMI - MEMORY ERROR\n");
1395 fatal_trap(TRAP_nmi, regs);
1398 outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable parity check */
1399 mdelay(1);
1400 outb((inb(0x61) & 0x0b) | 0x00, 0x61); /* enable parity check */
1403 asmlinkage void io_check_error(struct cpu_user_regs *regs)
1405 switch ( opt_nmi[0] )
1407 case 'd': /* 'dom0' */
1408 nmi_dom0_report(_XEN_NMIREASON_io_error);
1409 case 'i': /* 'ignore' */
1410 break;
1411 default: /* 'fatal' */
1412 console_force_unlock();
1413 printk("\n\nNMI - I/O ERROR\n");
1414 fatal_trap(TRAP_nmi, regs);
1417 outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
1418 mdelay(1);
1419 outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
1422 static void unknown_nmi_error(unsigned char reason)
1424 switch ( opt_nmi[0] )
1426 case 'd': /* 'dom0' */
1427 nmi_dom0_report(_XEN_NMIREASON_unknown);
1428 case 'i': /* 'ignore' */
1429 break;
1430 default: /* 'fatal' */
1431 printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
1432 printk("Dazed and confused, but trying to continue\n");
1433 printk("Do you have a strange power saving mode enabled?\n");
1437 static int dummy_nmi_callback(struct cpu_user_regs *regs, int cpu)
1439 return 0;
1442 static nmi_callback_t nmi_callback = dummy_nmi_callback;
1444 asmlinkage void do_nmi(struct cpu_user_regs *regs)
1446 unsigned int cpu = smp_processor_id();
1447 unsigned char reason;
1449 ++nmi_count(cpu);
1451 if ( nmi_callback(regs, cpu) )
1452 return;
1454 if ( nmi_watchdog )
1455 nmi_watchdog_tick(regs);
1457 /* Only the BSP gets external NMIs from the system. */
1458 if ( cpu == 0 )
1459 {
1460 reason = inb(0x61);
1461 if ( reason & 0x80 )
1462 mem_parity_error(regs);
1463 else if ( reason & 0x40 )
1464 io_check_error(regs);
1465 else if ( !nmi_watchdog )
1466 unknown_nmi_error((unsigned char)(reason&0xff));
1467 }
1470 void set_nmi_callback(nmi_callback_t callback)
1472 nmi_callback = callback;
1475 void unset_nmi_callback(void)
1477 nmi_callback = dummy_nmi_callback;
1480 asmlinkage int math_state_restore(struct cpu_user_regs *regs)
1482 struct trap_bounce *tb;
1483 struct trap_info *ti;
1485 setup_fpu(current);
1487 if ( current->arch.guest_context.ctrlreg[0] & X86_CR0_TS )
1488 {
1489 tb = &current->arch.trap_bounce;
1490 ti = &current->arch.guest_context.trap_ctxt[TRAP_no_device];
1492 tb->flags = TBF_EXCEPTION;
1493 tb->cs = ti->cs;
1494 tb->eip = ti->address;
1495 if ( TI_GET_IF(ti) )
1496 tb->flags |= TBF_INTERRUPT;
1498 current->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
1499 }
1501 return EXCRET_fault_fixed;
1504 asmlinkage int do_debug(struct cpu_user_regs *regs)
1506 unsigned long condition;
1507 struct vcpu *v = current;
1508 struct trap_bounce *tb = &v->arch.trap_bounce;
1509 struct trap_info *ti;
1511 __asm__ __volatile__("mov %%db6,%0" : "=r" (condition));
1513 /* Mask out spurious debug traps due to lazy DR7 setting */
1514 if ( (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) &&
1515 (v->arch.guest_context.debugreg[7] == 0) )
1516 {
1517 __asm__("mov %0,%%db7" : : "r" (0UL));
1518 goto out;
1519 }
1521 DEBUGGER_trap_entry(TRAP_debug, regs);
1523 if ( !guest_mode(regs) )
1524 {
1525 /* Clear TF just for absolute sanity. */
1526 regs->eflags &= ~EF_TF;
1527 /*
1528 * We ignore watchpoints when they trigger within Xen. This may happen
1529 * when a buffer is passed to us which previously had a watchpoint set
1530 * on it. No need to bump EIP; the only faulting trap is an instruction
1531 * breakpoint, which can't happen to us.
1532 */
1533 goto out;
1534 }
1536 /* Save debug status register where guest OS can peek at it */
1537 v->arch.guest_context.debugreg[6] = condition;
1539 ti = &v->arch.guest_context.trap_ctxt[TRAP_debug];
1540 tb->flags = TBF_EXCEPTION;
1541 tb->cs = ti->cs;
1542 tb->eip = ti->address;
1543 if ( TI_GET_IF(ti) )
1544 tb->flags |= TBF_INTERRUPT;
1546 out:
1547 return EXCRET_not_a_fault;
1550 asmlinkage int do_spurious_interrupt_bug(struct cpu_user_regs *regs)
1552 return EXCRET_not_a_fault;
1555 void set_intr_gate(unsigned int n, void *addr)
1557 #ifdef __i386__
1558 int i;
1559 /* Keep secondary tables in sync with IRQ updates. */
1560 for ( i = 1; i < NR_CPUS; i++ )
1561 if ( idt_tables[i] != NULL )
1562 _set_gate(&idt_tables[i][n], 14, 0, addr);
1563 #endif
1564 _set_gate(&idt_table[n], 14, 0, addr);
1567 void set_system_gate(unsigned int n, void *addr)
1569 _set_gate(idt_table+n,14,3,addr);
1572 void set_task_gate(unsigned int n, unsigned int sel)
1574 idt_table[n].a = sel << 16;
1575 idt_table[n].b = 0x8500;
1578 void set_tss_desc(unsigned int n, void *addr)
1580 _set_tssldt_desc(
1581 gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
1582 (unsigned long)addr,
1583 offsetof(struct tss_struct, __cacheline_filler) - 1,
1584 9);
1587 void __init trap_init(void)
1589 extern void percpu_traps_init(void);
1591 /*
1592 * Note that interrupt gates are always used, rather than trap gates. We
1593 * must have interrupts disabled until DS/ES/FS/GS are saved because the
1594 * first activation must have the "bad" value(s) for these registers and
1595 * we may lose them if another activation is installed before they are
1596 * saved. The page-fault handler also needs interrupts disabled until %cr2
1597 * has been read and saved on the stack.
1598 */
1599 set_intr_gate(TRAP_divide_error,&divide_error);
1600 set_intr_gate(TRAP_debug,&debug);
1601 set_intr_gate(TRAP_nmi,&nmi);
1602 set_system_gate(TRAP_int3,&int3); /* usable from all privileges */
1603 set_system_gate(TRAP_overflow,&overflow); /* usable from all privileges */
1604 set_intr_gate(TRAP_bounds,&bounds);
1605 set_intr_gate(TRAP_invalid_op,&invalid_op);
1606 set_intr_gate(TRAP_no_device,&device_not_available);
1607 set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
1608 set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
1609 set_intr_gate(TRAP_no_segment,&segment_not_present);
1610 set_intr_gate(TRAP_stack_error,&stack_segment);
1611 set_intr_gate(TRAP_gp_fault,&general_protection);
1612 set_intr_gate(TRAP_page_fault,&page_fault);
1613 set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
1614 set_intr_gate(TRAP_copro_error,&coprocessor_error);
1615 set_intr_gate(TRAP_alignment_check,&alignment_check);
1616 set_intr_gate(TRAP_machine_check,&machine_check);
1617 set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
1619 percpu_traps_init();
1621 cpu_init();
1623 open_softirq(NMI_SOFTIRQ, nmi_softirq);
1627 long do_set_trap_table(XEN_GUEST_HANDLE(trap_info_t) traps)
1629 struct trap_info cur;
1630 struct trap_info *dst = current->arch.guest_context.trap_ctxt;
1631 long rc = 0;
1633 /* If no table is presented then clear the entire virtual IDT. */
1634 if ( guest_handle_is_null(traps) )
1635 {
1636 memset(dst, 0, 256 * sizeof(*dst));
1637 init_int80_direct_trap(current);
1638 return 0;
1639 }
1641 for ( ; ; )
1643 if ( hypercall_preempt_check() )
1644 {
1645 rc = hypercall_create_continuation(
1646 __HYPERVISOR_set_trap_table, "h", traps);
1647 break;
1648 }
1650 if ( copy_from_guest(&cur, traps, 1) )
1651 {
1652 rc = -EFAULT;
1653 break;
1654 }
1656 if ( cur.address == 0 )
1657 break;
1659 fixup_guest_code_selector(cur.cs);
1661 memcpy(&dst[cur.vector], &cur, sizeof(cur));
1663 if ( cur.vector == 0x80 )
1664 init_int80_direct_trap(current);
1666 guest_handle_add_offset(traps, 1);
1669 return rc;
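For context, a guest populates this virtual IDT with an array of trap_info entries terminated by a zero address, matching the loop above. A guest-side sketch follows; the entry-point symbol is hypothetical, and the flags field carries the vector's DPL in its low bits:

/* Hypothetical guest-side table: allow ring-3 code to issue int $0x80.
 * Layout follows the public-interface trap_info: vector, flags, cs, address. */
static struct trap_info bounce_table[] = {
    { 0x80, 3, FLAT_KERNEL_CS, (unsigned long)int80_entry },
    { 0, 0, 0, 0 }   /* address == 0 terminates the table */
};
/* ... HYPERVISOR_set_trap_table(bounce_table); ... */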
1673 long set_debugreg(struct vcpu *p, int reg, unsigned long value)
1675 int i;
1677 switch ( reg )
1679 case 0:
1680 if ( !access_ok(value, sizeof(long)) )
1681 return -EPERM;
1682 if ( p == current )
1683 __asm__ ( "mov %0, %%db0" : : "r" (value) );
1684 break;
1685 case 1:
1686 if ( !access_ok(value, sizeof(long)) )
1687 return -EPERM;
1688 if ( p == current )
1689 __asm__ ( "mov %0, %%db1" : : "r" (value) );
1690 break;
1691 case 2:
1692 if ( !access_ok(value, sizeof(long)) )
1693 return -EPERM;
1694 if ( p == current )
1695 __asm__ ( "mov %0, %%db2" : : "r" (value) );
1696 break;
1697 case 3:
1698 if ( !access_ok(value, sizeof(long)) )
1699 return -EPERM;
1700 if ( p == current )
1701 __asm__ ( "mov %0, %%db3" : : "r" (value) );
1702 break;
1703 case 6:
1704 /*
1705 * DR6: Bits 4-11,16-31 reserved (set to 1).
1706 * Bit 12 reserved (set to 0).
1707 */
1708 value &= 0xffffefff; /* reserved bits => 0 */
1709 value |= 0xffff0ff0; /* reserved bits => 1 */
1710 if ( p == current )
1711 __asm__ ( "mov %0, %%db6" : : "r" (value) );
1712 break;
1713 case 7:
1714 /*
1715 * DR7: Bit 10 reserved (set to 1).
1716 * Bits 11-12,14-15 reserved (set to 0).
1717 * Privileged bits:
1718 * GD (bit 13): must be 0.
1719 * R/Wn (bits 16-17,20-21,24-25,28-29): mustn't be 10.
1720 * LENn (bits 18-19,22-23,26-27,30-31): mustn't be 10.
1721 */
1722 /* DR7 == 0 => debugging disabled for this domain. */
1723 if ( value != 0 )
1724 {
1725 value &= 0xffff27ff; /* reserved bits => 0 */
1726 value |= 0x00000400; /* reserved bits => 1 */
1727 if ( (value & (1<<13)) != 0 ) return -EPERM;
1728 for ( i = 0; i < 16; i += 2 )
1729 if ( ((value >> (i+16)) & 3) == 2 ) return -EPERM;
1730 }
1731 if ( p == current )
1732 __asm__ ( "mov %0, %%db7" : : "r" (value) );
1733 break;
1734 default:
1735 return -EINVAL;
1738 p->arch.guest_context.debugreg[reg] = value;
1739 return 0;
1742 long do_set_debugreg(int reg, unsigned long value)
1744 return set_debugreg(current, reg, value);
1747 unsigned long do_get_debugreg(int reg)
1749 if ( (reg < 0) || (reg > 7) ) return -EINVAL;
1750 return current->arch.guest_context.debugreg[reg];
1753 /*
1754 * Local variables:
1755 * mode: C
1756 * c-set-style: "BSD"
1757 * c-basic-offset: 4
1758 * tab-width: 4
1759 * indent-tabs-mode: nil
1760 * End:
1761 */