ia64/xen-unstable

view xen/arch/x86/traps.c @ 10879:2d2ed4d9b1c1

[XEN] There is some suspicion that we may enter an infinite
#PF loop due to broken spurious page-fault detection.
Beef up the tracing on that code path so we can catch
some useful info if it happens.
Signed-off-by: Keir Fraser <keir@xensource.com>
author kfraser@localhost.localdomain
date Mon Jul 31 10:40:21 2006 +0100 (2006-07-31)
parents 37f206c7405a
children 0d2ba35c0cf2
line source
1 /******************************************************************************
2 * arch/x86/traps.c
3 *
4 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * Copyright (C) 1991, 1992 Linus Torvalds
23 *
24 * Pentium III FXSR, SSE support
25 * Gareth Hughes <gareth@valinux.com>, May 2000
26 */
28 #include <xen/config.h>
29 #include <xen/init.h>
30 #include <xen/sched.h>
31 #include <xen/lib.h>
32 #include <xen/errno.h>
33 #include <xen/mm.h>
34 #include <xen/console.h>
35 #include <xen/reboot.h>
36 #include <asm/regs.h>
37 #include <xen/delay.h>
38 #include <xen/event.h>
39 #include <xen/spinlock.h>
40 #include <xen/irq.h>
41 #include <xen/perfc.h>
42 #include <xen/softirq.h>
43 #include <xen/domain_page.h>
44 #include <xen/symbols.h>
45 #include <xen/iocap.h>
46 #include <xen/nmi.h>
47 #include <asm/shadow.h>
48 #include <asm/system.h>
49 #include <asm/io.h>
50 #include <asm/atomic.h>
51 #include <asm/desc.h>
52 #include <asm/debugreg.h>
53 #include <asm/smp.h>
54 #include <asm/flushtlb.h>
55 #include <asm/uaccess.h>
56 #include <asm/i387.h>
57 #include <asm/debugger.h>
58 #include <asm/msr.h>
59 #include <asm/x86_emulate.h>
61 /*
62 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
63 * fatal: Xen prints diagnostic message and then hangs.
64 * dom0: The NMI is virtualised to DOM0.
65 * ignore: The NMI error is cleared and ignored.
66 */
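/*
 * Illustrative usage (hypothetical command line): booting Xen with
 * "nmi=ignore" or "nmi=fatal" overrides the default chosen below; the
 * string_param("nmi", opt_nmi) registration further down parses the option.
 */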
67 #ifdef NDEBUG
68 char opt_nmi[10] = "dom0";
69 #else
70 char opt_nmi[10] = "fatal";
71 #endif
72 string_param("nmi", opt_nmi);
74 /* Master table, used by all CPUs on x86/64, and by CPU0 on x86/32.*/
75 idt_entry_t idt_table[IDT_ENTRIES];
77 #define DECLARE_TRAP_HANDLER(_name) \
78 asmlinkage void _name(void); \
79 asmlinkage int do_ ## _name(struct cpu_user_regs *regs)
81 asmlinkage void nmi(void);
82 DECLARE_TRAP_HANDLER(divide_error);
83 DECLARE_TRAP_HANDLER(debug);
84 DECLARE_TRAP_HANDLER(int3);
85 DECLARE_TRAP_HANDLER(overflow);
86 DECLARE_TRAP_HANDLER(bounds);
87 DECLARE_TRAP_HANDLER(invalid_op);
88 DECLARE_TRAP_HANDLER(device_not_available);
89 DECLARE_TRAP_HANDLER(coprocessor_segment_overrun);
90 DECLARE_TRAP_HANDLER(invalid_TSS);
91 DECLARE_TRAP_HANDLER(segment_not_present);
92 DECLARE_TRAP_HANDLER(stack_segment);
93 DECLARE_TRAP_HANDLER(general_protection);
94 DECLARE_TRAP_HANDLER(page_fault);
95 DECLARE_TRAP_HANDLER(coprocessor_error);
96 DECLARE_TRAP_HANDLER(simd_coprocessor_error);
97 DECLARE_TRAP_HANDLER(alignment_check);
98 DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
99 DECLARE_TRAP_HANDLER(machine_check);
101 long do_set_debugreg(int reg, unsigned long value);
102 unsigned long do_get_debugreg(int reg);
104 static int debug_stack_lines = 20;
105 integer_param("debug_stack_lines", debug_stack_lines);
107 #ifdef CONFIG_X86_32
108 #define stack_words_per_line 8
109 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
110 #else
111 #define stack_words_per_line 4
112 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
113 #endif
115 int is_kernel_text(unsigned long addr)
116 {
117 extern char _stext, _etext;
118 if (addr >= (unsigned long) &_stext &&
119 addr <= (unsigned long) &_etext)
120 return 1;
121 return 0;
123 }
125 unsigned long kernel_text_end(void)
126 {
127 extern char _etext;
128 return (unsigned long) &_etext;
129 }
131 static void show_guest_stack(struct cpu_user_regs *regs)
132 {
133 int i;
134 unsigned long *stack, addr;
136 if ( hvm_guest(current) )
137 return;
139 if ( vm86_mode(regs) )
140 {
141 stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff));
142 printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ",
143 regs->ss, (uint16_t)(regs->esp & 0xffff));
144 }
145 else
146 {
147 stack = (unsigned long *)regs->esp;
148 printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
149 }
151 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
152 {
153 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
154 break;
155 if ( get_user(addr, stack) )
156 {
157 if ( i != 0 )
158 printk("\n ");
159 printk("Fault while accessing guest memory.");
160 i = 1;
161 break;
162 }
163 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
164 printk("\n ");
165 printk(" %p", _p(addr));
166 stack++;
167 }
168 if ( i == 0 )
169 printk("Stack empty.");
170 printk("\n");
171 }
173 #ifdef NDEBUG
175 static void show_trace(struct cpu_user_regs *regs)
176 {
177 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
179 printk("Xen call trace:\n ");
181 printk("[<%p>]", _p(regs->eip));
182 print_symbol(" %s\n ", regs->eip);
184 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
185 {
186 addr = *stack++;
187 if ( is_kernel_text(addr) )
188 {
189 printk("[<%p>]", _p(addr));
190 print_symbol(" %s\n ", addr);
191 }
192 }
194 printk("\n");
195 }
197 #else
199 static void show_trace(struct cpu_user_regs *regs)
200 {
201 unsigned long *frame, next, addr, low, high;
203 printk("Xen call trace:\n ");
205 printk("[<%p>]", _p(regs->eip));
206 print_symbol(" %s\n ", regs->eip);
208 /* Bounds for range of valid frame pointer. */
209 low = (unsigned long)(ESP_BEFORE_EXCEPTION(regs) - 2);
210 high = (low & ~(STACK_SIZE - 1)) +
211 (STACK_SIZE - sizeof(struct cpu_info) - 2*sizeof(unsigned long));
213 /* The initial frame pointer. */
214 next = regs->ebp;
216 for ( ; ; )
217 {
218 /* Valid frame pointer? */
219 if ( (next < low) || (next >= high) )
220 {
221 /*
222 * Exception stack frames have a different layout, denoted by an
223 * inverted frame pointer.
224 */
225 next = ~next;
226 if ( (next < low) || (next >= high) )
227 break;
228 frame = (unsigned long *)next;
229 next = frame[0];
230 addr = frame[(offsetof(struct cpu_user_regs, eip) -
231 offsetof(struct cpu_user_regs, ebp))
232 / BYTES_PER_LONG];
233 }
234 else
235 {
236 /* Ordinary stack frame. */
237 frame = (unsigned long *)next;
238 next = frame[0];
239 addr = frame[1];
240 }
242 printk("[<%p>]", _p(addr));
243 print_symbol(" %s\n ", addr);
245 low = (unsigned long)&frame[2];
246 }
248 printk("\n");
249 }
251 #endif
253 void show_stack(struct cpu_user_regs *regs)
254 {
255 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
256 int i;
258 if ( guest_mode(regs) )
259 return show_guest_stack(regs);
261 printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
263 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
264 {
265 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
266 break;
267 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
268 printk("\n ");
269 addr = *stack++;
270 printk(" %p", _p(addr));
271 }
272 if ( i == 0 )
273 printk("Stack empty.");
274 printk("\n");
276 show_trace(regs);
277 }
279 void show_stack_overflow(unsigned long esp)
280 {
281 #ifdef MEMORY_GUARD
282 unsigned long esp_top;
283 unsigned long *stack, addr;
285 esp_top = (esp | (STACK_SIZE - 1)) - DEBUG_STACK_SIZE;
287 /* Trigger overflow trace if %esp is within 512 bytes of the guard page. */
288 if ( ((unsigned long)(esp - esp_top) > 512) &&
289 ((unsigned long)(esp_top - esp) > 512) )
290 return;
292 if ( esp < esp_top )
293 esp = esp_top;
295 printk("Xen stack overflow:\n ");
297 stack = (unsigned long *)esp;
298 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
299 {
300 addr = *stack++;
301 if ( is_kernel_text(addr) )
302 {
303 printk("%p: [<%p>]", stack, _p(addr));
304 print_symbol(" %s\n ", addr);
305 }
306 }
308 printk("\n");
309 #endif
310 }
312 void show_execution_state(struct cpu_user_regs *regs)
313 {
314 show_registers(regs);
315 show_stack(regs);
316 }
318 /*
319 * This is called for faults at very unexpected times (e.g., when interrupts
320 * are disabled). In such situations we can't do much that is safe. We try to
321 * print out some tracing and then we just spin.
322 */
323 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs)
324 {
325 int cpu = smp_processor_id();
326 unsigned long cr2;
327 static char *trapstr[] = {
328 "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
329 "invalid opcode", "device not available", "double fault",
330 "coprocessor segment", "invalid tss", "segment not found",
331 "stack error", "general protection fault", "page fault",
332 "spurious interrupt", "coprocessor error", "alignment check",
333 "machine check", "simd error"
334 };
336 watchdog_disable();
337 console_start_sync();
339 show_execution_state(regs);
341 if ( trapnr == TRAP_page_fault )
342 {
343 __asm__ __volatile__ ("mov %%cr2,%0" : "=r" (cr2) : );
344 printk("Faulting linear address: %p\n", _p(cr2));
345 show_page_walk(cr2);
346 }
348 printk("************************************\n");
349 printk("CPU%d FATAL TRAP %d (%s), ERROR_CODE %04x%s.\n",
350 cpu, trapnr, trapstr[trapnr], regs->error_code,
351 (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
352 printk("System shutting down -- need manual reset.\n");
353 printk("************************************\n");
355 (void)debugger_trap_fatal(trapnr, regs);
357 /* Lock up the console to prevent spurious output from other CPUs. */
358 console_force_lock();
360 /* Wait for manual reset. */
361 machine_halt();
362 }
364 static inline int do_trap(int trapnr, char *str,
365 struct cpu_user_regs *regs,
366 int use_error_code)
367 {
368 struct vcpu *v = current;
369 struct trap_bounce *tb = &v->arch.trap_bounce;
370 struct trap_info *ti;
371 unsigned long fixup;
373 DEBUGGER_trap_entry(trapnr, regs);
375 if ( !guest_mode(regs) )
376 goto xen_fault;
378 ti = &current->arch.guest_context.trap_ctxt[trapnr];
379 tb->flags = TBF_EXCEPTION;
380 tb->cs = ti->cs;
381 tb->eip = ti->address;
382 if ( use_error_code )
383 {
384 tb->flags |= TBF_EXCEPTION_ERRCODE;
385 tb->error_code = regs->error_code;
386 }
387 if ( TI_GET_IF(ti) )
388 tb->flags |= TBF_INTERRUPT;
389 return 0;
391 xen_fault:
393 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
394 {
395 DPRINTK("Trap %d: %p -> %p\n", trapnr, _p(regs->eip), _p(fixup));
396 regs->eip = fixup;
397 return 0;
398 }
400 DEBUGGER_trap_fatal(trapnr, regs);
402 show_execution_state(regs);
403 panic("CPU%d FATAL TRAP: vector = %d (%s)\n"
404 "[error_code=%04x]\n",
405 smp_processor_id(), trapnr, str, regs->error_code);
406 return 0;
407 }
409 #define DO_ERROR_NOCODE(trapnr, str, name) \
410 asmlinkage int do_##name(struct cpu_user_regs *regs) \
411 { \
412 return do_trap(trapnr, str, regs, 0); \
413 }
415 #define DO_ERROR(trapnr, str, name) \
416 asmlinkage int do_##name(struct cpu_user_regs *regs) \
417 { \
418 return do_trap(trapnr, str, regs, 1); \
419 }
421 DO_ERROR_NOCODE( 0, "divide error", divide_error)
422 DO_ERROR_NOCODE( 4, "overflow", overflow)
423 DO_ERROR_NOCODE( 5, "bounds", bounds)
424 DO_ERROR_NOCODE( 9, "coprocessor segment overrun", coprocessor_segment_overrun)
425 DO_ERROR(10, "invalid TSS", invalid_TSS)
426 DO_ERROR(11, "segment not present", segment_not_present)
427 DO_ERROR(12, "stack segment", stack_segment)
428 DO_ERROR_NOCODE(16, "fpu error", coprocessor_error)
429 DO_ERROR(17, "alignment check", alignment_check)
430 DO_ERROR_NOCODE(19, "simd error", simd_coprocessor_error)
432 int cpuid_hypervisor_leaves(
433 uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
434 {
435 if ( (idx < 0x40000000) || (idx > 0x40000000) )
436 return 0;
438 switch ( idx - 0x40000000 )
439 {
440 case 0:
441 *eax = 0x40000000;
442 *ebx = 0x006e6558; /* "Xen\0" */
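/* Little-endian in EBX: byte 0 = 0x58 'X', byte 1 = 0x65 'e', byte 2 = 0x6e 'n', byte 3 = 0x00. */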
443 *ecx = *edx = 0;
444 break;
446 default:
447 BUG();
448 }
450 return 1;
451 }
453 static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
454 {
455 char sig[5], instr[2];
456 uint32_t a, b, c, d;
457 unsigned long eip, rc;
459 a = regs->eax;
460 b = regs->ebx;
461 c = regs->ecx;
462 d = regs->edx;
463 eip = regs->eip;
465 /* Check for forced emulation signature: ud2 ; .ascii "xen". */
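/*
 * Illustrative guest-side sequence (not part of this file): a PV guest can
 * force CPUID filtering by placing the signature bytes 0f 0b 'x' 'e' 'n'
 * directly before CPUID, e.g. "ud2a ; .ascii \"xen\" ; cpuid" in inline
 * assembly. The ud2a raises #UD, and the checks below verify the signature
 * and the following 0f a2 (CPUID) opcode before emulating.
 */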
466 if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
467 {
468 propagate_page_fault(eip + sizeof(sig) - rc, 0);
469 return EXCRET_fault_fixed;
470 }
471 if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) )
472 return 0;
473 eip += sizeof(sig);
475 /* We only emulate CPUID. */
476 if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
477 {
478 propagate_page_fault(eip + sizeof(instr) - rc, 0);
479 return EXCRET_fault_fixed;
480 }
481 if ( memcmp(instr, "\xf\xa2", sizeof(instr)) )
482 return 0;
483 eip += sizeof(instr);
485 __asm__ (
486 "cpuid"
487 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
488 : "0" (a), "1" (b), "2" (c), "3" (d) );
490 if ( regs->eax == 1 )
491 {
492 /* Modify Feature Information. */
493 clear_bit(X86_FEATURE_VME, &d);
494 clear_bit(X86_FEATURE_DE, &d);
495 clear_bit(X86_FEATURE_PSE, &d);
496 clear_bit(X86_FEATURE_PGE, &d);
497 if ( !supervisor_mode_kernel )
498 clear_bit(X86_FEATURE_SEP, &d);
499 if ( !IS_PRIV(current->domain) )
500 clear_bit(X86_FEATURE_MTRR, &d);
501 }
502 else
503 {
504 (void)cpuid_hypervisor_leaves(regs->eax, &a, &b, &c, &d);
505 }
507 regs->eax = a;
508 regs->ebx = b;
509 regs->ecx = c;
510 regs->edx = d;
511 regs->eip = eip;
513 return EXCRET_fault_fixed;
514 }
516 asmlinkage int do_invalid_op(struct cpu_user_regs *regs)
517 {
518 struct vcpu *v = current;
519 struct trap_bounce *tb = &v->arch.trap_bounce;
520 struct trap_info *ti;
521 int rc;
523 DEBUGGER_trap_entry(TRAP_invalid_op, regs);
525 if ( unlikely(!guest_mode(regs)) )
526 {
527 char sig[5];
528 /* Signature (ud2; .ascii "dbg") indicates dump state and continue. */
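/*
 * Illustrative use (assumption, not shown in this file): embedding the bytes
 * 0f 0b 'd' 'b' 'g' at a point of interest in Xen code makes this handler
 * dump registers and stack via show_execution_state() and then resume at
 * the instruction following the 5-byte signature.
 */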
529 if ( (__copy_from_user(sig, (char *)regs->eip, sizeof(sig)) == 0) &&
530 (memcmp(sig, "\xf\xb""dbg", sizeof(sig)) == 0) )
531 {
532 show_execution_state(regs);
533 regs->eip += sizeof(sig);
534 return EXCRET_fault_fixed;
535 }
536 printk("%02x %02x %02x %02x %02x\n",
537 (unsigned char)sig[0],
538 (unsigned char)sig[1],
539 (unsigned char)sig[2],
540 (unsigned char)sig[3],
541 (unsigned char)sig[4]);
542 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
543 show_execution_state(regs);
544 panic("CPU%d FATAL TRAP: vector = %d (invalid opcode)\n",
545 smp_processor_id(), TRAP_invalid_op);
546 }
548 if ( (rc = emulate_forced_invalid_op(regs)) != 0 )
549 return rc;
551 ti = &current->arch.guest_context.trap_ctxt[TRAP_invalid_op];
552 tb->flags = TBF_EXCEPTION;
553 tb->cs = ti->cs;
554 tb->eip = ti->address;
555 if ( TI_GET_IF(ti) )
556 tb->flags |= TBF_INTERRUPT;
558 return 0;
559 }
561 asmlinkage int do_int3(struct cpu_user_regs *regs)
562 {
563 struct vcpu *v = current;
564 struct trap_bounce *tb = &v->arch.trap_bounce;
565 struct trap_info *ti;
567 DEBUGGER_trap_entry(TRAP_int3, regs);
569 if ( !guest_mode(regs) )
570 {
571 DEBUGGER_trap_fatal(TRAP_int3, regs);
572 show_execution_state(regs);
573 panic("CPU%d FATAL TRAP: vector = 3 (Int3)\n", smp_processor_id());
574 }
576 ti = &current->arch.guest_context.trap_ctxt[TRAP_int3];
577 tb->flags = TBF_EXCEPTION;
578 tb->cs = ti->cs;
579 tb->eip = ti->address;
580 if ( TI_GET_IF(ti) )
581 tb->flags |= TBF_INTERRUPT;
583 return 0;
584 }
586 asmlinkage int do_machine_check(struct cpu_user_regs *regs)
587 {
588 fatal_trap(TRAP_machine_check, regs);
589 return 0;
590 }
592 void propagate_page_fault(unsigned long addr, u16 error_code)
593 {
594 struct trap_info *ti;
595 struct vcpu *v = current;
596 struct trap_bounce *tb = &v->arch.trap_bounce;
598 v->arch.guest_context.ctrlreg[2] = addr;
599 v->vcpu_info->arch.cr2 = addr;
601 /* Re-set error_code.user flag appropriately for the guest. */
602 error_code &= ~PGERR_user_mode;
603 if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
604 error_code |= PGERR_user_mode;
606 ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
607 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
608 tb->error_code = error_code;
609 tb->cs = ti->cs;
610 tb->eip = ti->address;
611 if ( TI_GET_IF(ti) )
612 tb->flags |= TBF_INTERRUPT;
613 }
615 static int handle_gdt_ldt_mapping_fault(
616 unsigned long offset, struct cpu_user_regs *regs)
617 {
618 extern int map_ldt_shadow_page(unsigned int);
620 struct vcpu *v = current;
621 struct domain *d = v->domain;
622 int ret;
624 /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
625 unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
626 unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
628 /* Should never fault in another vcpu's area. */
629 BUG_ON(vcpu_area != current->vcpu_id);
631 /* Byte offset within the gdt/ldt sub-area. */
632 offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
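/*
 * Worked example with an illustrative shift value: if GDT_LDT_VCPU_VA_SHIFT
 * were 23, a fault by vcpu 1 at offset 0x00C02010 would give vcpu_area == 1,
 * is_ldt_area == 1 (bit 22 set), and, after the mask above, byte offset
 * 0x2010 within that vcpu's LDT sub-area.
 */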
634 if ( likely(is_ldt_area) )
635 {
636 /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
637 LOCK_BIGLOCK(d);
638 cleanup_writable_pagetable(d);
639 ret = map_ldt_shadow_page(offset >> PAGE_SHIFT);
640 UNLOCK_BIGLOCK(d);
642 if ( unlikely(ret == 0) )
643 {
644 /* In hypervisor mode? Leave it to the #PF handler to fix up. */
645 if ( !guest_mode(regs) )
646 return 0;
647 /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */
648 propagate_page_fault(
649 v->arch.guest_context.ldt_base + offset, regs->error_code);
650 }
651 }
652 else
653 {
654 /* GDT fault: handle the fault as #GP(selector). */
655 regs->error_code = (u16)offset & ~7;
656 (void)do_general_protection(regs);
657 }
659 return EXCRET_fault_fixed;
660 }
662 #ifdef HYPERVISOR_VIRT_END
663 #define IN_HYPERVISOR_RANGE(va) \
664 (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
665 #else
666 #define IN_HYPERVISOR_RANGE(va) \
667 (((va) >= HYPERVISOR_VIRT_START))
668 #endif
670 static int __spurious_page_fault(
671 unsigned long addr, struct cpu_user_regs *regs)
672 {
673 unsigned long mfn, cr3 = read_cr3();
674 #if CONFIG_PAGING_LEVELS >= 4
675 l4_pgentry_t l4e, *l4t;
676 #endif
677 #if CONFIG_PAGING_LEVELS >= 3
678 l3_pgentry_t l3e, *l3t;
679 #endif
680 l2_pgentry_t l2e, *l2t;
681 l1_pgentry_t l1e, *l1t;
682 unsigned int required_flags, disallowed_flags;
684 /* Reserved bit violations are never spurious faults. */
685 if ( regs->error_code & PGERR_reserved_bit )
686 return 0;
688 required_flags = _PAGE_PRESENT;
689 if ( regs->error_code & PGERR_write_access )
690 required_flags |= _PAGE_RW;
691 if ( regs->error_code & PGERR_user_mode )
692 required_flags |= _PAGE_USER;
694 disallowed_flags = 0;
695 if ( regs->error_code & PGERR_instr_fetch )
696 disallowed_flags |= _PAGE_NX;
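/*
 * The walk below compares each level's flags against the access implied by
 * the error code (present, plus RW for a write, plus USER for a user-mode
 * fault) and rejects any level with _PAGE_NX set for an instruction fetch.
 * Only when the live page tables already permit the access is the fault
 * treated as spurious (e.g. a stale TLB entry left over from a
 * just-completed page-table update).
 */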
698 mfn = cr3 >> PAGE_SHIFT;
700 #if CONFIG_PAGING_LEVELS >= 4
701 l4t = map_domain_page(mfn);
702 l4e = l4t[l4_table_offset(addr)];
703 mfn = l4e_get_pfn(l4e);
704 unmap_domain_page(l4t);
705 if ( !(l4e_get_flags(l4e) & required_flags) ||
706 (l4e_get_flags(l4e) & disallowed_flags) )
707 return 0;
708 #endif
710 #if CONFIG_PAGING_LEVELS >= 3
711 l3t = map_domain_page(mfn);
712 #ifdef CONFIG_X86_PAE
713 l3t += (cr3 & 0xFE0UL) >> 3;
714 #endif
715 l3e = l3t[l3_table_offset(addr)];
716 mfn = l3e_get_pfn(l3e);
717 unmap_domain_page(l3t);
718 #ifdef CONFIG_X86_PAE
719 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
720 return 0;
721 #else
722 if ( !(l3e_get_flags(l3e) & required_flags) ||
723 (l3e_get_flags(l3e) & disallowed_flags) )
724 return 0;
725 #endif
726 #endif
728 l2t = map_domain_page(mfn);
729 l2e = l2t[l2_table_offset(addr)];
730 mfn = l2e_get_pfn(l2e);
731 unmap_domain_page(l2t);
732 if ( !(l2e_get_flags(l2e) & required_flags) ||
733 (l2e_get_flags(l2e) & disallowed_flags) )
734 return 0;
735 if ( l2e_get_flags(l2e) & _PAGE_PSE )
736 {
737 l1e = l1e_empty(); /* define before use in debug tracing */
738 goto spurious;
739 }
741 l1t = map_domain_page(mfn);
742 l1e = l1t[l1_table_offset(addr)];
743 mfn = l1e_get_pfn(l1e);
744 unmap_domain_page(l1t);
745 if ( !(l1e_get_flags(l1e) & required_flags) ||
746 (l1e_get_flags(l1e) & disallowed_flags) )
747 return 0;
749 spurious:
750 DPRINTK("Spurious fault in domain %u:%u at addr %lx, e/c %04x\n",
751 current->domain->domain_id, current->vcpu_id,
752 addr, regs->error_code);
753 #if CONFIG_PAGING_LEVELS >= 4
754 DPRINTK(" l4e = %"PRIpte"\n", l4e_get_intpte(l4e));
755 #endif
756 #if CONFIG_PAGING_LEVELS >= 3
757 DPRINTK(" l3e = %"PRIpte"\n", l3e_get_intpte(l3e));
758 #endif
759 DPRINTK(" l2e = %"PRIpte"\n", l2e_get_intpte(l2e));
760 DPRINTK(" l1e = %"PRIpte"\n", l1e_get_intpte(l1e));
761 #ifndef NDEBUG
762 show_registers(regs);
763 #endif
764 return 1;
765 }
767 static int spurious_page_fault(
768 unsigned long addr, struct cpu_user_regs *regs)
769 {
770 struct domain *d = current->domain;
771 int is_spurious;
773 LOCK_BIGLOCK(d);
774 cleanup_writable_pagetable(d);
775 is_spurious = __spurious_page_fault(addr, regs);
776 UNLOCK_BIGLOCK(d);
778 return is_spurious;
779 }
781 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
782 {
783 struct vcpu *v = current;
784 struct domain *d = v->domain;
786 if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
787 {
788 if ( shadow_mode_external(d) && guest_mode(regs) )
789 return shadow_fault(addr, regs);
790 if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
791 return handle_gdt_ldt_mapping_fault(
792 addr - GDT_LDT_VIRT_START, regs);
793 /*
794 * Do not propagate spurious faults in the hypervisor area to the
795 * guest. It cannot fix them up.
796 */
797 return (spurious_page_fault(addr, regs) ? EXCRET_not_a_fault : 0);
798 }
800 if ( unlikely(shadow_mode_enabled(d)) )
801 return shadow_fault(addr, regs);
803 if ( likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
804 {
805 LOCK_BIGLOCK(d);
806 if ( unlikely(d->arch.ptwr[PTWR_PT_ACTIVE].l1va) &&
807 unlikely(l2_linear_offset(addr) ==
808 d->arch.ptwr[PTWR_PT_ACTIVE].l2_idx) )
809 {
810 ptwr_flush(d, PTWR_PT_ACTIVE);
811 UNLOCK_BIGLOCK(d);
812 return EXCRET_fault_fixed;
813 }
815 /*
816 * Note it is *not* safe to check PGERR_page_present here. It can be
817 * clear, due to unhooked page table, when we would otherwise expect
818 * it to be set. We have an aversion to trusting that flag in Xen, and
819 * guests ought to be leery too.
820 */
821 if ( guest_kernel_mode(v, regs) &&
822 (regs->error_code & PGERR_write_access) &&
823 ptwr_do_page_fault(d, addr, regs) )
824 {
825 UNLOCK_BIGLOCK(d);
826 return EXCRET_fault_fixed;
827 }
828 UNLOCK_BIGLOCK(d);
829 }
831 return 0;
832 }
834 /*
835 * #PF error code:
836 * Bit 0: Protection violation (=1) ; Page not present (=0)
837 * Bit 1: Write access
838 * Bit 2: User mode (=1) ; Supervisor mode (=0)
839 * Bit 3: Reserved bit violation
840 * Bit 4: Instruction fetch
841 */
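/*
 * Worked example: error_code 0x003 = bits 0+1, i.e. a supervisor-mode write
 * that hit a present but non-writable mapping; error_code 0x004 = bit 2
 * only, i.e. a user-mode read of a not-present page.
 */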
842 asmlinkage int do_page_fault(struct cpu_user_regs *regs)
843 {
844 unsigned long addr, fixup;
845 int rc;
847 ASSERT(!in_irq());
849 __asm__ __volatile__ ("mov %%cr2,%0" : "=r" (addr) : );
851 DEBUGGER_trap_entry(TRAP_page_fault, regs);
853 perfc_incrc(page_faults);
855 if ( unlikely((rc = fixup_page_fault(addr, regs)) != 0) )
856 return rc;
858 if ( unlikely(!guest_mode(regs)) )
859 {
860 if ( spurious_page_fault(addr, regs) )
861 return EXCRET_not_a_fault;
863 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
864 {
865 perfc_incrc(copy_user_faults);
866 regs->eip = fixup;
867 return 0;
868 }
870 DEBUGGER_trap_fatal(TRAP_page_fault, regs);
872 show_execution_state(regs);
873 show_page_walk(addr);
874 panic("CPU%d FATAL PAGE FAULT\n"
875 "[error_code=%04x]\n"
876 "Faulting linear address: %p\n",
877 smp_processor_id(), regs->error_code, _p(addr));
878 }
880 propagate_page_fault(addr, regs->error_code);
881 return 0;
882 }
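/*
 * do_fpu_taskswitch() below backs the HYPERVISOR_fpu_taskswitch hypercall:
 * a PV guest kernel calls it instead of writing CR0.TS itself, e.g.
 * (illustrative) HYPERVISOR_fpu_taskswitch(1) at context-switch time to
 * request a #NM trap on the guest's next FPU use.
 */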
884 long do_fpu_taskswitch(int set)
885 {
886 struct vcpu *v = current;
888 if ( set )
889 {
890 v->arch.guest_context.ctrlreg[0] |= X86_CR0_TS;
891 stts();
892 }
893 else
894 {
895 v->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
896 if ( test_bit(_VCPUF_fpu_dirtied, &v->vcpu_flags) )
897 clts();
898 }
900 return 0;
901 }
903 /* Has the guest requested sufficient permission for this I/O access? */
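/*
 * The logic mirrors hardware IOPL/TSS-bitmap semantics: a sufficient virtual
 * IOPL grants the access outright, otherwise the guest-supplied I/O bitmap
 * is consulted. Worked example: for a 2-byte access to port 0x3f8, the u16
 * read from v->arch.iobmp + (0x3f8 >> 3) = iobmp + 0x7f is tested against
 * the mask ((1<<2)-1) << (0x3f8 & 7) = 0x3; the access is allowed only if
 * both bits are clear.
 */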
904 static inline int guest_io_okay(
905 unsigned int port, unsigned int bytes,
906 struct vcpu *v, struct cpu_user_regs *regs)
907 {
908 u16 x;
909 #if defined(__x86_64__)
910 /* If in user mode, switch to kernel mode just to read I/O bitmap. */
911 int user_mode = !(v->arch.flags & TF_kernel_mode);
912 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
913 #elif defined(__i386__)
914 #define TOGGLE_MODE() ((void)0)
915 #endif
917 if ( !vm86_mode(regs) &&
918 (v->arch.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
919 return 1;
921 if ( v->arch.iobmp_limit > (port + bytes) )
922 {
923 TOGGLE_MODE();
924 __get_user(x, (u16 *)(v->arch.iobmp+(port>>3)));
925 TOGGLE_MODE();
926 if ( (x & (((1<<bytes)-1) << (port&7))) == 0 )
927 return 1;
928 }
930 return 0;
931 }
933 /* Has the administrator granted sufficient permission for this I/O access? */
934 static inline int admin_io_okay(
935 unsigned int port, unsigned int bytes,
936 struct vcpu *v, struct cpu_user_regs *regs)
937 {
938 return ioports_access_permitted(v->domain, port, port + bytes - 1);
939 }
941 /* Check admin limits. Silently fail the access if it is disallowed. */
942 #define inb_user(_p, _d, _r) (admin_io_okay(_p, 1, _d, _r) ? inb(_p) : ~0)
943 #define inw_user(_p, _d, _r) (admin_io_okay(_p, 2, _d, _r) ? inw(_p) : ~0)
944 #define inl_user(_p, _d, _r) (admin_io_okay(_p, 4, _d, _r) ? inl(_p) : ~0)
945 #define outb_user(_v, _p, _d, _r) \
946 (admin_io_okay(_p, 1, _d, _r) ? outb(_v, _p) : ((void)0))
947 #define outw_user(_v, _p, _d, _r) \
948 (admin_io_okay(_p, 2, _d, _r) ? outw(_v, _p) : ((void)0))
949 #define outl_user(_v, _p, _d, _r) \
950 (admin_io_okay(_p, 4, _d, _r) ? outl(_v, _p) : ((void)0))
952 /* Instruction fetch with error handling. */
953 #define insn_fetch(_type, _size, _ptr) \
954 ({ unsigned long _rc, _x; \
955 if ( (_rc = copy_from_user(&_x, (_type *)eip, sizeof(_type))) != 0 ) \
956 { \
957 propagate_page_fault(eip + sizeof(_type) - _rc, 0); \
958 return EXCRET_fault_fixed; \
959 } \
960 eip += _size; (_type)_x; })
962 static int emulate_privileged_op(struct cpu_user_regs *regs)
963 {
964 struct vcpu *v = current;
965 unsigned long *reg, eip = regs->eip, res;
966 u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0;
967 unsigned int port, i, op_bytes = 4, data, rc;
968 u32 l, h;
970 /* Legacy prefixes. */
971 for ( i = 0; i < 8; i++ )
972 {
973 switch ( opcode = insn_fetch(u8, 1, eip) )
974 {
975 case 0x66: /* operand-size override */
976 op_bytes ^= 6; /* switch between 2/4 bytes */
977 break;
978 case 0x67: /* address-size override */
979 case 0x2e: /* CS override */
980 case 0x3e: /* DS override */
981 case 0x26: /* ES override */
982 case 0x64: /* FS override */
983 case 0x65: /* GS override */
984 case 0x36: /* SS override */
985 case 0xf0: /* LOCK */
986 case 0xf2: /* REPNE/REPNZ */
987 break;
988 case 0xf3: /* REP/REPE/REPZ */
989 rep_prefix = 1;
990 break;
991 default:
992 goto done_prefixes;
993 }
994 }
995 done_prefixes:
997 #ifdef __x86_64__
998 /* REX prefix. */
999 if ( (opcode & 0xf0) == 0x40 )
1000 {
1001 modrm_reg = (opcode & 4) << 1; /* REX.R */
1002 modrm_rm = (opcode & 1) << 3; /* REX.B */
1004 /* REX.W and REX.X do not need to be decoded. */
1005 opcode = insn_fetch(u8, 1, eip);
1006 }
1007 #endif
1009 /* Input/Output String instructions. */
1010 if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
1011 {
1012 if ( rep_prefix && (regs->ecx == 0) )
1013 goto done;
1015 continue_io_string:
1016 switch ( opcode )
1017 {
1018 case 0x6c: /* INSB */
1019 op_bytes = 1;
1020 case 0x6d: /* INSW/INSL */
1021 if ( !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
1022 goto fail;
1023 switch ( op_bytes )
1024 {
1025 case 1:
1026 data = (u8)inb_user((u16)regs->edx, v, regs);
1027 break;
1028 case 2:
1029 data = (u16)inw_user((u16)regs->edx, v, regs);
1030 break;
1031 case 4:
1032 data = (u32)inl_user((u16)regs->edx, v, regs);
1033 break;
1034 }
1035 if ( (rc = copy_to_user((void *)regs->edi, &data, op_bytes)) != 0 )
1036 {
1037 propagate_page_fault(regs->edi + op_bytes - rc,
1038 PGERR_write_access);
1039 return EXCRET_fault_fixed;
1040 }
1041 regs->edi += (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes);
1042 break;
1044 case 0x6e: /* OUTSB */
1045 op_bytes = 1;
1046 case 0x6f: /* OUTSW/OUTSL */
1047 if ( !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
1048 goto fail;
1049 rc = copy_from_user(&data, (void *)regs->esi, op_bytes);
1050 if ( rc != 0 )
1051 {
1052 propagate_page_fault(regs->esi + op_bytes - rc, 0);
1053 return EXCRET_fault_fixed;
1054 }
1055 switch ( op_bytes )
1056 {
1057 case 1:
1058 outb_user((u8)data, (u16)regs->edx, v, regs);
1059 break;
1060 case 2:
1061 outw_user((u16)data, (u16)regs->edx, v, regs);
1062 break;
1063 case 4:
1064 outl_user((u32)data, (u16)regs->edx, v, regs);
1065 break;
1066 }
1067 regs->esi += (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes);
1068 break;
1069 }
1071 if ( rep_prefix && (--regs->ecx != 0) )
1072 {
1073 if ( !hypercall_preempt_check() )
1074 goto continue_io_string;
1075 eip = regs->eip;
1076 }
1078 goto done;
1079 }
1081 /* I/O Port and Interrupt Flag instructions. */
1082 switch ( opcode )
1083 {
1084 case 0xe4: /* IN imm8,%al */
1085 op_bytes = 1;
1086 case 0xe5: /* IN imm8,%eax */
1087 port = insn_fetch(u8, 1, eip);
1088 exec_in:
1089 if ( !guest_io_okay(port, op_bytes, v, regs) )
1090 goto fail;
1091 switch ( op_bytes )
1092 {
1093 case 1:
1094 regs->eax &= ~0xffUL;
1095 regs->eax |= (u8)inb_user(port, v, regs);
1096 break;
1097 case 2:
1098 regs->eax &= ~0xffffUL;
1099 regs->eax |= (u16)inw_user(port, v, regs);
1100 break;
1101 case 4:
1102 regs->eax = (u32)inl_user(port, v, regs);
1103 break;
1104 }
1105 goto done;
1107 case 0xec: /* IN %dx,%al */
1108 op_bytes = 1;
1109 case 0xed: /* IN %dx,%eax */
1110 port = (u16)regs->edx;
1111 goto exec_in;
1113 case 0xe6: /* OUT %al,imm8 */
1114 op_bytes = 1;
1115 case 0xe7: /* OUT %eax,imm8 */
1116 port = insn_fetch(u8, 1, eip);
1117 exec_out:
1118 if ( !guest_io_okay(port, op_bytes, v, regs) )
1119 goto fail;
1120 switch ( op_bytes )
1121 {
1122 case 1:
1123 outb_user((u8)regs->eax, port, v, regs);
1124 break;
1125 case 2:
1126 outw_user((u16)regs->eax, port, v, regs);
1127 break;
1128 case 4:
1129 outl_user((u32)regs->eax, port, v, regs);
1130 break;
1131 }
1132 goto done;
1134 case 0xee: /* OUT %al,%dx */
1135 op_bytes = 1;
1136 case 0xef: /* OUT %eax,%dx */
1137 port = (u16)regs->edx;
1138 goto exec_out;
1140 case 0xfa: /* CLI */
1141 case 0xfb: /* STI */
1142 if ( v->arch.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
1143 goto fail;
1144 /*
1145 * This is just too dangerous to allow, in my opinion. Consider if the
1146 * caller then tries to reenable interrupts using POPF: we can't trap
1147 * that and we'll end up with hard-to-debug lockups. Fast & loose will
1148 * do for us. :-)
1149 */
1150 /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
1151 goto done;
1153 case 0x0f: /* Two-byte opcode */
1154 break;
1156 default:
1157 goto fail;
1158 }
1160 /* Remaining instructions only emulated from guest kernel. */
1161 if ( !guest_kernel_mode(v, regs) )
1162 goto fail;
1164 /* Privileged (ring 0) instructions. */
1165 opcode = insn_fetch(u8, 1, eip);
1166 switch ( opcode )
1167 {
1168 case 0x06: /* CLTS */
1169 (void)do_fpu_taskswitch(0);
1170 break;
1172 case 0x09: /* WBINVD */
1173 /* Ignore the instruction if unprivileged. */
1174 if ( !cache_flush_permitted(v->domain) )
1175 /* Non-physdev domain attempted WBINVD; ignore for now since
1176 newer linux uses this in some start-of-day timing loops */
1177 ;
1178 else
1179 wbinvd();
1180 break;
1182 case 0x20: /* MOV CR?,<reg> */
1183 opcode = insn_fetch(u8, 1, eip);
1184 modrm_reg |= (opcode >> 3) & 7;
1185 modrm_rm |= (opcode >> 0) & 7;
1186 reg = decode_register(modrm_rm, regs, 0);
1187 switch ( modrm_reg )
1188 {
1189 case 0: /* Read CR0 */
1190 *reg = (read_cr0() & ~X86_CR0_TS) |
1191 v->arch.guest_context.ctrlreg[0];
1192 break;
1194 case 2: /* Read CR2 */
1195 *reg = v->arch.guest_context.ctrlreg[2];
1196 break;
1198 case 3: /* Read CR3 */
1199 *reg = xen_pfn_to_cr3(mfn_to_gmfn(
1200 v->domain, pagetable_get_pfn(v->arch.guest_table)));
1201 break;
1203 case 4: /* Read CR4 */
1204 /*
1205 * Guests can read CR4 to see what features Xen has enabled. We
1206 * therefore lie about PGE & PSE as they are unavailable to guests.
1207 */
1208 *reg = read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE);
1209 break;
1211 default:
1212 goto fail;
1213 }
1214 break;
1216 case 0x21: /* MOV DR?,<reg> */
1217 opcode = insn_fetch(u8, 1, eip);
1218 modrm_reg |= (opcode >> 3) & 7;
1219 modrm_rm |= (opcode >> 0) & 7;
1220 reg = decode_register(modrm_rm, regs, 0);
1221 if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
1222 goto fail;
1223 *reg = res;
1224 break;
1226 case 0x22: /* MOV <reg>,CR? */
1227 opcode = insn_fetch(u8, 1, eip);
1228 modrm_reg |= (opcode >> 3) & 7;
1229 modrm_rm |= (opcode >> 0) & 7;
1230 reg = decode_register(modrm_rm, regs, 0);
1231 switch ( modrm_reg )
1232 {
1233 case 0: /* Write CR0 */
1234 if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
1235 {
1236 DPRINTK("Attempt to change unmodifiable CR0 flags.\n");
1237 goto fail;
1238 }
1239 (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
1240 break;
1242 case 2: /* Write CR2 */
1243 v->arch.guest_context.ctrlreg[2] = *reg;
1244 v->vcpu_info->arch.cr2 = *reg;
1245 break;
1247 case 3: /* Write CR3 */
1248 LOCK_BIGLOCK(v->domain);
1249 cleanup_writable_pagetable(v->domain);
1250 (void)new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg)));
1251 UNLOCK_BIGLOCK(v->domain);
1252 break;
1254 case 4:
1255 if ( *reg != (read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE)) )
1256 {
1257 DPRINTK("Attempt to change CR4 flags.\n");
1258 goto fail;
1259 }
1260 break;
1262 default:
1263 goto fail;
1264 }
1265 break;
1267 case 0x23: /* MOV <reg>,DR? */
1268 opcode = insn_fetch(u8, 1, eip);
1269 modrm_reg |= (opcode >> 3) & 7;
1270 modrm_rm |= (opcode >> 0) & 7;
1271 reg = decode_register(modrm_rm, regs, 0);
1272 if ( do_set_debugreg(modrm_reg, *reg) != 0 )
1273 goto fail;
1274 break;
1276 case 0x30: /* WRMSR */
1277 switch ( regs->ecx )
1278 {
1279 #ifdef CONFIG_X86_64
1280 case MSR_FS_BASE:
1281 if ( wrmsr_safe(MSR_FS_BASE, regs->eax, regs->edx) )
1282 goto fail;
1283 v->arch.guest_context.fs_base =
1284 ((u64)regs->edx << 32) | regs->eax;
1285 break;
1286 case MSR_GS_BASE:
1287 if ( wrmsr_safe(MSR_GS_BASE, regs->eax, regs->edx) )
1288 goto fail;
1289 v->arch.guest_context.gs_base_kernel =
1290 ((u64)regs->edx << 32) | regs->eax;
1291 break;
1292 case MSR_SHADOW_GS_BASE:
1293 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, regs->eax, regs->edx) )
1294 goto fail;
1295 v->arch.guest_context.gs_base_user =
1296 ((u64)regs->edx << 32) | regs->eax;
1297 break;
1298 #endif
1299 default:
1300 if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
1301 (regs->eax != l) || (regs->edx != h) )
1302 DPRINTK("Domain attempted WRMSR %p from "
1303 "%08x:%08x to %08lx:%08lx.\n",
1304 _p(regs->ecx), h, l, (long)regs->edx, (long)regs->eax);
1305 break;
1306 }
1307 break;
1309 case 0x32: /* RDMSR */
1310 switch ( regs->ecx )
1311 {
1312 #ifdef CONFIG_X86_64
1313 case MSR_FS_BASE:
1314 regs->eax = v->arch.guest_context.fs_base & 0xFFFFFFFFUL;
1315 regs->edx = v->arch.guest_context.fs_base >> 32;
1316 break;
1317 case MSR_GS_BASE:
1318 regs->eax = v->arch.guest_context.gs_base_kernel & 0xFFFFFFFFUL;
1319 regs->edx = v->arch.guest_context.gs_base_kernel >> 32;
1320 break;
1321 case MSR_SHADOW_GS_BASE:
1322 regs->eax = v->arch.guest_context.gs_base_user & 0xFFFFFFFFUL;
1323 regs->edx = v->arch.guest_context.gs_base_user >> 32;
1324 break;
1325 #endif
1326 case MSR_EFER:
1327 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1328 goto fail;
1329 break;
1330 default:
1331 /* Everyone can read the MSR space. */
1332 /*DPRINTK("Domain attempted RDMSR %p.\n", _p(regs->ecx));*/
1333 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1334 goto fail;
1335 break;
1336 }
1337 break;
1339 default:
1340 goto fail;
1341 }
1343 done:
1344 regs->eip = eip;
1345 return EXCRET_fault_fixed;
1347 fail:
1348 return 0;
1349 }
1351 asmlinkage int do_general_protection(struct cpu_user_regs *regs)
1352 {
1353 struct vcpu *v = current;
1354 struct trap_bounce *tb = &v->arch.trap_bounce;
1355 struct trap_info *ti;
1356 unsigned long fixup;
1358 DEBUGGER_trap_entry(TRAP_gp_fault, regs);
1360 if ( regs->error_code & 1 )
1361 goto hardware_gp;
1363 if ( !guest_mode(regs) )
1364 goto gp_in_kernel;
1366 /*
1367 * Cunning trick to allow arbitrary "INT n" handling.
1369 * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
1370 * instruction from trapping to the appropriate vector, when that might not
1371 * be expected by Xen or the guest OS. For example, that entry might be for
1372 * a fault handler (unlike traps, faults don't increment EIP), or might
1373 * expect an error code on the stack (which a software trap never
1374 * provides), or might be a hardware interrupt handler that doesn't like
1375 * being called spuriously.
1377 * Instead, a GPF occurs with the faulting IDT vector in the error code.
1378 * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
1379 * clear to indicate that it's a software fault, not hardware.
1381 * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
1382 * okay because they can only be triggered by an explicit DPL-checked
1383 * instruction. The DPL specified by the guest OS for these vectors is NOT
1384 * CHECKED!!
1385 */
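/*
 * Worked example: a guest executes "int $0x30" while the corresponding IDT
 * entry has DPL 0. The CPU delivers #GP with error code
 * (0x30 << 3) | 2 = 0x182; the test below sees bits 1:0 == 2 (IDT-sourced,
 * software) and recovers the vector as error_code >> 3 before checking the
 * guest-specified DPL to decide whether to reflect the software interrupt.
 */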
1386 if ( (regs->error_code & 3) == 2 )
1387 {
1388 /* This fault must be due to <INT n> instruction. */
1389 ti = &current->arch.guest_context.trap_ctxt[regs->error_code>>3];
1390 if ( permit_softint(TI_GET_DPL(ti), v, regs) )
1391 {
1392 tb->flags = TBF_EXCEPTION;
1393 regs->eip += 2;
1394 goto finish_propagation;
1395 }
1396 }
1398 /* Emulate some simple privileged and I/O instructions. */
1399 if ( (regs->error_code == 0) &&
1400 emulate_privileged_op(regs) )
1401 return 0;
1403 #if defined(__i386__)
1404 if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
1405 (regs->error_code == 0) &&
1406 gpf_emulate_4gb(regs) )
1407 return 0;
1408 #endif
1410 /* Pass on GPF as is. */
1411 ti = &current->arch.guest_context.trap_ctxt[TRAP_gp_fault];
1412 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
1413 tb->error_code = regs->error_code;
1414 finish_propagation:
1415 tb->cs = ti->cs;
1416 tb->eip = ti->address;
1417 if ( TI_GET_IF(ti) )
1418 tb->flags |= TBF_INTERRUPT;
1419 return 0;
1421 gp_in_kernel:
1423 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
1424 {
1425 DPRINTK("GPF (%04x): %p -> %p\n",
1426 regs->error_code, _p(regs->eip), _p(fixup));
1427 regs->eip = fixup;
1428 return 0;
1429 }
1431 DEBUGGER_trap_fatal(TRAP_gp_fault, regs);
1433 hardware_gp:
1434 show_execution_state(regs);
1435 panic("CPU%d GENERAL PROTECTION FAULT\n[error_code=%04x]\n",
1436 smp_processor_id(), regs->error_code);
1437 return 0;
1438 }
1440 static void nmi_softirq(void)
1441 {
1442 /* Only used to defer wakeup of dom0,vcpu0 to a safe (non-NMI) context. */
1443 vcpu_kick(dom0->vcpu[0]);
1444 }
1446 static void nmi_dom0_report(unsigned int reason_idx)
1447 {
1448 struct domain *d;
1449 struct vcpu *v;
1451 if ( ((d = dom0) == NULL) || ((v = d->vcpu[0]) == NULL) )
1452 return;
1454 set_bit(reason_idx, &d->shared_info->arch.nmi_reason);
1456 if ( test_and_set_bit(_VCPUF_nmi_pending, &v->vcpu_flags) )
1457 raise_softirq(NMI_SOFTIRQ); /* not safe to wake up a vcpu here */
1458 }
1460 asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
1461 {
1462 switch ( opt_nmi[0] )
1463 {
1464 case 'd': /* 'dom0' */
1465 nmi_dom0_report(_XEN_NMIREASON_parity_error);
1466 case 'i': /* 'ignore' */
1467 break;
1468 default: /* 'fatal' */
1469 console_force_unlock();
1470 printk("\n\nNMI - MEMORY ERROR\n");
1471 fatal_trap(TRAP_nmi, regs);
1472 }
1474 outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable parity check */
1475 mdelay(1);
1476 outb((inb(0x61) & 0x0b) | 0x00, 0x61); /* enable parity check */
1477 }
1479 asmlinkage void io_check_error(struct cpu_user_regs *regs)
1480 {
1481 switch ( opt_nmi[0] )
1482 {
1483 case 'd': /* 'dom0' */
1484 nmi_dom0_report(_XEN_NMIREASON_io_error);
1485 case 'i': /* 'ignore' */
1486 break;
1487 default: /* 'fatal' */
1488 console_force_unlock();
1489 printk("\n\nNMI - I/O ERROR\n");
1490 fatal_trap(TRAP_nmi, regs);
1491 }
1493 outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
1494 mdelay(1);
1495 outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
1496 }
1498 static void unknown_nmi_error(unsigned char reason)
1499 {
1500 switch ( opt_nmi[0] )
1501 {
1502 case 'd': /* 'dom0' */
1503 nmi_dom0_report(_XEN_NMIREASON_unknown);
1504 case 'i': /* 'ignore' */
1505 break;
1506 default: /* 'fatal' */
1507 printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
1508 printk("Dazed and confused, but trying to continue\n");
1509 printk("Do you have a strange power saving mode enabled?\n");
1510 }
1511 }
1513 static int dummy_nmi_callback(struct cpu_user_regs *regs, int cpu)
1514 {
1515 return 0;
1516 }
1518 static nmi_callback_t nmi_callback = dummy_nmi_callback;
1520 asmlinkage void do_nmi(struct cpu_user_regs *regs)
1521 {
1522 unsigned int cpu = smp_processor_id();
1523 unsigned char reason;
1525 ++nmi_count(cpu);
1527 if ( nmi_callback(regs, cpu) )
1528 return;
1530 if ( nmi_watchdog )
1531 nmi_watchdog_tick(regs);
1533 /* Only the BSP gets external NMIs from the system. */
1534 if ( cpu == 0 )
1535 {
1536 reason = inb(0x61);
1537 if ( reason & 0x80 )
1538 mem_parity_error(regs);
1539 else if ( reason & 0x40 )
1540 io_check_error(regs);
1541 else if ( !nmi_watchdog )
1542 unknown_nmi_error((unsigned char)(reason&0xff));
1543 }
1544 }
1546 void set_nmi_callback(nmi_callback_t callback)
1547 {
1548 nmi_callback = callback;
1549 }
1551 void unset_nmi_callback(void)
1552 {
1553 nmi_callback = dummy_nmi_callback;
1554 }
1556 asmlinkage int math_state_restore(struct cpu_user_regs *regs)
1557 {
1558 struct trap_bounce *tb;
1559 struct trap_info *ti;
1561 setup_fpu(current);
1563 if ( current->arch.guest_context.ctrlreg[0] & X86_CR0_TS )
1564 {
1565 tb = &current->arch.trap_bounce;
1566 ti = &current->arch.guest_context.trap_ctxt[TRAP_no_device];
1568 tb->flags = TBF_EXCEPTION;
1569 tb->cs = ti->cs;
1570 tb->eip = ti->address;
1571 if ( TI_GET_IF(ti) )
1572 tb->flags |= TBF_INTERRUPT;
1574 current->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
1575 }
1577 return EXCRET_fault_fixed;
1578 }
1580 asmlinkage int do_debug(struct cpu_user_regs *regs)
1581 {
1582 unsigned long condition;
1583 struct vcpu *v = current;
1584 struct trap_bounce *tb = &v->arch.trap_bounce;
1585 struct trap_info *ti;
1587 __asm__ __volatile__("mov %%db6,%0" : "=r" (condition));
1589 /* Mask out spurious debug traps due to lazy DR7 setting */
1590 if ( (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) &&
1591 (v->arch.guest_context.debugreg[7] == 0) )
1592 {
1593 __asm__("mov %0,%%db7" : : "r" (0UL));
1594 goto out;
1595 }
1597 DEBUGGER_trap_entry(TRAP_debug, regs);
1599 if ( !guest_mode(regs) )
1600 {
1601 /* Clear TF just for absolute sanity. */
1602 regs->eflags &= ~EF_TF;
1603 /*
1604 * We ignore watchpoints when they trigger within Xen. This may happen
1605 * when a buffer is passed to us which previously had a watchpoint set
1606 * on it. No need to bump EIP; the only faulting trap is an instruction
1607 * breakpoint, which can't happen to us.
1608 */
1609 goto out;
1610 }
1612 /* Save debug status register where guest OS can peek at it */
1613 v->arch.guest_context.debugreg[6] = condition;
1615 ti = &v->arch.guest_context.trap_ctxt[TRAP_debug];
1616 tb->flags = TBF_EXCEPTION;
1617 tb->cs = ti->cs;
1618 tb->eip = ti->address;
1619 if ( TI_GET_IF(ti) )
1620 tb->flags |= TBF_INTERRUPT;
1622 out:
1623 return EXCRET_not_a_fault;
1624 }
1626 asmlinkage int do_spurious_interrupt_bug(struct cpu_user_regs *regs)
1627 {
1628 return EXCRET_not_a_fault;
1629 }
1631 void set_intr_gate(unsigned int n, void *addr)
1632 {
1633 #ifdef __i386__
1634 int i;
1635 /* Keep secondary tables in sync with IRQ updates. */
1636 for ( i = 1; i < NR_CPUS; i++ )
1637 if ( idt_tables[i] != NULL )
1638 _set_gate(&idt_tables[i][n], 14, 0, addr);
1639 #endif
1640 _set_gate(&idt_table[n], 14, 0, addr);
1641 }
1643 void set_system_gate(unsigned int n, void *addr)
1644 {
1645 _set_gate(idt_table+n,14,3,addr);
1646 }
1648 void set_task_gate(unsigned int n, unsigned int sel)
1649 {
1650 idt_table[n].a = sel << 16;
1651 idt_table[n].b = 0x8500;
1652 }
1654 void set_tss_desc(unsigned int n, void *addr)
1655 {
1656 _set_tssldt_desc(
1657 gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
1658 (unsigned long)addr,
1659 offsetof(struct tss_struct, __cacheline_filler) - 1,
1660 9);
1661 }
1663 void __init trap_init(void)
1664 {
1665 extern void percpu_traps_init(void);
1667 /*
1668 * Note that interrupt gates are always used, rather than trap gates. We
1669 * must have interrupts disabled until DS/ES/FS/GS are saved because the
1670 * first activation must have the "bad" value(s) for these registers and
1671 * we may lose them if another activation is installed before they are
1672 * saved. The page-fault handler also needs interrupts disabled until %cr2
1673 * has been read and saved on the stack.
1674 */
1675 set_intr_gate(TRAP_divide_error,&divide_error);
1676 set_intr_gate(TRAP_debug,&debug);
1677 set_intr_gate(TRAP_nmi,&nmi);
1678 set_system_gate(TRAP_int3,&int3); /* usable from all privileges */
1679 set_system_gate(TRAP_overflow,&overflow); /* usable from all privileges */
1680 set_intr_gate(TRAP_bounds,&bounds);
1681 set_intr_gate(TRAP_invalid_op,&invalid_op);
1682 set_intr_gate(TRAP_no_device,&device_not_available);
1683 set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
1684 set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
1685 set_intr_gate(TRAP_no_segment,&segment_not_present);
1686 set_intr_gate(TRAP_stack_error,&stack_segment);
1687 set_intr_gate(TRAP_gp_fault,&general_protection);
1688 set_intr_gate(TRAP_page_fault,&page_fault);
1689 set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
1690 set_intr_gate(TRAP_copro_error,&coprocessor_error);
1691 set_intr_gate(TRAP_alignment_check,&alignment_check);
1692 set_intr_gate(TRAP_machine_check,&machine_check);
1693 set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
1695 percpu_traps_init();
1697 cpu_init();
1699 open_softirq(NMI_SOFTIRQ, nmi_softirq);
1700 }
1703 long do_set_trap_table(XEN_GUEST_HANDLE(trap_info_t) traps)
1704 {
1705 struct trap_info cur;
1706 struct trap_info *dst = current->arch.guest_context.trap_ctxt;
1707 long rc = 0;
1709 /* If no table is presented then clear the entire virtual IDT. */
1710 if ( guest_handle_is_null(traps) )
1711 {
1712 memset(dst, 0, 256 * sizeof(*dst));
1713 init_int80_direct_trap(current);
1714 return 0;
1715 }
1717 for ( ; ; )
1718 {
1719 if ( hypercall_preempt_check() )
1720 {
1721 rc = hypercall_create_continuation(
1722 __HYPERVISOR_set_trap_table, "h", traps);
1723 break;
1724 }
1726 if ( copy_from_guest(&cur, traps, 1) )
1727 {
1728 rc = -EFAULT;
1729 break;
1730 }
1732 if ( cur.address == 0 )
1733 break;
1735 fixup_guest_code_selector(cur.cs);
1737 memcpy(&dst[cur.vector], &cur, sizeof(cur));
1739 if ( cur.vector == 0x80 )
1740 init_int80_direct_trap(current);
1742 guest_handle_add_offset(traps, 1);
1743 }
1745 return rc;
1746 }
1749 long set_debugreg(struct vcpu *p, int reg, unsigned long value)
1750 {
1751 int i;
1753 switch ( reg )
1754 {
1755 case 0:
1756 if ( !access_ok(value, sizeof(long)) )
1757 return -EPERM;
1758 if ( p == current )
1759 __asm__ ( "mov %0, %%db0" : : "r" (value) );
1760 break;
1761 case 1:
1762 if ( !access_ok(value, sizeof(long)) )
1763 return -EPERM;
1764 if ( p == current )
1765 __asm__ ( "mov %0, %%db1" : : "r" (value) );
1766 break;
1767 case 2:
1768 if ( !access_ok(value, sizeof(long)) )
1769 return -EPERM;
1770 if ( p == current )
1771 __asm__ ( "mov %0, %%db2" : : "r" (value) );
1772 break;
1773 case 3:
1774 if ( !access_ok(value, sizeof(long)) )
1775 return -EPERM;
1776 if ( p == current )
1777 __asm__ ( "mov %0, %%db3" : : "r" (value) );
1778 break;
1779 case 6:
1780 /*
1781 * DR6: Bits 4-11,16-31 reserved (set to 1).
1782 * Bit 12 reserved (set to 0).
1783 */
1784 value &= 0xffffefff; /* reserved bits => 0 */
1785 value |= 0xffff0ff0; /* reserved bits => 1 */
1786 if ( p == current )
1787 __asm__ ( "mov %0, %%db6" : : "r" (value) );
1788 break;
1789 case 7:
1790 /*
1791 * DR7: Bit 10 reserved (set to 1).
1792 * Bits 11-12,14-15 reserved (set to 0).
1793 * Privileged bits:
1794 * GD (bit 13): must be 0.
1795 * R/Wn (bits 16-17,20-21,24-25,28-29): mustn't be 10.
1796 * LENn (bits 18-19,22-23,26-27,30-31): mustn't be 10.
1797 */
1798 /* DR7 == 0 => debugging disabled for this domain. */
1799 if ( value != 0 )
1800 {
1801 value &= 0xffff27ff; /* reserved bits => 0 */
1802 value |= 0x00000400; /* reserved bits => 1 */
1803 if ( (value & (1<<13)) != 0 ) return -EPERM;
1804 for ( i = 0; i < 16; i += 2 )
1805 if ( ((value >> (i+16)) & 3) == 2 ) return -EPERM;
1806 }
1807 if ( p == current )
1808 __asm__ ( "mov %0, %%db7" : : "r" (value) );
1809 break;
1810 default:
1811 return -EINVAL;
1812 }
1814 p->arch.guest_context.debugreg[reg] = value;
1815 return 0;
1816 }
1818 long do_set_debugreg(int reg, unsigned long value)
1819 {
1820 return set_debugreg(current, reg, value);
1821 }
1823 unsigned long do_get_debugreg(int reg)
1824 {
1825 if ( (reg < 0) || (reg > 7) ) return -EINVAL;
1826 return current->arch.guest_context.debugreg[reg];
1827 }
1829 /*
1830 * Local variables:
1831 * mode: C
1832 * c-set-style: "BSD"
1833 * c-basic-offset: 4
1834 * tab-width: 4
1835 * indent-tabs-mode: nil
1836 * End:
1837 */