
xen/arch/x86/traps.c @ 10661:8e55c5c11475

[XEN] Add CPUID hypervisor-info leaves at index 0x40000000.
Currently only a signature leaf is defined ("Xen\0").
Signed-off-by: Keir Fraser <keir@xensource.com>
author: kfraser@localhost.localdomain
date: Wed Jul 05 18:48:41 2006 +0100 (2006-07-05)
parents: 462d6e4cb29a
children: af9809f51f81
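
For context, here is a minimal guest-side sketch (illustrative only, not part of this changeset) of how a paravirtualised guest could read the new signature leaf. The 0x0f,0x0b,"xen" byte sequence is the forced-emulation prefix that emulate_forced_invalid_op() below recognises, so the CPUID result is filtered by Xen even where a plain CPUID would not trap. The xen_cpuid() helper name is an assumption for the example; under any other hypervisor or on bare metal the ud2 simply raises an invalid-opcode exception.

    /* Hypothetical PV-guest probe of the 0x40000000 signature leaf. */
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static void xen_cpuid(uint32_t idx, uint32_t *eax, uint32_t *ebx,
                          uint32_t *ecx, uint32_t *edx)
    {
        /* 0x0f,0x0b = ud2; 0x78,0x65,0x6e = "xen"; then the real cpuid. */
        asm volatile ( ".byte 0x0f,0x0b,0x78,0x65,0x6e; cpuid"
                       : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
                       : "0" (idx) );
    }

    int main(void)
    {
        uint32_t eax, ebx, ecx, edx;
        char sig[5] = "";

        xen_cpuid(0x40000000, &eax, &ebx, &ecx, &edx);
        memcpy(sig, &ebx, sizeof(ebx));      /* EBX carries "Xen\0" */
        printf("max hypervisor leaf %#x, signature \"%s\"\n",
               (unsigned int)eax, sig);
        return 0;
    }
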
line source
1 /******************************************************************************
2 * arch/x86/traps.c
3 *
4 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * Copyright (C) 1991, 1992 Linus Torvalds
23 *
24 * Pentium III FXSR, SSE support
25 * Gareth Hughes <gareth@valinux.com>, May 2000
26 */
28 #include <xen/config.h>
29 #include <xen/init.h>
30 #include <xen/sched.h>
31 #include <xen/lib.h>
32 #include <xen/errno.h>
33 #include <xen/mm.h>
34 #include <xen/console.h>
35 #include <xen/reboot.h>
36 #include <asm/regs.h>
37 #include <xen/delay.h>
38 #include <xen/event.h>
39 #include <xen/spinlock.h>
40 #include <xen/irq.h>
41 #include <xen/perfc.h>
42 #include <xen/softirq.h>
43 #include <xen/domain_page.h>
44 #include <xen/symbols.h>
45 #include <xen/iocap.h>
46 #include <xen/nmi.h>
47 #include <asm/shadow.h>
48 #include <asm/system.h>
49 #include <asm/io.h>
50 #include <asm/atomic.h>
51 #include <asm/desc.h>
52 #include <asm/debugreg.h>
53 #include <asm/smp.h>
54 #include <asm/flushtlb.h>
55 #include <asm/uaccess.h>
56 #include <asm/i387.h>
57 #include <asm/debugger.h>
58 #include <asm/msr.h>
59 #include <asm/x86_emulate.h>
61 /*
62 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
63 * fatal: Xen prints diagnostic message and then hangs.
64 * dom0: The NMI is virtualised to DOM0.
65 * ignore: The NMI error is cleared and ignored.
66 */
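/* The option is chosen on the hypervisor command line, e.g. "nmi=ignore",
   via the string_param("nmi", opt_nmi) registration below. */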
67 #ifdef NDEBUG
68 char opt_nmi[10] = "dom0";
69 #else
70 char opt_nmi[10] = "fatal";
71 #endif
72 string_param("nmi", opt_nmi);
74 /* Master table, used by all CPUs on x86/64, and by CPU0 on x86/32.*/
75 idt_entry_t idt_table[IDT_ENTRIES];
77 #define DECLARE_TRAP_HANDLER(_name) \
78 asmlinkage void _name(void); \
79 asmlinkage int do_ ## _name(struct cpu_user_regs *regs)
81 asmlinkage void nmi(void);
82 DECLARE_TRAP_HANDLER(divide_error);
83 DECLARE_TRAP_HANDLER(debug);
84 DECLARE_TRAP_HANDLER(int3);
85 DECLARE_TRAP_HANDLER(overflow);
86 DECLARE_TRAP_HANDLER(bounds);
87 DECLARE_TRAP_HANDLER(invalid_op);
88 DECLARE_TRAP_HANDLER(device_not_available);
89 DECLARE_TRAP_HANDLER(coprocessor_segment_overrun);
90 DECLARE_TRAP_HANDLER(invalid_TSS);
91 DECLARE_TRAP_HANDLER(segment_not_present);
92 DECLARE_TRAP_HANDLER(stack_segment);
93 DECLARE_TRAP_HANDLER(general_protection);
94 DECLARE_TRAP_HANDLER(page_fault);
95 DECLARE_TRAP_HANDLER(coprocessor_error);
96 DECLARE_TRAP_HANDLER(simd_coprocessor_error);
97 DECLARE_TRAP_HANDLER(alignment_check);
98 DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
99 DECLARE_TRAP_HANDLER(machine_check);
101 long do_set_debugreg(int reg, unsigned long value);
102 unsigned long do_get_debugreg(int reg);
104 static int debug_stack_lines = 20;
105 integer_param("debug_stack_lines", debug_stack_lines);
107 #ifdef CONFIG_X86_32
108 #define stack_words_per_line 8
109 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
110 #else
111 #define stack_words_per_line 4
112 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
113 #endif
115 int is_kernel_text(unsigned long addr)
116 {
117 extern char _stext, _etext;
118 if (addr >= (unsigned long) &_stext &&
119 addr <= (unsigned long) &_etext)
120 return 1;
121 return 0;
123 }
125 unsigned long kernel_text_end(void)
126 {
127 extern char _etext;
128 return (unsigned long) &_etext;
129 }
131 static void show_guest_stack(struct cpu_user_regs *regs)
132 {
133 int i;
134 unsigned long *stack, addr;
136 if ( hvm_guest(current) )
137 return;
139 if ( vm86_mode(regs) )
140 {
141 stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff));
142 printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ",
143 regs->ss, (uint16_t)(regs->esp & 0xffff));
144 }
145 else
146 {
147 stack = (unsigned long *)regs->esp;
148 printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
149 }
151 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
152 {
153 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
154 break;
155 if ( get_user(addr, stack) )
156 {
157 if ( i != 0 )
158 printk("\n ");
159 printk("Fault while accessing guest memory.");
160 i = 1;
161 break;
162 }
163 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
164 printk("\n ");
165 printk(" %p", _p(addr));
166 stack++;
167 }
168 if ( i == 0 )
169 printk("Stack empty.");
170 printk("\n");
171 }
173 #ifdef NDEBUG
175 static void show_trace(struct cpu_user_regs *regs)
176 {
177 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
179 printk("Xen call trace:\n ");
181 printk("[<%p>]", _p(regs->eip));
182 print_symbol(" %s\n ", regs->eip);
184 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
185 {
186 addr = *stack++;
187 if ( is_kernel_text(addr) )
188 {
189 printk("[<%p>]", _p(addr));
190 print_symbol(" %s\n ", addr);
191 }
192 }
194 printk("\n");
195 }
197 #else
199 static void show_trace(struct cpu_user_regs *regs)
200 {
201 unsigned long *frame, next, addr, low, high;
203 printk("Xen call trace:\n ");
205 printk("[<%p>]", _p(regs->eip));
206 print_symbol(" %s\n ", regs->eip);
208 /* Bounds for range of valid frame pointer. */
209 low = (unsigned long)(ESP_BEFORE_EXCEPTION(regs) - 2);
210 high = (low & ~(STACK_SIZE - 1)) +
211 (STACK_SIZE - sizeof(struct cpu_info) - 2*sizeof(unsigned long));
213 /* The initial frame pointer. */
214 next = regs->ebp;
216 for ( ; ; )
217 {
218 /* Valid frame pointer? */
219 if ( (next < low) || (next >= high) )
220 {
221 /*
222 * Exception stack frames have a different layout, denoted by an
223 * inverted frame pointer.
224 */
225 next = ~next;
226 if ( (next < low) || (next >= high) )
227 break;
228 frame = (unsigned long *)next;
229 next = frame[0];
230 addr = frame[(offsetof(struct cpu_user_regs, eip) -
231 offsetof(struct cpu_user_regs, ebp))
232 / BYTES_PER_LONG];
233 }
234 else
235 {
236 /* Ordinary stack frame. */
237 frame = (unsigned long *)next;
238 next = frame[0];
239 addr = frame[1];
240 }
242 printk("[<%p>]", _p(addr));
243 print_symbol(" %s\n ", addr);
245 low = (unsigned long)&frame[2];
246 }
248 printk("\n");
249 }
251 #endif
253 void show_stack(struct cpu_user_regs *regs)
254 {
255 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
256 int i;
258 if ( guest_mode(regs) )
259 return show_guest_stack(regs);
261 printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
263 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
264 {
265 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
266 break;
267 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
268 printk("\n ");
269 addr = *stack++;
270 printk(" %p", _p(addr));
271 }
272 if ( i == 0 )
273 printk("Stack empty.");
274 printk("\n");
276 show_trace(regs);
277 }
279 void show_stack_overflow(unsigned long esp)
280 {
281 #ifdef MEMORY_GUARD
282 unsigned long esp_top = get_stack_bottom() & PAGE_MASK;
283 unsigned long *stack, addr;
285 /* Trigger overflow trace if %esp is within 100 bytes of the guard page. */
286 if ( ((esp - esp_top) > 100) && ((esp_top - esp) > 100) )
287 return;
289 if ( esp < esp_top )
290 esp = esp_top;
292 printk("Xen stack overflow:\n ");
294 stack = (unsigned long *)esp;
295 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
296 {
297 addr = *stack++;
298 if ( is_kernel_text(addr) )
299 {
300 printk("%p: [<%p>]", stack, _p(addr));
301 print_symbol(" %s\n ", addr);
302 }
303 }
305 printk("\n");
306 #endif
307 }
309 void show_execution_state(struct cpu_user_regs *regs)
310 {
311 show_registers(regs);
312 show_stack(regs);
313 }
315 /*
316 * This is called for faults at very unexpected times (e.g., when interrupts
317 * are disabled). In such situations we can't do much that is safe. We try to
318 * print out some tracing and then we just spin.
319 */
320 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs)
321 {
322 int cpu = smp_processor_id();
323 unsigned long cr2;
324 static char *trapstr[] = {
325 "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
326 "invalid opcode", "device not available", "double fault",
327 "coprocessor segment", "invalid tss", "segment not found",
328 "stack error", "general protection fault", "page fault",
329 "spurious interrupt", "coprocessor error", "alignment check",
330 "machine check", "simd error"
331 };
333 watchdog_disable();
334 console_start_sync();
336 show_execution_state(regs);
338 if ( trapnr == TRAP_page_fault )
339 {
340 __asm__ __volatile__ ("mov %%cr2,%0" : "=r" (cr2) : );
341 printk("Faulting linear address: %p\n", _p(cr2));
342 show_page_walk(cr2);
343 }
345 printk("************************************\n");
346 printk("CPU%d FATAL TRAP %d (%s), ERROR_CODE %04x%s.\n",
347 cpu, trapnr, trapstr[trapnr], regs->error_code,
348 (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
349 printk("System shutting down -- need manual reset.\n");
350 printk("************************************\n");
352 (void)debugger_trap_fatal(trapnr, regs);
354 /* Lock up the console to prevent spurious output from other CPUs. */
355 console_force_lock();
357 /* Wait for manual reset. */
358 machine_halt();
359 }
361 static inline int do_trap(int trapnr, char *str,
362 struct cpu_user_regs *regs,
363 int use_error_code)
364 {
365 struct vcpu *v = current;
366 struct trap_bounce *tb = &v->arch.trap_bounce;
367 struct trap_info *ti;
368 unsigned long fixup;
370 DEBUGGER_trap_entry(trapnr, regs);
372 if ( !guest_mode(regs) )
373 goto xen_fault;
375 ti = &current->arch.guest_context.trap_ctxt[trapnr];
376 tb->flags = TBF_EXCEPTION;
377 tb->cs = ti->cs;
378 tb->eip = ti->address;
379 if ( use_error_code )
380 {
381 tb->flags |= TBF_EXCEPTION_ERRCODE;
382 tb->error_code = regs->error_code;
383 }
384 if ( TI_GET_IF(ti) )
385 tb->flags |= TBF_INTERRUPT;
386 return 0;
388 xen_fault:
390 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
391 {
392 DPRINTK("Trap %d: %p -> %p\n", trapnr, _p(regs->eip), _p(fixup));
393 regs->eip = fixup;
394 return 0;
395 }
397 DEBUGGER_trap_fatal(trapnr, regs);
399 show_execution_state(regs);
400 panic("CPU%d FATAL TRAP: vector = %d (%s)\n"
401 "[error_code=%04x]\n",
402 smp_processor_id(), trapnr, str, regs->error_code);
403 return 0;
404 }
406 #define DO_ERROR_NOCODE(trapnr, str, name) \
407 asmlinkage int do_##name(struct cpu_user_regs *regs) \
408 { \
409 return do_trap(trapnr, str, regs, 0); \
410 }
412 #define DO_ERROR(trapnr, str, name) \
413 asmlinkage int do_##name(struct cpu_user_regs *regs) \
414 { \
415 return do_trap(trapnr, str, regs, 1); \
416 }
418 DO_ERROR_NOCODE( 0, "divide error", divide_error)
419 DO_ERROR_NOCODE( 4, "overflow", overflow)
420 DO_ERROR_NOCODE( 5, "bounds", bounds)
421 DO_ERROR_NOCODE( 9, "coprocessor segment overrun", coprocessor_segment_overrun)
422 DO_ERROR(10, "invalid TSS", invalid_TSS)
423 DO_ERROR(11, "segment not present", segment_not_present)
424 DO_ERROR(12, "stack segment", stack_segment)
425 DO_ERROR_NOCODE(16, "fpu error", coprocessor_error)
426 DO_ERROR(17, "alignment check", alignment_check)
427 DO_ERROR_NOCODE(19, "simd error", simd_coprocessor_error)
429 int cpuid_hypervisor_leaves(
430 uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
431 {
432 if ( (idx < 0x40000000) || (idx > 0x40000000) )
433 return 0;
435 switch ( idx - 0x40000000 )
436 {
437 case 0:
438 *eax = 0x40000000;
439 *ebx = 0x006e6558; /* "Xen\0" */
440 *ecx = *edx = 0;
441 break;
443 default:
444 BUG();
445 }
447 return 1;
448 }
450 static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
451 {
452 char signature[5], instr[2];
453 uint32_t a, b, c, d;
454 unsigned long eip;
456 a = regs->eax;
457 b = regs->ebx;
458 c = regs->ecx;
459 d = regs->edx;
460 eip = regs->eip;
462 /* Check for forced emulation signature: ud2 ; .ascii "xen". */
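/* That is the five bytes 0x0f,0x0b (ud2) followed by 'x','e','n'; the cpuid
   opcode itself is 0x0f,0xa2, checked just below. */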
463 if ( copy_from_user(signature, (char *)eip, sizeof(signature)) ||
464 memcmp(signature, "\xf\xbxen", sizeof(signature)) )
465 return 0;
466 eip += sizeof(signature);
468 /* We only emulate CPUID. */
469 if ( copy_from_user(instr, (char *)eip, sizeof(instr)) ||
470 memcmp(instr, "\xf\xa2", sizeof(instr)) )
471 return 0;
472 eip += sizeof(instr);
474 __asm__ (
475 "cpuid"
476 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
477 : "0" (a), "1" (b), "2" (c), "3" (d) );
479 if ( regs->eax == 1 )
480 {
481 /* Modify Feature Information. */
482 clear_bit(X86_FEATURE_VME, &d);
483 clear_bit(X86_FEATURE_DE, &d);
484 clear_bit(X86_FEATURE_PSE, &d);
485 clear_bit(X86_FEATURE_PGE, &d);
486 if ( !supervisor_mode_kernel )
487 clear_bit(X86_FEATURE_SEP, &d);
488 if ( !IS_PRIV(current->domain) )
489 clear_bit(X86_FEATURE_MTRR, &d);
490 }
491 else
492 {
493 (void)cpuid_hypervisor_leaves(regs->eax, &a, &b, &c, &d);
494 }
496 regs->eax = a;
497 regs->ebx = b;
498 regs->ecx = c;
499 regs->edx = d;
500 regs->eip = eip;
502 return EXCRET_fault_fixed;
503 }
505 asmlinkage int do_invalid_op(struct cpu_user_regs *regs)
506 {
507 struct vcpu *v = current;
508 struct trap_bounce *tb = &v->arch.trap_bounce;
509 struct trap_info *ti;
510 int rc;
512 DEBUGGER_trap_entry(TRAP_invalid_op, regs);
514 if ( unlikely(!guest_mode(regs)) )
515 {
516 char sig[5];
517 /* Signature (ud2; .ascii "dbg") indicates dump state and continue. */
518 if ( (__copy_from_user(sig, (char *)regs->eip, sizeof(sig)) == 0) &&
519 (memcmp(sig, "\xf\xb""dbg", sizeof(sig)) == 0) )
520 {
521 show_execution_state(regs);
522 regs->eip += sizeof(sig);
523 return EXCRET_fault_fixed;
524 }
525 printk("%02x %02x %02x %02x %02x\n",
526 (unsigned char)sig[0],
527 (unsigned char)sig[1],
528 (unsigned char)sig[2],
529 (unsigned char)sig[3],
530 (unsigned char)sig[4]);
531 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
532 show_execution_state(regs);
533 panic("CPU%d FATAL TRAP: vector = %d (invalid opcode)\n",
534 smp_processor_id(), TRAP_invalid_op);
535 }
537 if ( (rc = emulate_forced_invalid_op(regs)) != 0 )
538 return rc;
540 ti = &current->arch.guest_context.trap_ctxt[TRAP_invalid_op];
541 tb->flags = TBF_EXCEPTION;
542 tb->cs = ti->cs;
543 tb->eip = ti->address;
544 if ( TI_GET_IF(ti) )
545 tb->flags |= TBF_INTERRUPT;
547 return 0;
548 }
550 asmlinkage int do_int3(struct cpu_user_regs *regs)
551 {
552 struct vcpu *v = current;
553 struct trap_bounce *tb = &v->arch.trap_bounce;
554 struct trap_info *ti;
556 DEBUGGER_trap_entry(TRAP_int3, regs);
558 if ( !guest_mode(regs) )
559 {
560 DEBUGGER_trap_fatal(TRAP_int3, regs);
561 show_execution_state(regs);
562 panic("CPU%d FATAL TRAP: vector = 3 (Int3)\n", smp_processor_id());
563 }
565 ti = &current->arch.guest_context.trap_ctxt[TRAP_int3];
566 tb->flags = TBF_EXCEPTION;
567 tb->cs = ti->cs;
568 tb->eip = ti->address;
569 if ( TI_GET_IF(ti) )
570 tb->flags |= TBF_INTERRUPT;
572 return 0;
573 }
575 asmlinkage int do_machine_check(struct cpu_user_regs *regs)
576 {
577 fatal_trap(TRAP_machine_check, regs);
578 return 0;
579 }
581 void propagate_page_fault(unsigned long addr, u16 error_code)
582 {
583 struct trap_info *ti;
584 struct vcpu *v = current;
585 struct trap_bounce *tb = &v->arch.trap_bounce;
587 v->arch.guest_context.ctrlreg[2] = addr;
588 v->vcpu_info->arch.cr2 = addr;
590 /* Re-set error_code.user flag appropriately for the guest. */
591 error_code &= ~PGERR_user_mode;
592 if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
593 error_code |= PGERR_user_mode;
595 ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
596 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
597 tb->error_code = error_code;
598 tb->cs = ti->cs;
599 tb->eip = ti->address;
600 if ( TI_GET_IF(ti) )
601 tb->flags |= TBF_INTERRUPT;
602 }
604 static int handle_gdt_ldt_mapping_fault(
605 unsigned long offset, struct cpu_user_regs *regs)
606 {
607 extern int map_ldt_shadow_page(unsigned int);
609 struct vcpu *v = current;
610 struct domain *d = v->domain;
611 int ret;
613 /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
614 unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
615 unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
617 /* Should never fault in another vcpu's area. */
618 BUG_ON(vcpu_area != current->vcpu_id);
620 /* Byte offset within the gdt/ldt sub-area. */
621 offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
623 if ( likely(is_ldt_area) )
624 {
625 /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
626 LOCK_BIGLOCK(d);
627 cleanup_writable_pagetable(d);
628 ret = map_ldt_shadow_page(offset >> PAGE_SHIFT);
629 UNLOCK_BIGLOCK(d);
631 if ( unlikely(ret == 0) )
632 {
633 /* In hypervisor mode? Leave it to the #PF handler to fix up. */
634 if ( !guest_mode(regs) )
635 return 0;
636 /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */
637 propagate_page_fault(
638 v->arch.guest_context.ldt_base + offset, regs->error_code);
639 }
640 }
641 else
642 {
643 /* GDT fault: handle the fault as #GP(selector). */
644 regs->error_code = (u16)offset & ~7;
645 (void)do_general_protection(regs);
646 }
648 return EXCRET_fault_fixed;
649 }
651 #ifdef HYPERVISOR_VIRT_END
652 #define IN_HYPERVISOR_RANGE(va) \
653 (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
654 #else
655 #define IN_HYPERVISOR_RANGE(va) \
656 (((va) >= HYPERVISOR_VIRT_START))
657 #endif
659 static int __spurious_page_fault(
660 unsigned long addr, struct cpu_user_regs *regs)
661 {
662 unsigned long mfn, cr3 = read_cr3();
663 #if CONFIG_PAGING_LEVELS >= 4
664 l4_pgentry_t l4e, *l4t;
665 #endif
666 #if CONFIG_PAGING_LEVELS >= 3
667 l3_pgentry_t l3e, *l3t;
668 #endif
669 l2_pgentry_t l2e, *l2t;
670 l1_pgentry_t l1e, *l1t;
671 unsigned int required_flags, disallowed_flags;
673 /* Reserved bit violations are never spurious faults. */
674 if ( regs->error_code & PGERR_reserved_bit )
675 return 0;
677 required_flags = _PAGE_PRESENT;
678 if ( regs->error_code & PGERR_write_access )
679 required_flags |= _PAGE_RW;
680 if ( regs->error_code & PGERR_user_mode )
681 required_flags |= _PAGE_USER;
683 disallowed_flags = 0;
684 if ( regs->error_code & PGERR_instr_fetch )
685 disallowed_flags |= _PAGE_NX;
687 mfn = cr3 >> PAGE_SHIFT;
689 #if CONFIG_PAGING_LEVELS >= 4
690 l4t = map_domain_page(mfn);
691 l4e = l4t[l4_table_offset(addr)];
692 mfn = l4e_get_pfn(l4e);
693 unmap_domain_page(l4t);
694 if ( !(l4e_get_flags(l4e) & required_flags) ||
695 (l4e_get_flags(l4e) & disallowed_flags) )
696 return 0;
697 #endif
699 #if CONFIG_PAGING_LEVELS >= 3
700 l3t = map_domain_page(mfn);
701 #ifdef CONFIG_X86_PAE
702 l3t += (cr3 & 0xFE0UL) >> 3;
703 #endif
704 l3e = l3t[l3_table_offset(addr)];
705 mfn = l3e_get_pfn(l3e);
706 unmap_domain_page(l3t);
707 #ifdef CONFIG_X86_PAE
708 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
709 return 0;
710 #else
711 if ( !(l3e_get_flags(l3e) & required_flags) ||
712 (l3e_get_flags(l3e) & disallowed_flags) )
713 return 0;
714 #endif
715 #endif
717 l2t = map_domain_page(mfn);
718 l2e = l2t[l2_table_offset(addr)];
719 mfn = l2e_get_pfn(l2e);
720 unmap_domain_page(l2t);
721 if ( !(l2e_get_flags(l2e) & required_flags) ||
722 (l2e_get_flags(l2e) & disallowed_flags) )
723 return 0;
724 if ( l2e_get_flags(l2e) & _PAGE_PSE )
725 return 1;
727 l1t = map_domain_page(mfn);
728 l1e = l1t[l1_table_offset(addr)];
729 mfn = l1e_get_pfn(l1e);
730 unmap_domain_page(l1t);
731 if ( !(l1e_get_flags(l1e) & required_flags) ||
732 (l1e_get_flags(l1e) & disallowed_flags) )
733 return 0;
734 return 1;
735 }
737 static int spurious_page_fault(
738 unsigned long addr, struct cpu_user_regs *regs)
739 {
740 struct domain *d = current->domain;
741 int is_spurious;
743 LOCK_BIGLOCK(d);
744 cleanup_writable_pagetable(d);
745 is_spurious = __spurious_page_fault(addr, regs);
746 UNLOCK_BIGLOCK(d);
748 return is_spurious;
749 }
751 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
752 {
753 struct vcpu *v = current;
754 struct domain *d = v->domain;
756 if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
757 {
758 if ( shadow_mode_external(d) && guest_mode(regs) )
759 return shadow_fault(addr, regs);
760 if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
761 return handle_gdt_ldt_mapping_fault(
762 addr - GDT_LDT_VIRT_START, regs);
763 /*
764 * Do not propagate spurious faults in the hypervisor area to the
765 * guest. It cannot fix them up.
766 */
767 return (spurious_page_fault(addr, regs) ? EXCRET_not_a_fault : 0);
768 }
770 if ( unlikely(shadow_mode_enabled(d)) )
771 return shadow_fault(addr, regs);
773 if ( likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
774 {
775 LOCK_BIGLOCK(d);
776 if ( unlikely(d->arch.ptwr[PTWR_PT_ACTIVE].l1va) &&
777 unlikely(l2_linear_offset(addr) ==
778 d->arch.ptwr[PTWR_PT_ACTIVE].l2_idx) )
779 {
780 ptwr_flush(d, PTWR_PT_ACTIVE);
781 UNLOCK_BIGLOCK(d);
782 return EXCRET_fault_fixed;
783 }
785 /*
786 * Note it is *not* safe to check PGERR_page_present here. It can be
787 * clear, due to unhooked page table, when we would otherwise expect
788 * it to be set. We have an aversion to trusting that flag in Xen, and
789 * guests ought to be leery too.
790 */
791 if ( guest_kernel_mode(v, regs) &&
792 (regs->error_code & PGERR_write_access) &&
793 ptwr_do_page_fault(d, addr, regs) )
794 {
795 UNLOCK_BIGLOCK(d);
796 return EXCRET_fault_fixed;
797 }
798 UNLOCK_BIGLOCK(d);
799 }
801 return 0;
802 }
804 /*
805 * #PF error code:
806 * Bit 0: Protection violation (=1) ; Page not present (=0)
807 * Bit 1: Write access
808 * Bit 2: User mode (=1) ; Supervisor mode (=0)
809 * Bit 3: Reserved bit violation
810 * Bit 4: Instruction fetch
811 */
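/* For example, error_code 0x0002 is a supervisor-mode write to a not-present
   page, while 0x0007 is a user-mode write that hit a protection violation. */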
812 asmlinkage int do_page_fault(struct cpu_user_regs *regs)
813 {
814 unsigned long addr, fixup;
815 int rc;
817 ASSERT(!in_irq());
819 __asm__ __volatile__ ("mov %%cr2,%0" : "=r" (addr) : );
821 DEBUGGER_trap_entry(TRAP_page_fault, regs);
823 perfc_incrc(page_faults);
825 if ( unlikely((rc = fixup_page_fault(addr, regs)) != 0) )
826 return rc;
828 if ( unlikely(!guest_mode(regs)) )
829 {
830 if ( spurious_page_fault(addr, regs) )
831 {
832 DPRINTK("Spurious fault in domain %u:%u at addr %lx\n",
833 current->domain->domain_id, current->vcpu_id, addr);
834 return EXCRET_not_a_fault;
835 }
837 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
838 {
839 perfc_incrc(copy_user_faults);
840 regs->eip = fixup;
841 return 0;
842 }
844 DEBUGGER_trap_fatal(TRAP_page_fault, regs);
846 show_execution_state(regs);
847 show_page_walk(addr);
848 panic("CPU%d FATAL PAGE FAULT\n"
849 "[error_code=%04x]\n"
850 "Faulting linear address: %p\n",
851 smp_processor_id(), regs->error_code, _p(addr));
852 }
854 propagate_page_fault(addr, regs->error_code);
855 return 0;
856 }
858 long do_fpu_taskswitch(int set)
859 {
860 struct vcpu *v = current;
862 if ( set )
863 {
864 v->arch.guest_context.ctrlreg[0] |= X86_CR0_TS;
865 stts();
866 }
867 else
868 {
869 v->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
870 if ( test_bit(_VCPUF_fpu_dirtied, &v->vcpu_flags) )
871 clts();
872 }
874 return 0;
875 }
877 /* Has the guest requested sufficient permission for this I/O access? */
878 static inline int guest_io_okay(
879 unsigned int port, unsigned int bytes,
880 struct vcpu *v, struct cpu_user_regs *regs)
881 {
882 u16 x;
883 #if defined(__x86_64__)
884 /* If in user mode, switch to kernel mode just to read I/O bitmap. */
885 int user_mode = !(v->arch.flags & TF_kernel_mode);
886 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
887 #elif defined(__i386__)
888 #define TOGGLE_MODE() ((void)0)
889 #endif
891 if ( !vm86_mode(regs) &&
892 (v->arch.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
893 return 1;
895 if ( v->arch.iobmp_limit > (port + bytes) )
896 {
897 TOGGLE_MODE();
898 __get_user(x, (u16 *)(v->arch.iobmp+(port>>3)));
899 TOGGLE_MODE();
900 if ( (x & (((1<<bytes)-1) << (port&7))) == 0 )
901 return 1;
902 }
904 return 0;
905 }
907 /* Has the administrator granted sufficient permission for this I/O access? */
908 static inline int admin_io_okay(
909 unsigned int port, unsigned int bytes,
910 struct vcpu *v, struct cpu_user_regs *regs)
911 {
912 return ioports_access_permitted(v->domain, port, port + bytes - 1);
913 }
915 /* Check admin limits. Silently fail the access if it is disallowed. */
916 #define inb_user(_p, _d, _r) (admin_io_okay(_p, 1, _d, _r) ? inb(_p) : ~0)
917 #define inw_user(_p, _d, _r) (admin_io_okay(_p, 2, _d, _r) ? inw(_p) : ~0)
918 #define inl_user(_p, _d, _r) (admin_io_okay(_p, 4, _d, _r) ? inl(_p) : ~0)
919 #define outb_user(_v, _p, _d, _r) \
920 (admin_io_okay(_p, 1, _d, _r) ? outb(_v, _p) : ((void)0))
921 #define outw_user(_v, _p, _d, _r) \
922 (admin_io_okay(_p, 2, _d, _r) ? outw(_v, _p) : ((void)0))
923 #define outl_user(_v, _p, _d, _r) \
924 (admin_io_okay(_p, 4, _d, _r) ? outl(_v, _p) : ((void)0))
926 /* Propagate a fault back to the guest kernel. */
927 #define PAGE_FAULT(_faultaddr, _errcode) \
928 ({ propagate_page_fault(_faultaddr, _errcode); \
929 return EXCRET_fault_fixed; \
930 })
932 /* Instruction fetch with error handling. */
933 #define insn_fetch(_type, _size, _ptr) \
934 ({ unsigned long _x; \
935 if ( get_user(_x, (_type *)eip) ) \
936 PAGE_FAULT(eip, 0); /* read fault */ \
937 eip += _size; (_type)_x; })
939 static int emulate_privileged_op(struct cpu_user_regs *regs)
940 {
941 struct vcpu *v = current;
942 unsigned long *reg, eip = regs->eip, res;
943 u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0;
944 unsigned int port, i, op_bytes = 4, data;
945 u32 l, h;
947 /* Legacy prefixes. */
948 for ( i = 0; i < 8; i++ )
949 {
950 switch ( opcode = insn_fetch(u8, 1, eip) )
951 {
952 case 0x66: /* operand-size override */
953 op_bytes ^= 6; /* switch between 2/4 bytes */
954 break;
955 case 0x67: /* address-size override */
956 case 0x2e: /* CS override */
957 case 0x3e: /* DS override */
958 case 0x26: /* ES override */
959 case 0x64: /* FS override */
960 case 0x65: /* GS override */
961 case 0x36: /* SS override */
962 case 0xf0: /* LOCK */
963 case 0xf2: /* REPNE/REPNZ */
964 break;
965 case 0xf3: /* REP/REPE/REPZ */
966 rep_prefix = 1;
967 break;
968 default:
969 goto done_prefixes;
970 }
971 }
972 done_prefixes:
974 #ifdef __x86_64__
975 /* REX prefix. */
976 if ( (opcode & 0xf0) == 0x40 )
977 {
978 modrm_reg = (opcode & 4) << 1; /* REX.R */
979 modrm_rm = (opcode & 1) << 3; /* REX.B */
981 /* REX.W and REX.X do not need to be decoded. */
982 opcode = insn_fetch(u8, 1, eip);
983 }
984 #endif
986 /* Input/Output String instructions. */
987 if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
988 {
989 if ( rep_prefix && (regs->ecx == 0) )
990 goto done;
992 continue_io_string:
993 switch ( opcode )
994 {
995 case 0x6c: /* INSB */
996 op_bytes = 1;
997 case 0x6d: /* INSW/INSL */
998 if ( !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
999 goto fail;
1000 switch ( op_bytes )
1002 case 1:
1003 data = (u8)inb_user((u16)regs->edx, v, regs);
1004 if ( put_user((u8)data, (u8 *)regs->edi) )
1005 PAGE_FAULT(regs->edi, PGERR_write_access);
1006 break;
1007 case 2:
1008 data = (u16)inw_user((u16)regs->edx, v, regs);
1009 if ( put_user((u16)data, (u16 *)regs->edi) )
1010 PAGE_FAULT(regs->edi, PGERR_write_access);
1011 break;
1012 case 4:
1013 data = (u32)inl_user((u16)regs->edx, v, regs);
1014 if ( put_user((u32)data, (u32 *)regs->edi) )
1015 PAGE_FAULT(regs->edi, PGERR_write_access);
1016 break;
1018 regs->edi += (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes);
1019 break;
1021 case 0x6e: /* OUTSB */
1022 op_bytes = 1;
1023 case 0x6f: /* OUTSW/OUTSL */
1024 if ( !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
1025 goto fail;
1026 switch ( op_bytes )
1028 case 1:
1029 if ( get_user(data, (u8 *)regs->esi) )
1030 PAGE_FAULT(regs->esi, 0); /* read fault */
1031 outb_user((u8)data, (u16)regs->edx, v, regs);
1032 break;
1033 case 2:
1034 if ( get_user(data, (u16 *)regs->esi) )
1035 PAGE_FAULT(regs->esi, 0); /* read fault */
1036 outw_user((u16)data, (u16)regs->edx, v, regs);
1037 break;
1038 case 4:
1039 if ( get_user(data, (u32 *)regs->esi) )
1040 PAGE_FAULT(regs->esi, 0); /* read fault */
1041 outl_user((u32)data, (u16)regs->edx, v, regs);
1042 break;
1044 regs->esi += (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes);
1045 break;
1048 if ( rep_prefix && (--regs->ecx != 0) )
1050 if ( !hypercall_preempt_check() )
1051 goto continue_io_string;
1052 eip = regs->eip;
1055 goto done;
1058 /* I/O Port and Interrupt Flag instructions. */
1059 switch ( opcode )
1061 case 0xe4: /* IN imm8,%al */
1062 op_bytes = 1;
1063 case 0xe5: /* IN imm8,%eax */
1064 port = insn_fetch(u8, 1, eip);
1065 exec_in:
1066 if ( !guest_io_okay(port, op_bytes, v, regs) )
1067 goto fail;
1068 switch ( op_bytes )
1070 case 1:
1071 regs->eax &= ~0xffUL;
1072 regs->eax |= (u8)inb_user(port, v, regs);
1073 break;
1074 case 2:
1075 regs->eax &= ~0xffffUL;
1076 regs->eax |= (u16)inw_user(port, v, regs);
1077 break;
1078 case 4:
1079 regs->eax = (u32)inl_user(port, v, regs);
1080 break;
1082 goto done;
1084 case 0xec: /* IN %dx,%al */
1085 op_bytes = 1;
1086 case 0xed: /* IN %dx,%eax */
1087 port = (u16)regs->edx;
1088 goto exec_in;
1090 case 0xe6: /* OUT %al,imm8 */
1091 op_bytes = 1;
1092 case 0xe7: /* OUT %eax,imm8 */
1093 port = insn_fetch(u8, 1, eip);
1094 exec_out:
1095 if ( !guest_io_okay(port, op_bytes, v, regs) )
1096 goto fail;
1097 switch ( op_bytes )
1099 case 1:
1100 outb_user((u8)regs->eax, port, v, regs);
1101 break;
1102 case 2:
1103 outw_user((u16)regs->eax, port, v, regs);
1104 break;
1105 case 4:
1106 outl_user((u32)regs->eax, port, v, regs);
1107 break;
1109 goto done;
1111 case 0xee: /* OUT %al,%dx */
1112 op_bytes = 1;
1113 case 0xef: /* OUT %eax,%dx */
1114 port = (u16)regs->edx;
1115 goto exec_out;
1117 case 0xfa: /* CLI */
1118 case 0xfb: /* STI */
1119 if ( v->arch.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
1120 goto fail;
1121 /*
1122 * This is just too dangerous to allow, in my opinion. Consider if the
1123 * caller then tries to reenable interrupts using POPF: we can't trap
1124 * that and we'll end up with hard-to-debug lockups. Fast & loose will
1125 * do for us. :-)
1126 */
1127 /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
1128 goto done;
1130 case 0x0f: /* Two-byte opcode */
1131 break;
1133 default:
1134 goto fail;
1137 /* Remaining instructions only emulated from guest kernel. */
1138 if ( !guest_kernel_mode(v, regs) )
1139 goto fail;
1141 /* Privileged (ring 0) instructions. */
1142 opcode = insn_fetch(u8, 1, eip);
1143 switch ( opcode )
1145 case 0x06: /* CLTS */
1146 (void)do_fpu_taskswitch(0);
1147 break;
1149 case 0x09: /* WBINVD */
1150 /* Ignore the instruction if unprivileged. */
1151 if ( !cache_flush_permitted(v->domain) )
1152 /* Non-physdev domain attempted WBINVD; ignore for now since
1153 newer linux uses this in some start-of-day timing loops */
1155 else
1156 wbinvd();
1157 break;
1159 case 0x20: /* MOV CR?,<reg> */
1160 opcode = insn_fetch(u8, 1, eip);
1161 modrm_reg |= (opcode >> 3) & 7;
1162 modrm_rm |= (opcode >> 0) & 7;
1163 reg = decode_register(modrm_rm, regs, 0);
1164 switch ( modrm_reg )
1166 case 0: /* Read CR0 */
1167 *reg = (read_cr0() & ~X86_CR0_TS) |
1168 v->arch.guest_context.ctrlreg[0];
1169 break;
1171 case 2: /* Read CR2 */
1172 *reg = v->arch.guest_context.ctrlreg[2];
1173 break;
1175 case 3: /* Read CR3 */
1176 *reg = xen_pfn_to_cr3(mfn_to_gmfn(
1177 v->domain, pagetable_get_pfn(v->arch.guest_table)));
1178 break;
1180 case 4: /* Read CR4 */
1181 /*
1182 * Guests can read CR4 to see what features Xen has enabled. We
1183 * therefore lie about PGE & PSE as they are unavailable to guests.
1184 */
1185 *reg = read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE);
1186 break;
1188 default:
1189 goto fail;
1191 break;
1193 case 0x21: /* MOV DR?,<reg> */
1194 opcode = insn_fetch(u8, 1, eip);
1195 modrm_reg |= (opcode >> 3) & 7;
1196 modrm_rm |= (opcode >> 0) & 7;
1197 reg = decode_register(modrm_rm, regs, 0);
1198 if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
1199 goto fail;
1200 *reg = res;
1201 break;
1203 case 0x22: /* MOV <reg>,CR? */
1204 opcode = insn_fetch(u8, 1, eip);
1205 modrm_reg |= (opcode >> 3) & 7;
1206 modrm_rm |= (opcode >> 0) & 7;
1207 reg = decode_register(modrm_rm, regs, 0);
1208 switch ( modrm_reg )
1210 case 0: /* Write CR0 */
1211 if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
1213 DPRINTK("Attempt to change unmodifiable CR0 flags.\n");
1214 goto fail;
1216 (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
1217 break;
1219 case 2: /* Write CR2 */
1220 v->arch.guest_context.ctrlreg[2] = *reg;
1221 v->vcpu_info->arch.cr2 = *reg;
1222 break;
1224 case 3: /* Write CR3 */
1225 LOCK_BIGLOCK(v->domain);
1226 cleanup_writable_pagetable(v->domain);
1227 (void)new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg)));
1228 UNLOCK_BIGLOCK(v->domain);
1229 break;
1231 case 4:
1232 if ( *reg != (read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE)) )
1234 DPRINTK("Attempt to change CR4 flags.\n");
1235 goto fail;
1237 break;
1239 default:
1240 goto fail;
1242 break;
1244 case 0x23: /* MOV <reg>,DR? */
1245 opcode = insn_fetch(u8, 1, eip);
1246 modrm_reg |= (opcode >> 3) & 7;
1247 modrm_rm |= (opcode >> 0) & 7;
1248 reg = decode_register(modrm_rm, regs, 0);
1249 if ( do_set_debugreg(modrm_reg, *reg) != 0 )
1250 goto fail;
1251 break;
1253 case 0x30: /* WRMSR */
1254 switch ( regs->ecx )
1256 #ifdef CONFIG_X86_64
1257 case MSR_FS_BASE:
1258 if ( wrmsr_safe(MSR_FS_BASE, regs->eax, regs->edx) )
1259 goto fail;
1260 v->arch.guest_context.fs_base =
1261 ((u64)regs->edx << 32) | regs->eax;
1262 break;
1263 case MSR_GS_BASE:
1264 if ( wrmsr_safe(MSR_GS_BASE, regs->eax, regs->edx) )
1265 goto fail;
1266 v->arch.guest_context.gs_base_kernel =
1267 ((u64)regs->edx << 32) | regs->eax;
1268 break;
1269 case MSR_SHADOW_GS_BASE:
1270 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, regs->eax, regs->edx) )
1271 goto fail;
1272 v->arch.guest_context.gs_base_user =
1273 ((u64)regs->edx << 32) | regs->eax;
1274 break;
1275 #endif
1276 default:
1277 if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
1278 (regs->eax != l) || (regs->edx != h) )
1279 DPRINTK("Domain attempted WRMSR %p from "
1280 "%08x:%08x to %08lx:%08lx.\n",
1281 _p(regs->ecx), h, l, (long)regs->edx, (long)regs->eax);
1282 break;
1284 break;
1286 case 0x32: /* RDMSR */
1287 switch ( regs->ecx )
1289 #ifdef CONFIG_X86_64
1290 case MSR_FS_BASE:
1291 regs->eax = v->arch.guest_context.fs_base & 0xFFFFFFFFUL;
1292 regs->edx = v->arch.guest_context.fs_base >> 32;
1293 break;
1294 case MSR_GS_BASE:
1295 regs->eax = v->arch.guest_context.gs_base_kernel & 0xFFFFFFFFUL;
1296 regs->edx = v->arch.guest_context.gs_base_kernel >> 32;
1297 break;
1298 case MSR_SHADOW_GS_BASE:
1299 regs->eax = v->arch.guest_context.gs_base_user & 0xFFFFFFFFUL;
1300 regs->edx = v->arch.guest_context.gs_base_user >> 32;
1301 break;
1302 #endif
1303 case MSR_EFER:
1304 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1305 goto fail;
1306 break;
1307 default:
1308 /* Everyone can read the MSR space. */
1309 /*DPRINTK("Domain attempted RDMSR %p.\n", _p(regs->ecx));*/
1310 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1311 goto fail;
1312 break;
1314 break;
1316 default:
1317 goto fail;
1320 done:
1321 regs->eip = eip;
1322 return EXCRET_fault_fixed;
1324 fail:
1325 return 0;
1328 asmlinkage int do_general_protection(struct cpu_user_regs *regs)
1330 struct vcpu *v = current;
1331 struct trap_bounce *tb = &v->arch.trap_bounce;
1332 struct trap_info *ti;
1333 unsigned long fixup;
1335 DEBUGGER_trap_entry(TRAP_gp_fault, regs);
1337 if ( regs->error_code & 1 )
1338 goto hardware_gp;
1340 if ( !guest_mode(regs) )
1341 goto gp_in_kernel;
1343 /*
1344 * Cunning trick to allow arbitrary "INT n" handling.
1346 * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
1347 * instruction from trapping to the appropriate vector, when that might not
1348 * be expected by Xen or the guest OS. For example, that entry might be for
1349 * a fault handler (unlike traps, faults don't increment EIP), or might
1350 * expect an error code on the stack (which a software trap never
1351 * provides), or might be a hardware interrupt handler that doesn't like
1352 * being called spuriously.
1354 * Instead, a GPF occurs with the faulting IDT vector in the error code.
1355 * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
1356 * clear to indicate that it's a software fault, not hardware.
1358 * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
1359 * okay because they can only be triggered by an explicit DPL-checked
1360 * instruction. The DPL specified by the guest OS for these vectors is NOT
1361 * CHECKED!!
1362 */
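/* For example, a guest "int $0x80" hitting a DPL-0 IDT entry arrives here
   with error_code 0x402: (vector 0x80 << 3) with bit 1 (IDT) set. */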
1363 if ( (regs->error_code & 3) == 2 )
1365 /* This fault must be due to <INT n> instruction. */
1366 ti = &current->arch.guest_context.trap_ctxt[regs->error_code>>3];
1367 if ( permit_softint(TI_GET_DPL(ti), v, regs) )
1369 tb->flags = TBF_EXCEPTION;
1370 regs->eip += 2;
1371 goto finish_propagation;
1375 /* Emulate some simple privileged and I/O instructions. */
1376 if ( (regs->error_code == 0) &&
1377 emulate_privileged_op(regs) )
1378 return 0;
1380 #if defined(__i386__)
1381 if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
1382 (regs->error_code == 0) &&
1383 gpf_emulate_4gb(regs) )
1384 return 0;
1385 #endif
1387 /* Pass on GPF as is. */
1388 ti = &current->arch.guest_context.trap_ctxt[TRAP_gp_fault];
1389 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
1390 tb->error_code = regs->error_code;
1391 finish_propagation:
1392 tb->cs = ti->cs;
1393 tb->eip = ti->address;
1394 if ( TI_GET_IF(ti) )
1395 tb->flags |= TBF_INTERRUPT;
1396 return 0;
1398 gp_in_kernel:
1400 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
1402 DPRINTK("GPF (%04x): %p -> %p\n",
1403 regs->error_code, _p(regs->eip), _p(fixup));
1404 regs->eip = fixup;
1405 return 0;
1408 DEBUGGER_trap_fatal(TRAP_gp_fault, regs);
1410 hardware_gp:
1411 show_execution_state(regs);
1412 panic("CPU%d GENERAL PROTECTION FAULT\n[error_code=%04x]\n",
1413 smp_processor_id(), regs->error_code);
1414 return 0;
1417 static void nmi_softirq(void)
1419 /* Only used to defer wakeup of dom0,vcpu0 to a safe (non-NMI) context. */
1420 vcpu_kick(dom0->vcpu[0]);
1423 static void nmi_dom0_report(unsigned int reason_idx)
1425 struct domain *d;
1426 struct vcpu *v;
1428 if ( ((d = dom0) == NULL) || ((v = d->vcpu[0]) == NULL) )
1429 return;
1431 set_bit(reason_idx, &d->shared_info->arch.nmi_reason);
1433 if ( test_and_set_bit(_VCPUF_nmi_pending, &v->vcpu_flags) )
1434 raise_softirq(NMI_SOFTIRQ); /* not safe to wake up a vcpu here */
1437 asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
1439 switch ( opt_nmi[0] )
1441 case 'd': /* 'dom0' */
1442 nmi_dom0_report(_XEN_NMIREASON_parity_error);
1443 case 'i': /* 'ignore' */
1444 break;
1445 default: /* 'fatal' */
1446 console_force_unlock();
1447 printk("\n\nNMI - MEMORY ERROR\n");
1448 fatal_trap(TRAP_nmi, regs);
1451 outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable parity check */
1452 mdelay(1);
1453 outb((inb(0x61) & 0x0b) | 0x00, 0x61); /* enable parity check */
1456 asmlinkage void io_check_error(struct cpu_user_regs *regs)
1458 switch ( opt_nmi[0] )
1460 case 'd': /* 'dom0' */
1461 nmi_dom0_report(_XEN_NMIREASON_io_error);
1462 case 'i': /* 'ignore' */
1463 break;
1464 default: /* 'fatal' */
1465 console_force_unlock();
1466 printk("\n\nNMI - I/O ERROR\n");
1467 fatal_trap(TRAP_nmi, regs);
1470 outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
1471 mdelay(1);
1472 outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
1475 static void unknown_nmi_error(unsigned char reason)
1477 switch ( opt_nmi[0] )
1479 case 'd': /* 'dom0' */
1480 nmi_dom0_report(_XEN_NMIREASON_unknown);
1481 case 'i': /* 'ignore' */
1482 break;
1483 default: /* 'fatal' */
1484 printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
1485 printk("Dazed and confused, but trying to continue\n");
1486 printk("Do you have a strange power saving mode enabled?\n");
1490 static int dummy_nmi_callback(struct cpu_user_regs *regs, int cpu)
1492 return 0;
1495 static nmi_callback_t nmi_callback = dummy_nmi_callback;
1497 asmlinkage void do_nmi(struct cpu_user_regs *regs)
1499 unsigned int cpu = smp_processor_id();
1500 unsigned char reason;
1502 ++nmi_count(cpu);
1504 if ( nmi_callback(regs, cpu) )
1505 return;
1507 if ( nmi_watchdog )
1508 nmi_watchdog_tick(regs);
1510 /* Only the BSP gets external NMIs from the system. */
1511 if ( cpu == 0 )
1513 reason = inb(0x61);
1514 if ( reason & 0x80 )
1515 mem_parity_error(regs);
1516 else if ( reason & 0x40 )
1517 io_check_error(regs);
1518 else if ( !nmi_watchdog )
1519 unknown_nmi_error((unsigned char)(reason&0xff));
1523 void set_nmi_callback(nmi_callback_t callback)
1525 nmi_callback = callback;
1528 void unset_nmi_callback(void)
1530 nmi_callback = dummy_nmi_callback;
1533 asmlinkage int math_state_restore(struct cpu_user_regs *regs)
1535 struct trap_bounce *tb;
1536 struct trap_info *ti;
1538 setup_fpu(current);
1540 if ( current->arch.guest_context.ctrlreg[0] & X86_CR0_TS )
1542 tb = &current->arch.trap_bounce;
1543 ti = &current->arch.guest_context.trap_ctxt[TRAP_no_device];
1545 tb->flags = TBF_EXCEPTION;
1546 tb->cs = ti->cs;
1547 tb->eip = ti->address;
1548 if ( TI_GET_IF(ti) )
1549 tb->flags |= TBF_INTERRUPT;
1551 current->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
1554 return EXCRET_fault_fixed;
1557 asmlinkage int do_debug(struct cpu_user_regs *regs)
1559 unsigned long condition;
1560 struct vcpu *v = current;
1561 struct trap_bounce *tb = &v->arch.trap_bounce;
1562 struct trap_info *ti;
1564 __asm__ __volatile__("mov %%db6,%0" : "=r" (condition));
1566 /* Mask out spurious debug traps due to lazy DR7 setting */
1567 if ( (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) &&
1568 (v->arch.guest_context.debugreg[7] == 0) )
1570 __asm__("mov %0,%%db7" : : "r" (0UL));
1571 goto out;
1574 DEBUGGER_trap_entry(TRAP_debug, regs);
1576 if ( !guest_mode(regs) )
1578 /* Clear TF just for absolute sanity. */
1579 regs->eflags &= ~EF_TF;
1580 /*
1581 * We ignore watchpoints when they trigger within Xen. This may happen
1582 * when a buffer is passed to us which previously had a watchpoint set
1583 * on it. No need to bump EIP; the only faulting trap is an instruction
1584 * breakpoint, which can't happen to us.
1585 */
1586 goto out;
1589 /* Save debug status register where guest OS can peek at it */
1590 v->arch.guest_context.debugreg[6] = condition;
1592 ti = &v->arch.guest_context.trap_ctxt[TRAP_debug];
1593 tb->flags = TBF_EXCEPTION;
1594 tb->cs = ti->cs;
1595 tb->eip = ti->address;
1596 if ( TI_GET_IF(ti) )
1597 tb->flags |= TBF_INTERRUPT;
1599 out:
1600 return EXCRET_not_a_fault;
1603 asmlinkage int do_spurious_interrupt_bug(struct cpu_user_regs *regs)
1605 return EXCRET_not_a_fault;
1608 void set_intr_gate(unsigned int n, void *addr)
1610 #ifdef __i386__
1611 int i;
1612 /* Keep secondary tables in sync with IRQ updates. */
1613 for ( i = 1; i < NR_CPUS; i++ )
1614 if ( idt_tables[i] != NULL )
1615 _set_gate(&idt_tables[i][n], 14, 0, addr);
1616 #endif
1617 _set_gate(&idt_table[n], 14, 0, addr);
1620 void set_system_gate(unsigned int n, void *addr)
1622 _set_gate(idt_table+n,14,3,addr);
1625 void set_task_gate(unsigned int n, unsigned int sel)
1627 idt_table[n].a = sel << 16;
1628 idt_table[n].b = 0x8500;
1631 void set_tss_desc(unsigned int n, void *addr)
1633 _set_tssldt_desc(
1634 gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
1635 (unsigned long)addr,
1636 offsetof(struct tss_struct, __cacheline_filler) - 1,
1637 9);
1640 void __init trap_init(void)
1642 extern void percpu_traps_init(void);
1644 /*
1645 * Note that interrupt gates are always used, rather than trap gates. We
1646 * must have interrupts disabled until DS/ES/FS/GS are saved because the
1647 * first activation must have the "bad" value(s) for these registers and
1648 * we may lose them if another activation is installed before they are
1649 * saved. The page-fault handler also needs interrupts disabled until %cr2
1650 * has been read and saved on the stack.
1651 */
1652 set_intr_gate(TRAP_divide_error,&divide_error);
1653 set_intr_gate(TRAP_debug,&debug);
1654 set_intr_gate(TRAP_nmi,&nmi);
1655 set_system_gate(TRAP_int3,&int3); /* usable from all privileges */
1656 set_system_gate(TRAP_overflow,&overflow); /* usable from all privileges */
1657 set_intr_gate(TRAP_bounds,&bounds);
1658 set_intr_gate(TRAP_invalid_op,&invalid_op);
1659 set_intr_gate(TRAP_no_device,&device_not_available);
1660 set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
1661 set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
1662 set_intr_gate(TRAP_no_segment,&segment_not_present);
1663 set_intr_gate(TRAP_stack_error,&stack_segment);
1664 set_intr_gate(TRAP_gp_fault,&general_protection);
1665 set_intr_gate(TRAP_page_fault,&page_fault);
1666 set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
1667 set_intr_gate(TRAP_copro_error,&coprocessor_error);
1668 set_intr_gate(TRAP_alignment_check,&alignment_check);
1669 set_intr_gate(TRAP_machine_check,&machine_check);
1670 set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
1672 percpu_traps_init();
1674 cpu_init();
1676 open_softirq(NMI_SOFTIRQ, nmi_softirq);
1680 long do_set_trap_table(XEN_GUEST_HANDLE(trap_info_t) traps)
1682 struct trap_info cur;
1683 struct trap_info *dst = current->arch.guest_context.trap_ctxt;
1684 long rc = 0;
1686 /* If no table is presented then clear the entire virtual IDT. */
1687 if ( guest_handle_is_null(traps) )
1689 memset(dst, 0, 256 * sizeof(*dst));
1690 init_int80_direct_trap(current);
1691 return 0;
1694 for ( ; ; )
1696 if ( hypercall_preempt_check() )
1698 rc = hypercall_create_continuation(
1699 __HYPERVISOR_set_trap_table, "h", traps);
1700 break;
1703 if ( copy_from_guest(&cur, traps, 1) )
1705 rc = -EFAULT;
1706 break;
1709 if ( cur.address == 0 )
1710 break;
1712 fixup_guest_code_selector(cur.cs);
1714 memcpy(&dst[cur.vector], &cur, sizeof(cur));
1716 if ( cur.vector == 0x80 )
1717 init_int80_direct_trap(current);
1719 guest_handle_add_offset(traps, 1);
1722 return rc;
1726 long set_debugreg(struct vcpu *p, int reg, unsigned long value)
1728 int i;
1730 switch ( reg )
1732 case 0:
1733 if ( !access_ok(value, sizeof(long)) )
1734 return -EPERM;
1735 if ( p == current )
1736 __asm__ ( "mov %0, %%db0" : : "r" (value) );
1737 break;
1738 case 1:
1739 if ( !access_ok(value, sizeof(long)) )
1740 return -EPERM;
1741 if ( p == current )
1742 __asm__ ( "mov %0, %%db1" : : "r" (value) );
1743 break;
1744 case 2:
1745 if ( !access_ok(value, sizeof(long)) )
1746 return -EPERM;
1747 if ( p == current )
1748 __asm__ ( "mov %0, %%db2" : : "r" (value) );
1749 break;
1750 case 3:
1751 if ( !access_ok(value, sizeof(long)) )
1752 return -EPERM;
1753 if ( p == current )
1754 __asm__ ( "mov %0, %%db3" : : "r" (value) );
1755 break;
1756 case 6:
1757 /*
1758 * DR6: Bits 4-11,16-31 reserved (set to 1).
1759 * Bit 12 reserved (set to 0).
1760 */
1761 value &= 0xffffefff; /* reserved bits => 0 */
1762 value |= 0xffff0ff0; /* reserved bits => 1 */
1763 if ( p == current )
1764 __asm__ ( "mov %0, %%db6" : : "r" (value) );
1765 break;
1766 case 7:
1767 /*
1768 * DR7: Bit 10 reserved (set to 1).
1769 * Bits 11-12,14-15 reserved (set to 0).
1770 * Privileged bits:
1771 * GD (bit 13): must be 0.
1772 * R/Wn (bits 16-17,20-21,24-25,28-29): mustn't be 10.
1773 * LENn (bits 18-19,22-23,26-27,30-31): mustn't be 10.
1774 */
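/* (A value of binary 10 in an R/Wn field would select I/O breakpoints, and in
   a LENn field an 8-byte breakpoint; neither is permitted to guests here.) */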
1775 /* DR7 == 0 => debugging disabled for this domain. */
1776 if ( value != 0 )
1778 value &= 0xffff27ff; /* reserved bits => 0 */
1779 value |= 0x00000400; /* reserved bits => 1 */
1780 if ( (value & (1<<13)) != 0 ) return -EPERM;
1781 for ( i = 0; i < 16; i += 2 )
1782 if ( ((value >> (i+16)) & 3) == 2 ) return -EPERM;
1784 if ( p == current )
1785 __asm__ ( "mov %0, %%db7" : : "r" (value) );
1786 break;
1787 default:
1788 return -EINVAL;
1791 p->arch.guest_context.debugreg[reg] = value;
1792 return 0;
1795 long do_set_debugreg(int reg, unsigned long value)
1797 return set_debugreg(current, reg, value);
1800 unsigned long do_get_debugreg(int reg)
1802 if ( (reg < 0) || (reg > 7) ) return -EINVAL;
1803 return current->arch.guest_context.debugreg[reg];
1806 /*
1807 * Local variables:
1808 * mode: C
1809 * c-set-style: "BSD"
1810 * c-basic-offset: 4
1811 * tab-width: 4
1812 * indent-tabs-mode: nil
1813 * End:
1814 */