xen/arch/x86/traps.c @ 6538:84ee014ebd41 (ia64/xen-unstable)

Merge xen-vtx-unstable.hg
author:   adsharma@los-vmm.sc.intel.com
date:     Wed Aug 17 12:34:38 2005 -0800
parents:  23979fb12c49 da515995a281
children: 99914b54f7bf
/******************************************************************************
 * arch/x86/traps.c
 *
 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

/*
 * Copyright (C) 1991, 1992 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 * Gareth Hughes <gareth@valinux.com>, May 2000
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/sched.h>
#include <xen/lib.h>
#include <xen/errno.h>
#include <xen/mm.h>
#include <xen/console.h>
#include <asm/regs.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/spinlock.h>
#include <xen/irq.h>
#include <xen/perfc.h>
#include <xen/softirq.h>
#include <xen/domain_page.h>
#include <xen/symbols.h>
#include <asm/shadow.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/atomic.h>
#include <asm/desc.h>
#include <asm/debugreg.h>
#include <asm/smp.h>
#include <asm/flushtlb.h>
#include <asm/uaccess.h>
#include <asm/i387.h>
#include <asm/debugger.h>
#include <asm/msr.h>
#include <asm/x86_emulate.h>

/*
 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
 *  fatal:  Xen prints diagnostic message and then hangs.
 *  dom0:   The NMI is virtualised to DOM0.
 *  ignore: The NMI error is cleared and ignored.
 */
#ifdef NDEBUG
char opt_nmi[10] = "dom0";
#else
char opt_nmi[10] = "fatal";
#endif
string_param("nmi", opt_nmi);
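
/*
 * string_param() registers "nmi" as a boot-time option, so the policy above
 * is chosen on the Xen command line, e.g. "nmi=dom0" or "nmi=ignore".
 */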

/* Master table, used by all CPUs on x86/64, and by CPU0 on x86/32. */
idt_entry_t idt_table[IDT_ENTRIES];

#define DECLARE_TRAP_HANDLER(_name) \
asmlinkage void _name(void); \
asmlinkage int do_ ## _name(struct cpu_user_regs *regs)
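
/*
 * For example, DECLARE_TRAP_HANDLER(debug) expands to declarations of the
 * assembly entry stub debug() and of its C-level handler do_debug(regs).
 */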

asmlinkage void nmi(void);
DECLARE_TRAP_HANDLER(divide_error);
DECLARE_TRAP_HANDLER(debug);
DECLARE_TRAP_HANDLER(int3);
DECLARE_TRAP_HANDLER(overflow);
DECLARE_TRAP_HANDLER(bounds);
DECLARE_TRAP_HANDLER(invalid_op);
DECLARE_TRAP_HANDLER(device_not_available);
DECLARE_TRAP_HANDLER(coprocessor_segment_overrun);
DECLARE_TRAP_HANDLER(invalid_TSS);
DECLARE_TRAP_HANDLER(segment_not_present);
DECLARE_TRAP_HANDLER(stack_segment);
DECLARE_TRAP_HANDLER(general_protection);
DECLARE_TRAP_HANDLER(page_fault);
DECLARE_TRAP_HANDLER(coprocessor_error);
DECLARE_TRAP_HANDLER(simd_coprocessor_error);
DECLARE_TRAP_HANDLER(alignment_check);
DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
DECLARE_TRAP_HANDLER(machine_check);

long do_set_debugreg(int reg, unsigned long value);
unsigned long do_get_debugreg(int reg);

static int debug_stack_lines = 20;
integer_param("debug_stack_lines", debug_stack_lines);

int is_kernel_text(unsigned long addr)
{
    extern char _stext, _etext;
    if (addr >= (unsigned long) &_stext &&
        addr <= (unsigned long) &_etext)
        return 1;
    return 0;
}

unsigned long kernel_text_end(void)
{
    extern char _etext;
    return (unsigned long) &_etext;
}

void show_guest_stack(void)
{
    int i;
    struct cpu_user_regs *regs = guest_cpu_user_regs();
    unsigned long *stack = (unsigned long *)regs->esp, addr;

    printk("Guest stack trace from "__OP"sp=%p:\n ", stack);

    for ( i = 0; i < (debug_stack_lines*8); i++ )
    {
        if ( ((long)stack & (STACK_SIZE-1)) == 0 )
            break;
        if ( get_user(addr, stack) )
        {
            if ( i != 0 )
                printk("\n ");
            printk("Fault while accessing guest memory.");
            i = 1;
            break;
        }
        if ( (i != 0) && ((i % 8) == 0) )
            printk("\n ");
        printk("%p ", _p(addr));
        stack++;
    }
    if ( i == 0 )
        printk("Stack empty.");
    printk("\n");
}

void show_trace(unsigned long *esp)
{
    unsigned long *stack = esp, addr;
    int i = 0;

    printk("Xen call trace from "__OP"sp=%p:\n ", stack);

    while ( ((long) stack & (STACK_SIZE-1)) != 0 )
    {
        addr = *stack++;
        if ( is_kernel_text(addr) )
        {
            if ( (i != 0) && ((i % 6) == 0) )
                printk("\n ");
            printk("[<%p>]", _p(addr));
            print_symbol(" %s\n", addr);
            i++;
        }
    }
    if ( i == 0 )
        printk("Trace empty.");
    printk("\n");
}

void show_stack(unsigned long *esp)
{
    unsigned long *stack = esp, addr;
    int i;

    printk("Xen stack trace from "__OP"sp=%p:\n ", stack);

    for ( i = 0; i < (debug_stack_lines*8); i++ )
    {
        if ( ((long)stack & (STACK_SIZE-1)) == 0 )
            break;
        if ( (i != 0) && ((i % 8) == 0) )
            printk("\n ");
        addr = *stack++;
        printk("%p ", _p(addr));
    }
    if ( i == 0 )
        printk("Stack empty.");
    printk("\n");

    show_trace(esp);
}

/*
 * This is called for faults at very unexpected times (e.g., when interrupts
 * are disabled). In such situations we can't do much that is safe. We try to
 * print out some tracing and then we just spin.
 */
asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs)
{
    int cpu = smp_processor_id();
    unsigned long cr2;
    static char *trapstr[] = {
        "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
        "invalid operation", "device not available", "double fault",
        "coprocessor segment", "invalid tss", "segment not found",
        "stack error", "general protection fault", "page fault",
        "spurious interrupt", "coprocessor error", "alignment check",
        "machine check", "simd error"
    };

    watchdog_disable();
    console_start_sync();

    show_registers(regs);

    if ( trapnr == TRAP_page_fault )
    {
        __asm__ __volatile__ ("mov %%cr2,%0" : "=r" (cr2) : );
        printk("Faulting linear address: %p\n", _p(cr2));
        show_page_walk(cr2);
    }

    printk("************************************\n");
    printk("CPU%d FATAL TRAP %d (%s), ERROR_CODE %04x%s.\n",
           cpu, trapnr, trapstr[trapnr], regs->error_code,
           (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
    printk("System shutting down -- need manual reset.\n");
    printk("************************************\n");

    (void)debugger_trap_fatal(trapnr, regs);

    /* Lock up the console to prevent spurious output from other CPUs. */
    console_force_lock();

    /* Wait for manual reset. */
    for ( ; ; )
        __asm__ __volatile__ ( "hlt" );
}

static inline int do_trap(int trapnr, char *str,
                          struct cpu_user_regs *regs,
                          int use_error_code)
{
    struct vcpu *v = current;
    struct trap_bounce *tb = &v->arch.trap_bounce;
    trap_info_t *ti;
    unsigned long fixup;

    DEBUGGER_trap_entry(trapnr, regs);

    if ( !GUEST_MODE(regs) )
        goto xen_fault;

    ti = &current->arch.guest_context.trap_ctxt[trapnr];
    tb->flags = TBF_EXCEPTION;
    tb->cs = ti->cs;
    tb->eip = ti->address;
    if ( use_error_code )
    {
        tb->flags |= TBF_EXCEPTION_ERRCODE;
        tb->error_code = regs->error_code;
    }
    if ( TI_GET_IF(ti) )
        tb->flags |= TBF_INTERRUPT;
    return 0;

 xen_fault:

    if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
    {
        DPRINTK("Trap %d: %p -> %p\n", trapnr, _p(regs->eip), _p(fixup));
        regs->eip = fixup;
        return 0;
    }

    DEBUGGER_trap_fatal(trapnr, regs);

    show_registers(regs);
    panic("CPU%d FATAL TRAP: vector = %d (%s)\n"
          "[error_code=%04x]\n",
          smp_processor_id(), trapnr, str, regs->error_code);
    return 0;
}

#define DO_ERROR_NOCODE(trapnr, str, name) \
asmlinkage int do_##name(struct cpu_user_regs *regs) \
{ \
    return do_trap(trapnr, str, regs, 0); \
}

#define DO_ERROR(trapnr, str, name) \
asmlinkage int do_##name(struct cpu_user_regs *regs) \
{ \
    return do_trap(trapnr, str, regs, 1); \
}
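
/*
 * For instance, DO_ERROR(10, "invalid TSS", invalid_TSS) below generates
 * do_invalid_TSS(), which bounces the fault, error code included, to the
 * guest's registered handler via do_trap().
 */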

DO_ERROR_NOCODE( 0, "divide error", divide_error)
DO_ERROR_NOCODE( 4, "overflow", overflow)
DO_ERROR_NOCODE( 5, "bounds", bounds)
DO_ERROR_NOCODE( 6, "invalid operand", invalid_op)
DO_ERROR_NOCODE( 9, "coprocessor segment overrun", coprocessor_segment_overrun)
DO_ERROR(10, "invalid TSS", invalid_TSS)
DO_ERROR(11, "segment not present", segment_not_present)
DO_ERROR(12, "stack segment", stack_segment)
DO_ERROR_NOCODE(16, "fpu error", coprocessor_error)
DO_ERROR(17, "alignment check", alignment_check)
DO_ERROR_NOCODE(19, "simd error", simd_coprocessor_error)

asmlinkage int do_int3(struct cpu_user_regs *regs)
{
    struct vcpu *v = current;
    struct trap_bounce *tb = &v->arch.trap_bounce;
    trap_info_t *ti;

    DEBUGGER_trap_entry(TRAP_int3, regs);

    if ( !GUEST_MODE(regs) )
    {
        DEBUGGER_trap_fatal(TRAP_int3, regs);
        show_registers(regs);
        panic("CPU%d FATAL TRAP: vector = 3 (Int3)\n", smp_processor_id());
    }

    ti = &current->arch.guest_context.trap_ctxt[TRAP_int3];
    tb->flags = TBF_EXCEPTION;
    tb->cs = ti->cs;
    tb->eip = ti->address;
    if ( TI_GET_IF(ti) )
        tb->flags |= TBF_INTERRUPT;

    return 0;
}

asmlinkage int do_machine_check(struct cpu_user_regs *regs)
{
    fatal_trap(TRAP_machine_check, regs);
    return 0;
}

void propagate_page_fault(unsigned long addr, u16 error_code)
{
    trap_info_t *ti;
    struct vcpu *v = current;
    struct trap_bounce *tb = &v->arch.trap_bounce;

    ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
    tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE | TBF_EXCEPTION_CR2;
    tb->cr2 = addr;
    tb->error_code = error_code;
    tb->cs = ti->cs;
    tb->eip = ti->address;
    if ( TI_GET_IF(ti) )
        tb->flags |= TBF_INTERRUPT;

    v->arch.guest_context.ctrlreg[2] = addr;
}
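
/*
 * The bounce frame carries the error code and %cr2 to the guest's #PF
 * handler, while ctrlreg[2] updates the guest's virtualised %cr2 so its
 * kernel can read the faulting address just as it would natively.
 */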

static int handle_perdomain_mapping_fault(
    unsigned long offset, struct cpu_user_regs *regs)
{
    extern int map_ldt_shadow_page(unsigned int);

    struct vcpu *v = current;
    struct domain *d = v->domain;
    int ret;

    /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
    unsigned int is_ldt_area = (offset >> (PDPT_VCPU_VA_SHIFT-1)) & 1;
    unsigned int vcpu_area = (offset >> PDPT_VCPU_VA_SHIFT);
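
    /*
     * Layout implied above: each vcpu owns a 2^PDPT_VCPU_VA_SHIFT-byte slot
     * in the per-domain area, split into two equal halves; the offset bits
     * decode as [ vcpu_id | gdt=0/ldt=1 | byte offset within sub-area ].
     */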

    /* Should never fault in another vcpu's area. */
    BUG_ON(vcpu_area != current->vcpu_id);

    /* Byte offset within the gdt/ldt sub-area. */
    offset &= (1UL << (PDPT_VCPU_VA_SHIFT-1)) - 1UL;

    if ( likely(is_ldt_area) )
    {
        /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
        LOCK_BIGLOCK(d);
        ret = map_ldt_shadow_page(offset >> PAGE_SHIFT);
        UNLOCK_BIGLOCK(d);

        if ( unlikely(ret == 0) )
        {
            /* In hypervisor mode? Leave it to the #PF handler to fix up. */
            if ( !GUEST_MODE(regs) )
                return 0;
            /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */
            propagate_page_fault(
                v->arch.guest_context.ldt_base + offset, regs->error_code);
        }
    }
    else
    {
        /* GDT fault: handle the fault as #GP(selector). */
        regs->error_code = (u16)offset & ~7;
        (void)do_general_protection(regs);
    }

    return EXCRET_fault_fixed;
}
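
/*
 * The handler below resolves faults in priority order -- writable-pagetable
 * fixups, then shadow-mode faults, then per-domain (GDT/LDT) area faults --
 * before bouncing the fault to the guest or consulting Xen's own
 * exception-fixup table.
 */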

asmlinkage int do_page_fault(struct cpu_user_regs *regs)
{
    unsigned long addr, fixup;
    struct vcpu *v = current;
    struct domain *d = v->domain;

    __asm__ __volatile__ ("mov %%cr2,%0" : "=r" (addr) : );

    DEBUGGER_trap_entry(TRAP_page_fault, regs);

    perfc_incrc(page_faults);

    if ( likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
                !shadow_mode_enabled(d)) )
    {
        LOCK_BIGLOCK(d);
        if ( unlikely(d->arch.ptwr[PTWR_PT_ACTIVE].l1va) &&
             unlikely(l2_linear_offset(addr) ==
                      d->arch.ptwr[PTWR_PT_ACTIVE].l2_idx) )
        {
            ptwr_flush(d, PTWR_PT_ACTIVE);
            UNLOCK_BIGLOCK(d);
            return EXCRET_fault_fixed;
        }

        if ( ((addr < HYPERVISOR_VIRT_START)
#if defined(__x86_64__)
              || (addr >= HYPERVISOR_VIRT_END)
#endif
                 )
             &&
             KERNEL_MODE(v, regs) &&
             ((regs->error_code & 3) == 3) && /* write-protection fault */
             ptwr_do_page_fault(d, addr, regs) )
        {
            UNLOCK_BIGLOCK(d);
            return EXCRET_fault_fixed;
        }
        UNLOCK_BIGLOCK(d);
    }

    if ( unlikely(shadow_mode_enabled(d)) &&
         ((addr < HYPERVISOR_VIRT_START) ||
#if defined(__x86_64__)
          (addr >= HYPERVISOR_VIRT_END) ||
#endif
          (shadow_mode_external(d) && GUEST_CONTEXT(v, regs))) &&
         shadow_fault(addr, regs) )
        return EXCRET_fault_fixed;

    if ( unlikely(addr >= PERDOMAIN_VIRT_START) &&
         unlikely(addr < PERDOMAIN_VIRT_END) &&
         handle_perdomain_mapping_fault(addr - PERDOMAIN_VIRT_START, regs) )
        return EXCRET_fault_fixed;

    if ( !GUEST_MODE(regs) )
        goto xen_fault;

    propagate_page_fault(addr, regs->error_code);
    return 0;

 xen_fault:

    if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
    {
        perfc_incrc(copy_user_faults);
        regs->eip = fixup;
        return 0;
    }

    DEBUGGER_trap_fatal(TRAP_page_fault, regs);

    show_registers(regs);
    show_page_walk(addr);
    panic("CPU%d FATAL PAGE FAULT\n"
          "[error_code=%04x]\n"
          "Faulting linear address: %p\n",
          smp_processor_id(), regs->error_code, addr);
    return 0;
}

long do_fpu_taskswitch(int set)
{
    struct vcpu *v = current;

    if ( set )
    {
        v->arch.guest_context.ctrlreg[0] |= X86_CR0_TS;
        stts();
    }
    else
    {
        v->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
        if ( test_bit(_VCPUF_fpu_dirtied, &v->vcpu_flags) )
            clts();
    }

    return 0;
}

/* Has the guest requested sufficient permission for this I/O access? */
static inline int guest_io_okay(
    unsigned int port, unsigned int bytes,
    struct vcpu *v, struct cpu_user_regs *regs)
{
    u16 x;
#if defined(__x86_64__)
    /* If in user mode, switch to kernel mode just to read I/O bitmap. */
    extern void toggle_guest_mode(struct vcpu *);
    int user_mode = !(v->arch.flags & TF_kernel_mode);
#define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
#elif defined(__i386__)
#define TOGGLE_MODE() ((void)0)
#endif

    if ( v->arch.iopl >= (KERNEL_MODE(v, regs) ? 1 : 3) )
        return 1;

    if ( v->arch.iobmp_limit > (port + bytes) )
    {
        TOGGLE_MODE();
        __get_user(x, (u16 *)(v->arch.iobmp+(port>>3)));
        TOGGLE_MODE();
        if ( (x & (((1<<bytes)-1) << (port&7))) == 0 )
            return 1;
    }

    return 0;
}
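
/*
 * As in the hardware TSS I/O bitmap, a clear bit means the port is
 * accessible: the access is granted only if every bit covering ports
 * [port, port+bytes) is zero.
 */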

/* Has the administrator granted sufficient permission for this I/O access? */
static inline int admin_io_okay(
    unsigned int port, unsigned int bytes,
    struct vcpu *v, struct cpu_user_regs *regs)
{
    struct domain *d = v->domain;
    u16 x;

    if ( d->arch.iobmp_mask != NULL )
    {
        x = *(u16 *)(d->arch.iobmp_mask + (port >> 3));
        if ( (x & (((1<<bytes)-1) << (port&7))) == 0 )
            return 1;
    }

    return 0;
}

/* Check admin limits. Silently fail the access if it is disallowed. */
#define inb_user(_p, _d, _r) (admin_io_okay(_p, 1, _d, _r) ? inb(_p) : ~0)
#define inw_user(_p, _d, _r) (admin_io_okay(_p, 2, _d, _r) ? inw(_p) : ~0)
#define inl_user(_p, _d, _r) (admin_io_okay(_p, 4, _d, _r) ? inl(_p) : ~0)
#define outb_user(_v, _p, _d, _r) \
    (admin_io_okay(_p, 1, _d, _r) ? outb(_v, _p) : ((void)0))
#define outw_user(_v, _p, _d, _r) \
    (admin_io_okay(_p, 2, _d, _r) ? outw(_v, _p) : ((void)0))
#define outl_user(_v, _p, _d, _r) \
    (admin_io_okay(_p, 4, _d, _r) ? outl(_v, _p) : ((void)0))
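
/*
 * Disallowed reads return ~0 (all ones), the value an unclaimed ISA port
 * would float to, and disallowed writes are silently dropped, so a filtered
 * port looks to the guest much like an empty bus position.
 */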

/* Propagate a fault back to the guest kernel. */
#define USER_READ_FAULT  4 /* user mode, read fault */
#define USER_WRITE_FAULT 6 /* user mode, write fault */
#define PAGE_FAULT(_faultaddr, _errcode) \
({  propagate_page_fault(_faultaddr, _errcode); \
    return EXCRET_fault_fixed; \
})
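
/*
 * These values follow the x86 #PF error-code layout -- bit 0 P (0 =
 * not-present), bit 1 W/R (1 = write), bit 2 U/S (1 = user) -- hence
 * 4 = user-mode read and 6 = user-mode write of a not-present page.
 */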

/* Instruction fetch with error handling. */
#define insn_fetch(_type, _size, _ptr) \
({  unsigned long _x; \
    if ( get_user(_x, (_type *)eip) ) \
        PAGE_FAULT(eip, USER_READ_FAULT); \
    eip += _size; (_type)_x; })
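
/*
 * Typical use below: "opcode = insn_fetch(u8, 1, eip)" reads one byte at
 * the guest's eip, bounces a read #PF to the guest (returning from the
 * emulator) on failure, and advances the local eip copy by one.
 */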

static int emulate_privileged_op(struct cpu_user_regs *regs)
{
    struct vcpu *v = current;
    unsigned long *reg, eip = regs->eip, res;
    u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0;
    unsigned int port, i, op_bytes = 4, data;

    /* Legacy prefixes. */
    for ( i = 0; i < 8; i++ )
    {
        switch ( opcode = insn_fetch(u8, 1, eip) )
        {
        case 0x66: /* operand-size override */
            op_bytes ^= 6; /* switch between 2/4 bytes */
            break;
        case 0x67: /* address-size override */
        case 0x2e: /* CS override */
        case 0x3e: /* DS override */
        case 0x26: /* ES override */
        case 0x64: /* FS override */
        case 0x65: /* GS override */
        case 0x36: /* SS override */
        case 0xf0: /* LOCK */
        case 0xf2: /* REPNE/REPNZ */
            break;
        case 0xf3: /* REP/REPE/REPZ */
            rep_prefix = 1;
            break;
        default:
            goto done_prefixes;
        }
    }
 done_prefixes:

#ifdef __x86_64__
    /* REX prefix. */
    if ( (opcode & 0xf0) == 0x40 )
    {
        modrm_reg = (opcode & 4) << 1; /* REX.R */
        modrm_rm = (opcode & 1) << 3;  /* REX.B */

        /* REX.W and REX.X do not need to be decoded. */
        opcode = insn_fetch(u8, 1, eip);
    }
#endif

    /* Input/Output String instructions. */
    if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
    {
        if ( rep_prefix && (regs->ecx == 0) )
            goto done;

    continue_io_string:
        switch ( opcode )
        {
        case 0x6c: /* INSB */
            op_bytes = 1;
        case 0x6d: /* INSW/INSL */
            if ( !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
                goto fail;
            switch ( op_bytes )
            {
            case 1:
                data = (u8)inb_user((u16)regs->edx, v, regs);
                if ( put_user((u8)data, (u8 *)regs->edi) )
                    PAGE_FAULT(regs->edi, USER_WRITE_FAULT);
                break;
            case 2:
                data = (u16)inw_user((u16)regs->edx, v, regs);
                if ( put_user((u16)data, (u16 *)regs->edi) )
                    PAGE_FAULT(regs->edi, USER_WRITE_FAULT);
                break;
            case 4:
                data = (u32)inl_user((u16)regs->edx, v, regs);
                if ( put_user((u32)data, (u32 *)regs->edi) )
                    PAGE_FAULT(regs->edi, USER_WRITE_FAULT);
                break;
            }
            regs->edi += (regs->eflags & EF_DF) ? -op_bytes : op_bytes;
            break;

        case 0x6e: /* OUTSB */
            op_bytes = 1;
        case 0x6f: /* OUTSW/OUTSL */
            if ( !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
                goto fail;
            switch ( op_bytes )
            {
            case 1:
                if ( get_user(data, (u8 *)regs->esi) )
                    PAGE_FAULT(regs->esi, USER_READ_FAULT);
                outb_user((u8)data, (u16)regs->edx, v, regs);
                break;
            case 2:
                if ( get_user(data, (u16 *)regs->esi) )
                    PAGE_FAULT(regs->esi, USER_READ_FAULT);
                outw_user((u16)data, (u16)regs->edx, v, regs);
                break;
            case 4:
                if ( get_user(data, (u32 *)regs->esi) )
                    PAGE_FAULT(regs->esi, USER_READ_FAULT);
                outl_user((u32)data, (u16)regs->edx, v, regs);
                break;
            }
            regs->esi += (regs->eflags & EF_DF) ? -op_bytes : op_bytes;
            break;
        }

        if ( rep_prefix && (--regs->ecx != 0) )
        {
            if ( !hypercall_preempt_check() )
                goto continue_io_string;
            eip = regs->eip;
        }

        goto done;
    }
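
    /*
     * Note that when preemption is pending the string loop above exits with
     * eip reset to regs->eip: "done" then writes back an unchanged eip, so
     * the guest re-executes the INS/OUTS instruction, traps again, and the
     * remaining REP iterations resume here with the decremented count.
     */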

    /* I/O Port and Interrupt Flag instructions. */
    switch ( opcode )
    {
    case 0xe4: /* IN imm8,%al */
        op_bytes = 1;
    case 0xe5: /* IN imm8,%eax */
        port = insn_fetch(u8, 1, eip);
    exec_in:
        if ( !guest_io_okay(port, op_bytes, v, regs) )
            goto fail;
        switch ( op_bytes )
        {
        case 1:
            regs->eax &= ~0xffUL;
            regs->eax |= (u8)inb_user(port, v, regs);
            break;
        case 2:
            regs->eax &= ~0xffffUL;
            regs->eax |= (u16)inw_user(port, v, regs);
            break;
        case 4:
            regs->eax = (u32)inl_user(port, v, regs);
            break;
        }
        goto done;

    case 0xec: /* IN %dx,%al */
        op_bytes = 1;
    case 0xed: /* IN %dx,%eax */
        port = (u16)regs->edx;
        goto exec_in;

    case 0xe6: /* OUT %al,imm8 */
        op_bytes = 1;
    case 0xe7: /* OUT %eax,imm8 */
        port = insn_fetch(u8, 1, eip);
    exec_out:
        if ( !guest_io_okay(port, op_bytes, v, regs) )
            goto fail;
        switch ( op_bytes )
        {
        case 1:
            outb_user((u8)regs->eax, port, v, regs);
            break;
        case 2:
            outw_user((u16)regs->eax, port, v, regs);
            break;
        case 4:
            outl_user((u32)regs->eax, port, v, regs);
            break;
        }
        goto done;

    case 0xee: /* OUT %al,%dx */
        op_bytes = 1;
    case 0xef: /* OUT %eax,%dx */
        port = (u16)regs->edx;
        goto exec_out;

    case 0xfa: /* CLI */
    case 0xfb: /* STI */
        if ( v->arch.iopl < (KERNEL_MODE(v, regs) ? 1 : 3) )
            goto fail;
        /*
         * This is just too dangerous to allow, in my opinion. Consider if the
         * caller then tries to reenable interrupts using POPF: we can't trap
         * that and we'll end up with hard-to-debug lockups. Fast & loose will
         * do for us. :-)
         */
        /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
        goto done;

    case 0x0f: /* Two-byte opcode */
        break;

    default:
        goto fail;
    }

    /* Remaining instructions only emulated from guest kernel. */
    if ( !KERNEL_MODE(v, regs) )
        goto fail;

    /* Privileged (ring 0) instructions. */
    opcode = insn_fetch(u8, 1, eip);
    switch ( opcode )
    {
    case 0x06: /* CLTS */
        (void)do_fpu_taskswitch(0);
        break;

    case 0x09: /* WBINVD */
        /* Ignore the instruction if unprivileged. */
        if ( !IS_CAPABLE_PHYSDEV(v->domain) )
            DPRINTK("Non-physdev domain attempted WBINVD.\n");
        else
            wbinvd();
        break;

    case 0x20: /* MOV CR?,<reg> */
        opcode = insn_fetch(u8, 1, eip);
        modrm_reg |= (opcode >> 3) & 7;
        modrm_rm |= (opcode >> 0) & 7;
        reg = decode_register(modrm_rm, regs, 0);
        switch ( modrm_reg )
        {
        case 0: /* Read CR0 */
            *reg = v->arch.guest_context.ctrlreg[0];
            break;

        case 2: /* Read CR2 */
            *reg = v->arch.guest_context.ctrlreg[2];
            break;

        case 3: /* Read CR3 */
            *reg = pagetable_get_paddr(v->arch.guest_table);
            break;

        default:
            goto fail;
        }
        break;

    case 0x21: /* MOV DR?,<reg> */
        opcode = insn_fetch(u8, 1, eip);
        modrm_reg |= (opcode >> 3) & 7;
        modrm_rm |= (opcode >> 0) & 7;
        reg = decode_register(modrm_rm, regs, 0);
        if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
            goto fail;
        *reg = res;
        break;

    case 0x22: /* MOV <reg>,CR? */
        opcode = insn_fetch(u8, 1, eip);
        modrm_reg |= (opcode >> 3) & 7;
        modrm_rm |= (opcode >> 0) & 7;
        reg = decode_register(modrm_rm, regs, 0);
        switch ( modrm_reg )
        {
        case 0: /* Write CR0 */
            (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
            break;

        case 2: /* Write CR2 */
            v->arch.guest_context.ctrlreg[2] = *reg;
            break;

        case 3: /* Write CR3 */
            LOCK_BIGLOCK(v->domain);
            (void)new_guest_cr3(*reg);
            UNLOCK_BIGLOCK(v->domain);
            break;

        default:
            goto fail;
        }
        break;

    case 0x23: /* MOV <reg>,DR? */
        opcode = insn_fetch(u8, 1, eip);
        modrm_reg |= (opcode >> 3) & 7;
        modrm_rm |= (opcode >> 0) & 7;
        reg = decode_register(modrm_rm, regs, 0);
        if ( do_set_debugreg(modrm_reg, *reg) != 0 )
            goto fail;
        break;

    case 0x30: /* WRMSR */
        /* Ignore the instruction if unprivileged. */
        if ( !IS_PRIV(v->domain) )
            DPRINTK("Non-priv domain attempted WRMSR(%p,%08lx,%08lx).\n",
                    _p(regs->ecx), (long)regs->eax, (long)regs->edx);
        else if ( wrmsr_user(regs->ecx, regs->eax, regs->edx) )
            goto fail;
        break;

    case 0x32: /* RDMSR */
        if ( !IS_PRIV(v->domain) )
            DPRINTK("Non-priv domain attempted RDMSR(%p,%08lx,%08lx).\n",
                    _p(regs->ecx), (long)regs->eax, (long)regs->edx);
        /* Everyone can read the MSR space. */
        if ( rdmsr_user(regs->ecx, regs->eax, regs->edx) )
            goto fail;
        break;

    default:
        goto fail;
    }

 done:
    regs->eip = eip;
    return EXCRET_fault_fixed;

 fail:
    return 0;
}

asmlinkage int do_general_protection(struct cpu_user_regs *regs)
{
    struct vcpu *v = current;
    struct trap_bounce *tb = &v->arch.trap_bounce;
    trap_info_t *ti;
    unsigned long fixup;

    DEBUGGER_trap_entry(TRAP_gp_fault, regs);

    if ( regs->error_code & 1 )
        goto hardware_gp;

    if ( !GUEST_MODE(regs) )
        goto gp_in_kernel;

    /*
     * Cunning trick to allow arbitrary "INT n" handling.
     *
     * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
     * instruction from trapping to the appropriate vector, when that might not
     * be expected by Xen or the guest OS. For example, that entry might be for
     * a fault handler (unlike traps, faults don't increment EIP), or might
     * expect an error code on the stack (which a software trap never
     * provides), or might be a hardware interrupt handler that doesn't like
     * being called spuriously.
     *
     * Instead, a GPF occurs with the faulting IDT vector in the error code.
     * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
     * clear to indicate that it's a software fault, not hardware.
     *
     * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
     * okay because they can only be triggered by an explicit DPL-checked
     * instruction. The DPL specified by the guest OS for these vectors is NOT
     * CHECKED!!
     */
    if ( (regs->error_code & 3) == 2 )
    {
        /* This fault must be due to <INT n> instruction. */
        ti = &current->arch.guest_context.trap_ctxt[regs->error_code>>3];
        if ( PERMIT_SOFTINT(TI_GET_DPL(ti), v, regs) )
        {
            tb->flags = TBF_EXCEPTION;
            regs->eip += 2;
            goto finish_propagation;
        }
    }
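
    /*
     * This is the path a guest system call such as "int $0x80" takes when
     * the guest registered that vector with DPL 3 via set_trap_table().
     * eip is advanced past the two-byte INT instruction before the bounce,
     * as the guest handler must not re-execute it.
     */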

    /* Emulate some simple privileged and I/O instructions. */
    if ( (regs->error_code == 0) &&
         emulate_privileged_op(regs) )
        return 0;

#if defined(__i386__)
    if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
         (regs->error_code == 0) &&
         gpf_emulate_4gb(regs) )
        return 0;
#endif

    /* Pass on GPF as is. */
    ti = &current->arch.guest_context.trap_ctxt[TRAP_gp_fault];
    tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
    tb->error_code = regs->error_code;
 finish_propagation:
    tb->cs = ti->cs;
    tb->eip = ti->address;
    if ( TI_GET_IF(ti) )
        tb->flags |= TBF_INTERRUPT;
    return 0;

 gp_in_kernel:

    if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
    {
        DPRINTK("GPF (%04x): %p -> %p\n",
                regs->error_code, _p(regs->eip), _p(fixup));
        regs->eip = fixup;
        return 0;
    }

    DEBUGGER_trap_fatal(TRAP_gp_fault, regs);

 hardware_gp:
    show_registers(regs);
    panic("CPU%d GENERAL PROTECTION FAULT\n[error_code=%04x]\n",
          smp_processor_id(), regs->error_code);
    return 0;
}

unsigned long nmi_softirq_reason;
static void nmi_softirq(void)
{
    if ( dom0 == NULL )
        return;

    if ( test_and_clear_bit(0, &nmi_softirq_reason) )
        send_guest_virq(dom0->vcpu[0], VIRQ_PARITY_ERR);

    if ( test_and_clear_bit(1, &nmi_softirq_reason) )
        send_guest_virq(dom0->vcpu[0], VIRQ_IO_ERR);
}
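
/*
 * Bit 0 of nmi_softirq_reason requests VIRQ_PARITY_ERR and bit 1 requests
 * VIRQ_IO_ERR, matching the set_bit() calls in the two NMI handlers below;
 * the softirq defers virq delivery until we are safely out of NMI context.
 */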

asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
{
    /* Clear and disable the parity-error line. */
    outb((inb(0x61)&15)|4,0x61);

    switch ( opt_nmi[0] )
    {
    case 'd': /* 'dom0' */
        set_bit(0, &nmi_softirq_reason);
        raise_softirq(NMI_SOFTIRQ);
    case 'i': /* 'ignore' */
        break;
    default:  /* 'fatal' */
        console_force_unlock();
        printk("\n\nNMI - MEMORY ERROR\n");
        fatal_trap(TRAP_nmi, regs);
    }
}

asmlinkage void io_check_error(struct cpu_user_regs *regs)
{
    /* Clear and disable the I/O-error line. */
    outb((inb(0x61)&15)|8,0x61);

    switch ( opt_nmi[0] )
    {
    case 'd': /* 'dom0' */
        set_bit(1, &nmi_softirq_reason);
        raise_softirq(NMI_SOFTIRQ);
    case 'i': /* 'ignore' */
        break;
    default:  /* 'fatal' */
        console_force_unlock();
        printk("\n\nNMI - I/O ERROR\n");
        fatal_trap(TRAP_nmi, regs);
    }
}

static void unknown_nmi_error(unsigned char reason)
{
    printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
    printk("Dazed and confused, but trying to continue\n");
    printk("Do you have a strange power saving mode enabled?\n");
}

asmlinkage void do_nmi(struct cpu_user_regs *regs, unsigned long reason)
{
    ++nmi_count(smp_processor_id());

    if ( nmi_watchdog )
        nmi_watchdog_tick(regs);

    if ( reason & 0x80 )
        mem_parity_error(regs);
    else if ( reason & 0x40 )
        io_check_error(regs);
    else if ( !nmi_watchdog )
        unknown_nmi_error((unsigned char)(reason&0xff));
}

asmlinkage int math_state_restore(struct cpu_user_regs *regs)
{
    /* Prevent recursion. */
    clts();

    setup_fpu(current);

    if ( current->arch.guest_context.ctrlreg[0] & X86_CR0_TS )
    {
        struct trap_bounce *tb = &current->arch.trap_bounce;
        tb->flags = TBF_EXCEPTION;
        tb->cs = current->arch.guest_context.trap_ctxt[7].cs;
        tb->eip = current->arch.guest_context.trap_ctxt[7].address;
        current->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
    }

    return EXCRET_fault_fixed;
}
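
/*
 * trap_ctxt[7] above is the guest's handler for vector 7 (#NM, device not
 * available): if the guest's virtual CR0 has TS set, the exception is
 * bounced so the guest can do its own lazy FPU switching, with the real TS
 * bit already cleared to prevent recursion.
 */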

asmlinkage int do_debug(struct cpu_user_regs *regs)
{
    unsigned long condition;
    struct vcpu *v = current;
    struct trap_bounce *tb = &v->arch.trap_bounce;

    __asm__ __volatile__("mov %%db6,%0" : "=r" (condition));

    /* Mask out spurious debug traps due to lazy DR7 setting */
    if ( (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) &&
         (v->arch.guest_context.debugreg[7] == 0) )
    {
        __asm__("mov %0,%%db7" : : "r" (0UL));
        goto out;
    }

    DEBUGGER_trap_entry(TRAP_debug, regs);

    if ( !GUEST_MODE(regs) )
    {
        /* Clear TF just for absolute sanity. */
        regs->eflags &= ~EF_TF;
        /*
         * We ignore watchpoints when they trigger within Xen. This may happen
         * when a buffer is passed to us which previously had a watchpoint set
         * on it. No need to bump EIP; the only faulting trap is an instruction
         * breakpoint, which can't happen to us.
         */
        goto out;
    }

    /* Save debug status register where guest OS can peek at it */
    v->arch.guest_context.debugreg[6] = condition;

    tb->flags = TBF_EXCEPTION;
    tb->cs = v->arch.guest_context.trap_ctxt[TRAP_debug].cs;
    tb->eip = v->arch.guest_context.trap_ctxt[TRAP_debug].address;

 out:
    return EXCRET_not_a_fault;
}

asmlinkage int do_spurious_interrupt_bug(struct cpu_user_regs *regs)
{
    return EXCRET_not_a_fault;
}

void set_intr_gate(unsigned int n, void *addr)
{
#ifdef __i386__
    int i;
    /* Keep secondary tables in sync with IRQ updates. */
    for ( i = 1; i < NR_CPUS; i++ )
        if ( idt_tables[i] != NULL )
            _set_gate(&idt_tables[i][n], 14, 0, addr);
#endif
    _set_gate(&idt_table[n], 14, 0, addr);
}

void set_system_gate(unsigned int n, void *addr)
{
    _set_gate(idt_table+n,14,3,addr);
}
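
/*
 * Both helpers install type-14 (interrupt) gates; the third _set_gate()
 * argument is the descriptor DPL, so "system" gates get DPL 3 and may be
 * invoked directly by guest code (e.g. INT3 or INTO), whereas ordinary
 * interrupt gates get DPL 0.
 */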

void set_task_gate(unsigned int n, unsigned int sel)
{
    idt_table[n].a = sel << 16;
    idt_table[n].b = 0x8500;
}

void set_tss_desc(unsigned int n, void *addr)
{
    _set_tssldt_desc(
        gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
        (unsigned long)addr,
        offsetof(struct tss_struct, __cacheline_filler) - 1,
        9);
}

void __init trap_init(void)
{
    extern void percpu_traps_init(void);

    /*
     * Note that interrupt gates are always used, rather than trap gates. We
     * must have interrupts disabled until DS/ES/FS/GS are saved because the
     * first activation must have the "bad" value(s) for these registers and
     * we may lose them if another activation is installed before they are
     * saved. The page-fault handler also needs interrupts disabled until %cr2
     * has been read and saved on the stack.
     */
    set_intr_gate(TRAP_divide_error,&divide_error);
    set_intr_gate(TRAP_debug,&debug);
    set_intr_gate(TRAP_nmi,&nmi);
    set_system_gate(TRAP_int3,&int3);         /* usable from all privileges */
    set_system_gate(TRAP_overflow,&overflow); /* usable from all privileges */
    set_intr_gate(TRAP_bounds,&bounds);
    set_intr_gate(TRAP_invalid_op,&invalid_op);
    set_intr_gate(TRAP_no_device,&device_not_available);
    set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
    set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
    set_intr_gate(TRAP_no_segment,&segment_not_present);
    set_intr_gate(TRAP_stack_error,&stack_segment);
    set_intr_gate(TRAP_gp_fault,&general_protection);
    set_intr_gate(TRAP_page_fault,&page_fault);
    set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
    set_intr_gate(TRAP_copro_error,&coprocessor_error);
    set_intr_gate(TRAP_alignment_check,&alignment_check);
    set_intr_gate(TRAP_machine_check,&machine_check);
    set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);

    percpu_traps_init();

    cpu_init();

    open_softirq(NMI_SOFTIRQ, nmi_softirq);
}

long do_set_trap_table(trap_info_t *traps)
{
    trap_info_t cur;
    trap_info_t *dst = current->arch.guest_context.trap_ctxt;
    long rc = 0;

    LOCK_BIGLOCK(current->domain);

    for ( ; ; )
    {
        if ( hypercall_preempt_check() )
        {
            rc = hypercall1_create_continuation(
                __HYPERVISOR_set_trap_table, traps);
            break;
        }

        if ( copy_from_user(&cur, traps, sizeof(cur)) )
        {
            rc = -EFAULT;
            break;
        }

        if ( cur.address == 0 )
            break;

        if ( !VALID_CODESEL(cur.cs) )
        {
            rc = -EPERM;
            break;
        }

        memcpy(&dst[cur.vector], &cur, sizeof(cur));

        if ( cur.vector == 0x80 )
            init_int80_direct_trap(current);

        traps++;
    }

    UNLOCK_BIGLOCK(current->domain);

    return rc;
}
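
/*
 * A guest registers its virtual IDT by passing a zero-terminated array, in
 * rough outline (illustrative only; KERNEL_CS and the entry symbols are the
 * guest's own):
 *
 *   trap_info_t t[] = {
 *       { 14,   0, KERNEL_CS, (unsigned long)page_fault_entry },
 *       { 0x80, 3, KERNEL_CS, (unsigned long)syscall_entry },
 *       { 0, 0, 0, 0 }
 *   };
 *   HYPERVISOR_set_trap_table(t);
 *
 * Flags bits 0-1 give the DPL, so the 0x80 entry is callable from ring 3.
 */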

long set_debugreg(struct vcpu *p, int reg, unsigned long value)
{
    int i;

    switch ( reg )
    {
    case 0:
        if ( !access_ok(value, sizeof(long)) )
            return -EPERM;
        if ( p == current )
            __asm__ ( "mov %0, %%db0" : : "r" (value) );
        break;
    case 1:
        if ( !access_ok(value, sizeof(long)) )
            return -EPERM;
        if ( p == current )
            __asm__ ( "mov %0, %%db1" : : "r" (value) );
        break;
    case 2:
        if ( !access_ok(value, sizeof(long)) )
            return -EPERM;
        if ( p == current )
            __asm__ ( "mov %0, %%db2" : : "r" (value) );
        break;
    case 3:
        if ( !access_ok(value, sizeof(long)) )
            return -EPERM;
        if ( p == current )
            __asm__ ( "mov %0, %%db3" : : "r" (value) );
        break;
    case 6:
        /*
         * DR6: Bits 4-11,16-31 reserved (set to 1).
         *      Bit 12 reserved (set to 0).
         */
        value &= 0xffffefff; /* reserved bits => 0 */
        value |= 0xffff0ff0; /* reserved bits => 1 */
        if ( p == current )
            __asm__ ( "mov %0, %%db6" : : "r" (value) );
        break;
    case 7:
        /*
         * DR7: Bit 10 reserved (set to 1).
         *      Bits 11-12,14-15 reserved (set to 0).
         * Privileged bits:
         *      GD (bit 13): must be 0.
         *      R/Wn (bits 16-17,20-21,24-25,28-29): mustn't be 10.
         *      LENn (bits 18-19,22-23,26-27,30-31): mustn't be 10.
         */
        /* DR7 == 0 => debugging disabled for this domain. */
        if ( value != 0 )
        {
            value &= 0xffff27ff; /* reserved bits => 0 */
            value |= 0x00000400; /* reserved bits => 1 */
            if ( (value & (1<<13)) != 0 ) return -EPERM;
            for ( i = 0; i < 16; i += 2 )
                if ( ((value >> (i+16)) & 3) == 2 ) return -EPERM;
        }
        if ( p == current )
            __asm__ ( "mov %0, %%db7" : : "r" (value) );
        break;
    default:
        return -EINVAL;
    }

    p->arch.guest_context.debugreg[reg] = value;
    return 0;
}

long do_set_debugreg(int reg, unsigned long value)
{
    return set_debugreg(current, reg, value);
}

unsigned long do_get_debugreg(int reg)
{
    if ( (reg < 0) || (reg > 7) ) return -EINVAL;
    return current->arch.guest_context.debugreg[reg];
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */