direct-io.hg

view xen/arch/x86/traps.c @ 15454:83cbda5c1e1b

x86-64: bump STACK_SIZE to 32 so that trampoline and IST stacks fit
without undue squeezing.
Signed-off-by: Jan Beulich <jbeulich@novell.com>
author kfraser@localhost.localdomain
date Tue Jul 03 11:41:25 2007 +0100 (2007-07-03)
parents 3cf5052ba5e5
1 /******************************************************************************
2 * arch/x86/traps.c
3 *
4 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * Copyright (C) 1991, 1992 Linus Torvalds
23 *
24 * Pentium III FXSR, SSE support
25 * Gareth Hughes <gareth@valinux.com>, May 2000
26 */
28 #include <xen/config.h>
29 #include <xen/init.h>
30 #include <xen/sched.h>
31 #include <xen/lib.h>
32 #include <xen/errno.h>
33 #include <xen/mm.h>
34 #include <xen/console.h>
35 #include <xen/shutdown.h>
36 #include <asm/regs.h>
37 #include <xen/delay.h>
38 #include <xen/event.h>
39 #include <xen/spinlock.h>
40 #include <xen/irq.h>
41 #include <xen/perfc.h>
42 #include <xen/softirq.h>
43 #include <xen/domain_page.h>
44 #include <xen/symbols.h>
45 #include <xen/iocap.h>
46 #include <xen/nmi.h>
47 #include <xen/version.h>
48 #include <xen/kexec.h>
49 #include <asm/paging.h>
50 #include <asm/system.h>
51 #include <asm/io.h>
52 #include <asm/atomic.h>
53 #include <asm/desc.h>
54 #include <asm/debugreg.h>
55 #include <asm/smp.h>
56 #include <asm/flushtlb.h>
57 #include <asm/uaccess.h>
58 #include <asm/i387.h>
59 #include <asm/debugger.h>
60 #include <asm/msr.h>
61 #include <asm/shared.h>
62 #include <asm/x86_emulate.h>
63 #include <asm/hvm/vpt.h>
65 /*
66 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
67 * fatal: Xen prints diagnostic message and then hangs.
68 * dom0: The NMI is virtualised to DOM0.
69 * ignore: The NMI error is cleared and ignored.
70 */
71 #ifdef NDEBUG
72 char opt_nmi[10] = "dom0";
73 #else
74 char opt_nmi[10] = "fatal";
75 #endif
76 string_param("nmi", opt_nmi);
78 /* Master table, used by CPU0. */
79 idt_entry_t idt_table[IDT_ENTRIES];
81 /* Pointer to the IDT of every CPU. */
82 idt_entry_t *idt_tables[NR_CPUS] __read_mostly;
84 #define DECLARE_TRAP_HANDLER(_name) \
85 asmlinkage void _name(void); \
86 asmlinkage int do_ ## _name(struct cpu_user_regs *regs)
88 asmlinkage void nmi(void);
89 asmlinkage void machine_check(void);
90 DECLARE_TRAP_HANDLER(divide_error);
91 DECLARE_TRAP_HANDLER(debug);
92 DECLARE_TRAP_HANDLER(int3);
93 DECLARE_TRAP_HANDLER(overflow);
94 DECLARE_TRAP_HANDLER(bounds);
95 DECLARE_TRAP_HANDLER(invalid_op);
96 DECLARE_TRAP_HANDLER(device_not_available);
97 DECLARE_TRAP_HANDLER(coprocessor_segment_overrun);
98 DECLARE_TRAP_HANDLER(invalid_TSS);
99 DECLARE_TRAP_HANDLER(segment_not_present);
100 DECLARE_TRAP_HANDLER(stack_segment);
101 DECLARE_TRAP_HANDLER(general_protection);
102 DECLARE_TRAP_HANDLER(page_fault);
103 DECLARE_TRAP_HANDLER(coprocessor_error);
104 DECLARE_TRAP_HANDLER(simd_coprocessor_error);
105 DECLARE_TRAP_HANDLER(alignment_check);
106 DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
108 long do_set_debugreg(int reg, unsigned long value);
109 unsigned long do_get_debugreg(int reg);
111 static int debug_stack_lines = 20;
112 integer_param("debug_stack_lines", debug_stack_lines);
114 #ifdef CONFIG_X86_32
115 #define stack_words_per_line 8
116 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
117 #else
118 #define stack_words_per_line 4
119 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
120 #endif
122 static void show_guest_stack(struct cpu_user_regs *regs)
123 {
124 int i;
125 unsigned long *stack, addr;
127 if ( is_hvm_vcpu(current) )
128 return;
130 if ( is_pv_32on64_vcpu(current) )
131 {
132 compat_show_guest_stack(regs, debug_stack_lines);
133 return;
134 }
136 if ( vm86_mode(regs) )
137 {
138 stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff));
139 printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ",
140 regs->ss, (uint16_t)(regs->esp & 0xffff));
141 }
142 else
143 {
144 stack = (unsigned long *)regs->esp;
145 printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
146 }
148 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
149 {
150 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
151 break;
152 if ( get_user(addr, stack) )
153 {
154 if ( i != 0 )
155 printk("\n ");
156 printk("Fault while accessing guest memory.");
157 i = 1;
158 break;
159 }
160 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
161 printk("\n ");
162 printk(" %p", _p(addr));
163 stack++;
164 }
165 if ( i == 0 )
166 printk("Stack empty.");
167 printk("\n");
168 }
170 #if !defined(CONFIG_FRAME_POINTER)
172 static void show_trace(struct cpu_user_regs *regs)
173 {
174 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
176 printk("Xen call trace:\n ");
178 printk("[<%p>]", _p(regs->eip));
179 print_symbol(" %s\n ", regs->eip);
181 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
182 {
183 addr = *stack++;
184 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
185 {
186 printk("[<%p>]", _p(addr));
187 print_symbol(" %s\n ", addr);
188 }
189 }
191 printk("\n");
192 }
194 #else
196 static void show_trace(struct cpu_user_regs *regs)
197 {
198 unsigned long *frame, next, addr, low, high;
200 printk("Xen call trace:\n ");
202 printk("[<%p>]", _p(regs->eip));
203 print_symbol(" %s\n ", regs->eip);
205 /* Bounds for range of valid frame pointer. */
206 low = (unsigned long)(ESP_BEFORE_EXCEPTION(regs) - 2);
207 high = (low & ~(STACK_SIZE - 1)) +
208 (STACK_SIZE - sizeof(struct cpu_info) - 2*sizeof(unsigned long));
210 /* The initial frame pointer. */
211 next = regs->ebp;
213 for ( ; ; )
214 {
215 /* Valid frame pointer? */
216 if ( (next < low) || (next >= high) )
217 {
218 /*
219 * Exception stack frames have a different layout, denoted by an
220 * inverted frame pointer.
221 */
222 next = ~next;
223 if ( (next < low) || (next >= high) )
224 break;
225 frame = (unsigned long *)next;
226 next = frame[0];
227 addr = frame[(offsetof(struct cpu_user_regs, eip) -
228 offsetof(struct cpu_user_regs, ebp))
229 / BYTES_PER_LONG];
230 }
231 else
232 {
233 /* Ordinary stack frame. */
234 frame = (unsigned long *)next;
235 next = frame[0];
236 addr = frame[1];
237 }
239 printk("[<%p>]", _p(addr));
240 print_symbol(" %s\n ", addr);
242 low = (unsigned long)&frame[2];
243 }
245 printk("\n");
246 }
248 #endif
250 void show_stack(struct cpu_user_regs *regs)
251 {
252 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
253 int i;
255 if ( guest_mode(regs) )
256 return show_guest_stack(regs);
258 printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
260 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
261 {
262 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
263 break;
264 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
265 printk("\n ");
266 addr = *stack++;
267 printk(" %p", _p(addr));
268 }
269 if ( i == 0 )
270 printk("Stack empty.");
271 printk("\n");
273 show_trace(regs);
274 }
276 void show_stack_overflow(unsigned int cpu, unsigned long esp)
277 {
278 #ifdef MEMORY_GUARD
279 unsigned long esp_top, esp_bottom;
280 unsigned long *stack, addr;
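/* Note (editorial): the stack grows down; esp_bottom is the exclusive upper
 * bound of this CPU's stack allocation, and the primary stack occupies its
 * topmost PRIMARY_STACK_SIZE bytes. */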
282 esp_bottom = (esp | (STACK_SIZE - 1)) + 1;
283 esp_top = esp_bottom - PRIMARY_STACK_SIZE;
285 printk("Valid stack range: %p-%p, sp=%p, tss.esp0=%p\n",
286 (void *)esp_top, (void *)esp_bottom, (void *)esp,
287 (void *)init_tss[cpu].esp0);
289 /* Trigger overflow trace if %esp is within 512 bytes of the guard page. */
290 if ( ((unsigned long)(esp - esp_top) > 512) &&
291 ((unsigned long)(esp_top - esp) > 512) )
292 {
293 printk("No stack overflow detected. Skipping stack trace.\n");
294 return;
295 }
297 if ( esp < esp_top )
298 esp = esp_top;
300 printk("Xen stack overflow (dumping trace %p-%p):\n ",
301 (void *)esp, (void *)esp_bottom);
303 stack = (unsigned long *)esp;
304 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
305 {
306 addr = *stack++;
307 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
308 {
309 printk("%p: [<%p>]", stack, _p(addr));
310 print_symbol(" %s\n ", addr);
311 }
312 }
314 printk("\n");
315 #endif
316 }
318 void show_execution_state(struct cpu_user_regs *regs)
319 {
320 show_registers(regs);
321 show_stack(regs);
322 }
324 char *trapstr(int trapnr)
325 {
326 static char *strings[] = {
327 "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
328 "invalid opcode", "device not available", "double fault",
329 "coprocessor segment", "invalid tss", "segment not found",
330 "stack error", "general protection fault", "page fault",
331 "spurious interrupt", "coprocessor error", "alignment check",
332 "machine check", "simd error"
333 };
335 if ( (trapnr < 0) || (trapnr >= ARRAY_SIZE(strings)) )
336 return "???";
338 return strings[trapnr];
339 }
341 /*
342 * This is called for faults at very unexpected times (e.g., when interrupts
343 * are disabled). In such situations we can't do much that is safe. We try to
344 * print out some tracing and then we just spin.
345 */
346 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs)
347 {
348 static DEFINE_PER_CPU(char, depth);
350 /*
351 * In some cases, we can end up in a vicious cycle of fatal_trap()s
352 * within fatal_trap()s. We give the problem a couple of iterations to
353 * bottom out, and then we just panic.
354 */
355 if ( ++this_cpu(depth) < 3 )
356 {
357 watchdog_disable();
358 console_start_sync();
360 show_execution_state(regs);
362 if ( trapnr == TRAP_page_fault )
363 {
364 unsigned long cr2 = read_cr2();
365 printk("Faulting linear address: %p\n", _p(cr2));
366 show_page_walk(cr2);
367 }
368 }
370 panic("FATAL TRAP: vector = %d (%s)\n"
371 "[error_code=%04x] %s\n",
372 trapnr, trapstr(trapnr), regs->error_code,
373 (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
374 }
376 static int do_guest_trap(
377 int trapnr, const struct cpu_user_regs *regs, int use_error_code)
378 {
379 struct vcpu *v = current;
380 struct trap_bounce *tb;
381 const struct trap_info *ti;
383 tb = &v->arch.trap_bounce;
384 ti = &v->arch.guest_context.trap_ctxt[trapnr];
386 tb->flags = TBF_EXCEPTION;
387 tb->cs = ti->cs;
388 tb->eip = ti->address;
390 if ( use_error_code )
391 {
392 tb->flags |= TBF_EXCEPTION_ERRCODE;
393 tb->error_code = regs->error_code;
394 }
396 if ( TI_GET_IF(ti) )
397 tb->flags |= TBF_INTERRUPT;
399 if ( unlikely(null_trap_bounce(v, tb)) )
400 gdprintk(XENLOG_WARNING, "Unhandled %s fault/trap [#%d] in "
401 "domain %d on VCPU %d [ec=%04x]\n",
402 trapstr(trapnr), trapnr, v->domain->domain_id, v->vcpu_id,
403 regs->error_code);
405 return 0;
406 }
408 static inline int do_trap(
409 int trapnr, struct cpu_user_regs *regs, int use_error_code)
410 {
411 unsigned long fixup;
413 DEBUGGER_trap_entry(trapnr, regs);
415 if ( guest_mode(regs) )
416 return do_guest_trap(trapnr, regs, use_error_code);
418 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
419 {
420 dprintk(XENLOG_ERR, "Trap %d: %p -> %p\n",
421 trapnr, _p(regs->eip), _p(fixup));
422 regs->eip = fixup;
423 return 0;
424 }
426 DEBUGGER_trap_fatal(trapnr, regs);
428 show_execution_state(regs);
429 panic("FATAL TRAP: vector = %d (%s)\n"
430 "[error_code=%04x]\n",
431 trapnr, trapstr(trapnr), regs->error_code);
432 return 0;
433 }
435 #define DO_ERROR_NOCODE(trapnr, name) \
436 asmlinkage int do_##name(struct cpu_user_regs *regs) \
437 { \
438 return do_trap(trapnr, regs, 0); \
439 }
441 #define DO_ERROR(trapnr, name) \
442 asmlinkage int do_##name(struct cpu_user_regs *regs) \
443 { \
444 return do_trap(trapnr, regs, 1); \
445 }
447 DO_ERROR_NOCODE(TRAP_divide_error, divide_error)
448 DO_ERROR_NOCODE(TRAP_overflow, overflow)
449 DO_ERROR_NOCODE(TRAP_bounds, bounds)
450 DO_ERROR_NOCODE(TRAP_copro_seg, coprocessor_segment_overrun)
451 DO_ERROR( TRAP_invalid_tss, invalid_TSS)
452 DO_ERROR( TRAP_no_segment, segment_not_present)
453 DO_ERROR( TRAP_stack_error, stack_segment)
454 DO_ERROR_NOCODE(TRAP_copro_error, coprocessor_error)
455 DO_ERROR( TRAP_alignment_check, alignment_check)
456 DO_ERROR_NOCODE(TRAP_simd_error, simd_coprocessor_error)
458 int rdmsr_hypervisor_regs(
459 uint32_t idx, uint32_t *eax, uint32_t *edx)
460 {
461 idx -= 0x40000000;
462 if ( idx > 0 )
463 return 0;
465 switch ( idx )
466 {
467 case 0:
468 {
469 *eax = *edx = 0;
470 break;
471 }
472 default:
473 BUG();
474 }
476 return 1;
477 }
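/* Note (editorial): MSRs in the 0x40000000 range are reserved for the
 * hypervisor. Writing index 0 installs the hypercall transfer page: EDX:EAX
 * form a 64-bit value whose upper bits give the guest frame number and whose
 * low 12 bits give the page index. */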
479 int wrmsr_hypervisor_regs(
480 uint32_t idx, uint32_t eax, uint32_t edx)
481 {
482 struct domain *d = current->domain;
484 idx -= 0x40000000;
485 if ( idx > 0 )
486 return 0;
488 switch ( idx )
489 {
490 case 0:
491 {
492 void *hypercall_page;
493 unsigned long mfn;
494 unsigned long gmfn = ((unsigned long)edx << 20) | (eax >> 12);
495 unsigned int idx = eax & 0xfff;
497 if ( idx > 0 )
498 {
499 gdprintk(XENLOG_WARNING,
500 "Dom%d: Out of range index %u to MSR %08x\n",
501 d->domain_id, idx, 0x40000000);
502 return 0;
503 }
505 mfn = gmfn_to_mfn(d, gmfn);
507 if ( !mfn_valid(mfn) ||
508 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
509 {
510 gdprintk(XENLOG_WARNING,
511 "Dom%d: Bad GMFN %lx (MFN %lx) to MSR %08x\n",
512 d->domain_id, gmfn, mfn, 0x40000000);
513 return 0;
514 }
516 hypercall_page = map_domain_page(mfn);
517 hypercall_page_initialise(d, hypercall_page);
518 unmap_domain_page(hypercall_page);
520 put_page_and_type(mfn_to_page(mfn));
521 break;
522 }
524 default:
525 BUG();
526 }
528 return 1;
529 }
531 int cpuid_hypervisor_leaves(
532 uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
533 {
534 idx -= 0x40000000;
535 if ( idx > 2 )
536 return 0;
538 switch ( idx )
539 {
540 case 0:
541 *eax = 0x40000002; /* Largest leaf */
542 *ebx = 0x566e6558; /* Signature 1: "XenV" */
543 *ecx = 0x65584d4d; /* Signature 2: "MMXe" */
544 *edx = 0x4d4d566e; /* Signature 3: "nVMM" */
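/* Concatenated, the EBX:ECX:EDX bytes spell the "XenVMMXenVMM" signature. */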
545 break;
547 case 1:
548 *eax = (xen_major_version() << 16) | xen_minor_version();
549 *ebx = 0; /* Reserved */
550 *ecx = 0; /* Reserved */
551 *edx = 0; /* Reserved */
552 break;
554 case 2:
555 *eax = 1; /* Number of hypercall-transfer pages */
556 *ebx = 0x40000000; /* MSR base address */
557 *ecx = 0; /* Features 1 */
558 *edx = 0; /* Features 2 */
559 break;
561 default:
562 BUG();
563 }
565 return 1;
566 }
568 static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
569 {
570 char sig[5], instr[2];
571 uint32_t a, b, c, d;
572 unsigned long eip, rc;
574 a = regs->eax;
575 b = regs->ebx;
576 c = regs->ecx;
577 d = regs->edx;
578 eip = regs->eip;
580 /* Check for forced emulation signature: ud2 ; .ascii "xen". */
581 if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
582 {
583 propagate_page_fault(eip + sizeof(sig) - rc, 0);
584 return EXCRET_fault_fixed;
585 }
586 if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) )
587 return 0;
588 eip += sizeof(sig);
590 /* We only emulate CPUID. */
591 if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
592 {
593 propagate_page_fault(eip + sizeof(instr) - rc, 0);
594 return EXCRET_fault_fixed;
595 }
596 if ( memcmp(instr, "\xf\xa2", sizeof(instr)) )
597 return 0;
598 eip += sizeof(instr);
600 __asm__ (
601 "cpuid"
602 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
603 : "0" (a), "1" (b), "2" (c), "3" (d) );
605 if ( regs->eax == 1 )
606 {
607 /* Modify Feature Information. */
608 clear_bit(X86_FEATURE_VME, &d);
609 clear_bit(X86_FEATURE_DE, &d);
610 clear_bit(X86_FEATURE_PSE, &d);
611 clear_bit(X86_FEATURE_PGE, &d);
612 if ( !supervisor_mode_kernel )
613 clear_bit(X86_FEATURE_SEP, &d);
614 if ( !IS_PRIV(current->domain) )
615 clear_bit(X86_FEATURE_MTRR, &d);
616 }
617 else if ( regs->eax == 0x80000001 )
618 {
619 /* Modify Feature Information. */
620 if ( is_pv_32bit_vcpu(current) )
621 clear_bit(X86_FEATURE_SYSCALL % 32, &d);
622 clear_bit(X86_FEATURE_RDTSCP % 32, &d);
623 }
624 else
625 {
626 (void)cpuid_hypervisor_leaves(regs->eax, &a, &b, &c, &d);
627 }
629 regs->eax = a;
630 regs->ebx = b;
631 regs->ecx = c;
632 regs->edx = d;
633 regs->eip = eip;
634 regs->eflags &= ~X86_EFLAGS_RF;
636 return EXCRET_fault_fixed;
637 }
639 asmlinkage int do_invalid_op(struct cpu_user_regs *regs)
640 {
641 struct bug_frame bug;
642 struct bug_frame_str bug_str;
643 char *filename, *predicate, *eip = (char *)regs->eip;
644 int rc, id, lineno;
646 DEBUGGER_trap_entry(TRAP_invalid_op, regs);
648 if ( likely(guest_mode(regs)) )
649 {
650 if ( (rc = emulate_forced_invalid_op(regs)) != 0 )
651 return rc;
652 return do_guest_trap(TRAP_invalid_op, regs, 0);
653 }
655 if ( !is_kernel(eip) ||
656 __copy_from_user(&bug, eip, sizeof(bug)) ||
657 memcmp(bug.ud2, "\xf\xb", sizeof(bug.ud2)) ||
658 (bug.ret != 0xc2) )
659 goto die;
660 eip += sizeof(bug);
662 id = bug.id & 3;
664 if ( id == BUGFRAME_dump )
665 {
666 show_execution_state(regs);
667 regs->eip = (unsigned long)eip;
668 return EXCRET_fault_fixed;
669 }
671 /* WARN, BUG or ASSERT: decode the filename pointer and line number. */
672 if ( !is_kernel(eip) ||
673 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
674 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
675 goto die;
676 eip += sizeof(bug_str);
678 filename = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
679 lineno = bug.id >> 2;
681 if ( id == BUGFRAME_warn )
682 {
683 printk("Xen WARN at %.50s:%d\n", filename, lineno);
684 show_execution_state(regs);
685 regs->eip = (unsigned long)eip;
686 return EXCRET_fault_fixed;
687 }
689 if ( id == BUGFRAME_bug )
690 {
691 printk("Xen BUG at %.50s:%d\n", filename, lineno);
692 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
693 show_execution_state(regs);
694 panic("Xen BUG at %.50s:%d\n", filename, lineno);
695 }
697 /* ASSERT: decode the predicate string pointer. */
698 ASSERT(id == BUGFRAME_assert);
699 if ( !is_kernel(eip) ||
700 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
701 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
702 goto die;
703 eip += sizeof(bug_str);
705 predicate = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
706 printk("Assertion '%s' failed at %.50s:%d\n",
707 predicate, filename, lineno);
708 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
709 show_execution_state(regs);
710 panic("Assertion '%s' failed at %.50s:%d\n",
711 predicate, filename, lineno);
713 die:
714 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
715 show_execution_state(regs);
716 panic("FATAL TRAP: vector = %d (invalid opcode)\n", TRAP_invalid_op);
717 return 0;
718 }
720 asmlinkage int do_int3(struct cpu_user_regs *regs)
721 {
722 DEBUGGER_trap_entry(TRAP_int3, regs);
724 if ( !guest_mode(regs) )
725 {
726 DEBUGGER_trap_fatal(TRAP_int3, regs);
727 show_execution_state(regs);
728 panic("FATAL TRAP: vector = 3 (Int3)\n");
729 }
731 return do_guest_trap(TRAP_int3, regs, 0);
732 }
734 asmlinkage void do_machine_check(struct cpu_user_regs *regs)
735 {
736 extern fastcall void (*machine_check_vector)(
737 struct cpu_user_regs *, long error_code);
738 machine_check_vector(regs, regs->error_code);
739 }
741 void propagate_page_fault(unsigned long addr, u16 error_code)
742 {
743 struct trap_info *ti;
744 struct vcpu *v = current;
745 struct trap_bounce *tb = &v->arch.trap_bounce;
747 v->arch.guest_context.ctrlreg[2] = addr;
748 arch_set_cr2(v, addr);
750 /* Re-set error_code.user flag appropriately for the guest. */
751 error_code &= ~PFEC_user_mode;
752 if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
753 error_code |= PFEC_user_mode;
755 ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
756 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
757 tb->error_code = error_code;
758 tb->cs = ti->cs;
759 tb->eip = ti->address;
760 if ( TI_GET_IF(ti) )
761 tb->flags |= TBF_INTERRUPT;
762 if ( unlikely(null_trap_bounce(v, tb)) )
763 {
764 printk("Unhandled page fault in domain %d on VCPU %d (ec=%04X)\n",
765 v->domain->domain_id, v->vcpu_id, error_code);
766 show_page_walk(addr);
767 }
768 }
770 static int handle_gdt_ldt_mapping_fault(
771 unsigned long offset, struct cpu_user_regs *regs)
772 {
773 /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
774 unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
775 unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
777 /* Should never fault in another vcpu's area. */
778 BUG_ON(vcpu_area != current->vcpu_id);
780 /* Byte offset within the gdt/ldt sub-area. */
781 offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
783 if ( likely(is_ldt_area) )
784 {
785 /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
786 if ( unlikely(map_ldt_shadow_page(offset >> PAGE_SHIFT) == 0) )
787 {
788 /* In hypervisor mode? Leave it to the #PF handler to fix up. */
789 if ( !guest_mode(regs) )
790 return 0;
791 /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */
792 propagate_page_fault(
793 current->arch.guest_context.ldt_base + offset,
794 regs->error_code);
795 }
796 }
797 else
798 {
799 /* GDT fault: handle the fault as #GP(selector). */
800 regs->error_code = (u16)offset & ~7;
801 (void)do_general_protection(regs);
802 }
804 return EXCRET_fault_fixed;
805 }
807 #ifdef HYPERVISOR_VIRT_END
808 #define IN_HYPERVISOR_RANGE(va) \
809 (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
810 #else
811 #define IN_HYPERVISOR_RANGE(va) \
812 (((va) >= HYPERVISOR_VIRT_START))
813 #endif
815 static int __spurious_page_fault(
816 unsigned long addr, struct cpu_user_regs *regs)
817 {
818 unsigned long mfn, cr3 = read_cr3();
819 #if CONFIG_PAGING_LEVELS >= 4
820 l4_pgentry_t l4e, *l4t;
821 #endif
822 #if CONFIG_PAGING_LEVELS >= 3
823 l3_pgentry_t l3e, *l3t;
824 #endif
825 l2_pgentry_t l2e, *l2t;
826 l1_pgentry_t l1e, *l1t;
827 unsigned int required_flags, disallowed_flags;
829 /* Reserved bit violations are never spurious faults. */
830 if ( regs->error_code & PFEC_reserved_bit )
831 return 0;
833 required_flags = _PAGE_PRESENT;
834 if ( regs->error_code & PFEC_write_access )
835 required_flags |= _PAGE_RW;
836 if ( regs->error_code & PFEC_user_mode )
837 required_flags |= _PAGE_USER;
839 disallowed_flags = 0;
840 if ( regs->error_code & PFEC_insn_fetch )
841 disallowed_flags |= _PAGE_NX;
843 mfn = cr3 >> PAGE_SHIFT;
845 #if CONFIG_PAGING_LEVELS >= 4
846 l4t = map_domain_page(mfn);
847 l4e = l4e_read_atomic(&l4t[l4_table_offset(addr)]);
848 mfn = l4e_get_pfn(l4e);
849 unmap_domain_page(l4t);
850 if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
851 (l4e_get_flags(l4e) & disallowed_flags) )
852 return 0;
853 #endif
855 #if CONFIG_PAGING_LEVELS >= 3
856 l3t = map_domain_page(mfn);
857 #ifdef CONFIG_X86_PAE
858 l3t += (cr3 & 0xFE0UL) >> 3;
859 #endif
860 l3e = l3e_read_atomic(&l3t[l3_table_offset(addr)]);
861 mfn = l3e_get_pfn(l3e);
862 unmap_domain_page(l3t);
863 #ifdef CONFIG_X86_PAE
864 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
865 return 0;
866 #else
867 if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
868 (l3e_get_flags(l3e) & disallowed_flags) )
869 return 0;
870 #endif
871 #endif
873 l2t = map_domain_page(mfn);
874 l2e = l2e_read_atomic(&l2t[l2_table_offset(addr)]);
875 mfn = l2e_get_pfn(l2e);
876 unmap_domain_page(l2t);
877 if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
878 (l2e_get_flags(l2e) & disallowed_flags) )
879 return 0;
880 if ( l2e_get_flags(l2e) & _PAGE_PSE )
881 {
882 l1e = l1e_empty(); /* define before use in debug tracing */
883 goto spurious;
884 }
886 l1t = map_domain_page(mfn);
887 l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
888 mfn = l1e_get_pfn(l1e);
889 unmap_domain_page(l1t);
890 if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
891 (l1e_get_flags(l1e) & disallowed_flags) )
892 return 0;
894 spurious:
895 dprintk(XENLOG_WARNING, "Spurious fault in domain %u:%u "
896 "at addr %lx, e/c %04x\n",
897 current->domain->domain_id, current->vcpu_id,
898 addr, regs->error_code);
899 #if CONFIG_PAGING_LEVELS >= 4
900 dprintk(XENLOG_WARNING, " l4e = %"PRIpte"\n", l4e_get_intpte(l4e));
901 #endif
902 #if CONFIG_PAGING_LEVELS >= 3
903 dprintk(XENLOG_WARNING, " l3e = %"PRIpte"\n", l3e_get_intpte(l3e));
904 #endif
905 dprintk(XENLOG_WARNING, " l2e = %"PRIpte"\n", l2e_get_intpte(l2e));
906 dprintk(XENLOG_WARNING, " l1e = %"PRIpte"\n", l1e_get_intpte(l1e));
907 #ifndef NDEBUG
908 show_registers(regs);
909 #endif
910 return 1;
911 }
913 static int spurious_page_fault(
914 unsigned long addr, struct cpu_user_regs *regs)
915 {
916 unsigned long flags;
917 int is_spurious;
919 /*
920 * Disabling interrupts prevents TLB flushing, and hence prevents
921 * page tables from becoming invalid under our feet during the walk.
922 */
923 local_irq_save(flags);
924 is_spurious = __spurious_page_fault(addr, regs);
925 local_irq_restore(flags);
927 return is_spurious;
928 }
930 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
931 {
932 struct vcpu *v = current;
933 struct domain *d = v->domain;
935 /* No fixups in interrupt context or when interrupts are disabled. */
936 if ( in_irq() || !(regs->eflags & X86_EFLAGS_IF) )
937 return 0;
939 if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
940 {
941 if ( paging_mode_external(d) && guest_mode(regs) )
942 return paging_fault(addr, regs);
943 if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
944 return handle_gdt_ldt_mapping_fault(
945 addr - GDT_LDT_VIRT_START, regs);
946 return 0;
947 }
949 if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
950 guest_kernel_mode(v, regs) &&
951 /* Do not check if access-protection fault since the page may
952 legitimately be not present in shadow page tables */
953 ((regs->error_code & PFEC_write_access) == PFEC_write_access) &&
954 ptwr_do_page_fault(v, addr, regs) )
955 return EXCRET_fault_fixed;
957 if ( paging_mode_enabled(d) )
958 return paging_fault(addr, regs);
960 return 0;
961 }
963 /*
964 * #PF error code:
965 * Bit 0: Protection violation (=1) ; Page not present (=0)
966 * Bit 1: Write access
967 * Bit 2: User mode (=1) ; Supervisor mode (=0)
968 * Bit 3: Reserved bit violation
969 * Bit 4: Instruction fetch
970 */
971 asmlinkage int do_page_fault(struct cpu_user_regs *regs)
972 {
973 unsigned long addr, fixup;
974 int rc;
976 addr = read_cr2();
978 DEBUGGER_trap_entry(TRAP_page_fault, regs);
980 perfc_incr(page_faults);
982 if ( unlikely((rc = fixup_page_fault(addr, regs)) != 0) )
983 return rc;
985 if ( unlikely(!guest_mode(regs)) )
986 {
987 if ( spurious_page_fault(addr, regs) )
988 return EXCRET_not_a_fault;
990 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
991 {
992 perfc_incr(copy_user_faults);
993 regs->eip = fixup;
994 return 0;
995 }
997 DEBUGGER_trap_fatal(TRAP_page_fault, regs);
999 show_execution_state(regs);
1000 show_page_walk(addr);
1001 panic("FATAL PAGE FAULT\n"
1002 "[error_code=%04x]\n"
1003 "Faulting linear address: %p\n",
1004 regs->error_code, _p(addr));
1007 propagate_page_fault(addr, regs->error_code);
1008 return 0;
1011 /*
1012 * Early handler to deal with spurious page faults. For example, consider a
1013 * routine that uses a mapping immediately after installing it (making it
1014 * present). The CPU may speculatively execute the memory access before
1015 * executing the PTE write. The instruction will then be marked to cause a
1016 * page fault when it is retired, despite the fact that the PTE is present and
1017 * correct at that point in time.
1018 */
1019 asmlinkage int do_early_page_fault(struct cpu_user_regs *regs)
1021 static int stuck;
1022 static unsigned long prev_eip, prev_cr2;
1023 unsigned long cr2 = read_cr2();
1025 BUG_ON(smp_processor_id() != 0);
1027 if ( (regs->eip != prev_eip) || (cr2 != prev_cr2) )
1029 prev_eip = regs->eip;
1030 prev_cr2 = cr2;
1031 stuck = 0;
1032 return EXCRET_not_a_fault;
1035 if ( stuck++ == 1000 )
1036 panic("Early fatal page fault at %04x:%p (cr2=%p, ec=%04x)\n",
1037 regs->cs, _p(regs->eip), _p(cr2), regs->error_code);
1039 return EXCRET_not_a_fault;
1042 long do_fpu_taskswitch(int set)
1044 struct vcpu *v = current;
1046 if ( set )
1048 v->arch.guest_context.ctrlreg[0] |= X86_CR0_TS;
1049 stts();
1051 else
1053 v->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
1054 if ( v->fpu_dirtied )
1055 clts();
1058 return 0;
1061 static int read_descriptor(unsigned int sel,
1062 const struct vcpu *v,
1063 const struct cpu_user_regs * regs,
1064 unsigned long *base,
1065 unsigned long *limit,
1066 unsigned int *ar,
1067 unsigned int vm86attr)
1069 struct desc_struct desc;
1071 if ( !vm86_mode(regs) )
1073 if ( sel < 4)
1074 desc.b = desc.a = 0;
1075 else if ( __get_user(desc,
1076 (const struct desc_struct *)(!(sel & 4)
1077 ? GDT_VIRT_START(v)
1078 : LDT_VIRT_START(v))
1079 + (sel >> 3)) )
1080 return 0;
1081 if ( !(vm86attr & _SEGMENT_CODE) )
1082 desc.b &= ~_SEGMENT_L;
1084 else
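/* VM86/real-mode style: synthesise a descriptor with base = sel << 4 and a 64KB limit. */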
1086 desc.a = (sel << 20) | 0xffff;
1087 desc.b = vm86attr | (sel >> 12);
1090 *ar = desc.b & 0x00f0ff00;
1091 if ( !(desc.b & _SEGMENT_L) )
1093 *base = (desc.a >> 16) + ((desc.b & 0xff) << 16) + (desc.b & 0xff000000);
1094 *limit = (desc.a & 0xffff) | (desc.b & 0x000f0000);
1095 if ( desc.b & _SEGMENT_G )
1096 *limit = ((*limit + 1) << 12) - 1;
1097 #ifndef NDEBUG
1098 if ( !vm86_mode(regs) && sel > 3 )
1100 unsigned int a, l;
1101 unsigned char valid;
1103 __asm__("larl %2, %0\n\tsetz %1" : "=r" (a), "=rm" (valid) : "rm" (sel));
1104 BUG_ON(valid && (a & 0x00f0ff00) != *ar);
1105 __asm__("lsll %2, %0\n\tsetz %1" : "=r" (l), "=rm" (valid) : "rm" (sel));
1106 BUG_ON(valid && l != *limit);
1108 #endif
1110 else
1112 *base = 0UL;
1113 *limit = ~0UL;
1116 return 1;
1119 /* Has the guest requested sufficient permission for this I/O access? */
1120 static inline int guest_io_okay(
1121 unsigned int port, unsigned int bytes,
1122 struct vcpu *v, struct cpu_user_regs *regs)
1124 #if defined(__x86_64__)
1125 /* If in user mode, switch to kernel mode just to read I/O bitmap. */
1126 int user_mode = !(v->arch.flags & TF_kernel_mode);
1127 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
1128 #elif defined(__i386__)
1129 #define TOGGLE_MODE() ((void)0)
1130 #endif
1132 if ( !vm86_mode(regs) &&
1133 (v->arch.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
1134 return 1;
1136 if ( v->arch.iobmp_limit > (port + bytes) )
1138 union { uint8_t bytes[2]; uint16_t mask; } x;
1140 /*
1141 * Grab permission bytes from guest space. Inaccessible bytes are
1142 * read as 0xff (no access allowed).
1143 */
1144 TOGGLE_MODE();
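/* Deliberate fall-through below: any bytes the copy could not fetch are forced to 0xff (no access). */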
1145 switch ( __copy_from_guest_offset(x.bytes, v->arch.iobmp,
1146 port>>3, 2) )
1148 default: x.bytes[0] = ~0;
1149 case 1: x.bytes[1] = ~0;
1150 case 0: break;
1152 TOGGLE_MODE();
1154 if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 )
1155 return 1;
1158 return 0;
1161 /* Has the administrator granted sufficient permission for this I/O access? */
1162 static inline int admin_io_okay(
1163 unsigned int port, unsigned int bytes,
1164 struct vcpu *v, struct cpu_user_regs *regs)
1166 return ioports_access_permitted(v->domain, port, port + bytes - 1);
1169 #define guest_inb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r)
1170 #define guest_inw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r)
1171 #define guest_inl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r)
1172 #define guest_outb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r)
1173 #define guest_outw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r)
1174 #define guest_outl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r)
1176 /* I/O emulation support. Helper routines for, and type of, the stack stub.*/
1177 void host_to_guest_gpr_switch(struct cpu_user_regs *)
1178 __attribute__((__regparm__(1)));
1179 unsigned long guest_to_host_gpr_switch(unsigned long)
1180 __attribute__((__regparm__(1)));
1182 /* Instruction fetch with error handling. */
1183 #define insn_fetch(type, base, eip, limit) \
1184 ({ unsigned long _rc, _ptr = (base) + (eip); \
1185 type _x; \
1186 if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) ) \
1187 goto fail; \
1188 if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 ) \
1189 { \
1190 propagate_page_fault(_ptr + sizeof(_x) - _rc, 0); \
1191 return EXCRET_fault_fixed; \
1192 } \
1193 (eip) += sizeof(_x); _x; })
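/* Note (editorial): on x86-64 the data segment selectors are not saved in
 * cpu_user_regs on most hypervisor entries, so read_sreg() fetches them
 * directly from the segment registers instead. */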
1195 #if defined(CONFIG_X86_32)
1196 # define read_sreg(regs, sr) ((regs)->sr)
1197 #elif defined(CONFIG_X86_64)
1198 # define read_sreg(regs, sr) read_segment_register(sr)
1199 #endif
1201 static int emulate_privileged_op(struct cpu_user_regs *regs)
1203 struct vcpu *v = current;
1204 unsigned long *reg, eip = regs->eip, res;
1205 u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
1206 enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
1207 unsigned int port, i, data_sel, ar, data, rc;
1208 unsigned int op_bytes, op_default, ad_bytes, ad_default;
1209 #define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
1210 ? regs->reg \
1211 : ad_bytes == 4 \
1212 ? (u32)regs->reg \
1213 : (u16)regs->reg)
1214 #define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
1215 ? regs->reg = (val) \
1216 : ad_bytes == 4 \
1217 ? (*(u32 *)&regs->reg = (val)) \
1218 : (*(u16 *)&regs->reg = (val)))
1219 unsigned long code_base, code_limit;
1220 char io_emul_stub[16];
1221 void (*io_emul)(struct cpu_user_regs *) __attribute__((__regparm__(1)));
1222 u32 l, h;
1224 if ( !read_descriptor(regs->cs, v, regs,
1225 &code_base, &code_limit, &ar,
1226 _SEGMENT_CODE|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1227 goto fail;
1228 op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
1229 ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
1230 if ( !(ar & _SEGMENT_S) ||
1231 !(ar & _SEGMENT_P) ||
1232 !(ar & _SEGMENT_CODE) )
1233 goto fail;
1235 /* emulating only opcodes not allowing SS to be default */
1236 data_sel = read_sreg(regs, ds);
1238 /* Legacy prefixes. */
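/* The third for-clause clears any recorded REX prefix again unless it was the
 * byte consumed by this iteration, so REX is honoured only when it immediately
 * precedes the opcode. */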
1239 for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
1241 switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
1243 case 0x66: /* operand-size override */
1244 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
1245 continue;
1246 case 0x67: /* address-size override */
1247 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
1248 continue;
1249 case 0x2e: /* CS override */
1250 data_sel = regs->cs;
1251 continue;
1252 case 0x3e: /* DS override */
1253 data_sel = read_sreg(regs, ds);
1254 continue;
1255 case 0x26: /* ES override */
1256 data_sel = read_sreg(regs, es);
1257 continue;
1258 case 0x64: /* FS override */
1259 data_sel = read_sreg(regs, fs);
1260 lm_ovr = lm_seg_fs;
1261 continue;
1262 case 0x65: /* GS override */
1263 data_sel = read_sreg(regs, gs);
1264 lm_ovr = lm_seg_gs;
1265 continue;
1266 case 0x36: /* SS override */
1267 data_sel = regs->ss;
1268 continue;
1269 case 0xf0: /* LOCK */
1270 lock = 1;
1271 continue;
1272 case 0xf2: /* REPNE/REPNZ */
1273 case 0xf3: /* REP/REPE/REPZ */
1274 rep_prefix = 1;
1275 continue;
1276 default:
1277 if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
1279 rex = opcode;
1280 continue;
1282 break;
1284 break;
1287 /* REX prefix. */
1288 if ( rex & 8 ) /* REX.W */
1289 op_bytes = 4; /* emulating only opcodes not supporting 64-bit operands */
1290 modrm_reg = (rex & 4) << 1; /* REX.R */
1291 /* REX.X does not need to be decoded. */
1292 modrm_rm = (rex & 1) << 3; /* REX.B */
1294 if ( opcode == 0x0f )
1295 goto twobyte_opcode;
1297 if ( lock )
1298 goto fail;
1300 /* Input/Output String instructions. */
1301 if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
1303 unsigned long data_base, data_limit;
1305 if ( rep_prefix && (rd_ad(ecx) == 0) )
1306 goto done;
1308 if ( !(opcode & 2) )
1310 data_sel = read_sreg(regs, es);
1311 lm_ovr = lm_seg_none;
1314 if ( !(ar & _SEGMENT_L) )
1316 if ( !read_descriptor(data_sel, v, regs,
1317 &data_base, &data_limit, &ar,
1318 _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1319 goto fail;
1320 if ( !(ar & _SEGMENT_S) ||
1321 !(ar & _SEGMENT_P) ||
1322 (opcode & 2 ?
1323 (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
1324 (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
1325 goto fail;
1327 #ifdef CONFIG_X86_64
1328 else
1330 if ( lm_ovr == lm_seg_none || data_sel < 4 )
1332 switch ( lm_ovr )
1334 case lm_seg_none:
1335 data_base = 0UL;
1336 break;
1337 case lm_seg_fs:
1338 data_base = v->arch.guest_context.fs_base;
1339 break;
1340 case lm_seg_gs:
1341 if ( guest_kernel_mode(v, regs) )
1342 data_base = v->arch.guest_context.gs_base_kernel;
1343 else
1344 data_base = v->arch.guest_context.gs_base_user;
1345 break;
1348 else
1349 read_descriptor(data_sel, v, regs,
1350 &data_base, &data_limit, &ar,
1351 0);
1352 data_limit = ~0UL;
1353 ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
1355 #endif
1357 continue_io_string:
1358 switch ( opcode )
1360 case 0x6c: /* INSB */
1361 op_bytes = 1;
1362 case 0x6d: /* INSW/INSL */
1363 if ( data_limit < op_bytes - 1 ||
1364 rd_ad(edi) > data_limit - (op_bytes - 1) ||
1365 !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
1366 goto fail;
1367 port = (u16)regs->edx;
1368 switch ( op_bytes )
1370 case 1:
1371 /* emulate PIT counter 2 */
1372 data = (u8)(guest_inb_okay(port, v, regs) ? inb(port) :
1373 ((port == 0x42 || port == 0x43 || port == 0x61) ?
1374 pv_pit_handler(port, 0, 0) : ~0));
1375 break;
1376 case 2:
1377 data = (u16)(guest_inw_okay(port, v, regs) ? inw(port) : ~0);
1378 break;
1379 case 4:
1380 data = (u32)(guest_inl_okay(port, v, regs) ? inl(port) : ~0);
1381 break;
1383 if ( (rc = copy_to_user((void *)data_base + rd_ad(edi), &data, op_bytes)) != 0 )
1385 propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc,
1386 PFEC_write_access);
1387 return EXCRET_fault_fixed;
1389 wr_ad(edi, regs->edi + (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes));
1390 break;
1392 case 0x6e: /* OUTSB */
1393 op_bytes = 1;
1394 case 0x6f: /* OUTSW/OUTSL */
1395 if ( data_limit < op_bytes - 1 ||
1396 rd_ad(esi) > data_limit - (op_bytes - 1) ||
1397 !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
1398 goto fail;
1399 rc = copy_from_user(&data, (void *)data_base + rd_ad(esi), op_bytes);
1400 if ( rc != 0 )
1402 propagate_page_fault(data_base + rd_ad(esi) + op_bytes - rc, 0);
1403 return EXCRET_fault_fixed;
1405 port = (u16)regs->edx;
1406 switch ( op_bytes )
1408 case 1:
1409 if ( guest_outb_okay(port, v, regs) )
1410 outb((u8)data, port);
1411 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1412 pv_pit_handler(port, data, 1);
1413 break;
1414 case 2:
1415 if ( guest_outw_okay(port, v, regs) )
1416 outw((u16)data, port);
1417 break;
1418 case 4:
1419 if ( guest_outl_okay(port, v, regs) )
1420 outl((u32)data, port);
1421 break;
1423 wr_ad(esi, regs->esi + (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes));
1424 break;
1427 if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
1429 if ( !hypercall_preempt_check() )
1430 goto continue_io_string;
1431 eip = regs->eip;
1434 goto done;
1437 /*
1438 * Very likely to be an I/O instruction (IN/OUT).
1439 * Build an on-stack stub to execute the instruction with full guest
1440 * GPR context. This is needed for some systems which (ab)use IN/OUT
1441 * to communicate with BIOS code in system-management mode.
1442 */
1443 #ifdef __x86_64__
1444 /* movq $host_to_guest_gpr_switch,%rcx */
1445 io_emul_stub[0] = 0x48;
1446 io_emul_stub[1] = 0xb9;
1447 *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
1448 /* callq *%rcx */
1449 io_emul_stub[10] = 0xff;
1450 io_emul_stub[11] = 0xd1;
1451 #else
1452 /* call host_to_guest_gpr_switch */
1453 io_emul_stub[0] = 0xe8;
1454 *(s32 *)&io_emul_stub[1] =
1455 (char *)host_to_guest_gpr_switch - &io_emul_stub[5];
1456 /* 7 x nop */
1457 memset(&io_emul_stub[5], 0x90, 7);
1458 #endif
1459 /* data16 or nop */
1460 io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66;
1461 /* <io-access opcode> */
1462 io_emul_stub[13] = opcode;
1463 /* imm8 or nop */
1464 io_emul_stub[14] = 0x90;
1465 /* ret (jumps to guest_to_host_gpr_switch) */
1466 io_emul_stub[15] = 0xc3;
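/* Finished stub: call host_to_guest_gpr_switch; [data16] IN/OUT [imm8]; ret,
 * with the final ret returning via guest_to_host_gpr_switch. */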
1468 /* Handy function-typed pointer to the stub. */
1469 io_emul = (void *)io_emul_stub;
1471 /* I/O Port and Interrupt Flag instructions. */
1472 switch ( opcode )
1474 case 0xe4: /* IN imm8,%al */
1475 op_bytes = 1;
1476 case 0xe5: /* IN imm8,%eax */
1477 port = insn_fetch(u8, code_base, eip, code_limit);
1478 io_emul_stub[14] = port; /* imm8 */
1479 exec_in:
1480 if ( !guest_io_okay(port, op_bytes, v, regs) )
1481 goto fail;
1482 switch ( op_bytes )
1484 case 1:
1485 if ( guest_inb_okay(port, v, regs) )
1486 io_emul(regs);
1487 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1489 regs->eax &= ~0xffUL;
1490 regs->eax |= pv_pit_handler(port, 0, 0);
1492 else
1493 regs->eax |= (u8)~0;
1494 break;
1495 case 2:
1496 if ( guest_inw_okay(port, v, regs) )
1497 io_emul(regs);
1498 else
1499 regs->eax |= (u16)~0;
1500 break;
1501 case 4:
1502 if ( guest_inl_okay(port, v, regs) )
1503 io_emul(regs);
1504 else
1505 regs->eax = (u32)~0;
1506 break;
1508 goto done;
1510 case 0xec: /* IN %dx,%al */
1511 op_bytes = 1;
1512 case 0xed: /* IN %dx,%eax */
1513 port = (u16)regs->edx;
1514 goto exec_in;
1516 case 0xe6: /* OUT %al,imm8 */
1517 op_bytes = 1;
1518 case 0xe7: /* OUT %eax,imm8 */
1519 port = insn_fetch(u8, code_base, eip, code_limit);
1520 io_emul_stub[14] = port; /* imm8 */
1521 exec_out:
1522 if ( !guest_io_okay(port, op_bytes, v, regs) )
1523 goto fail;
1524 switch ( op_bytes )
1526 case 1:
1527 if ( guest_outb_okay(port, v, regs) )
1528 io_emul(regs);
1529 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1530 pv_pit_handler(port, regs->eax, 1);
1531 break;
1532 case 2:
1533 if ( guest_outw_okay(port, v, regs) )
1534 io_emul(regs);
1535 break;
1536 case 4:
1537 if ( guest_outl_okay(port, v, regs) )
1538 io_emul(regs);
1539 break;
1541 goto done;
1543 case 0xee: /* OUT %al,%dx */
1544 op_bytes = 1;
1545 case 0xef: /* OUT %eax,%dx */
1546 port = (u16)regs->edx;
1547 goto exec_out;
1549 case 0xfa: /* CLI */
1550 case 0xfb: /* STI */
1551 if ( v->arch.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
1552 goto fail;
1553 /*
1554 * This is just too dangerous to allow, in my opinion. Consider if the
1555 * caller then tries to reenable interrupts using POPF: we can't trap
1556 * that and we'll end up with hard-to-debug lockups. Fast & loose will
1557 * do for us. :-)
1558 */
1559 /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
1560 goto done;
1563 /* No decode of this single-byte opcode. */
1564 goto fail;
1566 twobyte_opcode:
1567 /* Two-byte opcodes only emulated from guest kernel. */
1568 if ( !guest_kernel_mode(v, regs) )
1569 goto fail;
1571 /* Privileged (ring 0) instructions. */
1572 opcode = insn_fetch(u8, code_base, eip, code_limit);
1573 if ( lock && (opcode & ~3) != 0x20 )
1574 goto fail;
1575 switch ( opcode )
1577 case 0x06: /* CLTS */
1578 (void)do_fpu_taskswitch(0);
1579 break;
1581 case 0x09: /* WBINVD */
1582 /* Ignore the instruction if unprivileged. */
1583 if ( !cache_flush_permitted(v->domain) )
1584 /* Non-physdev domain attempted WBINVD; ignore for now since
1585 newer linux uses this in some start-of-day timing loops */
1587 else
1588 wbinvd();
1589 break;
1591 case 0x20: /* MOV CR?,<reg> */
1592 opcode = insn_fetch(u8, code_base, eip, code_limit);
1593 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1594 modrm_rm |= (opcode >> 0) & 7;
1595 reg = decode_register(modrm_rm, regs, 0);
1596 switch ( modrm_reg )
1598 case 0: /* Read CR0 */
1599 *reg = (read_cr0() & ~X86_CR0_TS) |
1600 v->arch.guest_context.ctrlreg[0];
1601 break;
1603 case 2: /* Read CR2 */
1604 *reg = v->arch.guest_context.ctrlreg[2];
1605 break;
1607 case 3: /* Read CR3 */
1608 if ( !is_pv_32on64_vcpu(v) )
1609 *reg = xen_pfn_to_cr3(mfn_to_gmfn(
1610 v->domain, pagetable_get_pfn(v->arch.guest_table)));
1611 #ifdef CONFIG_COMPAT
1612 else
1613 *reg = compat_pfn_to_cr3(mfn_to_gmfn(
1614 v->domain, l4e_get_pfn(*(l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)))));
1615 #endif
1616 break;
1618 case 4: /* Read CR4 */
1619 /*
1620 * Guests can read CR4 to see what features Xen has enabled. We
1621 * therefore lie about PGE & PSE as they are unavailable to guests.
1622 */
1623 *reg = read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE);
1624 break;
1626 default:
1627 goto fail;
1629 break;
1631 case 0x21: /* MOV DR?,<reg> */
1632 opcode = insn_fetch(u8, code_base, eip, code_limit);
1633 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1634 modrm_rm |= (opcode >> 0) & 7;
1635 reg = decode_register(modrm_rm, regs, 0);
1636 if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
1637 goto fail;
1638 *reg = res;
1639 break;
1641 case 0x22: /* MOV <reg>,CR? */
1642 opcode = insn_fetch(u8, code_base, eip, code_limit);
1643 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1644 modrm_rm |= (opcode >> 0) & 7;
1645 reg = decode_register(modrm_rm, regs, 0);
1646 switch ( modrm_reg )
1648 case 0: /* Write CR0 */
1649 if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
1651 gdprintk(XENLOG_WARNING,
1652 "Attempt to change unmodifiable CR0 flags.\n");
1653 goto fail;
1655 (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
1656 break;
1658 case 2: /* Write CR2 */
1659 v->arch.guest_context.ctrlreg[2] = *reg;
1660 arch_set_cr2(v, *reg);
1661 break;
1663 case 3: /* Write CR3 */
1664 LOCK_BIGLOCK(v->domain);
1665 if ( !is_pv_32on64_vcpu(v) )
1666 rc = new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg)));
1667 #ifdef CONFIG_COMPAT
1668 else
1669 rc = new_guest_cr3(gmfn_to_mfn(v->domain, compat_cr3_to_pfn(*reg)));
1670 #endif
1671 UNLOCK_BIGLOCK(v->domain);
1672 if ( rc == 0 ) /* not okay */
1673 goto fail;
1674 break;
1676 case 4:
1677 if ( *reg != (read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE)) )
1679 gdprintk(XENLOG_WARNING, "Attempt to change CR4 flags.\n");
1680 goto fail;
1682 break;
1684 default:
1685 goto fail;
1687 break;
1689 case 0x23: /* MOV <reg>,DR? */
1690 opcode = insn_fetch(u8, code_base, eip, code_limit);
1691 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1692 modrm_rm |= (opcode >> 0) & 7;
1693 reg = decode_register(modrm_rm, regs, 0);
1694 if ( do_set_debugreg(modrm_reg, *reg) != 0 )
1695 goto fail;
1696 break;
1698 case 0x30: /* WRMSR */
1699 switch ( regs->ecx )
1701 #ifdef CONFIG_X86_64
1702 case MSR_FS_BASE:
1703 if ( is_pv_32on64_vcpu(v) )
1704 goto fail;
1705 if ( wrmsr_safe(MSR_FS_BASE, regs->eax, regs->edx) )
1706 goto fail;
1707 v->arch.guest_context.fs_base =
1708 ((u64)regs->edx << 32) | regs->eax;
1709 break;
1710 case MSR_GS_BASE:
1711 if ( is_pv_32on64_vcpu(v) )
1712 goto fail;
1713 if ( wrmsr_safe(MSR_GS_BASE, regs->eax, regs->edx) )
1714 goto fail;
1715 v->arch.guest_context.gs_base_kernel =
1716 ((u64)regs->edx << 32) | regs->eax;
1717 break;
1718 case MSR_SHADOW_GS_BASE:
1719 if ( is_pv_32on64_vcpu(v) )
1720 goto fail;
1721 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, regs->eax, regs->edx) )
1722 goto fail;
1723 v->arch.guest_context.gs_base_user =
1724 ((u64)regs->edx << 32) | regs->eax;
1725 break;
1726 #endif
1727 default:
1728 if ( wrmsr_hypervisor_regs(regs->ecx, regs->eax, regs->edx) )
1729 break;
1731 if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
1732 (regs->eax != l) || (regs->edx != h) )
1733 gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from "
1734 "%08x:%08x to %08lx:%08lx.\n",
1735 _p(regs->ecx), h, l, (long)regs->edx, (long)regs->eax);
1736 break;
1738 break;
1740 case 0x32: /* RDMSR */
1741 switch ( regs->ecx )
1743 #ifdef CONFIG_X86_64
1744 case MSR_FS_BASE:
1745 if ( is_pv_32on64_vcpu(v) )
1746 goto fail;
1747 regs->eax = v->arch.guest_context.fs_base & 0xFFFFFFFFUL;
1748 regs->edx = v->arch.guest_context.fs_base >> 32;
1749 break;
1750 case MSR_GS_BASE:
1751 if ( is_pv_32on64_vcpu(v) )
1752 goto fail;
1753 regs->eax = v->arch.guest_context.gs_base_kernel & 0xFFFFFFFFUL;
1754 regs->edx = v->arch.guest_context.gs_base_kernel >> 32;
1755 break;
1756 case MSR_SHADOW_GS_BASE:
1757 if ( is_pv_32on64_vcpu(v) )
1758 goto fail;
1759 regs->eax = v->arch.guest_context.gs_base_user & 0xFFFFFFFFUL;
1760 regs->edx = v->arch.guest_context.gs_base_user >> 32;
1761 break;
1762 #endif
1763 case MSR_EFER:
1764 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1765 goto fail;
1766 break;
1767 default:
1768 if ( rdmsr_hypervisor_regs(regs->ecx, &l, &h) )
1770 regs->eax = l;
1771 regs->edx = h;
1772 break;
1774 /* Everyone can read the MSR space. */
1775 /* gdprintk(XENLOG_WARNING,"Domain attempted RDMSR %p.\n",
1776 _p(regs->ecx));*/
1777 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1778 goto fail;
1779 break;
1781 break;
1783 default:
1784 goto fail;
1787 #undef wr_ad
1788 #undef rd_ad
1790 done:
1791 regs->eip = eip;
1792 regs->eflags &= ~X86_EFLAGS_RF;
1793 return EXCRET_fault_fixed;
1795 fail:
1796 return 0;
1799 asmlinkage int do_general_protection(struct cpu_user_regs *regs)
1801 struct vcpu *v = current;
1802 unsigned long fixup;
1804 DEBUGGER_trap_entry(TRAP_gp_fault, regs);
1806 if ( regs->error_code & 1 )
1807 goto hardware_gp;
1809 if ( !guest_mode(regs) )
1810 goto gp_in_kernel;
1812 /*
1813 * Cunning trick to allow arbitrary "INT n" handling.
1815 * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
1816 * instruction from trapping to the appropriate vector, when that might not
1817 * be expected by Xen or the guest OS. For example, that entry might be for
1818 * a fault handler (unlike traps, faults don't increment EIP), or might
1819 * expect an error code on the stack (which a software trap never
1820 * provides), or might be a hardware interrupt handler that doesn't like
1821 * being called spuriously.
1823 * Instead, a GPF occurs with the faulting IDT vector in the error code.
1824 * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
1825 * clear to indicate that it's a software fault, not hardware.
1827 * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
1828 * okay because they can only be triggered by an explicit DPL-checked
1829 * instruction. The DPL specified by the guest OS for these vectors is NOT
1830 * CHECKED!!
1831 */
1832 if ( (regs->error_code & 3) == 2 )
1834 /* This fault must be due to <INT n> instruction. */
1835 const struct trap_info *ti;
1836 unsigned char vector = regs->error_code >> 3;
1837 ti = &v->arch.guest_context.trap_ctxt[vector];
1838 if ( permit_softint(TI_GET_DPL(ti), v, regs) )
1840 regs->eip += 2;
1841 return do_guest_trap(vector, regs, 0);
1845 /* Emulate some simple privileged and I/O instructions. */
1846 if ( (regs->error_code == 0) &&
1847 emulate_privileged_op(regs) )
1848 return 0;
1850 #if defined(__i386__)
1851 if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
1852 (regs->error_code == 0) &&
1853 gpf_emulate_4gb(regs) )
1854 return 0;
1855 #endif
1857 /* Pass on GPF as is. */
1858 return do_guest_trap(TRAP_gp_fault, regs, 1);
1860 gp_in_kernel:
1862 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
1864 dprintk(XENLOG_INFO, "GPF (%04x): %p -> %p\n",
1865 regs->error_code, _p(regs->eip), _p(fixup));
1866 regs->eip = fixup;
1867 return 0;
1870 DEBUGGER_trap_fatal(TRAP_gp_fault, regs);
1872 hardware_gp:
1873 show_execution_state(regs);
1874 panic("GENERAL PROTECTION FAULT\n[error_code=%04x]\n", regs->error_code);
1875 return 0;
1878 static void nmi_softirq(void)
1880 /* Only used to defer wakeup of dom0,vcpu0 to a safe (non-NMI) context. */
1881 vcpu_kick(dom0->vcpu[0]);
1884 static void nmi_dom0_report(unsigned int reason_idx)
1886 struct domain *d;
1887 struct vcpu *v;
1889 if ( ((d = dom0) == NULL) || ((v = d->vcpu[0]) == NULL) )
1890 return;
1892 set_bit(reason_idx, nmi_reason(d));
1894 if ( !xchg(&v->nmi_pending, 1) )
1895 raise_softirq(NMI_SOFTIRQ); /* not safe to wake up a vcpu here */
1898 asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
1900 switch ( opt_nmi[0] )
1902 case 'd': /* 'dom0' */
1903 nmi_dom0_report(_XEN_NMIREASON_parity_error);
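/* Fall through: after reporting to dom0 the NMI is otherwise ignored. */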
1904 case 'i': /* 'ignore' */
1905 break;
1906 default: /* 'fatal' */
1907 console_force_unlock();
1908 printk("\n\nNMI - MEMORY ERROR\n");
1909 fatal_trap(TRAP_nmi, regs);
1912 outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable parity check */
1913 mdelay(1);
1914 outb((inb(0x61) & 0x0b) | 0x00, 0x61); /* enable parity check */
1917 asmlinkage void io_check_error(struct cpu_user_regs *regs)
1919 switch ( opt_nmi[0] )
1921 case 'd': /* 'dom0' */
1922 nmi_dom0_report(_XEN_NMIREASON_io_error);
1923 case 'i': /* 'ignore' */
1924 break;
1925 default: /* 'fatal' */
1926 console_force_unlock();
1927 printk("\n\nNMI - I/O ERROR\n");
1928 fatal_trap(TRAP_nmi, regs);
1931 outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
1932 mdelay(1);
1933 outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
1936 static void unknown_nmi_error(unsigned char reason)
1938 switch ( opt_nmi[0] )
1940 case 'd': /* 'dom0' */
1941 nmi_dom0_report(_XEN_NMIREASON_unknown);
1942 case 'i': /* 'ignore' */
1943 break;
1944 default: /* 'fatal' */
1945 printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
1946 printk("Dazed and confused, but trying to continue\n");
1947 printk("Do you have a strange power saving mode enabled?\n");
1948 kexec_crash();
1952 static int dummy_nmi_callback(struct cpu_user_regs *regs, int cpu)
1954 return 0;
1957 static nmi_callback_t nmi_callback = dummy_nmi_callback;
1959 asmlinkage void do_nmi(struct cpu_user_regs *regs)
1961 unsigned int cpu = smp_processor_id();
1962 unsigned char reason;
1964 ++nmi_count(cpu);
1966 if ( nmi_callback(regs, cpu) )
1967 return;
1969 if ( nmi_watchdog )
1970 nmi_watchdog_tick(regs);
1972 /* Only the BSP gets external NMIs from the system. */
1973 if ( cpu == 0 )
1975 reason = inb(0x61);
1976 if ( reason & 0x80 )
1977 mem_parity_error(regs);
1978 else if ( reason & 0x40 )
1979 io_check_error(regs);
1980 else if ( !nmi_watchdog )
1981 unknown_nmi_error((unsigned char)(reason&0xff));
1985 void set_nmi_callback(nmi_callback_t callback)
1987 nmi_callback = callback;
1990 void unset_nmi_callback(void)
1992 nmi_callback = dummy_nmi_callback;
1995 asmlinkage int math_state_restore(struct cpu_user_regs *regs)
1997 BUG_ON(!guest_mode(regs));
1999 setup_fpu(current);
2001 if ( current->arch.guest_context.ctrlreg[0] & X86_CR0_TS )
2003 do_guest_trap(TRAP_no_device, regs, 0);
2004 current->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
2007 return EXCRET_fault_fixed;
2010 asmlinkage int do_debug(struct cpu_user_regs *regs)
2012 unsigned long condition;
2013 struct vcpu *v = current;
2015 __asm__ __volatile__("mov %%db6,%0" : "=r" (condition));
2017 /* Mask out spurious debug traps due to lazy DR7 setting */
2018 if ( (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) &&
2019 (v->arch.guest_context.debugreg[7] == 0) )
2021 __asm__("mov %0,%%db7" : : "r" (0UL));
2022 goto out;
2025 DEBUGGER_trap_entry(TRAP_debug, regs);
2027 if ( !guest_mode(regs) )
2029 /* Clear TF just for absolute sanity. */
2030 regs->eflags &= ~EF_TF;
2031 /*
2032 * We ignore watchpoints when they trigger within Xen. This may happen
2033 * when a buffer is passed to us which previously had a watchpoint set
2034 * on it. No need to bump EIP; the only faulting trap is an instruction
2035 * breakpoint, which can't happen to us.
2036 */
2037 goto out;
2040 /* Save debug status register where guest OS can peek at it */
2041 v->arch.guest_context.debugreg[6] = condition;
2043 return do_guest_trap(TRAP_debug, regs, 0);
2045 out:
2046 return EXCRET_not_a_fault;
2049 asmlinkage int do_spurious_interrupt_bug(struct cpu_user_regs *regs)
2051 return EXCRET_not_a_fault;
2054 void set_intr_gate(unsigned int n, void *addr)
2056 int i;
2057 /* Keep secondary tables in sync with IRQ updates. */
2058 for ( i = 1; i < NR_CPUS; i++ )
2059 if ( idt_tables[i] != NULL )
2060 _set_gate(&idt_tables[i][n], 14, 0, addr);
2061 _set_gate(&idt_table[n], 14, 0, addr);
2064 void set_system_gate(unsigned int n, void *addr)
2066 _set_gate(idt_table+n,14,3,addr);
2069 void set_task_gate(unsigned int n, unsigned int sel)
2071 idt_table[n].a = sel << 16; /* TSS selector in bits 16-31 */
2072 idt_table[n].b = 0x8500; /* P=1, DPL=0, type 5: task gate */
2075 void set_tss_desc(unsigned int n, void *addr)
2077 _set_tssldt_desc(
2078 gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
2079 (unsigned long)addr,
2080 offsetof(struct tss_struct, __cacheline_filler) - 1,
2081 9);
2082 #ifdef CONFIG_COMPAT
2083 _set_tssldt_desc(
2084 compat_gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
2085 (unsigned long)addr,
2086 offsetof(struct tss_struct, __cacheline_filler) - 1,
2087 11);
2088 #endif
2091 void __init trap_init(void)
2093 extern void percpu_traps_init(void);
2095 /*
2096 * Note that interrupt gates are always used, rather than trap gates. We
2097 * must have interrupts disabled until DS/ES/FS/GS are saved because the
2098 * first activation must have the "bad" value(s) for these registers and
2099 * we may lose them if another activation is installed before they are
2100 * saved. The page-fault handler also needs interrupts disabled until %cr2
2101 * has been read and saved on the stack.
2102 */
2103 set_intr_gate(TRAP_divide_error,&divide_error);
2104 set_intr_gate(TRAP_debug,&debug);
2105 set_intr_gate(TRAP_nmi,&nmi);
2106 set_system_gate(TRAP_int3,&int3); /* usable from all privileges */
2107 set_system_gate(TRAP_overflow,&overflow); /* usable from all privileges */
2108 set_intr_gate(TRAP_bounds,&bounds);
2109 set_intr_gate(TRAP_invalid_op,&invalid_op);
2110 set_intr_gate(TRAP_no_device,&device_not_available);
2111 set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
2112 set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
2113 set_intr_gate(TRAP_no_segment,&segment_not_present);
2114 set_intr_gate(TRAP_stack_error,&stack_segment);
2115 set_intr_gate(TRAP_gp_fault,&general_protection);
2116 set_intr_gate(TRAP_page_fault,&page_fault);
2117 set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
2118 set_intr_gate(TRAP_copro_error,&coprocessor_error);
2119 set_intr_gate(TRAP_alignment_check,&alignment_check);
2120 set_intr_gate(TRAP_machine_check,&machine_check);
2121 set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
2123 /* CPU0 uses the master IDT. */
2124 idt_tables[0] = idt_table;
2126 percpu_traps_init();
2128 cpu_init();
2130 open_softirq(NMI_SOFTIRQ, nmi_softirq);
2134 long do_set_trap_table(XEN_GUEST_HANDLE(trap_info_t) traps)
2136 struct trap_info cur;
2137 struct trap_info *dst = current->arch.guest_context.trap_ctxt;
2138 long rc = 0;
2140 /* If no table is presented then clear the entire virtual IDT. */
2141 if ( guest_handle_is_null(traps) )
2143 memset(dst, 0, 256 * sizeof(*dst));
2144 init_int80_direct_trap(current);
2145 return 0;
2148 for ( ; ; )
2150 if ( hypercall_preempt_check() )
2152 rc = hypercall_create_continuation(
2153 __HYPERVISOR_set_trap_table, "h", traps);
2154 break;
2157 if ( copy_from_guest(&cur, traps, 1) )
2159 rc = -EFAULT;
2160 break;
2163 if ( cur.address == 0 )
2164 break;
2166 fixup_guest_code_selector(current->domain, cur.cs);
2168 memcpy(&dst[cur.vector], &cur, sizeof(cur));
2170 if ( cur.vector == 0x80 )
2171 init_int80_direct_trap(current);
2173 guest_handle_add_offset(traps, 1);
2176 return rc;
2180 long set_debugreg(struct vcpu *p, int reg, unsigned long value)
2182 int i;
2184 switch ( reg )
2186 case 0:
2187 if ( !access_ok(value, sizeof(long)) )
2188 return -EPERM;
2189 if ( p == current )
2190 __asm__ ( "mov %0, %%db0" : : "r" (value) );
2191 break;
2192 case 1:
2193 if ( !access_ok(value, sizeof(long)) )
2194 return -EPERM;
2195 if ( p == current )
2196 __asm__ ( "mov %0, %%db1" : : "r" (value) );
2197 break;
2198 case 2:
2199 if ( !access_ok(value, sizeof(long)) )
2200 return -EPERM;
2201 if ( p == current )
2202 __asm__ ( "mov %0, %%db2" : : "r" (value) );
2203 break;
2204 case 3:
2205 if ( !access_ok(value, sizeof(long)) )
2206 return -EPERM;
2207 if ( p == current )
2208 __asm__ ( "mov %0, %%db3" : : "r" (value) );
2209 break;
2210 case 6:
2211 /*
2212 * DR6: Bits 4-11,16-31 reserved (set to 1).
2213 * Bit 12 reserved (set to 0).
2214 */
2215 value &= 0xffffefff; /* reserved bits => 0 */
2216 value |= 0xffff0ff0; /* reserved bits => 1 */
2217 if ( p == current )
2218 __asm__ ( "mov %0, %%db6" : : "r" (value) );
2219 break;
2220 case 7:
2221 /*
2222 * DR7: Bit 10 reserved (set to 1).
2223 * Bits 11-12,14-15 reserved (set to 0).
2224 * Privileged bits:
2225 * GD (bit 13): must be 0.
2226 * R/Wn (bits 16-17,20-21,24-25,28-29): mustn't be 10.
2227 * LENn (bits 18-19,22-23,26-27,30-31): mustn't be 10.
2228 */
2229 /* DR7 == 0 => debugging disabled for this domain. */
2230 if ( value != 0 )
2232 value &= 0xffff27ff; /* reserved bits => 0 */
2233 value |= 0x00000400; /* reserved bits => 1 */
2234 if ( (value & (1<<13)) != 0 ) return -EPERM;
2235 for ( i = 0; i < 16; i += 2 )
2236 if ( ((value >> (i+16)) & 3) == 2 ) return -EPERM;
2238 if ( p == current )
2239 __asm__ ( "mov %0, %%db7" : : "r" (value) );
2240 break;
2241 default:
2242 return -EINVAL;
2245 p->arch.guest_context.debugreg[reg] = value;
2246 return 0;
2249 long do_set_debugreg(int reg, unsigned long value)
2251 return set_debugreg(current, reg, value);
2254 unsigned long do_get_debugreg(int reg)
2256 if ( (reg < 0) || (reg > 7) ) return -EINVAL;
2257 return current->arch.guest_context.debugreg[reg];
2260 /*
2261 * Local variables:
2262 * mode: C
2263 * c-set-style: "BSD"
2264 * c-basic-offset: 4
2265 * tab-width: 4
2266 * indent-tabs-mode: nil
2267 * End:
2268 */