ia64/xen-unstable: view of xen/arch/x86/traps.c @ 16185:42d8dadb5864

x86: Allow NMI callback CS to be specified via set_trap_table() hypercall.
Based on a patch by Jan Beulich.
Signed-off-by: Keir Fraser <keir@xensource.com>

author   Keir Fraser <keir@xensource.com>
date     Mon Oct 22 13:04:32 2007 +0100 (2007-10-22)
parents  16f5672879c8
children c05ec22a9106

line source
1 /******************************************************************************
2 * arch/x86/traps.c
3 *
4 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * Copyright (C) 1991, 1992 Linus Torvalds
23 *
24 * Pentium III FXSR, SSE support
25 * Gareth Hughes <gareth@valinux.com>, May 2000
26 */
28 #include <xen/config.h>
29 #include <xen/init.h>
30 #include <xen/sched.h>
31 #include <xen/lib.h>
32 #include <xen/errno.h>
33 #include <xen/mm.h>
34 #include <xen/console.h>
35 #include <xen/shutdown.h>
36 #include <asm/regs.h>
37 #include <xen/delay.h>
38 #include <xen/event.h>
39 #include <xen/spinlock.h>
40 #include <xen/irq.h>
41 #include <xen/perfc.h>
42 #include <xen/softirq.h>
43 #include <xen/domain_page.h>
44 #include <xen/symbols.h>
45 #include <xen/iocap.h>
46 #include <xen/nmi.h>
47 #include <xen/version.h>
48 #include <xen/kexec.h>
49 #include <xen/trace.h>
50 #include <asm/paging.h>
51 #include <asm/system.h>
52 #include <asm/io.h>
53 #include <asm/atomic.h>
54 #include <asm/desc.h>
55 #include <asm/debugreg.h>
56 #include <asm/smp.h>
57 #include <asm/flushtlb.h>
58 #include <asm/uaccess.h>
59 #include <asm/i387.h>
60 #include <asm/debugger.h>
61 #include <asm/msr.h>
62 #include <asm/shared.h>
63 #include <asm/x86_emulate.h>
64 #include <asm/hvm/vpt.h>
66 /*
67 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
68 * fatal: Xen prints diagnostic message and then hangs.
69 * dom0: The NMI is virtualised to DOM0.
70 * ignore: The NMI error is cleared and ignored.
71 */
72 #ifdef NDEBUG
73 char opt_nmi[10] = "dom0";
74 #else
75 char opt_nmi[10] = "fatal";
76 #endif
77 string_param("nmi", opt_nmi);
79 DEFINE_PER_CPU(u32, ler_msr);
81 /* Master table, used by CPU0. */
82 idt_entry_t idt_table[IDT_ENTRIES];
84 /* Pointer to the IDT of every CPU. */
85 idt_entry_t *idt_tables[NR_CPUS] __read_mostly;
87 #define DECLARE_TRAP_HANDLER(_name) \
88 asmlinkage void _name(void); \
89 asmlinkage int do_ ## _name(struct cpu_user_regs *regs)
91 asmlinkage void nmi(void);
92 asmlinkage void machine_check(void);
93 DECLARE_TRAP_HANDLER(divide_error);
94 DECLARE_TRAP_HANDLER(debug);
95 DECLARE_TRAP_HANDLER(int3);
96 DECLARE_TRAP_HANDLER(overflow);
97 DECLARE_TRAP_HANDLER(bounds);
98 DECLARE_TRAP_HANDLER(invalid_op);
99 DECLARE_TRAP_HANDLER(device_not_available);
100 DECLARE_TRAP_HANDLER(coprocessor_segment_overrun);
101 DECLARE_TRAP_HANDLER(invalid_TSS);
102 DECLARE_TRAP_HANDLER(segment_not_present);
103 DECLARE_TRAP_HANDLER(stack_segment);
104 DECLARE_TRAP_HANDLER(general_protection);
105 DECLARE_TRAP_HANDLER(page_fault);
106 DECLARE_TRAP_HANDLER(coprocessor_error);
107 DECLARE_TRAP_HANDLER(simd_coprocessor_error);
108 DECLARE_TRAP_HANDLER(alignment_check);
109 DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
111 long do_set_debugreg(int reg, unsigned long value);
112 unsigned long do_get_debugreg(int reg);
114 static int debug_stack_lines = 20;
115 integer_param("debug_stack_lines", debug_stack_lines);
117 static int opt_ler;
118 boolean_param("ler", opt_ler);
120 #ifdef CONFIG_X86_32
121 #define stack_words_per_line 8
122 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
123 #else
124 #define stack_words_per_line 4
125 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
126 #endif
128 static void show_guest_stack(struct cpu_user_regs *regs)
129 {
130 int i;
131 unsigned long *stack, addr;
133 if ( is_hvm_vcpu(current) )
134 return;
136 if ( is_pv_32on64_vcpu(current) )
137 {
138 compat_show_guest_stack(regs, debug_stack_lines);
139 return;
140 }
142 if ( vm86_mode(regs) )
143 {
144 stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff));
145 printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ",
146 regs->ss, (uint16_t)(regs->esp & 0xffff));
147 }
148 else
149 {
150 stack = (unsigned long *)regs->esp;
151 printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
152 }
154 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
155 {
156 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
157 break;
158 if ( get_user(addr, stack) )
159 {
160 if ( i != 0 )
161 printk("\n ");
162 printk("Fault while accessing guest memory.");
163 i = 1;
164 break;
165 }
166 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
167 printk("\n ");
168 printk(" %p", _p(addr));
169 stack++;
170 }
171 if ( i == 0 )
172 printk("Stack empty.");
173 printk("\n");
174 }
176 #if !defined(CONFIG_FRAME_POINTER)
178 static void show_trace(struct cpu_user_regs *regs)
179 {
180 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
182 printk("Xen call trace:\n ");
184 printk("[<%p>]", _p(regs->eip));
185 print_symbol(" %s\n ", regs->eip);
187 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
188 {
189 addr = *stack++;
190 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
191 {
192 printk("[<%p>]", _p(addr));
193 print_symbol(" %s\n ", addr);
194 }
195 }
197 printk("\n");
198 }
200 #else
202 static void show_trace(struct cpu_user_regs *regs)
203 {
204 unsigned long *frame, next, addr, low, high;
206 printk("Xen call trace:\n ");
208 printk("[<%p>]", _p(regs->eip));
209 print_symbol(" %s\n ", regs->eip);
211 /* Bounds for range of valid frame pointer. */
212 low = (unsigned long)(ESP_BEFORE_EXCEPTION(regs) - 2);
213 high = (low & ~(STACK_SIZE - 1)) +
214 (STACK_SIZE - sizeof(struct cpu_info) - 2*sizeof(unsigned long));
216 /* The initial frame pointer. */
217 next = regs->ebp;
219 for ( ; ; )
220 {
221 /* Valid frame pointer? */
222 if ( (next < low) || (next >= high) )
223 {
224 /*
225 * Exception stack frames have a different layout, denoted by an
226 * inverted frame pointer.
227 */
228 next = ~next;
229 if ( (next < low) || (next >= high) )
230 break;
231 frame = (unsigned long *)next;
232 next = frame[0];
233 addr = frame[(offsetof(struct cpu_user_regs, eip) -
234 offsetof(struct cpu_user_regs, ebp))
235 / BYTES_PER_LONG];
236 }
237 else
238 {
239 /* Ordinary stack frame. */
240 frame = (unsigned long *)next;
241 next = frame[0];
242 addr = frame[1];
243 }
245 printk("[<%p>]", _p(addr));
246 print_symbol(" %s\n ", addr);
248 low = (unsigned long)&frame[2];
249 }
251 printk("\n");
252 }
254 #endif
256 void show_stack(struct cpu_user_regs *regs)
257 {
258 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
259 int i;
261 if ( guest_mode(regs) )
262 return show_guest_stack(regs);
264 printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
266 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
267 {
268 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
269 break;
270 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
271 printk("\n ");
272 addr = *stack++;
273 printk(" %p", _p(addr));
274 }
275 if ( i == 0 )
276 printk("Stack empty.");
277 printk("\n");
279 show_trace(regs);
280 }
282 void show_stack_overflow(unsigned int cpu, unsigned long esp)
283 {
284 #ifdef MEMORY_GUARD
285 unsigned long esp_top, esp_bottom;
286 unsigned long *stack, addr;
288 esp_bottom = (esp | (STACK_SIZE - 1)) + 1;
289 esp_top = esp_bottom - PRIMARY_STACK_SIZE;
291 printk("Valid stack range: %p-%p, sp=%p, tss.esp0=%p\n",
292 (void *)esp_top, (void *)esp_bottom, (void *)esp,
293 (void *)init_tss[cpu].esp0);
295 /* Trigger overflow trace if %esp is within 512 bytes of the guard page. */
296 if ( ((unsigned long)(esp - esp_top) > 512) &&
297 ((unsigned long)(esp_top - esp) > 512) )
298 {
299 printk("No stack overflow detected. Skipping stack trace.\n");
300 return;
301 }
303 if ( esp < esp_top )
304 esp = esp_top;
306 printk("Xen stack overflow (dumping trace %p-%p):\n ",
307 (void *)esp, (void *)esp_bottom);
309 stack = (unsigned long *)esp;
310 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
311 {
312 addr = *stack++;
313 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
314 {
315 printk("%p: [<%p>]", stack, _p(addr));
316 print_symbol(" %s\n ", addr);
317 }
318 }
320 printk("\n");
321 #endif
322 }
324 void show_execution_state(struct cpu_user_regs *regs)
325 {
326 show_registers(regs);
327 show_stack(regs);
328 }
330 char *trapstr(int trapnr)
331 {
332 static char *strings[] = {
333 "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
334 "invalid opcode", "device not available", "double fault",
335 "coprocessor segment", "invalid tss", "segment not found",
336 "stack error", "general protection fault", "page fault",
337 "spurious interrupt", "coprocessor error", "alignment check",
338 "machine check", "simd error"
339 };
341 if ( (trapnr < 0) || (trapnr >= ARRAY_SIZE(strings)) )
342 return "???";
344 return strings[trapnr];
345 }
347 /*
348 * This is called for faults at very unexpected times (e.g., when interrupts
349 * are disabled). In such situations we can't do much that is safe. We try to
350 * print out some tracing and then we just spin.
351 */
352 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs)
353 {
354 static DEFINE_PER_CPU(char, depth);
356 /*
357 * In some cases, we can end up in a vicious cycle of fatal_trap()s
358 * within fatal_trap()s. We give the problem a couple of iterations to
359 * bottom out, and then we just panic.
360 */
361 if ( ++this_cpu(depth) < 3 )
362 {
363 watchdog_disable();
364 console_start_sync();
366 show_execution_state(regs);
368 if ( trapnr == TRAP_page_fault )
369 {
370 unsigned long cr2 = read_cr2();
371 printk("Faulting linear address: %p\n", _p(cr2));
372 show_page_walk(cr2);
373 }
374 }
376 panic("FATAL TRAP: vector = %d (%s)\n"
377 "[error_code=%04x] %s\n",
378 trapnr, trapstr(trapnr), regs->error_code,
379 (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
380 }
382 static int do_guest_trap(
383 int trapnr, const struct cpu_user_regs *regs, int use_error_code)
384 {
385 struct vcpu *v = current;
386 struct trap_bounce *tb;
387 const struct trap_info *ti;
389 trace_pv_trap(trapnr, regs->eip, use_error_code, regs->error_code);
391 tb = &v->arch.trap_bounce;
392 ti = &v->arch.guest_context.trap_ctxt[trapnr];
394 tb->flags = TBF_EXCEPTION;
395 tb->cs = ti->cs;
396 tb->eip = ti->address;
398 if ( use_error_code )
399 {
400 tb->flags |= TBF_EXCEPTION_ERRCODE;
401 tb->error_code = regs->error_code;
402 }
404 if ( TI_GET_IF(ti) )
405 tb->flags |= TBF_INTERRUPT;
407 if ( unlikely(null_trap_bounce(v, tb)) )
408 gdprintk(XENLOG_WARNING, "Unhandled %s fault/trap [#%d] in "
409 "domain %d on VCPU %d [ec=%04x]\n",
410 trapstr(trapnr), trapnr, v->domain->domain_id, v->vcpu_id,
411 regs->error_code);
413 return 0;
414 }
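/*
 * Guest-side counterpart (an illustrative sketch, not part of this file):
 * the trap_ctxt[] consulted by do_guest_trap() above is populated by the
 * guest via the set_trap_table hypercall, using the public struct trap_info
 * layout from xen/include/public/arch-x86/xen.h. The handler symbols
 * de_entry/pf_entry and the example_traps name are placeholders.
 */
static const struct trap_info example_traps[] = {
    /* vector,            flags, cs,             address                 */
    {  TRAP_divide_error, 0,     FLAT_KERNEL_CS, (unsigned long)de_entry },
    {  TRAP_page_fault,   0,     FLAT_KERNEL_CS, (unsigned long)pf_entry },
    {  0, 0, 0, 0 }  /* an all-zero entry terminates the table */
};
/* ... HYPERVISOR_set_trap_table(example_traps); ... */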
416 static inline int do_trap(
417 int trapnr, struct cpu_user_regs *regs, int use_error_code)
418 {
419 unsigned long fixup;
421 DEBUGGER_trap_entry(trapnr, regs);
423 if ( guest_mode(regs) )
424 return do_guest_trap(trapnr, regs, use_error_code);
426 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
427 {
428 dprintk(XENLOG_ERR, "Trap %d: %p -> %p\n",
429 trapnr, _p(regs->eip), _p(fixup));
430 regs->eip = fixup;
431 return 0;
432 }
434 DEBUGGER_trap_fatal(trapnr, regs);
436 show_execution_state(regs);
437 panic("FATAL TRAP: vector = %d (%s)\n"
438 "[error_code=%04x]\n",
439 trapnr, trapstr(trapnr), regs->error_code);
440 return 0;
441 }
443 #define DO_ERROR_NOCODE(trapnr, name) \
444 asmlinkage int do_##name(struct cpu_user_regs *regs) \
445 { \
446 return do_trap(trapnr, regs, 0); \
447 }
449 #define DO_ERROR(trapnr, name) \
450 asmlinkage int do_##name(struct cpu_user_regs *regs) \
451 { \
452 return do_trap(trapnr, regs, 1); \
453 }
455 DO_ERROR_NOCODE(TRAP_divide_error, divide_error)
456 DO_ERROR_NOCODE(TRAP_overflow, overflow)
457 DO_ERROR_NOCODE(TRAP_bounds, bounds)
458 DO_ERROR_NOCODE(TRAP_copro_seg, coprocessor_segment_overrun)
459 DO_ERROR( TRAP_invalid_tss, invalid_TSS)
460 DO_ERROR( TRAP_no_segment, segment_not_present)
461 DO_ERROR( TRAP_stack_error, stack_segment)
462 DO_ERROR_NOCODE(TRAP_copro_error, coprocessor_error)
463 DO_ERROR( TRAP_alignment_check, alignment_check)
464 DO_ERROR_NOCODE(TRAP_simd_error, simd_coprocessor_error)
466 int rdmsr_hypervisor_regs(
467 uint32_t idx, uint32_t *eax, uint32_t *edx)
468 {
469 idx -= 0x40000000;
470 if ( idx > 0 )
471 return 0;
473 switch ( idx )
474 {
475 case 0:
476 {
477 *eax = *edx = 0;
478 break;
479 }
480 default:
481 BUG();
482 }
484 return 1;
485 }
487 int wrmsr_hypervisor_regs(
488 uint32_t idx, uint32_t eax, uint32_t edx)
489 {
490 struct domain *d = current->domain;
492 idx -= 0x40000000;
493 if ( idx > 0 )
494 return 0;
496 switch ( idx )
497 {
498 case 0:
499 {
500 void *hypercall_page;
501 unsigned long mfn;
502 unsigned long gmfn = ((unsigned long)edx << 20) | (eax >> 12);
503 unsigned int idx = eax & 0xfff;
505 if ( idx > 0 )
506 {
507 gdprintk(XENLOG_WARNING,
508 "Dom%d: Out of range index %u to MSR %08x\n",
509 d->domain_id, idx, 0x40000000);
510 return 0;
511 }
513 mfn = gmfn_to_mfn(d, gmfn);
515 if ( !mfn_valid(mfn) ||
516 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
517 {
518 gdprintk(XENLOG_WARNING,
519 "Dom%d: Bad GMFN %lx (MFN %lx) to MSR %08x\n",
520 d->domain_id, gmfn, mfn, 0x40000000);
521 return 0;
522 }
524 hypercall_page = map_domain_page(mfn);
525 hypercall_page_initialise(d, hypercall_page);
526 unmap_domain_page(hypercall_page);
528 put_page_and_type(mfn_to_page(mfn));
529 break;
530 }
532 default:
533 BUG();
534 }
536 return 1;
537 }
539 int cpuid_hypervisor_leaves(
540 uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
541 {
542 idx -= 0x40000000;
543 if ( idx > 2 )
544 return 0;
546 switch ( idx )
547 {
548 case 0:
549 *eax = 0x40000002; /* Largest leaf */
550 *ebx = 0x566e6558; /* Signature 1: "XenV" */
551 *ecx = 0x65584d4d; /* Signature 2: "MMXe" */
552 *edx = 0x4d4d566e; /* Signature 3: "nVMM" */
553 break;
555 case 1:
556 *eax = (xen_major_version() << 16) | xen_minor_version();
557 *ebx = 0; /* Reserved */
558 *ecx = 0; /* Reserved */
559 *edx = 0; /* Reserved */
560 break;
562 case 2:
563 *eax = 1; /* Number of hypercall-transfer pages */
564 *ebx = 0x40000000; /* MSR base address */
565 *ecx = 0; /* Features 1 */
566 *edx = 0; /* Features 2 */
567 break;
569 default:
570 BUG();
571 }
573 return 1;
574 }
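/*
 * Guest-side sketch (illustrative only; cpuid() and wrmsr() are assumed
 * guest helpers, not Xen symbols): the leaves above live at base
 * 0x40000000. Leaf 0 returns the "XenVMMXenVMM" signature, leaf 1 the Xen
 * version, and leaf 2 the hypercall-page count plus the MSR number handled
 * by wrmsr_hypervisor_regs() above. Writing a page-aligned guest-physical
 * address (or'd with the page index) to that MSR asks Xen to fill the page
 * with hypercall stubs.
 */
static int example_init_hypercall_page(uint64_t page_gpa)
{
    uint32_t eax, ebx, ecx, edx;

    cpuid(0x40000000, &eax, &ebx, &ecx, &edx);
    if ( (ebx != 0x566e6558) || (ecx != 0x65584d4d) || (edx != 0x4d4d566e) )
        return 0;                                /* "XenVMMXenVMM" not found */

    cpuid(0x40000002, &eax, &ebx, &ecx, &edx);   /* eax = #pages, ebx = MSR */
    wrmsr(ebx, (uint32_t)page_gpa | 0 /* page 0 */, (uint32_t)(page_gpa >> 32));
    return 1;
}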
576 static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
577 {
578 char sig[5], instr[2];
579 uint32_t a, b, c, d;
580 unsigned long eip, rc;
582 a = regs->eax;
583 b = regs->ebx;
584 c = regs->ecx;
585 d = regs->edx;
586 eip = regs->eip;
588 /* Check for forced emulation signature: ud2 ; .ascii "xen". */
589 if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
590 {
591 propagate_page_fault(eip + sizeof(sig) - rc, 0);
592 return EXCRET_fault_fixed;
593 }
594 if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) )
595 return 0;
596 eip += sizeof(sig);
598 /* We only emulate CPUID. */
599 if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
600 {
601 propagate_page_fault(eip + sizeof(instr) - rc, 0);
602 return EXCRET_fault_fixed;
603 }
604 if ( memcmp(instr, "\xf\xa2", sizeof(instr)) )
605 return 0;
606 eip += sizeof(instr);
608 asm (
609 "cpuid"
610 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
611 : "0" (a), "1" (b), "2" (c), "3" (d) );
613 if ( regs->eax == 1 )
614 {
615 /* Modify Feature Information. */
616 clear_bit(X86_FEATURE_VME, &d);
617 clear_bit(X86_FEATURE_DE, &d);
618 clear_bit(X86_FEATURE_PSE, &d);
619 clear_bit(X86_FEATURE_PGE, &d);
620 if ( !supervisor_mode_kernel )
621 clear_bit(X86_FEATURE_SEP, &d);
622 if ( !IS_PRIV(current->domain) )
623 clear_bit(X86_FEATURE_MTRR, &d);
624 }
625 else if ( regs->eax == 0x80000001 )
626 {
627 /* Modify Feature Information. */
628 if ( is_pv_32bit_vcpu(current) )
629 clear_bit(X86_FEATURE_SYSCALL % 32, &d);
630 clear_bit(X86_FEATURE_RDTSCP % 32, &d);
631 }
632 else
633 {
634 (void)cpuid_hypervisor_leaves(regs->eax, &a, &b, &c, &d);
635 }
637 regs->eax = a;
638 regs->ebx = b;
639 regs->ecx = c;
640 regs->edx = d;
641 regs->eip = eip;
642 regs->eflags &= ~X86_EFLAGS_RF;
644 trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->eip);
646 return EXCRET_fault_fixed;
647 }
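/*
 * Guest-side sketch (illustrative; example_forced_cpuid is not a Xen or
 * guest-kernel symbol): a PV guest reaches the emulation path above by
 * prefixing CPUID with "ud2 ; .ascii \"xen\"", i.e. exactly the
 * 0x0f,0x0b,'x','e','n' signature that emulate_forced_invalid_op() checks
 * for before the 0x0f,0xa2 (CPUID) opcode.
 */
static inline void example_forced_cpuid(uint32_t *eax, uint32_t *ebx,
                                        uint32_t *ecx, uint32_t *edx)
{
    asm volatile ( "ud2a ; .ascii \"xen\" ; cpuid"
                   : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
                   : "0" (*eax), "1" (*ebx), "2" (*ecx), "3" (*edx) );
}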
649 asmlinkage int do_invalid_op(struct cpu_user_regs *regs)
650 {
651 struct bug_frame bug;
652 struct bug_frame_str bug_str;
653 char *filename, *predicate, *eip = (char *)regs->eip;
654 int rc, id, lineno;
656 DEBUGGER_trap_entry(TRAP_invalid_op, regs);
658 if ( likely(guest_mode(regs)) )
659 {
660 if ( (rc = emulate_forced_invalid_op(regs)) != 0 )
661 return rc;
662 return do_guest_trap(TRAP_invalid_op, regs, 0);
663 }
665 if ( !is_kernel(eip) ||
666 __copy_from_user(&bug, eip, sizeof(bug)) ||
667 memcmp(bug.ud2, "\xf\xb", sizeof(bug.ud2)) ||
668 (bug.ret != 0xc2) )
669 goto die;
670 eip += sizeof(bug);
672 id = bug.id & 3;
674 if ( id == BUGFRAME_dump )
675 {
676 show_execution_state(regs);
677 regs->eip = (unsigned long)eip;
678 return EXCRET_fault_fixed;
679 }
681 /* WARN, BUG or ASSERT: decode the filename pointer and line number. */
682 if ( !is_kernel(eip) ||
683 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
684 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
685 goto die;
686 eip += sizeof(bug_str);
688 filename = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
689 lineno = bug.id >> 2;
691 if ( id == BUGFRAME_warn )
692 {
693 printk("Xen WARN at %.50s:%d\n", filename, lineno);
694 show_execution_state(regs);
695 regs->eip = (unsigned long)eip;
696 return EXCRET_fault_fixed;
697 }
699 if ( id == BUGFRAME_bug )
700 {
701 printk("Xen BUG at %.50s:%d\n", filename, lineno);
702 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
703 show_execution_state(regs);
704 panic("Xen BUG at %.50s:%d\n", filename, lineno);
705 }
707 /* ASSERT: decode the predicate string pointer. */
708 ASSERT(id == BUGFRAME_assert);
709 if ( !is_kernel(eip) ||
710 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
711 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
712 goto die;
713 eip += sizeof(bug_str);
715 predicate = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
716 printk("Assertion '%s' failed at %.50s:%d\n",
717 predicate, filename, lineno);
718 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
719 show_execution_state(regs);
720 panic("Assertion '%s' failed at %.50s:%d\n",
721 predicate, filename, lineno);
723 die:
724 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
725 show_execution_state(regs);
726 panic("FATAL TRAP: vector = %d (invalid opcode)\n", TRAP_invalid_op);
727 return 0;
728 }
730 asmlinkage int do_int3(struct cpu_user_regs *regs)
731 {
732 DEBUGGER_trap_entry(TRAP_int3, regs);
734 if ( !guest_mode(regs) )
735 {
736 DEBUGGER_trap_fatal(TRAP_int3, regs);
737 show_execution_state(regs);
738 panic("FATAL TRAP: vector = 3 (Int3)\n");
739 }
741 return do_guest_trap(TRAP_int3, regs, 0);
742 }
744 asmlinkage void do_machine_check(struct cpu_user_regs *regs)
745 {
746 extern fastcall void (*machine_check_vector)(
747 struct cpu_user_regs *, long error_code);
748 machine_check_vector(regs, regs->error_code);
749 }
751 void propagate_page_fault(unsigned long addr, u16 error_code)
752 {
753 struct trap_info *ti;
754 struct vcpu *v = current;
755 struct trap_bounce *tb = &v->arch.trap_bounce;
757 v->arch.guest_context.ctrlreg[2] = addr;
758 arch_set_cr2(v, addr);
760 /* Re-set error_code.user flag appropriately for the guest. */
761 error_code &= ~PFEC_user_mode;
762 if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
763 error_code |= PFEC_user_mode;
765 trace_pv_page_fault(addr, error_code);
767 ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
768 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
769 tb->error_code = error_code;
770 tb->cs = ti->cs;
771 tb->eip = ti->address;
772 if ( TI_GET_IF(ti) )
773 tb->flags |= TBF_INTERRUPT;
774 if ( unlikely(null_trap_bounce(v, tb)) )
775 {
776 printk("Unhandled page fault in domain %d on VCPU %d (ec=%04X)\n",
777 v->domain->domain_id, v->vcpu_id, error_code);
778 show_page_walk(addr);
779 }
780 }
782 static int handle_gdt_ldt_mapping_fault(
783 unsigned long offset, struct cpu_user_regs *regs)
784 {
785 /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
786 unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
787 unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
789 /* Should never fault in another vcpu's area. */
790 BUG_ON(vcpu_area != current->vcpu_id);
792 /* Byte offset within the gdt/ldt sub-area. */
793 offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
795 if ( likely(is_ldt_area) )
796 {
797 /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
798 if ( likely(map_ldt_shadow_page(offset >> PAGE_SHIFT)) )
799 {
800 if ( guest_mode(regs) )
801 trace_trap_two_addr(TRC_PV_GDT_LDT_MAPPING_FAULT,
802 regs->eip, offset);
803 }
804 else
805 {
806 /* In hypervisor mode? Leave it to the #PF handler to fix up. */
807 if ( !guest_mode(regs) )
808 return 0;
809 /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */
810 propagate_page_fault(
811 current->arch.guest_context.ldt_base + offset,
812 regs->error_code);
813 }
814 }
815 else
816 {
817 /* GDT fault: handle the fault as #GP(selector). */
818 regs->error_code = (u16)offset & ~7;
819 (void)do_general_protection(regs);
820 }
822 return EXCRET_fault_fixed;
823 }
825 #ifdef HYPERVISOR_VIRT_END
826 #define IN_HYPERVISOR_RANGE(va) \
827 (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
828 #else
829 #define IN_HYPERVISOR_RANGE(va) \
830 (((va) >= HYPERVISOR_VIRT_START))
831 #endif
833 static int __spurious_page_fault(
834 unsigned long addr, struct cpu_user_regs *regs)
835 {
836 unsigned long mfn, cr3 = read_cr3();
837 #if CONFIG_PAGING_LEVELS >= 4
838 l4_pgentry_t l4e, *l4t;
839 #endif
840 #if CONFIG_PAGING_LEVELS >= 3
841 l3_pgentry_t l3e, *l3t;
842 #endif
843 l2_pgentry_t l2e, *l2t;
844 l1_pgentry_t l1e, *l1t;
845 unsigned int required_flags, disallowed_flags;
847 /* Reserved bit violations are never spurious faults. */
848 if ( regs->error_code & PFEC_reserved_bit )
849 return 0;
851 required_flags = _PAGE_PRESENT;
852 if ( regs->error_code & PFEC_write_access )
853 required_flags |= _PAGE_RW;
854 if ( regs->error_code & PFEC_user_mode )
855 required_flags |= _PAGE_USER;
857 disallowed_flags = 0;
858 if ( regs->error_code & PFEC_insn_fetch )
859 disallowed_flags |= _PAGE_NX;
861 mfn = cr3 >> PAGE_SHIFT;
863 #if CONFIG_PAGING_LEVELS >= 4
864 l4t = map_domain_page(mfn);
865 l4e = l4e_read_atomic(&l4t[l4_table_offset(addr)]);
866 mfn = l4e_get_pfn(l4e);
867 unmap_domain_page(l4t);
868 if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
869 (l4e_get_flags(l4e) & disallowed_flags) )
870 return 0;
871 #endif
873 #if CONFIG_PAGING_LEVELS >= 3
874 l3t = map_domain_page(mfn);
875 #ifdef CONFIG_X86_PAE
876 l3t += (cr3 & 0xFE0UL) >> 3;
877 #endif
878 l3e = l3e_read_atomic(&l3t[l3_table_offset(addr)]);
879 mfn = l3e_get_pfn(l3e);
880 unmap_domain_page(l3t);
881 #ifdef CONFIG_X86_PAE
882 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
883 return 0;
884 #else
885 if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
886 (l3e_get_flags(l3e) & disallowed_flags) )
887 return 0;
888 #endif
889 #endif
891 l2t = map_domain_page(mfn);
892 l2e = l2e_read_atomic(&l2t[l2_table_offset(addr)]);
893 mfn = l2e_get_pfn(l2e);
894 unmap_domain_page(l2t);
895 if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
896 (l2e_get_flags(l2e) & disallowed_flags) )
897 return 0;
898 if ( l2e_get_flags(l2e) & _PAGE_PSE )
899 {
900 l1e = l1e_empty(); /* define before use in debug tracing */
901 goto spurious;
902 }
904 l1t = map_domain_page(mfn);
905 l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
906 mfn = l1e_get_pfn(l1e);
907 unmap_domain_page(l1t);
908 if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
909 (l1e_get_flags(l1e) & disallowed_flags) )
910 return 0;
912 spurious:
913 dprintk(XENLOG_WARNING, "Spurious fault in domain %u:%u "
914 "at addr %lx, e/c %04x\n",
915 current->domain->domain_id, current->vcpu_id,
916 addr, regs->error_code);
917 #if CONFIG_PAGING_LEVELS >= 4
918 dprintk(XENLOG_WARNING, " l4e = %"PRIpte"\n", l4e_get_intpte(l4e));
919 #endif
920 #if CONFIG_PAGING_LEVELS >= 3
921 dprintk(XENLOG_WARNING, " l3e = %"PRIpte"\n", l3e_get_intpte(l3e));
922 #endif
923 dprintk(XENLOG_WARNING, " l2e = %"PRIpte"\n", l2e_get_intpte(l2e));
924 dprintk(XENLOG_WARNING, " l1e = %"PRIpte"\n", l1e_get_intpte(l1e));
925 #ifndef NDEBUG
926 show_registers(regs);
927 #endif
928 return 1;
929 }
931 static int spurious_page_fault(
932 unsigned long addr, struct cpu_user_regs *regs)
933 {
934 unsigned long flags;
935 int is_spurious;
937 /*
938 * Disabling interrupts prevents TLB flushing, and hence prevents
939 * page tables from becoming invalid under our feet during the walk.
940 */
941 local_irq_save(flags);
942 is_spurious = __spurious_page_fault(addr, regs);
943 local_irq_restore(flags);
945 return is_spurious;
946 }
948 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
949 {
950 struct vcpu *v = current;
951 struct domain *d = v->domain;
953 /* No fixups in interrupt context or when interrupts are disabled. */
954 if ( in_irq() || !(regs->eflags & X86_EFLAGS_IF) )
955 return 0;
957 if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
958 {
959 if ( paging_mode_external(d) && guest_mode(regs) )
960 {
961 int ret = paging_fault(addr, regs);
962 if ( ret == EXCRET_fault_fixed )
963 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
964 return ret;
965 }
966 if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
967 return handle_gdt_ldt_mapping_fault(
968 addr - GDT_LDT_VIRT_START, regs);
969 return 0;
970 }
972 if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
973 guest_kernel_mode(v, regs) &&
974 /* Do not check if access-protection fault since the page may
975 legitimately be not present in shadow page tables */
976 ((regs->error_code & PFEC_write_access) == PFEC_write_access) &&
977 ptwr_do_page_fault(v, addr, regs) )
978 return EXCRET_fault_fixed;
980 if ( paging_mode_enabled(d) )
981 {
982 int ret = paging_fault(addr, regs);
983 if ( ret == EXCRET_fault_fixed )
984 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
985 return ret;
986 }
988 return 0;
989 }
991 /*
992 * #PF error code:
993 * Bit 0: Protection violation (=1) ; Page not present (=0)
994 * Bit 1: Write access
995 * Bit 2: User mode (=1) ; Supervisor mode (=0)
996 * Bit 3: Reserved bit violation
997 * Bit 4: Instruction fetch
998 */
999 asmlinkage int do_page_fault(struct cpu_user_regs *regs)
1001 unsigned long addr, fixup;
1002 int rc;
1004 addr = read_cr2();
1006 DEBUGGER_trap_entry(TRAP_page_fault, regs);
1008 perfc_incr(page_faults);
1010 if ( unlikely((rc = fixup_page_fault(addr, regs)) != 0) )
1011 return rc;
1013 if ( unlikely(!guest_mode(regs)) )
1015 if ( spurious_page_fault(addr, regs) )
1016 return EXCRET_not_a_fault;
1018 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
1020 perfc_incr(copy_user_faults);
1021 regs->eip = fixup;
1022 return 0;
1025 DEBUGGER_trap_fatal(TRAP_page_fault, regs);
1027 show_execution_state(regs);
1028 show_page_walk(addr);
1029 panic("FATAL PAGE FAULT\n"
1030 "[error_code=%04x]\n"
1031 "Faulting linear address: %p\n",
1032 regs->error_code, _p(addr));
1035 propagate_page_fault(addr, regs->error_code);
1036 return 0;
1039 /*
1040 * Early handler to deal with spurious page faults. For example, consider a
1041 * routine that uses a mapping immediately after installing it (making it
1042 * present). The CPU may speculatively execute the memory access before
1043 * executing the PTE write. The instruction will then be marked to cause a
1044 * page fault when it is retired, despite the fact that the PTE is present and
1045 * correct at that point in time.
1046 */
1047 asmlinkage int do_early_page_fault(struct cpu_user_regs *regs)
1049 static int stuck;
1050 static unsigned long prev_eip, prev_cr2;
1051 unsigned long cr2 = read_cr2();
1053 BUG_ON(smp_processor_id() != 0);
1055 if ( (regs->eip != prev_eip) || (cr2 != prev_cr2) )
1057 prev_eip = regs->eip;
1058 prev_cr2 = cr2;
1059 stuck = 0;
1060 return EXCRET_not_a_fault;
1063 if ( stuck++ == 1000 )
1064 panic("Early fatal page fault at %04x:%p (cr2=%p, ec=%04x)\n",
1065 regs->cs, _p(regs->eip), _p(cr2), regs->error_code);
1067 return EXCRET_not_a_fault;
1070 long do_fpu_taskswitch(int set)
1072 struct vcpu *v = current;
1074 if ( set )
1076 v->arch.guest_context.ctrlreg[0] |= X86_CR0_TS;
1077 stts();
1079 else
1081 v->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
1082 if ( v->fpu_dirtied )
1083 clts();
1086 return 0;
1089 static int read_descriptor(unsigned int sel,
1090 const struct vcpu *v,
1091 const struct cpu_user_regs * regs,
1092 unsigned long *base,
1093 unsigned long *limit,
1094 unsigned int *ar,
1095 unsigned int vm86attr)
1097 struct desc_struct desc;
1099 if ( !vm86_mode(regs) )
1101 if ( sel < 4)
1102 desc.b = desc.a = 0;
1103 else if ( __get_user(desc,
1104 (const struct desc_struct *)(!(sel & 4)
1105 ? GDT_VIRT_START(v)
1106 : LDT_VIRT_START(v))
1107 + (sel >> 3)) )
1108 return 0;
1109 if ( !(vm86attr & _SEGMENT_CODE) )
1110 desc.b &= ~_SEGMENT_L;
1112 else
1114 desc.a = (sel << 20) | 0xffff;
1115 desc.b = vm86attr | (sel >> 12);
1118 *ar = desc.b & 0x00f0ff00;
1119 if ( !(desc.b & _SEGMENT_L) )
1121 *base = ((desc.a >> 16) + ((desc.b & 0xff) << 16) +
1122 (desc.b & 0xff000000));
1123 *limit = (desc.a & 0xffff) | (desc.b & 0x000f0000);
1124 if ( desc.b & _SEGMENT_G )
1125 *limit = ((*limit + 1) << 12) - 1;
1126 #ifndef NDEBUG
1127 if ( !vm86_mode(regs) && (sel > 3) )
1129 unsigned int a, l;
1130 unsigned char valid;
1132 asm volatile (
1133 "larl %2,%0 ; setz %1"
1134 : "=r" (a), "=rm" (valid) : "rm" (sel));
1135 BUG_ON(valid && ((a & 0x00f0ff00) != *ar));
1136 asm volatile (
1137 "lsll %2,%0 ; setz %1"
1138 : "=r" (l), "=rm" (valid) : "rm" (sel));
1139 BUG_ON(valid && (l != *limit));
1141 #endif
1143 else
1145 *base = 0UL;
1146 *limit = ~0UL;
1149 return 1;
1152 /* Has the guest requested sufficient permission for this I/O access? */
1153 static inline int guest_io_okay(
1154 unsigned int port, unsigned int bytes,
1155 struct vcpu *v, struct cpu_user_regs *regs)
1157 #if defined(__x86_64__)
1158 /* If in user mode, switch to kernel mode just to read I/O bitmap. */
1159 int user_mode = !(v->arch.flags & TF_kernel_mode);
1160 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
1161 #elif defined(__i386__)
1162 #define TOGGLE_MODE() ((void)0)
1163 #endif
1165 if ( !vm86_mode(regs) &&
1166 (v->arch.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
1167 return 1;
1169 if ( v->arch.iobmp_limit > (port + bytes) )
1171 union { uint8_t bytes[2]; uint16_t mask; } x;
1173 /*
1174 * Grab permission bytes from guest space. Inaccessible bytes are
1175 * read as 0xff (no access allowed).
1176 */
1177 TOGGLE_MODE();
1178 switch ( __copy_from_guest_offset(x.bytes, v->arch.iobmp,
1179 port>>3, 2) )
1181 default: x.bytes[0] = ~0;
1182 case 1: x.bytes[1] = ~0;
1183 case 0: break;
1185 TOGGLE_MODE();
1187 if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 )
1188 return 1;
1191 return 0;
1194 /* Has the administrator granted sufficient permission for this I/O access? */
1195 static inline int admin_io_okay(
1196 unsigned int port, unsigned int bytes,
1197 struct vcpu *v, struct cpu_user_regs *regs)
1199 return ioports_access_permitted(v->domain, port, port + bytes - 1);
1202 #define guest_inb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r)
1203 #define guest_inw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r)
1204 #define guest_inl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r)
1205 #define guest_outb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r)
1206 #define guest_outw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r)
1207 #define guest_outl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r)
1209 /* I/O emulation support. Helper routines for, and type of, the stack stub.*/
1210 void host_to_guest_gpr_switch(struct cpu_user_regs *)
1211 __attribute__((__regparm__(1)));
1212 unsigned long guest_to_host_gpr_switch(unsigned long)
1213 __attribute__((__regparm__(1)));
1215 void (*pv_post_outb_hook)(unsigned int port, u8 value);
1217 /* Instruction fetch with error handling. */
1218 #define insn_fetch(type, base, eip, limit) \
1219 ({ unsigned long _rc, _ptr = (base) + (eip); \
1220 type _x; \
1221 if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) ) \
1222 goto fail; \
1223 if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 ) \
1224 { \
1225 propagate_page_fault(_ptr + sizeof(_x) - _rc, 0); \
1226 return EXCRET_fault_fixed; \
1227 } \
1228 (eip) += sizeof(_x); _x; })
1230 #if defined(CONFIG_X86_32)
1231 # define read_sreg(regs, sr) ((regs)->sr)
1232 #elif defined(CONFIG_X86_64)
1233 # define read_sreg(regs, sr) read_segment_register(sr)
1234 #endif
1236 static int emulate_privileged_op(struct cpu_user_regs *regs)
1238 struct vcpu *v = current;
1239 unsigned long *reg, eip = regs->eip, res;
1240 u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
1241 enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
1242 unsigned int port, i, data_sel, ar, data, rc;
1243 unsigned int op_bytes, op_default, ad_bytes, ad_default;
1244 #define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
1245 ? regs->reg \
1246 : ad_bytes == 4 \
1247 ? (u32)regs->reg \
1248 : (u16)regs->reg)
1249 #define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
1250 ? regs->reg = (val) \
1251 : ad_bytes == 4 \
1252 ? (*(u32 *)&regs->reg = (val)) \
1253 : (*(u16 *)&regs->reg = (val)))
1254 unsigned long code_base, code_limit;
1255 char io_emul_stub[16];
1256 void (*io_emul)(struct cpu_user_regs *) __attribute__((__regparm__(1)));
1257 u32 l, h, eax, edx;
1259 if ( !read_descriptor(regs->cs, v, regs,
1260 &code_base, &code_limit, &ar,
1261 _SEGMENT_CODE|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1262 goto fail;
1263 op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
1264 ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
1265 if ( !(ar & _SEGMENT_S) ||
1266 !(ar & _SEGMENT_P) ||
1267 !(ar & _SEGMENT_CODE) )
1268 goto fail;
1270 /* emulating only opcodes not allowing SS to be default */
1271 data_sel = read_sreg(regs, ds);
1273 /* Legacy prefixes. */
1274 for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
1276 switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
1278 case 0x66: /* operand-size override */
1279 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
1280 continue;
1281 case 0x67: /* address-size override */
1282 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
1283 continue;
1284 case 0x2e: /* CS override */
1285 data_sel = regs->cs;
1286 continue;
1287 case 0x3e: /* DS override */
1288 data_sel = read_sreg(regs, ds);
1289 continue;
1290 case 0x26: /* ES override */
1291 data_sel = read_sreg(regs, es);
1292 continue;
1293 case 0x64: /* FS override */
1294 data_sel = read_sreg(regs, fs);
1295 lm_ovr = lm_seg_fs;
1296 continue;
1297 case 0x65: /* GS override */
1298 data_sel = read_sreg(regs, gs);
1299 lm_ovr = lm_seg_gs;
1300 continue;
1301 case 0x36: /* SS override */
1302 data_sel = regs->ss;
1303 continue;
1304 case 0xf0: /* LOCK */
1305 lock = 1;
1306 continue;
1307 case 0xf2: /* REPNE/REPNZ */
1308 case 0xf3: /* REP/REPE/REPZ */
1309 rep_prefix = 1;
1310 continue;
1311 default:
1312 if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
1314 rex = opcode;
1315 continue;
1317 break;
1319 break;
1322 /* REX prefix. */
1323 if ( rex & 8 ) /* REX.W */
1324 op_bytes = 4; /* emulating only opcodes not supporting 64-bit operands */
1325 modrm_reg = (rex & 4) << 1; /* REX.R */
1326 /* REX.X does not need to be decoded. */
1327 modrm_rm = (rex & 1) << 3; /* REX.B */
1329 if ( opcode == 0x0f )
1330 goto twobyte_opcode;
1332 if ( lock )
1333 goto fail;
1335 /* Input/Output String instructions. */
1336 if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
1338 unsigned long data_base, data_limit;
1340 if ( rep_prefix && (rd_ad(ecx) == 0) )
1341 goto done;
1343 if ( !(opcode & 2) )
1345 data_sel = read_sreg(regs, es);
1346 lm_ovr = lm_seg_none;
1349 if ( !(ar & _SEGMENT_L) )
1351 if ( !read_descriptor(data_sel, v, regs,
1352 &data_base, &data_limit, &ar,
1353 _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1354 goto fail;
1355 if ( !(ar & _SEGMENT_S) ||
1356 !(ar & _SEGMENT_P) ||
1357 (opcode & 2 ?
1358 (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
1359 (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
1360 goto fail;
1362 #ifdef CONFIG_X86_64
1363 else
1365 if ( lm_ovr == lm_seg_none || data_sel < 4 )
1367 switch ( lm_ovr )
1369 case lm_seg_none:
1370 data_base = 0UL;
1371 break;
1372 case lm_seg_fs:
1373 data_base = v->arch.guest_context.fs_base;
1374 break;
1375 case lm_seg_gs:
1376 if ( guest_kernel_mode(v, regs) )
1377 data_base = v->arch.guest_context.gs_base_kernel;
1378 else
1379 data_base = v->arch.guest_context.gs_base_user;
1380 break;
1383 else
1384 read_descriptor(data_sel, v, regs,
1385 &data_base, &data_limit, &ar,
1386 0);
1387 data_limit = ~0UL;
1388 ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
1390 #endif
1392 continue_io_string:
1393 switch ( opcode )
1395 case 0x6c: /* INSB */
1396 op_bytes = 1;
1397 case 0x6d: /* INSW/INSL */
1398 if ( data_limit < op_bytes - 1 ||
1399 rd_ad(edi) > data_limit - (op_bytes - 1) ||
1400 !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
1401 goto fail;
1402 port = (u16)regs->edx;
1403 switch ( op_bytes )
1405 case 1:
1406 /* emulate PIT counter 2 */
1407 data = (u8)(guest_inb_okay(port, v, regs) ? inb(port) :
1408 ((port == 0x42 || port == 0x43 || port == 0x61) ?
1409 pv_pit_handler(port, 0, 0) : ~0));
1410 break;
1411 case 2:
1412 data = (u16)(guest_inw_okay(port, v, regs) ? inw(port) : ~0);
1413 break;
1414 case 4:
1415 data = (u32)(guest_inl_okay(port, v, regs) ? inl(port) : ~0);
1416 break;
1418 if ( (rc = copy_to_user((void *)data_base + rd_ad(edi), &data, op_bytes)) != 0 )
1420 propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc,
1421 PFEC_write_access);
1422 return EXCRET_fault_fixed;
1424 wr_ad(edi, regs->edi + (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes));
1425 break;
1427 case 0x6e: /* OUTSB */
1428 op_bytes = 1;
1429 case 0x6f: /* OUTSW/OUTSL */
1430 if ( data_limit < op_bytes - 1 ||
1431 rd_ad(esi) > data_limit - (op_bytes - 1) ||
1432 !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
1433 goto fail;
1434 rc = copy_from_user(&data, (void *)data_base + rd_ad(esi), op_bytes);
1435 if ( rc != 0 )
1437 propagate_page_fault(data_base + rd_ad(esi) + op_bytes - rc, 0);
1438 return EXCRET_fault_fixed;
1440 port = (u16)regs->edx;
1441 switch ( op_bytes )
1443 case 1:
1444 if ( guest_outb_okay(port, v, regs) )
1446 outb((u8)data, port);
1447 if ( pv_post_outb_hook )
1448 pv_post_outb_hook(port, data);
1450 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1451 pv_pit_handler(port, data, 1);
1452 break;
1453 case 2:
1454 if ( guest_outw_okay(port, v, regs) )
1455 outw((u16)data, port);
1456 break;
1457 case 4:
1458 if ( guest_outl_okay(port, v, regs) )
1459 outl((u32)data, port);
1460 break;
1462 wr_ad(esi, regs->esi + (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes));
1463 break;
1466 if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
1468 if ( !hypercall_preempt_check() )
1469 goto continue_io_string;
1470 eip = regs->eip;
1473 goto done;
1476 /*
1477 * Very likely to be an I/O instruction (IN/OUT).
1478 * Build an on-stack stub to execute the instruction with full guest
1479 * GPR context. This is needed for some systems which (ab)use IN/OUT
1480 * to communicate with BIOS code in system-management mode.
1481 */
1482 #ifdef __x86_64__
1483 /* movq $host_to_guest_gpr_switch,%rcx */
1484 io_emul_stub[0] = 0x48;
1485 io_emul_stub[1] = 0xb9;
1486 *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
1487 /* callq *%rcx */
1488 io_emul_stub[10] = 0xff;
1489 io_emul_stub[11] = 0xd1;
1490 #else
1491 /* call host_to_guest_gpr_switch */
1492 io_emul_stub[0] = 0xe8;
1493 *(s32 *)&io_emul_stub[1] =
1494 (char *)host_to_guest_gpr_switch - &io_emul_stub[5];
1495 /* 7 x nop */
1496 memset(&io_emul_stub[5], 0x90, 7);
1497 #endif
1498 /* data16 or nop */
1499 io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66;
1500 /* <io-access opcode> */
1501 io_emul_stub[13] = opcode;
1502 /* imm8 or nop */
1503 io_emul_stub[14] = 0x90;
1504 /* ret (jumps to guest_to_host_gpr_switch) */
1505 io_emul_stub[15] = 0xc3;
1507 /* Handy function-typed pointer to the stub. */
1508 io_emul = (void *)io_emul_stub;
1510 /* I/O Port and Interrupt Flag instructions. */
1511 switch ( opcode )
1513 case 0xe4: /* IN imm8,%al */
1514 op_bytes = 1;
1515 case 0xe5: /* IN imm8,%eax */
1516 port = insn_fetch(u8, code_base, eip, code_limit);
1517 io_emul_stub[14] = port; /* imm8 */
1518 exec_in:
1519 if ( !guest_io_okay(port, op_bytes, v, regs) )
1520 goto fail;
1521 switch ( op_bytes )
1523 case 1:
1524 if ( guest_inb_okay(port, v, regs) )
1525 io_emul(regs);
1526 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1528 regs->eax &= ~0xffUL;
1529 regs->eax |= pv_pit_handler(port, 0, 0);
1531 else
1532 regs->eax |= (u8)~0;
1533 break;
1534 case 2:
1535 if ( guest_inw_okay(port, v, regs) )
1536 io_emul(regs);
1537 else
1538 regs->eax |= (u16)~0;
1539 break;
1540 case 4:
1541 if ( guest_inl_okay(port, v, regs) )
1542 io_emul(regs);
1543 else
1544 regs->eax = (u32)~0;
1545 break;
1547 goto done;
1549 case 0xec: /* IN %dx,%al */
1550 op_bytes = 1;
1551 case 0xed: /* IN %dx,%eax */
1552 port = (u16)regs->edx;
1553 goto exec_in;
1555 case 0xe6: /* OUT %al,imm8 */
1556 op_bytes = 1;
1557 case 0xe7: /* OUT %eax,imm8 */
1558 port = insn_fetch(u8, code_base, eip, code_limit);
1559 io_emul_stub[14] = port; /* imm8 */
1560 exec_out:
1561 if ( !guest_io_okay(port, op_bytes, v, regs) )
1562 goto fail;
1563 switch ( op_bytes )
1565 case 1:
1566 if ( guest_outb_okay(port, v, regs) )
1568 io_emul(regs);
1569 if ( pv_post_outb_hook )
1570 pv_post_outb_hook(port, regs->eax);
1572 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1573 pv_pit_handler(port, regs->eax, 1);
1574 break;
1575 case 2:
1576 if ( guest_outw_okay(port, v, regs) )
1577 io_emul(regs);
1578 break;
1579 case 4:
1580 if ( guest_outl_okay(port, v, regs) )
1581 io_emul(regs);
1582 break;
1584 goto done;
1586 case 0xee: /* OUT %al,%dx */
1587 op_bytes = 1;
1588 case 0xef: /* OUT %eax,%dx */
1589 port = (u16)regs->edx;
1590 goto exec_out;
1592 case 0xfa: /* CLI */
1593 case 0xfb: /* STI */
1594 if ( v->arch.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
1595 goto fail;
1596 /*
1597 * This is just too dangerous to allow, in my opinion. Consider if the
1598 * caller then tries to reenable interrupts using POPF: we can't trap
1599 * that and we'll end up with hard-to-debug lockups. Fast & loose will
1600 * do for us. :-)
1601 */
1602 /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
1603 goto done;
1606 /* No decode of this single-byte opcode. */
1607 goto fail;
1609 twobyte_opcode:
1610 /* Two-byte opcodes only emulated from guest kernel. */
1611 if ( !guest_kernel_mode(v, regs) )
1612 goto fail;
1614 /* Privileged (ring 0) instructions. */
1615 opcode = insn_fetch(u8, code_base, eip, code_limit);
1616 if ( lock && (opcode & ~3) != 0x20 )
1617 goto fail;
1618 switch ( opcode )
1620 case 0x06: /* CLTS */
1621 (void)do_fpu_taskswitch(0);
1622 break;
1624 case 0x09: /* WBINVD */
1625 /* Ignore the instruction if unprivileged. */
1626 if ( !cache_flush_permitted(v->domain) )
1627 /* Non-physdev domain attempted WBINVD; ignore for now since
1628 newer linux uses this in some start-of-day timing loops */
1630 else
1631 wbinvd();
1632 break;
1634 case 0x20: /* MOV CR?,<reg> */
1635 opcode = insn_fetch(u8, code_base, eip, code_limit);
1636 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1637 modrm_rm |= (opcode >> 0) & 7;
1638 reg = decode_register(modrm_rm, regs, 0);
1639 switch ( modrm_reg )
1641 case 0: /* Read CR0 */
1642 *reg = (read_cr0() & ~X86_CR0_TS) |
1643 v->arch.guest_context.ctrlreg[0];
1644 break;
1646 case 2: /* Read CR2 */
1647 *reg = v->arch.guest_context.ctrlreg[2];
1648 break;
1650 case 3: /* Read CR3 */
1651 if ( !is_pv_32on64_vcpu(v) )
1652 *reg = xen_pfn_to_cr3(mfn_to_gmfn(
1653 v->domain, pagetable_get_pfn(v->arch.guest_table)));
1654 #ifdef CONFIG_COMPAT
1655 else
1656 *reg = compat_pfn_to_cr3(mfn_to_gmfn(
1657 v->domain, l4e_get_pfn(*(l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)))));
1658 #endif
1659 break;
1661 case 4: /* Read CR4 */
1662 /*
1663 * Guests can read CR4 to see what features Xen has enabled. We
1664 * therefore lie about PGE & PSE as they are unavailable to guests.
1665 */
1666 *reg = read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE);
1667 break;
1669 default:
1670 goto fail;
1672 break;
1674 case 0x21: /* MOV DR?,<reg> */
1675 opcode = insn_fetch(u8, code_base, eip, code_limit);
1676 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1677 modrm_rm |= (opcode >> 0) & 7;
1678 reg = decode_register(modrm_rm, regs, 0);
1679 if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
1680 goto fail;
1681 *reg = res;
1682 break;
1684 case 0x22: /* MOV <reg>,CR? */
1685 opcode = insn_fetch(u8, code_base, eip, code_limit);
1686 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1687 modrm_rm |= (opcode >> 0) & 7;
1688 reg = decode_register(modrm_rm, regs, 0);
1689 switch ( modrm_reg )
1691 case 0: /* Write CR0 */
1692 if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
1694 gdprintk(XENLOG_WARNING,
1695 "Attempt to change unmodifiable CR0 flags.\n");
1696 goto fail;
1698 (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
1699 break;
1701 case 2: /* Write CR2 */
1702 v->arch.guest_context.ctrlreg[2] = *reg;
1703 arch_set_cr2(v, *reg);
1704 break;
1706 case 3: /* Write CR3 */
1707 LOCK_BIGLOCK(v->domain);
1708 if ( !is_pv_32on64_vcpu(v) )
1709 rc = new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg)));
1710 #ifdef CONFIG_COMPAT
1711 else
1712 rc = new_guest_cr3(gmfn_to_mfn(v->domain, compat_cr3_to_pfn(*reg)));
1713 #endif
1714 UNLOCK_BIGLOCK(v->domain);
1715 if ( rc == 0 ) /* not okay */
1716 goto fail;
1717 break;
1719 case 4: /* Write CR4 */
1720 if ( *reg != (read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE)) )
1721 gdprintk(XENLOG_WARNING,
1722 "Attempt to change CR4 flags %08lx -> %08lx\n",
1723 read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE), *reg);
1724 break;
1726 default:
1727 goto fail;
1729 break;
1731 case 0x23: /* MOV <reg>,DR? */
1732 opcode = insn_fetch(u8, code_base, eip, code_limit);
1733 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1734 modrm_rm |= (opcode >> 0) & 7;
1735 reg = decode_register(modrm_rm, regs, 0);
1736 if ( do_set_debugreg(modrm_reg, *reg) != 0 )
1737 goto fail;
1738 break;
1740 case 0x30: /* WRMSR */
1741 eax = regs->eax;
1742 edx = regs->edx;
1743 res = ((u64)edx << 32) | eax;
1744 switch ( regs->ecx )
1746 #ifdef CONFIG_X86_64
1747 case MSR_FS_BASE:
1748 if ( is_pv_32on64_vcpu(v) )
1749 goto fail;
1750 if ( wrmsr_safe(MSR_FS_BASE, eax, edx) )
1751 goto fail;
1752 v->arch.guest_context.fs_base = res;
1753 break;
1754 case MSR_GS_BASE:
1755 if ( is_pv_32on64_vcpu(v) )
1756 goto fail;
1757 if ( wrmsr_safe(MSR_GS_BASE, eax, edx) )
1758 goto fail;
1759 v->arch.guest_context.gs_base_kernel = res;
1760 break;
1761 case MSR_SHADOW_GS_BASE:
1762 if ( is_pv_32on64_vcpu(v) )
1763 goto fail;
1764 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, eax, edx) )
1765 goto fail;
1766 v->arch.guest_context.gs_base_user = res;
1767 break;
1768 #endif
1769 case MSR_K7_FID_VID_STATUS:
1770 case MSR_K7_FID_VID_CTL:
1771 if ( (cpufreq_controller != FREQCTL_dom0_kernel) ||
1772 (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) ||
1773 wrmsr_safe(regs->ecx, eax, edx) )
1774 goto fail;
1775 break;
1776 default:
1777 if ( wrmsr_hypervisor_regs(regs->ecx, eax, edx) )
1778 break;
1779 if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
1780 (eax != l) || (edx != h) )
1781 gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from "
1782 "%08x:%08x to %08x:%08x.\n",
1783 _p(regs->ecx), h, l, edx, eax);
1784 break;
1786 break;
1788 case 0x32: /* RDMSR */
1789 switch ( regs->ecx )
1791 #ifdef CONFIG_X86_64
1792 case MSR_FS_BASE:
1793 if ( is_pv_32on64_vcpu(v) )
1794 goto fail;
1795 regs->eax = v->arch.guest_context.fs_base & 0xFFFFFFFFUL;
1796 regs->edx = v->arch.guest_context.fs_base >> 32;
1797 break;
1798 case MSR_GS_BASE:
1799 if ( is_pv_32on64_vcpu(v) )
1800 goto fail;
1801 regs->eax = v->arch.guest_context.gs_base_kernel & 0xFFFFFFFFUL;
1802 regs->edx = v->arch.guest_context.gs_base_kernel >> 32;
1803 break;
1804 case MSR_SHADOW_GS_BASE:
1805 if ( is_pv_32on64_vcpu(v) )
1806 goto fail;
1807 regs->eax = v->arch.guest_context.gs_base_user & 0xFFFFFFFFUL;
1808 regs->edx = v->arch.guest_context.gs_base_user >> 32;
1809 break;
1810 #endif
1811 case MSR_K7_FID_VID_CTL:
1812 case MSR_K7_FID_VID_STATUS:
1813 if ( (cpufreq_controller != FREQCTL_dom0_kernel) ||
1814 (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) ||
1815 rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1816 goto fail;
1817 break;
1818 case MSR_EFER:
1819 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1820 goto fail;
1821 break;
1822 default:
1823 if ( rdmsr_hypervisor_regs(regs->ecx, &l, &h) )
1825 regs->eax = l;
1826 regs->edx = h;
1827 break;
1829 /* Everyone can read the MSR space. */
1830 /* gdprintk(XENLOG_WARNING,"Domain attempted RDMSR %p.\n",
1831 _p(regs->ecx));*/
1832 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1833 goto fail;
1834 break;
1836 break;
1838 default:
1839 goto fail;
1842 #undef wr_ad
1843 #undef rd_ad
1845 done:
1846 regs->eip = eip;
1847 regs->eflags &= ~X86_EFLAGS_RF;
1848 return EXCRET_fault_fixed;
1850 fail:
1851 return 0;
1854 asmlinkage int do_general_protection(struct cpu_user_regs *regs)
1856 struct vcpu *v = current;
1857 unsigned long fixup;
1859 DEBUGGER_trap_entry(TRAP_gp_fault, regs);
1861 if ( regs->error_code & 1 )
1862 goto hardware_gp;
1864 if ( !guest_mode(regs) )
1865 goto gp_in_kernel;
1867 /*
1868 * Cunning trick to allow arbitrary "INT n" handling.
1870 * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
1871 * instruction from trapping to the appropriate vector, when that might not
1872 * be expected by Xen or the guest OS. For example, that entry might be for
1873 * a fault handler (unlike traps, faults don't increment EIP), or might
1874 * expect an error code on the stack (which a software trap never
1875 * provides), or might be a hardware interrupt handler that doesn't like
1876 * being called spuriously.
1878 * Instead, a GPF occurs with the faulting IDT vector in the error code.
1879 * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
1880 * clear to indicate that it's a software fault, not hardware.
1882 * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
1883 * okay because they can only be triggered by an explicit DPL-checked
1884 * instruction. The DPL specified by the guest OS for these vectors is NOT
1885 * CHECKED!!
1886 */
1887 if ( (regs->error_code & 3) == 2 )
1889 /* This fault must be due to <INT n> instruction. */
1890 const struct trap_info *ti;
1891 unsigned char vector = regs->error_code >> 3;
1892 ti = &v->arch.guest_context.trap_ctxt[vector];
1893 if ( permit_softint(TI_GET_DPL(ti), v, regs) )
1895 regs->eip += 2;
1896 return do_guest_trap(vector, regs, 0);
1900 /* Emulate some simple privileged and I/O instructions. */
1901 if ( (regs->error_code == 0) &&
1902 emulate_privileged_op(regs) )
1904 trace_trap_one_addr(TRC_PV_EMULATE_PRIVOP, regs->eip);
1905 return 0;
1908 #if defined(__i386__)
1909 if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
1910 (regs->error_code == 0) &&
1911 gpf_emulate_4gb(regs) )
1913 TRACE_1D(TRC_PV_EMULATE_4GB, regs->eip);
1914 return 0;
1916 #endif
1918 /* Pass on GPF as is. */
1919 return do_guest_trap(TRAP_gp_fault, regs, 1);
1921 gp_in_kernel:
1923 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
1925 dprintk(XENLOG_INFO, "GPF (%04x): %p -> %p\n",
1926 regs->error_code, _p(regs->eip), _p(fixup));
1927 regs->eip = fixup;
1928 return 0;
1931 DEBUGGER_trap_fatal(TRAP_gp_fault, regs);
1933 hardware_gp:
1934 show_execution_state(regs);
1935 panic("GENERAL PROTECTION FAULT\n[error_code=%04x]\n", regs->error_code);
1936 return 0;
1939 static void nmi_softirq(void)
1941 /* Only used to defer wakeup of dom0,vcpu0 to a safe (non-NMI) context. */
1942 vcpu_kick(dom0->vcpu[0]);
1945 static void nmi_dom0_report(unsigned int reason_idx)
1947 struct domain *d;
1948 struct vcpu *v;
1950 if ( ((d = dom0) == NULL) || ((v = d->vcpu[0]) == NULL) )
1951 return;
1953 set_bit(reason_idx, nmi_reason(d));
1955 if ( !test_and_set_bool(v->nmi_pending) )
1956 raise_softirq(NMI_SOFTIRQ); /* not safe to wake up a vcpu here */
1959 asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
1961 switch ( opt_nmi[0] )
1963 case 'd': /* 'dom0' */
1964 nmi_dom0_report(_XEN_NMIREASON_parity_error);
1965 case 'i': /* 'ignore' */
1966 break;
1967 default: /* 'fatal' */
1968 console_force_unlock();
1969 printk("\n\nNMI - MEMORY ERROR\n");
1970 fatal_trap(TRAP_nmi, regs);
1973 outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable parity check */
1974 mdelay(1);
1975 outb((inb(0x61) & 0x0b) | 0x00, 0x61); /* enable parity check */
1978 asmlinkage void io_check_error(struct cpu_user_regs *regs)
1980 switch ( opt_nmi[0] )
1982 case 'd': /* 'dom0' */
1983 nmi_dom0_report(_XEN_NMIREASON_io_error);
1984 case 'i': /* 'ignore' */
1985 break;
1986 default: /* 'fatal' */
1987 console_force_unlock();
1988 printk("\n\nNMI - I/O ERROR\n");
1989 fatal_trap(TRAP_nmi, regs);
1992 outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
1993 mdelay(1);
1994 outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
1997 static void unknown_nmi_error(unsigned char reason)
1999 switch ( opt_nmi[0] )
2001 case 'd': /* 'dom0' */
2002 nmi_dom0_report(_XEN_NMIREASON_unknown);
2003 case 'i': /* 'ignore' */
2004 break;
2005 default: /* 'fatal' */
2006 printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
2007 printk("Dazed and confused, but trying to continue\n");
2008 printk("Do you have a strange power saving mode enabled?\n");
2009 kexec_crash();
2013 static int dummy_nmi_callback(struct cpu_user_regs *regs, int cpu)
2015 return 0;
2018 static nmi_callback_t nmi_callback = dummy_nmi_callback;
2020 asmlinkage void do_nmi(struct cpu_user_regs *regs)
2022 unsigned int cpu = smp_processor_id();
2023 unsigned char reason;
2025 ++nmi_count(cpu);
2027 if ( nmi_callback(regs, cpu) )
2028 return;
2030 if ( nmi_watchdog )
2031 nmi_watchdog_tick(regs);
2033 /* Only the BSP gets external NMIs from the system. */
2034 if ( cpu == 0 )
2036 reason = inb(0x61);
2037 if ( reason & 0x80 )
2038 mem_parity_error(regs);
2039 else if ( reason & 0x40 )
2040 io_check_error(regs);
2041 else if ( !nmi_watchdog )
2042 unknown_nmi_error((unsigned char)(reason&0xff));
2046 void set_nmi_callback(nmi_callback_t callback)
2048 nmi_callback = callback;
2051 void unset_nmi_callback(void)
2053 nmi_callback = dummy_nmi_callback;
2056 asmlinkage int do_device_not_available(struct cpu_user_regs *regs)
2058 BUG_ON(!guest_mode(regs));
2060 setup_fpu(current);
2062 if ( current->arch.guest_context.ctrlreg[0] & X86_CR0_TS )
2064 do_guest_trap(TRAP_no_device, regs, 0);
2065 current->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
2067 else
2068 TRACE_0D(TRC_PV_MATH_STATE_RESTORE);
2070 return EXCRET_fault_fixed;
2073 asmlinkage int do_debug(struct cpu_user_regs *regs)
2075 unsigned long condition;
2076 struct vcpu *v = current;
2078 asm volatile ( "mov %%db6,%0" : "=r" (condition) );
2080 /* Mask out spurious debug traps due to lazy DR7 setting */
2081 if ( (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) &&
2082 (v->arch.guest_context.debugreg[7] == 0) )
2084 asm volatile ( "mov %0,%%db7" : : "r" (0UL) );
2085 goto out;
2088 DEBUGGER_trap_entry(TRAP_debug, regs);
2090 if ( !guest_mode(regs) )
2092 /* Clear TF just for absolute sanity. */
2093 regs->eflags &= ~EF_TF;
2094 /*
2095 * We ignore watchpoints when they trigger within Xen. This may happen
2096 * when a buffer is passed to us which previously had a watchpoint set
2097 * on it. No need to bump EIP; the only faulting trap is an instruction
2098 * breakpoint, which can't happen to us.
2099 */
2100 goto out;
2103 /* Save debug status register where guest OS can peek at it */
2104 v->arch.guest_context.debugreg[6] = condition;
2106 ler_enable();
2108 return do_guest_trap(TRAP_debug, regs, 0);
2110 out:
2111 ler_enable();
2112 return EXCRET_not_a_fault;
2115 asmlinkage int do_spurious_interrupt_bug(struct cpu_user_regs *regs)
2117 return EXCRET_not_a_fault;
2120 void set_intr_gate(unsigned int n, void *addr)
2122 int i;
2123 /* Keep secondary tables in sync with IRQ updates. */
2124 for ( i = 1; i < NR_CPUS; i++ )
2125 if ( idt_tables[i] != NULL )
2126 _set_gate(&idt_tables[i][n], 14, 0, addr);
2127 _set_gate(&idt_table[n], 14, 0, addr);
2130 void set_system_gate(unsigned int n, void *addr)
2132 _set_gate(&idt_table[n], 14, 3, addr);
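/*
 * Both helpers install interrupt gates (type 14), so entry always occurs
 * with interrupts disabled; they differ only in DPL: 0 for set_intr_gate()
 * (hypervisor-only vectors) versus 3 for set_system_gate(), whose vectors
 * may be raised directly by guest code with an `int' instruction.
 */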
2135 void set_task_gate(unsigned int n, unsigned int sel)
2137 idt_table[n].a = sel << 16;
2138 idt_table[n].b = 0x8500;
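/*
 * set_task_gate() encodes the descriptor by hand: the TSS selector goes in
 * bits 16-31 of the low word, and 0x8500 in the high word sets P=1, DPL=0,
 * type 0x5 (task gate).  The 32-bit subarch uses this for the double-fault
 * handler's task switch.
 */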
2141 void set_tss_desc(unsigned int n, void *addr)
2143 _set_tssldt_desc(
2144 gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
2145 (unsigned long)addr,
2146 offsetof(struct tss_struct, __cacheline_filler) - 1,
2147 9);
2148 #ifdef CONFIG_COMPAT
2149 _set_tssldt_desc(
2150 compat_gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
2151 (unsigned long)addr,
2152 offsetof(struct tss_struct, __cacheline_filler) - 1,
2153 11);
2154 #endif
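/*
 * Descriptor type 9 is an available TSS; the CONFIG_COMPAT copy written
 * into compat_gdt_table (the GDT in use while 32-bit PV guests run) is
 * pre-marked busy (type 11), presumably to mirror the state the native
 * entry is left in once `ltr' has loaded it.
 */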
2157 void __devinit percpu_traps_init(void)
2159 subarch_percpu_traps_init();
2161 if ( !opt_ler )
2162 return;
2164 switch ( boot_cpu_data.x86_vendor )
2166 case X86_VENDOR_INTEL:
2167 switch ( boot_cpu_data.x86 )
2169 case 6:
2170 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
2171 break;
2172 case 15:
2173 this_cpu(ler_msr) = MSR_P4_LER_FROM_LIP;
2174 break;
2176 break;
2177 case X86_VENDOR_AMD:
2178 switch ( boot_cpu_data.x86 )
2180 case 6:
2181 case 15:
2182 case 16:
2183 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
2184 break;
2186 break;
2189 ler_enable();
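/*
 * The "ler" boot option selects a per-CPU last-exception-record MSR:
 * MSR_IA32_LASTINTFROMIP on P6-family Intel and AMD families 6/15/16, or
 * the P4-specific MSR_P4_LER_FROM_LIP.  ler_enable() then sets the
 * record-enable bit in DEBUGCTLMSR so the crash/register-dump paths can
 * report where the most recent exception was taken from.
 */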
2192 void __init trap_init(void)
2194 /*
2195 * Note that interrupt gates are always used, rather than trap gates. We
2196 * must have interrupts disabled until DS/ES/FS/GS are saved because the
2197 * first activation must have the "bad" value(s) for these registers and
2198 * we may lose them if another activation is installed before they are
2199 * saved. The page-fault handler also needs interrupts disabled until %cr2
2200 * has been read and saved on the stack.
2201 */
2202 set_intr_gate(TRAP_divide_error,&divide_error);
2203 set_intr_gate(TRAP_debug,&debug);
2204 set_intr_gate(TRAP_nmi,&nmi);
2205 set_system_gate(TRAP_int3,&int3); /* usable from all privileges */
2206 set_system_gate(TRAP_overflow,&overflow); /* usable from all privileges */
2207 set_intr_gate(TRAP_bounds,&bounds);
2208 set_intr_gate(TRAP_invalid_op,&invalid_op);
2209 set_intr_gate(TRAP_no_device,&device_not_available);
2210 set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
2211 set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
2212 set_intr_gate(TRAP_no_segment,&segment_not_present);
2213 set_intr_gate(TRAP_stack_error,&stack_segment);
2214 set_intr_gate(TRAP_gp_fault,&general_protection);
2215 set_intr_gate(TRAP_page_fault,&page_fault);
2216 set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
2217 set_intr_gate(TRAP_copro_error,&coprocessor_error);
2218 set_intr_gate(TRAP_alignment_check,&alignment_check);
2219 set_intr_gate(TRAP_machine_check,&machine_check);
2220 set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
2222 /* CPU0 uses the master IDT. */
2223 idt_tables[0] = idt_table;
2225 percpu_traps_init();
2227 cpu_init();
2229 open_softirq(NMI_SOFTIRQ, nmi_softirq);
2232 long register_guest_nmi_callback(unsigned long address)
2234 struct vcpu *v = current;
2235 struct domain *d = current->domain;
2236 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
2238 t->vector = TRAP_nmi;
2239 t->flags = 0;
2240 t->cs = !IS_COMPAT(d) ? FLAT_KERNEL_CS : FLAT_COMPAT_KERNEL_CS;
2241 t->address = address;
2242 TI_SET_IF(t, 1);
2244 /*
2245 * If no handler was registered we can 'lose the NMI edge'. Re-assert it
2246 * now.
2247 */
2248 if ( (v->vcpu_id == 0) && (arch_get_nmi_reason(d) != 0) )
2249 v->nmi_pending = 1;
2251 return 0;
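/*
 * This path (used by the callback-registration hypercalls) always installs
 * the flat kernel code segment, or its compat equivalent, and forces the
 * interrupts-disabled flag.  A guest that needs a different %cs for its NMI
 * handler can instead install vector TRAP_nmi through do_set_trap_table()
 * below, which only insists (via the TI_GET_IF check) that the entry keep
 * event delivery masked.
 */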
2254 long unregister_guest_nmi_callback(void)
2256 struct vcpu *v = current;
2257 struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];
2259 memset(t, 0, sizeof(*t));
2261 return 0;
2264 long do_set_trap_table(XEN_GUEST_HANDLE(trap_info_t) traps)
2266 struct trap_info cur;
2267 struct trap_info *dst = current->arch.guest_context.trap_ctxt;
2268 long rc = 0;
2270 /* If no table is presented then clear the entire virtual IDT. */
2271 if ( guest_handle_is_null(traps) )
2273 memset(dst, 0, 256 * sizeof(*dst));
2274 init_int80_direct_trap(current);
2275 return 0;
2278 for ( ; ; )
2280 if ( hypercall_preempt_check() )
2282 rc = hypercall_create_continuation(
2283 __HYPERVISOR_set_trap_table, "h", traps);
2284 break;
2287 if ( copy_from_guest(&cur, traps, 1) )
2289 rc = -EFAULT;
2290 break;
2293 if ( cur.address == 0 )
2294 break;
2296 if ( (cur.vector == TRAP_nmi) && !TI_GET_IF(&cur) )
2298 rc = -EINVAL;
2299 break;
2302 fixup_guest_code_selector(current->domain, cur.cs);
2304 memcpy(&dst[cur.vector], &cur, sizeof(cur));
2306 if ( cur.vector == 0x80 )
2307 init_int80_direct_trap(current);
2309 guest_handle_add_offset(traps, 1);
2312 return rc;
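/*
 * Guest-side sketch (hypothetical PV kernel code, using the usual
 * HYPERVISOR_set_trap_table() wrapper from the public headers; the hyp_*
 * handler symbols are placeholders).  The table is an array of trap_info_t
 * terminated by an entry whose address is zero, matching the loop's exit
 * condition above.
 */
static struct trap_info example_traps[] = {
    /* { vector, flags, cs, address }: low flag bits give the least-privileged
       ring allowed to raise the vector; bit 2 = enter with events masked. */
    {    0, 0, FLAT_KERNEL_CS, (unsigned long)hyp_divide_error },
    {    3, 3, FLAT_KERNEL_CS, (unsigned long)hyp_int3         },
    {   14, 0, FLAT_KERNEL_CS, (unsigned long)hyp_page_fault   },
    {    0, 0, 0, 0 }
};
/* rc = HYPERVISOR_set_trap_table(example_traps); */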
2316 long set_debugreg(struct vcpu *p, int reg, unsigned long value)
2318 int i;
2320 switch ( reg )
2322 case 0:
2323 if ( !access_ok(value, sizeof(long)) )
2324 return -EPERM;
2325 if ( p == current )
2326 asm volatile ( "mov %0, %%db0" : : "r" (value) );
2327 break;
2328 case 1:
2329 if ( !access_ok(value, sizeof(long)) )
2330 return -EPERM;
2331 if ( p == current )
2332 asm volatile ( "mov %0, %%db1" : : "r" (value) );
2333 break;
2334 case 2:
2335 if ( !access_ok(value, sizeof(long)) )
2336 return -EPERM;
2337 if ( p == current )
2338 asm volatile ( "mov %0, %%db2" : : "r" (value) );
2339 break;
2340 case 3:
2341 if ( !access_ok(value, sizeof(long)) )
2342 return -EPERM;
2343 if ( p == current )
2344 asm volatile ( "mov %0, %%db3" : : "r" (value) );
2345 break;
2346 case 6:
2347 /*
2348 * DR6: Bits 4-11,16-31 reserved (set to 1).
2349 * Bit 12 reserved (set to 0).
2350 */
2351 value &= 0xffffefff; /* reserved bits => 0 */
2352 value |= 0xffff0ff0; /* reserved bits => 1 */
2353 if ( p == current )
2354 asm volatile ( "mov %0, %%db6" : : "r" (value) );
2355 break;
2356 case 7:
2357 /*
2358 * DR7: Bit 10 reserved (set to 1).
2359 * Bits 11-12,14-15 reserved (set to 0).
2360 * Privileged bits:
2361 * GD (bit 13): must be 0.
2362 * R/Wn (bits 16-17,20-21,24-25,28-29): mustn't be 10.
2363 * LENn (bits 18-19,22-23,26-27,30-31): mustn't be 10.
2364 */
2365 /* DR7 == 0 => debugging disabled for this domain. */
2366 if ( value != 0 )
2368 value &= 0xffff27ff; /* reserved bits => 0 */
2369 value |= 0x00000400; /* reserved bits => 1 */
2370 if ( (value & (1<<13)) != 0 ) return -EPERM;
2371 for ( i = 0; i < 16; i += 2 )
2372 if ( ((value >> (i+16)) & 3) == 2 ) return -EPERM;
2374 if ( p == current )
2375 asm volatile ( "mov %0, %%db7" : : "r" (value) );
2376 break;
2377 default:
2378 return -EINVAL;
2381 p->arch.guest_context.debugreg[reg] = value;
2382 return 0;
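/*
 * The R/Wn check above rejects the 10b encoding, which would select an
 * I/O-access breakpoint (only valid with CR4.DE), and the same loop applied
 * to LENn rejects the 8-byte length encoding; neither is permitted for
 * guest debug registers here.
 */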
2385 long do_set_debugreg(int reg, unsigned long value)
2387 return set_debugreg(current, reg, value);
2390 unsigned long do_get_debugreg(int reg)
2392 if ( (reg < 0) || (reg > 7) ) return -EINVAL;
2393 return current->arch.guest_context.debugreg[reg];
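/*
 * Reads are always satisfied from the shadow copy in
 * guest_context.debugreg[]; only writes (above) touch the physical
 * registers, and then only when the target vcpu is the one currently
 * executing.
 */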
2396 /*
2397 * Local variables:
2398 * mode: C
2399 * c-set-style: "BSD"
2400 * c-basic-offset: 4
2401 * tab-width: 4
2402 * indent-tabs-mode: nil
2403 * End:
2404 */