ia64/xen-unstable

view xen/arch/x86/traps.c @ 16202:c05ec22a9106

x86, cpufreq: Allow dom0 kernel to govern cpufreq via the Intel
Enhanced SpeedStep MSR.
From: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Keir Fraser <keir@xensource.com>
author Keir Fraser <keir@xensource.com>
date Wed Oct 24 10:20:03 2007 +0100 (2007-10-24)
parents 42d8dadb5864
children aeebd173c3fa
line source
1 /******************************************************************************
2 * arch/x86/traps.c
3 *
4 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * Copyright (C) 1991, 1992 Linus Torvalds
23 *
24 * Pentium III FXSR, SSE support
25 * Gareth Hughes <gareth@valinux.com>, May 2000
26 */
28 #include <xen/config.h>
29 #include <xen/init.h>
30 #include <xen/sched.h>
31 #include <xen/lib.h>
32 #include <xen/errno.h>
33 #include <xen/mm.h>
34 #include <xen/console.h>
35 #include <xen/shutdown.h>
36 #include <asm/regs.h>
37 #include <xen/delay.h>
38 #include <xen/event.h>
39 #include <xen/spinlock.h>
40 #include <xen/irq.h>
41 #include <xen/perfc.h>
42 #include <xen/softirq.h>
43 #include <xen/domain_page.h>
44 #include <xen/symbols.h>
45 #include <xen/iocap.h>
46 #include <xen/nmi.h>
47 #include <xen/version.h>
48 #include <xen/kexec.h>
49 #include <xen/trace.h>
50 #include <asm/paging.h>
51 #include <asm/system.h>
52 #include <asm/io.h>
53 #include <asm/atomic.h>
54 #include <asm/desc.h>
55 #include <asm/debugreg.h>
56 #include <asm/smp.h>
57 #include <asm/flushtlb.h>
58 #include <asm/uaccess.h>
59 #include <asm/i387.h>
60 #include <asm/debugger.h>
61 #include <asm/msr.h>
62 #include <asm/shared.h>
63 #include <asm/x86_emulate.h>
64 #include <asm/hvm/vpt.h>
66 /*
67 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
68 * fatal: Xen prints diagnostic message and then hangs.
69 * dom0: The NMI is virtualised to DOM0.
70 * ignore: The NMI error is cleared and ignored.
71 */
72 #ifdef NDEBUG
73 char opt_nmi[10] = "dom0";
74 #else
75 char opt_nmi[10] = "fatal";
76 #endif
77 string_param("nmi", opt_nmi);
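Like Xen's other string_param() options, opt_nmi is chosen on the hypervisor boot command line; an illustrative GRUB entry (exact paths and bootloader syntax vary by installation):

kernel /boot/xen.gz nmi=dom0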
79 DEFINE_PER_CPU(u32, ler_msr);
81 /* Master table, used by CPU0. */
82 idt_entry_t idt_table[IDT_ENTRIES];
84 /* Pointer to the IDT of every CPU. */
85 idt_entry_t *idt_tables[NR_CPUS] __read_mostly;
87 #define DECLARE_TRAP_HANDLER(_name) \
88 asmlinkage void _name(void); \
89 asmlinkage int do_ ## _name(struct cpu_user_regs *regs)
91 asmlinkage void nmi(void);
92 asmlinkage void machine_check(void);
93 DECLARE_TRAP_HANDLER(divide_error);
94 DECLARE_TRAP_HANDLER(debug);
95 DECLARE_TRAP_HANDLER(int3);
96 DECLARE_TRAP_HANDLER(overflow);
97 DECLARE_TRAP_HANDLER(bounds);
98 DECLARE_TRAP_HANDLER(invalid_op);
99 DECLARE_TRAP_HANDLER(device_not_available);
100 DECLARE_TRAP_HANDLER(coprocessor_segment_overrun);
101 DECLARE_TRAP_HANDLER(invalid_TSS);
102 DECLARE_TRAP_HANDLER(segment_not_present);
103 DECLARE_TRAP_HANDLER(stack_segment);
104 DECLARE_TRAP_HANDLER(general_protection);
105 DECLARE_TRAP_HANDLER(page_fault);
106 DECLARE_TRAP_HANDLER(coprocessor_error);
107 DECLARE_TRAP_HANDLER(simd_coprocessor_error);
108 DECLARE_TRAP_HANDLER(alignment_check);
109 DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
111 long do_set_debugreg(int reg, unsigned long value);
112 unsigned long do_get_debugreg(int reg);
114 static int debug_stack_lines = 20;
115 integer_param("debug_stack_lines", debug_stack_lines);
117 static int opt_ler;
118 boolean_param("ler", opt_ler);
120 #ifdef CONFIG_X86_32
121 #define stack_words_per_line 8
122 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
123 #else
124 #define stack_words_per_line 4
125 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
126 #endif
128 static void show_guest_stack(struct cpu_user_regs *regs)
129 {
130 int i;
131 unsigned long *stack, addr;
133 if ( is_hvm_vcpu(current) )
134 return;
136 if ( is_pv_32on64_vcpu(current) )
137 {
138 compat_show_guest_stack(regs, debug_stack_lines);
139 return;
140 }
142 if ( vm86_mode(regs) )
143 {
144 stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff));
145 printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ",
146 regs->ss, (uint16_t)(regs->esp & 0xffff));
147 }
148 else
149 {
150 stack = (unsigned long *)regs->esp;
151 printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
152 }
154 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
155 {
156 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
157 break;
158 if ( get_user(addr, stack) )
159 {
160 if ( i != 0 )
161 printk("\n ");
162 printk("Fault while accessing guest memory.");
163 i = 1;
164 break;
165 }
166 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
167 printk("\n ");
168 printk(" %p", _p(addr));
169 stack++;
170 }
171 if ( i == 0 )
172 printk("Stack empty.");
173 printk("\n");
174 }
176 #if !defined(CONFIG_FRAME_POINTER)
178 static void show_trace(struct cpu_user_regs *regs)
179 {
180 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
182 printk("Xen call trace:\n ");
184 printk("[<%p>]", _p(regs->eip));
185 print_symbol(" %s\n ", regs->eip);
187 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
188 {
189 addr = *stack++;
190 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
191 {
192 printk("[<%p>]", _p(addr));
193 print_symbol(" %s\n ", addr);
194 }
195 }
197 printk("\n");
198 }
200 #else
202 static void show_trace(struct cpu_user_regs *regs)
203 {
204 unsigned long *frame, next, addr, low, high;
206 printk("Xen call trace:\n ");
208 printk("[<%p>]", _p(regs->eip));
209 print_symbol(" %s\n ", regs->eip);
211 /* Bounds for range of valid frame pointer. */
212 low = (unsigned long)(ESP_BEFORE_EXCEPTION(regs) - 2);
213 high = (low & ~(STACK_SIZE - 1)) +
214 (STACK_SIZE - sizeof(struct cpu_info) - 2*sizeof(unsigned long));
216 /* The initial frame pointer. */
217 next = regs->ebp;
219 for ( ; ; )
220 {
221 /* Valid frame pointer? */
222 if ( (next < low) || (next >= high) )
223 {
224 /*
225 * Exception stack frames have a different layout, denoted by an
226 * inverted frame pointer.
227 */
228 next = ~next;
229 if ( (next < low) || (next >= high) )
230 break;
231 frame = (unsigned long *)next;
232 next = frame[0];
233 addr = frame[(offsetof(struct cpu_user_regs, eip) -
234 offsetof(struct cpu_user_regs, ebp))
235 / BYTES_PER_LONG];
236 }
237 else
238 {
239 /* Ordinary stack frame. */
240 frame = (unsigned long *)next;
241 next = frame[0];
242 addr = frame[1];
243 }
245 printk("[<%p>]", _p(addr));
246 print_symbol(" %s\n ", addr);
248 low = (unsigned long)&frame[2];
249 }
251 printk("\n");
252 }
254 #endif
256 void show_stack(struct cpu_user_regs *regs)
257 {
258 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
259 int i;
261 if ( guest_mode(regs) )
262 return show_guest_stack(regs);
264 printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
266 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
267 {
268 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
269 break;
270 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
271 printk("\n ");
272 addr = *stack++;
273 printk(" %p", _p(addr));
274 }
275 if ( i == 0 )
276 printk("Stack empty.");
277 printk("\n");
279 show_trace(regs);
280 }
282 void show_stack_overflow(unsigned int cpu, unsigned long esp)
283 {
284 #ifdef MEMORY_GUARD
285 unsigned long esp_top, esp_bottom;
286 unsigned long *stack, addr;
288 esp_bottom = (esp | (STACK_SIZE - 1)) + 1;
289 esp_top = esp_bottom - PRIMARY_STACK_SIZE;
291 printk("Valid stack range: %p-%p, sp=%p, tss.esp0=%p\n",
292 (void *)esp_top, (void *)esp_bottom, (void *)esp,
293 (void *)init_tss[cpu].esp0);
295 /* Trigger overflow trace if %esp is within 512 bytes of the guard page. */
296 if ( ((unsigned long)(esp - esp_top) > 512) &&
297 ((unsigned long)(esp_top - esp) > 512) )
298 {
299 printk("No stack overflow detected. Skipping stack trace.\n");
300 return;
301 }
303 if ( esp < esp_top )
304 esp = esp_top;
306 printk("Xen stack overflow (dumping trace %p-%p):\n ",
307 (void *)esp, (void *)esp_bottom);
309 stack = (unsigned long *)esp;
310 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
311 {
312 addr = *stack++;
313 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
314 {
315 printk("%p: [<%p>]", stack, _p(addr));
316 print_symbol(" %s\n ", addr);
317 }
318 }
320 printk("\n");
321 #endif
322 }
324 void show_execution_state(struct cpu_user_regs *regs)
325 {
326 show_registers(regs);
327 show_stack(regs);
328 }
330 char *trapstr(int trapnr)
331 {
332 static char *strings[] = {
333 "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
334 "invalid opcode", "device not available", "double fault",
335 "coprocessor segment", "invalid tss", "segment not found",
336 "stack error", "general protection fault", "page fault",
337 "spurious interrupt", "coprocessor error", "alignment check",
338 "machine check", "simd error"
339 };
341 if ( (trapnr < 0) || (trapnr >= ARRAY_SIZE(strings)) )
342 return "???";
344 return strings[trapnr];
345 }
347 /*
348 * This is called for faults at very unexpected times (e.g., when interrupts
349 * are disabled). In such situations we can't do much that is safe. We try to
350 * print out some tracing and then we just spin.
351 */
352 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs)
353 {
354 static DEFINE_PER_CPU(char, depth);
356 /*
357 * In some cases, we can end up in a vicious cycle of fatal_trap()s
358 * within fatal_trap()s. We give the problem a couple of iterations to
359 * bottom out, and then we just panic.
360 */
361 if ( ++this_cpu(depth) < 3 )
362 {
363 watchdog_disable();
364 console_start_sync();
366 show_execution_state(regs);
368 if ( trapnr == TRAP_page_fault )
369 {
370 unsigned long cr2 = read_cr2();
371 printk("Faulting linear address: %p\n", _p(cr2));
372 show_page_walk(cr2);
373 }
374 }
376 panic("FATAL TRAP: vector = %d (%s)\n"
377 "[error_code=%04x] %s\n",
378 trapnr, trapstr(trapnr), regs->error_code,
379 (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
380 }
382 static int do_guest_trap(
383 int trapnr, const struct cpu_user_regs *regs, int use_error_code)
384 {
385 struct vcpu *v = current;
386 struct trap_bounce *tb;
387 const struct trap_info *ti;
389 trace_pv_trap(trapnr, regs->eip, use_error_code, regs->error_code);
391 tb = &v->arch.trap_bounce;
392 ti = &v->arch.guest_context.trap_ctxt[trapnr];
394 tb->flags = TBF_EXCEPTION;
395 tb->cs = ti->cs;
396 tb->eip = ti->address;
398 if ( use_error_code )
399 {
400 tb->flags |= TBF_EXCEPTION_ERRCODE;
401 tb->error_code = regs->error_code;
402 }
404 if ( TI_GET_IF(ti) )
405 tb->flags |= TBF_INTERRUPT;
407 if ( unlikely(null_trap_bounce(v, tb)) )
408 gdprintk(XENLOG_WARNING, "Unhandled %s fault/trap [#%d] in "
409 "domain %d on VCPU %d [ec=%04x]\n",
410 trapstr(trapnr), trapnr, v->domain->domain_id, v->vcpu_id,
411 regs->error_code);
413 return 0;
414 }
416 static inline int do_trap(
417 int trapnr, struct cpu_user_regs *regs, int use_error_code)
418 {
419 unsigned long fixup;
421 DEBUGGER_trap_entry(trapnr, regs);
423 if ( guest_mode(regs) )
424 return do_guest_trap(trapnr, regs, use_error_code);
426 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
427 {
428 dprintk(XENLOG_ERR, "Trap %d: %p -> %p\n",
429 trapnr, _p(regs->eip), _p(fixup));
430 regs->eip = fixup;
431 return 0;
432 }
434 DEBUGGER_trap_fatal(trapnr, regs);
436 show_execution_state(regs);
437 panic("FATAL TRAP: vector = %d (%s)\n"
438 "[error_code=%04x]\n",
439 trapnr, trapstr(trapnr), regs->error_code);
440 return 0;
441 }
443 #define DO_ERROR_NOCODE(trapnr, name) \
444 asmlinkage int do_##name(struct cpu_user_regs *regs) \
445 { \
446 return do_trap(trapnr, regs, 0); \
447 }
449 #define DO_ERROR(trapnr, name) \
450 asmlinkage int do_##name(struct cpu_user_regs *regs) \
451 { \
452 return do_trap(trapnr, regs, 1); \
453 }
455 DO_ERROR_NOCODE(TRAP_divide_error, divide_error)
456 DO_ERROR_NOCODE(TRAP_overflow, overflow)
457 DO_ERROR_NOCODE(TRAP_bounds, bounds)
458 DO_ERROR_NOCODE(TRAP_copro_seg, coprocessor_segment_overrun)
459 DO_ERROR( TRAP_invalid_tss, invalid_TSS)
460 DO_ERROR( TRAP_no_segment, segment_not_present)
461 DO_ERROR( TRAP_stack_error, stack_segment)
462 DO_ERROR_NOCODE(TRAP_copro_error, coprocessor_error)
463 DO_ERROR( TRAP_alignment_check, alignment_check)
464 DO_ERROR_NOCODE(TRAP_simd_error, simd_coprocessor_error)
466 int rdmsr_hypervisor_regs(
467 uint32_t idx, uint32_t *eax, uint32_t *edx)
468 {
469 idx -= 0x40000000;
470 if ( idx > 0 )
471 return 0;
473 switch ( idx )
474 {
475 case 0:
476 {
477 *eax = *edx = 0;
478 break;
479 }
480 default:
481 BUG();
482 }
484 return 1;
485 }
487 int wrmsr_hypervisor_regs(
488 uint32_t idx, uint32_t eax, uint32_t edx)
489 {
490 struct domain *d = current->domain;
492 idx -= 0x40000000;
493 if ( idx > 0 )
494 return 0;
496 switch ( idx )
497 {
498 case 0:
499 {
500 void *hypercall_page;
501 unsigned long mfn;
502 unsigned long gmfn = ((unsigned long)edx << 20) | (eax >> 12);
503 unsigned int idx = eax & 0xfff;
505 if ( idx > 0 )
506 {
507 gdprintk(XENLOG_WARNING,
508 "Dom%d: Out of range index %u to MSR %08x\n",
509 d->domain_id, idx, 0x40000000);
510 return 0;
511 }
513 mfn = gmfn_to_mfn(d, gmfn);
515 if ( !mfn_valid(mfn) ||
516 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
517 {
518 gdprintk(XENLOG_WARNING,
519 "Dom%d: Bad GMFN %lx (MFN %lx) to MSR %08x\n",
520 d->domain_id, gmfn, mfn, 0x40000000);
521 return 0;
522 }
524 hypercall_page = map_domain_page(mfn);
525 hypercall_page_initialise(d, hypercall_page);
526 unmap_domain_page(hypercall_page);
528 put_page_and_type(mfn_to_page(mfn));
529 break;
530 }
532 default:
533 BUG();
534 }
536 return 1;
537 }
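To illustrate the write side from a guest's perspective: the hypercall page is installed by writing its guest-physical address to the MSR base advertised by CPUID leaf 0x40000002 (see cpuid_hypervisor_leaves() below); the low 12 bits of the value select the page index, which must be 0 here since only one page is advertised. A minimal guest-side sketch, assuming a Linux-style __pa() helper and a page-aligned, guest-owned hypercall_page (both are assumptions for illustration, not part of this file):

static inline void wrmsr64(uint32_t msr, uint64_t val)
{
    asm volatile ( "wrmsr" : : "c" (msr),
                   "a" ((uint32_t)val), "d" ((uint32_t)(val >> 32)) );
}

/* 0x40000000 is the MSR base returned in *ebx for leaf 0x40000002. */
wrmsr64(0x40000000, __pa(hypercall_page) + 0 /* page index */);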
539 int cpuid_hypervisor_leaves(
540 uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
541 {
542 idx -= 0x40000000;
543 if ( idx > 2 )
544 return 0;
546 switch ( idx )
547 {
548 case 0:
549 *eax = 0x40000002; /* Largest leaf */
550 *ebx = 0x566e6558; /* Signature 1: "XenV" */
551 *ecx = 0x65584d4d; /* Signature 2: "MMXe" */
552 *edx = 0x4d4d566e; /* Signature 3: "nVMM" */
553 break;
555 case 1:
556 *eax = (xen_major_version() << 16) | xen_minor_version();
557 *ebx = 0; /* Reserved */
558 *ecx = 0; /* Reserved */
559 *edx = 0; /* Reserved */
560 break;
562 case 2:
563 *eax = 1; /* Number of hypercall-transfer pages */
564 *ebx = 0x40000000; /* MSR base address */
565 *ecx = 0; /* Features 1 */
566 *edx = 0; /* Features 2 */
567 break;
569 default:
570 BUG();
571 }
573 return 1;
574 }
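Read as little-endian byte strings, the three signature registers above concatenate to "XenVMMXenVMM", which is how a guest identifies the hypervisor. A guest-side detection sketch (illustrative only; a PV guest's CPUID reaches Xen via the forced-emulation prefix handled below, while HVM guests hit the normal CPUID intercept):

static int running_on_xen(void)
{
    uint32_t max_leaf, sig[3];

    asm volatile ( "cpuid"
                   : "=a" (max_leaf), "=b" (sig[0]),
                     "=c" (sig[1]), "=d" (sig[2])
                   : "0" (0x40000000) );

    return (memcmp(sig, "XenVMMXenVMM", 12) == 0) &&
           (max_leaf >= 0x40000002);
}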
576 static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
577 {
578 char sig[5], instr[2];
579 uint32_t a, b, c, d;
580 unsigned long eip, rc;
582 a = regs->eax;
583 b = regs->ebx;
584 c = regs->ecx;
585 d = regs->edx;
586 eip = regs->eip;
588 /* Check for forced emulation signature: ud2 ; .ascii "xen". */
589 if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
590 {
591 propagate_page_fault(eip + sizeof(sig) - rc, 0);
592 return EXCRET_fault_fixed;
593 }
594 if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) )
595 return 0;
596 eip += sizeof(sig);
598 /* We only emulate CPUID. */
599 if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
600 {
601 propagate_page_fault(eip + sizeof(instr) - rc, 0);
602 return EXCRET_fault_fixed;
603 }
604 if ( memcmp(instr, "\xf\xa2", sizeof(instr)) )
605 return 0;
606 eip += sizeof(instr);
608 asm (
609 "cpuid"
610 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
611 : "0" (a), "1" (b), "2" (c), "3" (d) );
613 if ( regs->eax == 1 )
614 {
615 /* Modify Feature Information. */
616 clear_bit(X86_FEATURE_VME, &d);
617 clear_bit(X86_FEATURE_DE, &d);
618 clear_bit(X86_FEATURE_PSE, &d);
619 clear_bit(X86_FEATURE_PGE, &d);
620 if ( !supervisor_mode_kernel )
621 clear_bit(X86_FEATURE_SEP, &d);
622 if ( !IS_PRIV(current->domain) )
623 clear_bit(X86_FEATURE_MTRR, &d);
624 }
625 else if ( regs->eax == 0x80000001 )
626 {
627 /* Modify Feature Information. */
628 if ( is_pv_32bit_vcpu(current) )
629 clear_bit(X86_FEATURE_SYSCALL % 32, &d);
630 clear_bit(X86_FEATURE_RDTSCP % 32, &d);
631 }
632 else
633 {
634 (void)cpuid_hypervisor_leaves(regs->eax, &a, &b, &c, &d);
635 }
637 regs->eax = a;
638 regs->ebx = b;
639 regs->ecx = c;
640 regs->edx = d;
641 regs->eip = eip;
642 regs->eflags &= ~X86_EFLAGS_RF;
644 trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->eip);
646 return EXCRET_fault_fixed;
647 }
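From the guest's side, the sequence recognised above is the two-byte UD2 opcode (0x0f 0x0b), the ASCII bytes "xen", and then a real CPUID instruction; Xen executes the CPUID with the feature edits applied here and advances EIP over the whole marker. A guest-side sketch of such a forced-CPUID wrapper (illustrative, mirroring the prefix a PV kernel would emit; the function name is made up for this example):

static void xen_forced_cpuid(uint32_t leaf, uint32_t *a, uint32_t *b,
                             uint32_t *c, uint32_t *d)
{
    asm volatile ( "ud2a ; .ascii \"xen\" ; cpuid"
                   : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)
                   : "0" (leaf) );
}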
649 asmlinkage int do_invalid_op(struct cpu_user_regs *regs)
650 {
651 struct bug_frame bug;
652 struct bug_frame_str bug_str;
653 char *filename, *predicate, *eip = (char *)regs->eip;
654 int rc, id, lineno;
656 DEBUGGER_trap_entry(TRAP_invalid_op, regs);
658 if ( likely(guest_mode(regs)) )
659 {
660 if ( (rc = emulate_forced_invalid_op(regs)) != 0 )
661 return rc;
662 return do_guest_trap(TRAP_invalid_op, regs, 0);
663 }
665 if ( !is_kernel(eip) ||
666 __copy_from_user(&bug, eip, sizeof(bug)) ||
667 memcmp(bug.ud2, "\xf\xb", sizeof(bug.ud2)) ||
668 (bug.ret != 0xc2) )
669 goto die;
670 eip += sizeof(bug);
672 id = bug.id & 3;
674 if ( id == BUGFRAME_dump )
675 {
676 show_execution_state(regs);
677 regs->eip = (unsigned long)eip;
678 return EXCRET_fault_fixed;
679 }
681 /* WARN, BUG or ASSERT: decode the filename pointer and line number. */
682 if ( !is_kernel(eip) ||
683 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
684 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
685 goto die;
686 eip += sizeof(bug_str);
688 filename = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
689 lineno = bug.id >> 2;
691 if ( id == BUGFRAME_warn )
692 {
693 printk("Xen WARN at %.50s:%d\n", filename, lineno);
694 show_execution_state(regs);
695 regs->eip = (unsigned long)eip;
696 return EXCRET_fault_fixed;
697 }
699 if ( id == BUGFRAME_bug )
700 {
701 printk("Xen BUG at %.50s:%d\n", filename, lineno);
702 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
703 show_execution_state(regs);
704 panic("Xen BUG at %.50s:%d\n", filename, lineno);
705 }
707 /* ASSERT: decode the predicate string pointer. */
708 ASSERT(id == BUGFRAME_assert);
709 if ( !is_kernel(eip) ||
710 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
711 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
712 goto die;
713 eip += sizeof(bug_str);
715 predicate = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
716 printk("Assertion '%s' failed at %.50s:%d\n",
717 predicate, filename, lineno);
718 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
719 show_execution_state(regs);
720 panic("Assertion '%s' failed at %.50s:%d\n",
721 predicate, filename, lineno);
723 die:
724 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
725 show_execution_state(regs);
726 panic("FATAL TRAP: vector = %d (invalid opcode)\n", TRAP_invalid_op);
727 return 0;
728 }
730 asmlinkage int do_int3(struct cpu_user_regs *regs)
731 {
732 DEBUGGER_trap_entry(TRAP_int3, regs);
734 if ( !guest_mode(regs) )
735 {
736 DEBUGGER_trap_fatal(TRAP_int3, regs);
737 show_execution_state(regs);
738 panic("FATAL TRAP: vector = 3 (Int3)\n");
739 }
741 return do_guest_trap(TRAP_int3, regs, 0);
742 }
744 asmlinkage void do_machine_check(struct cpu_user_regs *regs)
745 {
746 extern fastcall void (*machine_check_vector)(
747 struct cpu_user_regs *, long error_code);
748 machine_check_vector(regs, regs->error_code);
749 }
751 void propagate_page_fault(unsigned long addr, u16 error_code)
752 {
753 struct trap_info *ti;
754 struct vcpu *v = current;
755 struct trap_bounce *tb = &v->arch.trap_bounce;
757 v->arch.guest_context.ctrlreg[2] = addr;
758 arch_set_cr2(v, addr);
760 /* Re-set error_code.user flag appropriately for the guest. */
761 error_code &= ~PFEC_user_mode;
762 if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
763 error_code |= PFEC_user_mode;
765 trace_pv_page_fault(addr, error_code);
767 ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
768 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
769 tb->error_code = error_code;
770 tb->cs = ti->cs;
771 tb->eip = ti->address;
772 if ( TI_GET_IF(ti) )
773 tb->flags |= TBF_INTERRUPT;
774 if ( unlikely(null_trap_bounce(v, tb)) )
775 {
776 printk("Unhandled page fault in domain %d on VCPU %d (ec=%04X)\n",
777 v->domain->domain_id, v->vcpu_id, error_code);
778 show_page_walk(addr);
779 }
780 }
782 static int handle_gdt_ldt_mapping_fault(
783 unsigned long offset, struct cpu_user_regs *regs)
784 {
785 /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
786 unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
787 unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
789 /* Should never fault in another vcpu's area. */
790 BUG_ON(vcpu_area != current->vcpu_id);
792 /* Byte offset within the gdt/ldt sub-area. */
793 offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
795 if ( likely(is_ldt_area) )
796 {
797 /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
798 if ( likely(map_ldt_shadow_page(offset >> PAGE_SHIFT)) )
799 {
800 if ( guest_mode(regs) )
801 trace_trap_two_addr(TRC_PV_GDT_LDT_MAPPING_FAULT,
802 regs->eip, offset);
803 }
804 else
805 {
806 /* In hypervisor mode? Leave it to the #PF handler to fix up. */
807 if ( !guest_mode(regs) )
808 return 0;
809 /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */
810 propagate_page_fault(
811 current->arch.guest_context.ldt_base + offset,
812 regs->error_code);
813 }
814 }
815 else
816 {
817 /* GDT fault: handle the fault as #GP(selector). */
818 regs->error_code = (u16)offset & ~7;
819 (void)do_general_protection(regs);
820 }
822 return EXCRET_fault_fixed;
823 }
825 #ifdef HYPERVISOR_VIRT_END
826 #define IN_HYPERVISOR_RANGE(va) \
827 (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
828 #else
829 #define IN_HYPERVISOR_RANGE(va) \
830 (((va) >= HYPERVISOR_VIRT_START))
831 #endif
833 static int __spurious_page_fault(
834 unsigned long addr, struct cpu_user_regs *regs)
835 {
836 unsigned long mfn, cr3 = read_cr3();
837 #if CONFIG_PAGING_LEVELS >= 4
838 l4_pgentry_t l4e, *l4t;
839 #endif
840 #if CONFIG_PAGING_LEVELS >= 3
841 l3_pgentry_t l3e, *l3t;
842 #endif
843 l2_pgentry_t l2e, *l2t;
844 l1_pgentry_t l1e, *l1t;
845 unsigned int required_flags, disallowed_flags;
847 /* Reserved bit violations are never spurious faults. */
848 if ( regs->error_code & PFEC_reserved_bit )
849 return 0;
851 required_flags = _PAGE_PRESENT;
852 if ( regs->error_code & PFEC_write_access )
853 required_flags |= _PAGE_RW;
854 if ( regs->error_code & PFEC_user_mode )
855 required_flags |= _PAGE_USER;
857 disallowed_flags = 0;
858 if ( regs->error_code & PFEC_insn_fetch )
859 disallowed_flags |= _PAGE_NX;
861 mfn = cr3 >> PAGE_SHIFT;
863 #if CONFIG_PAGING_LEVELS >= 4
864 l4t = map_domain_page(mfn);
865 l4e = l4e_read_atomic(&l4t[l4_table_offset(addr)]);
866 mfn = l4e_get_pfn(l4e);
867 unmap_domain_page(l4t);
868 if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
869 (l4e_get_flags(l4e) & disallowed_flags) )
870 return 0;
871 #endif
873 #if CONFIG_PAGING_LEVELS >= 3
874 l3t = map_domain_page(mfn);
875 #ifdef CONFIG_X86_PAE
876 l3t += (cr3 & 0xFE0UL) >> 3;
877 #endif
878 l3e = l3e_read_atomic(&l3t[l3_table_offset(addr)]);
879 mfn = l3e_get_pfn(l3e);
880 unmap_domain_page(l3t);
881 #ifdef CONFIG_X86_PAE
882 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
883 return 0;
884 #else
885 if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
886 (l3e_get_flags(l3e) & disallowed_flags) )
887 return 0;
888 #endif
889 #endif
891 l2t = map_domain_page(mfn);
892 l2e = l2e_read_atomic(&l2t[l2_table_offset(addr)]);
893 mfn = l2e_get_pfn(l2e);
894 unmap_domain_page(l2t);
895 if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
896 (l2e_get_flags(l2e) & disallowed_flags) )
897 return 0;
898 if ( l2e_get_flags(l2e) & _PAGE_PSE )
899 {
900 l1e = l1e_empty(); /* define before use in debug tracing */
901 goto spurious;
902 }
904 l1t = map_domain_page(mfn);
905 l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
906 mfn = l1e_get_pfn(l1e);
907 unmap_domain_page(l1t);
908 if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
909 (l1e_get_flags(l1e) & disallowed_flags) )
910 return 0;
912 spurious:
913 dprintk(XENLOG_WARNING, "Spurious fault in domain %u:%u "
914 "at addr %lx, e/c %04x\n",
915 current->domain->domain_id, current->vcpu_id,
916 addr, regs->error_code);
917 #if CONFIG_PAGING_LEVELS >= 4
918 dprintk(XENLOG_WARNING, " l4e = %"PRIpte"\n", l4e_get_intpte(l4e));
919 #endif
920 #if CONFIG_PAGING_LEVELS >= 3
921 dprintk(XENLOG_WARNING, " l3e = %"PRIpte"\n", l3e_get_intpte(l3e));
922 #endif
923 dprintk(XENLOG_WARNING, " l2e = %"PRIpte"\n", l2e_get_intpte(l2e));
924 dprintk(XENLOG_WARNING, " l1e = %"PRIpte"\n", l1e_get_intpte(l1e));
925 #ifndef NDEBUG
926 show_registers(regs);
927 #endif
928 return 1;
929 }
931 static int spurious_page_fault(
932 unsigned long addr, struct cpu_user_regs *regs)
933 {
934 unsigned long flags;
935 int is_spurious;
937 /*
938 * Disabling interrupts prevents TLB flushing, and hence prevents
939 * page tables from becoming invalid under our feet during the walk.
940 */
941 local_irq_save(flags);
942 is_spurious = __spurious_page_fault(addr, regs);
943 local_irq_restore(flags);
945 return is_spurious;
946 }
948 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
949 {
950 struct vcpu *v = current;
951 struct domain *d = v->domain;
953 /* No fixups in interrupt context or when interrupts are disabled. */
954 if ( in_irq() || !(regs->eflags & X86_EFLAGS_IF) )
955 return 0;
957 if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
958 {
959 if ( paging_mode_external(d) && guest_mode(regs) )
960 {
961 int ret = paging_fault(addr, regs);
962 if ( ret == EXCRET_fault_fixed )
963 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
964 return ret;
965 }
966 if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
967 return handle_gdt_ldt_mapping_fault(
968 addr - GDT_LDT_VIRT_START, regs);
969 return 0;
970 }
972 if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
973 guest_kernel_mode(v, regs) &&
974 /* Do not check if access-protection fault since the page may
975 legitimately be not present in shadow page tables */
976 ((regs->error_code & PFEC_write_access) == PFEC_write_access) &&
977 ptwr_do_page_fault(v, addr, regs) )
978 return EXCRET_fault_fixed;
980 if ( paging_mode_enabled(d) )
981 {
982 int ret = paging_fault(addr, regs);
983 if ( ret == EXCRET_fault_fixed )
984 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
985 return ret;
986 }
988 return 0;
989 }
991 /*
992 * #PF error code:
993 * Bit 0: Protection violation (=1) ; Page not present (=0)
994 * Bit 1: Write access
995 * Bit 2: User mode (=1) ; Supervisor mode (=0)
996 * Bit 3: Reserved bit violation
997 * Bit 4: Instruction fetch
998 */
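/*
* Worked examples (illustrative): error code 0x0006 is a user-mode write to a
* not-present page (bits 1 and 2 set, bit 0 clear); error code 0x000b is a
* supervisor-mode write that hit a reserved-bit violation in an otherwise
* present mapping (bits 0, 1 and 3 set).
*/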
999 asmlinkage int do_page_fault(struct cpu_user_regs *regs)
1000 {
1001 unsigned long addr, fixup;
1002 int rc;
1004 addr = read_cr2();
1006 DEBUGGER_trap_entry(TRAP_page_fault, regs);
1008 perfc_incr(page_faults);
1010 if ( unlikely((rc = fixup_page_fault(addr, regs)) != 0) )
1011 return rc;
1013 if ( unlikely(!guest_mode(regs)) )
1014 {
1015 if ( spurious_page_fault(addr, regs) )
1016 return EXCRET_not_a_fault;
1018 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
1019 {
1020 perfc_incr(copy_user_faults);
1021 regs->eip = fixup;
1022 return 0;
1023 }
1025 DEBUGGER_trap_fatal(TRAP_page_fault, regs);
1027 show_execution_state(regs);
1028 show_page_walk(addr);
1029 panic("FATAL PAGE FAULT\n"
1030 "[error_code=%04x]\n"
1031 "Faulting linear address: %p\n",
1032 regs->error_code, _p(addr));
1033 }
1035 propagate_page_fault(addr, regs->error_code);
1036 return 0;
1037 }
1039 /*
1040 * Early handler to deal with spurious page faults. For example, consider a
1041 * routine that uses a mapping immediately after installing it (making it
1042 * present). The CPU may speculatively execute the memory access before
1043 * executing the PTE write. The instruction will then be marked to cause a
1044 * page fault when it is retired, despite the fact that the PTE is present and
1045 * correct at that point in time.
1046 */
1047 asmlinkage int do_early_page_fault(struct cpu_user_regs *regs)
1048 {
1049 static int stuck;
1050 static unsigned long prev_eip, prev_cr2;
1051 unsigned long cr2 = read_cr2();
1053 BUG_ON(smp_processor_id() != 0);
1055 if ( (regs->eip != prev_eip) || (cr2 != prev_cr2) )
1056 {
1057 prev_eip = regs->eip;
1058 prev_cr2 = cr2;
1059 stuck = 0;
1060 return EXCRET_not_a_fault;
1061 }
1063 if ( stuck++ == 1000 )
1064 panic("Early fatal page fault at %04x:%p (cr2=%p, ec=%04x)\n",
1065 regs->cs, _p(regs->eip), _p(cr2), regs->error_code);
1067 return EXCRET_not_a_fault;
1068 }
1070 long do_fpu_taskswitch(int set)
1071 {
1072 struct vcpu *v = current;
1074 if ( set )
1075 {
1076 v->arch.guest_context.ctrlreg[0] |= X86_CR0_TS;
1077 stts();
1078 }
1079 else
1080 {
1081 v->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
1082 if ( v->fpu_dirtied )
1083 clts();
1084 }
1086 return 0;
1087 }
1089 static int read_descriptor(unsigned int sel,
1090 const struct vcpu *v,
1091 const struct cpu_user_regs * regs,
1092 unsigned long *base,
1093 unsigned long *limit,
1094 unsigned int *ar,
1095 unsigned int vm86attr)
1097 struct desc_struct desc;
1099 if ( !vm86_mode(regs) )
1101 if ( sel < 4)
1102 desc.b = desc.a = 0;
1103 else if ( __get_user(desc,
1104 (const struct desc_struct *)(!(sel & 4)
1105 ? GDT_VIRT_START(v)
1106 : LDT_VIRT_START(v))
1107 + (sel >> 3)) )
1108 return 0;
1109 if ( !(vm86attr & _SEGMENT_CODE) )
1110 desc.b &= ~_SEGMENT_L;
1112 else
1114 desc.a = (sel << 20) | 0xffff;
1115 desc.b = vm86attr | (sel >> 12);
1118 *ar = desc.b & 0x00f0ff00;
1119 if ( !(desc.b & _SEGMENT_L) )
1121 *base = ((desc.a >> 16) + ((desc.b & 0xff) << 16) +
1122 (desc.b & 0xff000000));
1123 *limit = (desc.a & 0xffff) | (desc.b & 0x000f0000);
1124 if ( desc.b & _SEGMENT_G )
1125 *limit = ((*limit + 1) << 12) - 1;
1126 #ifndef NDEBUG
1127 if ( !vm86_mode(regs) && (sel > 3) )
1129 unsigned int a, l;
1130 unsigned char valid;
1132 asm volatile (
1133 "larl %2,%0 ; setz %1"
1134 : "=r" (a), "=rm" (valid) : "rm" (sel));
1135 BUG_ON(valid && ((a & 0x00f0ff00) != *ar));
1136 asm volatile (
1137 "lsll %2,%0 ; setz %1"
1138 : "=r" (l), "=rm" (valid) : "rm" (sel));
1139 BUG_ON(valid && (l != *limit));
1141 #endif
1143 else
1145 *base = 0UL;
1146 *limit = ~0UL;
1149 return 1;
1152 /* Has the guest requested sufficient permission for this I/O access? */
1153 static inline int guest_io_okay(
1154 unsigned int port, unsigned int bytes,
1155 struct vcpu *v, struct cpu_user_regs *regs)
1157 #if defined(__x86_64__)
1158 /* If in user mode, switch to kernel mode just to read I/O bitmap. */
1159 int user_mode = !(v->arch.flags & TF_kernel_mode);
1160 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
1161 #elif defined(__i386__)
1162 #define TOGGLE_MODE() ((void)0)
1163 #endif
1165 if ( !vm86_mode(regs) &&
1166 (v->arch.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
1167 return 1;
1169 if ( v->arch.iobmp_limit > (port + bytes) )
1171 union { uint8_t bytes[2]; uint16_t mask; } x;
1173 /*
1174 * Grab permission bytes from guest space. Inaccessible bytes are
1175 * read as 0xff (no access allowed).
1176 */
1177 TOGGLE_MODE();
1178 switch ( __copy_from_guest_offset(x.bytes, v->arch.iobmp,
1179 port>>3, 2) )
1181 default: x.bytes[0] = ~0;
1182 case 1: x.bytes[1] = ~0;
1183 case 0: break;
1185 TOGGLE_MODE();
1187 if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 )
1188 return 1;
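/* Worked example (illustrative) for the test above: for port 0x3f9 and
bytes == 2, the two permission bytes are fetched from bitmap offset
0x3f9 >> 3 = 0x7f, and the test mask is ((1<<2)-1) << (0x3f9 & 7) = 0x06;
the access is allowed only if both of those bits are clear in the guest's
I/O bitmap. */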
1191 return 0;
1194 /* Has the administrator granted sufficient permission for this I/O access? */
1195 static inline int admin_io_okay(
1196 unsigned int port, unsigned int bytes,
1197 struct vcpu *v, struct cpu_user_regs *regs)
1199 return ioports_access_permitted(v->domain, port, port + bytes - 1);
1202 #define guest_inb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r)
1203 #define guest_inw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r)
1204 #define guest_inl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r)
1205 #define guest_outb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r)
1206 #define guest_outw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r)
1207 #define guest_outl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r)
1209 /* I/O emulation support. Helper routines for, and type of, the stack stub.*/
1210 void host_to_guest_gpr_switch(struct cpu_user_regs *)
1211 __attribute__((__regparm__(1)));
1212 unsigned long guest_to_host_gpr_switch(unsigned long)
1213 __attribute__((__regparm__(1)));
1215 void (*pv_post_outb_hook)(unsigned int port, u8 value);
1217 /* Instruction fetch with error handling. */
1218 #define insn_fetch(type, base, eip, limit) \
1219 ({ unsigned long _rc, _ptr = (base) + (eip); \
1220 type _x; \
1221 if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) ) \
1222 goto fail; \
1223 if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 ) \
1224 { \
1225 propagate_page_fault(_ptr + sizeof(_x) - _rc, 0); \
1226 return EXCRET_fault_fixed; \
1227 } \
1228 (eip) += sizeof(_x); _x; })
1230 #if defined(CONFIG_X86_32)
1231 # define read_sreg(regs, sr) ((regs)->sr)
1232 #elif defined(CONFIG_X86_64)
1233 # define read_sreg(regs, sr) read_segment_register(sr)
1234 #endif
1236 static int emulate_privileged_op(struct cpu_user_regs *regs)
1238 struct vcpu *v = current;
1239 unsigned long *reg, eip = regs->eip, res;
1240 u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
1241 enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
1242 unsigned int port, i, data_sel, ar, data, rc;
1243 unsigned int op_bytes, op_default, ad_bytes, ad_default;
1244 #define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
1245 ? regs->reg \
1246 : ad_bytes == 4 \
1247 ? (u32)regs->reg \
1248 : (u16)regs->reg)
1249 #define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
1250 ? regs->reg = (val) \
1251 : ad_bytes == 4 \
1252 ? (*(u32 *)&regs->reg = (val)) \
1253 : (*(u16 *)&regs->reg = (val)))
1254 unsigned long code_base, code_limit;
1255 char io_emul_stub[16];
1256 void (*io_emul)(struct cpu_user_regs *) __attribute__((__regparm__(1)));
1257 u32 l, h, eax, edx;
1259 if ( !read_descriptor(regs->cs, v, regs,
1260 &code_base, &code_limit, &ar,
1261 _SEGMENT_CODE|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1262 goto fail;
1263 op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
1264 ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
1265 if ( !(ar & _SEGMENT_S) ||
1266 !(ar & _SEGMENT_P) ||
1267 !(ar & _SEGMENT_CODE) )
1268 goto fail;
1270 /* emulating only opcodes not allowing SS to be default */
1271 data_sel = read_sreg(regs, ds);
1273 /* Legacy prefixes. */
1274 for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
1276 switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
1278 case 0x66: /* operand-size override */
1279 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
1280 continue;
1281 case 0x67: /* address-size override */
1282 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
1283 continue;
1284 case 0x2e: /* CS override */
1285 data_sel = regs->cs;
1286 continue;
1287 case 0x3e: /* DS override */
1288 data_sel = read_sreg(regs, ds);
1289 continue;
1290 case 0x26: /* ES override */
1291 data_sel = read_sreg(regs, es);
1292 continue;
1293 case 0x64: /* FS override */
1294 data_sel = read_sreg(regs, fs);
1295 lm_ovr = lm_seg_fs;
1296 continue;
1297 case 0x65: /* GS override */
1298 data_sel = read_sreg(regs, gs);
1299 lm_ovr = lm_seg_gs;
1300 continue;
1301 case 0x36: /* SS override */
1302 data_sel = regs->ss;
1303 continue;
1304 case 0xf0: /* LOCK */
1305 lock = 1;
1306 continue;
1307 case 0xf2: /* REPNE/REPNZ */
1308 case 0xf3: /* REP/REPE/REPZ */
1309 rep_prefix = 1;
1310 continue;
1311 default:
1312 if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
1314 rex = opcode;
1315 continue;
1317 break;
1319 break;
1322 /* REX prefix. */
1323 if ( rex & 8 ) /* REX.W */
1324 op_bytes = 4; /* emulating only opcodes not supporting 64-bit operands */
1325 modrm_reg = (rex & 4) << 1; /* REX.R */
1326 /* REX.X does not need to be decoded. */
1327 modrm_rm = (rex & 1) << 3; /* REX.B */
1329 if ( opcode == 0x0f )
1330 goto twobyte_opcode;
1332 if ( lock )
1333 goto fail;
1335 /* Input/Output String instructions. */
1336 if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
1338 unsigned long data_base, data_limit;
1340 if ( rep_prefix && (rd_ad(ecx) == 0) )
1341 goto done;
1343 if ( !(opcode & 2) )
1345 data_sel = read_sreg(regs, es);
1346 lm_ovr = lm_seg_none;
1349 if ( !(ar & _SEGMENT_L) )
1351 if ( !read_descriptor(data_sel, v, regs,
1352 &data_base, &data_limit, &ar,
1353 _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1354 goto fail;
1355 if ( !(ar & _SEGMENT_S) ||
1356 !(ar & _SEGMENT_P) ||
1357 (opcode & 2 ?
1358 (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
1359 (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
1360 goto fail;
1362 #ifdef CONFIG_X86_64
1363 else
1365 if ( lm_ovr == lm_seg_none || data_sel < 4 )
1367 switch ( lm_ovr )
1369 case lm_seg_none:
1370 data_base = 0UL;
1371 break;
1372 case lm_seg_fs:
1373 data_base = v->arch.guest_context.fs_base;
1374 break;
1375 case lm_seg_gs:
1376 if ( guest_kernel_mode(v, regs) )
1377 data_base = v->arch.guest_context.gs_base_kernel;
1378 else
1379 data_base = v->arch.guest_context.gs_base_user;
1380 break;
1383 else
1384 read_descriptor(data_sel, v, regs,
1385 &data_base, &data_limit, &ar,
1386 0);
1387 data_limit = ~0UL;
1388 ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
1390 #endif
1392 continue_io_string:
1393 switch ( opcode )
1395 case 0x6c: /* INSB */
1396 op_bytes = 1;
1397 case 0x6d: /* INSW/INSL */
1398 if ( data_limit < op_bytes - 1 ||
1399 rd_ad(edi) > data_limit - (op_bytes - 1) ||
1400 !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
1401 goto fail;
1402 port = (u16)regs->edx;
1403 switch ( op_bytes )
1405 case 1:
1406 /* emulate PIT counter 2 */
1407 data = (u8)(guest_inb_okay(port, v, regs) ? inb(port) :
1408 ((port == 0x42 || port == 0x43 || port == 0x61) ?
1409 pv_pit_handler(port, 0, 0) : ~0));
1410 break;
1411 case 2:
1412 data = (u16)(guest_inw_okay(port, v, regs) ? inw(port) : ~0);
1413 break;
1414 case 4:
1415 data = (u32)(guest_inl_okay(port, v, regs) ? inl(port) : ~0);
1416 break;
1418 if ( (rc = copy_to_user((void *)data_base + rd_ad(edi), &data, op_bytes)) != 0 )
1420 propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc,
1421 PFEC_write_access);
1422 return EXCRET_fault_fixed;
1424 wr_ad(edi, regs->edi + (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes));
1425 break;
1427 case 0x6e: /* OUTSB */
1428 op_bytes = 1;
1429 case 0x6f: /* OUTSW/OUTSL */
1430 if ( data_limit < op_bytes - 1 ||
1431 rd_ad(esi) > data_limit - (op_bytes - 1) ||
1432 !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
1433 goto fail;
1434 rc = copy_from_user(&data, (void *)data_base + rd_ad(esi), op_bytes);
1435 if ( rc != 0 )
1437 propagate_page_fault(data_base + rd_ad(esi) + op_bytes - rc, 0);
1438 return EXCRET_fault_fixed;
1440 port = (u16)regs->edx;
1441 switch ( op_bytes )
1443 case 1:
1444 if ( guest_outb_okay(port, v, regs) )
1446 outb((u8)data, port);
1447 if ( pv_post_outb_hook )
1448 pv_post_outb_hook(port, data);
1450 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1451 pv_pit_handler(port, data, 1);
1452 break;
1453 case 2:
1454 if ( guest_outw_okay(port, v, regs) )
1455 outw((u16)data, port);
1456 break;
1457 case 4:
1458 if ( guest_outl_okay(port, v, regs) )
1459 outl((u32)data, port);
1460 break;
1462 wr_ad(esi, regs->esi + (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes));
1463 break;
1466 if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
1468 if ( !hypercall_preempt_check() )
1469 goto continue_io_string;
1470 eip = regs->eip;
1473 goto done;
1476 /*
1477 * Very likely to be an I/O instruction (IN/OUT).
1478 * Build an on-stack stub to execute the instruction with full guest
1479 * GPR context. This is needed for some systems which (ab)use IN/OUT
1480 * to communicate with BIOS code in system-management mode.
1481 */
1482 #ifdef __x86_64__
1483 /* movq $host_to_guest_gpr_switch,%rcx */
1484 io_emul_stub[0] = 0x48;
1485 io_emul_stub[1] = 0xb9;
1486 *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
1487 /* callq *%rcx */
1488 io_emul_stub[10] = 0xff;
1489 io_emul_stub[11] = 0xd1;
1490 #else
1491 /* call host_to_guest_gpr_switch */
1492 io_emul_stub[0] = 0xe8;
1493 *(s32 *)&io_emul_stub[1] =
1494 (char *)host_to_guest_gpr_switch - &io_emul_stub[5];
1495 /* 7 x nop */
1496 memset(&io_emul_stub[5], 0x90, 7);
1497 #endif
1498 /* data16 or nop */
1499 io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66;
1500 /* <io-access opcode> */
1501 io_emul_stub[13] = opcode;
1502 /* imm8 or nop */
1503 io_emul_stub[14] = 0x90;
1504 /* ret (jumps to guest_to_host_gpr_switch) */
1505 io_emul_stub[15] = 0xc3;
1507 /* Handy function-typed pointer to the stub. */
1508 io_emul = (void *)io_emul_stub;
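/* For illustration, on a 64-bit build emulating "in $0x71,%al" (op_bytes == 1)
the stub assembled above contains:
48 b9 <8-byte address> movq $host_to_guest_gpr_switch,%rcx
ff d1 callq *%rcx
90 nop (no data16 prefix for a 1-byte op)
e4 71 in $0x71,%al
c3 ret (returns via guest_to_host_gpr_switch)
The port number 0x71 is filled into io_emul_stub[14] by the IN/OUT imm8
cases below; it is only an example value here. */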
1510 /* I/O Port and Interrupt Flag instructions. */
1511 switch ( opcode )
1513 case 0xe4: /* IN imm8,%al */
1514 op_bytes = 1;
1515 case 0xe5: /* IN imm8,%eax */
1516 port = insn_fetch(u8, code_base, eip, code_limit);
1517 io_emul_stub[14] = port; /* imm8 */
1518 exec_in:
1519 if ( !guest_io_okay(port, op_bytes, v, regs) )
1520 goto fail;
1521 switch ( op_bytes )
1523 case 1:
1524 if ( guest_inb_okay(port, v, regs) )
1525 io_emul(regs);
1526 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1528 regs->eax &= ~0xffUL;
1529 regs->eax |= pv_pit_handler(port, 0, 0);
1531 else
1532 regs->eax |= (u8)~0;
1533 break;
1534 case 2:
1535 if ( guest_inw_okay(port, v, regs) )
1536 io_emul(regs);
1537 else
1538 regs->eax |= (u16)~0;
1539 break;
1540 case 4:
1541 if ( guest_inl_okay(port, v, regs) )
1542 io_emul(regs);
1543 else
1544 regs->eax = (u32)~0;
1545 break;
1547 goto done;
1549 case 0xec: /* IN %dx,%al */
1550 op_bytes = 1;
1551 case 0xed: /* IN %dx,%eax */
1552 port = (u16)regs->edx;
1553 goto exec_in;
1555 case 0xe6: /* OUT %al,imm8 */
1556 op_bytes = 1;
1557 case 0xe7: /* OUT %eax,imm8 */
1558 port = insn_fetch(u8, code_base, eip, code_limit);
1559 io_emul_stub[14] = port; /* imm8 */
1560 exec_out:
1561 if ( !guest_io_okay(port, op_bytes, v, regs) )
1562 goto fail;
1563 switch ( op_bytes )
1565 case 1:
1566 if ( guest_outb_okay(port, v, regs) )
1568 io_emul(regs);
1569 if ( pv_post_outb_hook )
1570 pv_post_outb_hook(port, regs->eax);
1572 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1573 pv_pit_handler(port, regs->eax, 1);
1574 break;
1575 case 2:
1576 if ( guest_outw_okay(port, v, regs) )
1577 io_emul(regs);
1578 break;
1579 case 4:
1580 if ( guest_outl_okay(port, v, regs) )
1581 io_emul(regs);
1582 break;
1584 goto done;
1586 case 0xee: /* OUT %al,%dx */
1587 op_bytes = 1;
1588 case 0xef: /* OUT %eax,%dx */
1589 port = (u16)regs->edx;
1590 goto exec_out;
1592 case 0xfa: /* CLI */
1593 case 0xfb: /* STI */
1594 if ( v->arch.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
1595 goto fail;
1596 /*
1597 * This is just too dangerous to allow, in my opinion. Consider if the
1598 * caller then tries to reenable interrupts using POPF: we can't trap
1599 * that and we'll end up with hard-to-debug lockups. Fast & loose will
1600 * do for us. :-)
1601 */
1602 /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
1603 goto done;
1606 /* No decode of this single-byte opcode. */
1607 goto fail;
1609 twobyte_opcode:
1610 /* Two-byte opcodes only emulated from guest kernel. */
1611 if ( !guest_kernel_mode(v, regs) )
1612 goto fail;
1614 /* Privileged (ring 0) instructions. */
1615 opcode = insn_fetch(u8, code_base, eip, code_limit);
1616 if ( lock && (opcode & ~3) != 0x20 )
1617 goto fail;
1618 switch ( opcode )
1620 case 0x06: /* CLTS */
1621 (void)do_fpu_taskswitch(0);
1622 break;
1624 case 0x09: /* WBINVD */
1625 /* Ignore the instruction if unprivileged. */
1626 if ( !cache_flush_permitted(v->domain) )
1627 /* Non-physdev domain attempted WBINVD; ignore for now since
1628 newer linux uses this in some start-of-day timing loops */
1630 else
1631 wbinvd();
1632 break;
1634 case 0x20: /* MOV CR?,<reg> */
1635 opcode = insn_fetch(u8, code_base, eip, code_limit);
1636 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1637 modrm_rm |= (opcode >> 0) & 7;
1638 reg = decode_register(modrm_rm, regs, 0);
1639 switch ( modrm_reg )
1641 case 0: /* Read CR0 */
1642 *reg = (read_cr0() & ~X86_CR0_TS) |
1643 v->arch.guest_context.ctrlreg[0];
1644 break;
1646 case 2: /* Read CR2 */
1647 *reg = v->arch.guest_context.ctrlreg[2];
1648 break;
1650 case 3: /* Read CR3 */
1651 if ( !is_pv_32on64_vcpu(v) )
1652 *reg = xen_pfn_to_cr3(mfn_to_gmfn(
1653 v->domain, pagetable_get_pfn(v->arch.guest_table)));
1654 #ifdef CONFIG_COMPAT
1655 else
1656 *reg = compat_pfn_to_cr3(mfn_to_gmfn(
1657 v->domain, l4e_get_pfn(*(l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)))));
1658 #endif
1659 break;
1661 case 4: /* Read CR4 */
1662 /*
1663 * Guests can read CR4 to see what features Xen has enabled. We
1664 * therefore lie about PGE & PSE as they are unavailable to guests.
1665 */
1666 *reg = read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE);
1667 break;
1669 default:
1670 goto fail;
1672 break;
1674 case 0x21: /* MOV DR?,<reg> */
1675 opcode = insn_fetch(u8, code_base, eip, code_limit);
1676 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1677 modrm_rm |= (opcode >> 0) & 7;
1678 reg = decode_register(modrm_rm, regs, 0);
1679 if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
1680 goto fail;
1681 *reg = res;
1682 break;
1684 case 0x22: /* MOV <reg>,CR? */
1685 opcode = insn_fetch(u8, code_base, eip, code_limit);
1686 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1687 modrm_rm |= (opcode >> 0) & 7;
1688 reg = decode_register(modrm_rm, regs, 0);
1689 switch ( modrm_reg )
1691 case 0: /* Write CR0 */
1692 if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
1694 gdprintk(XENLOG_WARNING,
1695 "Attempt to change unmodifiable CR0 flags.\n");
1696 goto fail;
1698 (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
1699 break;
1701 case 2: /* Write CR2 */
1702 v->arch.guest_context.ctrlreg[2] = *reg;
1703 arch_set_cr2(v, *reg);
1704 break;
1706 case 3: /* Write CR3 */
1707 LOCK_BIGLOCK(v->domain);
1708 if ( !is_pv_32on64_vcpu(v) )
1709 rc = new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg)));
1710 #ifdef CONFIG_COMPAT
1711 else
1712 rc = new_guest_cr3(gmfn_to_mfn(v->domain, compat_cr3_to_pfn(*reg)));
1713 #endif
1714 UNLOCK_BIGLOCK(v->domain);
1715 if ( rc == 0 ) /* not okay */
1716 goto fail;
1717 break;
1719 case 4: /* Write CR4 */
1720 if ( *reg != (read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE)) )
1721 gdprintk(XENLOG_WARNING,
1722 "Attempt to change CR4 flags %08lx -> %08lx\n",
1723 read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE), *reg);
1724 break;
1726 default:
1727 goto fail;
1729 break;
1731 case 0x23: /* MOV <reg>,DR? */
1732 opcode = insn_fetch(u8, code_base, eip, code_limit);
1733 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1734 modrm_rm |= (opcode >> 0) & 7;
1735 reg = decode_register(modrm_rm, regs, 0);
1736 if ( do_set_debugreg(modrm_reg, *reg) != 0 )
1737 goto fail;
1738 break;
1740 case 0x30: /* WRMSR */
1741 eax = regs->eax;
1742 edx = regs->edx;
1743 res = ((u64)edx << 32) | eax;
1744 switch ( regs->ecx )
1746 #ifdef CONFIG_X86_64
1747 case MSR_FS_BASE:
1748 if ( is_pv_32on64_vcpu(v) )
1749 goto fail;
1750 if ( wrmsr_safe(MSR_FS_BASE, eax, edx) )
1751 goto fail;
1752 v->arch.guest_context.fs_base = res;
1753 break;
1754 case MSR_GS_BASE:
1755 if ( is_pv_32on64_vcpu(v) )
1756 goto fail;
1757 if ( wrmsr_safe(MSR_GS_BASE, eax, edx) )
1758 goto fail;
1759 v->arch.guest_context.gs_base_kernel = res;
1760 break;
1761 case MSR_SHADOW_GS_BASE:
1762 if ( is_pv_32on64_vcpu(v) )
1763 goto fail;
1764 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, eax, edx) )
1765 goto fail;
1766 v->arch.guest_context.gs_base_user = res;
1767 break;
1768 #endif
1769 case MSR_K7_FID_VID_STATUS:
1770 case MSR_K7_FID_VID_CTL:
1771 if ( (cpufreq_controller != FREQCTL_dom0_kernel) ||
1772 (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) ||
1773 wrmsr_safe(regs->ecx, eax, edx) )
1774 goto fail;
1775 break;
1776 case MSR_IA32_PERF_CTL:
1777 if ( (cpufreq_controller != FREQCTL_dom0_kernel) ||
1778 (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) ||
1779 wrmsr_safe(regs->ecx, eax, edx) )
1780 goto fail;
1781 break;
1782 default:
1783 if ( wrmsr_hypervisor_regs(regs->ecx, eax, edx) )
1784 break;
1785 if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
1786 (eax != l) || (edx != h) )
1787 gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from "
1788 "%08x:%08x to %08x:%08x.\n",
1789 _p(regs->ecx), h, l, edx, eax);
1790 break;
1792 break;
1794 case 0x32: /* RDMSR */
1795 switch ( regs->ecx )
1797 #ifdef CONFIG_X86_64
1798 case MSR_FS_BASE:
1799 if ( is_pv_32on64_vcpu(v) )
1800 goto fail;
1801 regs->eax = v->arch.guest_context.fs_base & 0xFFFFFFFFUL;
1802 regs->edx = v->arch.guest_context.fs_base >> 32;
1803 break;
1804 case MSR_GS_BASE:
1805 if ( is_pv_32on64_vcpu(v) )
1806 goto fail;
1807 regs->eax = v->arch.guest_context.gs_base_kernel & 0xFFFFFFFFUL;
1808 regs->edx = v->arch.guest_context.gs_base_kernel >> 32;
1809 break;
1810 case MSR_SHADOW_GS_BASE:
1811 if ( is_pv_32on64_vcpu(v) )
1812 goto fail;
1813 regs->eax = v->arch.guest_context.gs_base_user & 0xFFFFFFFFUL;
1814 regs->edx = v->arch.guest_context.gs_base_user >> 32;
1815 break;
1816 #endif
1817 case MSR_K7_FID_VID_CTL:
1818 case MSR_K7_FID_VID_STATUS:
1819 if ( (cpufreq_controller != FREQCTL_dom0_kernel) ||
1820 (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) ||
1821 rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1822 goto fail;
1823 break;
1824 case MSR_EFER:
1825 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1826 goto fail;
1827 break;
1828 default:
1829 if ( rdmsr_hypervisor_regs(regs->ecx, &l, &h) )
1831 regs->eax = l;
1832 regs->edx = h;
1833 break;
1835 /* Everyone can read the MSR space. */
1836 /* gdprintk(XENLOG_WARNING,"Domain attempted RDMSR %p.\n",
1837 _p(regs->ecx));*/
1838 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1839 goto fail;
1840 break;
1842 break;
1844 default:
1845 goto fail;
1848 #undef wr_ad
1849 #undef rd_ad
1851 done:
1852 regs->eip = eip;
1853 regs->eflags &= ~X86_EFLAGS_RF;
1854 return EXCRET_fault_fixed;
1856 fail:
1857 return 0;
1860 asmlinkage int do_general_protection(struct cpu_user_regs *regs)
1862 struct vcpu *v = current;
1863 unsigned long fixup;
1865 DEBUGGER_trap_entry(TRAP_gp_fault, regs);
1867 if ( regs->error_code & 1 )
1868 goto hardware_gp;
1870 if ( !guest_mode(regs) )
1871 goto gp_in_kernel;
1873 /*
1874 * Cunning trick to allow arbitrary "INT n" handling.
1876 * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
1877 * instruction from trapping to the appropriate vector, when that might not
1878 * be expected by Xen or the guest OS. For example, that entry might be for
1879 * a fault handler (unlike traps, faults don't increment EIP), or might
1880 * expect an error code on the stack (which a software trap never
1881 * provides), or might be a hardware interrupt handler that doesn't like
1882 * being called spuriously.
1884 * Instead, a GPF occurs with the faulting IDT vector in the error code.
1885 * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
1886 * clear to indicate that it's a software fault, not hardware.
1888 * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
1889 * okay because they can only be triggered by an explicit DPL-checked
1890 * instruction. The DPL specified by the guest OS for these vectors is NOT
1891 * CHECKED!!
1892 */
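/* Worked example (illustrative): a guest executes "int $0x30". Because every
IDT descriptor has DPL == 0, the CPU raises #GP with error code
(0x30 << 3) | 2 = 0x182: bit 1 set (IDT entry), bit 0 clear (software, not
an external event). The code below recovers vector 0x30 as error_code >> 3
and, if the guest registered that vector with a permissive DPL, steps EIP
over the 2-byte instruction and bounces it to the guest's handler. */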
1893 if ( (regs->error_code & 3) == 2 )
1895 /* This fault must be due to <INT n> instruction. */
1896 const struct trap_info *ti;
1897 unsigned char vector = regs->error_code >> 3;
1898 ti = &v->arch.guest_context.trap_ctxt[vector];
1899 if ( permit_softint(TI_GET_DPL(ti), v, regs) )
1901 regs->eip += 2;
1902 return do_guest_trap(vector, regs, 0);
1906 /* Emulate some simple privileged and I/O instructions. */
1907 if ( (regs->error_code == 0) &&
1908 emulate_privileged_op(regs) )
1910 trace_trap_one_addr(TRC_PV_EMULATE_PRIVOP, regs->eip);
1911 return 0;
1914 #if defined(__i386__)
1915 if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
1916 (regs->error_code == 0) &&
1917 gpf_emulate_4gb(regs) )
1919 TRACE_1D(TRC_PV_EMULATE_4GB, regs->eip);
1920 return 0;
1922 #endif
1924 /* Pass on GPF as is. */
1925 return do_guest_trap(TRAP_gp_fault, regs, 1);
1927 gp_in_kernel:
1929 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
1931 dprintk(XENLOG_INFO, "GPF (%04x): %p -> %p\n",
1932 regs->error_code, _p(regs->eip), _p(fixup));
1933 regs->eip = fixup;
1934 return 0;
1937 DEBUGGER_trap_fatal(TRAP_gp_fault, regs);
1939 hardware_gp:
1940 show_execution_state(regs);
1941 panic("GENERAL PROTECTION FAULT\n[error_code=%04x]\n", regs->error_code);
1942 return 0;
1945 static void nmi_softirq(void)
1947 /* Only used to defer wakeup of dom0,vcpu0 to a safe (non-NMI) context. */
1948 vcpu_kick(dom0->vcpu[0]);
1951 static void nmi_dom0_report(unsigned int reason_idx)
1953 struct domain *d;
1954 struct vcpu *v;
1956 if ( ((d = dom0) == NULL) || ((v = d->vcpu[0]) == NULL) )
1957 return;
1959 set_bit(reason_idx, nmi_reason(d));
1961 if ( !test_and_set_bool(v->nmi_pending) )
1962 raise_softirq(NMI_SOFTIRQ); /* not safe to wake up a vcpu here */
1965 asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
1967 switch ( opt_nmi[0] )
1969 case 'd': /* 'dom0' */
1970 nmi_dom0_report(_XEN_NMIREASON_parity_error);
1971 case 'i': /* 'ignore' */
1972 break;
1973 default: /* 'fatal' */
1974 console_force_unlock();
1975 printk("\n\nNMI - MEMORY ERROR\n");
1976 fatal_trap(TRAP_nmi, regs);
1979 outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable parity check */
1980 mdelay(1);
1981 outb((inb(0x61) & 0x0b) | 0x00, 0x61); /* enable parity check */
1984 asmlinkage void io_check_error(struct cpu_user_regs *regs)
1986 switch ( opt_nmi[0] )
1988 case 'd': /* 'dom0' */
1989 nmi_dom0_report(_XEN_NMIREASON_io_error);
1990 case 'i': /* 'ignore' */
1991 break;
1992 default: /* 'fatal' */
1993 console_force_unlock();
1994 printk("\n\nNMI - I/O ERROR\n");
1995 fatal_trap(TRAP_nmi, regs);
1998 outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
1999 mdelay(1);
2000 outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
2003 static void unknown_nmi_error(unsigned char reason)
2005 switch ( opt_nmi[0] )
2007 case 'd': /* 'dom0' */
2008 nmi_dom0_report(_XEN_NMIREASON_unknown);
2009 case 'i': /* 'ignore' */
2010 break;
2011 default: /* 'fatal' */
2012 printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
2013 printk("Dazed and confused, but trying to continue\n");
2014 printk("Do you have a strange power saving mode enabled?\n");
2015 kexec_crash();
2019 static int dummy_nmi_callback(struct cpu_user_regs *regs, int cpu)
2021 return 0;
2024 static nmi_callback_t nmi_callback = dummy_nmi_callback;
2026 asmlinkage void do_nmi(struct cpu_user_regs *regs)
2028 unsigned int cpu = smp_processor_id();
2029 unsigned char reason;
2031 ++nmi_count(cpu);
2033 if ( nmi_callback(regs, cpu) )
2034 return;
2036 if ( nmi_watchdog )
2037 nmi_watchdog_tick(regs);
2039 /* Only the BSP gets external NMIs from the system. */
2040 if ( cpu == 0 )
2042 reason = inb(0x61);
2043 if ( reason & 0x80 )
2044 mem_parity_error(regs);
2045 else if ( reason & 0x40 )
2046 io_check_error(regs);
2047 else if ( !nmi_watchdog )
2048 unknown_nmi_error((unsigned char)(reason&0xff));
2052 void set_nmi_callback(nmi_callback_t callback)
2054 nmi_callback = callback;
2057 void unset_nmi_callback(void)
2059 nmi_callback = dummy_nmi_callback;
2062 asmlinkage int do_device_not_available(struct cpu_user_regs *regs)
2063 {
2064 BUG_ON(!guest_mode(regs));
2066 setup_fpu(current);
2068 if ( current->arch.guest_context.ctrlreg[0] & X86_CR0_TS )
2069 {
2070 do_guest_trap(TRAP_no_device, regs, 0);
2071 current->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
2072 }
2073 else
2074 TRACE_0D(TRC_PV_MATH_STATE_RESTORE);
2076 return EXCRET_fault_fixed;
2077 }

asmlinkage int do_debug(struct cpu_user_regs *regs)
{
    unsigned long condition;
    struct vcpu *v = current;

    asm volatile ( "mov %%db6,%0" : "=r" (condition) );

    /* Mask out spurious debug traps due to lazy DR7 setting */
    if ( (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) &&
         (v->arch.guest_context.debugreg[7] == 0) )
    {
        asm volatile ( "mov %0,%%db7" : : "r" (0UL) );
        goto out;
    }

    DEBUGGER_trap_entry(TRAP_debug, regs);

    if ( !guest_mode(regs) )
    {
        /* Clear TF just for absolute sanity. */
        regs->eflags &= ~EF_TF;
        /*
         * We ignore watchpoints when they trigger within Xen. This may happen
         * when a buffer is passed to us which previously had a watchpoint set
         * on it. No need to bump EIP; the only faulting trap is an instruction
         * breakpoint, which can't happen to us.
         */
        goto out;
    }

    /* Save debug status register where guest OS can peek at it */
    v->arch.guest_context.debugreg[6] = condition;

    ler_enable();

    return do_guest_trap(TRAP_debug, regs, 0);

 out:
    ler_enable();
    return EXCRET_not_a_fault;
}

asmlinkage int do_spurious_interrupt_bug(struct cpu_user_regs *regs)
{
    return EXCRET_not_a_fault;
}

void set_intr_gate(unsigned int n, void *addr)
{
    int i;
    /* Keep secondary tables in sync with IRQ updates. */
    for ( i = 1; i < NR_CPUS; i++ )
        if ( idt_tables[i] != NULL )
            _set_gate(&idt_tables[i][n], 14, 0, addr);
    _set_gate(&idt_table[n], 14, 0, addr);
}

void set_system_gate(unsigned int n, void *addr)
{
    _set_gate(idt_table+n,14,3,addr);
}
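
/*
 * set_task_gate(): the TSS selector goes in the high word of 'a'; 'b' is
 * 0x8500, i.e. a present, DPL-0 task gate (descriptor type 5).
 */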
void set_task_gate(unsigned int n, unsigned int sel)
{
    idt_table[n].a = sel << 16;
    idt_table[n].b = 0x8500;
}
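
/*
 * Descriptor type 9 below is an available TSS; the CONFIG_COMPAT entry uses
 * type 11 (busy TSS).
 */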
void set_tss_desc(unsigned int n, void *addr)
{
    _set_tssldt_desc(
        gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
        (unsigned long)addr,
        offsetof(struct tss_struct, __cacheline_filler) - 1,
        9);
#ifdef CONFIG_COMPAT
    _set_tssldt_desc(
        compat_gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
        (unsigned long)addr,
        offsetof(struct tss_struct, __cacheline_filler) - 1,
        11);
#endif
}

void __devinit percpu_traps_init(void)
{
    subarch_percpu_traps_init();

    if ( !opt_ler )
        return;

    switch ( boot_cpu_data.x86_vendor )
    {
    case X86_VENDOR_INTEL:
        switch ( boot_cpu_data.x86 )
        {
        case 6:
            this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
            break;
        case 15:
            this_cpu(ler_msr) = MSR_P4_LER_FROM_LIP;
            break;
        }
        break;
    case X86_VENDOR_AMD:
        switch ( boot_cpu_data.x86 )
        {
        case 6:
        case 15:
        case 16:
            this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
            break;
        }
        break;
    }

    ler_enable();
}

void __init trap_init(void)
{
    /*
     * Note that interrupt gates are always used, rather than trap gates. We
     * must have interrupts disabled until DS/ES/FS/GS are saved because the
     * first activation must have the "bad" value(s) for these registers and
     * we may lose them if another activation is installed before they are
     * saved. The page-fault handler also needs interrupts disabled until %cr2
     * has been read and saved on the stack.
     */
    set_intr_gate(TRAP_divide_error,&divide_error);
    set_intr_gate(TRAP_debug,&debug);
    set_intr_gate(TRAP_nmi,&nmi);
    set_system_gate(TRAP_int3,&int3);         /* usable from all privileges */
    set_system_gate(TRAP_overflow,&overflow); /* usable from all privileges */
    set_intr_gate(TRAP_bounds,&bounds);
    set_intr_gate(TRAP_invalid_op,&invalid_op);
    set_intr_gate(TRAP_no_device,&device_not_available);
    set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
    set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
    set_intr_gate(TRAP_no_segment,&segment_not_present);
    set_intr_gate(TRAP_stack_error,&stack_segment);
    set_intr_gate(TRAP_gp_fault,&general_protection);
    set_intr_gate(TRAP_page_fault,&page_fault);
    set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
    set_intr_gate(TRAP_copro_error,&coprocessor_error);
    set_intr_gate(TRAP_alignment_check,&alignment_check);
    set_intr_gate(TRAP_machine_check,&machine_check);
    set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);

    /* CPU0 uses the master IDT. */
    idt_tables[0] = idt_table;

    percpu_traps_init();

    cpu_init();

    open_softirq(NMI_SOFTIRQ, nmi_softirq);
}

long register_guest_nmi_callback(unsigned long address)
{
    struct vcpu *v = current;
    struct domain *d = current->domain;
    struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];

    t->vector  = TRAP_nmi;
    t->flags   = 0;
    t->cs      = !IS_COMPAT(d) ? FLAT_KERNEL_CS : FLAT_COMPAT_KERNEL_CS;
    t->address = address;
    TI_SET_IF(t, 1);

    /*
     * If no handler was registered we can 'lose the NMI edge'. Re-assert it
     * now.
     */
    if ( (v->vcpu_id == 0) && (arch_get_nmi_reason(d) != 0) )
        v->nmi_pending = 1;

    return 0;
}

long unregister_guest_nmi_callback(void)
{
    struct vcpu *v = current;
    struct trap_info *t = &v->arch.guest_context.trap_ctxt[TRAP_nmi];

    memset(t, 0, sizeof(*t));

    return 0;
}

long do_set_trap_table(XEN_GUEST_HANDLE(trap_info_t) traps)
{
    struct trap_info cur;
    struct trap_info *dst = current->arch.guest_context.trap_ctxt;
    long rc = 0;

    /* If no table is presented then clear the entire virtual IDT. */
    if ( guest_handle_is_null(traps) )
    {
        memset(dst, 0, 256 * sizeof(*dst));
        init_int80_direct_trap(current);
        return 0;
    }
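
    /* Copy in one entry at a time; create a continuation if preempted. */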
    for ( ; ; )
    {
        if ( hypercall_preempt_check() )
        {
            rc = hypercall_create_continuation(
                __HYPERVISOR_set_trap_table, "h", traps);
            break;
        }

        if ( copy_from_guest(&cur, traps, 1) )
        {
            rc = -EFAULT;
            break;
        }

        if ( cur.address == 0 )
            break;

        if ( (cur.vector == TRAP_nmi) && !TI_GET_IF(&cur) )
        {
            rc = -EINVAL;
            break;
        }

        fixup_guest_code_selector(current->domain, cur.cs);

        memcpy(&dst[cur.vector], &cur, sizeof(cur));

        if ( cur.vector == 0x80 )
            init_int80_direct_trap(current);

        guest_handle_add_offset(traps, 1);
    }

    return rc;
}

long set_debugreg(struct vcpu *p, int reg, unsigned long value)
{
    int i;

    switch ( reg )
    {
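    /* DR0..DR3 hold breakpoint addresses; they must pass access_ok(). */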
    case 0:
        if ( !access_ok(value, sizeof(long)) )
            return -EPERM;
        if ( p == current )
            asm volatile ( "mov %0, %%db0" : : "r" (value) );
        break;
    case 1:
        if ( !access_ok(value, sizeof(long)) )
            return -EPERM;
        if ( p == current )
            asm volatile ( "mov %0, %%db1" : : "r" (value) );
        break;
    case 2:
        if ( !access_ok(value, sizeof(long)) )
            return -EPERM;
        if ( p == current )
            asm volatile ( "mov %0, %%db2" : : "r" (value) );
        break;
    case 3:
        if ( !access_ok(value, sizeof(long)) )
            return -EPERM;
        if ( p == current )
            asm volatile ( "mov %0, %%db3" : : "r" (value) );
        break;
    case 6:
        /*
         * DR6: Bits 4-11,16-31 reserved (set to 1).
         *      Bit 12 reserved (set to 0).
         */
        value &= 0xffffefff; /* reserved bits => 0 */
        value |= 0xffff0ff0; /* reserved bits => 1 */
        if ( p == current )
            asm volatile ( "mov %0, %%db6" : : "r" (value) );
        break;
    case 7:
        /*
         * DR7: Bit 10 reserved (set to 1).
         *      Bits 11-12,14-15 reserved (set to 0).
         * Privileged bits:
         *      GD (bit 13): must be 0.
         *      R/Wn (bits 16-17,20-21,24-25,28-29): mustn't be 10.
         *      LENn (bits 18-19,22-23,26-27,30-31): mustn't be 10.
         */
        /* DR7 == 0 => debugging disabled for this domain. */
        if ( value != 0 )
        {
            value &= 0xffff27ff; /* reserved bits => 0 */
            value |= 0x00000400; /* reserved bits => 1 */
            if ( (value & (1<<13)) != 0 ) return -EPERM;
            for ( i = 0; i < 16; i += 2 )
                if ( ((value >> (i+16)) & 3) == 2 ) return -EPERM;
        }
        if ( p == current )
            asm volatile ( "mov %0, %%db7" : : "r" (value) );
        break;
    default:
        return -EINVAL;
    }

    p->arch.guest_context.debugreg[reg] = value;
    return 0;
}

long do_set_debugreg(int reg, unsigned long value)
{
    return set_debugreg(current, reg, value);
}

unsigned long do_get_debugreg(int reg)
{
    if ( (reg < 0) || (reg > 7) ) return -EINVAL;
    return current->arch.guest_context.debugreg[reg];
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */