ia64/xen-unstable

view xen/arch/x86/traps.c @ 15096:75b4c7cb007d

x86: suppress SYSCALL feature for 32on64 guests, since Xen cannot handle it.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author kfraser@localhost.localdomain
date Tue May 15 09:54:27 2007 +0100 (2007-05-15)
parents 384a29655270
children e49b110cbb4a
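
The suppression itself is visible further down in emulate_forced_invalid_op():
when a 32-bit PV guest (which includes 32on64 guests) issues the forced-emulation
CPUID for leaf 0x80000001, the SYSCALL feature bit is cleared from the returned
%edx, alongside the unconditional RDTSCP clearing. As the file reads at this
revision:

    else if ( regs->eax == 0x80000001 )
    {
        /* Modify Feature Information. */
        if ( is_pv_32bit_vcpu(current) )
            clear_bit(X86_FEATURE_SYSCALL % 32, &d);
        clear_bit(X86_FEATURE_RDTSCP % 32, &d);
    }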
1 /******************************************************************************
2 * arch/x86/traps.c
3 *
4 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * Copyright (C) 1991, 1992 Linus Torvalds
23 *
24 * Pentium III FXSR, SSE support
25 * Gareth Hughes <gareth@valinux.com>, May 2000
26 */
28 #include <xen/config.h>
29 #include <xen/init.h>
30 #include <xen/sched.h>
31 #include <xen/lib.h>
32 #include <xen/errno.h>
33 #include <xen/mm.h>
34 #include <xen/console.h>
35 #include <xen/shutdown.h>
36 #include <asm/regs.h>
37 #include <xen/delay.h>
38 #include <xen/event.h>
39 #include <xen/spinlock.h>
40 #include <xen/irq.h>
41 #include <xen/perfc.h>
42 #include <xen/softirq.h>
43 #include <xen/domain_page.h>
44 #include <xen/symbols.h>
45 #include <xen/iocap.h>
46 #include <xen/nmi.h>
47 #include <xen/version.h>
48 #include <xen/kexec.h>
49 #include <asm/paging.h>
50 #include <asm/system.h>
51 #include <asm/io.h>
52 #include <asm/atomic.h>
53 #include <asm/desc.h>
54 #include <asm/debugreg.h>
55 #include <asm/smp.h>
56 #include <asm/flushtlb.h>
57 #include <asm/uaccess.h>
58 #include <asm/i387.h>
59 #include <asm/debugger.h>
60 #include <asm/msr.h>
61 #include <asm/shared.h>
62 #include <asm/x86_emulate.h>
63 #include <asm/hvm/vpt.h>
65 /*
66 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
67 * fatal: Xen prints diagnostic message and then hangs.
68 * dom0: The NMI is virtualised to DOM0.
69 * ignore: The NMI error is cleared and ignored.
70 */
71 #ifdef NDEBUG
72 char opt_nmi[10] = "dom0";
73 #else
74 char opt_nmi[10] = "fatal";
75 #endif
76 string_param("nmi", opt_nmi);
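/* Example: booting Xen with "nmi=ignore", "nmi=dom0" or "nmi=fatal" on its
 * command line overrides the default chosen above (debug builds default to
 * "fatal", non-debug builds to "dom0"). */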
78 /* Master table, used by CPU0. */
79 idt_entry_t idt_table[IDT_ENTRIES];
81 /* Pointer to the IDT of every CPU. */
82 idt_entry_t *idt_tables[NR_CPUS] __read_mostly;
84 #define DECLARE_TRAP_HANDLER(_name) \
85 asmlinkage void _name(void); \
86 asmlinkage int do_ ## _name(struct cpu_user_regs *regs)
88 asmlinkage void nmi(void);
89 DECLARE_TRAP_HANDLER(divide_error);
90 DECLARE_TRAP_HANDLER(debug);
91 DECLARE_TRAP_HANDLER(int3);
92 DECLARE_TRAP_HANDLER(overflow);
93 DECLARE_TRAP_HANDLER(bounds);
94 DECLARE_TRAP_HANDLER(invalid_op);
95 DECLARE_TRAP_HANDLER(device_not_available);
96 DECLARE_TRAP_HANDLER(coprocessor_segment_overrun);
97 DECLARE_TRAP_HANDLER(invalid_TSS);
98 DECLARE_TRAP_HANDLER(segment_not_present);
99 DECLARE_TRAP_HANDLER(stack_segment);
100 DECLARE_TRAP_HANDLER(general_protection);
101 DECLARE_TRAP_HANDLER(page_fault);
102 DECLARE_TRAP_HANDLER(coprocessor_error);
103 DECLARE_TRAP_HANDLER(simd_coprocessor_error);
104 DECLARE_TRAP_HANDLER(alignment_check);
105 DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
106 DECLARE_TRAP_HANDLER(machine_check);
108 long do_set_debugreg(int reg, unsigned long value);
109 unsigned long do_get_debugreg(int reg);
111 static int debug_stack_lines = 20;
112 integer_param("debug_stack_lines", debug_stack_lines);
114 #ifdef CONFIG_X86_32
115 #define stack_words_per_line 8
116 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
117 #else
118 #define stack_words_per_line 4
119 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
120 #endif
122 static void show_guest_stack(struct cpu_user_regs *regs)
123 {
124 int i;
125 unsigned long *stack, addr;
127 if ( is_hvm_vcpu(current) )
128 return;
130 if ( is_pv_32on64_vcpu(current) )
131 {
132 compat_show_guest_stack(regs, debug_stack_lines);
133 return;
134 }
136 if ( vm86_mode(regs) )
137 {
138 stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff));
139 printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ",
140 regs->ss, (uint16_t)(regs->esp & 0xffff));
141 }
142 else
143 {
144 stack = (unsigned long *)regs->esp;
145 printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
146 }
148 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
149 {
150 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
151 break;
152 if ( get_user(addr, stack) )
153 {
154 if ( i != 0 )
155 printk("\n ");
156 printk("Fault while accessing guest memory.");
157 i = 1;
158 break;
159 }
160 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
161 printk("\n ");
162 printk(" %p", _p(addr));
163 stack++;
164 }
165 if ( i == 0 )
166 printk("Stack empty.");
167 printk("\n");
168 }
170 #ifdef NDEBUG
172 static void show_trace(struct cpu_user_regs *regs)
173 {
174 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
176 printk("Xen call trace:\n ");
178 printk("[<%p>]", _p(regs->eip));
179 print_symbol(" %s\n ", regs->eip);
181 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
182 {
183 addr = *stack++;
184 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
185 {
186 printk("[<%p>]", _p(addr));
187 print_symbol(" %s\n ", addr);
188 }
189 }
191 printk("\n");
192 }
194 #else
196 static void show_trace(struct cpu_user_regs *regs)
197 {
198 unsigned long *frame, next, addr, low, high;
200 printk("Xen call trace:\n ");
202 printk("[<%p>]", _p(regs->eip));
203 print_symbol(" %s\n ", regs->eip);
205 /* Bounds for range of valid frame pointer. */
206 low = (unsigned long)(ESP_BEFORE_EXCEPTION(regs) - 2);
207 high = (low & ~(STACK_SIZE - 1)) +
208 (STACK_SIZE - sizeof(struct cpu_info) - 2*sizeof(unsigned long));
210 /* The initial frame pointer. */
211 next = regs->ebp;
213 for ( ; ; )
214 {
215 /* Valid frame pointer? */
216 if ( (next < low) || (next >= high) )
217 {
218 /*
219 * Exception stack frames have a different layout, denoted by an
220 * inverted frame pointer.
221 */
222 next = ~next;
223 if ( (next < low) || (next >= high) )
224 break;
225 frame = (unsigned long *)next;
226 next = frame[0];
227 addr = frame[(offsetof(struct cpu_user_regs, eip) -
228 offsetof(struct cpu_user_regs, ebp))
229 / BYTES_PER_LONG];
230 }
231 else
232 {
233 /* Ordinary stack frame. */
234 frame = (unsigned long *)next;
235 next = frame[0];
236 addr = frame[1];
237 }
239 printk("[<%p>]", _p(addr));
240 print_symbol(" %s\n ", addr);
242 low = (unsigned long)&frame[2];
243 }
245 printk("\n");
246 }
248 #endif
250 void show_stack(struct cpu_user_regs *regs)
251 {
252 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
253 int i;
255 if ( guest_mode(regs) )
256 return show_guest_stack(regs);
258 printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
260 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
261 {
262 if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
263 break;
264 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
265 printk("\n ");
266 addr = *stack++;
267 printk(" %p", _p(addr));
268 }
269 if ( i == 0 )
270 printk("Stack empty.");
271 printk("\n");
273 show_trace(regs);
274 }
276 void show_stack_overflow(unsigned int cpu, unsigned long esp)
277 {
278 #ifdef MEMORY_GUARD
279 unsigned long esp_top, esp_bottom;
280 unsigned long *stack, addr;
282 esp_bottom = (esp | (STACK_SIZE - 1)) + 1;
283 esp_top = esp_bottom - DEBUG_STACK_SIZE;
285 printk("Valid stack range: %p-%p, sp=%p, tss.esp0=%p\n",
286 (void *)esp_top, (void *)esp_bottom, (void *)esp,
287 (void *)init_tss[cpu].esp0);
289 /* Trigger overflow trace if %esp is within 512 bytes of the guard page. */
290 if ( ((unsigned long)(esp - esp_top) > 512) &&
291 ((unsigned long)(esp_top - esp) > 512) )
292 {
293 printk("No stack overflow detected. Skipping stack trace.\n");
294 return;
295 }
297 if ( esp < esp_top )
298 esp = esp_top;
300 printk("Xen stack overflow (dumping trace %p-%p):\n ",
301 (void *)esp, (void *)esp_bottom);
303 stack = (unsigned long *)esp;
304 while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 )
305 {
306 addr = *stack++;
307 if ( is_kernel_text(addr) || is_kernel_inittext(addr) )
308 {
309 printk("%p: [<%p>]", stack, _p(addr));
310 print_symbol(" %s\n ", addr);
311 }
312 }
314 printk("\n");
315 #endif
316 }
318 void show_execution_state(struct cpu_user_regs *regs)
319 {
320 show_registers(regs);
321 show_stack(regs);
322 }
324 char *trapstr(int trapnr)
325 {
326 static char *strings[] = {
327 "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
328 "invalid opcode", "device not available", "double fault",
329 "coprocessor segment", "invalid tss", "segment not found",
330 "stack error", "general protection fault", "page fault",
331 "spurious interrupt", "coprocessor error", "alignment check",
332 "machine check", "simd error"
333 };
335 if ( (trapnr < 0) || (trapnr >= ARRAY_SIZE(strings)) )
336 return "???";
338 return strings[trapnr];
339 }
341 /*
342 * This is called for faults at very unexpected times (e.g., when interrupts
343 * are disabled). In such situations we can't do much that is safe. We try to
344 * print out some tracing and then we just spin.
345 */
346 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs)
347 {
348 watchdog_disable();
349 console_start_sync();
351 show_execution_state(regs);
353 if ( trapnr == TRAP_page_fault )
354 {
355 unsigned long cr2 = read_cr2();
356 printk("Faulting linear address: %p\n", _p(cr2));
357 show_page_walk(cr2);
358 }
360 panic("FATAL TRAP: vector = %d (%s)\n"
361 "[error_code=%04x] %s\n",
362 trapnr, trapstr(trapnr), regs->error_code,
363 (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
364 }
366 static int do_guest_trap(
367 int trapnr, const struct cpu_user_regs *regs, int use_error_code)
368 {
369 struct vcpu *v = current;
370 struct trap_bounce *tb;
371 const struct trap_info *ti;
373 tb = &v->arch.trap_bounce;
374 ti = &v->arch.guest_context.trap_ctxt[trapnr];
376 tb->flags = TBF_EXCEPTION;
377 tb->cs = ti->cs;
378 tb->eip = ti->address;
380 if ( use_error_code )
381 {
382 tb->flags |= TBF_EXCEPTION_ERRCODE;
383 tb->error_code = regs->error_code;
384 }
386 if ( TI_GET_IF(ti) )
387 tb->flags |= TBF_INTERRUPT;
389 if ( unlikely(null_trap_bounce(v, tb)) )
390 gdprintk(XENLOG_WARNING, "Unhandled %s fault/trap [#%d] in "
391 "domain %d on VCPU %d [ec=%04x]\n",
392 trapstr(trapnr), trapnr, v->domain->domain_id, v->vcpu_id,
393 regs->error_code);
395 return 0;
396 }
398 static inline int do_trap(
399 int trapnr, struct cpu_user_regs *regs, int use_error_code)
400 {
401 unsigned long fixup;
403 DEBUGGER_trap_entry(trapnr, regs);
405 if ( guest_mode(regs) )
406 return do_guest_trap(trapnr, regs, use_error_code);
408 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
409 {
410 dprintk(XENLOG_ERR, "Trap %d: %p -> %p\n",
411 trapnr, _p(regs->eip), _p(fixup));
412 regs->eip = fixup;
413 return 0;
414 }
416 DEBUGGER_trap_fatal(trapnr, regs);
418 show_execution_state(regs);
419 panic("FATAL TRAP: vector = %d (%s)\n"
420 "[error_code=%04x]\n",
421 trapnr, trapstr(trapnr), regs->error_code);
422 return 0;
423 }
425 #define DO_ERROR_NOCODE(trapnr, name) \
426 asmlinkage int do_##name(struct cpu_user_regs *regs) \
427 { \
428 return do_trap(trapnr, regs, 0); \
429 }
431 #define DO_ERROR(trapnr, name) \
432 asmlinkage int do_##name(struct cpu_user_regs *regs) \
433 { \
434 return do_trap(trapnr, regs, 1); \
435 }
437 DO_ERROR_NOCODE(TRAP_divide_error, divide_error)
438 DO_ERROR_NOCODE(TRAP_overflow, overflow)
439 DO_ERROR_NOCODE(TRAP_bounds, bounds)
440 DO_ERROR_NOCODE(TRAP_copro_seg, coprocessor_segment_overrun)
441 DO_ERROR( TRAP_invalid_tss, invalid_TSS)
442 DO_ERROR( TRAP_no_segment, segment_not_present)
443 DO_ERROR( TRAP_stack_error, stack_segment)
444 DO_ERROR_NOCODE(TRAP_copro_error, coprocessor_error)
445 DO_ERROR( TRAP_alignment_check, alignment_check)
446 DO_ERROR_NOCODE(TRAP_simd_error, simd_coprocessor_error)
448 int rdmsr_hypervisor_regs(
449 uint32_t idx, uint32_t *eax, uint32_t *edx)
450 {
451 idx -= 0x40000000;
452 if ( idx > 0 )
453 return 0;
455 *eax = *edx = 0;
456 return 1;
457 }
459 int wrmsr_hypervisor_regs(
460 uint32_t idx, uint32_t eax, uint32_t edx)
461 {
462 struct domain *d = current->domain;
464 idx -= 0x40000000;
465 if ( idx > 0 )
466 return 0;
468 switch ( idx )
469 {
470 case 0:
471 {
472 void *hypercall_page;
473 unsigned long mfn;
474 unsigned long gmfn = ((unsigned long)edx << 20) | (eax >> 12);
475 unsigned int idx = eax & 0xfff;
477 if ( idx > 0 )
478 {
479 gdprintk(XENLOG_WARNING,
480 "Dom%d: Out of range index %u to MSR %08x\n",
481 d->domain_id, idx, 0x40000000);
482 return 0;
483 }
485 mfn = gmfn_to_mfn(d, gmfn);
487 if ( !mfn_valid(mfn) ||
488 !get_page_and_type(mfn_to_page(mfn), d, PGT_writable_page) )
489 {
490 gdprintk(XENLOG_WARNING,
491 "Dom%d: Bad GMFN %lx (MFN %lx) to MSR %08x\n",
492 d->domain_id, gmfn, mfn, 0x40000000);
493 return 0;
494 }
496 hypercall_page = map_domain_page(mfn);
497 hypercall_page_initialise(d, hypercall_page);
498 unmap_domain_page(hypercall_page);
500 put_page_and_type(mfn_to_page(mfn));
501 break;
502 }
504 default:
505 BUG();
506 }
508 return 1;
509 }
511 int cpuid_hypervisor_leaves(
512 uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
513 {
514 idx -= 0x40000000;
515 if ( idx > 2 )
516 return 0;
518 switch ( idx )
519 {
520 case 0:
521 *eax = 0x40000002; /* Largest leaf */
522 *ebx = 0x566e6558; /* Signature 1: "XenV" */
523 *ecx = 0x65584d4d; /* Signature 2: "MMXe" */
524 *edx = 0x4d4d566e; /* Signature 3: "nVMM" */
525 break;
527 case 1:
528 *eax = (xen_major_version() << 16) | xen_minor_version();
529 *ebx = 0; /* Reserved */
530 *ecx = 0; /* Reserved */
531 *edx = 0; /* Reserved */
532 break;
534 case 2:
535 *eax = 1; /* Number of hypercall-transfer pages */
536 *ebx = 0x40000000; /* MSR base address */
537 *ecx = 0; /* Features 1 */
538 *edx = 0; /* Features 2 */
539 break;
541 default:
542 BUG();
543 }
545 return 1;
546 }
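/*
 * Illustrative guest-side discovery sequence (sketch):
 *   cpuid(0x40000000) -> eax = 0x40000002 (max leaf),
 *                        ebx,ecx,edx = "XenVMMXenVMM" signature
 *   cpuid(0x40000002) -> eax = number of hypercall pages (1),
 *                        ebx = base MSR index (0x40000000)
 *   wrmsr(0x40000000, guest physical address of a page) -> handled by
 *   wrmsr_hypervisor_regs() above, which maps the frame and fills it via
 *   hypercall_page_initialise().
 */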
548 static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
549 {
550 char sig[5], instr[2];
551 uint32_t a, b, c, d;
552 unsigned long eip, rc;
554 a = regs->eax;
555 b = regs->ebx;
556 c = regs->ecx;
557 d = regs->edx;
558 eip = regs->eip;
560 /* Check for forced emulation signature: ud2 ; .ascii "xen". */
561 if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
562 {
563 propagate_page_fault(eip + sizeof(sig) - rc, 0);
564 return EXCRET_fault_fixed;
565 }
566 if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) )
567 return 0;
568 eip += sizeof(sig);
570 /* We only emulate CPUID. */
571 if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
572 {
573 propagate_page_fault(eip + sizeof(instr) - rc, 0);
574 return EXCRET_fault_fixed;
575 }
576 if ( memcmp(instr, "\xf\xa2", sizeof(instr)) )
577 return 0;
578 eip += sizeof(instr);
580 __asm__ (
581 "cpuid"
582 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
583 : "0" (a), "1" (b), "2" (c), "3" (d) );
585 if ( regs->eax == 1 )
586 {
587 /* Modify Feature Information. */
588 clear_bit(X86_FEATURE_VME, &d);
589 clear_bit(X86_FEATURE_DE, &d);
590 clear_bit(X86_FEATURE_PSE, &d);
591 clear_bit(X86_FEATURE_PGE, &d);
592 if ( !supervisor_mode_kernel )
593 clear_bit(X86_FEATURE_SEP, &d);
594 if ( !IS_PRIV(current->domain) )
595 clear_bit(X86_FEATURE_MTRR, &d);
596 }
597 else if ( regs->eax == 0x80000001 )
598 {
599 /* Modify Feature Information. */
600 if ( is_pv_32bit_vcpu(current) )
601 clear_bit(X86_FEATURE_SYSCALL % 32, &d);
602 clear_bit(X86_FEATURE_RDTSCP % 32, &d);
603 }
604 else
605 {
606 (void)cpuid_hypervisor_leaves(regs->eax, &a, &b, &c, &d);
607 }
609 regs->eax = a;
610 regs->ebx = b;
611 regs->ecx = c;
612 regs->edx = d;
613 regs->eip = eip;
615 return EXCRET_fault_fixed;
616 }
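/*
 * Illustrative guest-side sequence that reaches the emulation above (sketch):
 *     ud2a ; .ascii "xen" ; cpuid
 * i.e. bytes 0f 0b 78 65 6e followed by 0f a2.  The ud2a faults into
 * do_invalid_op() below, the "xen" signature is matched, and the guest gets
 * the sanitised CPUID results.
 */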
618 asmlinkage int do_invalid_op(struct cpu_user_regs *regs)
619 {
620 struct bug_frame bug;
621 struct bug_frame_str bug_str;
622 char *filename, *predicate, *eip = (char *)regs->eip;
623 int rc, id, lineno;
625 DEBUGGER_trap_entry(TRAP_invalid_op, regs);
627 if ( likely(guest_mode(regs)) )
628 {
629 if ( (rc = emulate_forced_invalid_op(regs)) != 0 )
630 return rc;
631 return do_guest_trap(TRAP_invalid_op, regs, 0);
632 }
634 if ( !is_kernel(eip) ||
635 __copy_from_user(&bug, eip, sizeof(bug)) ||
636 memcmp(bug.ud2, "\xf\xb", sizeof(bug.ud2)) ||
637 (bug.ret != 0xc2) )
638 goto die;
639 eip += sizeof(bug);
641 id = bug.id & 3;
643 if ( id == BUGFRAME_dump )
644 {
645 show_execution_state(regs);
646 regs->eip = (unsigned long)eip;
647 return EXCRET_fault_fixed;
648 }
650 /* WARN, BUG or ASSERT: decode the filename pointer and line number. */
651 if ( !is_kernel(eip) ||
652 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
653 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
654 goto die;
655 eip += sizeof(bug_str);
657 filename = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
658 lineno = bug.id >> 2;
660 if ( id == BUGFRAME_warn )
661 {
662 printk("Xen WARN at %.50s:%d\n", filename, lineno);
663 show_execution_state(regs);
664 regs->eip = (unsigned long)eip;
665 return EXCRET_fault_fixed;
666 }
668 if ( id == BUGFRAME_bug )
669 {
670 printk("Xen BUG at %.50s:%d\n", filename, lineno);
671 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
672 show_execution_state(regs);
673 panic("Xen BUG at %.50s:%d\n", filename, lineno);
674 }
676 /* ASSERT: decode the predicate string pointer. */
677 ASSERT(id == BUGFRAME_assert);
678 if ( !is_kernel(eip) ||
679 __copy_from_user(&bug_str, eip, sizeof(bug_str)) ||
680 memcmp(bug_str.mov, BUG_MOV_STR, sizeof(bug_str.mov)) )
681 goto die;
682 eip += sizeof(bug_str);
684 predicate = is_kernel(bug_str.str) ? (char *)bug_str.str : "<unknown>";
685 printk("Assertion '%s' failed at %.50s:%d\n",
686 predicate, filename, lineno);
687 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
688 show_execution_state(regs);
689 panic("Assertion '%s' failed at %.50s:%d\n",
690 predicate, filename, lineno);
692 die:
693 DEBUGGER_trap_fatal(TRAP_invalid_op, regs);
694 show_execution_state(regs);
695 panic("FATAL TRAP: vector = %d (invalid opcode)\n", TRAP_invalid_op);
696 return 0;
697 }
699 asmlinkage int do_int3(struct cpu_user_regs *regs)
700 {
701 DEBUGGER_trap_entry(TRAP_int3, regs);
703 if ( !guest_mode(regs) )
704 {
705 DEBUGGER_trap_fatal(TRAP_int3, regs);
706 show_execution_state(regs);
707 panic("FATAL TRAP: vector = 3 (Int3)\n");
708 }
710 return do_guest_trap(TRAP_int3, regs, 0);
711 }
713 asmlinkage int do_machine_check(struct cpu_user_regs *regs)
714 {
715 fatal_trap(TRAP_machine_check, regs);
716 return 0;
717 }
719 void propagate_page_fault(unsigned long addr, u16 error_code)
720 {
721 struct trap_info *ti;
722 struct vcpu *v = current;
723 struct trap_bounce *tb = &v->arch.trap_bounce;
725 v->arch.guest_context.ctrlreg[2] = addr;
726 arch_set_cr2(v, addr);
728 /* Re-set error_code.user flag appropriately for the guest. */
729 error_code &= ~PFEC_user_mode;
730 if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
731 error_code |= PFEC_user_mode;
733 ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
734 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
735 tb->error_code = error_code;
736 tb->cs = ti->cs;
737 tb->eip = ti->address;
738 if ( TI_GET_IF(ti) )
739 tb->flags |= TBF_INTERRUPT;
740 if ( unlikely(null_trap_bounce(v, tb)) )
741 {
742 printk("Unhandled page fault in domain %d on VCPU %d (ec=%04X)\n",
743 v->domain->domain_id, v->vcpu_id, error_code);
744 show_page_walk(addr);
745 }
746 }
748 static int handle_gdt_ldt_mapping_fault(
749 unsigned long offset, struct cpu_user_regs *regs)
750 {
751 /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
752 unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
753 unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
755 /* Should never fault in another vcpu's area. */
756 BUG_ON(vcpu_area != current->vcpu_id);
758 /* Byte offset within the gdt/ldt sub-area. */
759 offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
761 if ( likely(is_ldt_area) )
762 {
763 /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
764 if ( unlikely(map_ldt_shadow_page(offset >> PAGE_SHIFT) == 0) )
765 {
766 /* In hypervisor mode? Leave it to the #PF handler to fix up. */
767 if ( !guest_mode(regs) )
768 return 0;
769 /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */
770 propagate_page_fault(
771 current->arch.guest_context.ldt_base + offset,
772 regs->error_code);
773 }
774 }
775 else
776 {
777 /* GDT fault: handle the fault as #GP(selector). */
778 regs->error_code = (u16)offset & ~7;
779 (void)do_general_protection(regs);
780 }
782 return EXCRET_fault_fixed;
783 }
785 #ifdef HYPERVISOR_VIRT_END
786 #define IN_HYPERVISOR_RANGE(va) \
787 (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
788 #else
789 #define IN_HYPERVISOR_RANGE(va) \
790 (((va) >= HYPERVISOR_VIRT_START))
791 #endif
793 static int __spurious_page_fault(
794 unsigned long addr, struct cpu_user_regs *regs)
795 {
796 unsigned long mfn, cr3 = read_cr3();
797 #if CONFIG_PAGING_LEVELS >= 4
798 l4_pgentry_t l4e, *l4t;
799 #endif
800 #if CONFIG_PAGING_LEVELS >= 3
801 l3_pgentry_t l3e, *l3t;
802 #endif
803 l2_pgentry_t l2e, *l2t;
804 l1_pgentry_t l1e, *l1t;
805 unsigned int required_flags, disallowed_flags;
807 /* Reserved bit violations are never spurious faults. */
808 if ( regs->error_code & PFEC_reserved_bit )
809 return 0;
811 required_flags = _PAGE_PRESENT;
812 if ( regs->error_code & PFEC_write_access )
813 required_flags |= _PAGE_RW;
814 if ( regs->error_code & PFEC_user_mode )
815 required_flags |= _PAGE_USER;
817 disallowed_flags = 0;
818 if ( regs->error_code & PFEC_insn_fetch )
819 disallowed_flags |= _PAGE_NX;
821 mfn = cr3 >> PAGE_SHIFT;
823 #if CONFIG_PAGING_LEVELS >= 4
824 l4t = map_domain_page(mfn);
825 l4e = l4e_read_atomic(&l4t[l4_table_offset(addr)]);
826 mfn = l4e_get_pfn(l4e);
827 unmap_domain_page(l4t);
828 if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
829 (l4e_get_flags(l4e) & disallowed_flags) )
830 return 0;
831 #endif
833 #if CONFIG_PAGING_LEVELS >= 3
834 l3t = map_domain_page(mfn);
835 #ifdef CONFIG_X86_PAE
836 l3t += (cr3 & 0xFE0UL) >> 3;
837 #endif
838 l3e = l3e_read_atomic(&l3t[l3_table_offset(addr)]);
839 mfn = l3e_get_pfn(l3e);
840 unmap_domain_page(l3t);
841 #ifdef CONFIG_X86_PAE
842 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
843 return 0;
844 #else
845 if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
846 (l3e_get_flags(l3e) & disallowed_flags) )
847 return 0;
848 #endif
849 #endif
851 l2t = map_domain_page(mfn);
852 l2e = l2e_read_atomic(&l2t[l2_table_offset(addr)]);
853 mfn = l2e_get_pfn(l2e);
854 unmap_domain_page(l2t);
855 if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
856 (l2e_get_flags(l2e) & disallowed_flags) )
857 return 0;
858 if ( l2e_get_flags(l2e) & _PAGE_PSE )
859 {
860 l1e = l1e_empty(); /* define before use in debug tracing */
861 goto spurious;
862 }
864 l1t = map_domain_page(mfn);
865 l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
866 mfn = l1e_get_pfn(l1e);
867 unmap_domain_page(l1t);
868 if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
869 (l1e_get_flags(l1e) & disallowed_flags) )
870 return 0;
872 spurious:
873 dprintk(XENLOG_WARNING, "Spurious fault in domain %u:%u "
874 "at addr %lx, e/c %04x\n",
875 current->domain->domain_id, current->vcpu_id,
876 addr, regs->error_code);
877 #if CONFIG_PAGING_LEVELS >= 4
878 dprintk(XENLOG_WARNING, " l4e = %"PRIpte"\n", l4e_get_intpte(l4e));
879 #endif
880 #if CONFIG_PAGING_LEVELS >= 3
881 dprintk(XENLOG_WARNING, " l3e = %"PRIpte"\n", l3e_get_intpte(l3e));
882 #endif
883 dprintk(XENLOG_WARNING, " l2e = %"PRIpte"\n", l2e_get_intpte(l2e));
884 dprintk(XENLOG_WARNING, " l1e = %"PRIpte"\n", l1e_get_intpte(l1e));
885 #ifndef NDEBUG
886 show_registers(regs);
887 #endif
888 return 1;
889 }
891 static int spurious_page_fault(
892 unsigned long addr, struct cpu_user_regs *regs)
893 {
894 unsigned long flags;
895 int is_spurious;
897 /*
898 * Disabling interrupts prevents TLB flushing, and hence prevents
899 * page tables from becoming invalid under our feet during the walk.
900 */
901 local_irq_save(flags);
902 is_spurious = __spurious_page_fault(addr, regs);
903 local_irq_restore(flags);
905 return is_spurious;
906 }
908 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
909 {
910 struct vcpu *v = current;
911 struct domain *d = v->domain;
913 /* No fixups in interrupt context or when interrupts are disabled. */
914 if ( in_irq() || !(regs->eflags & X86_EFLAGS_IF) )
915 return 0;
917 if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
918 {
919 if ( paging_mode_external(d) && guest_mode(regs) )
920 return paging_fault(addr, regs);
921 if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
922 return handle_gdt_ldt_mapping_fault(
923 addr - GDT_LDT_VIRT_START, regs);
924 return 0;
925 }
927 if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
928 guest_kernel_mode(v, regs) &&
929 /* Do not check if access-protection fault since the page may
930 legitimately be not present in shadow page tables */
931 ((regs->error_code & PFEC_write_access) == PFEC_write_access) &&
932 ptwr_do_page_fault(v, addr, regs) )
933 return EXCRET_fault_fixed;
935 if ( paging_mode_enabled(d) )
936 return paging_fault(addr, regs);
938 return 0;
939 }
941 /*
942 * #PF error code:
943 * Bit 0: Protection violation (=1) ; Page not present (=0)
944 * Bit 1: Write access
945 * Bit 2: User mode (=1) ; Supervisor mode (=0)
946 * Bit 3: Reserved bit violation
947 * Bit 4: Instruction fetch
948 */
949 asmlinkage int do_page_fault(struct cpu_user_regs *regs)
950 {
951 unsigned long addr, fixup;
952 int rc;
954 addr = read_cr2();
956 DEBUGGER_trap_entry(TRAP_page_fault, regs);
958 perfc_incr(page_faults);
960 if ( unlikely((rc = fixup_page_fault(addr, regs)) != 0) )
961 return rc;
963 if ( unlikely(!guest_mode(regs)) )
964 {
965 if ( spurious_page_fault(addr, regs) )
966 return EXCRET_not_a_fault;
968 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
969 {
970 perfc_incr(copy_user_faults);
971 regs->eip = fixup;
972 return 0;
973 }
975 DEBUGGER_trap_fatal(TRAP_page_fault, regs);
977 show_execution_state(regs);
978 show_page_walk(addr);
979 panic("FATAL PAGE FAULT\n"
980 "[error_code=%04x]\n"
981 "Faulting linear address: %p\n",
982 regs->error_code, _p(addr));
983 }
985 propagate_page_fault(addr, regs->error_code);
986 return 0;
987 }
989 /*
990 * Early handler to deal with spurious page faults. For example, consider a
991 * routine that uses a mapping immediately after installing it (making it
992 * present). The CPU may speculatively execute the memory access before
993 * executing the PTE write. The instruction will then be marked to cause a
994 * page fault when it is retired, despite the fact that the PTE is present and
995 * correct at that point in time.
996 */
997 asmlinkage int do_early_page_fault(struct cpu_user_regs *regs)
998 {
999 static int stuck;
1000 static unsigned long prev_eip, prev_cr2;
1001 unsigned long cr2 = read_cr2();
1003 BUG_ON(smp_processor_id() != 0);
1005 if ( (regs->eip != prev_eip) || (cr2 != prev_cr2) )
1006 {
1007 prev_eip = regs->eip;
1008 prev_cr2 = cr2;
1009 stuck = 0;
1010 return EXCRET_not_a_fault;
1011 }
1013 if ( stuck++ == 1000 )
1014 panic("Early fatal page fault at %04x:%p (cr2=%p, ec=%04x)\n",
1015 regs->cs, _p(regs->eip), _p(cr2), regs->error_code);
1017 return EXCRET_not_a_fault;
1018 }
1020 long do_fpu_taskswitch(int set)
1021 {
1022 struct vcpu *v = current;
1024 if ( set )
1025 {
1026 v->arch.guest_context.ctrlreg[0] |= X86_CR0_TS;
1027 stts();
1028 }
1029 else
1030 {
1031 v->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
1032 if ( v->fpu_dirtied )
1033 clts();
1034 }
1036 return 0;
1037 }
1039 static int read_descriptor(unsigned int sel,
1040 const struct vcpu *v,
1041 const struct cpu_user_regs * regs,
1042 unsigned long *base,
1043 unsigned long *limit,
1044 unsigned int *ar,
1045 unsigned int vm86attr)
1046 {
1047 struct desc_struct desc;
1049 if ( !vm86_mode(regs) )
1050 {
1051 if ( sel < 4)
1052 desc.b = desc.a = 0;
1053 else if ( __get_user(desc,
1054 (const struct desc_struct *)(!(sel & 4)
1055 ? GDT_VIRT_START(v)
1056 : LDT_VIRT_START(v))
1057 + (sel >> 3)) )
1058 return 0;
1059 if ( !(vm86attr & _SEGMENT_CODE) )
1060 desc.b &= ~_SEGMENT_L;
1061 }
1062 else
1063 {
1064 desc.a = (sel << 20) | 0xffff;
1065 desc.b = vm86attr | (sel >> 12);
1066 }
1068 *ar = desc.b & 0x00f0ff00;
1069 if ( !(desc.b & _SEGMENT_L) )
1070 {
1071 *base = (desc.a >> 16) + ((desc.b & 0xff) << 16) + (desc.b & 0xff000000);
1072 *limit = (desc.a & 0xffff) | (desc.b & 0x000f0000);
1073 if ( desc.b & _SEGMENT_G )
1074 *limit = ((*limit + 1) << 12) - 1;
1075 #ifndef NDEBUG
1076 if ( !vm86_mode(regs) && sel > 3 )
1077 {
1078 unsigned int a, l;
1079 unsigned char valid;
1081 __asm__("larl %2, %0\n\tsetz %1" : "=r" (a), "=rm" (valid) : "rm" (sel));
1082 BUG_ON(valid && (a & 0x00f0ff00) != *ar);
1083 __asm__("lsll %2, %0\n\tsetz %1" : "=r" (l), "=rm" (valid) : "rm" (sel));
1084 BUG_ON(valid && l != *limit);
1085 }
1086 #endif
1087 }
1088 else
1089 {
1090 *base = 0UL;
1091 *limit = ~0UL;
1092 }
1094 return 1;
1095 }
1097 /* Has the guest requested sufficient permission for this I/O access? */
1098 static inline int guest_io_okay(
1099 unsigned int port, unsigned int bytes,
1100 struct vcpu *v, struct cpu_user_regs *regs)
1101 {
1102 #if defined(__x86_64__)
1103 /* If in user mode, switch to kernel mode just to read I/O bitmap. */
1104 int user_mode = !(v->arch.flags & TF_kernel_mode);
1105 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
1106 #elif defined(__i386__)
1107 #define TOGGLE_MODE() ((void)0)
1108 #endif
1110 if ( !vm86_mode(regs) &&
1111 (v->arch.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
1112 return 1;
1114 if ( v->arch.iobmp_limit > (port + bytes) )
1115 {
1116 union { uint8_t bytes[2]; uint16_t mask; } x;
1118 /*
1119 * Grab permission bytes from guest space. Inaccessible bytes are
1120 * read as 0xff (no access allowed).
1121 */
1122 TOGGLE_MODE();
1123 switch ( __copy_from_guest_offset(&x.bytes[0], v->arch.iobmp,
1124 port>>3, 2) )
1125 {
1126 default: x.bytes[0] = ~0;
1127 case 1: x.bytes[1] = ~0;
1128 case 0: break;
1129 }
1130 TOGGLE_MODE();
1132 if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 )
1133 return 1;
1134 }
1136 return 0;
1137 }
1139 /* Has the administrator granted sufficient permission for this I/O access? */
1140 static inline int admin_io_okay(
1141 unsigned int port, unsigned int bytes,
1142 struct vcpu *v, struct cpu_user_regs *regs)
1143 {
1144 return ioports_access_permitted(v->domain, port, port + bytes - 1);
1145 }
1147 #define guest_inb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r)
1148 #define guest_inw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r)
1149 #define guest_inl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r)
1150 #define guest_outb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r)
1151 #define guest_outw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r)
1152 #define guest_outl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r)
1154 /* I/O emulation support. Helper routines for, and type of, the stack stub.*/
1155 void host_to_guest_gpr_switch(struct cpu_user_regs *)
1156 __attribute__((__regparm__(1)));
1157 unsigned long guest_to_host_gpr_switch(unsigned long)
1158 __attribute__((__regparm__(1)));
1160 /* Instruction fetch with error handling. */
1161 #define insn_fetch(type, base, eip, limit) \
1162 ({ unsigned long _rc, _ptr = (base) + (eip); \
1163 type _x; \
1164 if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) ) \
1165 goto fail; \
1166 if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 ) \
1167 { \
1168 propagate_page_fault(_ptr + sizeof(_x) - _rc, 0); \
1169 return EXCRET_fault_fixed; \
1170 } \
1171 (eip) += sizeof(_x); _x; })
1173 #if defined(CONFIG_X86_32)
1174 # define read_sreg(regs, sr) ((regs)->sr)
1175 #elif defined(CONFIG_X86_64)
1176 # define read_sreg(regs, sr) read_segment_register(sr)
1177 #endif
1179 static int emulate_privileged_op(struct cpu_user_regs *regs)
1181 struct vcpu *v = current;
1182 unsigned long *reg, eip = regs->eip, res;
1183 u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
1184 enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
1185 unsigned int port, i, data_sel, ar, data, rc;
1186 unsigned int op_bytes, op_default, ad_bytes, ad_default;
1187 #define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
1188 ? regs->reg \
1189 : ad_bytes == 4 \
1190 ? (u32)regs->reg \
1191 : (u16)regs->reg)
1192 #define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
1193 ? regs->reg = (val) \
1194 : ad_bytes == 4 \
1195 ? (*(u32 *)&regs->reg = (val)) \
1196 : (*(u16 *)&regs->reg = (val)))
1197 unsigned long code_base, code_limit;
1198 char io_emul_stub[16];
1199 void (*io_emul)(struct cpu_user_regs *) __attribute__((__regparm__(1)));
1200 u32 l, h;
1202 if ( !read_descriptor(regs->cs, v, regs,
1203 &code_base, &code_limit, &ar,
1204 _SEGMENT_CODE|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1205 goto fail;
1206 op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
1207 ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
1208 if ( !(ar & _SEGMENT_S) ||
1209 !(ar & _SEGMENT_P) ||
1210 !(ar & _SEGMENT_CODE) )
1211 goto fail;
1213 /* emulating only opcodes not allowing SS to be default */
1214 data_sel = read_sreg(regs, ds);
1216 /* Legacy prefixes. */
1217 for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
1219 switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
1221 case 0x66: /* operand-size override */
1222 op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
1223 continue;
1224 case 0x67: /* address-size override */
1225 ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
1226 continue;
1227 case 0x2e: /* CS override */
1228 data_sel = regs->cs;
1229 continue;
1230 case 0x3e: /* DS override */
1231 data_sel = read_sreg(regs, ds);
1232 continue;
1233 case 0x26: /* ES override */
1234 data_sel = read_sreg(regs, es);
1235 continue;
1236 case 0x64: /* FS override */
1237 data_sel = read_sreg(regs, fs);
1238 lm_ovr = lm_seg_fs;
1239 continue;
1240 case 0x65: /* GS override */
1241 data_sel = read_sreg(regs, gs);
1242 lm_ovr = lm_seg_gs;
1243 continue;
1244 case 0x36: /* SS override */
1245 data_sel = regs->ss;
1246 continue;
1247 case 0xf0: /* LOCK */
1248 lock = 1;
1249 continue;
1250 case 0xf2: /* REPNE/REPNZ */
1251 case 0xf3: /* REP/REPE/REPZ */
1252 rep_prefix = 1;
1253 continue;
1254 default:
1255 if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
1257 rex = opcode;
1258 continue;
1260 break;
1262 break;
1265 /* REX prefix. */
1266 if ( rex & 8 ) /* REX.W */
1267 op_bytes = 4; /* emulating only opcodes not supporting 64-bit operands */
1268 modrm_reg = (rex & 4) << 1; /* REX.R */
1269 /* REX.X does not need to be decoded. */
1270 modrm_rm = (rex & 1) << 3; /* REX.B */
1272 if ( opcode == 0x0f )
1273 goto twobyte_opcode;
1275 if ( lock )
1276 goto fail;
1278 /* Input/Output String instructions. */
1279 if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
1281 unsigned long data_base, data_limit;
1283 if ( rep_prefix && (rd_ad(ecx) == 0) )
1284 goto done;
1286 if ( !(opcode & 2) )
1288 data_sel = read_sreg(regs, es);
1289 lm_ovr = lm_seg_none;
1292 if ( !(ar & _SEGMENT_L) )
1294 if ( !read_descriptor(data_sel, v, regs,
1295 &data_base, &data_limit, &ar,
1296 _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) )
1297 goto fail;
1298 if ( !(ar & _SEGMENT_S) ||
1299 !(ar & _SEGMENT_P) ||
1300 (opcode & 2 ?
1301 (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
1302 (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
1303 goto fail;
1305 #ifdef CONFIG_X86_64
1306 else
1308 if ( lm_ovr == lm_seg_none || data_sel < 4 )
1310 switch ( lm_ovr )
1312 case lm_seg_none:
1313 data_base = 0UL;
1314 break;
1315 case lm_seg_fs:
1316 data_base = v->arch.guest_context.fs_base;
1317 break;
1318 case lm_seg_gs:
1319 if ( guest_kernel_mode(v, regs) )
1320 data_base = v->arch.guest_context.gs_base_kernel;
1321 else
1322 data_base = v->arch.guest_context.gs_base_user;
1323 break;
1326 else
1327 read_descriptor(data_sel, v, regs,
1328 &data_base, &data_limit, &ar,
1329 0);
1330 data_limit = ~0UL;
1331 ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
1333 #endif
1335 continue_io_string:
1336 switch ( opcode )
1338 case 0x6c: /* INSB */
1339 op_bytes = 1;
1340 case 0x6d: /* INSW/INSL */
1341 if ( data_limit < op_bytes - 1 ||
1342 rd_ad(edi) > data_limit - (op_bytes - 1) ||
1343 !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
1344 goto fail;
1345 port = (u16)regs->edx;
1346 switch ( op_bytes )
1348 case 1:
1349 /* emulate PIT counter 2 */
1350 data = (u8)(guest_inb_okay(port, v, regs) ? inb(port) :
1351 ((port == 0x42 || port == 0x43 || port == 0x61) ?
1352 pv_pit_handler(port, 0, 0) : ~0));
1353 break;
1354 case 2:
1355 data = (u16)(guest_inw_okay(port, v, regs) ? inw(port) : ~0);
1356 break;
1357 case 4:
1358 data = (u32)(guest_inl_okay(port, v, regs) ? inl(port) : ~0);
1359 break;
1361 if ( (rc = copy_to_user((void *)data_base + rd_ad(edi), &data, op_bytes)) != 0 )
1363 propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc,
1364 PFEC_write_access);
1365 return EXCRET_fault_fixed;
1367 wr_ad(edi, regs->edi + (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes));
1368 break;
1370 case 0x6e: /* OUTSB */
1371 op_bytes = 1;
1372 case 0x6f: /* OUTSW/OUTSL */
1373 if ( data_limit < op_bytes - 1 ||
1374 rd_ad(esi) > data_limit - (op_bytes - 1) ||
1375 !guest_io_okay((u16)regs->edx, op_bytes, v, regs) )
1376 goto fail;
1377 rc = copy_from_user(&data, (void *)data_base + rd_ad(esi), op_bytes);
1378 if ( rc != 0 )
1380 propagate_page_fault(data_base + rd_ad(esi) + op_bytes - rc, 0);
1381 return EXCRET_fault_fixed;
1383 port = (u16)regs->edx;
1384 switch ( op_bytes )
1386 case 1:
1387 if ( guest_outb_okay(port, v, regs) )
1388 outb((u8)data, port);
1389 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1390 pv_pit_handler(port, data, 1);
1391 break;
1392 case 2:
1393 if ( guest_outw_okay(port, v, regs) )
1394 outw((u16)data, port);
1395 break;
1396 case 4:
1397 if ( guest_outl_okay(port, v, regs) )
1398 outl((u32)data, port);
1399 break;
1401 wr_ad(esi, regs->esi + (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes));
1402 break;
1405 if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
1407 if ( !hypercall_preempt_check() )
1408 goto continue_io_string;
1409 eip = regs->eip;
1412 goto done;
1415 /*
1416 * Very likely to be an I/O instruction (IN/OUT).
1417 * Build an on-stack stub to execute the instruction with full guest
1418 * GPR context. This is needed for some systems which (ab)use IN/OUT
1419 * to communicate with BIOS code in system-management mode.
1420 */
1421 #ifdef __x86_64__
1422 /* movq $host_to_guest_gpr_switch,%rcx */
1423 io_emul_stub[0] = 0x48;
1424 io_emul_stub[1] = 0xb9;
1425 *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
1426 /* callq *%rcx */
1427 io_emul_stub[10] = 0xff;
1428 io_emul_stub[11] = 0xd1;
1429 #else
1430 /* call host_to_guest_gpr_switch */
1431 io_emul_stub[0] = 0xe8;
1432 *(s32 *)&io_emul_stub[1] =
1433 (char *)host_to_guest_gpr_switch - &io_emul_stub[5];
1434 /* 7 x nop */
1435 memset(&io_emul_stub[5], 0x90, 7);
1436 #endif
1437 /* data16 or nop */
1438 io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66;
1439 /* <io-access opcode> */
1440 io_emul_stub[13] = opcode;
1441 /* imm8 or nop */
1442 io_emul_stub[14] = 0x90;
1443 /* ret (jumps to guest_to_host_gpr_switch) */
1444 io_emul_stub[15] = 0xc3;
1446 /* Handy function-typed pointer to the stub. */
1447 io_emul = (void *)io_emul_stub;
1449 /* I/O Port and Interrupt Flag instructions. */
1450 switch ( opcode )
1452 case 0xe4: /* IN imm8,%al */
1453 op_bytes = 1;
1454 case 0xe5: /* IN imm8,%eax */
1455 port = insn_fetch(u8, code_base, eip, code_limit);
1456 io_emul_stub[14] = port; /* imm8 */
1457 exec_in:
1458 if ( !guest_io_okay(port, op_bytes, v, regs) )
1459 goto fail;
1460 switch ( op_bytes )
1462 case 1:
1463 if ( guest_inb_okay(port, v, regs) )
1464 io_emul(regs);
1465 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1467 regs->eax &= ~0xffUL;
1468 regs->eax |= pv_pit_handler(port, 0, 0);
1470 else
1471 regs->eax |= (u8)~0;
1472 break;
1473 case 2:
1474 if ( guest_inw_okay(port, v, regs) )
1475 io_emul(regs);
1476 else
1477 regs->eax |= (u16)~0;
1478 break;
1479 case 4:
1480 if ( guest_inl_okay(port, v, regs) )
1481 io_emul(regs);
1482 else
1483 regs->eax = (u32)~0;
1484 break;
1486 goto done;
1488 case 0xec: /* IN %dx,%al */
1489 op_bytes = 1;
1490 case 0xed: /* IN %dx,%eax */
1491 port = (u16)regs->edx;
1492 goto exec_in;
1494 case 0xe6: /* OUT %al,imm8 */
1495 op_bytes = 1;
1496 case 0xe7: /* OUT %eax,imm8 */
1497 port = insn_fetch(u8, code_base, eip, code_limit);
1498 io_emul_stub[14] = port; /* imm8 */
1499 exec_out:
1500 if ( !guest_io_okay(port, op_bytes, v, regs) )
1501 goto fail;
1502 switch ( op_bytes )
1504 case 1:
1505 if ( guest_outb_okay(port, v, regs) )
1506 io_emul(regs);
1507 else if ( port == 0x42 || port == 0x43 || port == 0x61 )
1508 pv_pit_handler(port, regs->eax, 1);
1509 break;
1510 case 2:
1511 if ( guest_outw_okay(port, v, regs) )
1512 io_emul(regs);
1513 break;
1514 case 4:
1515 if ( guest_outl_okay(port, v, regs) )
1516 io_emul(regs);
1517 break;
1519 goto done;
1521 case 0xee: /* OUT %al,%dx */
1522 op_bytes = 1;
1523 case 0xef: /* OUT %eax,%dx */
1524 port = (u16)regs->edx;
1525 goto exec_out;
1527 case 0xfa: /* CLI */
1528 case 0xfb: /* STI */
1529 if ( v->arch.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) )
1530 goto fail;
1531 /*
1532 * This is just too dangerous to allow, in my opinion. Consider if the
1533 * caller then tries to reenable interrupts using POPF: we can't trap
1534 * that and we'll end up with hard-to-debug lockups. Fast & loose will
1535 * do for us. :-)
1536 */
1537 /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
1538 goto done;
1541 /* No decode of this single-byte opcode. */
1542 goto fail;
1544 twobyte_opcode:
1545 /* Two-byte opcodes only emulated from guest kernel. */
1546 if ( !guest_kernel_mode(v, regs) )
1547 goto fail;
1549 /* Privileged (ring 0) instructions. */
1550 opcode = insn_fetch(u8, code_base, eip, code_limit);
1551 if ( lock && (opcode & ~3) != 0x20 )
1552 goto fail;
1553 switch ( opcode )
1555 case 0x06: /* CLTS */
1556 (void)do_fpu_taskswitch(0);
1557 break;
1559 case 0x09: /* WBINVD */
1560 /* Ignore the instruction if unprivileged. */
1561 if ( !cache_flush_permitted(v->domain) )
1562 /* Non-physdev domain attempted WBINVD; ignore for now since
1563 newer linux uses this in some start-of-day timing loops */
1565 else
1566 wbinvd();
1567 break;
1569 case 0x20: /* MOV CR?,<reg> */
1570 opcode = insn_fetch(u8, code_base, eip, code_limit);
1571 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1572 modrm_rm |= (opcode >> 0) & 7;
1573 reg = decode_register(modrm_rm, regs, 0);
1574 switch ( modrm_reg )
1576 case 0: /* Read CR0 */
1577 *reg = (read_cr0() & ~X86_CR0_TS) |
1578 v->arch.guest_context.ctrlreg[0];
1579 break;
1581 case 2: /* Read CR2 */
1582 *reg = v->arch.guest_context.ctrlreg[2];
1583 break;
1585 case 3: /* Read CR3 */
1586 if ( !is_pv_32on64_vcpu(v) )
1587 *reg = xen_pfn_to_cr3(mfn_to_gmfn(
1588 v->domain, pagetable_get_pfn(v->arch.guest_table)));
1589 #ifdef CONFIG_COMPAT
1590 else
1591 *reg = compat_pfn_to_cr3(mfn_to_gmfn(
1592 v->domain, l4e_get_pfn(*(l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)))));
1593 #endif
1594 break;
1596 case 4: /* Read CR4 */
1597 /*
1598 * Guests can read CR4 to see what features Xen has enabled. We
1599 * therefore lie about PGE & PSE as they are unavailable to guests.
1600 */
1601 *reg = read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE);
1602 break;
1604 default:
1605 goto fail;
1607 break;
1609 case 0x21: /* MOV DR?,<reg> */
1610 opcode = insn_fetch(u8, code_base, eip, code_limit);
1611 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1612 modrm_rm |= (opcode >> 0) & 7;
1613 reg = decode_register(modrm_rm, regs, 0);
1614 if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 )
1615 goto fail;
1616 *reg = res;
1617 break;
1619 case 0x22: /* MOV <reg>,CR? */
1620 opcode = insn_fetch(u8, code_base, eip, code_limit);
1621 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1622 modrm_rm |= (opcode >> 0) & 7;
1623 reg = decode_register(modrm_rm, regs, 0);
1624 switch ( modrm_reg )
1626 case 0: /* Write CR0 */
1627 if ( (*reg ^ read_cr0()) & ~X86_CR0_TS )
1629 gdprintk(XENLOG_WARNING,
1630 "Attempt to change unmodifiable CR0 flags.\n");
1631 goto fail;
1633 (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS));
1634 break;
1636 case 2: /* Write CR2 */
1637 v->arch.guest_context.ctrlreg[2] = *reg;
1638 arch_set_cr2(v, *reg);
1639 break;
1641 case 3: /* Write CR3 */
1642 LOCK_BIGLOCK(v->domain);
1643 if ( !is_pv_32on64_vcpu(v) )
1644 rc = new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg)));
1645 #ifdef CONFIG_COMPAT
1646 else
1647 rc = new_guest_cr3(gmfn_to_mfn(v->domain, compat_cr3_to_pfn(*reg)));
1648 #endif
1649 UNLOCK_BIGLOCK(v->domain);
1650 if ( rc == 0 ) /* not okay */
1651 goto fail;
1652 break;
1654 case 4:
1655 if ( *reg != (read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE)) )
1657 gdprintk(XENLOG_WARNING, "Attempt to change CR4 flags.\n");
1658 goto fail;
1660 break;
1662 default:
1663 goto fail;
1665 break;
1667 case 0x23: /* MOV <reg>,DR? */
1668 opcode = insn_fetch(u8, code_base, eip, code_limit);
1669 modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
1670 modrm_rm |= (opcode >> 0) & 7;
1671 reg = decode_register(modrm_rm, regs, 0);
1672 if ( do_set_debugreg(modrm_reg, *reg) != 0 )
1673 goto fail;
1674 break;
1676 case 0x30: /* WRMSR */
1677 switch ( regs->ecx )
1679 #ifdef CONFIG_X86_64
1680 case MSR_FS_BASE:
1681 if ( is_pv_32on64_vcpu(v) )
1682 goto fail;
1683 if ( wrmsr_safe(MSR_FS_BASE, regs->eax, regs->edx) )
1684 goto fail;
1685 v->arch.guest_context.fs_base =
1686 ((u64)regs->edx << 32) | regs->eax;
1687 break;
1688 case MSR_GS_BASE:
1689 if ( is_pv_32on64_vcpu(v) )
1690 goto fail;
1691 if ( wrmsr_safe(MSR_GS_BASE, regs->eax, regs->edx) )
1692 goto fail;
1693 v->arch.guest_context.gs_base_kernel =
1694 ((u64)regs->edx << 32) | regs->eax;
1695 break;
1696 case MSR_SHADOW_GS_BASE:
1697 if ( is_pv_32on64_vcpu(v) )
1698 goto fail;
1699 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, regs->eax, regs->edx) )
1700 goto fail;
1701 v->arch.guest_context.gs_base_user =
1702 ((u64)regs->edx << 32) | regs->eax;
1703 break;
1704 #endif
1705 default:
1706 if ( wrmsr_hypervisor_regs(regs->ecx, regs->eax, regs->edx) )
1707 break;
1709 if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
1710 (regs->eax != l) || (regs->edx != h) )
1711 gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from "
1712 "%08x:%08x to %08lx:%08lx.\n",
1713 _p(regs->ecx), h, l, (long)regs->edx, (long)regs->eax);
1714 break;
1716 break;
1718 case 0x32: /* RDMSR */
1719 switch ( regs->ecx )
1721 #ifdef CONFIG_X86_64
1722 case MSR_FS_BASE:
1723 if ( is_pv_32on64_vcpu(v) )
1724 goto fail;
1725 regs->eax = v->arch.guest_context.fs_base & 0xFFFFFFFFUL;
1726 regs->edx = v->arch.guest_context.fs_base >> 32;
1727 break;
1728 case MSR_GS_BASE:
1729 if ( is_pv_32on64_vcpu(v) )
1730 goto fail;
1731 regs->eax = v->arch.guest_context.gs_base_kernel & 0xFFFFFFFFUL;
1732 regs->edx = v->arch.guest_context.gs_base_kernel >> 32;
1733 break;
1734 case MSR_SHADOW_GS_BASE:
1735 if ( is_pv_32on64_vcpu(v) )
1736 goto fail;
1737 regs->eax = v->arch.guest_context.gs_base_user & 0xFFFFFFFFUL;
1738 regs->edx = v->arch.guest_context.gs_base_user >> 32;
1739 break;
1740 #endif
1741 case MSR_EFER:
1742 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1743 goto fail;
1744 break;
1745 default:
1746 if ( rdmsr_hypervisor_regs(regs->ecx, &l, &h) )
1748 regs->eax = l;
1749 regs->edx = h;
1750 break;
1752 /* Everyone can read the MSR space. */
1753 /* gdprintk(XENLOG_WARNING,"Domain attempted RDMSR %p.\n",
1754 _p(regs->ecx));*/
1755 if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
1756 goto fail;
1757 break;
1759 break;
1761 default:
1762 goto fail;
1765 #undef wr_ad
1766 #undef rd_ad
1768 done:
1769 regs->eip = eip;
1770 return EXCRET_fault_fixed;
1772 fail:
1773 return 0;
1776 asmlinkage int do_general_protection(struct cpu_user_regs *regs)
1778 struct vcpu *v = current;
1779 unsigned long fixup;
1781 DEBUGGER_trap_entry(TRAP_gp_fault, regs);
1783 if ( regs->error_code & 1 )
1784 goto hardware_gp;
1786 if ( !guest_mode(regs) )
1787 goto gp_in_kernel;
1789 /*
1790 * Cunning trick to allow arbitrary "INT n" handling.
1792 * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
1793 * instruction from trapping to the appropriate vector, when that might not
1794 * be expected by Xen or the guest OS. For example, that entry might be for
1795 * a fault handler (unlike traps, faults don't increment EIP), or might
1796 * expect an error code on the stack (which a software trap never
1797 * provides), or might be a hardware interrupt handler that doesn't like
1798 * being called spuriously.
1800 * Instead, a GPF occurs with the faulting IDT vector in the error code.
1801 * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
1802 * clear to indicate that it's a software fault, not hardware.
1804 * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
1805 * okay because they can only be triggered by an explicit DPL-checked
1806 * instruction. The DPL specified by the guest OS for these vectors is NOT
1807 * CHECKED!!
1808 */
1809 if ( (regs->error_code & 3) == 2 )
1811 /* This fault must be due to <INT n> instruction. */
1812 const struct trap_info *ti;
1813 unsigned char vector = regs->error_code >> 3;
1814 ti = &v->arch.guest_context.trap_ctxt[vector];
1815 if ( permit_softint(TI_GET_DPL(ti), v, regs) )
1817 regs->eip += 2;
1818 return do_guest_trap(vector, regs, 0);
1822 /* Emulate some simple privileged and I/O instructions. */
1823 if ( (regs->error_code == 0) &&
1824 emulate_privileged_op(regs) )
1825 return 0;
1827 #if defined(__i386__)
1828 if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
1829 (regs->error_code == 0) &&
1830 gpf_emulate_4gb(regs) )
1831 return 0;
1832 #endif
1834 /* Pass on GPF as is. */
1835 return do_guest_trap(TRAP_gp_fault, regs, 1);
1837 gp_in_kernel:
1839 if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
1841 dprintk(XENLOG_INFO, "GPF (%04x): %p -> %p\n",
1842 regs->error_code, _p(regs->eip), _p(fixup));
1843 regs->eip = fixup;
1844 return 0;
1847 DEBUGGER_trap_fatal(TRAP_gp_fault, regs);
1849 hardware_gp:
1850 show_execution_state(regs);
1851 panic("GENERAL PROTECTION FAULT\n[error_code=%04x]\n", regs->error_code);
1852 return 0;
1855 static void nmi_softirq(void)
1856 {
1857 /* Only used to defer wakeup of dom0,vcpu0 to a safe (non-NMI) context. */
1858 vcpu_kick(dom0->vcpu[0]);
1859 }
1861 static void nmi_dom0_report(unsigned int reason_idx)
1863 struct domain *d;
1864 struct vcpu *v;
1866 if ( ((d = dom0) == NULL) || ((v = d->vcpu[0]) == NULL) )
1867 return;
1869 set_bit(reason_idx, nmi_reason(d));
1871 if ( !xchg(&v->nmi_pending, 1) )
1872 raise_softirq(NMI_SOFTIRQ); /* not safe to wake up a vcpu here */
1875 asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
1877 switch ( opt_nmi[0] )
1879 case 'd': /* 'dom0' */
1880 nmi_dom0_report(_XEN_NMIREASON_parity_error);
1881 case 'i': /* 'ignore' */
1882 break;
1883 default: /* 'fatal' */
1884 console_force_unlock();
1885 printk("\n\nNMI - MEMORY ERROR\n");
1886 fatal_trap(TRAP_nmi, regs);
1889 outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable parity check */
1890 mdelay(1);
1891 outb((inb(0x61) & 0x0b) | 0x00, 0x61); /* enable parity check */
1894 asmlinkage void io_check_error(struct cpu_user_regs *regs)
1896 switch ( opt_nmi[0] )
1898 case 'd': /* 'dom0' */
1899 nmi_dom0_report(_XEN_NMIREASON_io_error);
1900 case 'i': /* 'ignore' */
1901 break;
1902 default: /* 'fatal' */
1903 console_force_unlock();
1904 printk("\n\nNMI - I/O ERROR\n");
1905 fatal_trap(TRAP_nmi, regs);
1908 outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
1909 mdelay(1);
1910 outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
1913 static void unknown_nmi_error(unsigned char reason)
1915 switch ( opt_nmi[0] )
1917 case 'd': /* 'dom0' */
1918 nmi_dom0_report(_XEN_NMIREASON_unknown);
1919 case 'i': /* 'ignore' */
1920 break;
1921 default: /* 'fatal' */
1922 printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
1923 printk("Dazed and confused, but trying to continue\n");
1924 printk("Do you have a strange power saving mode enabled?\n");
1925 kexec_crash();
1929 static int dummy_nmi_callback(struct cpu_user_regs *regs, int cpu)
1930 {
1931 return 0;
1932 }
1934 static nmi_callback_t nmi_callback = dummy_nmi_callback;
1936 asmlinkage void do_nmi(struct cpu_user_regs *regs)
1937 {
1938 unsigned int cpu = smp_processor_id();
1939 unsigned char reason;
1941 ++nmi_count(cpu);
1943 if ( nmi_callback(regs, cpu) )
1944 return;
1946 if ( nmi_watchdog )
1947 nmi_watchdog_tick(regs);
1949 /* Only the BSP gets external NMIs from the system. */
1950 if ( cpu == 0 )
1951 {
1952 reason = inb(0x61);
1953 if ( reason & 0x80 )
1954 mem_parity_error(regs);
1955 else if ( reason & 0x40 )
1956 io_check_error(regs);
1957 else if ( !nmi_watchdog )
1958 unknown_nmi_error((unsigned char)(reason&0xff));
1959 }
1960 }
1962 void set_nmi_callback(nmi_callback_t callback)
1963 {
1964 nmi_callback = callback;
1965 }
1967 void unset_nmi_callback(void)
1968 {
1969 nmi_callback = dummy_nmi_callback;
1970 }
1972 asmlinkage int math_state_restore(struct cpu_user_regs *regs)
1973 {
1974 BUG_ON(!guest_mode(regs));
1976 setup_fpu(current);
1978 if ( current->arch.guest_context.ctrlreg[0] & X86_CR0_TS )
1979 {
1980 do_guest_trap(TRAP_no_device, regs, 0);
1981 current->arch.guest_context.ctrlreg[0] &= ~X86_CR0_TS;
1982 }
1984 return EXCRET_fault_fixed;
1985 }
1987 asmlinkage int do_debug(struct cpu_user_regs *regs)
1988 {
1989 unsigned long condition;
1990 struct vcpu *v = current;
1992 __asm__ __volatile__("mov %%db6,%0" : "=r" (condition));
1994 /* Mask out spurious debug traps due to lazy DR7 setting */
1995 if ( (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) &&
1996 (v->arch.guest_context.debugreg[7] == 0) )
1997 {
1998 __asm__("mov %0,%%db7" : : "r" (0UL));
1999 goto out;
2000 }
2002 DEBUGGER_trap_entry(TRAP_debug, regs);
2004 if ( !guest_mode(regs) )
2005 {
2006 /* Clear TF just for absolute sanity. */
2007 regs->eflags &= ~EF_TF;
2008 /*
2009 * We ignore watchpoints when they trigger within Xen. This may happen
2010 * when a buffer is passed to us which previously had a watchpoint set
2011 * on it. No need to bump EIP; the only faulting trap is an instruction
2012 * breakpoint, which can't happen to us.
2013 */
2014 goto out;
2015 }
2017 /* Save debug status register where guest OS can peek at it */
2018 v->arch.guest_context.debugreg[6] = condition;
2020 return do_guest_trap(TRAP_debug, regs, 0);
2022 out:
2023 return EXCRET_not_a_fault;
2024 }
2026 asmlinkage int do_spurious_interrupt_bug(struct cpu_user_regs *regs)
2027 {
2028 return EXCRET_not_a_fault;
2029 }
2031 void set_intr_gate(unsigned int n, void *addr)
2032 {
2033 int i;
2034 /* Keep secondary tables in sync with IRQ updates. */
2035 for ( i = 1; i < NR_CPUS; i++ )
2036 if ( idt_tables[i] != NULL )
2037 _set_gate(&idt_tables[i][n], 14, 0, addr);
2038 _set_gate(&idt_table[n], 14, 0, addr);
2039 }
2041 void set_system_gate(unsigned int n, void *addr)
2042 {
2043 _set_gate(idt_table+n,14,3,addr);
2044 }
2046 void set_task_gate(unsigned int n, unsigned int sel)
2047 {
2048 idt_table[n].a = sel << 16;
2049 idt_table[n].b = 0x8500;
2050 }
2052 void set_tss_desc(unsigned int n, void *addr)
2053 {
2054 _set_tssldt_desc(
2055 gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
2056 (unsigned long)addr,
2057 offsetof(struct tss_struct, __cacheline_filler) - 1,
2058 9);
2059 #ifdef CONFIG_COMPAT
2060 _set_tssldt_desc(
2061 compat_gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
2062 (unsigned long)addr,
2063 offsetof(struct tss_struct, __cacheline_filler) - 1,
2064 11);
2065 #endif
2066 }
2068 void __init trap_init(void)
2069 {
2070 extern void percpu_traps_init(void);
2072 /*
2073 * Note that interrupt gates are always used, rather than trap gates. We
2074 * must have interrupts disabled until DS/ES/FS/GS are saved because the
2075 * first activation must have the "bad" value(s) for these registers and
2076 * we may lose them if another activation is installed before they are
2077 * saved. The page-fault handler also needs interrupts disabled until %cr2
2078 * has been read and saved on the stack.
2079 */
2080 set_intr_gate(TRAP_divide_error,&divide_error);
2081 set_intr_gate(TRAP_debug,&debug);
2082 set_intr_gate(TRAP_nmi,&nmi);
2083 set_system_gate(TRAP_int3,&int3); /* usable from all privileges */
2084 set_system_gate(TRAP_overflow,&overflow); /* usable from all privileges */
2085 set_intr_gate(TRAP_bounds,&bounds);
2086 set_intr_gate(TRAP_invalid_op,&invalid_op);
2087 set_intr_gate(TRAP_no_device,&device_not_available);
2088 set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
2089 set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
2090 set_intr_gate(TRAP_no_segment,&segment_not_present);
2091 set_intr_gate(TRAP_stack_error,&stack_segment);
2092 set_intr_gate(TRAP_gp_fault,&general_protection);
2093 set_intr_gate(TRAP_page_fault,&page_fault);
2094 set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
2095 set_intr_gate(TRAP_copro_error,&coprocessor_error);
2096 set_intr_gate(TRAP_alignment_check,&alignment_check);
2097 set_intr_gate(TRAP_machine_check,&machine_check);
2098 set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
2100 /* CPU0 uses the master IDT. */
2101 idt_tables[0] = idt_table;
2103 percpu_traps_init();
2105 cpu_init();
2107 open_softirq(NMI_SOFTIRQ, nmi_softirq);
2108 }
2111 long do_set_trap_table(XEN_GUEST_HANDLE(trap_info_t) traps)
2113 struct trap_info cur;
2114 struct trap_info *dst = current->arch.guest_context.trap_ctxt;
2115 long rc = 0;
2117 /* If no table is presented then clear the entire virtual IDT. */
2118 if ( guest_handle_is_null(traps) )
2120 memset(dst, 0, 256 * sizeof(*dst));
2121 init_int80_direct_trap(current);
2122 return 0;
2125 for ( ; ; )
2127 if ( hypercall_preempt_check() )
2129 rc = hypercall_create_continuation(
2130 __HYPERVISOR_set_trap_table, "h", traps);
2131 break;
2134 if ( copy_from_guest(&cur, traps, 1) )
2136 rc = -EFAULT;
2137 break;
2140 if ( cur.address == 0 )
2141 break;
2143 fixup_guest_code_selector(current->domain, cur.cs);
2145 memcpy(&dst[cur.vector], &cur, sizeof(cur));
2147 if ( cur.vector == 0x80 )
2148 init_int80_direct_trap(current);
2150 guest_handle_add_offset(traps, 1);
2153 return rc;
2157 long set_debugreg(struct vcpu *p, int reg, unsigned long value)
2159 int i;
2161 switch ( reg )
2163 case 0:
2164 if ( !access_ok(value, sizeof(long)) )
2165 return -EPERM;
2166 if ( p == current )
2167 __asm__ ( "mov %0, %%db0" : : "r" (value) );
2168 break;
2169 case 1:
2170 if ( !access_ok(value, sizeof(long)) )
2171 return -EPERM;
2172 if ( p == current )
2173 __asm__ ( "mov %0, %%db1" : : "r" (value) );
2174 break;
2175 case 2:
2176 if ( !access_ok(value, sizeof(long)) )
2177 return -EPERM;
2178 if ( p == current )
2179 __asm__ ( "mov %0, %%db2" : : "r" (value) );
2180 break;
2181 case 3:
2182 if ( !access_ok(value, sizeof(long)) )
2183 return -EPERM;
2184 if ( p == current )
2185 __asm__ ( "mov %0, %%db3" : : "r" (value) );
2186 break;
2187 case 6:
2188 /*
2189 * DR6: Bits 4-11,16-31 reserved (set to 1).
2190 * Bit 12 reserved (set to 0).
2191 */
2192 value &= 0xffffefff; /* reserved bits => 0 */
2193 value |= 0xffff0ff0; /* reserved bits => 1 */
2194 if ( p == current )
2195 __asm__ ( "mov %0, %%db6" : : "r" (value) );
2196 break;
2197 case 7:
2198 /*
2199 * DR7: Bit 10 reserved (set to 1).
2200 * Bits 11-12,14-15 reserved (set to 0).
2201 * Privileged bits:
2202 * GD (bit 13): must be 0.
2203 * R/Wn (bits 16-17,20-21,24-25,28-29): mustn't be 10.
2204 * LENn (bits 18-19,22-23,26-27,30-31): mustn't be 10.
2205 */
2206 /* DR7 == 0 => debugging disabled for this domain. */
2207 if ( value != 0 )
2209 value &= 0xffff27ff; /* reserved bits => 0 */
2210 value |= 0x00000400; /* reserved bits => 1 */
2211 if ( (value & (1<<13)) != 0 ) return -EPERM;
2212 for ( i = 0; i < 16; i += 2 )
2213 if ( ((value >> (i+16)) & 3) == 2 ) return -EPERM;
2215 if ( p == current )
2216 __asm__ ( "mov %0, %%db7" : : "r" (value) );
2217 break;
2218 default:
2219 return -EINVAL;
2222 p->arch.guest_context.debugreg[reg] = value;
2223 return 0;
2226 long do_set_debugreg(int reg, unsigned long value)
2227 {
2228 return set_debugreg(current, reg, value);
2229 }
2231 unsigned long do_get_debugreg(int reg)
2232 {
2233 if ( (reg < 0) || (reg > 7) ) return -EINVAL;
2234 return current->arch.guest_context.debugreg[reg];
2235 }
2237 /*
2238 * Local variables:
2239 * mode: C
2240 * c-set-style: "BSD"
2241 * c-basic-offset: 4
2242 * tab-width: 4
2243 * indent-tabs-mode: nil
2244 * End:
2245 */